diff --git "a/checkpoint-2000/trainer_state.json" "b/checkpoint-2000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2000/trainer_state.json" @@ -0,0 +1,18034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.19981018032868775, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 9.990509016434387e-05, + "grad_norm": 41.79611587524414, + "learning_rate": 0.0, + "loss": 3.8758, + "mean_token_accuracy": 0.4469400495290756, + "num_tokens": 271856.0, + "step": 1 + }, + { + "epoch": 0.00019981018032868775, + "grad_norm": 40.14211654663086, + "learning_rate": 3.322259136212625e-08, + "loss": 3.8135, + "mean_token_accuracy": 0.45300351083278656, + "num_tokens": 533728.0, + "step": 2 + }, + { + "epoch": 0.0002997152704930316, + "grad_norm": 40.6630859375, + "learning_rate": 6.64451827242525e-08, + "loss": 3.8105, + "mean_token_accuracy": 0.4535742551088333, + "num_tokens": 796414.0, + "step": 3 + }, + { + "epoch": 0.0003996203606573755, + "grad_norm": 39.04735565185547, + "learning_rate": 9.966777408637874e-08, + "loss": 3.8224, + "mean_token_accuracy": 0.4568437784910202, + "num_tokens": 1046510.0, + "step": 4 + }, + { + "epoch": 0.0004995254508217194, + "grad_norm": 40.753395080566406, + "learning_rate": 1.32890365448505e-07, + "loss": 3.8109, + "mean_token_accuracy": 0.4522576630115509, + "num_tokens": 1310212.0, + "step": 5 + }, + { + "epoch": 0.0005994305409860632, + "grad_norm": 41.47862243652344, + "learning_rate": 1.6611295681063126e-07, + "loss": 3.8464, + "mean_token_accuracy": 0.4475090205669403, + "num_tokens": 1566558.0, + "step": 6 + }, + { + "epoch": 0.0006993356311504072, + "grad_norm": 39.02691650390625, + "learning_rate": 1.9933554817275749e-07, + "loss": 3.8049, + "mean_token_accuracy": 0.45628662407398224, + "num_tokens": 1831247.0, + "step": 7 + }, + { + "epoch": 0.000799240721314751, + "grad_norm": 43.989749908447266, + "learning_rate": 2.3255813953488374e-07, + "loss": 3.8234, + "mean_token_accuracy": 0.4484795480966568, + "num_tokens": 2100787.0, + "step": 8 + }, + { + "epoch": 0.0008991458114790948, + "grad_norm": 41.014198303222656, + "learning_rate": 2.6578073089701e-07, + "loss": 3.8437, + "mean_token_accuracy": 0.45342305302619934, + "num_tokens": 2367354.0, + "step": 9 + }, + { + "epoch": 0.0009990509016434388, + "grad_norm": 39.063045501708984, + "learning_rate": 2.9900332225913623e-07, + "loss": 3.8036, + "mean_token_accuracy": 0.4565405249595642, + "num_tokens": 2636126.0, + "step": 10 + }, + { + "epoch": 0.0010989559918077826, + "grad_norm": 39.58705520629883, + "learning_rate": 3.322259136212625e-07, + "loss": 3.7866, + "mean_token_accuracy": 0.45870402455329895, + "num_tokens": 2913429.0, + "step": 11 + }, + { + "epoch": 0.0011988610819721264, + "grad_norm": 40.420528411865234, + "learning_rate": 3.654485049833888e-07, + "loss": 3.7664, + "mean_token_accuracy": 0.4619610607624054, + "num_tokens": 3172984.0, + "step": 12 + }, + { + "epoch": 0.0012987661721364703, + "grad_norm": 39.24951934814453, + "learning_rate": 3.9867109634551497e-07, + "loss": 3.7542, + "mean_token_accuracy": 0.4635259658098221, + "num_tokens": 3442733.0, + "step": 13 + }, + { + "epoch": 0.0013986712623008143, + "grad_norm": 40.36895751953125, + "learning_rate": 4.318936877076412e-07, + "loss": 3.7343, + "mean_token_accuracy": 0.463445782661438, + "num_tokens": 3711769.0, + "step": 14 + }, + { + "epoch": 0.0014985763524651581, + "grad_norm": 43.73731994628906, + "learning_rate": 4.651162790697675e-07, + "loss": 3.6851, + "mean_token_accuracy": 0.4721117913722992, + "num_tokens": 3982403.0, + "step": 15 + }, + { + "epoch": 0.001598481442629502, + "grad_norm": 38.29069900512695, + "learning_rate": 4.983388704318938e-07, + "loss": 3.6427, + "mean_token_accuracy": 0.4776879549026489, + "num_tokens": 4249233.0, + "step": 16 + }, + { + "epoch": 0.0016983865327938458, + "grad_norm": 40.74815368652344, + "learning_rate": 5.3156146179402e-07, + "loss": 3.4201, + "mean_token_accuracy": 0.5095992088317871, + "num_tokens": 4514621.0, + "step": 17 + }, + { + "epoch": 0.0017982916229581896, + "grad_norm": 33.40875244140625, + "learning_rate": 5.647840531561462e-07, + "loss": 3.3745, + "mean_token_accuracy": 0.5157257616519928, + "num_tokens": 4781818.0, + "step": 18 + }, + { + "epoch": 0.0018981967131225337, + "grad_norm": 32.56108093261719, + "learning_rate": 5.980066445182725e-07, + "loss": 3.3691, + "mean_token_accuracy": 0.5143713653087616, + "num_tokens": 5047123.0, + "step": 19 + }, + { + "epoch": 0.0019981018032868775, + "grad_norm": 33.79883575439453, + "learning_rate": 6.312292358803987e-07, + "loss": 3.3621, + "mean_token_accuracy": 0.5140093266963959, + "num_tokens": 5314002.0, + "step": 20 + }, + { + "epoch": 0.0020980068934512213, + "grad_norm": 31.099313735961914, + "learning_rate": 6.64451827242525e-07, + "loss": 3.321, + "mean_token_accuracy": 0.5205503404140472, + "num_tokens": 5578034.0, + "step": 21 + }, + { + "epoch": 0.002197911983615565, + "grad_norm": 31.2415771484375, + "learning_rate": 6.976744186046513e-07, + "loss": 3.3087, + "mean_token_accuracy": 0.5242648720741272, + "num_tokens": 5846059.0, + "step": 22 + }, + { + "epoch": 0.002297817073779909, + "grad_norm": 38.30427932739258, + "learning_rate": 7.308970099667776e-07, + "loss": 2.9381, + "mean_token_accuracy": 0.5826029181480408, + "num_tokens": 6120586.0, + "step": 23 + }, + { + "epoch": 0.002397722163944253, + "grad_norm": 26.074649810791016, + "learning_rate": 7.641196013289037e-07, + "loss": 2.6974, + "mean_token_accuracy": 0.6085172891616821, + "num_tokens": 6384765.0, + "step": 24 + }, + { + "epoch": 0.0024976272541085967, + "grad_norm": 28.203914642333984, + "learning_rate": 7.973421926910299e-07, + "loss": 2.6985, + "mean_token_accuracy": 0.604841023683548, + "num_tokens": 6644617.0, + "step": 25 + }, + { + "epoch": 0.0025975323442729405, + "grad_norm": 26.225017547607422, + "learning_rate": 8.305647840531563e-07, + "loss": 2.662, + "mean_token_accuracy": 0.6013334393501282, + "num_tokens": 6910849.0, + "step": 26 + }, + { + "epoch": 0.0026974374344372848, + "grad_norm": 28.116497039794922, + "learning_rate": 8.637873754152824e-07, + "loss": 2.5965, + "mean_token_accuracy": 0.6129036545753479, + "num_tokens": 7183063.0, + "step": 27 + }, + { + "epoch": 0.0027973425246016286, + "grad_norm": 28.50816535949707, + "learning_rate": 8.970099667774087e-07, + "loss": 2.5572, + "mean_token_accuracy": 0.6064099073410034, + "num_tokens": 7460136.0, + "step": 28 + }, + { + "epoch": 0.0028972476147659724, + "grad_norm": 28.441904067993164, + "learning_rate": 9.30232558139535e-07, + "loss": 2.4537, + "mean_token_accuracy": 0.6172919273376465, + "num_tokens": 7725225.0, + "step": 29 + }, + { + "epoch": 0.0029971527049303163, + "grad_norm": 26.38285255432129, + "learning_rate": 9.634551495016612e-07, + "loss": 2.4036, + "mean_token_accuracy": 0.6154768466949463, + "num_tokens": 7986103.0, + "step": 30 + }, + { + "epoch": 0.00309705779509466, + "grad_norm": 20.234909057617188, + "learning_rate": 9.966777408637875e-07, + "loss": 2.074, + "mean_token_accuracy": 0.6539514362812042, + "num_tokens": 8249292.0, + "step": 31 + }, + { + "epoch": 0.003196962885259004, + "grad_norm": 20.781661987304688, + "learning_rate": 1.0299003322259137e-06, + "loss": 1.8105, + "mean_token_accuracy": 0.6715203821659088, + "num_tokens": 8516281.0, + "step": 32 + }, + { + "epoch": 0.0032968679754233478, + "grad_norm": 27.671207427978516, + "learning_rate": 1.06312292358804e-06, + "loss": 1.7587, + "mean_token_accuracy": 0.6864829063415527, + "num_tokens": 8790632.0, + "step": 33 + }, + { + "epoch": 0.0033967730655876916, + "grad_norm": 24.890382766723633, + "learning_rate": 1.0963455149501661e-06, + "loss": 1.6918, + "mean_token_accuracy": 0.7169185280799866, + "num_tokens": 9057180.0, + "step": 34 + }, + { + "epoch": 0.0034966781557520354, + "grad_norm": 29.5397891998291, + "learning_rate": 1.1295681063122925e-06, + "loss": 1.6179, + "mean_token_accuracy": 0.7279886901378632, + "num_tokens": 9316994.0, + "step": 35 + }, + { + "epoch": 0.0035965832459163793, + "grad_norm": 21.603797912597656, + "learning_rate": 1.1627906976744188e-06, + "loss": 1.5873, + "mean_token_accuracy": 0.7304977774620056, + "num_tokens": 9584032.0, + "step": 36 + }, + { + "epoch": 0.0036964883360807235, + "grad_norm": 23.421491622924805, + "learning_rate": 1.196013289036545e-06, + "loss": 1.5607, + "mean_token_accuracy": 0.7315825521945953, + "num_tokens": 9846764.0, + "step": 37 + }, + { + "epoch": 0.0037963934262450674, + "grad_norm": 21.477184295654297, + "learning_rate": 1.2292358803986712e-06, + "loss": 1.5235, + "mean_token_accuracy": 0.7438421547412872, + "num_tokens": 10112713.0, + "step": 38 + }, + { + "epoch": 0.003896298516409411, + "grad_norm": 15.853423118591309, + "learning_rate": 1.2624584717607974e-06, + "loss": 1.4561, + "mean_token_accuracy": 0.748769611120224, + "num_tokens": 10373988.0, + "step": 39 + }, + { + "epoch": 0.003996203606573755, + "grad_norm": 17.296192169189453, + "learning_rate": 1.2956810631229235e-06, + "loss": 1.4209, + "mean_token_accuracy": 0.7579092383384705, + "num_tokens": 10639122.0, + "step": 40 + }, + { + "epoch": 0.004096108696738099, + "grad_norm": 15.76535415649414, + "learning_rate": 1.32890365448505e-06, + "loss": 1.3929, + "mean_token_accuracy": 0.7634686231613159, + "num_tokens": 10901206.0, + "step": 41 + }, + { + "epoch": 0.004196013786902443, + "grad_norm": 24.107501983642578, + "learning_rate": 1.3621262458471762e-06, + "loss": 1.3602, + "mean_token_accuracy": 0.7800469100475311, + "num_tokens": 11168325.0, + "step": 42 + }, + { + "epoch": 0.0042959188770667865, + "grad_norm": 16.515037536621094, + "learning_rate": 1.3953488372093025e-06, + "loss": 1.2465, + "mean_token_accuracy": 0.809214174747467, + "num_tokens": 11429818.0, + "step": 43 + }, + { + "epoch": 0.00439582396723113, + "grad_norm": 19.235319137573242, + "learning_rate": 1.4285714285714286e-06, + "loss": 1.2486, + "mean_token_accuracy": 0.8124915957450867, + "num_tokens": 11699109.0, + "step": 44 + }, + { + "epoch": 0.004495729057395474, + "grad_norm": 11.698029518127441, + "learning_rate": 1.4617940199335552e-06, + "loss": 1.1618, + "mean_token_accuracy": 0.8254410028457642, + "num_tokens": 11961951.0, + "step": 45 + }, + { + "epoch": 0.004595634147559818, + "grad_norm": 19.546875, + "learning_rate": 1.4950166112956813e-06, + "loss": 1.1616, + "mean_token_accuracy": 0.825401097536087, + "num_tokens": 12222067.0, + "step": 46 + }, + { + "epoch": 0.004695539237724162, + "grad_norm": 23.256893157958984, + "learning_rate": 1.5282392026578074e-06, + "loss": 1.145, + "mean_token_accuracy": 0.8251653909683228, + "num_tokens": 12484896.0, + "step": 47 + }, + { + "epoch": 0.004795444327888506, + "grad_norm": 17.308635711669922, + "learning_rate": 1.5614617940199335e-06, + "loss": 1.0953, + "mean_token_accuracy": 0.8317568004131317, + "num_tokens": 12754049.0, + "step": 48 + }, + { + "epoch": 0.0048953494180528495, + "grad_norm": 6.540695667266846, + "learning_rate": 1.5946843853820599e-06, + "loss": 1.0881, + "mean_token_accuracy": 0.8316325545310974, + "num_tokens": 13023905.0, + "step": 49 + }, + { + "epoch": 0.004995254508217193, + "grad_norm": 8.553227424621582, + "learning_rate": 1.6279069767441862e-06, + "loss": 1.081, + "mean_token_accuracy": 0.8326809704303741, + "num_tokens": 13293737.0, + "step": 50 + }, + { + "epoch": 0.005095159598381537, + "grad_norm": 8.320778846740723, + "learning_rate": 1.6611295681063126e-06, + "loss": 1.0503, + "mean_token_accuracy": 0.8355827331542969, + "num_tokens": 13562237.0, + "step": 51 + }, + { + "epoch": 0.005195064688545881, + "grad_norm": 7.993632793426514, + "learning_rate": 1.6943521594684387e-06, + "loss": 1.0548, + "mean_token_accuracy": 0.8384113013744354, + "num_tokens": 13828887.0, + "step": 52 + }, + { + "epoch": 0.005294969778710226, + "grad_norm": 6.307920455932617, + "learning_rate": 1.7275747508305648e-06, + "loss": 1.0377, + "mean_token_accuracy": 0.8339864313602448, + "num_tokens": 14094129.0, + "step": 53 + }, + { + "epoch": 0.0053948748688745695, + "grad_norm": 8.077619552612305, + "learning_rate": 1.7607973421926911e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.8432050347328186, + "num_tokens": 14359255.0, + "step": 54 + }, + { + "epoch": 0.005494779959038913, + "grad_norm": 11.858952522277832, + "learning_rate": 1.7940199335548175e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.8377295434474945, + "num_tokens": 14620638.0, + "step": 55 + }, + { + "epoch": 0.005594685049203257, + "grad_norm": 6.933170795440674, + "learning_rate": 1.8272425249169438e-06, + "loss": 1.0121, + "mean_token_accuracy": 0.8427193760871887, + "num_tokens": 14884999.0, + "step": 56 + }, + { + "epoch": 0.005694590139367601, + "grad_norm": 6.329719066619873, + "learning_rate": 1.86046511627907e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.8452832400798798, + "num_tokens": 15148370.0, + "step": 57 + }, + { + "epoch": 0.005794495229531945, + "grad_norm": 7.655465126037598, + "learning_rate": 1.893687707641196e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.8511391878128052, + "num_tokens": 15409877.0, + "step": 58 + }, + { + "epoch": 0.005894400319696289, + "grad_norm": 5.050930976867676, + "learning_rate": 1.9269102990033224e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.8480638563632965, + "num_tokens": 15675669.0, + "step": 59 + }, + { + "epoch": 0.0059943054098606325, + "grad_norm": 6.080487251281738, + "learning_rate": 1.9601328903654487e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.8505652546882629, + "num_tokens": 15937140.0, + "step": 60 + }, + { + "epoch": 0.006094210500024976, + "grad_norm": 4.876977920532227, + "learning_rate": 1.993355481727575e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.8516053259372711, + "num_tokens": 16204717.0, + "step": 61 + }, + { + "epoch": 0.00619411559018932, + "grad_norm": 4.480071067810059, + "learning_rate": 2.026578073089701e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.8568776249885559, + "num_tokens": 16464122.0, + "step": 62 + }, + { + "epoch": 0.006294020680353664, + "grad_norm": 5.257747173309326, + "learning_rate": 2.0598006644518273e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.8568149507045746, + "num_tokens": 16722572.0, + "step": 63 + }, + { + "epoch": 0.006393925770518008, + "grad_norm": 5.633082866668701, + "learning_rate": 2.0930232558139536e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.8547744452953339, + "num_tokens": 16987939.0, + "step": 64 + }, + { + "epoch": 0.006493830860682352, + "grad_norm": 5.518905162811279, + "learning_rate": 2.12624584717608e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.8585240244865417, + "num_tokens": 17247816.0, + "step": 65 + }, + { + "epoch": 0.0065937359508466955, + "grad_norm": 4.776093482971191, + "learning_rate": 2.1594684385382063e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.8630226254463196, + "num_tokens": 17515042.0, + "step": 66 + }, + { + "epoch": 0.006693641041011039, + "grad_norm": 5.030599117279053, + "learning_rate": 2.1926910299003322e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.8657163083553314, + "num_tokens": 17781855.0, + "step": 67 + }, + { + "epoch": 0.006793546131175383, + "grad_norm": 5.430304050445557, + "learning_rate": 2.2259136212624586e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.8687067031860352, + "num_tokens": 18039575.0, + "step": 68 + }, + { + "epoch": 0.006893451221339727, + "grad_norm": 5.290153980255127, + "learning_rate": 2.259136212624585e-06, + "loss": 0.7857, + "mean_token_accuracy": 0.8692563474178314, + "num_tokens": 18308504.0, + "step": 69 + }, + { + "epoch": 0.006993356311504071, + "grad_norm": 6.238577842712402, + "learning_rate": 2.2923588039867112e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.8685836493968964, + "num_tokens": 18570515.0, + "step": 70 + }, + { + "epoch": 0.007093261401668415, + "grad_norm": 6.52593469619751, + "learning_rate": 2.3255813953488376e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.8740846514701843, + "num_tokens": 18825827.0, + "step": 71 + }, + { + "epoch": 0.0071931664918327585, + "grad_norm": 5.760095119476318, + "learning_rate": 2.3588039867109635e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.8720021843910217, + "num_tokens": 19095478.0, + "step": 72 + }, + { + "epoch": 0.007293071581997102, + "grad_norm": 8.679697036743164, + "learning_rate": 2.39202657807309e-06, + "loss": 0.7296, + "mean_token_accuracy": 0.8695126175880432, + "num_tokens": 19361094.0, + "step": 73 + }, + { + "epoch": 0.007392976672161447, + "grad_norm": 6.1288323402404785, + "learning_rate": 2.425249169435216e-06, + "loss": 0.7052, + "mean_token_accuracy": 0.8723514080047607, + "num_tokens": 19627408.0, + "step": 74 + }, + { + "epoch": 0.007492881762325791, + "grad_norm": 5.9774909019470215, + "learning_rate": 2.4584717607973425e-06, + "loss": 0.6962, + "mean_token_accuracy": 0.8895685374736786, + "num_tokens": 19894809.0, + "step": 75 + }, + { + "epoch": 0.007592786852490135, + "grad_norm": 7.313020706176758, + "learning_rate": 2.4916943521594684e-06, + "loss": 0.66, + "mean_token_accuracy": 0.8900130987167358, + "num_tokens": 20161146.0, + "step": 76 + }, + { + "epoch": 0.0076926919426544785, + "grad_norm": 6.493581771850586, + "learning_rate": 2.5249169435215947e-06, + "loss": 0.6198, + "mean_token_accuracy": 0.8932929337024689, + "num_tokens": 20427159.0, + "step": 77 + }, + { + "epoch": 0.007792597032818822, + "grad_norm": 3.436570644378662, + "learning_rate": 2.558139534883721e-06, + "loss": 0.6126, + "mean_token_accuracy": 0.889729231595993, + "num_tokens": 20704101.0, + "step": 78 + }, + { + "epoch": 0.007892502122983166, + "grad_norm": 2.698483467102051, + "learning_rate": 2.591362126245847e-06, + "loss": 0.6049, + "mean_token_accuracy": 0.8903420865535736, + "num_tokens": 20960458.0, + "step": 79 + }, + { + "epoch": 0.00799240721314751, + "grad_norm": 3.5634539127349854, + "learning_rate": 2.6245847176079738e-06, + "loss": 0.6071, + "mean_token_accuracy": 0.891887366771698, + "num_tokens": 21234676.0, + "step": 80 + }, + { + "epoch": 0.008092312303311854, + "grad_norm": 11.913636207580566, + "learning_rate": 2.6578073089701e-06, + "loss": 0.6203, + "mean_token_accuracy": 0.8900243937969208, + "num_tokens": 21500218.0, + "step": 81 + }, + { + "epoch": 0.008192217393476198, + "grad_norm": 12.856719017028809, + "learning_rate": 2.691029900332226e-06, + "loss": 0.6198, + "mean_token_accuracy": 0.8921903967857361, + "num_tokens": 21755079.0, + "step": 82 + }, + { + "epoch": 0.008292122483640542, + "grad_norm": 9.613741874694824, + "learning_rate": 2.7242524916943523e-06, + "loss": 0.6235, + "mean_token_accuracy": 0.8917334377765656, + "num_tokens": 22015921.0, + "step": 83 + }, + { + "epoch": 0.008392027573804885, + "grad_norm": 5.273900985717773, + "learning_rate": 2.7574750830564782e-06, + "loss": 0.6058, + "mean_token_accuracy": 0.890205591917038, + "num_tokens": 22284355.0, + "step": 84 + }, + { + "epoch": 0.00849193266396923, + "grad_norm": 4.108988285064697, + "learning_rate": 2.790697674418605e-06, + "loss": 0.6013, + "mean_token_accuracy": 0.8899712264537811, + "num_tokens": 22554485.0, + "step": 85 + }, + { + "epoch": 0.008591837754133573, + "grad_norm": 3.0706963539123535, + "learning_rate": 2.8239202657807313e-06, + "loss": 0.595, + "mean_token_accuracy": 0.8914024829864502, + "num_tokens": 22812047.0, + "step": 86 + }, + { + "epoch": 0.008691742844297917, + "grad_norm": 4.06828498840332, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.5787, + "mean_token_accuracy": 0.8917503356933594, + "num_tokens": 23077232.0, + "step": 87 + }, + { + "epoch": 0.00879164793446226, + "grad_norm": 2.3549933433532715, + "learning_rate": 2.8903654485049836e-06, + "loss": 0.5858, + "mean_token_accuracy": 0.8947541117668152, + "num_tokens": 23347218.0, + "step": 88 + }, + { + "epoch": 0.008891553024626605, + "grad_norm": 3.8939905166625977, + "learning_rate": 2.9235880398671104e-06, + "loss": 0.583, + "mean_token_accuracy": 0.8941412568092346, + "num_tokens": 23620872.0, + "step": 89 + }, + { + "epoch": 0.008991458114790948, + "grad_norm": 11.007421493530273, + "learning_rate": 2.9568106312292363e-06, + "loss": 0.592, + "mean_token_accuracy": 0.8915394246578217, + "num_tokens": 23883053.0, + "step": 90 + }, + { + "epoch": 0.009091363204955292, + "grad_norm": 2.4590296745300293, + "learning_rate": 2.9900332225913626e-06, + "loss": 0.5965, + "mean_token_accuracy": 0.891523003578186, + "num_tokens": 24150786.0, + "step": 91 + }, + { + "epoch": 0.009191268295119636, + "grad_norm": 3.3189525604248047, + "learning_rate": 3.0232558139534885e-06, + "loss": 0.5868, + "mean_token_accuracy": 0.8894113302230835, + "num_tokens": 24425352.0, + "step": 92 + }, + { + "epoch": 0.00929117338528398, + "grad_norm": 3.0429913997650146, + "learning_rate": 3.056478405315615e-06, + "loss": 0.5798, + "mean_token_accuracy": 0.8934867084026337, + "num_tokens": 24687283.0, + "step": 93 + }, + { + "epoch": 0.009391078475448324, + "grad_norm": 4.384122371673584, + "learning_rate": 3.089700996677741e-06, + "loss": 0.5759, + "mean_token_accuracy": 0.896876186132431, + "num_tokens": 24961579.0, + "step": 94 + }, + { + "epoch": 0.009490983565612668, + "grad_norm": 2.9858405590057373, + "learning_rate": 3.122923588039867e-06, + "loss": 0.5792, + "mean_token_accuracy": 0.8958857953548431, + "num_tokens": 25231176.0, + "step": 95 + }, + { + "epoch": 0.009590888655777011, + "grad_norm": 3.086737632751465, + "learning_rate": 3.156146179401994e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.8931178748607635, + "num_tokens": 25500602.0, + "step": 96 + }, + { + "epoch": 0.009690793745941355, + "grad_norm": 2.5759334564208984, + "learning_rate": 3.1893687707641198e-06, + "loss": 0.5719, + "mean_token_accuracy": 0.8956139385700226, + "num_tokens": 25764539.0, + "step": 97 + }, + { + "epoch": 0.009790698836105699, + "grad_norm": 3.8631985187530518, + "learning_rate": 3.222591362126246e-06, + "loss": 0.5659, + "mean_token_accuracy": 0.8941615521907806, + "num_tokens": 26039747.0, + "step": 98 + }, + { + "epoch": 0.009890603926270043, + "grad_norm": 3.3039705753326416, + "learning_rate": 3.2558139534883724e-06, + "loss": 0.5669, + "mean_token_accuracy": 0.8978874683380127, + "num_tokens": 26313546.0, + "step": 99 + }, + { + "epoch": 0.009990509016434387, + "grad_norm": 3.4185988903045654, + "learning_rate": 3.2890365448504984e-06, + "loss": 0.5691, + "mean_token_accuracy": 0.8939110636711121, + "num_tokens": 26579513.0, + "step": 100 + }, + { + "epoch": 0.01009041410659873, + "grad_norm": 3.8113791942596436, + "learning_rate": 3.322259136212625e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.8946503698825836, + "num_tokens": 26833096.0, + "step": 101 + }, + { + "epoch": 0.010190319196763074, + "grad_norm": 3.189736843109131, + "learning_rate": 3.355481727574751e-06, + "loss": 0.5681, + "mean_token_accuracy": 0.8961003422737122, + "num_tokens": 27087288.0, + "step": 102 + }, + { + "epoch": 0.010290224286927418, + "grad_norm": 17.174964904785156, + "learning_rate": 3.3887043189368774e-06, + "loss": 0.5678, + "mean_token_accuracy": 0.8967376351356506, + "num_tokens": 27353765.0, + "step": 103 + }, + { + "epoch": 0.010390129377091762, + "grad_norm": 4.047720909118652, + "learning_rate": 3.4219269102990037e-06, + "loss": 0.571, + "mean_token_accuracy": 0.8951614499092102, + "num_tokens": 27621699.0, + "step": 104 + }, + { + "epoch": 0.010490034467256106, + "grad_norm": 4.139532089233398, + "learning_rate": 3.4551495016611296e-06, + "loss": 0.5597, + "mean_token_accuracy": 0.8933297991752625, + "num_tokens": 27896245.0, + "step": 105 + }, + { + "epoch": 0.010589939557420451, + "grad_norm": 5.003537654876709, + "learning_rate": 3.4883720930232564e-06, + "loss": 0.5554, + "mean_token_accuracy": 0.8977648317813873, + "num_tokens": 28160787.0, + "step": 106 + }, + { + "epoch": 0.010689844647584795, + "grad_norm": 3.1412997245788574, + "learning_rate": 3.5215946843853823e-06, + "loss": 0.5604, + "mean_token_accuracy": 0.8966701030731201, + "num_tokens": 28424688.0, + "step": 107 + }, + { + "epoch": 0.010789749737749139, + "grad_norm": 3.4662961959838867, + "learning_rate": 3.5548172757475086e-06, + "loss": 0.5624, + "mean_token_accuracy": 0.8958131968975067, + "num_tokens": 28685650.0, + "step": 108 + }, + { + "epoch": 0.010889654827913483, + "grad_norm": 4.921558380126953, + "learning_rate": 3.588039867109635e-06, + "loss": 0.5564, + "mean_token_accuracy": 0.8968528807163239, + "num_tokens": 28949690.0, + "step": 109 + }, + { + "epoch": 0.010989559918077827, + "grad_norm": 3.547546625137329, + "learning_rate": 3.621262458471761e-06, + "loss": 0.5608, + "mean_token_accuracy": 0.8963256776332855, + "num_tokens": 29205060.0, + "step": 110 + }, + { + "epoch": 0.01108946500824217, + "grad_norm": 2.9072265625, + "learning_rate": 3.6544850498338876e-06, + "loss": 0.5661, + "mean_token_accuracy": 0.890740305185318, + "num_tokens": 29459754.0, + "step": 111 + }, + { + "epoch": 0.011189370098406514, + "grad_norm": 3.45780873298645, + "learning_rate": 3.6877076411960135e-06, + "loss": 0.5612, + "mean_token_accuracy": 0.8946048617362976, + "num_tokens": 29724024.0, + "step": 112 + }, + { + "epoch": 0.011289275188570858, + "grad_norm": 3.6322343349456787, + "learning_rate": 3.72093023255814e-06, + "loss": 0.5635, + "mean_token_accuracy": 0.8971585035324097, + "num_tokens": 30001359.0, + "step": 113 + }, + { + "epoch": 0.011389180278735202, + "grad_norm": 5.144083499908447, + "learning_rate": 3.754152823920266e-06, + "loss": 0.5555, + "mean_token_accuracy": 0.8972598612308502, + "num_tokens": 30263706.0, + "step": 114 + }, + { + "epoch": 0.011489085368899546, + "grad_norm": 9.87804889678955, + "learning_rate": 3.787375415282392e-06, + "loss": 0.547, + "mean_token_accuracy": 0.8973714709281921, + "num_tokens": 30530598.0, + "step": 115 + }, + { + "epoch": 0.01158899045906389, + "grad_norm": 2.4454424381256104, + "learning_rate": 3.8205980066445185e-06, + "loss": 0.5503, + "mean_token_accuracy": 0.8978724181652069, + "num_tokens": 30794755.0, + "step": 116 + }, + { + "epoch": 0.011688895549228234, + "grad_norm": 3.7496933937072754, + "learning_rate": 3.853820598006645e-06, + "loss": 0.5572, + "mean_token_accuracy": 0.8956330418586731, + "num_tokens": 31059926.0, + "step": 117 + }, + { + "epoch": 0.011788800639392577, + "grad_norm": 4.361268520355225, + "learning_rate": 3.887043189368771e-06, + "loss": 0.5583, + "mean_token_accuracy": 0.895323783159256, + "num_tokens": 31318717.0, + "step": 118 + }, + { + "epoch": 0.011888705729556921, + "grad_norm": 2.848219633102417, + "learning_rate": 3.9202657807308975e-06, + "loss": 0.5625, + "mean_token_accuracy": 0.8947817981243134, + "num_tokens": 31581898.0, + "step": 119 + }, + { + "epoch": 0.011988610819721265, + "grad_norm": 2.4445395469665527, + "learning_rate": 3.953488372093024e-06, + "loss": 0.5557, + "mean_token_accuracy": 0.8948699235916138, + "num_tokens": 31851986.0, + "step": 120 + }, + { + "epoch": 0.012088515909885609, + "grad_norm": 2.521683931350708, + "learning_rate": 3.98671096345515e-06, + "loss": 0.5449, + "mean_token_accuracy": 0.8973753750324249, + "num_tokens": 32116518.0, + "step": 121 + }, + { + "epoch": 0.012188421000049953, + "grad_norm": 4.091718673706055, + "learning_rate": 4.0199335548172765e-06, + "loss": 0.5566, + "mean_token_accuracy": 0.896776407957077, + "num_tokens": 32381559.0, + "step": 122 + }, + { + "epoch": 0.012288326090214297, + "grad_norm": 3.5614852905273438, + "learning_rate": 4.053156146179402e-06, + "loss": 0.5537, + "mean_token_accuracy": 0.8952924013137817, + "num_tokens": 32645251.0, + "step": 123 + }, + { + "epoch": 0.01238823118037864, + "grad_norm": 3.4004063606262207, + "learning_rate": 4.086378737541528e-06, + "loss": 0.5491, + "mean_token_accuracy": 0.8968701362609863, + "num_tokens": 32908442.0, + "step": 124 + }, + { + "epoch": 0.012488136270542984, + "grad_norm": 5.192618370056152, + "learning_rate": 4.119601328903655e-06, + "loss": 0.5541, + "mean_token_accuracy": 0.8957848250865936, + "num_tokens": 33167129.0, + "step": 125 + }, + { + "epoch": 0.012588041360707328, + "grad_norm": 4.123863220214844, + "learning_rate": 4.152823920265781e-06, + "loss": 0.5534, + "mean_token_accuracy": 0.8941038250923157, + "num_tokens": 33437049.0, + "step": 126 + }, + { + "epoch": 0.012687946450871672, + "grad_norm": 3.6582815647125244, + "learning_rate": 4.186046511627907e-06, + "loss": 0.5505, + "mean_token_accuracy": 0.8949277400970459, + "num_tokens": 33699772.0, + "step": 127 + }, + { + "epoch": 0.012787851541036016, + "grad_norm": 3.971299409866333, + "learning_rate": 4.219269102990034e-06, + "loss": 0.5536, + "mean_token_accuracy": 0.8964903652667999, + "num_tokens": 33964224.0, + "step": 128 + }, + { + "epoch": 0.01288775663120036, + "grad_norm": 3.467442750930786, + "learning_rate": 4.25249169435216e-06, + "loss": 0.5593, + "mean_token_accuracy": 0.8978101909160614, + "num_tokens": 34231348.0, + "step": 129 + }, + { + "epoch": 0.012987661721364703, + "grad_norm": 4.144760608673096, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.5566, + "mean_token_accuracy": 0.8994596600532532, + "num_tokens": 34498007.0, + "step": 130 + }, + { + "epoch": 0.013087566811529047, + "grad_norm": 4.5387115478515625, + "learning_rate": 4.318936877076413e-06, + "loss": 0.5534, + "mean_token_accuracy": 0.8947086036205292, + "num_tokens": 34767787.0, + "step": 131 + }, + { + "epoch": 0.013187471901693391, + "grad_norm": 9.031929016113281, + "learning_rate": 4.352159468438539e-06, + "loss": 0.5457, + "mean_token_accuracy": 0.8964603543281555, + "num_tokens": 35029779.0, + "step": 132 + }, + { + "epoch": 0.013287376991857735, + "grad_norm": 2.4342191219329834, + "learning_rate": 4.3853820598006645e-06, + "loss": 0.5518, + "mean_token_accuracy": 0.8963800668716431, + "num_tokens": 35292377.0, + "step": 133 + }, + { + "epoch": 0.013387282082022079, + "grad_norm": 2.980353832244873, + "learning_rate": 4.418604651162791e-06, + "loss": 0.5488, + "mean_token_accuracy": 0.8967847526073456, + "num_tokens": 35554533.0, + "step": 134 + }, + { + "epoch": 0.013487187172186423, + "grad_norm": 2.6285557746887207, + "learning_rate": 4.451827242524917e-06, + "loss": 0.5459, + "mean_token_accuracy": 0.8965582549571991, + "num_tokens": 35829518.0, + "step": 135 + }, + { + "epoch": 0.013587092262350766, + "grad_norm": 2.2669618129730225, + "learning_rate": 4.4850498338870435e-06, + "loss": 0.5465, + "mean_token_accuracy": 0.8978439569473267, + "num_tokens": 36094934.0, + "step": 136 + }, + { + "epoch": 0.01368699735251511, + "grad_norm": 2.8973069190979004, + "learning_rate": 4.51827242524917e-06, + "loss": 0.5556, + "mean_token_accuracy": 0.8958915174007416, + "num_tokens": 36355601.0, + "step": 137 + }, + { + "epoch": 0.013786902442679454, + "grad_norm": 3.3992233276367188, + "learning_rate": 4.551495016611296e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.8970613777637482, + "num_tokens": 36610790.0, + "step": 138 + }, + { + "epoch": 0.013886807532843798, + "grad_norm": 2.887101173400879, + "learning_rate": 4.5847176079734225e-06, + "loss": 0.5607, + "mean_token_accuracy": 0.8964815139770508, + "num_tokens": 36861073.0, + "step": 139 + }, + { + "epoch": 0.013986712623008142, + "grad_norm": 3.277336835861206, + "learning_rate": 4.617940199335549e-06, + "loss": 0.5362, + "mean_token_accuracy": 0.9003787934780121, + "num_tokens": 37134926.0, + "step": 140 + }, + { + "epoch": 0.014086617713172486, + "grad_norm": 1.5360214710235596, + "learning_rate": 4.651162790697675e-06, + "loss": 0.5491, + "mean_token_accuracy": 0.8981333076953888, + "num_tokens": 37395776.0, + "step": 141 + }, + { + "epoch": 0.01418652280333683, + "grad_norm": 2.4213919639587402, + "learning_rate": 4.6843853820598015e-06, + "loss": 0.5484, + "mean_token_accuracy": 0.8972634077072144, + "num_tokens": 37665184.0, + "step": 142 + }, + { + "epoch": 0.014286427893501173, + "grad_norm": 2.798407793045044, + "learning_rate": 4.717607973421927e-06, + "loss": 0.5483, + "mean_token_accuracy": 0.8964361250400543, + "num_tokens": 37920133.0, + "step": 143 + }, + { + "epoch": 0.014386332983665517, + "grad_norm": 3.18338680267334, + "learning_rate": 4.750830564784053e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.8978064060211182, + "num_tokens": 38185187.0, + "step": 144 + }, + { + "epoch": 0.01448623807382986, + "grad_norm": 2.136925220489502, + "learning_rate": 4.78405315614618e-06, + "loss": 0.5349, + "mean_token_accuracy": 0.899265319108963, + "num_tokens": 38456456.0, + "step": 145 + }, + { + "epoch": 0.014586143163994205, + "grad_norm": 1.7260761260986328, + "learning_rate": 4.817275747508306e-06, + "loss": 0.551, + "mean_token_accuracy": 0.894838273525238, + "num_tokens": 38726261.0, + "step": 146 + }, + { + "epoch": 0.014686048254158549, + "grad_norm": 2.151317596435547, + "learning_rate": 4.850498338870432e-06, + "loss": 0.5504, + "mean_token_accuracy": 0.8968658149242401, + "num_tokens": 38986037.0, + "step": 147 + }, + { + "epoch": 0.014785953344322894, + "grad_norm": 5.040338039398193, + "learning_rate": 4.883720930232559e-06, + "loss": 0.5442, + "mean_token_accuracy": 0.8982843458652496, + "num_tokens": 39249151.0, + "step": 148 + }, + { + "epoch": 0.014885858434487238, + "grad_norm": 1.994243860244751, + "learning_rate": 4.916943521594685e-06, + "loss": 0.5501, + "mean_token_accuracy": 0.8955949246883392, + "num_tokens": 39524278.0, + "step": 149 + }, + { + "epoch": 0.014985763524651582, + "grad_norm": 1.931748628616333, + "learning_rate": 4.950166112956811e-06, + "loss": 0.5484, + "mean_token_accuracy": 0.8966254889965057, + "num_tokens": 39794708.0, + "step": 150 + }, + { + "epoch": 0.015085668614815926, + "grad_norm": 16.05190658569336, + "learning_rate": 4.983388704318937e-06, + "loss": 0.5562, + "mean_token_accuracy": 0.8987342119216919, + "num_tokens": 40067591.0, + "step": 151 + }, + { + "epoch": 0.01518557370498027, + "grad_norm": 6.1061110496521, + "learning_rate": 5.016611295681063e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.8962307572364807, + "num_tokens": 40317258.0, + "step": 152 + }, + { + "epoch": 0.015285478795144613, + "grad_norm": 2.9510061740875244, + "learning_rate": 5.0498338870431895e-06, + "loss": 0.5578, + "mean_token_accuracy": 0.8982059061527252, + "num_tokens": 40590721.0, + "step": 153 + }, + { + "epoch": 0.015385383885308957, + "grad_norm": 2.5234501361846924, + "learning_rate": 5.083056478405316e-06, + "loss": 0.5597, + "mean_token_accuracy": 0.8963050544261932, + "num_tokens": 40862127.0, + "step": 154 + }, + { + "epoch": 0.015485288975473301, + "grad_norm": 2.2699873447418213, + "learning_rate": 5.116279069767442e-06, + "loss": 0.5658, + "mean_token_accuracy": 0.8947420418262482, + "num_tokens": 41120469.0, + "step": 155 + }, + { + "epoch": 0.015585194065637645, + "grad_norm": 2.205301284790039, + "learning_rate": 5.149501661129569e-06, + "loss": 0.5489, + "mean_token_accuracy": 0.897253692150116, + "num_tokens": 41383369.0, + "step": 156 + }, + { + "epoch": 0.015685099155801987, + "grad_norm": 2.2770707607269287, + "learning_rate": 5.182724252491694e-06, + "loss": 0.5691, + "mean_token_accuracy": 0.8916217982769012, + "num_tokens": 41652104.0, + "step": 157 + }, + { + "epoch": 0.015785004245966332, + "grad_norm": 2.158076524734497, + "learning_rate": 5.215946843853821e-06, + "loss": 0.5508, + "mean_token_accuracy": 0.8988172113895416, + "num_tokens": 41912661.0, + "step": 158 + }, + { + "epoch": 0.015884909336130675, + "grad_norm": 2.5837714672088623, + "learning_rate": 5.2491694352159475e-06, + "loss": 0.5428, + "mean_token_accuracy": 0.8988452255725861, + "num_tokens": 42176597.0, + "step": 159 + }, + { + "epoch": 0.01598481442629502, + "grad_norm": 3.0549392700195312, + "learning_rate": 5.282392026578074e-06, + "loss": 0.5446, + "mean_token_accuracy": 0.898723304271698, + "num_tokens": 42438313.0, + "step": 160 + }, + { + "epoch": 0.016084719516459362, + "grad_norm": 2.125825881958008, + "learning_rate": 5.3156146179402e-06, + "loss": 0.5411, + "mean_token_accuracy": 0.8992765247821808, + "num_tokens": 42699216.0, + "step": 161 + }, + { + "epoch": 0.016184624606623708, + "grad_norm": 4.3962178230285645, + "learning_rate": 5.348837209302326e-06, + "loss": 0.5493, + "mean_token_accuracy": 0.8981699049472809, + "num_tokens": 42963287.0, + "step": 162 + }, + { + "epoch": 0.01628452969678805, + "grad_norm": 2.566693067550659, + "learning_rate": 5.382059800664452e-06, + "loss": 0.548, + "mean_token_accuracy": 0.8977328836917877, + "num_tokens": 43222916.0, + "step": 163 + }, + { + "epoch": 0.016384434786952395, + "grad_norm": 2.6525440216064453, + "learning_rate": 5.415282392026578e-06, + "loss": 0.553, + "mean_token_accuracy": 0.8972665667533875, + "num_tokens": 43488930.0, + "step": 164 + }, + { + "epoch": 0.016484339877116738, + "grad_norm": 4.2710795402526855, + "learning_rate": 5.448504983388705e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.8932741284370422, + "num_tokens": 43755136.0, + "step": 165 + }, + { + "epoch": 0.016584244967281083, + "grad_norm": 2.794689416885376, + "learning_rate": 5.481727574750831e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.8974431157112122, + "num_tokens": 44019910.0, + "step": 166 + }, + { + "epoch": 0.016684150057445425, + "grad_norm": 2.6774673461914062, + "learning_rate": 5.5149501661129565e-06, + "loss": 0.5545, + "mean_token_accuracy": 0.8949822187423706, + "num_tokens": 44283406.0, + "step": 167 + }, + { + "epoch": 0.01678405514760977, + "grad_norm": 6.112437725067139, + "learning_rate": 5.548172757475083e-06, + "loss": 0.5471, + "mean_token_accuracy": 0.8961467742919922, + "num_tokens": 44550295.0, + "step": 168 + }, + { + "epoch": 0.016883960237774116, + "grad_norm": 2.5542187690734863, + "learning_rate": 5.58139534883721e-06, + "loss": 0.5442, + "mean_token_accuracy": 0.897214263677597, + "num_tokens": 44807205.0, + "step": 169 + }, + { + "epoch": 0.01698386532793846, + "grad_norm": 1.707021713256836, + "learning_rate": 5.614617940199336e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.8945862650871277, + "num_tokens": 45074551.0, + "step": 170 + }, + { + "epoch": 0.017083770418102804, + "grad_norm": 2.6763174533843994, + "learning_rate": 5.647840531561463e-06, + "loss": 0.54, + "mean_token_accuracy": 0.8999200761318207, + "num_tokens": 45337851.0, + "step": 171 + }, + { + "epoch": 0.017183675508267146, + "grad_norm": 4.333861351013184, + "learning_rate": 5.681063122923588e-06, + "loss": 0.5437, + "mean_token_accuracy": 0.898822695016861, + "num_tokens": 45608434.0, + "step": 172 + }, + { + "epoch": 0.01728358059843149, + "grad_norm": 2.950639486312866, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.5461, + "mean_token_accuracy": 0.8982053697109222, + "num_tokens": 45877416.0, + "step": 173 + }, + { + "epoch": 0.017383485688595834, + "grad_norm": 6.19303035736084, + "learning_rate": 5.747508305647841e-06, + "loss": 0.5395, + "mean_token_accuracy": 0.8968684375286102, + "num_tokens": 46148646.0, + "step": 174 + }, + { + "epoch": 0.01748339077876018, + "grad_norm": 9.152772903442383, + "learning_rate": 5.780730897009967e-06, + "loss": 0.5515, + "mean_token_accuracy": 0.8983980119228363, + "num_tokens": 46417576.0, + "step": 175 + }, + { + "epoch": 0.01758329586892452, + "grad_norm": 7.762626647949219, + "learning_rate": 5.8139534883720935e-06, + "loss": 0.5499, + "mean_token_accuracy": 0.8982058465480804, + "num_tokens": 46695591.0, + "step": 176 + }, + { + "epoch": 0.017683200959088867, + "grad_norm": 4.6273016929626465, + "learning_rate": 5.847176079734221e-06, + "loss": 0.5498, + "mean_token_accuracy": 0.8977615535259247, + "num_tokens": 46969551.0, + "step": 177 + }, + { + "epoch": 0.01778310604925321, + "grad_norm": 3.151172161102295, + "learning_rate": 5.880398671096345e-06, + "loss": 0.541, + "mean_token_accuracy": 0.9003558158874512, + "num_tokens": 47240738.0, + "step": 178 + }, + { + "epoch": 0.017883011139417555, + "grad_norm": 3.9313786029815674, + "learning_rate": 5.9136212624584725e-06, + "loss": 0.539, + "mean_token_accuracy": 0.8965206444263458, + "num_tokens": 47501732.0, + "step": 179 + }, + { + "epoch": 0.017982916229581897, + "grad_norm": 8.608522415161133, + "learning_rate": 5.946843853820599e-06, + "loss": 0.5449, + "mean_token_accuracy": 0.8974732756614685, + "num_tokens": 47778813.0, + "step": 180 + }, + { + "epoch": 0.018082821319746242, + "grad_norm": 2.205305576324463, + "learning_rate": 5.980066445182725e-06, + "loss": 0.54, + "mean_token_accuracy": 0.8964247405529022, + "num_tokens": 48039208.0, + "step": 181 + }, + { + "epoch": 0.018182726409910584, + "grad_norm": 4.633362770080566, + "learning_rate": 6.0132890365448515e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.8992270529270172, + "num_tokens": 48298671.0, + "step": 182 + }, + { + "epoch": 0.01828263150007493, + "grad_norm": 4.872408866882324, + "learning_rate": 6.046511627906977e-06, + "loss": 0.5427, + "mean_token_accuracy": 0.896783858537674, + "num_tokens": 48561034.0, + "step": 183 + }, + { + "epoch": 0.018382536590239272, + "grad_norm": 19.151906967163086, + "learning_rate": 6.079734219269103e-06, + "loss": 0.545, + "mean_token_accuracy": 0.8956416249275208, + "num_tokens": 48818444.0, + "step": 184 + }, + { + "epoch": 0.018482441680403618, + "grad_norm": 1.9246405363082886, + "learning_rate": 6.11295681063123e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.8959218561649323, + "num_tokens": 49082938.0, + "step": 185 + }, + { + "epoch": 0.01858234677056796, + "grad_norm": 1.6679128408432007, + "learning_rate": 6.146179401993356e-06, + "loss": 0.5375, + "mean_token_accuracy": 0.8971186876296997, + "num_tokens": 49354779.0, + "step": 186 + }, + { + "epoch": 0.018682251860732305, + "grad_norm": 1.9116359949111938, + "learning_rate": 6.179401993355482e-06, + "loss": 0.5376, + "mean_token_accuracy": 0.9012064933776855, + "num_tokens": 49624106.0, + "step": 187 + }, + { + "epoch": 0.018782156950896647, + "grad_norm": 7.084646701812744, + "learning_rate": 6.212624584717608e-06, + "loss": 0.5388, + "mean_token_accuracy": 0.9007435739040375, + "num_tokens": 49884616.0, + "step": 188 + }, + { + "epoch": 0.018882062041060993, + "grad_norm": 2.26000714302063, + "learning_rate": 6.245847176079734e-06, + "loss": 0.5416, + "mean_token_accuracy": 0.8985685706138611, + "num_tokens": 50153662.0, + "step": 189 + }, + { + "epoch": 0.018981967131225335, + "grad_norm": 2.452465534210205, + "learning_rate": 6.279069767441861e-06, + "loss": 0.5412, + "mean_token_accuracy": 0.9004049897193909, + "num_tokens": 50409854.0, + "step": 190 + }, + { + "epoch": 0.01908187222138968, + "grad_norm": 3.334059000015259, + "learning_rate": 6.312292358803988e-06, + "loss": 0.5377, + "mean_token_accuracy": 0.9027298986911774, + "num_tokens": 50675028.0, + "step": 191 + }, + { + "epoch": 0.019181777311554023, + "grad_norm": 1.9043229818344116, + "learning_rate": 6.345514950166114e-06, + "loss": 0.5391, + "mean_token_accuracy": 0.9001175463199615, + "num_tokens": 50936093.0, + "step": 192 + }, + { + "epoch": 0.01928168240171837, + "grad_norm": 2.539210557937622, + "learning_rate": 6.3787375415282395e-06, + "loss": 0.5387, + "mean_token_accuracy": 0.8995794653892517, + "num_tokens": 51202663.0, + "step": 193 + }, + { + "epoch": 0.01938158749188271, + "grad_norm": 2.0801281929016113, + "learning_rate": 6.411960132890366e-06, + "loss": 0.5405, + "mean_token_accuracy": 0.8981446623802185, + "num_tokens": 51462452.0, + "step": 194 + }, + { + "epoch": 0.019481492582047056, + "grad_norm": 1.2673317193984985, + "learning_rate": 6.445182724252492e-06, + "loss": 0.5388, + "mean_token_accuracy": 0.8961288332939148, + "num_tokens": 51723262.0, + "step": 195 + }, + { + "epoch": 0.019581397672211398, + "grad_norm": 3.4383647441864014, + "learning_rate": 6.4784053156146185e-06, + "loss": 0.5418, + "mean_token_accuracy": 0.8978481590747833, + "num_tokens": 51990843.0, + "step": 196 + }, + { + "epoch": 0.019681302762375744, + "grad_norm": 2.23270845413208, + "learning_rate": 6.511627906976745e-06, + "loss": 0.5328, + "mean_token_accuracy": 0.8985138237476349, + "num_tokens": 52255834.0, + "step": 197 + }, + { + "epoch": 0.019781207852540086, + "grad_norm": 2.544766426086426, + "learning_rate": 6.54485049833887e-06, + "loss": 0.5337, + "mean_token_accuracy": 0.9026228487491608, + "num_tokens": 52530902.0, + "step": 198 + }, + { + "epoch": 0.01988111294270443, + "grad_norm": 2.2225232124328613, + "learning_rate": 6.578073089700997e-06, + "loss": 0.5376, + "mean_token_accuracy": 0.8986892700195312, + "num_tokens": 52791761.0, + "step": 199 + }, + { + "epoch": 0.019981018032868773, + "grad_norm": 2.544659376144409, + "learning_rate": 6.611295681063124e-06, + "loss": 0.5389, + "mean_token_accuracy": 0.9008254110813141, + "num_tokens": 53055931.0, + "step": 200 + }, + { + "epoch": 0.02008092312303312, + "grad_norm": 4.479158401489258, + "learning_rate": 6.64451827242525e-06, + "loss": 0.5437, + "mean_token_accuracy": 0.8953225314617157, + "num_tokens": 53324159.0, + "step": 201 + }, + { + "epoch": 0.02018082821319746, + "grad_norm": 2.7528769969940186, + "learning_rate": 6.6777408637873766e-06, + "loss": 0.5432, + "mean_token_accuracy": 0.8958341777324677, + "num_tokens": 53583255.0, + "step": 202 + }, + { + "epoch": 0.020280733303361807, + "grad_norm": 6.200131416320801, + "learning_rate": 6.710963455149502e-06, + "loss": 0.5441, + "mean_token_accuracy": 0.8975926339626312, + "num_tokens": 53849628.0, + "step": 203 + }, + { + "epoch": 0.02038063839352615, + "grad_norm": 2.273153066635132, + "learning_rate": 6.744186046511628e-06, + "loss": 0.5408, + "mean_token_accuracy": 0.8991481363773346, + "num_tokens": 54110118.0, + "step": 204 + }, + { + "epoch": 0.020480543483690494, + "grad_norm": 1.463924765586853, + "learning_rate": 6.777408637873755e-06, + "loss": 0.5386, + "mean_token_accuracy": 0.899332731962204, + "num_tokens": 54367465.0, + "step": 205 + }, + { + "epoch": 0.020580448573854836, + "grad_norm": 1.6965149641036987, + "learning_rate": 6.810631229235881e-06, + "loss": 0.5379, + "mean_token_accuracy": 0.8979399502277374, + "num_tokens": 54629276.0, + "step": 206 + }, + { + "epoch": 0.020680353664019182, + "grad_norm": 3.383091926574707, + "learning_rate": 6.843853820598007e-06, + "loss": 0.5353, + "mean_token_accuracy": 0.9003303050994873, + "num_tokens": 54903787.0, + "step": 207 + }, + { + "epoch": 0.020780258754183524, + "grad_norm": 1.7195576429367065, + "learning_rate": 6.877076411960133e-06, + "loss": 0.5428, + "mean_token_accuracy": 0.8978247344493866, + "num_tokens": 55162832.0, + "step": 208 + }, + { + "epoch": 0.02088016384434787, + "grad_norm": 2.5698046684265137, + "learning_rate": 6.910299003322259e-06, + "loss": 0.5409, + "mean_token_accuracy": 0.8996523916721344, + "num_tokens": 55420556.0, + "step": 209 + }, + { + "epoch": 0.02098006893451221, + "grad_norm": 1.8306021690368652, + "learning_rate": 6.9435215946843855e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.9000126123428345, + "num_tokens": 55689977.0, + "step": 210 + }, + { + "epoch": 0.021079974024676557, + "grad_norm": 3.652418851852417, + "learning_rate": 6.976744186046513e-06, + "loss": 0.5376, + "mean_token_accuracy": 0.8974618017673492, + "num_tokens": 55950204.0, + "step": 211 + }, + { + "epoch": 0.021179879114840903, + "grad_norm": 1.9224114418029785, + "learning_rate": 7.009966777408639e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.9000983834266663, + "num_tokens": 56220463.0, + "step": 212 + }, + { + "epoch": 0.021279784205005245, + "grad_norm": 2.942293882369995, + "learning_rate": 7.0431893687707646e-06, + "loss": 0.5396, + "mean_token_accuracy": 0.8967028856277466, + "num_tokens": 56487673.0, + "step": 213 + }, + { + "epoch": 0.02137968929516959, + "grad_norm": 3.202317953109741, + "learning_rate": 7.076411960132891e-06, + "loss": 0.5362, + "mean_token_accuracy": 0.90107461810112, + "num_tokens": 56752153.0, + "step": 214 + }, + { + "epoch": 0.021479594385333933, + "grad_norm": 2.2774498462677, + "learning_rate": 7.109634551495017e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.8983028829097748, + "num_tokens": 57005401.0, + "step": 215 + }, + { + "epoch": 0.021579499475498278, + "grad_norm": 1.686529517173767, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.5342, + "mean_token_accuracy": 0.8994486033916473, + "num_tokens": 57271198.0, + "step": 216 + }, + { + "epoch": 0.02167940456566262, + "grad_norm": 2.3411271572113037, + "learning_rate": 7.17607973421927e-06, + "loss": 0.5301, + "mean_token_accuracy": 0.8995662033557892, + "num_tokens": 57536299.0, + "step": 217 + }, + { + "epoch": 0.021779309655826966, + "grad_norm": 2.049517869949341, + "learning_rate": 7.209302325581395e-06, + "loss": 0.5413, + "mean_token_accuracy": 0.8988032042980194, + "num_tokens": 57809535.0, + "step": 218 + }, + { + "epoch": 0.021879214745991308, + "grad_norm": 4.220499038696289, + "learning_rate": 7.242524916943522e-06, + "loss": 0.54, + "mean_token_accuracy": 0.8980070352554321, + "num_tokens": 58086513.0, + "step": 219 + }, + { + "epoch": 0.021979119836155653, + "grad_norm": 3.9438488483428955, + "learning_rate": 7.275747508305648e-06, + "loss": 0.5515, + "mean_token_accuracy": 0.8950827419757843, + "num_tokens": 58357427.0, + "step": 220 + }, + { + "epoch": 0.022079024926319996, + "grad_norm": 11.791813850402832, + "learning_rate": 7.308970099667775e-06, + "loss": 0.5395, + "mean_token_accuracy": 0.8974934816360474, + "num_tokens": 58622349.0, + "step": 221 + }, + { + "epoch": 0.02217893001648434, + "grad_norm": 3.280186653137207, + "learning_rate": 7.342192691029902e-06, + "loss": 0.5415, + "mean_token_accuracy": 0.9001385569572449, + "num_tokens": 58889391.0, + "step": 222 + }, + { + "epoch": 0.022278835106648683, + "grad_norm": 2.6358444690704346, + "learning_rate": 7.375415282392027e-06, + "loss": 0.5352, + "mean_token_accuracy": 0.8982924222946167, + "num_tokens": 59147580.0, + "step": 223 + }, + { + "epoch": 0.02237874019681303, + "grad_norm": 4.82296085357666, + "learning_rate": 7.408637873754153e-06, + "loss": 0.5449, + "mean_token_accuracy": 0.8987541794776917, + "num_tokens": 59421063.0, + "step": 224 + }, + { + "epoch": 0.02247864528697737, + "grad_norm": 11.059097290039062, + "learning_rate": 7.44186046511628e-06, + "loss": 0.5372, + "mean_token_accuracy": 0.8977135717868805, + "num_tokens": 59681655.0, + "step": 225 + }, + { + "epoch": 0.022578550377141716, + "grad_norm": 3.6797680854797363, + "learning_rate": 7.475083056478406e-06, + "loss": 0.5327, + "mean_token_accuracy": 0.8982837498188019, + "num_tokens": 59939228.0, + "step": 226 + }, + { + "epoch": 0.02267845546730606, + "grad_norm": 3.6843626499176025, + "learning_rate": 7.508305647840532e-06, + "loss": 0.537, + "mean_token_accuracy": 0.8948313295841217, + "num_tokens": 60203812.0, + "step": 227 + }, + { + "epoch": 0.022778360557470404, + "grad_norm": 1.8683735132217407, + "learning_rate": 7.541528239202659e-06, + "loss": 0.537, + "mean_token_accuracy": 0.8961237370967865, + "num_tokens": 60465929.0, + "step": 228 + }, + { + "epoch": 0.022878265647634746, + "grad_norm": 3.125511646270752, + "learning_rate": 7.574750830564784e-06, + "loss": 0.5336, + "mean_token_accuracy": 0.8998444676399231, + "num_tokens": 60725447.0, + "step": 229 + }, + { + "epoch": 0.022978170737799092, + "grad_norm": 18.256074905395508, + "learning_rate": 7.6079734219269106e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.8994393646717072, + "num_tokens": 60989571.0, + "step": 230 + }, + { + "epoch": 0.023078075827963434, + "grad_norm": 4.146905422210693, + "learning_rate": 7.641196013289037e-06, + "loss": 0.5345, + "mean_token_accuracy": 0.9008025825023651, + "num_tokens": 61252589.0, + "step": 231 + }, + { + "epoch": 0.02317798091812778, + "grad_norm": 2.2567057609558105, + "learning_rate": 7.674418604651164e-06, + "loss": 0.5295, + "mean_token_accuracy": 0.9000889956951141, + "num_tokens": 61514954.0, + "step": 232 + }, + { + "epoch": 0.02327788600829212, + "grad_norm": 4.815765857696533, + "learning_rate": 7.70764119601329e-06, + "loss": 0.5392, + "mean_token_accuracy": 0.8986950218677521, + "num_tokens": 61781439.0, + "step": 233 + }, + { + "epoch": 0.023377791098456467, + "grad_norm": 1.9264072179794312, + "learning_rate": 7.740863787375415e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.8984262943267822, + "num_tokens": 62039601.0, + "step": 234 + }, + { + "epoch": 0.02347769618862081, + "grad_norm": 5.12638521194458, + "learning_rate": 7.774086378737542e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.9006338119506836, + "num_tokens": 62311989.0, + "step": 235 + }, + { + "epoch": 0.023577601278785155, + "grad_norm": 1.6448711156845093, + "learning_rate": 7.807308970099668e-06, + "loss": 0.5354, + "mean_token_accuracy": 0.8994677662849426, + "num_tokens": 62557924.0, + "step": 236 + }, + { + "epoch": 0.023677506368949497, + "grad_norm": 1.7197799682617188, + "learning_rate": 7.840531561461795e-06, + "loss": 0.5353, + "mean_token_accuracy": 0.9006043374538422, + "num_tokens": 62814112.0, + "step": 237 + }, + { + "epoch": 0.023777411459113842, + "grad_norm": 2.7424752712249756, + "learning_rate": 7.873754152823922e-06, + "loss": 0.5368, + "mean_token_accuracy": 0.8999449908733368, + "num_tokens": 63076823.0, + "step": 238 + }, + { + "epoch": 0.023877316549278185, + "grad_norm": 3.2354328632354736, + "learning_rate": 7.906976744186048e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.8983865082263947, + "num_tokens": 63339716.0, + "step": 239 + }, + { + "epoch": 0.02397722163944253, + "grad_norm": 1.2640604972839355, + "learning_rate": 7.940199335548173e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8999717533588409, + "num_tokens": 63605346.0, + "step": 240 + }, + { + "epoch": 0.024077126729606872, + "grad_norm": 1.9379035234451294, + "learning_rate": 7.9734219269103e-06, + "loss": 0.5396, + "mean_token_accuracy": 0.8977775573730469, + "num_tokens": 63871137.0, + "step": 241 + }, + { + "epoch": 0.024177031819771218, + "grad_norm": 1.2289472818374634, + "learning_rate": 8.006644518272426e-06, + "loss": 0.5392, + "mean_token_accuracy": 0.8984501957893372, + "num_tokens": 64141911.0, + "step": 242 + }, + { + "epoch": 0.02427693690993556, + "grad_norm": 1.6378623247146606, + "learning_rate": 8.039867109634553e-06, + "loss": 0.5291, + "mean_token_accuracy": 0.90381720662117, + "num_tokens": 64410393.0, + "step": 243 + }, + { + "epoch": 0.024376842000099905, + "grad_norm": 1.517802119255066, + "learning_rate": 8.073089700996678e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.9010347127914429, + "num_tokens": 64681092.0, + "step": 244 + }, + { + "epoch": 0.024476747090264248, + "grad_norm": 1.0437833070755005, + "learning_rate": 8.106312292358804e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.9025360941886902, + "num_tokens": 64951512.0, + "step": 245 + }, + { + "epoch": 0.024576652180428593, + "grad_norm": 1.3692548274993896, + "learning_rate": 8.139534883720931e-06, + "loss": 0.5351, + "mean_token_accuracy": 0.9013669788837433, + "num_tokens": 65226726.0, + "step": 246 + }, + { + "epoch": 0.024676557270592935, + "grad_norm": 1.3844069242477417, + "learning_rate": 8.172757475083057e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8988857269287109, + "num_tokens": 65482426.0, + "step": 247 + }, + { + "epoch": 0.02477646236075728, + "grad_norm": 1.7387361526489258, + "learning_rate": 8.205980066445184e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.8978105783462524, + "num_tokens": 65743669.0, + "step": 248 + }, + { + "epoch": 0.024876367450921623, + "grad_norm": 2.5658299922943115, + "learning_rate": 8.23920265780731e-06, + "loss": 0.5408, + "mean_token_accuracy": 0.8962993621826172, + "num_tokens": 65990310.0, + "step": 249 + }, + { + "epoch": 0.02497627254108597, + "grad_norm": 1.4570274353027344, + "learning_rate": 8.272425249169436e-06, + "loss": 0.533, + "mean_token_accuracy": 0.8999358415603638, + "num_tokens": 66246549.0, + "step": 250 + }, + { + "epoch": 0.02507617763125031, + "grad_norm": 2.2563259601593018, + "learning_rate": 8.305647840531562e-06, + "loss": 0.543, + "mean_token_accuracy": 0.8959168493747711, + "num_tokens": 66505889.0, + "step": 251 + }, + { + "epoch": 0.025176082721414656, + "grad_norm": 1.2261892557144165, + "learning_rate": 8.338870431893689e-06, + "loss": 0.5389, + "mean_token_accuracy": 0.8986565470695496, + "num_tokens": 66770765.0, + "step": 252 + }, + { + "epoch": 0.025275987811579, + "grad_norm": 1.4537079334259033, + "learning_rate": 8.372093023255815e-06, + "loss": 0.5338, + "mean_token_accuracy": 0.9000676572322845, + "num_tokens": 67047123.0, + "step": 253 + }, + { + "epoch": 0.025375892901743344, + "grad_norm": 1.4453496932983398, + "learning_rate": 8.40531561461794e-06, + "loss": 0.5411, + "mean_token_accuracy": 0.8987282514572144, + "num_tokens": 67311539.0, + "step": 254 + }, + { + "epoch": 0.02547579799190769, + "grad_norm": 1.4592816829681396, + "learning_rate": 8.438538205980067e-06, + "loss": 0.5371, + "mean_token_accuracy": 0.8984167277812958, + "num_tokens": 67571172.0, + "step": 255 + }, + { + "epoch": 0.02557570308207203, + "grad_norm": 1.7796326875686646, + "learning_rate": 8.471760797342193e-06, + "loss": 0.5341, + "mean_token_accuracy": 0.8995403945446014, + "num_tokens": 67834528.0, + "step": 256 + }, + { + "epoch": 0.025675608172236377, + "grad_norm": 1.3002378940582275, + "learning_rate": 8.50498338870432e-06, + "loss": 0.5495, + "mean_token_accuracy": 0.8989010751247406, + "num_tokens": 68106355.0, + "step": 257 + }, + { + "epoch": 0.02577551326240072, + "grad_norm": 1.580952763557434, + "learning_rate": 8.538205980066447e-06, + "loss": 0.5404, + "mean_token_accuracy": 0.8991585373878479, + "num_tokens": 68363888.0, + "step": 258 + }, + { + "epoch": 0.025875418352565065, + "grad_norm": 2.470028877258301, + "learning_rate": 8.571428571428571e-06, + "loss": 0.5349, + "mean_token_accuracy": 0.8996194005012512, + "num_tokens": 68645103.0, + "step": 259 + }, + { + "epoch": 0.025975323442729407, + "grad_norm": 2.768704414367676, + "learning_rate": 8.604651162790698e-06, + "loss": 0.5256, + "mean_token_accuracy": 0.9004732668399811, + "num_tokens": 68898800.0, + "step": 260 + }, + { + "epoch": 0.026075228532893752, + "grad_norm": 1.287067174911499, + "learning_rate": 8.637873754152825e-06, + "loss": 0.5436, + "mean_token_accuracy": 0.8989366888999939, + "num_tokens": 69160506.0, + "step": 261 + }, + { + "epoch": 0.026175133623058094, + "grad_norm": 3.3200857639312744, + "learning_rate": 8.67109634551495e-06, + "loss": 0.549, + "mean_token_accuracy": 0.8982260823249817, + "num_tokens": 69432594.0, + "step": 262 + }, + { + "epoch": 0.02627503871322244, + "grad_norm": 1.8411730527877808, + "learning_rate": 8.704318936877078e-06, + "loss": 0.5316, + "mean_token_accuracy": 0.9001834988594055, + "num_tokens": 69694054.0, + "step": 263 + }, + { + "epoch": 0.026374943803386782, + "grad_norm": 2.7311034202575684, + "learning_rate": 8.737541528239203e-06, + "loss": 0.5483, + "mean_token_accuracy": 0.8979843556880951, + "num_tokens": 69961139.0, + "step": 264 + }, + { + "epoch": 0.026474848893551128, + "grad_norm": 1.2144609689712524, + "learning_rate": 8.770764119601329e-06, + "loss": 0.539, + "mean_token_accuracy": 0.9005507826805115, + "num_tokens": 70218689.0, + "step": 265 + }, + { + "epoch": 0.02657475398371547, + "grad_norm": 1.1141756772994995, + "learning_rate": 8.803986710963456e-06, + "loss": 0.5379, + "mean_token_accuracy": 0.9004265666007996, + "num_tokens": 70480427.0, + "step": 266 + }, + { + "epoch": 0.026674659073879815, + "grad_norm": 1.1145318746566772, + "learning_rate": 8.837209302325582e-06, + "loss": 0.5365, + "mean_token_accuracy": 0.8993416130542755, + "num_tokens": 70750465.0, + "step": 267 + }, + { + "epoch": 0.026774564164044157, + "grad_norm": 1.14609694480896, + "learning_rate": 8.870431893687709e-06, + "loss": 0.5431, + "mean_token_accuracy": 0.9010970294475555, + "num_tokens": 71019547.0, + "step": 268 + }, + { + "epoch": 0.026874469254208503, + "grad_norm": 1.1803454160690308, + "learning_rate": 8.903654485049834e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.9009835720062256, + "num_tokens": 71287207.0, + "step": 269 + }, + { + "epoch": 0.026974374344372845, + "grad_norm": 1.0699255466461182, + "learning_rate": 8.93687707641196e-06, + "loss": 0.5409, + "mean_token_accuracy": 0.897551417350769, + "num_tokens": 71550754.0, + "step": 270 + }, + { + "epoch": 0.02707427943453719, + "grad_norm": 1.1542104482650757, + "learning_rate": 8.970099667774087e-06, + "loss": 0.531, + "mean_token_accuracy": 0.8997795879840851, + "num_tokens": 71814943.0, + "step": 271 + }, + { + "epoch": 0.027174184524701533, + "grad_norm": 1.167953610420227, + "learning_rate": 9.003322259136214e-06, + "loss": 0.5329, + "mean_token_accuracy": 0.900679886341095, + "num_tokens": 72076469.0, + "step": 272 + }, + { + "epoch": 0.02727408961486588, + "grad_norm": 1.0117502212524414, + "learning_rate": 9.03654485049834e-06, + "loss": 0.5431, + "mean_token_accuracy": 0.8967209756374359, + "num_tokens": 72338816.0, + "step": 273 + }, + { + "epoch": 0.02737399470503022, + "grad_norm": 2.6449577808380127, + "learning_rate": 9.069767441860465e-06, + "loss": 0.5408, + "mean_token_accuracy": 0.8962429165840149, + "num_tokens": 72606974.0, + "step": 274 + }, + { + "epoch": 0.027473899795194566, + "grad_norm": 1.148666501045227, + "learning_rate": 9.102990033222592e-06, + "loss": 0.5372, + "mean_token_accuracy": 0.9008255004882812, + "num_tokens": 72873899.0, + "step": 275 + }, + { + "epoch": 0.027573804885358908, + "grad_norm": 1.2291401624679565, + "learning_rate": 9.136212624584718e-06, + "loss": 0.5384, + "mean_token_accuracy": 0.9001466631889343, + "num_tokens": 73144970.0, + "step": 276 + }, + { + "epoch": 0.027673709975523254, + "grad_norm": 1.706945776939392, + "learning_rate": 9.169435215946845e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.9022749364376068, + "num_tokens": 73402194.0, + "step": 277 + }, + { + "epoch": 0.027773615065687596, + "grad_norm": 1.0378382205963135, + "learning_rate": 9.20265780730897e-06, + "loss": 0.5378, + "mean_token_accuracy": 0.900369793176651, + "num_tokens": 73664794.0, + "step": 278 + }, + { + "epoch": 0.02787352015585194, + "grad_norm": 0.9761217832565308, + "learning_rate": 9.235880398671098e-06, + "loss": 0.534, + "mean_token_accuracy": 0.8996143043041229, + "num_tokens": 73927770.0, + "step": 279 + }, + { + "epoch": 0.027973425246016283, + "grad_norm": 0.9541959762573242, + "learning_rate": 9.269102990033223e-06, + "loss": 0.5397, + "mean_token_accuracy": 0.8976276814937592, + "num_tokens": 74192537.0, + "step": 280 + }, + { + "epoch": 0.02807333033618063, + "grad_norm": 1.3261549472808838, + "learning_rate": 9.30232558139535e-06, + "loss": 0.5394, + "mean_token_accuracy": 0.8950575292110443, + "num_tokens": 74456211.0, + "step": 281 + }, + { + "epoch": 0.02817323542634497, + "grad_norm": 0.9239968657493591, + "learning_rate": 9.335548172757476e-06, + "loss": 0.5334, + "mean_token_accuracy": 0.9008587598800659, + "num_tokens": 74727617.0, + "step": 282 + }, + { + "epoch": 0.028273140516509317, + "grad_norm": 1.3281149864196777, + "learning_rate": 9.368770764119603e-06, + "loss": 0.5445, + "mean_token_accuracy": 0.8983404636383057, + "num_tokens": 75001228.0, + "step": 283 + }, + { + "epoch": 0.02837304560667366, + "grad_norm": 0.9862871170043945, + "learning_rate": 9.401993355481728e-06, + "loss": 0.5437, + "mean_token_accuracy": 0.8973506093025208, + "num_tokens": 75266883.0, + "step": 284 + }, + { + "epoch": 0.028472950696838004, + "grad_norm": 0.9654200673103333, + "learning_rate": 9.435215946843854e-06, + "loss": 0.5425, + "mean_token_accuracy": 0.8986485600471497, + "num_tokens": 75531432.0, + "step": 285 + }, + { + "epoch": 0.028572855787002346, + "grad_norm": 0.906114935874939, + "learning_rate": 9.468438538205981e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.898255318403244, + "num_tokens": 75788161.0, + "step": 286 + }, + { + "epoch": 0.028672760877166692, + "grad_norm": 0.8665235638618469, + "learning_rate": 9.501661129568107e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.9024683237075806, + "num_tokens": 76056365.0, + "step": 287 + }, + { + "epoch": 0.028772665967331034, + "grad_norm": 1.1349400281906128, + "learning_rate": 9.534883720930234e-06, + "loss": 0.5349, + "mean_token_accuracy": 0.9000137746334076, + "num_tokens": 76324850.0, + "step": 288 + }, + { + "epoch": 0.02887257105749538, + "grad_norm": 1.1611502170562744, + "learning_rate": 9.56810631229236e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.8999358415603638, + "num_tokens": 76584602.0, + "step": 289 + }, + { + "epoch": 0.02897247614765972, + "grad_norm": 1.406546711921692, + "learning_rate": 9.601328903654485e-06, + "loss": 0.5407, + "mean_token_accuracy": 0.8988520801067352, + "num_tokens": 76852506.0, + "step": 290 + }, + { + "epoch": 0.029072381237824067, + "grad_norm": 0.8746483325958252, + "learning_rate": 9.634551495016612e-06, + "loss": 0.5363, + "mean_token_accuracy": 0.9000678658485413, + "num_tokens": 77118948.0, + "step": 291 + }, + { + "epoch": 0.02917228632798841, + "grad_norm": 1.090211033821106, + "learning_rate": 9.66777408637874e-06, + "loss": 0.5389, + "mean_token_accuracy": 0.8990329205989838, + "num_tokens": 77393231.0, + "step": 292 + }, + { + "epoch": 0.029272191418152755, + "grad_norm": 0.8494070768356323, + "learning_rate": 9.700996677740865e-06, + "loss": 0.5282, + "mean_token_accuracy": 0.9012806713581085, + "num_tokens": 77666495.0, + "step": 293 + }, + { + "epoch": 0.029372096508317097, + "grad_norm": 1.0729533433914185, + "learning_rate": 9.734219269102992e-06, + "loss": 0.5307, + "mean_token_accuracy": 0.9007712602615356, + "num_tokens": 77933385.0, + "step": 294 + }, + { + "epoch": 0.029472001598481443, + "grad_norm": 1.4491238594055176, + "learning_rate": 9.767441860465117e-06, + "loss": 0.5344, + "mean_token_accuracy": 0.9007526636123657, + "num_tokens": 78200122.0, + "step": 295 + }, + { + "epoch": 0.029571906688645788, + "grad_norm": 1.1012029647827148, + "learning_rate": 9.800664451827243e-06, + "loss": 0.5362, + "mean_token_accuracy": 0.898694634437561, + "num_tokens": 78467734.0, + "step": 296 + }, + { + "epoch": 0.02967181177881013, + "grad_norm": 0.9329684376716614, + "learning_rate": 9.83388704318937e-06, + "loss": 0.5384, + "mean_token_accuracy": 0.8998262286186218, + "num_tokens": 78732409.0, + "step": 297 + }, + { + "epoch": 0.029771716868974476, + "grad_norm": 1.1532882452011108, + "learning_rate": 9.867109634551495e-06, + "loss": 0.5385, + "mean_token_accuracy": 0.8998410105705261, + "num_tokens": 79000467.0, + "step": 298 + }, + { + "epoch": 0.029871621959138818, + "grad_norm": 1.3990613222122192, + "learning_rate": 9.900332225913623e-06, + "loss": 0.5398, + "mean_token_accuracy": 0.8994455933570862, + "num_tokens": 79272082.0, + "step": 299 + }, + { + "epoch": 0.029971527049303164, + "grad_norm": 1.223876714706421, + "learning_rate": 9.933554817275748e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.8997305333614349, + "num_tokens": 79544069.0, + "step": 300 + }, + { + "epoch": 0.030071432139467506, + "grad_norm": 1.120954155921936, + "learning_rate": 9.966777408637874e-06, + "loss": 0.5386, + "mean_token_accuracy": 0.8992290496826172, + "num_tokens": 79808779.0, + "step": 301 + }, + { + "epoch": 0.03017133722963185, + "grad_norm": 1.5120283365249634, + "learning_rate": 1e-05, + "loss": 0.5332, + "mean_token_accuracy": 0.897520512342453, + "num_tokens": 80069190.0, + "step": 302 + }, + { + "epoch": 0.030271242319796193, + "grad_norm": 1.2026379108428955, + "learning_rate": 9.999999738247555e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.9003824591636658, + "num_tokens": 80334540.0, + "step": 303 + }, + { + "epoch": 0.03037114740996054, + "grad_norm": 0.8970181941986084, + "learning_rate": 9.999998952990247e-06, + "loss": 0.5368, + "mean_token_accuracy": 0.8966312110424042, + "num_tokens": 80600962.0, + "step": 304 + }, + { + "epoch": 0.03047105250012488, + "grad_norm": 1.167366623878479, + "learning_rate": 9.999997644228155e-06, + "loss": 0.5353, + "mean_token_accuracy": 0.9009692370891571, + "num_tokens": 80866411.0, + "step": 305 + }, + { + "epoch": 0.030570957590289227, + "grad_norm": 1.7419533729553223, + "learning_rate": 9.999995811961418e-06, + "loss": 0.5401, + "mean_token_accuracy": 0.8973059356212616, + "num_tokens": 81128979.0, + "step": 306 + }, + { + "epoch": 0.03067086268045357, + "grad_norm": 1.5084632635116577, + "learning_rate": 9.99999345619023e-06, + "loss": 0.5304, + "mean_token_accuracy": 0.9010638296604156, + "num_tokens": 81399711.0, + "step": 307 + }, + { + "epoch": 0.030770767770617914, + "grad_norm": 1.0181889533996582, + "learning_rate": 9.999990576914835e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.9003554880619049, + "num_tokens": 81663556.0, + "step": 308 + }, + { + "epoch": 0.030870672860782256, + "grad_norm": 1.5539512634277344, + "learning_rate": 9.999987174135537e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.8973708152770996, + "num_tokens": 81928340.0, + "step": 309 + }, + { + "epoch": 0.030970577950946602, + "grad_norm": 1.2207074165344238, + "learning_rate": 9.999983247852688e-06, + "loss": 0.541, + "mean_token_accuracy": 0.8982049226760864, + "num_tokens": 82188391.0, + "step": 310 + }, + { + "epoch": 0.031070483041110944, + "grad_norm": 2.265333652496338, + "learning_rate": 9.999978798066705e-06, + "loss": 0.5376, + "mean_token_accuracy": 0.897799015045166, + "num_tokens": 82454140.0, + "step": 311 + }, + { + "epoch": 0.03117038813127529, + "grad_norm": 1.186658501625061, + "learning_rate": 9.999973824778048e-06, + "loss": 0.5344, + "mean_token_accuracy": 0.9013415277004242, + "num_tokens": 82723519.0, + "step": 312 + }, + { + "epoch": 0.031270293221439635, + "grad_norm": 0.8635936975479126, + "learning_rate": 9.999968327987242e-06, + "loss": 0.5314, + "mean_token_accuracy": 0.8985745906829834, + "num_tokens": 82988517.0, + "step": 313 + }, + { + "epoch": 0.031370198311603974, + "grad_norm": 0.9559968709945679, + "learning_rate": 9.999962307694859e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.9036704301834106, + "num_tokens": 83248676.0, + "step": 314 + }, + { + "epoch": 0.03147010340176832, + "grad_norm": 0.8834469318389893, + "learning_rate": 9.999955763901532e-06, + "loss": 0.5347, + "mean_token_accuracy": 0.897765725851059, + "num_tokens": 83516824.0, + "step": 315 + }, + { + "epoch": 0.031570008491932665, + "grad_norm": 1.5610493421554565, + "learning_rate": 9.999948696607946e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.9023308455944061, + "num_tokens": 83789868.0, + "step": 316 + }, + { + "epoch": 0.03166991358209701, + "grad_norm": 1.1661590337753296, + "learning_rate": 9.99994110581484e-06, + "loss": 0.5241, + "mean_token_accuracy": 0.9006113111972809, + "num_tokens": 84053526.0, + "step": 317 + }, + { + "epoch": 0.03176981867226135, + "grad_norm": 0.9711254835128784, + "learning_rate": 9.999932991523009e-06, + "loss": 0.5374, + "mean_token_accuracy": 0.8989990055561066, + "num_tokens": 84320431.0, + "step": 318 + }, + { + "epoch": 0.031869723762425695, + "grad_norm": 0.9321589469909668, + "learning_rate": 9.999924353733303e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.9013648629188538, + "num_tokens": 84588957.0, + "step": 319 + }, + { + "epoch": 0.03196962885259004, + "grad_norm": 0.8733459115028381, + "learning_rate": 9.999915192446626e-06, + "loss": 0.525, + "mean_token_accuracy": 0.9001585245132446, + "num_tokens": 84847287.0, + "step": 320 + }, + { + "epoch": 0.032069533942754386, + "grad_norm": 0.9093964099884033, + "learning_rate": 9.999905507663936e-06, + "loss": 0.5311, + "mean_token_accuracy": 0.8978051543235779, + "num_tokens": 85118654.0, + "step": 321 + }, + { + "epoch": 0.032169439032918724, + "grad_norm": 0.846508264541626, + "learning_rate": 9.999895299386248e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.8997663855552673, + "num_tokens": 85397282.0, + "step": 322 + }, + { + "epoch": 0.03226934412308307, + "grad_norm": 1.200446367263794, + "learning_rate": 9.999884567614634e-06, + "loss": 0.5301, + "mean_token_accuracy": 0.8983258008956909, + "num_tokens": 85658184.0, + "step": 323 + }, + { + "epoch": 0.032369249213247415, + "grad_norm": 0.9146994352340698, + "learning_rate": 9.99987331235021e-06, + "loss": 0.5303, + "mean_token_accuracy": 0.8968923687934875, + "num_tokens": 85934427.0, + "step": 324 + }, + { + "epoch": 0.03246915430341176, + "grad_norm": 0.9827555418014526, + "learning_rate": 9.999861533594162e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.898957759141922, + "num_tokens": 86204071.0, + "step": 325 + }, + { + "epoch": 0.0325690593935761, + "grad_norm": 0.8164798617362976, + "learning_rate": 9.99984923134772e-06, + "loss": 0.5359, + "mean_token_accuracy": 0.900073766708374, + "num_tokens": 86467772.0, + "step": 326 + }, + { + "epoch": 0.032668964483740445, + "grad_norm": 1.2717024087905884, + "learning_rate": 9.999836405612173e-06, + "loss": 0.5321, + "mean_token_accuracy": 0.8981437087059021, + "num_tokens": 86725556.0, + "step": 327 + }, + { + "epoch": 0.03276886957390479, + "grad_norm": 0.8449565768241882, + "learning_rate": 9.999823056388862e-06, + "loss": 0.522, + "mean_token_accuracy": 0.9040334522724152, + "num_tokens": 86988974.0, + "step": 328 + }, + { + "epoch": 0.032868774664069136, + "grad_norm": 0.7429953217506409, + "learning_rate": 9.999809183679186e-06, + "loss": 0.5354, + "mean_token_accuracy": 0.8974517583847046, + "num_tokens": 87260047.0, + "step": 329 + }, + { + "epoch": 0.032968679754233475, + "grad_norm": 0.7189333438873291, + "learning_rate": 9.999794787484599e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.899368017911911, + "num_tokens": 87528795.0, + "step": 330 + }, + { + "epoch": 0.03306858484439782, + "grad_norm": 0.8288807272911072, + "learning_rate": 9.999779867806604e-06, + "loss": 0.53, + "mean_token_accuracy": 0.8965699672698975, + "num_tokens": 87797273.0, + "step": 331 + }, + { + "epoch": 0.033168489934562166, + "grad_norm": 0.806265115737915, + "learning_rate": 9.999764424646768e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8963014781475067, + "num_tokens": 88067603.0, + "step": 332 + }, + { + "epoch": 0.03326839502472651, + "grad_norm": 0.9729634523391724, + "learning_rate": 9.999748458006705e-06, + "loss": 0.5453, + "mean_token_accuracy": 0.8989296555519104, + "num_tokens": 88344454.0, + "step": 333 + }, + { + "epoch": 0.03336830011489085, + "grad_norm": 0.8563212752342224, + "learning_rate": 9.999731967888088e-06, + "loss": 0.5295, + "mean_token_accuracy": 0.9007050693035126, + "num_tokens": 88611644.0, + "step": 334 + }, + { + "epoch": 0.033468205205055196, + "grad_norm": 1.0024820566177368, + "learning_rate": 9.999714954292641e-06, + "loss": 0.532, + "mean_token_accuracy": 0.9006758332252502, + "num_tokens": 88881031.0, + "step": 335 + }, + { + "epoch": 0.03356811029521954, + "grad_norm": 0.73809814453125, + "learning_rate": 9.99969741722215e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8985277414321899, + "num_tokens": 89153979.0, + "step": 336 + }, + { + "epoch": 0.03366801538538389, + "grad_norm": 1.1387648582458496, + "learning_rate": 9.999679356678447e-06, + "loss": 0.5306, + "mean_token_accuracy": 0.9023995995521545, + "num_tokens": 89422274.0, + "step": 337 + }, + { + "epoch": 0.03376792047554823, + "grad_norm": 0.804765522480011, + "learning_rate": 9.999660772663425e-06, + "loss": 0.5301, + "mean_token_accuracy": 0.903612494468689, + "num_tokens": 89682716.0, + "step": 338 + }, + { + "epoch": 0.03386782556571257, + "grad_norm": 0.8349082469940186, + "learning_rate": 9.99964166517903e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.898638516664505, + "num_tokens": 89951729.0, + "step": 339 + }, + { + "epoch": 0.03396773065587692, + "grad_norm": 0.8403303623199463, + "learning_rate": 9.99962203422726e-06, + "loss": 0.5299, + "mean_token_accuracy": 0.8997751176357269, + "num_tokens": 90215518.0, + "step": 340 + }, + { + "epoch": 0.03406763574604126, + "grad_norm": 0.933017909526825, + "learning_rate": 9.999601879810172e-06, + "loss": 0.5226, + "mean_token_accuracy": 0.8996160626411438, + "num_tokens": 90481398.0, + "step": 341 + }, + { + "epoch": 0.03416754083620561, + "grad_norm": 0.901400625705719, + "learning_rate": 9.999581201929878e-06, + "loss": 0.5281, + "mean_token_accuracy": 0.9006513059139252, + "num_tokens": 90754024.0, + "step": 342 + }, + { + "epoch": 0.03426744592636995, + "grad_norm": 0.7615812420845032, + "learning_rate": 9.99956000058854e-06, + "loss": 0.5303, + "mean_token_accuracy": 0.9000217616558075, + "num_tokens": 91021265.0, + "step": 343 + }, + { + "epoch": 0.03436735101653429, + "grad_norm": 0.7642658948898315, + "learning_rate": 9.99953827578838e-06, + "loss": 0.5343, + "mean_token_accuracy": 0.9011240303516388, + "num_tokens": 91287480.0, + "step": 344 + }, + { + "epoch": 0.03446725610669864, + "grad_norm": 0.8507989645004272, + "learning_rate": 9.999516027531671e-06, + "loss": 0.5381, + "mean_token_accuracy": 0.8954927921295166, + "num_tokens": 91550705.0, + "step": 345 + }, + { + "epoch": 0.03456716119686298, + "grad_norm": 1.0094056129455566, + "learning_rate": 9.999493255820744e-06, + "loss": 0.5248, + "mean_token_accuracy": 0.9018283486366272, + "num_tokens": 91814285.0, + "step": 346 + }, + { + "epoch": 0.03466706628702732, + "grad_norm": 0.7362138032913208, + "learning_rate": 9.999469960657982e-06, + "loss": 0.5252, + "mean_token_accuracy": 0.9011112451553345, + "num_tokens": 92079511.0, + "step": 347 + }, + { + "epoch": 0.03476697137719167, + "grad_norm": 0.8857962489128113, + "learning_rate": 9.999446142045823e-06, + "loss": 0.5305, + "mean_token_accuracy": 0.89968341588974, + "num_tokens": 92349661.0, + "step": 348 + }, + { + "epoch": 0.03486687646735601, + "grad_norm": 0.90533846616745, + "learning_rate": 9.999421799986764e-06, + "loss": 0.5368, + "mean_token_accuracy": 0.8991372287273407, + "num_tokens": 92620150.0, + "step": 349 + }, + { + "epoch": 0.03496678155752036, + "grad_norm": 0.7973365783691406, + "learning_rate": 9.999396934483351e-06, + "loss": 0.5185, + "mean_token_accuracy": 0.9037493765354156, + "num_tokens": 92886936.0, + "step": 350 + }, + { + "epoch": 0.0350666866476847, + "grad_norm": 0.7930285334587097, + "learning_rate": 9.99937154553819e-06, + "loss": 0.529, + "mean_token_accuracy": 0.8986435830593109, + "num_tokens": 93162716.0, + "step": 351 + }, + { + "epoch": 0.03516659173784904, + "grad_norm": 0.8688386082649231, + "learning_rate": 9.999345633153935e-06, + "loss": 0.5238, + "mean_token_accuracy": 0.9017488360404968, + "num_tokens": 93417811.0, + "step": 352 + }, + { + "epoch": 0.03526649682801339, + "grad_norm": 0.9004324674606323, + "learning_rate": 9.999319197333304e-06, + "loss": 0.5304, + "mean_token_accuracy": 0.9007116556167603, + "num_tokens": 93688108.0, + "step": 353 + }, + { + "epoch": 0.035366401918177734, + "grad_norm": 0.7342990040779114, + "learning_rate": 9.999292238079061e-06, + "loss": 0.526, + "mean_token_accuracy": 0.9008886516094208, + "num_tokens": 93960679.0, + "step": 354 + }, + { + "epoch": 0.03546630700834207, + "grad_norm": 0.7962128520011902, + "learning_rate": 9.99926475539403e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8980855643749237, + "num_tokens": 94226055.0, + "step": 355 + }, + { + "epoch": 0.03556621209850642, + "grad_norm": 0.987658679485321, + "learning_rate": 9.999236749281089e-06, + "loss": 0.519, + "mean_token_accuracy": 0.9042068123817444, + "num_tokens": 94497521.0, + "step": 356 + }, + { + "epoch": 0.035666117188670764, + "grad_norm": 0.8201354146003723, + "learning_rate": 9.99920821974317e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.9024033844470978, + "num_tokens": 94758328.0, + "step": 357 + }, + { + "epoch": 0.03576602227883511, + "grad_norm": 0.8745734095573425, + "learning_rate": 9.999179166783259e-06, + "loss": 0.5311, + "mean_token_accuracy": 0.8993251919746399, + "num_tokens": 95016444.0, + "step": 358 + }, + { + "epoch": 0.03586592736899945, + "grad_norm": 1.0294010639190674, + "learning_rate": 9.9991495904044e-06, + "loss": 0.5328, + "mean_token_accuracy": 0.9012271761894226, + "num_tokens": 95269172.0, + "step": 359 + }, + { + "epoch": 0.03596583245916379, + "grad_norm": 1.0314713716506958, + "learning_rate": 9.999119490609688e-06, + "loss": 0.5277, + "mean_token_accuracy": 0.9013132154941559, + "num_tokens": 95531823.0, + "step": 360 + }, + { + "epoch": 0.03606573754932814, + "grad_norm": 0.8191397786140442, + "learning_rate": 9.999088867402276e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.9017006754875183, + "num_tokens": 95791927.0, + "step": 361 + }, + { + "epoch": 0.036165642639492485, + "grad_norm": 0.8690682053565979, + "learning_rate": 9.999057720785368e-06, + "loss": 0.5222, + "mean_token_accuracy": 0.898152768611908, + "num_tokens": 96057523.0, + "step": 362 + }, + { + "epoch": 0.03626554772965682, + "grad_norm": 0.8215792179107666, + "learning_rate": 9.999026050762227e-06, + "loss": 0.5304, + "mean_token_accuracy": 0.8985540866851807, + "num_tokens": 96324907.0, + "step": 363 + }, + { + "epoch": 0.03636545281982117, + "grad_norm": 0.7375494241714478, + "learning_rate": 9.998993857336167e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.9019704163074493, + "num_tokens": 96588639.0, + "step": 364 + }, + { + "epoch": 0.036465357909985514, + "grad_norm": 0.7159033417701721, + "learning_rate": 9.99896114051056e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.9007600247859955, + "num_tokens": 96855715.0, + "step": 365 + }, + { + "epoch": 0.03656526300014986, + "grad_norm": 0.7982690930366516, + "learning_rate": 9.998927900288833e-06, + "loss": 0.5317, + "mean_token_accuracy": 0.9015039205551147, + "num_tokens": 97132327.0, + "step": 366 + }, + { + "epoch": 0.0366651680903142, + "grad_norm": 0.7839443683624268, + "learning_rate": 9.998894136674464e-06, + "loss": 0.5278, + "mean_token_accuracy": 0.8992767333984375, + "num_tokens": 97398717.0, + "step": 367 + }, + { + "epoch": 0.036765073180478544, + "grad_norm": 0.7718364596366882, + "learning_rate": 9.998859849670987e-06, + "loss": 0.5264, + "mean_token_accuracy": 0.8993207216262817, + "num_tokens": 97671979.0, + "step": 368 + }, + { + "epoch": 0.03686497827064289, + "grad_norm": 2.014164924621582, + "learning_rate": 9.998825039281997e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.9010153114795685, + "num_tokens": 97940349.0, + "step": 369 + }, + { + "epoch": 0.036964883360807235, + "grad_norm": 0.8387324810028076, + "learning_rate": 9.998789705511131e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.9009349942207336, + "num_tokens": 98211896.0, + "step": 370 + }, + { + "epoch": 0.037064788450971574, + "grad_norm": 0.7812924385070801, + "learning_rate": 9.998753848362096e-06, + "loss": 0.5292, + "mean_token_accuracy": 0.9018773138523102, + "num_tokens": 98475782.0, + "step": 371 + }, + { + "epoch": 0.03716469354113592, + "grad_norm": 0.7312209606170654, + "learning_rate": 9.998717467838643e-06, + "loss": 0.5337, + "mean_token_accuracy": 0.900028258562088, + "num_tokens": 98744427.0, + "step": 372 + }, + { + "epoch": 0.037264598631300265, + "grad_norm": 0.747881293296814, + "learning_rate": 9.99868056394458e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.9023405909538269, + "num_tokens": 99010317.0, + "step": 373 + }, + { + "epoch": 0.03736450372146461, + "grad_norm": 0.6842890381813049, + "learning_rate": 9.998643136683772e-06, + "loss": 0.5293, + "mean_token_accuracy": 0.9008198082447052, + "num_tokens": 99281434.0, + "step": 374 + }, + { + "epoch": 0.03746440881162895, + "grad_norm": 0.7344777584075928, + "learning_rate": 9.998605186060138e-06, + "loss": 0.5255, + "mean_token_accuracy": 0.902273029088974, + "num_tokens": 99535392.0, + "step": 375 + }, + { + "epoch": 0.037564313901793295, + "grad_norm": 0.8973487019538879, + "learning_rate": 9.99856671207765e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.9025901854038239, + "num_tokens": 99799278.0, + "step": 376 + }, + { + "epoch": 0.03766421899195764, + "grad_norm": 0.7494037747383118, + "learning_rate": 9.99852771474034e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.8994888663291931, + "num_tokens": 100064645.0, + "step": 377 + }, + { + "epoch": 0.037764124082121986, + "grad_norm": 0.7869205474853516, + "learning_rate": 9.998488194052287e-06, + "loss": 0.5241, + "mean_token_accuracy": 0.9040055572986603, + "num_tokens": 100334297.0, + "step": 378 + }, + { + "epoch": 0.037864029172286325, + "grad_norm": 1.8634133338928223, + "learning_rate": 9.99844815001763e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.8983111083507538, + "num_tokens": 100600957.0, + "step": 379 + }, + { + "epoch": 0.03796393426245067, + "grad_norm": 1.050470232963562, + "learning_rate": 9.99840758264056e-06, + "loss": 0.532, + "mean_token_accuracy": 0.9012022614479065, + "num_tokens": 100860005.0, + "step": 380 + }, + { + "epoch": 0.038063839352615016, + "grad_norm": 0.8709249496459961, + "learning_rate": 9.99836649192533e-06, + "loss": 0.521, + "mean_token_accuracy": 0.902355968952179, + "num_tokens": 101128010.0, + "step": 381 + }, + { + "epoch": 0.03816374444277936, + "grad_norm": 0.9782320261001587, + "learning_rate": 9.998324877876237e-06, + "loss": 0.5319, + "mean_token_accuracy": 0.8986072242259979, + "num_tokens": 101391353.0, + "step": 382 + }, + { + "epoch": 0.03826364953294371, + "grad_norm": 0.7670050859451294, + "learning_rate": 9.99828274049764e-06, + "loss": 0.5268, + "mean_token_accuracy": 0.8975474834442139, + "num_tokens": 101656557.0, + "step": 383 + }, + { + "epoch": 0.038363554623108045, + "grad_norm": 1.0253820419311523, + "learning_rate": 9.99824007979395e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8991518616676331, + "num_tokens": 101914782.0, + "step": 384 + }, + { + "epoch": 0.03846345971327239, + "grad_norm": 0.7712920308113098, + "learning_rate": 9.998196895769637e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.900462806224823, + "num_tokens": 102176381.0, + "step": 385 + }, + { + "epoch": 0.03856336480343674, + "grad_norm": 0.7854849100112915, + "learning_rate": 9.998153188429216e-06, + "loss": 0.5214, + "mean_token_accuracy": 0.9052407145500183, + "num_tokens": 102437029.0, + "step": 386 + }, + { + "epoch": 0.03866326989360108, + "grad_norm": 0.6535704135894775, + "learning_rate": 9.998108957777269e-06, + "loss": 0.517, + "mean_token_accuracy": 0.9009873867034912, + "num_tokens": 102704588.0, + "step": 387 + }, + { + "epoch": 0.03876317498376542, + "grad_norm": 0.7511783838272095, + "learning_rate": 9.998064203818423e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.9029517769813538, + "num_tokens": 102986684.0, + "step": 388 + }, + { + "epoch": 0.038863080073929766, + "grad_norm": 0.8388070464134216, + "learning_rate": 9.998018926557366e-06, + "loss": 0.5254, + "mean_token_accuracy": 0.9005339741706848, + "num_tokens": 103246383.0, + "step": 389 + }, + { + "epoch": 0.03896298516409411, + "grad_norm": 0.7153409719467163, + "learning_rate": 9.997973125998837e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.9034371972084045, + "num_tokens": 103520342.0, + "step": 390 + }, + { + "epoch": 0.03906289025425846, + "grad_norm": 0.8111726641654968, + "learning_rate": 9.997926802147635e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.9007230699062347, + "num_tokens": 103792140.0, + "step": 391 + }, + { + "epoch": 0.039162795344422796, + "grad_norm": 1.0048549175262451, + "learning_rate": 9.997879955008607e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.9004335403442383, + "num_tokens": 104062935.0, + "step": 392 + }, + { + "epoch": 0.03926270043458714, + "grad_norm": 1.3446204662322998, + "learning_rate": 9.997832584586657e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.8985872566699982, + "num_tokens": 104324961.0, + "step": 393 + }, + { + "epoch": 0.03936260552475149, + "grad_norm": 1.194883942604065, + "learning_rate": 9.997784690886747e-06, + "loss": 0.5248, + "mean_token_accuracy": 0.9031952619552612, + "num_tokens": 104584038.0, + "step": 394 + }, + { + "epoch": 0.03946251061491583, + "grad_norm": 0.8315118551254272, + "learning_rate": 9.99773627391389e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.9016036689281464, + "num_tokens": 104850644.0, + "step": 395 + }, + { + "epoch": 0.03956241570508017, + "grad_norm": 1.250533938407898, + "learning_rate": 9.997687333673158e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.8990642428398132, + "num_tokens": 105120194.0, + "step": 396 + }, + { + "epoch": 0.03966232079524452, + "grad_norm": 0.8506054282188416, + "learning_rate": 9.997637870169673e-06, + "loss": 0.5297, + "mean_token_accuracy": 0.9006818532943726, + "num_tokens": 105390659.0, + "step": 397 + }, + { + "epoch": 0.03976222588540886, + "grad_norm": 2.9271957874298096, + "learning_rate": 9.997587883408611e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.9029192924499512, + "num_tokens": 105658235.0, + "step": 398 + }, + { + "epoch": 0.03986213097557321, + "grad_norm": 0.8778564929962158, + "learning_rate": 9.997537373395212e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.8986867368221283, + "num_tokens": 105918666.0, + "step": 399 + }, + { + "epoch": 0.03996203606573755, + "grad_norm": 0.8519964814186096, + "learning_rate": 9.997486340134759e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.9009114503860474, + "num_tokens": 106176381.0, + "step": 400 + }, + { + "epoch": 0.04006194115590189, + "grad_norm": 0.8624478578567505, + "learning_rate": 9.997434783632599e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.9013112187385559, + "num_tokens": 106443873.0, + "step": 401 + }, + { + "epoch": 0.04016184624606624, + "grad_norm": 1.043567180633545, + "learning_rate": 9.997382703894128e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.9033298492431641, + "num_tokens": 106697016.0, + "step": 402 + }, + { + "epoch": 0.04026175133623058, + "grad_norm": 1.0373932123184204, + "learning_rate": 9.9973301009248e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.9008671343326569, + "num_tokens": 106958494.0, + "step": 403 + }, + { + "epoch": 0.04036165642639492, + "grad_norm": 0.676684558391571, + "learning_rate": 9.997276974730121e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.9028112590312958, + "num_tokens": 107227730.0, + "step": 404 + }, + { + "epoch": 0.04046156151655927, + "grad_norm": 1.650298833847046, + "learning_rate": 9.997223325315652e-06, + "loss": 0.5259, + "mean_token_accuracy": 0.9014280140399933, + "num_tokens": 107496428.0, + "step": 405 + }, + { + "epoch": 0.04056146660672361, + "grad_norm": 0.8032082915306091, + "learning_rate": 9.997169152687016e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.9028812348842621, + "num_tokens": 107759027.0, + "step": 406 + }, + { + "epoch": 0.04066137169688796, + "grad_norm": 1.0715609788894653, + "learning_rate": 9.99711445684988e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.9033588171005249, + "num_tokens": 108020380.0, + "step": 407 + }, + { + "epoch": 0.0407612767870523, + "grad_norm": 1.0002901554107666, + "learning_rate": 9.997059237809973e-06, + "loss": 0.5277, + "mean_token_accuracy": 0.8982800245285034, + "num_tokens": 108289722.0, + "step": 408 + }, + { + "epoch": 0.04086118187721664, + "grad_norm": 0.8748329281806946, + "learning_rate": 9.997003495573073e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.8976249396800995, + "num_tokens": 108551458.0, + "step": 409 + }, + { + "epoch": 0.04096108696738099, + "grad_norm": 1.2390552759170532, + "learning_rate": 9.99694723014502e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.902109295129776, + "num_tokens": 108825355.0, + "step": 410 + }, + { + "epoch": 0.041060992057545334, + "grad_norm": 0.9568818211555481, + "learning_rate": 9.996890441531702e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8985587954521179, + "num_tokens": 109094208.0, + "step": 411 + }, + { + "epoch": 0.04116089714770967, + "grad_norm": 0.6799317002296448, + "learning_rate": 9.996833129739068e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.9016802310943604, + "num_tokens": 109370578.0, + "step": 412 + }, + { + "epoch": 0.04126080223787402, + "grad_norm": 0.8281707763671875, + "learning_rate": 9.996775294773118e-06, + "loss": 0.5267, + "mean_token_accuracy": 0.8978945314884186, + "num_tokens": 109634525.0, + "step": 413 + }, + { + "epoch": 0.041360707328038364, + "grad_norm": 0.9265066385269165, + "learning_rate": 9.996716936639905e-06, + "loss": 0.5279, + "mean_token_accuracy": 0.9000034630298615, + "num_tokens": 109902505.0, + "step": 414 + }, + { + "epoch": 0.04146061241820271, + "grad_norm": 0.7587083578109741, + "learning_rate": 9.996658055345542e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.9021679759025574, + "num_tokens": 110160580.0, + "step": 415 + }, + { + "epoch": 0.04156051750836705, + "grad_norm": 8.025537490844727, + "learning_rate": 9.996598650896191e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.901133120059967, + "num_tokens": 110421632.0, + "step": 416 + }, + { + "epoch": 0.041660422598531394, + "grad_norm": 1.037459373474121, + "learning_rate": 9.996538723298075e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.9029141962528229, + "num_tokens": 110687673.0, + "step": 417 + }, + { + "epoch": 0.04176032768869574, + "grad_norm": 0.9394749402999878, + "learning_rate": 9.996478272557465e-06, + "loss": 0.5259, + "mean_token_accuracy": 0.9023944139480591, + "num_tokens": 110956170.0, + "step": 418 + }, + { + "epoch": 0.041860232778860085, + "grad_norm": 1.0585800409317017, + "learning_rate": 9.996417298680695e-06, + "loss": 0.5283, + "mean_token_accuracy": 0.9009409248828888, + "num_tokens": 111218765.0, + "step": 419 + }, + { + "epoch": 0.04196013786902442, + "grad_norm": 1.2777636051177979, + "learning_rate": 9.996355801674145e-06, + "loss": 0.5224, + "mean_token_accuracy": 0.9023891389369965, + "num_tokens": 111482237.0, + "step": 420 + }, + { + "epoch": 0.04206004295918877, + "grad_norm": 0.8263205289840698, + "learning_rate": 9.996293781544255e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.8996022939682007, + "num_tokens": 111753469.0, + "step": 421 + }, + { + "epoch": 0.042159948049353115, + "grad_norm": 1.0318872928619385, + "learning_rate": 9.996231238297516e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.9029535055160522, + "num_tokens": 112018498.0, + "step": 422 + }, + { + "epoch": 0.04225985313951746, + "grad_norm": 0.7547467350959778, + "learning_rate": 9.996168171940482e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.8997613489627838, + "num_tokens": 112283614.0, + "step": 423 + }, + { + "epoch": 0.042359758229681806, + "grad_norm": 0.7789050340652466, + "learning_rate": 9.996104582479752e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.9002035558223724, + "num_tokens": 112546389.0, + "step": 424 + }, + { + "epoch": 0.042459663319846144, + "grad_norm": 0.8349210619926453, + "learning_rate": 9.996040469921983e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.8989750742912292, + "num_tokens": 112810176.0, + "step": 425 + }, + { + "epoch": 0.04255956841001049, + "grad_norm": 0.9414175748825073, + "learning_rate": 9.99597583427389e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.9016525447368622, + "num_tokens": 113078585.0, + "step": 426 + }, + { + "epoch": 0.042659473500174835, + "grad_norm": 0.8506211638450623, + "learning_rate": 9.995910675542243e-06, + "loss": 0.5259, + "mean_token_accuracy": 0.901216447353363, + "num_tokens": 113345726.0, + "step": 427 + }, + { + "epoch": 0.04275937859033918, + "grad_norm": 0.9154441356658936, + "learning_rate": 9.995844993733857e-06, + "loss": 0.5222, + "mean_token_accuracy": 0.9018054902553558, + "num_tokens": 113614211.0, + "step": 428 + }, + { + "epoch": 0.04285928368050352, + "grad_norm": 0.868958055973053, + "learning_rate": 9.995778788855614e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.9016129672527313, + "num_tokens": 113872484.0, + "step": 429 + }, + { + "epoch": 0.042959188770667865, + "grad_norm": 0.7964997887611389, + "learning_rate": 9.995712060914445e-06, + "loss": 0.5207, + "mean_token_accuracy": 0.9008412957191467, + "num_tokens": 114137001.0, + "step": 430 + }, + { + "epoch": 0.04305909386083221, + "grad_norm": 0.8561015725135803, + "learning_rate": 9.995644809917337e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.9002349674701691, + "num_tokens": 114403745.0, + "step": 431 + }, + { + "epoch": 0.043158998950996556, + "grad_norm": 1.475362777709961, + "learning_rate": 9.99557703587133e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.9026619493961334, + "num_tokens": 114667015.0, + "step": 432 + }, + { + "epoch": 0.043258904041160895, + "grad_norm": 1.3513208627700806, + "learning_rate": 9.99550873878352e-06, + "loss": 0.512, + "mean_token_accuracy": 0.9027556478977203, + "num_tokens": 114933407.0, + "step": 433 + }, + { + "epoch": 0.04335880913132524, + "grad_norm": 0.9064937233924866, + "learning_rate": 9.995439918661058e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.8999009728431702, + "num_tokens": 115196816.0, + "step": 434 + }, + { + "epoch": 0.043458714221489586, + "grad_norm": 0.7769956588745117, + "learning_rate": 9.995370575511151e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.9036073684692383, + "num_tokens": 115451077.0, + "step": 435 + }, + { + "epoch": 0.04355861931165393, + "grad_norm": 0.7949682474136353, + "learning_rate": 9.995300709341058e-06, + "loss": 0.522, + "mean_token_accuracy": 0.9056840240955353, + "num_tokens": 115721974.0, + "step": 436 + }, + { + "epoch": 0.04365852440181827, + "grad_norm": 0.8644154071807861, + "learning_rate": 9.995230320158092e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.8989141583442688, + "num_tokens": 115979269.0, + "step": 437 + }, + { + "epoch": 0.043758429491982616, + "grad_norm": 0.7386981844902039, + "learning_rate": 9.995159407969626e-06, + "loss": 0.519, + "mean_token_accuracy": 0.9005712270736694, + "num_tokens": 116252596.0, + "step": 438 + }, + { + "epoch": 0.04385833458214696, + "grad_norm": 0.9219418168067932, + "learning_rate": 9.995087972783084e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.901971697807312, + "num_tokens": 116521580.0, + "step": 439 + }, + { + "epoch": 0.04395823967231131, + "grad_norm": 0.8714570999145508, + "learning_rate": 9.995016014605945e-06, + "loss": 0.5252, + "mean_token_accuracy": 0.8984814882278442, + "num_tokens": 116777980.0, + "step": 440 + }, + { + "epoch": 0.044058144762475646, + "grad_norm": 0.9194080233573914, + "learning_rate": 9.994943533445742e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.9024464190006256, + "num_tokens": 117046407.0, + "step": 441 + }, + { + "epoch": 0.04415804985263999, + "grad_norm": 0.8342689275741577, + "learning_rate": 9.994870529310065e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.901452898979187, + "num_tokens": 117310647.0, + "step": 442 + }, + { + "epoch": 0.04425795494280434, + "grad_norm": 0.8505469560623169, + "learning_rate": 9.994797002206558e-06, + "loss": 0.517, + "mean_token_accuracy": 0.9013265669345856, + "num_tokens": 117577476.0, + "step": 443 + }, + { + "epoch": 0.04435786003296868, + "grad_norm": 0.6402555704116821, + "learning_rate": 9.994722952142919e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.9066931903362274, + "num_tokens": 117833655.0, + "step": 444 + }, + { + "epoch": 0.04445776512313302, + "grad_norm": 0.8343015909194946, + "learning_rate": 9.9946483791269e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.9012129008769989, + "num_tokens": 118098778.0, + "step": 445 + }, + { + "epoch": 0.044557670213297366, + "grad_norm": 0.7171058058738708, + "learning_rate": 9.99457328316631e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.9030857980251312, + "num_tokens": 118369314.0, + "step": 446 + }, + { + "epoch": 0.04465757530346171, + "grad_norm": 0.6876112818717957, + "learning_rate": 9.99449766426901e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.9028674960136414, + "num_tokens": 118644298.0, + "step": 447 + }, + { + "epoch": 0.04475748039362606, + "grad_norm": 0.7702680230140686, + "learning_rate": 9.99442152244292e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.9025936126708984, + "num_tokens": 118911982.0, + "step": 448 + }, + { + "epoch": 0.044857385483790396, + "grad_norm": 0.825798511505127, + "learning_rate": 9.99434485769601e-06, + "loss": 0.515, + "mean_token_accuracy": 0.905123382806778, + "num_tokens": 119183640.0, + "step": 449 + }, + { + "epoch": 0.04495729057395474, + "grad_norm": 0.720318615436554, + "learning_rate": 9.994267670036309e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.9029909074306488, + "num_tokens": 119451385.0, + "step": 450 + }, + { + "epoch": 0.04505719566411909, + "grad_norm": 0.7323959469795227, + "learning_rate": 9.994189959471895e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.8979023694992065, + "num_tokens": 119721739.0, + "step": 451 + }, + { + "epoch": 0.04515710075428343, + "grad_norm": 0.7322348356246948, + "learning_rate": 9.994111726010909e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8990514576435089, + "num_tokens": 119979723.0, + "step": 452 + }, + { + "epoch": 0.04525700584444777, + "grad_norm": 0.6747550368309021, + "learning_rate": 9.99403296966154e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.9013518989086151, + "num_tokens": 120241968.0, + "step": 453 + }, + { + "epoch": 0.04535691093461212, + "grad_norm": 0.6736500859260559, + "learning_rate": 9.993953690432032e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.9015143811702728, + "num_tokens": 120511051.0, + "step": 454 + }, + { + "epoch": 0.04545681602477646, + "grad_norm": 0.7221965789794922, + "learning_rate": 9.993873888330688e-06, + "loss": 0.5185, + "mean_token_accuracy": 0.9009664356708527, + "num_tokens": 120774564.0, + "step": 455 + }, + { + "epoch": 0.04555672111494081, + "grad_norm": 1.2612738609313965, + "learning_rate": 9.993793563365864e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.9041774570941925, + "num_tokens": 121033247.0, + "step": 456 + }, + { + "epoch": 0.04565662620510515, + "grad_norm": 0.7048320174217224, + "learning_rate": 9.993712715545966e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.8999360799789429, + "num_tokens": 121301415.0, + "step": 457 + }, + { + "epoch": 0.04575653129526949, + "grad_norm": 0.7111458778381348, + "learning_rate": 9.993631344879465e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.9030219614505768, + "num_tokens": 121562665.0, + "step": 458 + }, + { + "epoch": 0.04585643638543384, + "grad_norm": 0.6726027727127075, + "learning_rate": 9.993549451374873e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.8991037905216217, + "num_tokens": 121828689.0, + "step": 459 + }, + { + "epoch": 0.045956341475598184, + "grad_norm": 0.6425039172172546, + "learning_rate": 9.993467035040772e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.901127815246582, + "num_tokens": 122100031.0, + "step": 460 + }, + { + "epoch": 0.04605624656576252, + "grad_norm": 0.6966259479522705, + "learning_rate": 9.993384095885786e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.9042024314403534, + "num_tokens": 122350084.0, + "step": 461 + }, + { + "epoch": 0.04615615165592687, + "grad_norm": 0.7601854801177979, + "learning_rate": 9.993300633918602e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.9031278789043427, + "num_tokens": 122608029.0, + "step": 462 + }, + { + "epoch": 0.04625605674609121, + "grad_norm": 0.7282091975212097, + "learning_rate": 9.993216649147955e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.9024856090545654, + "num_tokens": 122880517.0, + "step": 463 + }, + { + "epoch": 0.04635596183625556, + "grad_norm": 0.6603074073791504, + "learning_rate": 9.99313214158264e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8996098339557648, + "num_tokens": 123141334.0, + "step": 464 + }, + { + "epoch": 0.046455866926419905, + "grad_norm": 0.7044258713722229, + "learning_rate": 9.993047111231507e-06, + "loss": 0.5222, + "mean_token_accuracy": 0.8998009860515594, + "num_tokens": 123399224.0, + "step": 465 + }, + { + "epoch": 0.04655577201658424, + "grad_norm": 0.6589083671569824, + "learning_rate": 9.992961558103455e-06, + "loss": 0.523, + "mean_token_accuracy": 0.8989686369895935, + "num_tokens": 123669779.0, + "step": 466 + }, + { + "epoch": 0.04665567710674859, + "grad_norm": 0.691650390625, + "learning_rate": 9.992875482207445e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.9008677005767822, + "num_tokens": 123939610.0, + "step": 467 + }, + { + "epoch": 0.046755582196912934, + "grad_norm": 1.0302625894546509, + "learning_rate": 9.992788883552487e-06, + "loss": 0.5237, + "mean_token_accuracy": 0.9026167690753937, + "num_tokens": 124209804.0, + "step": 468 + }, + { + "epoch": 0.04685548728707728, + "grad_norm": 0.6549057960510254, + "learning_rate": 9.99270176214765e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.9009135365486145, + "num_tokens": 124475968.0, + "step": 469 + }, + { + "epoch": 0.04695539237724162, + "grad_norm": 1.0417643785476685, + "learning_rate": 9.992614118002054e-06, + "loss": 0.521, + "mean_token_accuracy": 0.9004765152931213, + "num_tokens": 124739882.0, + "step": 470 + }, + { + "epoch": 0.047055297467405964, + "grad_norm": 0.7204427123069763, + "learning_rate": 9.992525951124873e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.9007734954357147, + "num_tokens": 125006511.0, + "step": 471 + }, + { + "epoch": 0.04715520255757031, + "grad_norm": 0.7402814626693726, + "learning_rate": 9.992437261525343e-06, + "loss": 0.5248, + "mean_token_accuracy": 0.9022645950317383, + "num_tokens": 125276725.0, + "step": 472 + }, + { + "epoch": 0.047255107647734655, + "grad_norm": 0.7293504476547241, + "learning_rate": 9.99234804921275e-06, + "loss": 0.5218, + "mean_token_accuracy": 0.9013746082782745, + "num_tokens": 125543386.0, + "step": 473 + }, + { + "epoch": 0.047355012737898994, + "grad_norm": 0.8905327916145325, + "learning_rate": 9.99225831419643e-06, + "loss": 0.512, + "mean_token_accuracy": 0.9039784967899323, + "num_tokens": 125807317.0, + "step": 474 + }, + { + "epoch": 0.04745491782806334, + "grad_norm": 0.7015806436538696, + "learning_rate": 9.992168056485781e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.9035473763942719, + "num_tokens": 126074006.0, + "step": 475 + }, + { + "epoch": 0.047554822918227685, + "grad_norm": 0.6687757968902588, + "learning_rate": 9.992077276090254e-06, + "loss": 0.5264, + "mean_token_accuracy": 0.8964598774909973, + "num_tokens": 126339004.0, + "step": 476 + }, + { + "epoch": 0.04765472800839203, + "grad_norm": 3.634535074234009, + "learning_rate": 9.991985973019351e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8992540836334229, + "num_tokens": 126601723.0, + "step": 477 + }, + { + "epoch": 0.04775463309855637, + "grad_norm": 0.89568030834198, + "learning_rate": 9.991894147282635e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.9006851017475128, + "num_tokens": 126869765.0, + "step": 478 + }, + { + "epoch": 0.047854538188720715, + "grad_norm": 0.6421401500701904, + "learning_rate": 9.991801798889718e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.9022082984447479, + "num_tokens": 127132453.0, + "step": 479 + }, + { + "epoch": 0.04795444327888506, + "grad_norm": 0.891727864742279, + "learning_rate": 9.99170892785027e-06, + "loss": 0.5267, + "mean_token_accuracy": 0.8982411623001099, + "num_tokens": 127403710.0, + "step": 480 + }, + { + "epoch": 0.048054348369049406, + "grad_norm": 1.1518586874008179, + "learning_rate": 9.991615534174014e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.8983463048934937, + "num_tokens": 127661377.0, + "step": 481 + }, + { + "epoch": 0.048154253459213744, + "grad_norm": 0.7184730172157288, + "learning_rate": 9.991521617870726e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.904174268245697, + "num_tokens": 127929155.0, + "step": 482 + }, + { + "epoch": 0.04825415854937809, + "grad_norm": 0.6804986000061035, + "learning_rate": 9.991427178950243e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.9017981588840485, + "num_tokens": 128196513.0, + "step": 483 + }, + { + "epoch": 0.048354063639542436, + "grad_norm": 0.8421298265457153, + "learning_rate": 9.991332217422454e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.9021549820899963, + "num_tokens": 128463853.0, + "step": 484 + }, + { + "epoch": 0.04845396872970678, + "grad_norm": 0.7779698967933655, + "learning_rate": 9.991236733297295e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.9014600515365601, + "num_tokens": 128731140.0, + "step": 485 + }, + { + "epoch": 0.04855387381987112, + "grad_norm": 1.0114569664001465, + "learning_rate": 9.99114072658477e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.9045655727386475, + "num_tokens": 128997446.0, + "step": 486 + }, + { + "epoch": 0.048653778910035465, + "grad_norm": 0.7589089870452881, + "learning_rate": 9.991044197294927e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.9026931822299957, + "num_tokens": 129269200.0, + "step": 487 + }, + { + "epoch": 0.04875368400019981, + "grad_norm": 0.7057653665542603, + "learning_rate": 9.990947145437878e-06, + "loss": 0.514, + "mean_token_accuracy": 0.9042235612869263, + "num_tokens": 129540437.0, + "step": 488 + }, + { + "epoch": 0.048853589090364156, + "grad_norm": 0.9161249995231628, + "learning_rate": 9.990849571023775e-06, + "loss": 0.52, + "mean_token_accuracy": 0.902158260345459, + "num_tokens": 129814431.0, + "step": 489 + }, + { + "epoch": 0.048953494180528495, + "grad_norm": 0.7021593451499939, + "learning_rate": 9.990751474062843e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.9001196920871735, + "num_tokens": 130081916.0, + "step": 490 + }, + { + "epoch": 0.04905339927069284, + "grad_norm": 0.7025592923164368, + "learning_rate": 9.990652854565348e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.9003038108348846, + "num_tokens": 130353745.0, + "step": 491 + }, + { + "epoch": 0.049153304360857186, + "grad_norm": 0.7680575847625732, + "learning_rate": 9.990553712541617e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.8990825116634369, + "num_tokens": 130610607.0, + "step": 492 + }, + { + "epoch": 0.04925320945102153, + "grad_norm": 0.6936595439910889, + "learning_rate": 9.990454048002033e-06, + "loss": 0.522, + "mean_token_accuracy": 0.9017257690429688, + "num_tokens": 130872751.0, + "step": 493 + }, + { + "epoch": 0.04935311454118587, + "grad_norm": 1.0107386112213135, + "learning_rate": 9.990353860957025e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.9021632075309753, + "num_tokens": 131129533.0, + "step": 494 + }, + { + "epoch": 0.049453019631350216, + "grad_norm": 1.012527346611023, + "learning_rate": 9.990253151417087e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.9018363654613495, + "num_tokens": 131394998.0, + "step": 495 + }, + { + "epoch": 0.04955292472151456, + "grad_norm": 0.7528091669082642, + "learning_rate": 9.990151919392762e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8988025784492493, + "num_tokens": 131664801.0, + "step": 496 + }, + { + "epoch": 0.04965282981167891, + "grad_norm": 0.7700282335281372, + "learning_rate": 9.99005016489465e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.8993037045001984, + "num_tokens": 131932686.0, + "step": 497 + }, + { + "epoch": 0.049752734901843246, + "grad_norm": 0.7612524628639221, + "learning_rate": 9.989947887933404e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.9020034670829773, + "num_tokens": 132195264.0, + "step": 498 + }, + { + "epoch": 0.04985263999200759, + "grad_norm": 0.6769097447395325, + "learning_rate": 9.989845088519732e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.9015448987483978, + "num_tokens": 132463058.0, + "step": 499 + }, + { + "epoch": 0.04995254508217194, + "grad_norm": 0.7107875943183899, + "learning_rate": 9.989741766664399e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.9035780429840088, + "num_tokens": 132731336.0, + "step": 500 + }, + { + "epoch": 0.05005245017233628, + "grad_norm": 0.7589768171310425, + "learning_rate": 9.989637922378222e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.9040534794330597, + "num_tokens": 132992945.0, + "step": 501 + }, + { + "epoch": 0.05015235526250062, + "grad_norm": 0.7260648608207703, + "learning_rate": 9.989533555672074e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8993750214576721, + "num_tokens": 133258868.0, + "step": 502 + }, + { + "epoch": 0.05025226035266497, + "grad_norm": 0.6706081032752991, + "learning_rate": 9.98942866655688e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.9064238667488098, + "num_tokens": 133527174.0, + "step": 503 + }, + { + "epoch": 0.05035216544282931, + "grad_norm": 0.8524659276008606, + "learning_rate": 9.989323255043623e-06, + "loss": 0.516, + "mean_token_accuracy": 0.9021318554878235, + "num_tokens": 133798402.0, + "step": 504 + }, + { + "epoch": 0.05045207053299366, + "grad_norm": 0.7078625559806824, + "learning_rate": 9.989217321143342e-06, + "loss": 0.5238, + "mean_token_accuracy": 0.9020358026027679, + "num_tokens": 134066330.0, + "step": 505 + }, + { + "epoch": 0.050551975623158, + "grad_norm": 0.7255425453186035, + "learning_rate": 9.989110864867126e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.9019149839878082, + "num_tokens": 134331344.0, + "step": 506 + }, + { + "epoch": 0.05065188071332234, + "grad_norm": 0.7135269641876221, + "learning_rate": 9.989003886226123e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.9013754427433014, + "num_tokens": 134597543.0, + "step": 507 + }, + { + "epoch": 0.05075178580348669, + "grad_norm": 0.7564067840576172, + "learning_rate": 9.988896385231532e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.9037482738494873, + "num_tokens": 134869033.0, + "step": 508 + }, + { + "epoch": 0.05085169089365103, + "grad_norm": 1.2056206464767456, + "learning_rate": 9.988788361894609e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.901914119720459, + "num_tokens": 135134431.0, + "step": 509 + }, + { + "epoch": 0.05095159598381538, + "grad_norm": 0.7789416313171387, + "learning_rate": 9.988679816226665e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.9005151093006134, + "num_tokens": 135398055.0, + "step": 510 + }, + { + "epoch": 0.05105150107397972, + "grad_norm": 0.928634762763977, + "learning_rate": 9.988570748239062e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.9029776155948639, + "num_tokens": 135663246.0, + "step": 511 + }, + { + "epoch": 0.05115140616414406, + "grad_norm": 1.0377219915390015, + "learning_rate": 9.988461157943223e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8995865881443024, + "num_tokens": 135937122.0, + "step": 512 + }, + { + "epoch": 0.05125131125430841, + "grad_norm": 0.6329984664916992, + "learning_rate": 9.988351045350622e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.9017043113708496, + "num_tokens": 136211287.0, + "step": 513 + }, + { + "epoch": 0.051351216344472754, + "grad_norm": 0.7651588320732117, + "learning_rate": 9.988240410472784e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.900004118680954, + "num_tokens": 136481419.0, + "step": 514 + }, + { + "epoch": 0.05145112143463709, + "grad_norm": 1.023106575012207, + "learning_rate": 9.988129253321298e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.8998804092407227, + "num_tokens": 136744134.0, + "step": 515 + }, + { + "epoch": 0.05155102652480144, + "grad_norm": 0.86357581615448, + "learning_rate": 9.988017573907798e-06, + "loss": 0.52, + "mean_token_accuracy": 0.9009144604206085, + "num_tokens": 137004191.0, + "step": 516 + }, + { + "epoch": 0.051650931614965784, + "grad_norm": 0.7397897243499756, + "learning_rate": 9.987905372243979e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.906571626663208, + "num_tokens": 137266465.0, + "step": 517 + }, + { + "epoch": 0.05175083670513013, + "grad_norm": 0.7697049379348755, + "learning_rate": 9.987792648341587e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8999640941619873, + "num_tokens": 137528266.0, + "step": 518 + }, + { + "epoch": 0.05185074179529447, + "grad_norm": 0.7625311017036438, + "learning_rate": 9.987679402212426e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.9054868817329407, + "num_tokens": 137792863.0, + "step": 519 + }, + { + "epoch": 0.051950646885458814, + "grad_norm": 0.9867026209831238, + "learning_rate": 9.987565633868355e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.9018554985523224, + "num_tokens": 138057108.0, + "step": 520 + }, + { + "epoch": 0.05205055197562316, + "grad_norm": 0.7108213305473328, + "learning_rate": 9.98745134332128e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.903113603591919, + "num_tokens": 138328620.0, + "step": 521 + }, + { + "epoch": 0.052150457065787505, + "grad_norm": 0.9113190770149231, + "learning_rate": 9.987336530583171e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.9039855599403381, + "num_tokens": 138593599.0, + "step": 522 + }, + { + "epoch": 0.05225036215595184, + "grad_norm": 0.9305064678192139, + "learning_rate": 9.987221195666048e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.9047682285308838, + "num_tokens": 138847709.0, + "step": 523 + }, + { + "epoch": 0.05235026724611619, + "grad_norm": 0.8093949556350708, + "learning_rate": 9.987105338581988e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.9030449688434601, + "num_tokens": 139114727.0, + "step": 524 + }, + { + "epoch": 0.052450172336280534, + "grad_norm": 1.0964332818984985, + "learning_rate": 9.986988959343121e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.9029381573200226, + "num_tokens": 139385333.0, + "step": 525 + }, + { + "epoch": 0.05255007742644488, + "grad_norm": 0.9064055681228638, + "learning_rate": 9.98687205796163e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.902116596698761, + "num_tokens": 139644769.0, + "step": 526 + }, + { + "epoch": 0.05264998251660922, + "grad_norm": 0.9090225100517273, + "learning_rate": 9.986754634449756e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.9018265306949615, + "num_tokens": 139916061.0, + "step": 527 + }, + { + "epoch": 0.052749887606773564, + "grad_norm": 0.8661444187164307, + "learning_rate": 9.986636688819795e-06, + "loss": 0.515, + "mean_token_accuracy": 0.9024776220321655, + "num_tokens": 140189535.0, + "step": 528 + }, + { + "epoch": 0.05284979269693791, + "grad_norm": 1.12009859085083, + "learning_rate": 9.986518221084094e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.9009732306003571, + "num_tokens": 140451594.0, + "step": 529 + }, + { + "epoch": 0.052949697787102255, + "grad_norm": 1.3297531604766846, + "learning_rate": 9.986399231255057e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.9011409282684326, + "num_tokens": 140716287.0, + "step": 530 + }, + { + "epoch": 0.053049602877266594, + "grad_norm": 0.8928907513618469, + "learning_rate": 9.986279719345142e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.9018188118934631, + "num_tokens": 140980784.0, + "step": 531 + }, + { + "epoch": 0.05314950796743094, + "grad_norm": 0.6397438049316406, + "learning_rate": 9.986159685366862e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9023649096488953, + "num_tokens": 141248181.0, + "step": 532 + }, + { + "epoch": 0.053249413057595285, + "grad_norm": 0.7014410495758057, + "learning_rate": 9.986039129332787e-06, + "loss": 0.514, + "mean_token_accuracy": 0.9038279354572296, + "num_tokens": 141515241.0, + "step": 533 + }, + { + "epoch": 0.05334931814775963, + "grad_norm": 0.9531790018081665, + "learning_rate": 9.985918051255537e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.9027712941169739, + "num_tokens": 141780897.0, + "step": 534 + }, + { + "epoch": 0.05344922323792397, + "grad_norm": 0.7469579577445984, + "learning_rate": 9.985796451147789e-06, + "loss": 0.513, + "mean_token_accuracy": 0.9034770131111145, + "num_tokens": 142042334.0, + "step": 535 + }, + { + "epoch": 0.053549128328088315, + "grad_norm": 0.8443761467933655, + "learning_rate": 9.985674329022275e-06, + "loss": 0.513, + "mean_token_accuracy": 0.9030035734176636, + "num_tokens": 142308549.0, + "step": 536 + }, + { + "epoch": 0.05364903341825266, + "grad_norm": 0.9733178019523621, + "learning_rate": 9.985551684891784e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.9012086689472198, + "num_tokens": 142577879.0, + "step": 537 + }, + { + "epoch": 0.053748938508417006, + "grad_norm": 0.8651155233383179, + "learning_rate": 9.985428518769151e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.9024726450443268, + "num_tokens": 142836568.0, + "step": 538 + }, + { + "epoch": 0.053848843598581345, + "grad_norm": 0.6812898516654968, + "learning_rate": 9.985304830667278e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.9049395024776459, + "num_tokens": 143105551.0, + "step": 539 + }, + { + "epoch": 0.05394874868874569, + "grad_norm": 0.8326764702796936, + "learning_rate": 9.98518062059911e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.9041145741939545, + "num_tokens": 143375160.0, + "step": 540 + }, + { + "epoch": 0.054048653778910036, + "grad_norm": 1.0278266668319702, + "learning_rate": 9.985055888577656e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.9024625718593597, + "num_tokens": 143637376.0, + "step": 541 + }, + { + "epoch": 0.05414855886907438, + "grad_norm": 0.6356834769248962, + "learning_rate": 9.984930634615973e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.9033419489860535, + "num_tokens": 143902220.0, + "step": 542 + }, + { + "epoch": 0.05424846395923872, + "grad_norm": 0.6184828877449036, + "learning_rate": 9.984804858727175e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.9007395803928375, + "num_tokens": 144174986.0, + "step": 543 + }, + { + "epoch": 0.054348369049403066, + "grad_norm": 0.7099010348320007, + "learning_rate": 9.984678560924433e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.9022358953952789, + "num_tokens": 144433056.0, + "step": 544 + }, + { + "epoch": 0.05444827413956741, + "grad_norm": 1.2514402866363525, + "learning_rate": 9.98455174122097e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8992709815502167, + "num_tokens": 144696629.0, + "step": 545 + }, + { + "epoch": 0.05454817922973176, + "grad_norm": 0.7746785283088684, + "learning_rate": 9.984424399630064e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.9023463129997253, + "num_tokens": 144965464.0, + "step": 546 + }, + { + "epoch": 0.054648084319896095, + "grad_norm": 0.8966716527938843, + "learning_rate": 9.984296536165046e-06, + "loss": 0.5185, + "mean_token_accuracy": 0.9019818603992462, + "num_tokens": 145233277.0, + "step": 547 + }, + { + "epoch": 0.05474798941006044, + "grad_norm": 0.9793383479118347, + "learning_rate": 9.984168150839305e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.905871570110321, + "num_tokens": 145490447.0, + "step": 548 + }, + { + "epoch": 0.054847894500224786, + "grad_norm": 1.1300071477890015, + "learning_rate": 9.984039243666284e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.8995006680488586, + "num_tokens": 145757685.0, + "step": 549 + }, + { + "epoch": 0.05494779959038913, + "grad_norm": 0.7780466675758362, + "learning_rate": 9.983909814659476e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.9025764167308807, + "num_tokens": 146034694.0, + "step": 550 + }, + { + "epoch": 0.05504770468055348, + "grad_norm": 0.7449960708618164, + "learning_rate": 9.983779863832436e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.904653012752533, + "num_tokens": 146306004.0, + "step": 551 + }, + { + "epoch": 0.055147609770717816, + "grad_norm": 0.9403952956199646, + "learning_rate": 9.983649391198771e-06, + "loss": 0.5228, + "mean_token_accuracy": 0.9030623435974121, + "num_tokens": 146573489.0, + "step": 552 + }, + { + "epoch": 0.05524751486088216, + "grad_norm": 0.8424383401870728, + "learning_rate": 9.983518396772138e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.9038223624229431, + "num_tokens": 146838324.0, + "step": 553 + }, + { + "epoch": 0.05534741995104651, + "grad_norm": 0.8817145228385925, + "learning_rate": 9.983386880566253e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.90077343583107, + "num_tokens": 147101238.0, + "step": 554 + }, + { + "epoch": 0.05544732504121085, + "grad_norm": 0.6977766752243042, + "learning_rate": 9.983254842594887e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.9024248421192169, + "num_tokens": 147362409.0, + "step": 555 + }, + { + "epoch": 0.05554723013137519, + "grad_norm": 0.6309251189231873, + "learning_rate": 9.983122282871865e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.9013061821460724, + "num_tokens": 147629343.0, + "step": 556 + }, + { + "epoch": 0.05564713522153954, + "grad_norm": 0.5933849215507507, + "learning_rate": 9.982989201411064e-06, + "loss": 0.513, + "mean_token_accuracy": 0.9030575752258301, + "num_tokens": 147896549.0, + "step": 557 + }, + { + "epoch": 0.05574704031170388, + "grad_norm": 0.6931910514831543, + "learning_rate": 9.98285559822642e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.9013265669345856, + "num_tokens": 148162874.0, + "step": 558 + }, + { + "epoch": 0.05584694540186823, + "grad_norm": 1.1912868022918701, + "learning_rate": 9.98272147333192e-06, + "loss": 0.5207, + "mean_token_accuracy": 0.9013843834400177, + "num_tokens": 148427922.0, + "step": 559 + }, + { + "epoch": 0.05594685049203257, + "grad_norm": 0.6476121544837952, + "learning_rate": 9.982586826741609e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.903841108083725, + "num_tokens": 148697035.0, + "step": 560 + }, + { + "epoch": 0.05604675558219691, + "grad_norm": 0.6202062964439392, + "learning_rate": 9.98245165846958e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.9050305485725403, + "num_tokens": 148971110.0, + "step": 561 + }, + { + "epoch": 0.05614666067236126, + "grad_norm": 0.6220694780349731, + "learning_rate": 9.98231596852999e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.9020660817623138, + "num_tokens": 149245223.0, + "step": 562 + }, + { + "epoch": 0.056246565762525604, + "grad_norm": 0.9104385375976562, + "learning_rate": 9.982179756937044e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.9042539298534393, + "num_tokens": 149505534.0, + "step": 563 + }, + { + "epoch": 0.05634647085268994, + "grad_norm": 0.7312887907028198, + "learning_rate": 9.982043023705004e-06, + "loss": 0.5185, + "mean_token_accuracy": 0.902607798576355, + "num_tokens": 149767233.0, + "step": 564 + }, + { + "epoch": 0.05644637594285429, + "grad_norm": 0.7505432367324829, + "learning_rate": 9.981905768848186e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.9029010236263275, + "num_tokens": 150026880.0, + "step": 565 + }, + { + "epoch": 0.05654628103301863, + "grad_norm": 0.7303380370140076, + "learning_rate": 9.98176799238096e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.9036974608898163, + "num_tokens": 150292347.0, + "step": 566 + }, + { + "epoch": 0.05664618612318298, + "grad_norm": 0.6219602823257446, + "learning_rate": 9.98162969431775e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.9012829959392548, + "num_tokens": 150555688.0, + "step": 567 + }, + { + "epoch": 0.05674609121334732, + "grad_norm": 0.6403664946556091, + "learning_rate": 9.98149087467304e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.9032070934772491, + "num_tokens": 150811542.0, + "step": 568 + }, + { + "epoch": 0.05684599630351166, + "grad_norm": 0.6713922023773193, + "learning_rate": 9.98135153346136e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.9004948139190674, + "num_tokens": 151078571.0, + "step": 569 + }, + { + "epoch": 0.05694590139367601, + "grad_norm": 0.6759093999862671, + "learning_rate": 9.981211670697303e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.9024394750595093, + "num_tokens": 151340293.0, + "step": 570 + }, + { + "epoch": 0.057045806483840354, + "grad_norm": 0.7257122993469238, + "learning_rate": 9.981071286395513e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.9037904441356659, + "num_tokens": 151608339.0, + "step": 571 + }, + { + "epoch": 0.05714571157400469, + "grad_norm": 0.6588451862335205, + "learning_rate": 9.980930380570683e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.9016490578651428, + "num_tokens": 151870493.0, + "step": 572 + }, + { + "epoch": 0.05724561666416904, + "grad_norm": 0.7690349221229553, + "learning_rate": 9.980788953237572e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.9039574563503265, + "num_tokens": 152139492.0, + "step": 573 + }, + { + "epoch": 0.057345521754333384, + "grad_norm": 0.6968956589698792, + "learning_rate": 9.980647004410986e-06, + "loss": 0.51, + "mean_token_accuracy": 0.9023677408695221, + "num_tokens": 152408330.0, + "step": 574 + }, + { + "epoch": 0.05744542684449773, + "grad_norm": 0.7655267715454102, + "learning_rate": 9.980504534105784e-06, + "loss": 0.516, + "mean_token_accuracy": 0.9017879366874695, + "num_tokens": 152674038.0, + "step": 575 + }, + { + "epoch": 0.05754533193466207, + "grad_norm": 0.6194596886634827, + "learning_rate": 9.980361542336887e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.9028837978839874, + "num_tokens": 152936781.0, + "step": 576 + }, + { + "epoch": 0.057645237024826414, + "grad_norm": 0.6129767298698425, + "learning_rate": 9.980218029119264e-06, + "loss": 0.508, + "mean_token_accuracy": 0.9022084772586823, + "num_tokens": 153196146.0, + "step": 577 + }, + { + "epoch": 0.05774514211499076, + "grad_norm": 0.6798054575920105, + "learning_rate": 9.98007399446794e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.9036797881126404, + "num_tokens": 153461970.0, + "step": 578 + }, + { + "epoch": 0.057845047205155105, + "grad_norm": 0.8616960048675537, + "learning_rate": 9.979929438397997e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.8990062773227692, + "num_tokens": 153739883.0, + "step": 579 + }, + { + "epoch": 0.05794495229531944, + "grad_norm": 0.93268221616745, + "learning_rate": 9.979784360924571e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.9043946266174316, + "num_tokens": 154002251.0, + "step": 580 + }, + { + "epoch": 0.05804485738548379, + "grad_norm": 0.7099646925926208, + "learning_rate": 9.979638762062851e-06, + "loss": 0.5207, + "mean_token_accuracy": 0.9002720415592194, + "num_tokens": 154275607.0, + "step": 581 + }, + { + "epoch": 0.058144762475648135, + "grad_norm": 0.7331972718238831, + "learning_rate": 9.979492641828082e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.9011991620063782, + "num_tokens": 154541308.0, + "step": 582 + }, + { + "epoch": 0.05824466756581248, + "grad_norm": 0.688657283782959, + "learning_rate": 9.979346000235562e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.9050512313842773, + "num_tokens": 154812429.0, + "step": 583 + }, + { + "epoch": 0.05834457265597682, + "grad_norm": 0.6945686936378479, + "learning_rate": 9.979198837300644e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.8992209136486053, + "num_tokens": 155073942.0, + "step": 584 + }, + { + "epoch": 0.058444477746141164, + "grad_norm": 0.7442430257797241, + "learning_rate": 9.979051153038737e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8999350965023041, + "num_tokens": 155334307.0, + "step": 585 + }, + { + "epoch": 0.05854438283630551, + "grad_norm": 0.6989916563034058, + "learning_rate": 9.978902947465304e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.9017228484153748, + "num_tokens": 155595358.0, + "step": 586 + }, + { + "epoch": 0.058644287926469855, + "grad_norm": 0.5932838916778564, + "learning_rate": 9.978754220595861e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.9046531915664673, + "num_tokens": 155862977.0, + "step": 587 + }, + { + "epoch": 0.058744193016634194, + "grad_norm": 0.5388416647911072, + "learning_rate": 9.978604972445983e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.9031755924224854, + "num_tokens": 156123233.0, + "step": 588 + }, + { + "epoch": 0.05884409810679854, + "grad_norm": 0.6982474327087402, + "learning_rate": 9.978455203031292e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.902615875005722, + "num_tokens": 156383761.0, + "step": 589 + }, + { + "epoch": 0.058944003196962885, + "grad_norm": 0.7076588869094849, + "learning_rate": 9.97830491236747e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.8986367583274841, + "num_tokens": 156651157.0, + "step": 590 + }, + { + "epoch": 0.05904390828712723, + "grad_norm": 0.7257279753684998, + "learning_rate": 9.978154100470255e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.8995548486709595, + "num_tokens": 156909580.0, + "step": 591 + }, + { + "epoch": 0.059143813377291576, + "grad_norm": 0.6307842135429382, + "learning_rate": 9.978002767355437e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.9034350216388702, + "num_tokens": 157176024.0, + "step": 592 + }, + { + "epoch": 0.059243718467455915, + "grad_norm": 0.6571227312088013, + "learning_rate": 9.977850913038858e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.9043467938899994, + "num_tokens": 157437925.0, + "step": 593 + }, + { + "epoch": 0.05934362355762026, + "grad_norm": 0.9135153889656067, + "learning_rate": 9.97769853753642e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.9023022651672363, + "num_tokens": 157698230.0, + "step": 594 + }, + { + "epoch": 0.059443528647784606, + "grad_norm": 0.5978872179985046, + "learning_rate": 9.977545640864073e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.9019834399223328, + "num_tokens": 157968257.0, + "step": 595 + }, + { + "epoch": 0.05954343373794895, + "grad_norm": 0.821200966835022, + "learning_rate": 9.97739222303783e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.9018275737762451, + "num_tokens": 158233741.0, + "step": 596 + }, + { + "epoch": 0.05964333882811329, + "grad_norm": 1.0283470153808594, + "learning_rate": 9.977238284073753e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.899562418460846, + "num_tokens": 158501602.0, + "step": 597 + }, + { + "epoch": 0.059743243918277636, + "grad_norm": 0.8015673756599426, + "learning_rate": 9.977083823987957e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.9036093056201935, + "num_tokens": 158765696.0, + "step": 598 + }, + { + "epoch": 0.05984314900844198, + "grad_norm": 0.6849085688591003, + "learning_rate": 9.976928842796616e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.9030007123947144, + "num_tokens": 159026654.0, + "step": 599 + }, + { + "epoch": 0.05994305409860633, + "grad_norm": 0.8089185953140259, + "learning_rate": 9.976773340515958e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.9043397903442383, + "num_tokens": 159299663.0, + "step": 600 + }, + { + "epoch": 0.060042959188770666, + "grad_norm": 1.529416799545288, + "learning_rate": 9.976617317162261e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.9058792591094971, + "num_tokens": 159571921.0, + "step": 601 + }, + { + "epoch": 0.06014286427893501, + "grad_norm": 0.936768114566803, + "learning_rate": 9.976460772751863e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.9014073312282562, + "num_tokens": 159842691.0, + "step": 602 + }, + { + "epoch": 0.06024276936909936, + "grad_norm": 0.8195129632949829, + "learning_rate": 9.976303707301155e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.9016938209533691, + "num_tokens": 160101308.0, + "step": 603 + }, + { + "epoch": 0.0603426744592637, + "grad_norm": 0.766822099685669, + "learning_rate": 9.97614612082658e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.9008556008338928, + "num_tokens": 160378017.0, + "step": 604 + }, + { + "epoch": 0.06044257954942804, + "grad_norm": 0.744566023349762, + "learning_rate": 9.975988013344638e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.9017702341079712, + "num_tokens": 160648624.0, + "step": 605 + }, + { + "epoch": 0.06054248463959239, + "grad_norm": 1.1021926403045654, + "learning_rate": 9.975829384871884e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.9034219980239868, + "num_tokens": 160907268.0, + "step": 606 + }, + { + "epoch": 0.06064238972975673, + "grad_norm": 0.7046933770179749, + "learning_rate": 9.975670235424927e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.9047935605049133, + "num_tokens": 161175555.0, + "step": 607 + }, + { + "epoch": 0.06074229481992108, + "grad_norm": 0.8091129660606384, + "learning_rate": 9.975510565020426e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.9015909135341644, + "num_tokens": 161431802.0, + "step": 608 + }, + { + "epoch": 0.060842199910085416, + "grad_norm": 0.7809988856315613, + "learning_rate": 9.975350373675101e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.9027710258960724, + "num_tokens": 161694391.0, + "step": 609 + }, + { + "epoch": 0.06094210500024976, + "grad_norm": 0.7829887866973877, + "learning_rate": 9.975189661405728e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.9020082652568817, + "num_tokens": 161955479.0, + "step": 610 + }, + { + "epoch": 0.06104201009041411, + "grad_norm": 1.0262449979782104, + "learning_rate": 9.975028428229128e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.9042467474937439, + "num_tokens": 162230383.0, + "step": 611 + }, + { + "epoch": 0.06114191518057845, + "grad_norm": 0.731964111328125, + "learning_rate": 9.974866674162186e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.9029311835765839, + "num_tokens": 162488300.0, + "step": 612 + }, + { + "epoch": 0.06124182027074279, + "grad_norm": 0.9965727925300598, + "learning_rate": 9.974704399221836e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.9060125052928925, + "num_tokens": 162759046.0, + "step": 613 + }, + { + "epoch": 0.06134172536090714, + "grad_norm": 0.6500602960586548, + "learning_rate": 9.97454160342507e-06, + "loss": 0.514, + "mean_token_accuracy": 0.901875376701355, + "num_tokens": 163032929.0, + "step": 614 + }, + { + "epoch": 0.06144163045107148, + "grad_norm": 0.8946840763092041, + "learning_rate": 9.97437828678893e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.9034351110458374, + "num_tokens": 163301187.0, + "step": 615 + }, + { + "epoch": 0.06154153554123583, + "grad_norm": 0.7113312482833862, + "learning_rate": 9.97421444933052e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.9000544846057892, + "num_tokens": 163566411.0, + "step": 616 + }, + { + "epoch": 0.06164144063140017, + "grad_norm": 0.7412269115447998, + "learning_rate": 9.974050091066989e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.9016856849193573, + "num_tokens": 163830040.0, + "step": 617 + }, + { + "epoch": 0.06174134572156451, + "grad_norm": 0.692356288433075, + "learning_rate": 9.973885212015545e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.902568519115448, + "num_tokens": 164092449.0, + "step": 618 + }, + { + "epoch": 0.06184125081172886, + "grad_norm": 0.724291205406189, + "learning_rate": 9.973719812193458e-06, + "loss": 0.518, + "mean_token_accuracy": 0.9012368023395538, + "num_tokens": 164358298.0, + "step": 619 + }, + { + "epoch": 0.061941155901893204, + "grad_norm": 0.904885470867157, + "learning_rate": 9.97355389161804e-06, + "loss": 0.516, + "mean_token_accuracy": 0.9009229838848114, + "num_tokens": 164624832.0, + "step": 620 + }, + { + "epoch": 0.06204106099205754, + "grad_norm": 0.7808142304420471, + "learning_rate": 9.973387450306663e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.9030810296535492, + "num_tokens": 164897307.0, + "step": 621 + }, + { + "epoch": 0.06214096608222189, + "grad_norm": 0.7786172032356262, + "learning_rate": 9.973220488276756e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.9024654030799866, + "num_tokens": 165162944.0, + "step": 622 + }, + { + "epoch": 0.06224087117238623, + "grad_norm": 2.0332489013671875, + "learning_rate": 9.973053005545798e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.9035106301307678, + "num_tokens": 165428537.0, + "step": 623 + }, + { + "epoch": 0.06234077626255058, + "grad_norm": 0.7368687987327576, + "learning_rate": 9.972885002131328e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.9016894698143005, + "num_tokens": 165691210.0, + "step": 624 + }, + { + "epoch": 0.06244068135271492, + "grad_norm": 0.7315918803215027, + "learning_rate": 9.97271647805093e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.9036740362644196, + "num_tokens": 165964369.0, + "step": 625 + }, + { + "epoch": 0.06254058644287927, + "grad_norm": 0.8499507308006287, + "learning_rate": 9.972547433322254e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.9017168283462524, + "num_tokens": 166244269.0, + "step": 626 + }, + { + "epoch": 0.0626404915330436, + "grad_norm": 0.9084515571594238, + "learning_rate": 9.972377867962998e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.9005888104438782, + "num_tokens": 166508219.0, + "step": 627 + }, + { + "epoch": 0.06274039662320795, + "grad_norm": 0.7716317772865295, + "learning_rate": 9.972207781990912e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.9003297388553619, + "num_tokens": 166770941.0, + "step": 628 + }, + { + "epoch": 0.06284030171337229, + "grad_norm": 0.6513050198554993, + "learning_rate": 9.97203717542381e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.9035277664661407, + "num_tokens": 167039045.0, + "step": 629 + }, + { + "epoch": 0.06294020680353664, + "grad_norm": 0.7350946664810181, + "learning_rate": 9.97186604827955e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.9026923179626465, + "num_tokens": 167301447.0, + "step": 630 + }, + { + "epoch": 0.06304011189370098, + "grad_norm": 0.7061342000961304, + "learning_rate": 9.971694400576053e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.9022524952888489, + "num_tokens": 167562265.0, + "step": 631 + }, + { + "epoch": 0.06314001698386533, + "grad_norm": 0.7501211166381836, + "learning_rate": 9.971522232331288e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.9032787382602692, + "num_tokens": 167826738.0, + "step": 632 + }, + { + "epoch": 0.06323992207402968, + "grad_norm": 0.6518118381500244, + "learning_rate": 9.97134954356328e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.9045359790325165, + "num_tokens": 168085030.0, + "step": 633 + }, + { + "epoch": 0.06333982716419402, + "grad_norm": 0.7099774479866028, + "learning_rate": 9.971176334290114e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.899995744228363, + "num_tokens": 168356330.0, + "step": 634 + }, + { + "epoch": 0.06343973225435837, + "grad_norm": 0.6602879166603088, + "learning_rate": 9.971002604529922e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.9035660922527313, + "num_tokens": 168629025.0, + "step": 635 + }, + { + "epoch": 0.0635396373445227, + "grad_norm": 0.6449518799781799, + "learning_rate": 9.970828354300895e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.903147965669632, + "num_tokens": 168897401.0, + "step": 636 + }, + { + "epoch": 0.06363954243468704, + "grad_norm": 0.7111853957176208, + "learning_rate": 9.970653583621275e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.9021528959274292, + "num_tokens": 169164139.0, + "step": 637 + }, + { + "epoch": 0.06373944752485139, + "grad_norm": 1.0055838823318481, + "learning_rate": 9.970478292509364e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.9031620919704437, + "num_tokens": 169418359.0, + "step": 638 + }, + { + "epoch": 0.06383935261501573, + "grad_norm": 0.8391542434692383, + "learning_rate": 9.970302480983511e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8997828662395477, + "num_tokens": 169687455.0, + "step": 639 + }, + { + "epoch": 0.06393925770518008, + "grad_norm": 0.6754401922225952, + "learning_rate": 9.97012614906213e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.9037193655967712, + "num_tokens": 169957361.0, + "step": 640 + }, + { + "epoch": 0.06403916279534443, + "grad_norm": 0.9258714318275452, + "learning_rate": 9.969949296763675e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.9029320478439331, + "num_tokens": 170225317.0, + "step": 641 + }, + { + "epoch": 0.06413906788550877, + "grad_norm": 0.8364575505256653, + "learning_rate": 9.969771924106669e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.90181964635849, + "num_tokens": 170490032.0, + "step": 642 + }, + { + "epoch": 0.06423897297567312, + "grad_norm": 0.833506166934967, + "learning_rate": 9.969594031109681e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.9037043750286102, + "num_tokens": 170754587.0, + "step": 643 + }, + { + "epoch": 0.06433887806583745, + "grad_norm": 0.930084228515625, + "learning_rate": 9.969415617791336e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.9028688967227936, + "num_tokens": 171019982.0, + "step": 644 + }, + { + "epoch": 0.0644387831560018, + "grad_norm": 0.6846749782562256, + "learning_rate": 9.969236684170314e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9033832252025604, + "num_tokens": 171298014.0, + "step": 645 + }, + { + "epoch": 0.06453868824616614, + "grad_norm": 0.8293594121932983, + "learning_rate": 9.969057230265351e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.9047979712486267, + "num_tokens": 171571037.0, + "step": 646 + }, + { + "epoch": 0.06463859333633049, + "grad_norm": 1.199247121810913, + "learning_rate": 9.968877256095234e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.9030216634273529, + "num_tokens": 171841461.0, + "step": 647 + }, + { + "epoch": 0.06473849842649483, + "grad_norm": 0.7359606623649597, + "learning_rate": 9.968696761678808e-06, + "loss": 0.513, + "mean_token_accuracy": 0.9037856757640839, + "num_tokens": 172109307.0, + "step": 648 + }, + { + "epoch": 0.06483840351665918, + "grad_norm": 0.646586000919342, + "learning_rate": 9.96851574703497e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.9051523506641388, + "num_tokens": 172367460.0, + "step": 649 + }, + { + "epoch": 0.06493830860682352, + "grad_norm": 0.6954528093338013, + "learning_rate": 9.968334212182674e-06, + "loss": 0.513, + "mean_token_accuracy": 0.9008308053016663, + "num_tokens": 172625887.0, + "step": 650 + }, + { + "epoch": 0.06503821369698787, + "grad_norm": 0.7314415574073792, + "learning_rate": 9.968152157140925e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.9007551968097687, + "num_tokens": 172893978.0, + "step": 651 + }, + { + "epoch": 0.0651381187871522, + "grad_norm": 0.6935586929321289, + "learning_rate": 9.967969581928784e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.9006082117557526, + "num_tokens": 173164951.0, + "step": 652 + }, + { + "epoch": 0.06523802387731654, + "grad_norm": 0.7036967277526855, + "learning_rate": 9.967786486565369e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.9022729992866516, + "num_tokens": 173434355.0, + "step": 653 + }, + { + "epoch": 0.06533792896748089, + "grad_norm": 0.7755014896392822, + "learning_rate": 9.96760287106985e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.9037308096885681, + "num_tokens": 173700746.0, + "step": 654 + }, + { + "epoch": 0.06543783405764524, + "grad_norm": 0.8337916731834412, + "learning_rate": 9.967418735461449e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.902480274438858, + "num_tokens": 173962491.0, + "step": 655 + }, + { + "epoch": 0.06553773914780958, + "grad_norm": 0.8287419080734253, + "learning_rate": 9.967234079759448e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.9002928137779236, + "num_tokens": 174229437.0, + "step": 656 + }, + { + "epoch": 0.06563764423797393, + "grad_norm": 0.9548794627189636, + "learning_rate": 9.967048903983178e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.9001343846321106, + "num_tokens": 174490910.0, + "step": 657 + }, + { + "epoch": 0.06573754932813827, + "grad_norm": 0.6101205945014954, + "learning_rate": 9.966863208152031e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.9035294353961945, + "num_tokens": 174764802.0, + "step": 658 + }, + { + "epoch": 0.06583745441830262, + "grad_norm": 0.6271787285804749, + "learning_rate": 9.966676992285447e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8993997573852539, + "num_tokens": 175037685.0, + "step": 659 + }, + { + "epoch": 0.06593735950846695, + "grad_norm": 0.8699121475219727, + "learning_rate": 9.966490256402924e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.9037845730781555, + "num_tokens": 175309555.0, + "step": 660 + }, + { + "epoch": 0.0660372645986313, + "grad_norm": 0.7362117767333984, + "learning_rate": 9.966303000524011e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.9004186391830444, + "num_tokens": 175568996.0, + "step": 661 + }, + { + "epoch": 0.06613716968879564, + "grad_norm": 0.6825246214866638, + "learning_rate": 9.966115224668315e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9035219252109528, + "num_tokens": 175836423.0, + "step": 662 + }, + { + "epoch": 0.06623707477895999, + "grad_norm": 0.730711817741394, + "learning_rate": 9.965926928855498e-06, + "loss": 0.508, + "mean_token_accuracy": 0.9026734530925751, + "num_tokens": 176098119.0, + "step": 663 + }, + { + "epoch": 0.06633697986912433, + "grad_norm": 0.8065865635871887, + "learning_rate": 9.965738113105274e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.9020503759384155, + "num_tokens": 176364630.0, + "step": 664 + }, + { + "epoch": 0.06643688495928868, + "grad_norm": 0.9313201904296875, + "learning_rate": 9.965548777437411e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.9039636850357056, + "num_tokens": 176623134.0, + "step": 665 + }, + { + "epoch": 0.06653679004945302, + "grad_norm": 0.7791675925254822, + "learning_rate": 9.965358921871735e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.9031401872634888, + "num_tokens": 176890458.0, + "step": 666 + }, + { + "epoch": 0.06663669513961737, + "grad_norm": 0.7801092863082886, + "learning_rate": 9.965168546428122e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.9012789726257324, + "num_tokens": 177149818.0, + "step": 667 + }, + { + "epoch": 0.0667366002297817, + "grad_norm": 0.8540526032447815, + "learning_rate": 9.964977651126504e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.903413861989975, + "num_tokens": 177412865.0, + "step": 668 + }, + { + "epoch": 0.06683650531994605, + "grad_norm": 0.6136407256126404, + "learning_rate": 9.96478623598687e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9055083692073822, + "num_tokens": 177683045.0, + "step": 669 + }, + { + "epoch": 0.06693641041011039, + "grad_norm": 0.8545668125152588, + "learning_rate": 9.964594301029258e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.9021594226360321, + "num_tokens": 177950630.0, + "step": 670 + }, + { + "epoch": 0.06703631550027474, + "grad_norm": 0.8129372000694275, + "learning_rate": 9.964401846273769e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.90381720662117, + "num_tokens": 178221658.0, + "step": 671 + }, + { + "epoch": 0.06713622059043908, + "grad_norm": 0.6838666200637817, + "learning_rate": 9.964208871740548e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.901636391878128, + "num_tokens": 178481292.0, + "step": 672 + }, + { + "epoch": 0.06723612568060343, + "grad_norm": 0.6589635610580444, + "learning_rate": 9.964015377449803e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.9046314358711243, + "num_tokens": 178744851.0, + "step": 673 + }, + { + "epoch": 0.06733603077076777, + "grad_norm": 0.7424984574317932, + "learning_rate": 9.963821363421793e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.9007152318954468, + "num_tokens": 179006255.0, + "step": 674 + }, + { + "epoch": 0.06743593586093212, + "grad_norm": 0.7044169306755066, + "learning_rate": 9.963626829676829e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.9051276743412018, + "num_tokens": 179260626.0, + "step": 675 + }, + { + "epoch": 0.06753584095109647, + "grad_norm": 0.7495167255401611, + "learning_rate": 9.963431776235279e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8993471264839172, + "num_tokens": 179536253.0, + "step": 676 + }, + { + "epoch": 0.0676357460412608, + "grad_norm": 0.7562381625175476, + "learning_rate": 9.963236203117569e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.9023611843585968, + "num_tokens": 179806927.0, + "step": 677 + }, + { + "epoch": 0.06773565113142514, + "grad_norm": 0.6816434860229492, + "learning_rate": 9.963040110344173e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.9020212590694427, + "num_tokens": 180070187.0, + "step": 678 + }, + { + "epoch": 0.06783555622158949, + "grad_norm": 0.7836717963218689, + "learning_rate": 9.962843497935621e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.9028849005699158, + "num_tokens": 180342373.0, + "step": 679 + }, + { + "epoch": 0.06793546131175383, + "grad_norm": 0.6911202073097229, + "learning_rate": 9.9626463659125e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.9039743840694427, + "num_tokens": 180620058.0, + "step": 680 + }, + { + "epoch": 0.06803536640191818, + "grad_norm": 0.6511402726173401, + "learning_rate": 9.962448714295452e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.9034909605979919, + "num_tokens": 180885012.0, + "step": 681 + }, + { + "epoch": 0.06813527149208252, + "grad_norm": 0.8085684776306152, + "learning_rate": 9.962250543105167e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.9011252522468567, + "num_tokens": 181142870.0, + "step": 682 + }, + { + "epoch": 0.06823517658224687, + "grad_norm": 0.7621213793754578, + "learning_rate": 9.962051852362396e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.9016707539558411, + "num_tokens": 181406084.0, + "step": 683 + }, + { + "epoch": 0.06833508167241122, + "grad_norm": 1.1100740432739258, + "learning_rate": 9.961852642087943e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.9036278426647186, + "num_tokens": 181671972.0, + "step": 684 + }, + { + "epoch": 0.06843498676257555, + "grad_norm": 0.7447674870491028, + "learning_rate": 9.961652912302664e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.9041794836521149, + "num_tokens": 181941077.0, + "step": 685 + }, + { + "epoch": 0.0685348918527399, + "grad_norm": 0.8062470555305481, + "learning_rate": 9.96145266302747e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.9028307199478149, + "num_tokens": 182211362.0, + "step": 686 + }, + { + "epoch": 0.06863479694290424, + "grad_norm": 0.6685743927955627, + "learning_rate": 9.96125189428333e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.90384441614151, + "num_tokens": 182474962.0, + "step": 687 + }, + { + "epoch": 0.06873470203306858, + "grad_norm": 0.6662142276763916, + "learning_rate": 9.961050606091263e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.9005221128463745, + "num_tokens": 182737786.0, + "step": 688 + }, + { + "epoch": 0.06883460712323293, + "grad_norm": 0.7211425304412842, + "learning_rate": 9.960848798472344e-06, + "loss": 0.511, + "mean_token_accuracy": 0.9054778218269348, + "num_tokens": 183011159.0, + "step": 689 + }, + { + "epoch": 0.06893451221339728, + "grad_norm": 0.8270862698554993, + "learning_rate": 9.960646471447703e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.9032663702964783, + "num_tokens": 183278631.0, + "step": 690 + }, + { + "epoch": 0.06903441730356162, + "grad_norm": 0.6160503029823303, + "learning_rate": 9.960443625038525e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.9024447500705719, + "num_tokens": 183538802.0, + "step": 691 + }, + { + "epoch": 0.06913432239372597, + "grad_norm": 0.7146487832069397, + "learning_rate": 9.960240259266046e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.9035504758358002, + "num_tokens": 183809754.0, + "step": 692 + }, + { + "epoch": 0.0692342274838903, + "grad_norm": 0.6814123392105103, + "learning_rate": 9.960036374151557e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.902966320514679, + "num_tokens": 184080959.0, + "step": 693 + }, + { + "epoch": 0.06933413257405464, + "grad_norm": 0.6792786121368408, + "learning_rate": 9.959831969716412e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.9023387432098389, + "num_tokens": 184347965.0, + "step": 694 + }, + { + "epoch": 0.06943403766421899, + "grad_norm": 0.802288293838501, + "learning_rate": 9.959627045982006e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.9012621641159058, + "num_tokens": 184601002.0, + "step": 695 + }, + { + "epoch": 0.06953394275438333, + "grad_norm": 0.7214195728302002, + "learning_rate": 9.959421602969796e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.9020242393016815, + "num_tokens": 184874961.0, + "step": 696 + }, + { + "epoch": 0.06963384784454768, + "grad_norm": 0.8810122609138489, + "learning_rate": 9.959215640701292e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.9001144170761108, + "num_tokens": 185149925.0, + "step": 697 + }, + { + "epoch": 0.06973375293471203, + "grad_norm": 0.6956961154937744, + "learning_rate": 9.95900915919806e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8998056948184967, + "num_tokens": 185416651.0, + "step": 698 + }, + { + "epoch": 0.06983365802487637, + "grad_norm": 0.9325966835021973, + "learning_rate": 9.958802158481718e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8983359336853027, + "num_tokens": 185697746.0, + "step": 699 + }, + { + "epoch": 0.06993356311504072, + "grad_norm": 0.9080783128738403, + "learning_rate": 9.95859463857394e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.9024254977703094, + "num_tokens": 185964641.0, + "step": 700 + }, + { + "epoch": 0.07003346820520505, + "grad_norm": 0.9987565875053406, + "learning_rate": 9.95838659949645e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8998393714427948, + "num_tokens": 186227017.0, + "step": 701 + }, + { + "epoch": 0.0701333732953694, + "grad_norm": 0.6828138828277588, + "learning_rate": 9.958178041271035e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.9031189382076263, + "num_tokens": 186487218.0, + "step": 702 + }, + { + "epoch": 0.07023327838553374, + "grad_norm": 0.974902331829071, + "learning_rate": 9.957968963919527e-06, + "loss": 0.51, + "mean_token_accuracy": 0.9027196764945984, + "num_tokens": 186746611.0, + "step": 703 + }, + { + "epoch": 0.07033318347569809, + "grad_norm": 0.6161606907844543, + "learning_rate": 9.95775936746382e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.902582436800003, + "num_tokens": 187016274.0, + "step": 704 + }, + { + "epoch": 0.07043308856586243, + "grad_norm": 3.4652626514434814, + "learning_rate": 9.957549251925855e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.9027336537837982, + "num_tokens": 187277093.0, + "step": 705 + }, + { + "epoch": 0.07053299365602678, + "grad_norm": 0.6377342343330383, + "learning_rate": 9.957338617327637e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.9026639759540558, + "num_tokens": 187541174.0, + "step": 706 + }, + { + "epoch": 0.07063289874619112, + "grad_norm": 0.7622672319412231, + "learning_rate": 9.957127463691215e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.9016647934913635, + "num_tokens": 187809358.0, + "step": 707 + }, + { + "epoch": 0.07073280383635547, + "grad_norm": 0.929966390132904, + "learning_rate": 9.956915791038696e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8996190130710602, + "num_tokens": 188075739.0, + "step": 708 + }, + { + "epoch": 0.0708327089265198, + "grad_norm": 0.8005189895629883, + "learning_rate": 9.956703599392246e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.9002282321453094, + "num_tokens": 188331424.0, + "step": 709 + }, + { + "epoch": 0.07093261401668415, + "grad_norm": 0.7948549389839172, + "learning_rate": 9.95649088877408e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.9027948081493378, + "num_tokens": 188604957.0, + "step": 710 + }, + { + "epoch": 0.07103251910684849, + "grad_norm": 0.8834391832351685, + "learning_rate": 9.95627765920647e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.9018129408359528, + "num_tokens": 188871338.0, + "step": 711 + }, + { + "epoch": 0.07113242419701284, + "grad_norm": 0.8550074100494385, + "learning_rate": 9.956063910711739e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.9015047550201416, + "num_tokens": 189131108.0, + "step": 712 + }, + { + "epoch": 0.07123232928717718, + "grad_norm": 0.8056725263595581, + "learning_rate": 9.955849643312272e-06, + "loss": 0.5174, + "mean_token_accuracy": 0.9025611877441406, + "num_tokens": 189398084.0, + "step": 713 + }, + { + "epoch": 0.07133223437734153, + "grad_norm": 0.8979281187057495, + "learning_rate": 9.955634857030495e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.9022387266159058, + "num_tokens": 189667389.0, + "step": 714 + }, + { + "epoch": 0.07143213946750587, + "grad_norm": 0.8521559238433838, + "learning_rate": 9.955419551888903e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8988560438156128, + "num_tokens": 189916717.0, + "step": 715 + }, + { + "epoch": 0.07153204455767022, + "grad_norm": 0.93377286195755, + "learning_rate": 9.955203727910037e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.9016458690166473, + "num_tokens": 190176933.0, + "step": 716 + }, + { + "epoch": 0.07163194964783456, + "grad_norm": 0.7117184996604919, + "learning_rate": 9.95498738511649e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.9029228389263153, + "num_tokens": 190441104.0, + "step": 717 + }, + { + "epoch": 0.0717318547379989, + "grad_norm": 3.794658899307251, + "learning_rate": 9.954770523530918e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.9017704725265503, + "num_tokens": 190705376.0, + "step": 718 + }, + { + "epoch": 0.07183175982816324, + "grad_norm": 0.8854956030845642, + "learning_rate": 9.954553143176026e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8989199995994568, + "num_tokens": 190972660.0, + "step": 719 + }, + { + "epoch": 0.07193166491832759, + "grad_norm": 0.977703332901001, + "learning_rate": 9.954335244074575e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.9018412530422211, + "num_tokens": 191231232.0, + "step": 720 + }, + { + "epoch": 0.07203157000849193, + "grad_norm": 0.8801239132881165, + "learning_rate": 9.954116826249373e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.9021929204463959, + "num_tokens": 191490552.0, + "step": 721 + }, + { + "epoch": 0.07213147509865628, + "grad_norm": 0.7038605809211731, + "learning_rate": 9.953897889723296e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.9022223949432373, + "num_tokens": 191754717.0, + "step": 722 + }, + { + "epoch": 0.07223138018882062, + "grad_norm": 1.448199987411499, + "learning_rate": 9.953678434519265e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.901728093624115, + "num_tokens": 192023364.0, + "step": 723 + }, + { + "epoch": 0.07233128527898497, + "grad_norm": 0.7242077589035034, + "learning_rate": 9.953458460660253e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.9047800600528717, + "num_tokens": 192295535.0, + "step": 724 + }, + { + "epoch": 0.07243119036914931, + "grad_norm": 1.4588963985443115, + "learning_rate": 9.953237968169295e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.9007120430469513, + "num_tokens": 192562203.0, + "step": 725 + }, + { + "epoch": 0.07253109545931365, + "grad_norm": 1.7409404516220093, + "learning_rate": 9.953016957069476e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.9035404622554779, + "num_tokens": 192829492.0, + "step": 726 + }, + { + "epoch": 0.07263100054947799, + "grad_norm": 0.8682035207748413, + "learning_rate": 9.952795427383938e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9033094942569733, + "num_tokens": 193097883.0, + "step": 727 + }, + { + "epoch": 0.07273090563964234, + "grad_norm": 2.6671600341796875, + "learning_rate": 9.952573379135872e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.9038330316543579, + "num_tokens": 193365226.0, + "step": 728 + }, + { + "epoch": 0.07283081072980668, + "grad_norm": 1.0770210027694702, + "learning_rate": 9.95235081234853e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.9006497859954834, + "num_tokens": 193622448.0, + "step": 729 + }, + { + "epoch": 0.07293071581997103, + "grad_norm": 0.9522941708564758, + "learning_rate": 9.95212772704521e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.9029240608215332, + "num_tokens": 193885123.0, + "step": 730 + }, + { + "epoch": 0.07303062091013537, + "grad_norm": 0.6869337558746338, + "learning_rate": 9.951904123249277e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.904687225818634, + "num_tokens": 194158099.0, + "step": 731 + }, + { + "epoch": 0.07313052600029972, + "grad_norm": 1.4533673524856567, + "learning_rate": 9.951680000984136e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.9039488732814789, + "num_tokens": 194423234.0, + "step": 732 + }, + { + "epoch": 0.07323043109046407, + "grad_norm": 0.9476101994514465, + "learning_rate": 9.951455360273255e-06, + "loss": 0.512, + "mean_token_accuracy": 0.901442140340805, + "num_tokens": 194692570.0, + "step": 733 + }, + { + "epoch": 0.0733303361806284, + "grad_norm": 1.0438958406448364, + "learning_rate": 9.951230201140155e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.9025404751300812, + "num_tokens": 194952143.0, + "step": 734 + }, + { + "epoch": 0.07343024127079274, + "grad_norm": 0.7427454590797424, + "learning_rate": 9.951004523608408e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.9053025841712952, + "num_tokens": 195216914.0, + "step": 735 + }, + { + "epoch": 0.07353014636095709, + "grad_norm": 0.8278500437736511, + "learning_rate": 9.950778327701643e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.8995220363140106, + "num_tokens": 195478365.0, + "step": 736 + }, + { + "epoch": 0.07363005145112143, + "grad_norm": 0.915534496307373, + "learning_rate": 9.950551613443546e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.9013346433639526, + "num_tokens": 195752689.0, + "step": 737 + }, + { + "epoch": 0.07372995654128578, + "grad_norm": 0.7691934704780579, + "learning_rate": 9.950324380857852e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.9020831286907196, + "num_tokens": 196020169.0, + "step": 738 + }, + { + "epoch": 0.07382986163145012, + "grad_norm": 0.9549238085746765, + "learning_rate": 9.950096629968353e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.9012324512004852, + "num_tokens": 196278869.0, + "step": 739 + }, + { + "epoch": 0.07392976672161447, + "grad_norm": 1.149106740951538, + "learning_rate": 9.949868360798893e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.9010577201843262, + "num_tokens": 196547352.0, + "step": 740 + }, + { + "epoch": 0.07402967181177882, + "grad_norm": 1.1745717525482178, + "learning_rate": 9.949639573373374e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.9008663594722748, + "num_tokens": 196807980.0, + "step": 741 + }, + { + "epoch": 0.07412957690194315, + "grad_norm": 2.7255194187164307, + "learning_rate": 9.94941026771575e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.8985883295536041, + "num_tokens": 197079886.0, + "step": 742 + }, + { + "epoch": 0.0742294819921075, + "grad_norm": 1.000184178352356, + "learning_rate": 9.949180443850028e-06, + "loss": 0.513, + "mean_token_accuracy": 0.903435081243515, + "num_tokens": 197337282.0, + "step": 743 + }, + { + "epoch": 0.07432938708227184, + "grad_norm": 1.0939234495162964, + "learning_rate": 9.948950101800274e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.9020608961582184, + "num_tokens": 197597962.0, + "step": 744 + }, + { + "epoch": 0.07442929217243618, + "grad_norm": 1.574346899986267, + "learning_rate": 9.948719241590602e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.9002684652805328, + "num_tokens": 197858020.0, + "step": 745 + }, + { + "epoch": 0.07452919726260053, + "grad_norm": 0.9248018264770508, + "learning_rate": 9.948487863245184e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.9044536352157593, + "num_tokens": 198126885.0, + "step": 746 + }, + { + "epoch": 0.07462910235276488, + "grad_norm": 0.8449240326881409, + "learning_rate": 9.948255966788247e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.9060591459274292, + "num_tokens": 198394221.0, + "step": 747 + }, + { + "epoch": 0.07472900744292922, + "grad_norm": 1.1097673177719116, + "learning_rate": 9.948023552244068e-06, + "loss": 0.507, + "mean_token_accuracy": 0.9026699364185333, + "num_tokens": 198660575.0, + "step": 748 + }, + { + "epoch": 0.07482891253309357, + "grad_norm": 1.2334719896316528, + "learning_rate": 9.947790619636984e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.8986608684062958, + "num_tokens": 198929934.0, + "step": 749 + }, + { + "epoch": 0.0749288176232579, + "grad_norm": 1.2098701000213623, + "learning_rate": 9.947557168991383e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9037385582923889, + "num_tokens": 199190576.0, + "step": 750 + }, + { + "epoch": 0.07502872271342224, + "grad_norm": 1.0440243482589722, + "learning_rate": 9.947323200331705e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.9034765064716339, + "num_tokens": 199454745.0, + "step": 751 + }, + { + "epoch": 0.07512862780358659, + "grad_norm": 14.035855293273926, + "learning_rate": 9.947088713682447e-06, + "loss": 0.517, + "mean_token_accuracy": 0.8974648118019104, + "num_tokens": 199723762.0, + "step": 752 + }, + { + "epoch": 0.07522853289375094, + "grad_norm": 1.1131073236465454, + "learning_rate": 9.946853709068163e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.8997080326080322, + "num_tokens": 199986750.0, + "step": 753 + }, + { + "epoch": 0.07532843798391528, + "grad_norm": 0.9192782044410706, + "learning_rate": 9.946618186513455e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.9023108780384064, + "num_tokens": 200251392.0, + "step": 754 + }, + { + "epoch": 0.07542834307407963, + "grad_norm": 3.0128870010375977, + "learning_rate": 9.946382146042986e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.9047132730484009, + "num_tokens": 200516240.0, + "step": 755 + }, + { + "epoch": 0.07552824816424397, + "grad_norm": 0.7508032917976379, + "learning_rate": 9.946145587681467e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.9052073657512665, + "num_tokens": 200780410.0, + "step": 756 + }, + { + "epoch": 0.07562815325440832, + "grad_norm": 0.9548751711845398, + "learning_rate": 9.945908511453663e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.9014316201210022, + "num_tokens": 201042003.0, + "step": 757 + }, + { + "epoch": 0.07572805834457265, + "grad_norm": 0.7120531797409058, + "learning_rate": 9.945670917384404e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.9027997553348541, + "num_tokens": 201305041.0, + "step": 758 + }, + { + "epoch": 0.075827963434737, + "grad_norm": 0.6950450539588928, + "learning_rate": 9.94543280549856e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.903362900018692, + "num_tokens": 201563743.0, + "step": 759 + }, + { + "epoch": 0.07592786852490134, + "grad_norm": 0.8240061402320862, + "learning_rate": 9.945194175821063e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.9041052758693695, + "num_tokens": 201838266.0, + "step": 760 + }, + { + "epoch": 0.07602777361506569, + "grad_norm": 1.0866037607192993, + "learning_rate": 9.944955028376899e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.9019879400730133, + "num_tokens": 202106604.0, + "step": 761 + }, + { + "epoch": 0.07612767870523003, + "grad_norm": 0.7511993050575256, + "learning_rate": 9.944715363191105e-06, + "loss": 0.508, + "mean_token_accuracy": 0.9024275243282318, + "num_tokens": 202376519.0, + "step": 762 + }, + { + "epoch": 0.07622758379539438, + "grad_norm": 0.899050235748291, + "learning_rate": 9.944475180288777e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.902192622423172, + "num_tokens": 202638869.0, + "step": 763 + }, + { + "epoch": 0.07632748888555872, + "grad_norm": 0.7790268063545227, + "learning_rate": 9.944234479695058e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.9056670367717743, + "num_tokens": 202896309.0, + "step": 764 + }, + { + "epoch": 0.07642739397572307, + "grad_norm": 0.7129129767417908, + "learning_rate": 9.943993261435155e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.9053696990013123, + "num_tokens": 203154132.0, + "step": 765 + }, + { + "epoch": 0.07652729906588741, + "grad_norm": 0.8389638066291809, + "learning_rate": 9.94375152553432e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.9017914533615112, + "num_tokens": 203417179.0, + "step": 766 + }, + { + "epoch": 0.07662720415605175, + "grad_norm": 0.7895957827568054, + "learning_rate": 9.943509272017863e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.904278576374054, + "num_tokens": 203675630.0, + "step": 767 + }, + { + "epoch": 0.07672710924621609, + "grad_norm": 1.2281180620193481, + "learning_rate": 9.943266500911152e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.9010506868362427, + "num_tokens": 203947822.0, + "step": 768 + }, + { + "epoch": 0.07682701433638044, + "grad_norm": 0.7298309803009033, + "learning_rate": 9.943023212239601e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.905655562877655, + "num_tokens": 204211090.0, + "step": 769 + }, + { + "epoch": 0.07692691942654478, + "grad_norm": 0.8006685972213745, + "learning_rate": 9.942779406028684e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.9029121696949005, + "num_tokens": 204476066.0, + "step": 770 + }, + { + "epoch": 0.07702682451670913, + "grad_norm": 0.7076317071914673, + "learning_rate": 9.942535082303927e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9029596745967865, + "num_tokens": 204748804.0, + "step": 771 + }, + { + "epoch": 0.07712672960687347, + "grad_norm": 1.0063605308532715, + "learning_rate": 9.942290241090916e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.9024139940738678, + "num_tokens": 205008231.0, + "step": 772 + }, + { + "epoch": 0.07722663469703782, + "grad_norm": 0.7303960919380188, + "learning_rate": 9.942044882415276e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.9054166972637177, + "num_tokens": 205277093.0, + "step": 773 + }, + { + "epoch": 0.07732653978720216, + "grad_norm": 0.7139415740966797, + "learning_rate": 9.941799006302705e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.9020906984806061, + "num_tokens": 205540565.0, + "step": 774 + }, + { + "epoch": 0.0774264448773665, + "grad_norm": 0.7376382350921631, + "learning_rate": 9.941552612778945e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.9068586826324463, + "num_tokens": 205816862.0, + "step": 775 + }, + { + "epoch": 0.07752634996753084, + "grad_norm": 0.8555020093917847, + "learning_rate": 9.941305701869792e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.9056871831417084, + "num_tokens": 206091248.0, + "step": 776 + }, + { + "epoch": 0.07762625505769519, + "grad_norm": 0.7940585017204285, + "learning_rate": 9.941058273601097e-06, + "loss": 0.515, + "mean_token_accuracy": 0.9045888483524323, + "num_tokens": 206348554.0, + "step": 777 + }, + { + "epoch": 0.07772616014785953, + "grad_norm": 0.8815169334411621, + "learning_rate": 9.940810327998768e-06, + "loss": 0.512, + "mean_token_accuracy": 0.9017514586448669, + "num_tokens": 206626980.0, + "step": 778 + }, + { + "epoch": 0.07782606523802388, + "grad_norm": 0.8436416387557983, + "learning_rate": 9.940561865088763e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.9028250575065613, + "num_tokens": 206888082.0, + "step": 779 + }, + { + "epoch": 0.07792597032818822, + "grad_norm": 0.851124107837677, + "learning_rate": 9.940312884897099e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.9040717482566833, + "num_tokens": 207146459.0, + "step": 780 + }, + { + "epoch": 0.07802587541835257, + "grad_norm": 0.9541505575180054, + "learning_rate": 9.940063387449843e-06, + "loss": 0.504, + "mean_token_accuracy": 0.9037564098834991, + "num_tokens": 207405126.0, + "step": 781 + }, + { + "epoch": 0.07812578050851691, + "grad_norm": 0.8188456296920776, + "learning_rate": 9.939813372773117e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.9042535424232483, + "num_tokens": 207669041.0, + "step": 782 + }, + { + "epoch": 0.07822568559868125, + "grad_norm": 1.1649017333984375, + "learning_rate": 9.9395628408931e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9009320735931396, + "num_tokens": 207938371.0, + "step": 783 + }, + { + "epoch": 0.07832559068884559, + "grad_norm": 0.8161141276359558, + "learning_rate": 9.93931179183602e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.9025471210479736, + "num_tokens": 208204170.0, + "step": 784 + }, + { + "epoch": 0.07842549577900994, + "grad_norm": 0.732075572013855, + "learning_rate": 9.939060225628162e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.901853084564209, + "num_tokens": 208473322.0, + "step": 785 + }, + { + "epoch": 0.07852540086917428, + "grad_norm": 1.0845293998718262, + "learning_rate": 9.938808142295871e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.9034782648086548, + "num_tokens": 208735254.0, + "step": 786 + }, + { + "epoch": 0.07862530595933863, + "grad_norm": 0.7761089205741882, + "learning_rate": 9.938555541865533e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.9029609560966492, + "num_tokens": 208998298.0, + "step": 787 + }, + { + "epoch": 0.07872521104950297, + "grad_norm": 1.343428373336792, + "learning_rate": 9.9383024243636e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.9012018442153931, + "num_tokens": 209262017.0, + "step": 788 + }, + { + "epoch": 0.07882511613966732, + "grad_norm": 0.8405594825744629, + "learning_rate": 9.938048789816573e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.9035851061344147, + "num_tokens": 209526884.0, + "step": 789 + }, + { + "epoch": 0.07892502122983167, + "grad_norm": 0.7558645606040955, + "learning_rate": 9.937794638251003e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.9047141969203949, + "num_tokens": 209792714.0, + "step": 790 + }, + { + "epoch": 0.079024926319996, + "grad_norm": 1.1616802215576172, + "learning_rate": 9.937539969693509e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.9049540162086487, + "num_tokens": 210055863.0, + "step": 791 + }, + { + "epoch": 0.07912483141016034, + "grad_norm": 0.9545094966888428, + "learning_rate": 9.937284784170746e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.9030608832836151, + "num_tokens": 210320099.0, + "step": 792 + }, + { + "epoch": 0.07922473650032469, + "grad_norm": 0.6713202595710754, + "learning_rate": 9.937029081709439e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.900375097990036, + "num_tokens": 210591622.0, + "step": 793 + }, + { + "epoch": 0.07932464159048903, + "grad_norm": 0.7591810822486877, + "learning_rate": 9.936772862336357e-06, + "loss": 0.507, + "mean_token_accuracy": 0.9038376808166504, + "num_tokens": 210862455.0, + "step": 794 + }, + { + "epoch": 0.07942454668065338, + "grad_norm": 0.7383404970169067, + "learning_rate": 9.936516126078326e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.9022226929664612, + "num_tokens": 211135628.0, + "step": 795 + }, + { + "epoch": 0.07952445177081773, + "grad_norm": 0.6108208894729614, + "learning_rate": 9.936258872962229e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.9028935432434082, + "num_tokens": 211406286.0, + "step": 796 + }, + { + "epoch": 0.07962435686098207, + "grad_norm": 2.255526304244995, + "learning_rate": 9.936001103014996e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.8996647596359253, + "num_tokens": 211667911.0, + "step": 797 + }, + { + "epoch": 0.07972426195114642, + "grad_norm": 0.6699540019035339, + "learning_rate": 9.935742816263622e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.9017909467220306, + "num_tokens": 211933324.0, + "step": 798 + }, + { + "epoch": 0.07982416704131075, + "grad_norm": 0.7529933452606201, + "learning_rate": 9.935484012735147e-06, + "loss": 0.512, + "mean_token_accuracy": 0.9019443690776825, + "num_tokens": 212209547.0, + "step": 799 + }, + { + "epoch": 0.0799240721314751, + "grad_norm": 0.6975520253181458, + "learning_rate": 9.935224692456665e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.9002538323402405, + "num_tokens": 212482064.0, + "step": 800 + }, + { + "epoch": 0.08002397722163944, + "grad_norm": 0.8915512561798096, + "learning_rate": 9.934964855455332e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.9007699489593506, + "num_tokens": 212748832.0, + "step": 801 + }, + { + "epoch": 0.08012388231180378, + "grad_norm": 0.9737935066223145, + "learning_rate": 9.93470450175835e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.9021221399307251, + "num_tokens": 213006359.0, + "step": 802 + }, + { + "epoch": 0.08022378740196813, + "grad_norm": 0.828292965888977, + "learning_rate": 9.934443631392979e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.9029291570186615, + "num_tokens": 213264688.0, + "step": 803 + }, + { + "epoch": 0.08032369249213248, + "grad_norm": 0.8376177549362183, + "learning_rate": 9.934182244386532e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.9021450877189636, + "num_tokens": 213517283.0, + "step": 804 + }, + { + "epoch": 0.08042359758229682, + "grad_norm": 0.8393986225128174, + "learning_rate": 9.933920340766379e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.9017106890678406, + "num_tokens": 213792094.0, + "step": 805 + }, + { + "epoch": 0.08052350267246117, + "grad_norm": 0.6917092204093933, + "learning_rate": 9.933657920559939e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.9028118252754211, + "num_tokens": 214064912.0, + "step": 806 + }, + { + "epoch": 0.08062340776262551, + "grad_norm": 0.8510833978652954, + "learning_rate": 9.933394983794688e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8997218310832977, + "num_tokens": 214334725.0, + "step": 807 + }, + { + "epoch": 0.08072331285278984, + "grad_norm": 1.323326826095581, + "learning_rate": 9.933131530498157e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.905322939157486, + "num_tokens": 214602498.0, + "step": 808 + }, + { + "epoch": 0.08082321794295419, + "grad_norm": 0.7687459588050842, + "learning_rate": 9.93286756069793e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.9002527594566345, + "num_tokens": 214868639.0, + "step": 809 + }, + { + "epoch": 0.08092312303311854, + "grad_norm": 0.7793306708335876, + "learning_rate": 9.93260307442164e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.9038735330104828, + "num_tokens": 215128633.0, + "step": 810 + }, + { + "epoch": 0.08102302812328288, + "grad_norm": 0.8951417803764343, + "learning_rate": 9.932338071696986e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8994477093219757, + "num_tokens": 215395316.0, + "step": 811 + }, + { + "epoch": 0.08112293321344723, + "grad_norm": 0.8378719091415405, + "learning_rate": 9.93207255255171e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.9051957428455353, + "num_tokens": 215658691.0, + "step": 812 + }, + { + "epoch": 0.08122283830361157, + "grad_norm": 1.2899501323699951, + "learning_rate": 9.931806517013612e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.9072140455245972, + "num_tokens": 215915819.0, + "step": 813 + }, + { + "epoch": 0.08132274339377592, + "grad_norm": 0.7126391530036926, + "learning_rate": 9.931539965110548e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.9033026695251465, + "num_tokens": 216178187.0, + "step": 814 + }, + { + "epoch": 0.08142264848394026, + "grad_norm": 0.6188310384750366, + "learning_rate": 9.931272896870427e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9019787907600403, + "num_tokens": 216443083.0, + "step": 815 + }, + { + "epoch": 0.0815225535741046, + "grad_norm": 0.7405858635902405, + "learning_rate": 9.931005312321208e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.9013383984565735, + "num_tokens": 216716150.0, + "step": 816 + }, + { + "epoch": 0.08162245866426894, + "grad_norm": 0.6273868083953857, + "learning_rate": 9.930737211490909e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.9062425196170807, + "num_tokens": 216965989.0, + "step": 817 + }, + { + "epoch": 0.08172236375443329, + "grad_norm": 0.918261706829071, + "learning_rate": 9.9304685944076e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.9054177701473236, + "num_tokens": 217226295.0, + "step": 818 + }, + { + "epoch": 0.08182226884459763, + "grad_norm": 0.7217230796813965, + "learning_rate": 9.930199461099406e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.9043220579624176, + "num_tokens": 217484288.0, + "step": 819 + }, + { + "epoch": 0.08192217393476198, + "grad_norm": 0.6550894379615784, + "learning_rate": 9.929929811594507e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.902253270149231, + "num_tokens": 217753149.0, + "step": 820 + }, + { + "epoch": 0.08202207902492632, + "grad_norm": 0.9111142158508301, + "learning_rate": 9.929659645921132e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.9045349657535553, + "num_tokens": 218025356.0, + "step": 821 + }, + { + "epoch": 0.08212198411509067, + "grad_norm": 0.7978716492652893, + "learning_rate": 9.929388964107572e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.9017807841300964, + "num_tokens": 218305473.0, + "step": 822 + }, + { + "epoch": 0.08222188920525501, + "grad_norm": 1.038499355316162, + "learning_rate": 9.929117766182164e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.9012724757194519, + "num_tokens": 218568608.0, + "step": 823 + }, + { + "epoch": 0.08232179429541935, + "grad_norm": 0.7279272675514221, + "learning_rate": 9.928846052173302e-06, + "loss": 0.513, + "mean_token_accuracy": 0.9013416171073914, + "num_tokens": 218839753.0, + "step": 824 + }, + { + "epoch": 0.08242169938558369, + "grad_norm": 0.7265940308570862, + "learning_rate": 9.92857382210944e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.9042467772960663, + "num_tokens": 219115268.0, + "step": 825 + }, + { + "epoch": 0.08252160447574804, + "grad_norm": 0.8186826705932617, + "learning_rate": 9.928301076019076e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8992559611797333, + "num_tokens": 219380901.0, + "step": 826 + }, + { + "epoch": 0.08262150956591238, + "grad_norm": 0.747736930847168, + "learning_rate": 9.928027813930769e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.9022840857505798, + "num_tokens": 219639253.0, + "step": 827 + }, + { + "epoch": 0.08272141465607673, + "grad_norm": 0.9603944420814514, + "learning_rate": 9.927754035873127e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.9031360745429993, + "num_tokens": 219905495.0, + "step": 828 + }, + { + "epoch": 0.08282131974624107, + "grad_norm": 0.8549366593360901, + "learning_rate": 9.927479741874819e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.9050015211105347, + "num_tokens": 220178831.0, + "step": 829 + }, + { + "epoch": 0.08292122483640542, + "grad_norm": 0.7528232932090759, + "learning_rate": 9.927204931964561e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9049951434135437, + "num_tokens": 220440224.0, + "step": 830 + }, + { + "epoch": 0.08302112992656976, + "grad_norm": 0.8833768367767334, + "learning_rate": 9.926929606171127e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8990103900432587, + "num_tokens": 220700069.0, + "step": 831 + }, + { + "epoch": 0.0831210350167341, + "grad_norm": 1.083404302597046, + "learning_rate": 9.926653764523343e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.9047702848911285, + "num_tokens": 220965247.0, + "step": 832 + }, + { + "epoch": 0.08322094010689844, + "grad_norm": 0.7962873578071594, + "learning_rate": 9.92637740705009e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.9015565812587738, + "num_tokens": 221229456.0, + "step": 833 + }, + { + "epoch": 0.08332084519706279, + "grad_norm": 0.9178759455680847, + "learning_rate": 9.926100533780304e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.9031528830528259, + "num_tokens": 221507343.0, + "step": 834 + }, + { + "epoch": 0.08342075028722713, + "grad_norm": 0.7690277099609375, + "learning_rate": 9.925823144742972e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.9016652405261993, + "num_tokens": 221756594.0, + "step": 835 + }, + { + "epoch": 0.08352065537739148, + "grad_norm": 0.6996787786483765, + "learning_rate": 9.925545239967141e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.9027078747749329, + "num_tokens": 222015097.0, + "step": 836 + }, + { + "epoch": 0.08362056046755582, + "grad_norm": 0.7167554497718811, + "learning_rate": 9.925266819481903e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.9002484977245331, + "num_tokens": 222282030.0, + "step": 837 + }, + { + "epoch": 0.08372046555772017, + "grad_norm": 0.6754700541496277, + "learning_rate": 9.92498788331641e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.9073294401168823, + "num_tokens": 222549076.0, + "step": 838 + }, + { + "epoch": 0.08382037064788452, + "grad_norm": 0.8683072924613953, + "learning_rate": 9.924708431499868e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.9035294651985168, + "num_tokens": 222814234.0, + "step": 839 + }, + { + "epoch": 0.08392027573804885, + "grad_norm": 0.7274170517921448, + "learning_rate": 9.924428464061536e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.905150979757309, + "num_tokens": 223074032.0, + "step": 840 + }, + { + "epoch": 0.08402018082821319, + "grad_norm": 0.7649213075637817, + "learning_rate": 9.924147981030728e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.898000031709671, + "num_tokens": 223334346.0, + "step": 841 + }, + { + "epoch": 0.08412008591837754, + "grad_norm": 0.8636115789413452, + "learning_rate": 9.923866982436807e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9062251448631287, + "num_tokens": 223603389.0, + "step": 842 + }, + { + "epoch": 0.08421999100854188, + "grad_norm": 0.8978091478347778, + "learning_rate": 9.923585468309197e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.9030169546604156, + "num_tokens": 223861253.0, + "step": 843 + }, + { + "epoch": 0.08431989609870623, + "grad_norm": 1.469775676727295, + "learning_rate": 9.923303438677373e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.9008837938308716, + "num_tokens": 224116664.0, + "step": 844 + }, + { + "epoch": 0.08441980118887057, + "grad_norm": 1.096786379814148, + "learning_rate": 9.923020893570861e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.9028151333332062, + "num_tokens": 224380156.0, + "step": 845 + }, + { + "epoch": 0.08451970627903492, + "grad_norm": 2.4234812259674072, + "learning_rate": 9.922737833019247e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.9055119156837463, + "num_tokens": 224646887.0, + "step": 846 + }, + { + "epoch": 0.08461961136919927, + "grad_norm": 1.026613473892212, + "learning_rate": 9.922454257052166e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.9017521142959595, + "num_tokens": 224899089.0, + "step": 847 + }, + { + "epoch": 0.08471951645936361, + "grad_norm": 0.8403490781784058, + "learning_rate": 9.922170165699307e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.8981121182441711, + "num_tokens": 225168782.0, + "step": 848 + }, + { + "epoch": 0.08481942154952794, + "grad_norm": 1.2272788286209106, + "learning_rate": 9.921885558990418e-06, + "loss": 0.515, + "mean_token_accuracy": 0.9017587304115295, + "num_tokens": 225443112.0, + "step": 849 + }, + { + "epoch": 0.08491932663969229, + "grad_norm": 0.9568389654159546, + "learning_rate": 9.921600436955297e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.9034649729728699, + "num_tokens": 225704342.0, + "step": 850 + }, + { + "epoch": 0.08501923172985663, + "grad_norm": 2.10701584815979, + "learning_rate": 9.921314799623796e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.9041014015674591, + "num_tokens": 225968080.0, + "step": 851 + }, + { + "epoch": 0.08511913682002098, + "grad_norm": 1.2219927310943604, + "learning_rate": 9.921028647025819e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.9021754860877991, + "num_tokens": 226228690.0, + "step": 852 + }, + { + "epoch": 0.08521904191018533, + "grad_norm": 0.8183008432388306, + "learning_rate": 9.92074197919133e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.9018539190292358, + "num_tokens": 226492621.0, + "step": 853 + }, + { + "epoch": 0.08531894700034967, + "grad_norm": 0.9297601580619812, + "learning_rate": 9.920454796150342e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.9018206894397736, + "num_tokens": 226760384.0, + "step": 854 + }, + { + "epoch": 0.08541885209051402, + "grad_norm": 2.2403316497802734, + "learning_rate": 9.920167097932923e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.9022210538387299, + "num_tokens": 227030054.0, + "step": 855 + }, + { + "epoch": 0.08551875718067836, + "grad_norm": 0.7399396300315857, + "learning_rate": 9.919878884569197e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.9032800793647766, + "num_tokens": 227298835.0, + "step": 856 + }, + { + "epoch": 0.0856186622708427, + "grad_norm": 0.7822316884994507, + "learning_rate": 9.919590156089338e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.9042707979679108, + "num_tokens": 227568550.0, + "step": 857 + }, + { + "epoch": 0.08571856736100704, + "grad_norm": 0.7993769645690918, + "learning_rate": 9.919300912523576e-06, + "loss": 0.515, + "mean_token_accuracy": 0.9031614661216736, + "num_tokens": 227830317.0, + "step": 858 + }, + { + "epoch": 0.08581847245117138, + "grad_norm": 1.1921783685684204, + "learning_rate": 9.919011153902196e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.9032695591449738, + "num_tokens": 228094361.0, + "step": 859 + }, + { + "epoch": 0.08591837754133573, + "grad_norm": 0.7775762677192688, + "learning_rate": 9.91872088025554e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.902563065290451, + "num_tokens": 228354105.0, + "step": 860 + }, + { + "epoch": 0.08601828263150008, + "grad_norm": 1.7056325674057007, + "learning_rate": 9.918430091613993e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.9049447774887085, + "num_tokens": 228621960.0, + "step": 861 + }, + { + "epoch": 0.08611818772166442, + "grad_norm": 0.8274292945861816, + "learning_rate": 9.918138788008003e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.9007602035999298, + "num_tokens": 228881551.0, + "step": 862 + }, + { + "epoch": 0.08621809281182877, + "grad_norm": 0.7494775056838989, + "learning_rate": 9.917846969468073e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.900622695684433, + "num_tokens": 229149022.0, + "step": 863 + }, + { + "epoch": 0.08631799790199311, + "grad_norm": 0.917517364025116, + "learning_rate": 9.917554636024754e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.9002282917499542, + "num_tokens": 229417753.0, + "step": 864 + }, + { + "epoch": 0.08641790299215744, + "grad_norm": 0.7882062196731567, + "learning_rate": 9.917261787708653e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.9026840627193451, + "num_tokens": 229683960.0, + "step": 865 + }, + { + "epoch": 0.08651780808232179, + "grad_norm": 0.8003268837928772, + "learning_rate": 9.916968424550432e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.9032711684703827, + "num_tokens": 229952332.0, + "step": 866 + }, + { + "epoch": 0.08661771317248614, + "grad_norm": 0.9046820998191833, + "learning_rate": 9.91667454658081e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.9065326452255249, + "num_tokens": 230221613.0, + "step": 867 + }, + { + "epoch": 0.08671761826265048, + "grad_norm": 0.8809050917625427, + "learning_rate": 9.916380153830549e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.9028875231742859, + "num_tokens": 230482598.0, + "step": 868 + }, + { + "epoch": 0.08681752335281483, + "grad_norm": 0.6560028195381165, + "learning_rate": 9.91608524633048e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.9007614254951477, + "num_tokens": 230752717.0, + "step": 869 + }, + { + "epoch": 0.08691742844297917, + "grad_norm": 0.5888882279396057, + "learning_rate": 9.915789824111474e-06, + "loss": 0.505, + "mean_token_accuracy": 0.9033994376659393, + "num_tokens": 231018733.0, + "step": 870 + }, + { + "epoch": 0.08701733353314352, + "grad_norm": 0.6918001174926758, + "learning_rate": 9.915493887204467e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.9012524485588074, + "num_tokens": 231279320.0, + "step": 871 + }, + { + "epoch": 0.08711723862330786, + "grad_norm": 0.7681354880332947, + "learning_rate": 9.91519743564044e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.9024344682693481, + "num_tokens": 231548311.0, + "step": 872 + }, + { + "epoch": 0.0872171437134722, + "grad_norm": 0.6726807355880737, + "learning_rate": 9.914900469450434e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9042438268661499, + "num_tokens": 231818654.0, + "step": 873 + }, + { + "epoch": 0.08731704880363654, + "grad_norm": 1.0226298570632935, + "learning_rate": 9.91460298866554e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.9030193090438843, + "num_tokens": 232088085.0, + "step": 874 + }, + { + "epoch": 0.08741695389380089, + "grad_norm": 0.757305383682251, + "learning_rate": 9.914304993316906e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.9018208086490631, + "num_tokens": 232354560.0, + "step": 875 + }, + { + "epoch": 0.08751685898396523, + "grad_norm": 0.8699764013290405, + "learning_rate": 9.914006483435732e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.9056728482246399, + "num_tokens": 232618186.0, + "step": 876 + }, + { + "epoch": 0.08761676407412958, + "grad_norm": 0.9012262225151062, + "learning_rate": 9.91370745905327e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.9020533859729767, + "num_tokens": 232873322.0, + "step": 877 + }, + { + "epoch": 0.08771666916429392, + "grad_norm": 0.5758890509605408, + "learning_rate": 9.913407920200832e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.9028805196285248, + "num_tokens": 233136338.0, + "step": 878 + }, + { + "epoch": 0.08781657425445827, + "grad_norm": 0.6829975247383118, + "learning_rate": 9.913107866909779e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.9046681821346283, + "num_tokens": 233398066.0, + "step": 879 + }, + { + "epoch": 0.08791647934462261, + "grad_norm": 0.6201022863388062, + "learning_rate": 9.912807299211524e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.9030256271362305, + "num_tokens": 233660933.0, + "step": 880 + }, + { + "epoch": 0.08801638443478695, + "grad_norm": 0.6833122968673706, + "learning_rate": 9.912506217137542e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.9026020765304565, + "num_tokens": 233919654.0, + "step": 881 + }, + { + "epoch": 0.08811628952495129, + "grad_norm": 0.8196830749511719, + "learning_rate": 9.91220462071935e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.9059647619724274, + "num_tokens": 234185821.0, + "step": 882 + }, + { + "epoch": 0.08821619461511564, + "grad_norm": 0.7247741222381592, + "learning_rate": 9.91190250998853e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.9008136093616486, + "num_tokens": 234451207.0, + "step": 883 + }, + { + "epoch": 0.08831609970527998, + "grad_norm": 0.7586658000946045, + "learning_rate": 9.911599884976712e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.905479222536087, + "num_tokens": 234725700.0, + "step": 884 + }, + { + "epoch": 0.08841600479544433, + "grad_norm": 1.2888108491897583, + "learning_rate": 9.911296745715583e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.9026051759719849, + "num_tokens": 234991240.0, + "step": 885 + }, + { + "epoch": 0.08851590988560867, + "grad_norm": 0.9445307850837708, + "learning_rate": 9.910993092236878e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.9013016819953918, + "num_tokens": 235258018.0, + "step": 886 + }, + { + "epoch": 0.08861581497577302, + "grad_norm": 0.9139958620071411, + "learning_rate": 9.910688924572392e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9024034738540649, + "num_tokens": 235523600.0, + "step": 887 + }, + { + "epoch": 0.08871572006593736, + "grad_norm": 0.8612871766090393, + "learning_rate": 9.910384242753973e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.9002180993556976, + "num_tokens": 235788948.0, + "step": 888 + }, + { + "epoch": 0.08881562515610171, + "grad_norm": 0.8679602146148682, + "learning_rate": 9.910079046813522e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.9008519947528839, + "num_tokens": 236055947.0, + "step": 889 + }, + { + "epoch": 0.08891553024626604, + "grad_norm": 1.1930643320083618, + "learning_rate": 9.909773336782987e-06, + "loss": 0.514, + "mean_token_accuracy": 0.9034154117107391, + "num_tokens": 236319751.0, + "step": 890 + }, + { + "epoch": 0.08901543533643039, + "grad_norm": 0.7094569802284241, + "learning_rate": 9.909467112694385e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.9006885886192322, + "num_tokens": 236592489.0, + "step": 891 + }, + { + "epoch": 0.08911534042659473, + "grad_norm": 0.7865568399429321, + "learning_rate": 9.90916037457977e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.902832955121994, + "num_tokens": 236857324.0, + "step": 892 + }, + { + "epoch": 0.08921524551675908, + "grad_norm": 1.040107011795044, + "learning_rate": 9.908853122471263e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.9040261507034302, + "num_tokens": 237130747.0, + "step": 893 + }, + { + "epoch": 0.08931515060692342, + "grad_norm": 0.7130714654922485, + "learning_rate": 9.908545356401032e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.9033016264438629, + "num_tokens": 237401013.0, + "step": 894 + }, + { + "epoch": 0.08941505569708777, + "grad_norm": 0.8372796177864075, + "learning_rate": 9.908237076401302e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.9034537076950073, + "num_tokens": 237661813.0, + "step": 895 + }, + { + "epoch": 0.08951496078725212, + "grad_norm": 0.6904829740524292, + "learning_rate": 9.907928282504347e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9046481251716614, + "num_tokens": 237917801.0, + "step": 896 + }, + { + "epoch": 0.08961486587741646, + "grad_norm": 0.7274646162986755, + "learning_rate": 9.907618974742499e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.9048393964767456, + "num_tokens": 238182001.0, + "step": 897 + }, + { + "epoch": 0.08971477096758079, + "grad_norm": 0.6989085674285889, + "learning_rate": 9.907309153148143e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.9035319089889526, + "num_tokens": 238451047.0, + "step": 898 + }, + { + "epoch": 0.08981467605774514, + "grad_norm": 1.1651198863983154, + "learning_rate": 9.90699881775372e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.9034185111522675, + "num_tokens": 238714785.0, + "step": 899 + }, + { + "epoch": 0.08991458114790948, + "grad_norm": 1.6997501850128174, + "learning_rate": 9.90668796859172e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.9000175893306732, + "num_tokens": 238975769.0, + "step": 900 + }, + { + "epoch": 0.09001448623807383, + "grad_norm": 0.6648746132850647, + "learning_rate": 9.90637660569469e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.9026800692081451, + "num_tokens": 239242528.0, + "step": 901 + }, + { + "epoch": 0.09011439132823817, + "grad_norm": 0.627750813961029, + "learning_rate": 9.906064729095229e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.9053917825222015, + "num_tokens": 239515158.0, + "step": 902 + }, + { + "epoch": 0.09021429641840252, + "grad_norm": 0.956549346446991, + "learning_rate": 9.90575233882599e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.9031286835670471, + "num_tokens": 239779974.0, + "step": 903 + }, + { + "epoch": 0.09031420150856687, + "grad_norm": 0.6238885521888733, + "learning_rate": 9.905439434919685e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.9030863046646118, + "num_tokens": 240050719.0, + "step": 904 + }, + { + "epoch": 0.09041410659873121, + "grad_norm": 1.4072372913360596, + "learning_rate": 9.905126017409072e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9044866561889648, + "num_tokens": 240313484.0, + "step": 905 + }, + { + "epoch": 0.09051401168889554, + "grad_norm": 0.8497151732444763, + "learning_rate": 9.904812086326965e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.9026490449905396, + "num_tokens": 240579545.0, + "step": 906 + }, + { + "epoch": 0.09061391677905989, + "grad_norm": 0.7048166394233704, + "learning_rate": 9.904497641706237e-06, + "loss": 0.515, + "mean_token_accuracy": 0.9043566286563873, + "num_tokens": 240847218.0, + "step": 907 + }, + { + "epoch": 0.09071382186922423, + "grad_norm": 0.6301460862159729, + "learning_rate": 9.904182683579807e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.9044999778270721, + "num_tokens": 241115538.0, + "step": 908 + }, + { + "epoch": 0.09081372695938858, + "grad_norm": 0.6210088133811951, + "learning_rate": 9.90386721198065e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.905032753944397, + "num_tokens": 241375864.0, + "step": 909 + }, + { + "epoch": 0.09091363204955293, + "grad_norm": 0.6533534526824951, + "learning_rate": 9.903551226941801e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.9041151702404022, + "num_tokens": 241648620.0, + "step": 910 + }, + { + "epoch": 0.09101353713971727, + "grad_norm": 0.7841688394546509, + "learning_rate": 9.903234728496341e-06, + "loss": 0.502, + "mean_token_accuracy": 0.905244380235672, + "num_tokens": 241916846.0, + "step": 911 + }, + { + "epoch": 0.09111344222988162, + "grad_norm": 0.6479050517082214, + "learning_rate": 9.902917716677409e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.9018596112728119, + "num_tokens": 242187742.0, + "step": 912 + }, + { + "epoch": 0.09121334732004596, + "grad_norm": 0.7153990864753723, + "learning_rate": 9.902600191518196e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9024521708488464, + "num_tokens": 242460620.0, + "step": 913 + }, + { + "epoch": 0.0913132524102103, + "grad_norm": 1.4957900047302246, + "learning_rate": 9.902282153051946e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.9029166400432587, + "num_tokens": 242728065.0, + "step": 914 + }, + { + "epoch": 0.09141315750037464, + "grad_norm": 0.7120001316070557, + "learning_rate": 9.901963601311959e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8996756374835968, + "num_tokens": 242996507.0, + "step": 915 + }, + { + "epoch": 0.09151306259053898, + "grad_norm": 0.7160025835037231, + "learning_rate": 9.901644536331588e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9046156108379364, + "num_tokens": 243269366.0, + "step": 916 + }, + { + "epoch": 0.09161296768070333, + "grad_norm": 0.8013163805007935, + "learning_rate": 9.90132495814424e-06, + "loss": 0.514, + "mean_token_accuracy": 0.9008232951164246, + "num_tokens": 243522210.0, + "step": 917 + }, + { + "epoch": 0.09171287277086768, + "grad_norm": 0.7534878253936768, + "learning_rate": 9.901004866783372e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.9030795693397522, + "num_tokens": 243784505.0, + "step": 918 + }, + { + "epoch": 0.09181277786103202, + "grad_norm": 0.6801106333732605, + "learning_rate": 9.900684262282501e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.8980703353881836, + "num_tokens": 244044565.0, + "step": 919 + }, + { + "epoch": 0.09191268295119637, + "grad_norm": 0.5518785119056702, + "learning_rate": 9.900363144675194e-06, + "loss": 0.504, + "mean_token_accuracy": 0.9056805372238159, + "num_tokens": 244308166.0, + "step": 920 + }, + { + "epoch": 0.09201258804136071, + "grad_norm": 1.4680837392807007, + "learning_rate": 9.900041513995072e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.9041651785373688, + "num_tokens": 244571990.0, + "step": 921 + }, + { + "epoch": 0.09211249313152504, + "grad_norm": 1.0702910423278809, + "learning_rate": 9.89971937027581e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.9047485888004303, + "num_tokens": 244834852.0, + "step": 922 + }, + { + "epoch": 0.09221239822168939, + "grad_norm": 0.6226290464401245, + "learning_rate": 9.899396713551137e-06, + "loss": 0.506, + "mean_token_accuracy": 0.9036352038383484, + "num_tokens": 245094899.0, + "step": 923 + }, + { + "epoch": 0.09231230331185374, + "grad_norm": 0.6414684057235718, + "learning_rate": 9.899073543854833e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.9041974544525146, + "num_tokens": 245349158.0, + "step": 924 + }, + { + "epoch": 0.09241220840201808, + "grad_norm": 1.1307569742202759, + "learning_rate": 9.89874986122074e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8989608287811279, + "num_tokens": 245614848.0, + "step": 925 + }, + { + "epoch": 0.09251211349218243, + "grad_norm": 0.6928567886352539, + "learning_rate": 9.898425665682743e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.900767058134079, + "num_tokens": 245880727.0, + "step": 926 + }, + { + "epoch": 0.09261201858234677, + "grad_norm": 0.7067873477935791, + "learning_rate": 9.898100957274786e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.9028249979019165, + "num_tokens": 246146105.0, + "step": 927 + }, + { + "epoch": 0.09271192367251112, + "grad_norm": 0.6187695860862732, + "learning_rate": 9.897775736030867e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9052915871143341, + "num_tokens": 246409532.0, + "step": 928 + }, + { + "epoch": 0.09281182876267546, + "grad_norm": 0.662583589553833, + "learning_rate": 9.897450001985038e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.9020529091358185, + "num_tokens": 246675893.0, + "step": 929 + }, + { + "epoch": 0.09291173385283981, + "grad_norm": 0.5789754390716553, + "learning_rate": 9.897123755171403e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.9042350351810455, + "num_tokens": 246938284.0, + "step": 930 + }, + { + "epoch": 0.09301163894300414, + "grad_norm": 0.6504311561584473, + "learning_rate": 9.896796995624121e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.9047641754150391, + "num_tokens": 247199656.0, + "step": 931 + }, + { + "epoch": 0.09311154403316849, + "grad_norm": 0.7722715735435486, + "learning_rate": 9.896469723377402e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.901339054107666, + "num_tokens": 247473824.0, + "step": 932 + }, + { + "epoch": 0.09321144912333283, + "grad_norm": 0.8340456485748291, + "learning_rate": 9.896141938465513e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.9046193361282349, + "num_tokens": 247735154.0, + "step": 933 + }, + { + "epoch": 0.09331135421349718, + "grad_norm": 0.7578312754631042, + "learning_rate": 9.895813640922773e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.9053567349910736, + "num_tokens": 247995299.0, + "step": 934 + }, + { + "epoch": 0.09341125930366152, + "grad_norm": 0.8422245979309082, + "learning_rate": 9.895484830783557e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9027031362056732, + "num_tokens": 248261902.0, + "step": 935 + }, + { + "epoch": 0.09351116439382587, + "grad_norm": 0.8142735958099365, + "learning_rate": 9.89515550808229e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9007234573364258, + "num_tokens": 248538119.0, + "step": 936 + }, + { + "epoch": 0.09361106948399021, + "grad_norm": 0.8387207984924316, + "learning_rate": 9.894825672853451e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.9004194140434265, + "num_tokens": 248798190.0, + "step": 937 + }, + { + "epoch": 0.09371097457415456, + "grad_norm": 0.7525430917739868, + "learning_rate": 9.894495325131577e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.9022934436798096, + "num_tokens": 249068775.0, + "step": 938 + }, + { + "epoch": 0.09381087966431889, + "grad_norm": 0.6841181516647339, + "learning_rate": 9.894164464951254e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9048729836940765, + "num_tokens": 249340392.0, + "step": 939 + }, + { + "epoch": 0.09391078475448324, + "grad_norm": 0.8101212382316589, + "learning_rate": 9.893833092347125e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.9050234854221344, + "num_tokens": 249596769.0, + "step": 940 + }, + { + "epoch": 0.09401068984464758, + "grad_norm": 0.7530659437179565, + "learning_rate": 9.893501207353883e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.9023544788360596, + "num_tokens": 249852776.0, + "step": 941 + }, + { + "epoch": 0.09411059493481193, + "grad_norm": 0.8962984085083008, + "learning_rate": 9.893168810006277e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.9022932052612305, + "num_tokens": 250124079.0, + "step": 942 + }, + { + "epoch": 0.09421050002497627, + "grad_norm": 0.9324583411216736, + "learning_rate": 9.892835900339111e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.9019225239753723, + "num_tokens": 250387078.0, + "step": 943 + }, + { + "epoch": 0.09431040511514062, + "grad_norm": 0.6846917867660522, + "learning_rate": 9.892502478387239e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.9026526212692261, + "num_tokens": 250657490.0, + "step": 944 + }, + { + "epoch": 0.09441031020530496, + "grad_norm": 0.5574573278427124, + "learning_rate": 9.89216854418557e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9047479331493378, + "num_tokens": 250922386.0, + "step": 945 + }, + { + "epoch": 0.09451021529546931, + "grad_norm": 1.1752736568450928, + "learning_rate": 9.891834097769071e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.9027358889579773, + "num_tokens": 251181920.0, + "step": 946 + }, + { + "epoch": 0.09461012038563364, + "grad_norm": 0.7024989724159241, + "learning_rate": 9.891499139172755e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9093572199344635, + "num_tokens": 251456280.0, + "step": 947 + }, + { + "epoch": 0.09471002547579799, + "grad_norm": 0.6534249186515808, + "learning_rate": 9.891163668431696e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9016695916652679, + "num_tokens": 251719458.0, + "step": 948 + }, + { + "epoch": 0.09480993056596233, + "grad_norm": 0.8706385493278503, + "learning_rate": 9.890827685581014e-06, + "loss": 0.506, + "mean_token_accuracy": 0.9055192768573761, + "num_tokens": 251977533.0, + "step": 949 + }, + { + "epoch": 0.09490983565612668, + "grad_norm": 0.6492956280708313, + "learning_rate": 9.890491190655892e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9049142003059387, + "num_tokens": 252252085.0, + "step": 950 + }, + { + "epoch": 0.09500974074629102, + "grad_norm": 0.7916878461837769, + "learning_rate": 9.890154183691554e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.904685765504837, + "num_tokens": 252521687.0, + "step": 951 + }, + { + "epoch": 0.09510964583645537, + "grad_norm": 0.6813972592353821, + "learning_rate": 9.88981666472329e-06, + "loss": 0.508, + "mean_token_accuracy": 0.9029503762722015, + "num_tokens": 252790338.0, + "step": 952 + }, + { + "epoch": 0.09520955092661972, + "grad_norm": 0.8166825175285339, + "learning_rate": 9.88947863378644e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.9025663435459137, + "num_tokens": 253048564.0, + "step": 953 + }, + { + "epoch": 0.09530945601678406, + "grad_norm": 0.715034544467926, + "learning_rate": 9.889140090916394e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.9027352035045624, + "num_tokens": 253311781.0, + "step": 954 + }, + { + "epoch": 0.09540936110694839, + "grad_norm": 0.7824719548225403, + "learning_rate": 9.888801036148597e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.9019782245159149, + "num_tokens": 253583712.0, + "step": 955 + }, + { + "epoch": 0.09550926619711274, + "grad_norm": 0.6620200276374817, + "learning_rate": 9.888461469518547e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.9016610085964203, + "num_tokens": 253853377.0, + "step": 956 + }, + { + "epoch": 0.09560917128727708, + "grad_norm": 0.6099717617034912, + "learning_rate": 9.8881213910618e-06, + "loss": 0.508, + "mean_token_accuracy": 0.9020754992961884, + "num_tokens": 254124829.0, + "step": 957 + }, + { + "epoch": 0.09570907637744143, + "grad_norm": 0.8364931344985962, + "learning_rate": 9.887780800813963e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.9028995931148529, + "num_tokens": 254387992.0, + "step": 958 + }, + { + "epoch": 0.09580898146760577, + "grad_norm": 0.6552438139915466, + "learning_rate": 9.887439698810694e-06, + "loss": 0.515, + "mean_token_accuracy": 0.9003078043460846, + "num_tokens": 254656592.0, + "step": 959 + }, + { + "epoch": 0.09590888655777012, + "grad_norm": 0.6839868426322937, + "learning_rate": 9.887098085087707e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.9020451009273529, + "num_tokens": 254922999.0, + "step": 960 + }, + { + "epoch": 0.09600879164793447, + "grad_norm": 0.8898309469223022, + "learning_rate": 9.886755959680769e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.9026889204978943, + "num_tokens": 255196319.0, + "step": 961 + }, + { + "epoch": 0.09610869673809881, + "grad_norm": 0.7177220582962036, + "learning_rate": 9.886413322625703e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.8989916741847992, + "num_tokens": 255459609.0, + "step": 962 + }, + { + "epoch": 0.09620860182826314, + "grad_norm": 0.6837481260299683, + "learning_rate": 9.886070173958382e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.9020383954048157, + "num_tokens": 255725952.0, + "step": 963 + }, + { + "epoch": 0.09630850691842749, + "grad_norm": 0.5521579384803772, + "learning_rate": 9.885726513714732e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9024404287338257, + "num_tokens": 255990048.0, + "step": 964 + }, + { + "epoch": 0.09640841200859183, + "grad_norm": 0.5780631899833679, + "learning_rate": 9.885382341930739e-06, + "loss": 0.51, + "mean_token_accuracy": 0.9040232300758362, + "num_tokens": 256260727.0, + "step": 965 + }, + { + "epoch": 0.09650831709875618, + "grad_norm": 1.031743049621582, + "learning_rate": 9.885037658642436e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.903764009475708, + "num_tokens": 256516528.0, + "step": 966 + }, + { + "epoch": 0.09660822218892053, + "grad_norm": 0.5722321271896362, + "learning_rate": 9.88469246388591e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.9017018675804138, + "num_tokens": 256775890.0, + "step": 967 + }, + { + "epoch": 0.09670812727908487, + "grad_norm": 0.6246575713157654, + "learning_rate": 9.884346757697304e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.9032845199108124, + "num_tokens": 257032724.0, + "step": 968 + }, + { + "epoch": 0.09680803236924922, + "grad_norm": 0.6804587244987488, + "learning_rate": 9.884000540112814e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.9012353718280792, + "num_tokens": 257294760.0, + "step": 969 + }, + { + "epoch": 0.09690793745941356, + "grad_norm": 0.621557891368866, + "learning_rate": 9.883653811168693e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.9054560959339142, + "num_tokens": 257555322.0, + "step": 970 + }, + { + "epoch": 0.09700784254957791, + "grad_norm": 0.7004940509796143, + "learning_rate": 9.883306570901237e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.905738115310669, + "num_tokens": 257817176.0, + "step": 971 + }, + { + "epoch": 0.09710774763974224, + "grad_norm": 0.6567486524581909, + "learning_rate": 9.882958819346807e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.9044722020626068, + "num_tokens": 258082896.0, + "step": 972 + }, + { + "epoch": 0.09720765272990659, + "grad_norm": 0.5802069902420044, + "learning_rate": 9.882610556541812e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.9055845439434052, + "num_tokens": 258338814.0, + "step": 973 + }, + { + "epoch": 0.09730755782007093, + "grad_norm": 0.7584674954414368, + "learning_rate": 9.882261782522715e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9040151238441467, + "num_tokens": 258606958.0, + "step": 974 + }, + { + "epoch": 0.09740746291023528, + "grad_norm": 0.6707557439804077, + "learning_rate": 9.881912497326034e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9051512181758881, + "num_tokens": 258864903.0, + "step": 975 + }, + { + "epoch": 0.09750736800039962, + "grad_norm": 0.6557103395462036, + "learning_rate": 9.88156270098834e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.9040635526180267, + "num_tokens": 259122777.0, + "step": 976 + }, + { + "epoch": 0.09760727309056397, + "grad_norm": 0.648277223110199, + "learning_rate": 9.881212393546253e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.9034152626991272, + "num_tokens": 259398683.0, + "step": 977 + }, + { + "epoch": 0.09770717818072831, + "grad_norm": 0.5597332119941711, + "learning_rate": 9.880861575036455e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.9027010500431061, + "num_tokens": 259655064.0, + "step": 978 + }, + { + "epoch": 0.09780708327089266, + "grad_norm": 0.5890397429466248, + "learning_rate": 9.880510245495675e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.9013778567314148, + "num_tokens": 259920957.0, + "step": 979 + }, + { + "epoch": 0.09790698836105699, + "grad_norm": 0.8805169463157654, + "learning_rate": 9.880158404960698e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.9017166197299957, + "num_tokens": 260177847.0, + "step": 980 + }, + { + "epoch": 0.09800689345122134, + "grad_norm": 0.6646280884742737, + "learning_rate": 9.879806053468361e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.9033617079257965, + "num_tokens": 260445921.0, + "step": 981 + }, + { + "epoch": 0.09810679854138568, + "grad_norm": 0.7725006937980652, + "learning_rate": 9.87945319105556e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.9055712521076202, + "num_tokens": 260709855.0, + "step": 982 + }, + { + "epoch": 0.09820670363155003, + "grad_norm": 0.6702960729598999, + "learning_rate": 9.879099817759232e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8986052572727203, + "num_tokens": 260978151.0, + "step": 983 + }, + { + "epoch": 0.09830660872171437, + "grad_norm": 0.7555749416351318, + "learning_rate": 9.878745933616383e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.9013402462005615, + "num_tokens": 261249956.0, + "step": 984 + }, + { + "epoch": 0.09840651381187872, + "grad_norm": 0.8709304332733154, + "learning_rate": 9.878391538664061e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.9014398753643036, + "num_tokens": 261513065.0, + "step": 985 + }, + { + "epoch": 0.09850641890204306, + "grad_norm": 0.7059509754180908, + "learning_rate": 9.878036632939374e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.9023667871952057, + "num_tokens": 261780782.0, + "step": 986 + }, + { + "epoch": 0.09860632399220741, + "grad_norm": 0.7430100440979004, + "learning_rate": 9.877681216479478e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9033133089542389, + "num_tokens": 262051016.0, + "step": 987 + }, + { + "epoch": 0.09870622908237174, + "grad_norm": 0.550703227519989, + "learning_rate": 9.877325289321587e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.9046134650707245, + "num_tokens": 262311867.0, + "step": 988 + }, + { + "epoch": 0.09880613417253609, + "grad_norm": 0.7419955730438232, + "learning_rate": 9.876968851502968e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9015516042709351, + "num_tokens": 262574606.0, + "step": 989 + }, + { + "epoch": 0.09890603926270043, + "grad_norm": 0.6539101004600525, + "learning_rate": 9.876611903060939e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9032827615737915, + "num_tokens": 262841908.0, + "step": 990 + }, + { + "epoch": 0.09900594435286478, + "grad_norm": 0.6836303472518921, + "learning_rate": 9.876254444032873e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.902997612953186, + "num_tokens": 263111861.0, + "step": 991 + }, + { + "epoch": 0.09910584944302912, + "grad_norm": 0.8701339960098267, + "learning_rate": 9.875896474456197e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.9036066234111786, + "num_tokens": 263379614.0, + "step": 992 + }, + { + "epoch": 0.09920575453319347, + "grad_norm": 0.5941449403762817, + "learning_rate": 9.875537994368389e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.9028864204883575, + "num_tokens": 263657031.0, + "step": 993 + }, + { + "epoch": 0.09930565962335781, + "grad_norm": 0.5824651718139648, + "learning_rate": 9.875179003806985e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9048649966716766, + "num_tokens": 263922241.0, + "step": 994 + }, + { + "epoch": 0.09940556471352216, + "grad_norm": 0.7443870306015015, + "learning_rate": 9.87481950280957e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.9040978252887726, + "num_tokens": 264185662.0, + "step": 995 + }, + { + "epoch": 0.09950546980368649, + "grad_norm": 0.7453452944755554, + "learning_rate": 9.874459491413784e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9031435549259186, + "num_tokens": 264445774.0, + "step": 996 + }, + { + "epoch": 0.09960537489385084, + "grad_norm": 1.18523371219635, + "learning_rate": 9.874098969657321e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8985654413700104, + "num_tokens": 264707597.0, + "step": 997 + }, + { + "epoch": 0.09970527998401518, + "grad_norm": 0.8436636328697205, + "learning_rate": 9.873737937577928e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.9031170308589935, + "num_tokens": 264974496.0, + "step": 998 + }, + { + "epoch": 0.09980518507417953, + "grad_norm": 0.8511901497840881, + "learning_rate": 9.873376395213405e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.9015651047229767, + "num_tokens": 265229942.0, + "step": 999 + }, + { + "epoch": 0.09990509016434387, + "grad_norm": 0.8095589280128479, + "learning_rate": 9.873014342601605e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.9035263061523438, + "num_tokens": 265503595.0, + "step": 1000 + }, + { + "epoch": 0.10000499525450822, + "grad_norm": 0.5831907391548157, + "learning_rate": 9.872651779780438e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.9024500846862793, + "num_tokens": 265765780.0, + "step": 1001 + }, + { + "epoch": 0.10010490034467256, + "grad_norm": 0.7451722025871277, + "learning_rate": 9.872288706787862e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.9026191234588623, + "num_tokens": 266025032.0, + "step": 1002 + }, + { + "epoch": 0.10020480543483691, + "grad_norm": 0.8122203946113586, + "learning_rate": 9.871925123661892e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.9060046672821045, + "num_tokens": 266287663.0, + "step": 1003 + }, + { + "epoch": 0.10030471052500124, + "grad_norm": 0.6724209785461426, + "learning_rate": 9.871561030440594e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.9000222980976105, + "num_tokens": 266550319.0, + "step": 1004 + }, + { + "epoch": 0.10040461561516559, + "grad_norm": 0.6614410281181335, + "learning_rate": 9.871196427162094e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.9016672372817993, + "num_tokens": 266815253.0, + "step": 1005 + }, + { + "epoch": 0.10050452070532993, + "grad_norm": 1.593506932258606, + "learning_rate": 9.87083131386456e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.9023639261722565, + "num_tokens": 267080187.0, + "step": 1006 + }, + { + "epoch": 0.10060442579549428, + "grad_norm": 0.6601667404174805, + "learning_rate": 9.870465690586223e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.9015019536018372, + "num_tokens": 267335466.0, + "step": 1007 + }, + { + "epoch": 0.10070433088565862, + "grad_norm": 0.7330176830291748, + "learning_rate": 9.870099557365367e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.9027435481548309, + "num_tokens": 267605593.0, + "step": 1008 + }, + { + "epoch": 0.10080423597582297, + "grad_norm": 0.62809157371521, + "learning_rate": 9.86973291424032e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.9007785320281982, + "num_tokens": 267873547.0, + "step": 1009 + }, + { + "epoch": 0.10090414106598732, + "grad_norm": 0.9075589179992676, + "learning_rate": 9.869365761249474e-06, + "loss": 0.512, + "mean_token_accuracy": 0.9011622071266174, + "num_tokens": 268139253.0, + "step": 1010 + }, + { + "epoch": 0.10100404615615166, + "grad_norm": 0.7175406217575073, + "learning_rate": 9.868998098431269e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.9056000113487244, + "num_tokens": 268395548.0, + "step": 1011 + }, + { + "epoch": 0.101103951246316, + "grad_norm": 0.9046884775161743, + "learning_rate": 9.8686299258242e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.9058517217636108, + "num_tokens": 268657716.0, + "step": 1012 + }, + { + "epoch": 0.10120385633648034, + "grad_norm": 1.0580805540084839, + "learning_rate": 9.868261243466815e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.9001063406467438, + "num_tokens": 268923372.0, + "step": 1013 + }, + { + "epoch": 0.10130376142664468, + "grad_norm": 0.6929600834846497, + "learning_rate": 9.867892051397714e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.9031453132629395, + "num_tokens": 269185361.0, + "step": 1014 + }, + { + "epoch": 0.10140366651680903, + "grad_norm": 1.6979970932006836, + "learning_rate": 9.867522349655555e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9037013947963715, + "num_tokens": 269455194.0, + "step": 1015 + }, + { + "epoch": 0.10150357160697338, + "grad_norm": 0.9894689917564392, + "learning_rate": 9.867152138279043e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.902478963136673, + "num_tokens": 269724944.0, + "step": 1016 + }, + { + "epoch": 0.10160347669713772, + "grad_norm": 0.7427690625190735, + "learning_rate": 9.866781417306943e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.9030346572399139, + "num_tokens": 269986206.0, + "step": 1017 + }, + { + "epoch": 0.10170338178730207, + "grad_norm": 0.7057008147239685, + "learning_rate": 9.866410186778066e-06, + "loss": 0.505, + "mean_token_accuracy": 0.9067362844944, + "num_tokens": 270263333.0, + "step": 1018 + }, + { + "epoch": 0.10180328687746641, + "grad_norm": 0.8070526719093323, + "learning_rate": 9.866038446731282e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.9037552177906036, + "num_tokens": 270523585.0, + "step": 1019 + }, + { + "epoch": 0.10190319196763076, + "grad_norm": 0.6917213201522827, + "learning_rate": 9.865666197205514e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.9021291136741638, + "num_tokens": 270790581.0, + "step": 1020 + }, + { + "epoch": 0.10200309705779509, + "grad_norm": 0.6528738737106323, + "learning_rate": 9.865293438239734e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9038994908332825, + "num_tokens": 271054322.0, + "step": 1021 + }, + { + "epoch": 0.10210300214795943, + "grad_norm": 0.6496425867080688, + "learning_rate": 9.864920169872972e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.9005432724952698, + "num_tokens": 271320838.0, + "step": 1022 + }, + { + "epoch": 0.10220290723812378, + "grad_norm": 0.8383355140686035, + "learning_rate": 9.864546392144309e-06, + "loss": 0.507, + "mean_token_accuracy": 0.9018782377243042, + "num_tokens": 271576958.0, + "step": 1023 + }, + { + "epoch": 0.10230281232828813, + "grad_norm": 0.8517588376998901, + "learning_rate": 9.86417210509288e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.9032650589942932, + "num_tokens": 271837932.0, + "step": 1024 + }, + { + "epoch": 0.10240271741845247, + "grad_norm": 0.6688122153282166, + "learning_rate": 9.863797308757872e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9022855460643768, + "num_tokens": 272100505.0, + "step": 1025 + }, + { + "epoch": 0.10250262250861682, + "grad_norm": 0.965001106262207, + "learning_rate": 9.863422003178528e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.9030641317367554, + "num_tokens": 272370844.0, + "step": 1026 + }, + { + "epoch": 0.10260252759878116, + "grad_norm": 0.7896378636360168, + "learning_rate": 9.863046188394145e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.9045498967170715, + "num_tokens": 272634663.0, + "step": 1027 + }, + { + "epoch": 0.10270243268894551, + "grad_norm": 0.6328387260437012, + "learning_rate": 9.862669864444068e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.9026427268981934, + "num_tokens": 272900831.0, + "step": 1028 + }, + { + "epoch": 0.10280233777910984, + "grad_norm": 0.6208814978599548, + "learning_rate": 9.862293031367698e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.9049585461616516, + "num_tokens": 273174079.0, + "step": 1029 + }, + { + "epoch": 0.10290224286927419, + "grad_norm": 1.0312687158584595, + "learning_rate": 9.86191568920449e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.8989483714103699, + "num_tokens": 273436149.0, + "step": 1030 + }, + { + "epoch": 0.10300214795943853, + "grad_norm": 0.8986730575561523, + "learning_rate": 9.861537837993957e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.902367502450943, + "num_tokens": 273709739.0, + "step": 1031 + }, + { + "epoch": 0.10310205304960288, + "grad_norm": 1.0141167640686035, + "learning_rate": 9.861159477775653e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.9052289724349976, + "num_tokens": 273973496.0, + "step": 1032 + }, + { + "epoch": 0.10320195813976722, + "grad_norm": 0.7790776491165161, + "learning_rate": 9.860780608589197e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.9065178036689758, + "num_tokens": 274243389.0, + "step": 1033 + }, + { + "epoch": 0.10330186322993157, + "grad_norm": 0.5929874777793884, + "learning_rate": 9.860401230474257e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.908386617898941, + "num_tokens": 274509792.0, + "step": 1034 + }, + { + "epoch": 0.10340176832009591, + "grad_norm": 1.0437512397766113, + "learning_rate": 9.860021343470554e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.9037416875362396, + "num_tokens": 274782208.0, + "step": 1035 + }, + { + "epoch": 0.10350167341026026, + "grad_norm": 1.171203851699829, + "learning_rate": 9.859640947617861e-06, + "loss": 0.512, + "mean_token_accuracy": 0.9031408727169037, + "num_tokens": 275047695.0, + "step": 1036 + }, + { + "epoch": 0.10360157850042459, + "grad_norm": 0.9081317782402039, + "learning_rate": 9.859260042956008e-06, + "loss": 0.513, + "mean_token_accuracy": 0.9025295376777649, + "num_tokens": 275310645.0, + "step": 1037 + }, + { + "epoch": 0.10370148359058894, + "grad_norm": 0.6192437410354614, + "learning_rate": 9.858878629524876e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.9025195240974426, + "num_tokens": 275588794.0, + "step": 1038 + }, + { + "epoch": 0.10380138868075328, + "grad_norm": 1.0601544380187988, + "learning_rate": 9.858496707364395e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.9026031196117401, + "num_tokens": 275857772.0, + "step": 1039 + }, + { + "epoch": 0.10390129377091763, + "grad_norm": 0.6399379372596741, + "learning_rate": 9.858114276514557e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.9040835499763489, + "num_tokens": 276115131.0, + "step": 1040 + }, + { + "epoch": 0.10400119886108197, + "grad_norm": 0.7723459005355835, + "learning_rate": 9.857731337015403e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.9018638730049133, + "num_tokens": 276377561.0, + "step": 1041 + }, + { + "epoch": 0.10410110395124632, + "grad_norm": 0.7138360142707825, + "learning_rate": 9.857347888907025e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.9026082754135132, + "num_tokens": 276641596.0, + "step": 1042 + }, + { + "epoch": 0.10420100904141066, + "grad_norm": 0.7006938457489014, + "learning_rate": 9.85696393222957e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.9018177390098572, + "num_tokens": 276887045.0, + "step": 1043 + }, + { + "epoch": 0.10430091413157501, + "grad_norm": 0.6651766896247864, + "learning_rate": 9.856579467023243e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.9011768698692322, + "num_tokens": 277155998.0, + "step": 1044 + }, + { + "epoch": 0.10440081922173934, + "grad_norm": 0.7423259019851685, + "learning_rate": 9.856194493328293e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.9034288227558136, + "num_tokens": 277430131.0, + "step": 1045 + }, + { + "epoch": 0.10450072431190369, + "grad_norm": 1.0806713104248047, + "learning_rate": 9.855809011185029e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.9008001089096069, + "num_tokens": 277690074.0, + "step": 1046 + }, + { + "epoch": 0.10460062940206803, + "grad_norm": 0.7041473984718323, + "learning_rate": 9.855423020633812e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.9004122912883759, + "num_tokens": 277951418.0, + "step": 1047 + }, + { + "epoch": 0.10470053449223238, + "grad_norm": 0.6171576380729675, + "learning_rate": 9.855036521715055e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.9042083024978638, + "num_tokens": 278222923.0, + "step": 1048 + }, + { + "epoch": 0.10480043958239672, + "grad_norm": 0.6791746020317078, + "learning_rate": 9.854649514469224e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.9054279923439026, + "num_tokens": 278499219.0, + "step": 1049 + }, + { + "epoch": 0.10490034467256107, + "grad_norm": 0.8476731181144714, + "learning_rate": 9.85426199893684e-06, + "loss": 0.511, + "mean_token_accuracy": 0.9046165943145752, + "num_tokens": 278765677.0, + "step": 1050 + }, + { + "epoch": 0.10500024976272541, + "grad_norm": 0.7902211546897888, + "learning_rate": 9.853873975158476e-06, + "loss": 0.506, + "mean_token_accuracy": 0.9039729535579681, + "num_tokens": 279029186.0, + "step": 1051 + }, + { + "epoch": 0.10510015485288976, + "grad_norm": 1.0358272790908813, + "learning_rate": 9.85348544317476e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.9025667011737823, + "num_tokens": 279287352.0, + "step": 1052 + }, + { + "epoch": 0.10520005994305409, + "grad_norm": 1.1408708095550537, + "learning_rate": 9.853096403026367e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.9056856632232666, + "num_tokens": 279553044.0, + "step": 1053 + }, + { + "epoch": 0.10529996503321844, + "grad_norm": 0.7597228288650513, + "learning_rate": 9.852706854754037e-06, + "loss": 0.509, + "mean_token_accuracy": 0.9034147560596466, + "num_tokens": 279811089.0, + "step": 1054 + }, + { + "epoch": 0.10539987012338278, + "grad_norm": 0.5902972221374512, + "learning_rate": 9.85231679839855e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.9051390886306763, + "num_tokens": 280078196.0, + "step": 1055 + }, + { + "epoch": 0.10549977521354713, + "grad_norm": 0.8786248564720154, + "learning_rate": 9.851926234000747e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.9050115346908569, + "num_tokens": 280346702.0, + "step": 1056 + }, + { + "epoch": 0.10559968030371147, + "grad_norm": 0.8268047571182251, + "learning_rate": 9.851535161601521e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.9028478860855103, + "num_tokens": 280601304.0, + "step": 1057 + }, + { + "epoch": 0.10569958539387582, + "grad_norm": 1.0481903553009033, + "learning_rate": 9.851143581241817e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.9002265334129333, + "num_tokens": 280872143.0, + "step": 1058 + }, + { + "epoch": 0.10579949048404017, + "grad_norm": 0.8656567931175232, + "learning_rate": 9.850751492962636e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.9011329114437103, + "num_tokens": 281136602.0, + "step": 1059 + }, + { + "epoch": 0.10589939557420451, + "grad_norm": 0.8184807300567627, + "learning_rate": 9.850358896805028e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.9053269326686859, + "num_tokens": 281406805.0, + "step": 1060 + }, + { + "epoch": 0.10599930066436886, + "grad_norm": 0.7005673050880432, + "learning_rate": 9.849965792810099e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.9002671539783478, + "num_tokens": 281673899.0, + "step": 1061 + }, + { + "epoch": 0.10609920575453319, + "grad_norm": 1.8369040489196777, + "learning_rate": 9.849572181019008e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.9044378399848938, + "num_tokens": 281942497.0, + "step": 1062 + }, + { + "epoch": 0.10619911084469753, + "grad_norm": 0.7511613368988037, + "learning_rate": 9.849178061472962e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.902895450592041, + "num_tokens": 282205599.0, + "step": 1063 + }, + { + "epoch": 0.10629901593486188, + "grad_norm": 0.804568886756897, + "learning_rate": 9.848783434213232e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.905504584312439, + "num_tokens": 282475417.0, + "step": 1064 + }, + { + "epoch": 0.10639892102502622, + "grad_norm": 0.5301011800765991, + "learning_rate": 9.848388299281132e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9055768847465515, + "num_tokens": 282742869.0, + "step": 1065 + }, + { + "epoch": 0.10649882611519057, + "grad_norm": 1.133992075920105, + "learning_rate": 9.847992656718035e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.9024232923984528, + "num_tokens": 283002736.0, + "step": 1066 + }, + { + "epoch": 0.10659873120535492, + "grad_norm": 0.7398302555084229, + "learning_rate": 9.847596506565365e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.9014241993427277, + "num_tokens": 283272532.0, + "step": 1067 + }, + { + "epoch": 0.10669863629551926, + "grad_norm": 4.1364970207214355, + "learning_rate": 9.847199848864597e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8999373018741608, + "num_tokens": 283536470.0, + "step": 1068 + }, + { + "epoch": 0.1067985413856836, + "grad_norm": 1.0223712921142578, + "learning_rate": 9.846802683657264e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9045616388320923, + "num_tokens": 283799435.0, + "step": 1069 + }, + { + "epoch": 0.10689844647584794, + "grad_norm": 0.8161273002624512, + "learning_rate": 9.846405010984948e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.9038158357143402, + "num_tokens": 284067980.0, + "step": 1070 + }, + { + "epoch": 0.10699835156601228, + "grad_norm": 0.8806303143501282, + "learning_rate": 9.846006830889285e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8994988203048706, + "num_tokens": 284330632.0, + "step": 1071 + }, + { + "epoch": 0.10709825665617663, + "grad_norm": 0.7069767117500305, + "learning_rate": 9.84560814341197e-06, + "loss": 0.507, + "mean_token_accuracy": 0.9026136100292206, + "num_tokens": 284590905.0, + "step": 1072 + }, + { + "epoch": 0.10719816174634098, + "grad_norm": 0.685492217540741, + "learning_rate": 9.845208948594739e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.9011662006378174, + "num_tokens": 284858129.0, + "step": 1073 + }, + { + "epoch": 0.10729806683650532, + "grad_norm": 0.8589521050453186, + "learning_rate": 9.844809246479392e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.9020446240901947, + "num_tokens": 285134262.0, + "step": 1074 + }, + { + "epoch": 0.10739797192666967, + "grad_norm": 0.8110690712928772, + "learning_rate": 9.844409037107778e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.9029203057289124, + "num_tokens": 285400017.0, + "step": 1075 + }, + { + "epoch": 0.10749787701683401, + "grad_norm": 0.7773404717445374, + "learning_rate": 9.844008320521798e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.9034918248653412, + "num_tokens": 285662705.0, + "step": 1076 + }, + { + "epoch": 0.10759778210699836, + "grad_norm": 1.2352386713027954, + "learning_rate": 9.843607096763408e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.9035278856754303, + "num_tokens": 285925809.0, + "step": 1077 + }, + { + "epoch": 0.10769768719716269, + "grad_norm": 0.7862106561660767, + "learning_rate": 9.843205365874619e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.9017794728279114, + "num_tokens": 286186717.0, + "step": 1078 + }, + { + "epoch": 0.10779759228732703, + "grad_norm": 0.820527195930481, + "learning_rate": 9.842803127897489e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.9029852151870728, + "num_tokens": 286442098.0, + "step": 1079 + }, + { + "epoch": 0.10789749737749138, + "grad_norm": 1.1837090253829956, + "learning_rate": 9.842400382874133e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9041242897510529, + "num_tokens": 286708035.0, + "step": 1080 + }, + { + "epoch": 0.10799740246765573, + "grad_norm": 1.2356716394424438, + "learning_rate": 9.84199713084672e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.9015583395957947, + "num_tokens": 286968035.0, + "step": 1081 + }, + { + "epoch": 0.10809730755782007, + "grad_norm": 0.8705529570579529, + "learning_rate": 9.841593371857472e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.9038357436656952, + "num_tokens": 287229619.0, + "step": 1082 + }, + { + "epoch": 0.10819721264798442, + "grad_norm": 0.8870646953582764, + "learning_rate": 9.841189105948661e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.9009186029434204, + "num_tokens": 287488026.0, + "step": 1083 + }, + { + "epoch": 0.10829711773814876, + "grad_norm": 1.662595272064209, + "learning_rate": 9.840784333162614e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.9046537578105927, + "num_tokens": 287737149.0, + "step": 1084 + }, + { + "epoch": 0.10839702282831311, + "grad_norm": 0.9252234101295471, + "learning_rate": 9.840379053541714e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.9029515087604523, + "num_tokens": 288002001.0, + "step": 1085 + }, + { + "epoch": 0.10849692791847744, + "grad_norm": 1.2596855163574219, + "learning_rate": 9.83997326712839e-06, + "loss": 0.507, + "mean_token_accuracy": 0.903984934091568, + "num_tokens": 288275784.0, + "step": 1086 + }, + { + "epoch": 0.10859683300864179, + "grad_norm": 0.7903404831886292, + "learning_rate": 9.83956697396513e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.9029177725315094, + "num_tokens": 288539258.0, + "step": 1087 + }, + { + "epoch": 0.10869673809880613, + "grad_norm": 0.7111457586288452, + "learning_rate": 9.839160174094476e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.903429388999939, + "num_tokens": 288804705.0, + "step": 1088 + }, + { + "epoch": 0.10879664318897048, + "grad_norm": 1.0210380554199219, + "learning_rate": 9.838752867559015e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.9037662744522095, + "num_tokens": 289077522.0, + "step": 1089 + }, + { + "epoch": 0.10889654827913482, + "grad_norm": 0.787175714969635, + "learning_rate": 9.838345054401398e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.9054860472679138, + "num_tokens": 289334040.0, + "step": 1090 + }, + { + "epoch": 0.10899645336929917, + "grad_norm": 0.8441920280456543, + "learning_rate": 9.837936734664318e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.9034286439418793, + "num_tokens": 289596923.0, + "step": 1091 + }, + { + "epoch": 0.10909635845946351, + "grad_norm": 0.7978174090385437, + "learning_rate": 9.83752790839053e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.9046914279460907, + "num_tokens": 289862799.0, + "step": 1092 + }, + { + "epoch": 0.10919626354962786, + "grad_norm": 0.7361218929290771, + "learning_rate": 9.837118575622839e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.9039119184017181, + "num_tokens": 290125191.0, + "step": 1093 + }, + { + "epoch": 0.10929616863979219, + "grad_norm": 1.0152934789657593, + "learning_rate": 9.836708736404099e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.9057305455207825, + "num_tokens": 290393836.0, + "step": 1094 + }, + { + "epoch": 0.10939607372995654, + "grad_norm": 0.9627482891082764, + "learning_rate": 9.836298390777226e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.9043967127799988, + "num_tokens": 290660486.0, + "step": 1095 + }, + { + "epoch": 0.10949597882012088, + "grad_norm": 0.939544677734375, + "learning_rate": 9.835887538785179e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.9036622941493988, + "num_tokens": 290922442.0, + "step": 1096 + }, + { + "epoch": 0.10959588391028523, + "grad_norm": 0.9401910305023193, + "learning_rate": 9.835476180470975e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.9022451043128967, + "num_tokens": 291194118.0, + "step": 1097 + }, + { + "epoch": 0.10969578900044957, + "grad_norm": 0.9415150284767151, + "learning_rate": 9.835064315877685e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.8990285396575928, + "num_tokens": 291448376.0, + "step": 1098 + }, + { + "epoch": 0.10979569409061392, + "grad_norm": 0.8487478494644165, + "learning_rate": 9.83465194504843e-06, + "loss": 0.505, + "mean_token_accuracy": 0.9051848351955414, + "num_tokens": 291713916.0, + "step": 1099 + }, + { + "epoch": 0.10989559918077826, + "grad_norm": 2.0404891967773438, + "learning_rate": 9.834239068026388e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.9070018231868744, + "num_tokens": 291970805.0, + "step": 1100 + }, + { + "epoch": 0.10999550427094261, + "grad_norm": 0.6519649624824524, + "learning_rate": 9.833825684854787e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.9052155911922455, + "num_tokens": 292238152.0, + "step": 1101 + }, + { + "epoch": 0.11009540936110696, + "grad_norm": 0.8890913724899292, + "learning_rate": 9.833411795576908e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.9047675132751465, + "num_tokens": 292497832.0, + "step": 1102 + }, + { + "epoch": 0.11019531445127129, + "grad_norm": 0.8213118314743042, + "learning_rate": 9.832997400236085e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.9041893482208252, + "num_tokens": 292754145.0, + "step": 1103 + }, + { + "epoch": 0.11029521954143563, + "grad_norm": 0.8537233471870422, + "learning_rate": 9.832582498875706e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.9000687301158905, + "num_tokens": 293015437.0, + "step": 1104 + }, + { + "epoch": 0.11039512463159998, + "grad_norm": 0.7606354355812073, + "learning_rate": 9.832167091539215e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.9051087498664856, + "num_tokens": 293286103.0, + "step": 1105 + }, + { + "epoch": 0.11049502972176432, + "grad_norm": 1.35732102394104, + "learning_rate": 9.831751178270099e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9015337824821472, + "num_tokens": 293549463.0, + "step": 1106 + }, + { + "epoch": 0.11059493481192867, + "grad_norm": 1.0138810873031616, + "learning_rate": 9.83133475911191e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.903365820646286, + "num_tokens": 293824116.0, + "step": 1107 + }, + { + "epoch": 0.11069483990209301, + "grad_norm": 0.7499532103538513, + "learning_rate": 9.830917834108245e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.9031332731246948, + "num_tokens": 294100969.0, + "step": 1108 + }, + { + "epoch": 0.11079474499225736, + "grad_norm": 1.4454493522644043, + "learning_rate": 9.830500403302756e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.9008395075798035, + "num_tokens": 294366921.0, + "step": 1109 + }, + { + "epoch": 0.1108946500824217, + "grad_norm": 1.648669719696045, + "learning_rate": 9.830082466739149e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.9036948978900909, + "num_tokens": 294629417.0, + "step": 1110 + }, + { + "epoch": 0.11099455517258604, + "grad_norm": 0.7849748134613037, + "learning_rate": 9.829664024461183e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.9042954444885254, + "num_tokens": 294898792.0, + "step": 1111 + }, + { + "epoch": 0.11109446026275038, + "grad_norm": 0.9547297954559326, + "learning_rate": 9.82924507651267e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.9037017226219177, + "num_tokens": 295161467.0, + "step": 1112 + }, + { + "epoch": 0.11119436535291473, + "grad_norm": 2.148648977279663, + "learning_rate": 9.828825622937474e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.9046049118041992, + "num_tokens": 295429147.0, + "step": 1113 + }, + { + "epoch": 0.11129427044307907, + "grad_norm": 0.7304654717445374, + "learning_rate": 9.82840566377951e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.9001829922199249, + "num_tokens": 295700080.0, + "step": 1114 + }, + { + "epoch": 0.11139417553324342, + "grad_norm": 0.8470402359962463, + "learning_rate": 9.82798519908275e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.9015022814273834, + "num_tokens": 295967492.0, + "step": 1115 + }, + { + "epoch": 0.11149408062340777, + "grad_norm": 0.7481537461280823, + "learning_rate": 9.827564228891218e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9024154543876648, + "num_tokens": 296242833.0, + "step": 1116 + }, + { + "epoch": 0.11159398571357211, + "grad_norm": 0.9703457951545715, + "learning_rate": 9.827142753248986e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.9044367671012878, + "num_tokens": 296514815.0, + "step": 1117 + }, + { + "epoch": 0.11169389080373646, + "grad_norm": 0.9620572328567505, + "learning_rate": 9.826720772200187e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.9033811092376709, + "num_tokens": 296779379.0, + "step": 1118 + }, + { + "epoch": 0.11179379589390079, + "grad_norm": 1.4369707107543945, + "learning_rate": 9.826298285789002e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9042473137378693, + "num_tokens": 297049514.0, + "step": 1119 + }, + { + "epoch": 0.11189370098406513, + "grad_norm": 1.0852437019348145, + "learning_rate": 9.825875294059663e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9039795398712158, + "num_tokens": 297303788.0, + "step": 1120 + }, + { + "epoch": 0.11199360607422948, + "grad_norm": 0.7881616353988647, + "learning_rate": 9.825451797056462e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.9027439951896667, + "num_tokens": 297578452.0, + "step": 1121 + }, + { + "epoch": 0.11209351116439382, + "grad_norm": 1.1730620861053467, + "learning_rate": 9.825027794823738e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.901564747095108, + "num_tokens": 297842319.0, + "step": 1122 + }, + { + "epoch": 0.11219341625455817, + "grad_norm": 0.9823193550109863, + "learning_rate": 9.824603287405881e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.9044339954853058, + "num_tokens": 298118599.0, + "step": 1123 + }, + { + "epoch": 0.11229332134472252, + "grad_norm": 0.9240882992744446, + "learning_rate": 9.824178274847343e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.9036643803119659, + "num_tokens": 298375980.0, + "step": 1124 + }, + { + "epoch": 0.11239322643488686, + "grad_norm": 0.9668327569961548, + "learning_rate": 9.823752757192619e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.9034723937511444, + "num_tokens": 298640095.0, + "step": 1125 + }, + { + "epoch": 0.11249313152505121, + "grad_norm": 0.6387314796447754, + "learning_rate": 9.823326734486262e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.9031436145305634, + "num_tokens": 298906701.0, + "step": 1126 + }, + { + "epoch": 0.11259303661521554, + "grad_norm": 0.8632336258888245, + "learning_rate": 9.822900206772879e-06, + "loss": 0.501, + "mean_token_accuracy": 0.9051732122898102, + "num_tokens": 299171082.0, + "step": 1127 + }, + { + "epoch": 0.11269294170537988, + "grad_norm": 0.9192607402801514, + "learning_rate": 9.822473174097125e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9036321341991425, + "num_tokens": 299434493.0, + "step": 1128 + }, + { + "epoch": 0.11279284679554423, + "grad_norm": 1.1003910303115845, + "learning_rate": 9.822045636503713e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.9011651277542114, + "num_tokens": 299699074.0, + "step": 1129 + }, + { + "epoch": 0.11289275188570858, + "grad_norm": 1.9106860160827637, + "learning_rate": 9.821617594037405e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.9008009433746338, + "num_tokens": 299964231.0, + "step": 1130 + }, + { + "epoch": 0.11299265697587292, + "grad_norm": 0.877983570098877, + "learning_rate": 9.821189046743019e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.901772528886795, + "num_tokens": 300218091.0, + "step": 1131 + }, + { + "epoch": 0.11309256206603727, + "grad_norm": 0.7895647287368774, + "learning_rate": 9.820759994665422e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.9046706259250641, + "num_tokens": 300483269.0, + "step": 1132 + }, + { + "epoch": 0.11319246715620161, + "grad_norm": 1.453848958015442, + "learning_rate": 9.820330437849538e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9068599343299866, + "num_tokens": 300749899.0, + "step": 1133 + }, + { + "epoch": 0.11329237224636596, + "grad_norm": 0.8723675608634949, + "learning_rate": 9.819900376340342e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9033614993095398, + "num_tokens": 301020275.0, + "step": 1134 + }, + { + "epoch": 0.11339227733653029, + "grad_norm": 2.0957000255584717, + "learning_rate": 9.819469810182862e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.9017063975334167, + "num_tokens": 301274308.0, + "step": 1135 + }, + { + "epoch": 0.11349218242669463, + "grad_norm": 0.8790689706802368, + "learning_rate": 9.819038739422178e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.902511864900589, + "num_tokens": 301540467.0, + "step": 1136 + }, + { + "epoch": 0.11359208751685898, + "grad_norm": 2.321272373199463, + "learning_rate": 9.818607164103425e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.9043411016464233, + "num_tokens": 301810257.0, + "step": 1137 + }, + { + "epoch": 0.11369199260702333, + "grad_norm": 1.2416681051254272, + "learning_rate": 9.818175084271786e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.9010232985019684, + "num_tokens": 302081659.0, + "step": 1138 + }, + { + "epoch": 0.11379189769718767, + "grad_norm": 1.2494441270828247, + "learning_rate": 9.817742499972502e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.89911288022995, + "num_tokens": 302345790.0, + "step": 1139 + }, + { + "epoch": 0.11389180278735202, + "grad_norm": 1.0588008165359497, + "learning_rate": 9.817309411250867e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.9044268429279327, + "num_tokens": 302609782.0, + "step": 1140 + }, + { + "epoch": 0.11399170787751636, + "grad_norm": 4.230846405029297, + "learning_rate": 9.816875818152225e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9057741463184357, + "num_tokens": 302872555.0, + "step": 1141 + }, + { + "epoch": 0.11409161296768071, + "grad_norm": 1.2560300827026367, + "learning_rate": 9.81644172072197e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9041599035263062, + "num_tokens": 303135845.0, + "step": 1142 + }, + { + "epoch": 0.11419151805784505, + "grad_norm": 1.6641335487365723, + "learning_rate": 9.816007119005557e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.9014337360858917, + "num_tokens": 303413165.0, + "step": 1143 + }, + { + "epoch": 0.11429142314800939, + "grad_norm": 1.6801905632019043, + "learning_rate": 9.815572013048486e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.901190996170044, + "num_tokens": 303672710.0, + "step": 1144 + }, + { + "epoch": 0.11439132823817373, + "grad_norm": 1.014325737953186, + "learning_rate": 9.815136402896316e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.901051938533783, + "num_tokens": 303936039.0, + "step": 1145 + }, + { + "epoch": 0.11449123332833808, + "grad_norm": 1.1326024532318115, + "learning_rate": 9.814700288594655e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.9043678939342499, + "num_tokens": 304191705.0, + "step": 1146 + }, + { + "epoch": 0.11459113841850242, + "grad_norm": 1.1575804948806763, + "learning_rate": 9.814263670189162e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9062510430812836, + "num_tokens": 304452929.0, + "step": 1147 + }, + { + "epoch": 0.11469104350866677, + "grad_norm": 0.9323338270187378, + "learning_rate": 9.813826547725553e-06, + "loss": 0.508, + "mean_token_accuracy": 0.9017843008041382, + "num_tokens": 304713341.0, + "step": 1148 + }, + { + "epoch": 0.11479094859883111, + "grad_norm": 1.4236687421798706, + "learning_rate": 9.813388921249595e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9033456742763519, + "num_tokens": 304976870.0, + "step": 1149 + }, + { + "epoch": 0.11489085368899546, + "grad_norm": 0.8875294327735901, + "learning_rate": 9.81295079080711e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.90523561835289, + "num_tokens": 305237446.0, + "step": 1150 + }, + { + "epoch": 0.1149907587791598, + "grad_norm": 0.7485526204109192, + "learning_rate": 9.812512156443967e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9040428102016449, + "num_tokens": 305503439.0, + "step": 1151 + }, + { + "epoch": 0.11509066386932414, + "grad_norm": 0.738543689250946, + "learning_rate": 9.812073018206094e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.9050008654594421, + "num_tokens": 305779805.0, + "step": 1152 + }, + { + "epoch": 0.11519056895948848, + "grad_norm": 0.850217878818512, + "learning_rate": 9.81163337613947e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.902924120426178, + "num_tokens": 306053065.0, + "step": 1153 + }, + { + "epoch": 0.11529047404965283, + "grad_norm": 0.7323104739189148, + "learning_rate": 9.811193230290124e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.904002845287323, + "num_tokens": 306322326.0, + "step": 1154 + }, + { + "epoch": 0.11539037913981717, + "grad_norm": 0.9691538214683533, + "learning_rate": 9.81075258070414e-06, + "loss": 0.505, + "mean_token_accuracy": 0.9041692614555359, + "num_tokens": 306591720.0, + "step": 1155 + }, + { + "epoch": 0.11549028422998152, + "grad_norm": 0.8689768314361572, + "learning_rate": 9.810311427427653e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9032854735851288, + "num_tokens": 306857758.0, + "step": 1156 + }, + { + "epoch": 0.11559018932014586, + "grad_norm": 0.8438069820404053, + "learning_rate": 9.809869770506855e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.9050447940826416, + "num_tokens": 307121950.0, + "step": 1157 + }, + { + "epoch": 0.11569009441031021, + "grad_norm": 1.3357731103897095, + "learning_rate": 9.809427609987987e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.9052115082740784, + "num_tokens": 307390004.0, + "step": 1158 + }, + { + "epoch": 0.11578999950047456, + "grad_norm": 0.899009644985199, + "learning_rate": 9.808984945917344e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.902001678943634, + "num_tokens": 307659959.0, + "step": 1159 + }, + { + "epoch": 0.11588990459063889, + "grad_norm": 0.6774052381515503, + "learning_rate": 9.808541778341272e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.9030162692070007, + "num_tokens": 307926566.0, + "step": 1160 + }, + { + "epoch": 0.11598980968080323, + "grad_norm": 0.8723652362823486, + "learning_rate": 9.808098107306172e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.9013499915599823, + "num_tokens": 308188406.0, + "step": 1161 + }, + { + "epoch": 0.11608971477096758, + "grad_norm": 0.8729996085166931, + "learning_rate": 9.807653932858497e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.9058549106121063, + "num_tokens": 308459742.0, + "step": 1162 + }, + { + "epoch": 0.11618961986113192, + "grad_norm": 0.8650737404823303, + "learning_rate": 9.807209255044752e-06, + "loss": 0.506, + "mean_token_accuracy": 0.905139684677124, + "num_tokens": 308717622.0, + "step": 1163 + }, + { + "epoch": 0.11628952495129627, + "grad_norm": 0.8554286956787109, + "learning_rate": 9.806764073911496e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8989475667476654, + "num_tokens": 308986733.0, + "step": 1164 + }, + { + "epoch": 0.11638943004146061, + "grad_norm": 0.6889458298683167, + "learning_rate": 9.806318389505338e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.9052130281925201, + "num_tokens": 309257057.0, + "step": 1165 + }, + { + "epoch": 0.11648933513162496, + "grad_norm": 0.943598210811615, + "learning_rate": 9.805872201872943e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.9032472968101501, + "num_tokens": 309519173.0, + "step": 1166 + }, + { + "epoch": 0.1165892402217893, + "grad_norm": 1.3201313018798828, + "learning_rate": 9.805425511061028e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.9008933305740356, + "num_tokens": 309789912.0, + "step": 1167 + }, + { + "epoch": 0.11668914531195364, + "grad_norm": 0.8943611979484558, + "learning_rate": 9.804978317116362e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.9030976891517639, + "num_tokens": 310063301.0, + "step": 1168 + }, + { + "epoch": 0.11678905040211798, + "grad_norm": 0.8986904621124268, + "learning_rate": 9.804530620085764e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.903691291809082, + "num_tokens": 310326638.0, + "step": 1169 + }, + { + "epoch": 0.11688895549228233, + "grad_norm": 0.9264902472496033, + "learning_rate": 9.80408242001611e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8999199271202087, + "num_tokens": 310574296.0, + "step": 1170 + }, + { + "epoch": 0.11698886058244667, + "grad_norm": 0.7557982802391052, + "learning_rate": 9.803633716954329e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.9043073654174805, + "num_tokens": 310846052.0, + "step": 1171 + }, + { + "epoch": 0.11708876567261102, + "grad_norm": 0.7137724757194519, + "learning_rate": 9.803184510947397e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.9019661545753479, + "num_tokens": 311116895.0, + "step": 1172 + }, + { + "epoch": 0.11718867076277537, + "grad_norm": 0.8428925275802612, + "learning_rate": 9.802734802042347e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.9025680422782898, + "num_tokens": 311392894.0, + "step": 1173 + }, + { + "epoch": 0.11728857585293971, + "grad_norm": 0.9974237084388733, + "learning_rate": 9.802284590286267e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.9011355936527252, + "num_tokens": 311654413.0, + "step": 1174 + }, + { + "epoch": 0.11738848094310406, + "grad_norm": 0.9738135933876038, + "learning_rate": 9.80183387572629e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.9058691561222076, + "num_tokens": 311932911.0, + "step": 1175 + }, + { + "epoch": 0.11748838603326839, + "grad_norm": 0.7912975549697876, + "learning_rate": 9.801382658409611e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.9022245407104492, + "num_tokens": 312190192.0, + "step": 1176 + }, + { + "epoch": 0.11758829112343273, + "grad_norm": 0.7819976806640625, + "learning_rate": 9.80093093838347e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.9034339487552643, + "num_tokens": 312442067.0, + "step": 1177 + }, + { + "epoch": 0.11768819621359708, + "grad_norm": 0.834560751914978, + "learning_rate": 9.800478715695165e-06, + "loss": 0.508, + "mean_token_accuracy": 0.9028506278991699, + "num_tokens": 312705537.0, + "step": 1178 + }, + { + "epoch": 0.11778810130376142, + "grad_norm": 0.7640529870986938, + "learning_rate": 9.80002599039204e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.9002387523651123, + "num_tokens": 312976399.0, + "step": 1179 + }, + { + "epoch": 0.11788800639392577, + "grad_norm": 1.0481462478637695, + "learning_rate": 9.799572762521499e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8998821377754211, + "num_tokens": 313240702.0, + "step": 1180 + }, + { + "epoch": 0.11798791148409012, + "grad_norm": 0.6875625252723694, + "learning_rate": 9.799119032130995e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9031319320201874, + "num_tokens": 313499428.0, + "step": 1181 + }, + { + "epoch": 0.11808781657425446, + "grad_norm": 0.9059860110282898, + "learning_rate": 9.798664799268032e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.9049837589263916, + "num_tokens": 313769991.0, + "step": 1182 + }, + { + "epoch": 0.11818772166441881, + "grad_norm": 0.6636269092559814, + "learning_rate": 9.798210063980172e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.9015688002109528, + "num_tokens": 314027831.0, + "step": 1183 + }, + { + "epoch": 0.11828762675458315, + "grad_norm": 0.869792640209198, + "learning_rate": 9.797754826315025e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.9028908312320709, + "num_tokens": 314296128.0, + "step": 1184 + }, + { + "epoch": 0.11838753184474748, + "grad_norm": 0.5546156764030457, + "learning_rate": 9.797299086320253e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.9054187834262848, + "num_tokens": 314568877.0, + "step": 1185 + }, + { + "epoch": 0.11848743693491183, + "grad_norm": 0.8014997839927673, + "learning_rate": 9.796842844043574e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.901533305644989, + "num_tokens": 314837254.0, + "step": 1186 + }, + { + "epoch": 0.11858734202507618, + "grad_norm": 0.5778117775917053, + "learning_rate": 9.796386099532756e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.9044305682182312, + "num_tokens": 315110561.0, + "step": 1187 + }, + { + "epoch": 0.11868724711524052, + "grad_norm": 0.7514216303825378, + "learning_rate": 9.795928852835621e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.9027313590049744, + "num_tokens": 315384077.0, + "step": 1188 + }, + { + "epoch": 0.11878715220540487, + "grad_norm": 0.7949862480163574, + "learning_rate": 9.795471104000046e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.9021328389644623, + "num_tokens": 315648263.0, + "step": 1189 + }, + { + "epoch": 0.11888705729556921, + "grad_norm": 0.7993762493133545, + "learning_rate": 9.795012853073954e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.9016287922859192, + "num_tokens": 315904619.0, + "step": 1190 + }, + { + "epoch": 0.11898696238573356, + "grad_norm": 0.7125086188316345, + "learning_rate": 9.794554100105325e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.9025373756885529, + "num_tokens": 316173188.0, + "step": 1191 + }, + { + "epoch": 0.1190868674758979, + "grad_norm": 0.7236100435256958, + "learning_rate": 9.794094845142192e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.903777003288269, + "num_tokens": 316444635.0, + "step": 1192 + }, + { + "epoch": 0.11918677256606224, + "grad_norm": 0.6730424165725708, + "learning_rate": 9.793635088232638e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.9055990874767303, + "num_tokens": 316700053.0, + "step": 1193 + }, + { + "epoch": 0.11928667765622658, + "grad_norm": 1.0836066007614136, + "learning_rate": 9.793174829424801e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.9033236503601074, + "num_tokens": 316967609.0, + "step": 1194 + }, + { + "epoch": 0.11938658274639093, + "grad_norm": 0.833530843257904, + "learning_rate": 9.792714068766872e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.9032167792320251, + "num_tokens": 317223175.0, + "step": 1195 + }, + { + "epoch": 0.11948648783655527, + "grad_norm": 0.6840106248855591, + "learning_rate": 9.79225280630709e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9028248190879822, + "num_tokens": 317498667.0, + "step": 1196 + }, + { + "epoch": 0.11958639292671962, + "grad_norm": 0.8250011205673218, + "learning_rate": 9.791791042093752e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9047706127166748, + "num_tokens": 317759251.0, + "step": 1197 + }, + { + "epoch": 0.11968629801688396, + "grad_norm": 0.6433112621307373, + "learning_rate": 9.791328776175204e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.9038620889186859, + "num_tokens": 318023178.0, + "step": 1198 + }, + { + "epoch": 0.11978620310704831, + "grad_norm": 0.9350913763046265, + "learning_rate": 9.790866008599846e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9022045135498047, + "num_tokens": 318282479.0, + "step": 1199 + }, + { + "epoch": 0.11988610819721265, + "grad_norm": 0.8715639114379883, + "learning_rate": 9.790402739416131e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.9008211493492126, + "num_tokens": 318544169.0, + "step": 1200 + }, + { + "epoch": 0.11998601328737699, + "grad_norm": 0.8963138461112976, + "learning_rate": 9.789938968672562e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.9019471108913422, + "num_tokens": 318810099.0, + "step": 1201 + }, + { + "epoch": 0.12008591837754133, + "grad_norm": 0.7168876528739929, + "learning_rate": 9.789474696417698e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.9021722674369812, + "num_tokens": 319076892.0, + "step": 1202 + }, + { + "epoch": 0.12018582346770568, + "grad_norm": 1.3628754615783691, + "learning_rate": 9.789009922700147e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.9070237278938293, + "num_tokens": 319347134.0, + "step": 1203 + }, + { + "epoch": 0.12028572855787002, + "grad_norm": 0.8670856356620789, + "learning_rate": 9.788544647568574e-06, + "loss": 0.51, + "mean_token_accuracy": 0.9005059897899628, + "num_tokens": 319606845.0, + "step": 1204 + }, + { + "epoch": 0.12038563364803437, + "grad_norm": 0.6639876961708069, + "learning_rate": 9.788078871071688e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.9014095067977905, + "num_tokens": 319877759.0, + "step": 1205 + }, + { + "epoch": 0.12048553873819871, + "grad_norm": 2.754384756088257, + "learning_rate": 9.787612593258265e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.9031222462654114, + "num_tokens": 320138302.0, + "step": 1206 + }, + { + "epoch": 0.12058544382836306, + "grad_norm": 0.6717814803123474, + "learning_rate": 9.787145814177118e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.9066732227802277, + "num_tokens": 320402206.0, + "step": 1207 + }, + { + "epoch": 0.1206853489185274, + "grad_norm": 0.6884675025939941, + "learning_rate": 9.78667853387712e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.9039801359176636, + "num_tokens": 320682426.0, + "step": 1208 + }, + { + "epoch": 0.12078525400869174, + "grad_norm": 0.6936149597167969, + "learning_rate": 9.786210752407199e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.904935747385025, + "num_tokens": 320943424.0, + "step": 1209 + }, + { + "epoch": 0.12088515909885608, + "grad_norm": 0.7565896511077881, + "learning_rate": 9.78574246981633e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.9023331999778748, + "num_tokens": 321209298.0, + "step": 1210 + }, + { + "epoch": 0.12098506418902043, + "grad_norm": 0.667426347732544, + "learning_rate": 9.785273686153542e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8999756872653961, + "num_tokens": 321481788.0, + "step": 1211 + }, + { + "epoch": 0.12108496927918477, + "grad_norm": 0.7218133807182312, + "learning_rate": 9.784804401467917e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.9026953279972076, + "num_tokens": 321755623.0, + "step": 1212 + }, + { + "epoch": 0.12118487436934912, + "grad_norm": 0.5718080401420593, + "learning_rate": 9.784334615808592e-06, + "loss": 0.501, + "mean_token_accuracy": 0.9043356776237488, + "num_tokens": 322014435.0, + "step": 1213 + }, + { + "epoch": 0.12128477945951346, + "grad_norm": 1.1748675107955933, + "learning_rate": 9.783864329224752e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.9033130705356598, + "num_tokens": 322294450.0, + "step": 1214 + }, + { + "epoch": 0.12138468454967781, + "grad_norm": 0.831176221370697, + "learning_rate": 9.783393541765639e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.904888778924942, + "num_tokens": 322560795.0, + "step": 1215 + }, + { + "epoch": 0.12148458963984216, + "grad_norm": 0.6166426539421082, + "learning_rate": 9.782922253480538e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.9043713212013245, + "num_tokens": 322823023.0, + "step": 1216 + }, + { + "epoch": 0.12158449473000649, + "grad_norm": 0.7370768189430237, + "learning_rate": 9.782450464418802e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.9040170311927795, + "num_tokens": 323093910.0, + "step": 1217 + }, + { + "epoch": 0.12168439982017083, + "grad_norm": 0.7206267714500427, + "learning_rate": 9.781978174629822e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.9039177298545837, + "num_tokens": 323354247.0, + "step": 1218 + }, + { + "epoch": 0.12178430491033518, + "grad_norm": 0.9409292340278625, + "learning_rate": 9.78150538416305e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.9036012589931488, + "num_tokens": 323611927.0, + "step": 1219 + }, + { + "epoch": 0.12188421000049952, + "grad_norm": 0.6964109539985657, + "learning_rate": 9.781032093067987e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9065404236316681, + "num_tokens": 323872225.0, + "step": 1220 + }, + { + "epoch": 0.12198411509066387, + "grad_norm": 0.8576578497886658, + "learning_rate": 9.780558301394187e-06, + "loss": 0.506, + "mean_token_accuracy": 0.9024285674095154, + "num_tokens": 324129560.0, + "step": 1221 + }, + { + "epoch": 0.12208402018082821, + "grad_norm": 0.8656182885169983, + "learning_rate": 9.780084009191255e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.9028759598731995, + "num_tokens": 324402327.0, + "step": 1222 + }, + { + "epoch": 0.12218392527099256, + "grad_norm": 0.822221577167511, + "learning_rate": 9.779609216508852e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.9037863314151764, + "num_tokens": 324663391.0, + "step": 1223 + }, + { + "epoch": 0.1222838303611569, + "grad_norm": 0.6473047137260437, + "learning_rate": 9.779133923396689e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.9045920968055725, + "num_tokens": 324922486.0, + "step": 1224 + }, + { + "epoch": 0.12238373545132125, + "grad_norm": 0.5470995306968689, + "learning_rate": 9.778658129904529e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.9011850357055664, + "num_tokens": 325185776.0, + "step": 1225 + }, + { + "epoch": 0.12248364054148558, + "grad_norm": 0.9262421131134033, + "learning_rate": 9.778181836082185e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.9005417227745056, + "num_tokens": 325452967.0, + "step": 1226 + }, + { + "epoch": 0.12258354563164993, + "grad_norm": 0.8606929183006287, + "learning_rate": 9.777705041979532e-06, + "loss": 0.51, + "mean_token_accuracy": 0.901580274105072, + "num_tokens": 325717321.0, + "step": 1227 + }, + { + "epoch": 0.12268345072181427, + "grad_norm": 1.1104562282562256, + "learning_rate": 9.777227747646488e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.9014719128608704, + "num_tokens": 325990090.0, + "step": 1228 + }, + { + "epoch": 0.12278335581197862, + "grad_norm": 0.8109843134880066, + "learning_rate": 9.776749953133022e-06, + "loss": 0.51, + "mean_token_accuracy": 0.9034325182437897, + "num_tokens": 326255489.0, + "step": 1229 + }, + { + "epoch": 0.12288326090214297, + "grad_norm": 0.7464098334312439, + "learning_rate": 9.776271658489165e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.9033547043800354, + "num_tokens": 326522351.0, + "step": 1230 + }, + { + "epoch": 0.12298316599230731, + "grad_norm": 0.6471456289291382, + "learning_rate": 9.775792863764992e-06, + "loss": 0.507, + "mean_token_accuracy": 0.9032576084136963, + "num_tokens": 326789586.0, + "step": 1231 + }, + { + "epoch": 0.12308307108247166, + "grad_norm": 0.6782404184341431, + "learning_rate": 9.775313569010635e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.9052684009075165, + "num_tokens": 327047586.0, + "step": 1232 + }, + { + "epoch": 0.123182976172636, + "grad_norm": 0.722642183303833, + "learning_rate": 9.774833774276278e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.9025213718414307, + "num_tokens": 327316862.0, + "step": 1233 + }, + { + "epoch": 0.12328288126280033, + "grad_norm": 0.5306056141853333, + "learning_rate": 9.774353479612151e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.9049557149410248, + "num_tokens": 327583515.0, + "step": 1234 + }, + { + "epoch": 0.12338278635296468, + "grad_norm": 0.8252025842666626, + "learning_rate": 9.773872685068543e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9047762155532837, + "num_tokens": 327852801.0, + "step": 1235 + }, + { + "epoch": 0.12348269144312903, + "grad_norm": 0.7489535808563232, + "learning_rate": 9.7733913906958e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9041016101837158, + "num_tokens": 328124233.0, + "step": 1236 + }, + { + "epoch": 0.12358259653329337, + "grad_norm": 0.659155547618866, + "learning_rate": 9.772909596544304e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.9049120545387268, + "num_tokens": 328381732.0, + "step": 1237 + }, + { + "epoch": 0.12368250162345772, + "grad_norm": 0.6204277276992798, + "learning_rate": 9.772427302664507e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.900322824716568, + "num_tokens": 328646759.0, + "step": 1238 + }, + { + "epoch": 0.12378240671362206, + "grad_norm": 1.0530595779418945, + "learning_rate": 9.7719445091069e-06, + "loss": 0.503, + "mean_token_accuracy": 0.9011508226394653, + "num_tokens": 328898996.0, + "step": 1239 + }, + { + "epoch": 0.12388231180378641, + "grad_norm": 0.9423425793647766, + "learning_rate": 9.771461215922037e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.9014022648334503, + "num_tokens": 329168566.0, + "step": 1240 + }, + { + "epoch": 0.12398221689395075, + "grad_norm": 0.7136279940605164, + "learning_rate": 9.770977423160517e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.9039493501186371, + "num_tokens": 329432821.0, + "step": 1241 + }, + { + "epoch": 0.12408212198411508, + "grad_norm": 0.6179080605506897, + "learning_rate": 9.770493130872992e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.9050923883914948, + "num_tokens": 329705509.0, + "step": 1242 + }, + { + "epoch": 0.12418202707427943, + "grad_norm": 0.6984447240829468, + "learning_rate": 9.77000833911017e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.9019152522087097, + "num_tokens": 329963050.0, + "step": 1243 + }, + { + "epoch": 0.12428193216444378, + "grad_norm": 0.5743746161460876, + "learning_rate": 9.76952304792281e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.9052267074584961, + "num_tokens": 330230076.0, + "step": 1244 + }, + { + "epoch": 0.12438183725460812, + "grad_norm": 0.5526230335235596, + "learning_rate": 9.76903725736172e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.9034181237220764, + "num_tokens": 330497930.0, + "step": 1245 + }, + { + "epoch": 0.12448174234477247, + "grad_norm": 0.7884277701377869, + "learning_rate": 9.768550967477763e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.9048866927623749, + "num_tokens": 330771000.0, + "step": 1246 + }, + { + "epoch": 0.12458164743493681, + "grad_norm": 0.6421458721160889, + "learning_rate": 9.768064178321857e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.9065020084381104, + "num_tokens": 331035534.0, + "step": 1247 + }, + { + "epoch": 0.12468155252510116, + "grad_norm": 0.8063413500785828, + "learning_rate": 9.767576889944965e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.9014884829521179, + "num_tokens": 331309227.0, + "step": 1248 + }, + { + "epoch": 0.1247814576152655, + "grad_norm": 0.784085214138031, + "learning_rate": 9.767089102398111e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.9035598337650299, + "num_tokens": 331575378.0, + "step": 1249 + }, + { + "epoch": 0.12488136270542984, + "grad_norm": 0.6408994793891907, + "learning_rate": 9.766600815732363e-06, + "loss": 0.505, + "mean_token_accuracy": 0.9025774002075195, + "num_tokens": 331848627.0, + "step": 1250 + }, + { + "epoch": 0.12498126779559418, + "grad_norm": 0.7327790856361389, + "learning_rate": 9.766112029998847e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9039163887500763, + "num_tokens": 332119871.0, + "step": 1251 + }, + { + "epoch": 0.12508117288575854, + "grad_norm": 0.592624843120575, + "learning_rate": 9.765622745248739e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.9030689299106598, + "num_tokens": 332390238.0, + "step": 1252 + }, + { + "epoch": 0.12518107797592287, + "grad_norm": 0.7173775434494019, + "learning_rate": 9.765132961533269e-06, + "loss": 0.515, + "mean_token_accuracy": 0.9031840264797211, + "num_tokens": 332650733.0, + "step": 1253 + }, + { + "epoch": 0.1252809830660872, + "grad_norm": 0.6694276332855225, + "learning_rate": 9.764642678903714e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.9012039601802826, + "num_tokens": 332915142.0, + "step": 1254 + }, + { + "epoch": 0.12538088815625156, + "grad_norm": 0.6219826340675354, + "learning_rate": 9.76415189741141e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.900557667016983, + "num_tokens": 333181131.0, + "step": 1255 + }, + { + "epoch": 0.1254807932464159, + "grad_norm": 0.8491196632385254, + "learning_rate": 9.763660617107744e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.9023039937019348, + "num_tokens": 333446177.0, + "step": 1256 + }, + { + "epoch": 0.12558069833658025, + "grad_norm": 0.5764880776405334, + "learning_rate": 9.76316883804415e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.9037206768989563, + "num_tokens": 333709930.0, + "step": 1257 + }, + { + "epoch": 0.12568060342674459, + "grad_norm": 1.5406200885772705, + "learning_rate": 9.762676560272118e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.9015002548694611, + "num_tokens": 333981147.0, + "step": 1258 + }, + { + "epoch": 0.12578050851690895, + "grad_norm": 0.6132174134254456, + "learning_rate": 9.762183783843191e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8993602395057678, + "num_tokens": 334250890.0, + "step": 1259 + }, + { + "epoch": 0.12588041360707328, + "grad_norm": 0.543785810470581, + "learning_rate": 9.761690508808966e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.9022267162799835, + "num_tokens": 334507916.0, + "step": 1260 + }, + { + "epoch": 0.12598031869723764, + "grad_norm": 0.7801241874694824, + "learning_rate": 9.761196735221083e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.9014892876148224, + "num_tokens": 334769981.0, + "step": 1261 + }, + { + "epoch": 0.12608022378740197, + "grad_norm": 0.599848210811615, + "learning_rate": 9.760702463131247e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.9058886766433716, + "num_tokens": 335035809.0, + "step": 1262 + }, + { + "epoch": 0.1261801288775663, + "grad_norm": 0.5172415375709534, + "learning_rate": 9.760207692591207e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.9032151401042938, + "num_tokens": 335295867.0, + "step": 1263 + }, + { + "epoch": 0.12628003396773066, + "grad_norm": 0.6296993494033813, + "learning_rate": 9.759712423652761e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.9053309559822083, + "num_tokens": 335563973.0, + "step": 1264 + }, + { + "epoch": 0.126379939057895, + "grad_norm": 0.948650598526001, + "learning_rate": 9.75921665636777e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.9044639468193054, + "num_tokens": 335830433.0, + "step": 1265 + }, + { + "epoch": 0.12647984414805935, + "grad_norm": 0.605172336101532, + "learning_rate": 9.758720390788139e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.9025920331478119, + "num_tokens": 336099413.0, + "step": 1266 + }, + { + "epoch": 0.12657974923822368, + "grad_norm": 0.689486563205719, + "learning_rate": 9.758223626965828e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.9007285535335541, + "num_tokens": 336363356.0, + "step": 1267 + }, + { + "epoch": 0.12667965432838804, + "grad_norm": 0.5758201479911804, + "learning_rate": 9.757726364952849e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.9044080376625061, + "num_tokens": 336635688.0, + "step": 1268 + }, + { + "epoch": 0.12677955941855237, + "grad_norm": 1.1214765310287476, + "learning_rate": 9.757228604801266e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.9047435820102692, + "num_tokens": 336897400.0, + "step": 1269 + }, + { + "epoch": 0.12687946450871673, + "grad_norm": 0.602318525314331, + "learning_rate": 9.756730346563193e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.906250536441803, + "num_tokens": 337168681.0, + "step": 1270 + }, + { + "epoch": 0.12697936959888106, + "grad_norm": 0.6802629828453064, + "learning_rate": 9.7562315902908e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.9019666612148285, + "num_tokens": 337426167.0, + "step": 1271 + }, + { + "epoch": 0.1270792746890454, + "grad_norm": 0.5597381591796875, + "learning_rate": 9.755732336036306e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.9026459753513336, + "num_tokens": 337697654.0, + "step": 1272 + }, + { + "epoch": 0.12717917977920976, + "grad_norm": 0.589751660823822, + "learning_rate": 9.755232583851986e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9032682180404663, + "num_tokens": 337958648.0, + "step": 1273 + }, + { + "epoch": 0.1272790848693741, + "grad_norm": 0.5642845034599304, + "learning_rate": 9.754732333790161e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9043671488761902, + "num_tokens": 338224505.0, + "step": 1274 + }, + { + "epoch": 0.12737898995953845, + "grad_norm": 0.5853906869888306, + "learning_rate": 9.754231585903208e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.901353508234024, + "num_tokens": 338494789.0, + "step": 1275 + }, + { + "epoch": 0.12747889504970278, + "grad_norm": 0.650915801525116, + "learning_rate": 9.75373034024356e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.9039398431777954, + "num_tokens": 338758166.0, + "step": 1276 + }, + { + "epoch": 0.12757880013986714, + "grad_norm": 0.5398294925689697, + "learning_rate": 9.753228596863694e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.9016003608703613, + "num_tokens": 339023525.0, + "step": 1277 + }, + { + "epoch": 0.12767870523003147, + "grad_norm": 0.5572508573532104, + "learning_rate": 9.752726355816144e-06, + "loss": 0.502, + "mean_token_accuracy": 0.9036333560943604, + "num_tokens": 339281528.0, + "step": 1278 + }, + { + "epoch": 0.1277786103201958, + "grad_norm": 0.8004305958747864, + "learning_rate": 9.752223617153495e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.9042928218841553, + "num_tokens": 339561607.0, + "step": 1279 + }, + { + "epoch": 0.12787851541036016, + "grad_norm": 0.5939939618110657, + "learning_rate": 9.751720380928384e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.9013244807720184, + "num_tokens": 339819261.0, + "step": 1280 + }, + { + "epoch": 0.1279784205005245, + "grad_norm": 0.6038373708724976, + "learning_rate": 9.751216647193502e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.9046531617641449, + "num_tokens": 340084910.0, + "step": 1281 + }, + { + "epoch": 0.12807832559068885, + "grad_norm": 0.6991779208183289, + "learning_rate": 9.750712416001588e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9049456119537354, + "num_tokens": 340347899.0, + "step": 1282 + }, + { + "epoch": 0.12817823068085318, + "grad_norm": 0.7745751142501831, + "learning_rate": 9.750207687405437e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.9045600891113281, + "num_tokens": 340621901.0, + "step": 1283 + }, + { + "epoch": 0.12827813577101754, + "grad_norm": 0.8019020557403564, + "learning_rate": 9.749702461457895e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9043082296848297, + "num_tokens": 340877202.0, + "step": 1284 + }, + { + "epoch": 0.12837804086118187, + "grad_norm": 0.630473792552948, + "learning_rate": 9.749196738211859e-06, + "loss": 0.508, + "mean_token_accuracy": 0.9014025330543518, + "num_tokens": 341138228.0, + "step": 1285 + }, + { + "epoch": 0.12847794595134623, + "grad_norm": 0.6187614798545837, + "learning_rate": 9.748690517720278e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.9043008983135223, + "num_tokens": 341399927.0, + "step": 1286 + }, + { + "epoch": 0.12857785104151057, + "grad_norm": 0.8891966342926025, + "learning_rate": 9.748183800036154e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.9051049053668976, + "num_tokens": 341666109.0, + "step": 1287 + }, + { + "epoch": 0.1286777561316749, + "grad_norm": 0.5843973159790039, + "learning_rate": 9.747676585212542e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.9021169245243073, + "num_tokens": 341925535.0, + "step": 1288 + }, + { + "epoch": 0.12877766122183926, + "grad_norm": 0.6416739225387573, + "learning_rate": 9.747168873302545e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.9046791791915894, + "num_tokens": 342186750.0, + "step": 1289 + }, + { + "epoch": 0.1288775663120036, + "grad_norm": 0.7877860069274902, + "learning_rate": 9.746660664359326e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9046924412250519, + "num_tokens": 342455878.0, + "step": 1290 + }, + { + "epoch": 0.12897747140216795, + "grad_norm": 0.6351723074913025, + "learning_rate": 9.74615195843609e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.9001562893390656, + "num_tokens": 342722878.0, + "step": 1291 + }, + { + "epoch": 0.12907737649233228, + "grad_norm": 0.7687746286392212, + "learning_rate": 9.745642755586102e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.9024607539176941, + "num_tokens": 342986320.0, + "step": 1292 + }, + { + "epoch": 0.12917728158249664, + "grad_norm": 0.8915553092956543, + "learning_rate": 9.745133055862676e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.9046798348426819, + "num_tokens": 343255104.0, + "step": 1293 + }, + { + "epoch": 0.12927718667266097, + "grad_norm": 0.5820026993751526, + "learning_rate": 9.744622859319175e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8995090425014496, + "num_tokens": 343530971.0, + "step": 1294 + }, + { + "epoch": 0.1293770917628253, + "grad_norm": 0.6585788726806641, + "learning_rate": 9.744112166009022e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.9048334360122681, + "num_tokens": 343796720.0, + "step": 1295 + }, + { + "epoch": 0.12947699685298966, + "grad_norm": 1.6437357664108276, + "learning_rate": 9.743600975985681e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.9044744670391083, + "num_tokens": 344053239.0, + "step": 1296 + }, + { + "epoch": 0.129576901943154, + "grad_norm": 0.6734384894371033, + "learning_rate": 9.74308928930268e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.9010572731494904, + "num_tokens": 344319575.0, + "step": 1297 + }, + { + "epoch": 0.12967680703331835, + "grad_norm": 0.5610374212265015, + "learning_rate": 9.74257710601359e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.9048634171485901, + "num_tokens": 344582316.0, + "step": 1298 + }, + { + "epoch": 0.12977671212348268, + "grad_norm": 0.8624292016029358, + "learning_rate": 9.742064426172035e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.903486579656601, + "num_tokens": 344855784.0, + "step": 1299 + }, + { + "epoch": 0.12987661721364704, + "grad_norm": 0.5695969462394714, + "learning_rate": 9.7415512498317e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.903805673122406, + "num_tokens": 345122879.0, + "step": 1300 + }, + { + "epoch": 0.12997652230381138, + "grad_norm": 0.865609347820282, + "learning_rate": 9.741037577046308e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.9039060175418854, + "num_tokens": 345386265.0, + "step": 1301 + }, + { + "epoch": 0.13007642739397574, + "grad_norm": 0.6323187947273254, + "learning_rate": 9.740523407869643e-06, + "loss": 0.512, + "mean_token_accuracy": 0.902576744556427, + "num_tokens": 345659926.0, + "step": 1302 + }, + { + "epoch": 0.13017633248414007, + "grad_norm": 0.5897668600082397, + "learning_rate": 9.740008742355542e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.9031775593757629, + "num_tokens": 345936221.0, + "step": 1303 + }, + { + "epoch": 0.1302762375743044, + "grad_norm": 0.7004216313362122, + "learning_rate": 9.739493580557888e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.9036730825901031, + "num_tokens": 346203660.0, + "step": 1304 + }, + { + "epoch": 0.13037614266446876, + "grad_norm": 0.543488085269928, + "learning_rate": 9.738977922530618e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.9018558859825134, + "num_tokens": 346464552.0, + "step": 1305 + }, + { + "epoch": 0.1304760477546331, + "grad_norm": 0.4982336163520813, + "learning_rate": 9.738461768327725e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.9019861221313477, + "num_tokens": 346725246.0, + "step": 1306 + }, + { + "epoch": 0.13057595284479745, + "grad_norm": 0.6364060044288635, + "learning_rate": 9.73794511800325e-06, + "loss": 0.5174, + "mean_token_accuracy": 0.902429610490799, + "num_tokens": 347003106.0, + "step": 1307 + }, + { + "epoch": 0.13067585793496178, + "grad_norm": 0.7224275469779968, + "learning_rate": 9.737427971611287e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9023118317127228, + "num_tokens": 347260494.0, + "step": 1308 + }, + { + "epoch": 0.13077576302512614, + "grad_norm": 0.6177303791046143, + "learning_rate": 9.73691032920598e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.9041770398616791, + "num_tokens": 347527455.0, + "step": 1309 + }, + { + "epoch": 0.13087566811529047, + "grad_norm": 0.8536177277565002, + "learning_rate": 9.736392190841526e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9056220650672913, + "num_tokens": 347796069.0, + "step": 1310 + }, + { + "epoch": 0.13097557320545483, + "grad_norm": 0.609987199306488, + "learning_rate": 9.735873556572177e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.9047615230083466, + "num_tokens": 348061091.0, + "step": 1311 + }, + { + "epoch": 0.13107547829561916, + "grad_norm": 0.6084150075912476, + "learning_rate": 9.735354426452235e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.9052123129367828, + "num_tokens": 348326511.0, + "step": 1312 + }, + { + "epoch": 0.1311753833857835, + "grad_norm": 0.5612214803695679, + "learning_rate": 9.734834800536053e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.903865784406662, + "num_tokens": 348592188.0, + "step": 1313 + }, + { + "epoch": 0.13127528847594785, + "grad_norm": 0.6108132004737854, + "learning_rate": 9.734314678878033e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9039753079414368, + "num_tokens": 348859396.0, + "step": 1314 + }, + { + "epoch": 0.1313751935661122, + "grad_norm": 0.6221902370452881, + "learning_rate": 9.733794061532636e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9040237963199615, + "num_tokens": 349121138.0, + "step": 1315 + }, + { + "epoch": 0.13147509865627655, + "grad_norm": 1.6233543157577515, + "learning_rate": 9.73327294855437e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.9029362499713898, + "num_tokens": 349390960.0, + "step": 1316 + }, + { + "epoch": 0.13157500374644088, + "grad_norm": 0.6172822713851929, + "learning_rate": 9.732751339997795e-06, + "loss": 0.509, + "mean_token_accuracy": 0.9022683501243591, + "num_tokens": 349659654.0, + "step": 1317 + }, + { + "epoch": 0.13167490883660524, + "grad_norm": 0.4812072515487671, + "learning_rate": 9.732229235917526e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.9047927856445312, + "num_tokens": 349920640.0, + "step": 1318 + }, + { + "epoch": 0.13177481392676957, + "grad_norm": 0.578676700592041, + "learning_rate": 9.731706636368228e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9041873514652252, + "num_tokens": 350191679.0, + "step": 1319 + }, + { + "epoch": 0.1318747190169339, + "grad_norm": 0.5800369381904602, + "learning_rate": 9.731183541404615e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.9034478664398193, + "num_tokens": 350448467.0, + "step": 1320 + }, + { + "epoch": 0.13197462410709826, + "grad_norm": 0.7272772192955017, + "learning_rate": 9.730659951081456e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.9020675122737885, + "num_tokens": 350715995.0, + "step": 1321 + }, + { + "epoch": 0.1320745291972626, + "grad_norm": 0.5252895951271057, + "learning_rate": 9.730135865453572e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.9059669077396393, + "num_tokens": 350987266.0, + "step": 1322 + }, + { + "epoch": 0.13217443428742695, + "grad_norm": 0.6603588461875916, + "learning_rate": 9.729611284575837e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.9014370739459991, + "num_tokens": 351248120.0, + "step": 1323 + }, + { + "epoch": 0.13227433937759128, + "grad_norm": 0.5815677046775818, + "learning_rate": 9.729086208503174e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.9049550890922546, + "num_tokens": 351512930.0, + "step": 1324 + }, + { + "epoch": 0.13237424446775564, + "grad_norm": 0.5762872695922852, + "learning_rate": 9.728560637290558e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.9038210809230804, + "num_tokens": 351774415.0, + "step": 1325 + }, + { + "epoch": 0.13247414955791997, + "grad_norm": 0.6364012360572815, + "learning_rate": 9.72803457099302e-06, + "loss": 0.515, + "mean_token_accuracy": 0.9009778201580048, + "num_tokens": 352041872.0, + "step": 1326 + }, + { + "epoch": 0.13257405464808433, + "grad_norm": 0.8604117035865784, + "learning_rate": 9.727508009665633e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.9000213742256165, + "num_tokens": 352308642.0, + "step": 1327 + }, + { + "epoch": 0.13267395973824866, + "grad_norm": 0.7244608998298645, + "learning_rate": 9.726980953363536e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.901829719543457, + "num_tokens": 352571017.0, + "step": 1328 + }, + { + "epoch": 0.132773864828413, + "grad_norm": 0.6511553525924683, + "learning_rate": 9.726453402141906e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.9036870300769806, + "num_tokens": 352827457.0, + "step": 1329 + }, + { + "epoch": 0.13287376991857736, + "grad_norm": 0.7041537761688232, + "learning_rate": 9.725925356055984e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.9022332727909088, + "num_tokens": 353083588.0, + "step": 1330 + }, + { + "epoch": 0.1329736750087417, + "grad_norm": 0.6721916794776917, + "learning_rate": 9.725396815161053e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.9048410058021545, + "num_tokens": 353345426.0, + "step": 1331 + }, + { + "epoch": 0.13307358009890605, + "grad_norm": 0.6861441731452942, + "learning_rate": 9.724867779512453e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.898344486951828, + "num_tokens": 353615228.0, + "step": 1332 + }, + { + "epoch": 0.13317348518907038, + "grad_norm": 0.8002551794052124, + "learning_rate": 9.724338249165575e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.9027277529239655, + "num_tokens": 353874539.0, + "step": 1333 + }, + { + "epoch": 0.13327339027923474, + "grad_norm": 0.6792767643928528, + "learning_rate": 9.723808224175859e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.9048967659473419, + "num_tokens": 354140572.0, + "step": 1334 + }, + { + "epoch": 0.13337329536939907, + "grad_norm": 0.5957971215248108, + "learning_rate": 9.723277704598803e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.9010719656944275, + "num_tokens": 354409014.0, + "step": 1335 + }, + { + "epoch": 0.1334732004595634, + "grad_norm": 0.6256676912307739, + "learning_rate": 9.722746690489949e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.9046356678009033, + "num_tokens": 354678119.0, + "step": 1336 + }, + { + "epoch": 0.13357310554972776, + "grad_norm": 0.5491651296615601, + "learning_rate": 9.722215181904897e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.9021613001823425, + "num_tokens": 354942863.0, + "step": 1337 + }, + { + "epoch": 0.1336730106398921, + "grad_norm": 0.6272311210632324, + "learning_rate": 9.721683178899297e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.9039164781570435, + "num_tokens": 355215319.0, + "step": 1338 + }, + { + "epoch": 0.13377291573005645, + "grad_norm": 0.6523739099502563, + "learning_rate": 9.721150681528848e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.9033850729465485, + "num_tokens": 355481396.0, + "step": 1339 + }, + { + "epoch": 0.13387282082022078, + "grad_norm": 0.6236670017242432, + "learning_rate": 9.720617689849304e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.9036861062049866, + "num_tokens": 355738543.0, + "step": 1340 + }, + { + "epoch": 0.13397272591038514, + "grad_norm": 0.634648859500885, + "learning_rate": 9.720084203916472e-06, + "loss": 0.512, + "mean_token_accuracy": 0.9039049744606018, + "num_tokens": 355994242.0, + "step": 1341 + }, + { + "epoch": 0.13407263100054947, + "grad_norm": 0.6905008554458618, + "learning_rate": 9.719550223786204e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.9035544097423553, + "num_tokens": 356274227.0, + "step": 1342 + }, + { + "epoch": 0.13417253609071383, + "grad_norm": 0.6661592721939087, + "learning_rate": 9.71901574951441e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.9042218029499054, + "num_tokens": 356546767.0, + "step": 1343 + }, + { + "epoch": 0.13427244118087817, + "grad_norm": 0.5702944397926331, + "learning_rate": 9.718480781157054e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.9041827917098999, + "num_tokens": 356812352.0, + "step": 1344 + }, + { + "epoch": 0.1343723462710425, + "grad_norm": 0.5702375173568726, + "learning_rate": 9.717945318770142e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.9053398072719574, + "num_tokens": 357079292.0, + "step": 1345 + }, + { + "epoch": 0.13447225136120686, + "grad_norm": 0.6236252188682556, + "learning_rate": 9.71740936240974e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.9027320444583893, + "num_tokens": 357342715.0, + "step": 1346 + }, + { + "epoch": 0.1345721564513712, + "grad_norm": 0.6221140027046204, + "learning_rate": 9.716872912131964e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.9029887318611145, + "num_tokens": 357606082.0, + "step": 1347 + }, + { + "epoch": 0.13467206154153555, + "grad_norm": 0.5848428606987, + "learning_rate": 9.716335967992979e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.9056502282619476, + "num_tokens": 357868188.0, + "step": 1348 + }, + { + "epoch": 0.13477196663169988, + "grad_norm": 0.6150679588317871, + "learning_rate": 9.715798530049006e-06, + "loss": 0.505, + "mean_token_accuracy": 0.9047699570655823, + "num_tokens": 358138066.0, + "step": 1349 + }, + { + "epoch": 0.13487187172186424, + "grad_norm": 0.6166964173316956, + "learning_rate": 9.71526059835631e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.900386780500412, + "num_tokens": 358395515.0, + "step": 1350 + }, + { + "epoch": 0.13497177681202857, + "grad_norm": 0.6883940100669861, + "learning_rate": 9.71472217297122e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.9035024046897888, + "num_tokens": 358659427.0, + "step": 1351 + }, + { + "epoch": 0.13507168190219293, + "grad_norm": 0.6140916347503662, + "learning_rate": 9.714183253950104e-06, + "loss": 0.503, + "mean_token_accuracy": 0.9046562016010284, + "num_tokens": 358930071.0, + "step": 1352 + }, + { + "epoch": 0.13517158699235726, + "grad_norm": 0.5806080102920532, + "learning_rate": 9.713643841349392e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.9019347429275513, + "num_tokens": 359202847.0, + "step": 1353 + }, + { + "epoch": 0.1352714920825216, + "grad_norm": 0.533565878868103, + "learning_rate": 9.713103935225559e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.90446537733078, + "num_tokens": 359472889.0, + "step": 1354 + }, + { + "epoch": 0.13537139717268595, + "grad_norm": 0.6901300549507141, + "learning_rate": 9.712563535635131e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9037901163101196, + "num_tokens": 359742477.0, + "step": 1355 + }, + { + "epoch": 0.13547130226285028, + "grad_norm": 0.5195861458778381, + "learning_rate": 9.712022642634691e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9061339199542999, + "num_tokens": 360007175.0, + "step": 1356 + }, + { + "epoch": 0.13557120735301464, + "grad_norm": 0.5528311729431152, + "learning_rate": 9.711481256280872e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.9051561951637268, + "num_tokens": 360264025.0, + "step": 1357 + }, + { + "epoch": 0.13567111244317898, + "grad_norm": 0.6268184185028076, + "learning_rate": 9.710939376630356e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.9064360857009888, + "num_tokens": 360537970.0, + "step": 1358 + }, + { + "epoch": 0.13577101753334334, + "grad_norm": 1.4820789098739624, + "learning_rate": 9.710397003739879e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8991971015930176, + "num_tokens": 360808217.0, + "step": 1359 + }, + { + "epoch": 0.13587092262350767, + "grad_norm": 0.6029367446899414, + "learning_rate": 9.709854137666228e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.9055202007293701, + "num_tokens": 361075015.0, + "step": 1360 + }, + { + "epoch": 0.135970827713672, + "grad_norm": 0.44909483194351196, + "learning_rate": 9.709310778466241e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9049392640590668, + "num_tokens": 361324157.0, + "step": 1361 + }, + { + "epoch": 0.13607073280383636, + "grad_norm": 0.5419716238975525, + "learning_rate": 9.708766926196809e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9053288698196411, + "num_tokens": 361589922.0, + "step": 1362 + }, + { + "epoch": 0.1361706378940007, + "grad_norm": 0.5701816082000732, + "learning_rate": 9.708222580914872e-06, + "loss": 0.501, + "mean_token_accuracy": 0.903176486492157, + "num_tokens": 361859166.0, + "step": 1363 + }, + { + "epoch": 0.13627054298416505, + "grad_norm": 0.5882217884063721, + "learning_rate": 9.707677742677427e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.9025399386882782, + "num_tokens": 362119580.0, + "step": 1364 + }, + { + "epoch": 0.13637044807432938, + "grad_norm": 0.6000105738639832, + "learning_rate": 9.707132411541516e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.9018959701061249, + "num_tokens": 362386483.0, + "step": 1365 + }, + { + "epoch": 0.13647035316449374, + "grad_norm": 0.6041340827941895, + "learning_rate": 9.706586587564236e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.9044257402420044, + "num_tokens": 362646227.0, + "step": 1366 + }, + { + "epoch": 0.13657025825465807, + "grad_norm": 0.6122722029685974, + "learning_rate": 9.706040270802736e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.9010279476642609, + "num_tokens": 362908789.0, + "step": 1367 + }, + { + "epoch": 0.13667016334482243, + "grad_norm": 0.759567379951477, + "learning_rate": 9.705493461314217e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.9049019813537598, + "num_tokens": 363169258.0, + "step": 1368 + }, + { + "epoch": 0.13677006843498676, + "grad_norm": 0.6032987833023071, + "learning_rate": 9.70494615915593e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.9074707627296448, + "num_tokens": 363436695.0, + "step": 1369 + }, + { + "epoch": 0.1368699735251511, + "grad_norm": 0.6452275514602661, + "learning_rate": 9.704398364385177e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.903675377368927, + "num_tokens": 363693011.0, + "step": 1370 + }, + { + "epoch": 0.13696987861531545, + "grad_norm": 0.6377112865447998, + "learning_rate": 9.703850077059314e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.9045532941818237, + "num_tokens": 363954150.0, + "step": 1371 + }, + { + "epoch": 0.1370697837054798, + "grad_norm": 0.7726524472236633, + "learning_rate": 9.703301297235745e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.9031749367713928, + "num_tokens": 364220206.0, + "step": 1372 + }, + { + "epoch": 0.13716968879564415, + "grad_norm": 0.5517889261245728, + "learning_rate": 9.702752024971929e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.9062909483909607, + "num_tokens": 364485250.0, + "step": 1373 + }, + { + "epoch": 0.13726959388580848, + "grad_norm": 0.5404722094535828, + "learning_rate": 9.702202260325377e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.9021565616130829, + "num_tokens": 364754483.0, + "step": 1374 + }, + { + "epoch": 0.13736949897597284, + "grad_norm": 0.631691038608551, + "learning_rate": 9.701652003353648e-06, + "loss": 0.507, + "mean_token_accuracy": 0.9023702144622803, + "num_tokens": 365020414.0, + "step": 1375 + }, + { + "epoch": 0.13746940406613717, + "grad_norm": 0.6598083972930908, + "learning_rate": 9.701101254114354e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.9052841365337372, + "num_tokens": 365284364.0, + "step": 1376 + }, + { + "epoch": 0.1375693091563015, + "grad_norm": 2.11628794670105, + "learning_rate": 9.70055001266516e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.9027170240879059, + "num_tokens": 365547828.0, + "step": 1377 + }, + { + "epoch": 0.13766921424646586, + "grad_norm": 0.5242030024528503, + "learning_rate": 9.699998279063783e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9037593007087708, + "num_tokens": 365819846.0, + "step": 1378 + }, + { + "epoch": 0.1377691193366302, + "grad_norm": 0.5215385556221008, + "learning_rate": 9.699446053367985e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.9015273451805115, + "num_tokens": 366083980.0, + "step": 1379 + }, + { + "epoch": 0.13786902442679455, + "grad_norm": 0.6304190754890442, + "learning_rate": 9.698893335635591e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.9055605530738831, + "num_tokens": 366346572.0, + "step": 1380 + }, + { + "epoch": 0.13796892951695888, + "grad_norm": 0.6059827208518982, + "learning_rate": 9.698340125924468e-06, + "loss": 0.506, + "mean_token_accuracy": 0.9041842818260193, + "num_tokens": 366609992.0, + "step": 1381 + }, + { + "epoch": 0.13806883460712324, + "grad_norm": 0.5843242406845093, + "learning_rate": 9.697786424292536e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.904990166425705, + "num_tokens": 366871481.0, + "step": 1382 + }, + { + "epoch": 0.13816873969728757, + "grad_norm": 0.5729286074638367, + "learning_rate": 9.69723223079777e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.9040187299251556, + "num_tokens": 367136083.0, + "step": 1383 + }, + { + "epoch": 0.13826864478745193, + "grad_norm": 0.513022243976593, + "learning_rate": 9.696677545498195e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.9032474160194397, + "num_tokens": 367400351.0, + "step": 1384 + }, + { + "epoch": 0.13836854987761626, + "grad_norm": 0.522972822189331, + "learning_rate": 9.696122368451887e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9064809679985046, + "num_tokens": 367660292.0, + "step": 1385 + }, + { + "epoch": 0.1384684549677806, + "grad_norm": 0.5110925436019897, + "learning_rate": 9.695566699716971e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.9014586806297302, + "num_tokens": 367929729.0, + "step": 1386 + }, + { + "epoch": 0.13856836005794496, + "grad_norm": 0.5369998216629028, + "learning_rate": 9.695010539351631e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9022297859191895, + "num_tokens": 368194212.0, + "step": 1387 + }, + { + "epoch": 0.1386682651481093, + "grad_norm": 0.5471382737159729, + "learning_rate": 9.694453887414093e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.9035443663597107, + "num_tokens": 368465829.0, + "step": 1388 + }, + { + "epoch": 0.13876817023827365, + "grad_norm": 0.7081511616706848, + "learning_rate": 9.69389674396264e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.9064981937408447, + "num_tokens": 368734977.0, + "step": 1389 + }, + { + "epoch": 0.13886807532843798, + "grad_norm": 0.5515230298042297, + "learning_rate": 9.693339109055608e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9052196741104126, + "num_tokens": 369002478.0, + "step": 1390 + }, + { + "epoch": 0.13896798041860234, + "grad_norm": 0.48621585965156555, + "learning_rate": 9.69278098275138e-06, + "loss": 0.502, + "mean_token_accuracy": 0.9051587283611298, + "num_tokens": 369270160.0, + "step": 1391 + }, + { + "epoch": 0.13906788550876667, + "grad_norm": 0.7078155875205994, + "learning_rate": 9.69222236510839e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.9033458232879639, + "num_tokens": 369533635.0, + "step": 1392 + }, + { + "epoch": 0.13916779059893103, + "grad_norm": 0.5800857543945312, + "learning_rate": 9.691663256185131e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.9036988019943237, + "num_tokens": 369792272.0, + "step": 1393 + }, + { + "epoch": 0.13926769568909536, + "grad_norm": 0.4874478578567505, + "learning_rate": 9.691103656040137e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.9043151140213013, + "num_tokens": 370048596.0, + "step": 1394 + }, + { + "epoch": 0.1393676007792597, + "grad_norm": 0.7144757509231567, + "learning_rate": 9.690543564732001e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.9030154049396515, + "num_tokens": 370317485.0, + "step": 1395 + }, + { + "epoch": 0.13946750586942405, + "grad_norm": 0.5705639123916626, + "learning_rate": 9.689982982319369e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.9014999568462372, + "num_tokens": 370591346.0, + "step": 1396 + }, + { + "epoch": 0.13956741095958838, + "grad_norm": 0.6177432537078857, + "learning_rate": 9.689421908860928e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.902140200138092, + "num_tokens": 370855332.0, + "step": 1397 + }, + { + "epoch": 0.13966731604975274, + "grad_norm": 0.7031309008598328, + "learning_rate": 9.688860344415425e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.9040344655513763, + "num_tokens": 371125418.0, + "step": 1398 + }, + { + "epoch": 0.13976722113991707, + "grad_norm": 0.5673732161521912, + "learning_rate": 9.688298289041658e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.9021673202514648, + "num_tokens": 371388167.0, + "step": 1399 + }, + { + "epoch": 0.13986712623008143, + "grad_norm": 0.5528824925422668, + "learning_rate": 9.687735742798475e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.903087705373764, + "num_tokens": 371650982.0, + "step": 1400 + }, + { + "epoch": 0.13996703132024577, + "grad_norm": 0.554568350315094, + "learning_rate": 9.687172705744773e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9052112996578217, + "num_tokens": 371920571.0, + "step": 1401 + }, + { + "epoch": 0.1400669364104101, + "grad_norm": 0.6864446401596069, + "learning_rate": 9.686609177939504e-06, + "loss": 0.512, + "mean_token_accuracy": 0.9008426070213318, + "num_tokens": 372185200.0, + "step": 1402 + }, + { + "epoch": 0.14016684150057446, + "grad_norm": 0.6021192073822021, + "learning_rate": 9.686045159441669e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.9012116193771362, + "num_tokens": 372451017.0, + "step": 1403 + }, + { + "epoch": 0.1402667465907388, + "grad_norm": 0.5193629860877991, + "learning_rate": 9.685480650310319e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.9043718576431274, + "num_tokens": 372708474.0, + "step": 1404 + }, + { + "epoch": 0.14036665168090315, + "grad_norm": 0.6288788914680481, + "learning_rate": 9.684915650604566e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.9034686088562012, + "num_tokens": 372978155.0, + "step": 1405 + }, + { + "epoch": 0.14046655677106748, + "grad_norm": 0.6182999610900879, + "learning_rate": 9.684350160383557e-06, + "loss": 0.51, + "mean_token_accuracy": 0.9025026559829712, + "num_tokens": 373249033.0, + "step": 1406 + }, + { + "epoch": 0.14056646186123184, + "grad_norm": 0.468044638633728, + "learning_rate": 9.683784179706507e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.9030006229877472, + "num_tokens": 373526046.0, + "step": 1407 + }, + { + "epoch": 0.14066636695139617, + "grad_norm": 0.5208225250244141, + "learning_rate": 9.68321770863267e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.9054751396179199, + "num_tokens": 373786098.0, + "step": 1408 + }, + { + "epoch": 0.14076627204156053, + "grad_norm": 0.7847944498062134, + "learning_rate": 9.682650747221357e-06, + "loss": 0.507, + "mean_token_accuracy": 0.906504213809967, + "num_tokens": 374055243.0, + "step": 1409 + }, + { + "epoch": 0.14086617713172486, + "grad_norm": 0.5092572569847107, + "learning_rate": 9.682083295531932e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9037464559078217, + "num_tokens": 374319981.0, + "step": 1410 + }, + { + "epoch": 0.1409660822218892, + "grad_norm": 0.6192145347595215, + "learning_rate": 9.681515353623806e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.9020903706550598, + "num_tokens": 374591468.0, + "step": 1411 + }, + { + "epoch": 0.14106598731205355, + "grad_norm": 0.8066653609275818, + "learning_rate": 9.68094692155644e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.9002091586589813, + "num_tokens": 374850462.0, + "step": 1412 + }, + { + "epoch": 0.14116589240221789, + "grad_norm": 0.56251460313797, + "learning_rate": 9.680377999389355e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9024154841899872, + "num_tokens": 375117723.0, + "step": 1413 + }, + { + "epoch": 0.14126579749238224, + "grad_norm": 0.6938750743865967, + "learning_rate": 9.679808587182113e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.904209554195404, + "num_tokens": 375385784.0, + "step": 1414 + }, + { + "epoch": 0.14136570258254658, + "grad_norm": 0.5217888355255127, + "learning_rate": 9.679238684994334e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.9058369696140289, + "num_tokens": 375659039.0, + "step": 1415 + }, + { + "epoch": 0.14146560767271094, + "grad_norm": 0.5741299986839294, + "learning_rate": 9.678668292885687e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.9046463668346405, + "num_tokens": 375923209.0, + "step": 1416 + }, + { + "epoch": 0.14156551276287527, + "grad_norm": 0.5930612087249756, + "learning_rate": 9.678097410915894e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.902845025062561, + "num_tokens": 376191361.0, + "step": 1417 + }, + { + "epoch": 0.1416654178530396, + "grad_norm": 0.5495707988739014, + "learning_rate": 9.677526039144724e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.9049372375011444, + "num_tokens": 376466407.0, + "step": 1418 + }, + { + "epoch": 0.14176532294320396, + "grad_norm": 0.6353015303611755, + "learning_rate": 9.676954177632006e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.9043039381504059, + "num_tokens": 376737427.0, + "step": 1419 + }, + { + "epoch": 0.1418652280333683, + "grad_norm": 0.5742506384849548, + "learning_rate": 9.676381826437606e-06, + "loss": 0.51, + "mean_token_accuracy": 0.902463436126709, + "num_tokens": 377002867.0, + "step": 1420 + }, + { + "epoch": 0.14196513312353265, + "grad_norm": 0.6437283754348755, + "learning_rate": 9.675808985621456e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.903777003288269, + "num_tokens": 377265376.0, + "step": 1421 + }, + { + "epoch": 0.14206503821369698, + "grad_norm": 0.4784611165523529, + "learning_rate": 9.675235655243532e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9052164256572723, + "num_tokens": 377536468.0, + "step": 1422 + }, + { + "epoch": 0.14216494330386134, + "grad_norm": 0.5258316993713379, + "learning_rate": 9.67466183536386e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.9067069292068481, + "num_tokens": 377800881.0, + "step": 1423 + }, + { + "epoch": 0.14226484839402567, + "grad_norm": 0.557099461555481, + "learning_rate": 9.67408752604252e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9048696458339691, + "num_tokens": 378071067.0, + "step": 1424 + }, + { + "epoch": 0.14236475348419003, + "grad_norm": 0.5744380354881287, + "learning_rate": 9.673512727339644e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9027285575866699, + "num_tokens": 378338069.0, + "step": 1425 + }, + { + "epoch": 0.14246465857435436, + "grad_norm": 0.6774719953536987, + "learning_rate": 9.672937439315415e-06, + "loss": 0.513, + "mean_token_accuracy": 0.9039033949375153, + "num_tokens": 378604917.0, + "step": 1426 + }, + { + "epoch": 0.1425645636645187, + "grad_norm": 0.5467531681060791, + "learning_rate": 9.672361662030063e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.9032441079616547, + "num_tokens": 378874321.0, + "step": 1427 + }, + { + "epoch": 0.14266446875468305, + "grad_norm": 0.6154409646987915, + "learning_rate": 9.671785395543876e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.9037917852401733, + "num_tokens": 379137661.0, + "step": 1428 + }, + { + "epoch": 0.1427643738448474, + "grad_norm": 0.5142212510108948, + "learning_rate": 9.671208639917186e-06, + "loss": 0.503, + "mean_token_accuracy": 0.9049134254455566, + "num_tokens": 379396245.0, + "step": 1429 + }, + { + "epoch": 0.14286427893501175, + "grad_norm": 0.6962456107139587, + "learning_rate": 9.670631395210384e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.905133068561554, + "num_tokens": 379648510.0, + "step": 1430 + }, + { + "epoch": 0.14296418402517608, + "grad_norm": 0.497173935174942, + "learning_rate": 9.670053661483904e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.9038079082965851, + "num_tokens": 379912109.0, + "step": 1431 + }, + { + "epoch": 0.14306408911534044, + "grad_norm": 0.5851778984069824, + "learning_rate": 9.669475438798238e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.9045672118663788, + "num_tokens": 380174049.0, + "step": 1432 + }, + { + "epoch": 0.14316399420550477, + "grad_norm": 0.5214958190917969, + "learning_rate": 9.668896727213925e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9067535698413849, + "num_tokens": 380443354.0, + "step": 1433 + }, + { + "epoch": 0.14326389929566913, + "grad_norm": 0.5601566433906555, + "learning_rate": 9.668317526791559e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.9010151922702789, + "num_tokens": 380714491.0, + "step": 1434 + }, + { + "epoch": 0.14336380438583346, + "grad_norm": 0.6211361885070801, + "learning_rate": 9.66773783759178e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9022555649280548, + "num_tokens": 380977594.0, + "step": 1435 + }, + { + "epoch": 0.1434637094759978, + "grad_norm": 0.5492438077926636, + "learning_rate": 9.667157659675284e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.9037165641784668, + "num_tokens": 381244789.0, + "step": 1436 + }, + { + "epoch": 0.14356361456616215, + "grad_norm": 0.6068366765975952, + "learning_rate": 9.666576993102814e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.9033069312572479, + "num_tokens": 381505123.0, + "step": 1437 + }, + { + "epoch": 0.14366351965632648, + "grad_norm": 0.7422967553138733, + "learning_rate": 9.665995837935168e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.9056528806686401, + "num_tokens": 381765312.0, + "step": 1438 + }, + { + "epoch": 0.14376342474649084, + "grad_norm": 0.5959188342094421, + "learning_rate": 9.665414194233194e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9028597772121429, + "num_tokens": 382034051.0, + "step": 1439 + }, + { + "epoch": 0.14386332983665517, + "grad_norm": 0.5737951397895813, + "learning_rate": 9.66483206205779e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.905031144618988, + "num_tokens": 382293542.0, + "step": 1440 + }, + { + "epoch": 0.14396323492681953, + "grad_norm": 7.863020420074463, + "learning_rate": 9.664249441469905e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.9038644134998322, + "num_tokens": 382552743.0, + "step": 1441 + }, + { + "epoch": 0.14406314001698386, + "grad_norm": 0.7155924439430237, + "learning_rate": 9.663666332530541e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.9049937725067139, + "num_tokens": 382816594.0, + "step": 1442 + }, + { + "epoch": 0.1441630451071482, + "grad_norm": 0.657209575176239, + "learning_rate": 9.66308273530075e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9039423167705536, + "num_tokens": 383089662.0, + "step": 1443 + }, + { + "epoch": 0.14426295019731256, + "grad_norm": 0.6919188499450684, + "learning_rate": 9.662498649841635e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.9034125506877899, + "num_tokens": 383358289.0, + "step": 1444 + }, + { + "epoch": 0.1443628552874769, + "grad_norm": 0.538453996181488, + "learning_rate": 9.661914076214349e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.9048163294792175, + "num_tokens": 383629132.0, + "step": 1445 + }, + { + "epoch": 0.14446276037764125, + "grad_norm": 0.5701302886009216, + "learning_rate": 9.6613290144801e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.9041389524936676, + "num_tokens": 383895548.0, + "step": 1446 + }, + { + "epoch": 0.14456266546780558, + "grad_norm": 0.529692530632019, + "learning_rate": 9.660743464700144e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9046887159347534, + "num_tokens": 384159275.0, + "step": 1447 + }, + { + "epoch": 0.14466257055796994, + "grad_norm": 0.6255033612251282, + "learning_rate": 9.660157426935785e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.9020108580589294, + "num_tokens": 384432885.0, + "step": 1448 + }, + { + "epoch": 0.14476247564813427, + "grad_norm": 0.5650694966316223, + "learning_rate": 9.659570901248388e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9034271240234375, + "num_tokens": 384700459.0, + "step": 1449 + }, + { + "epoch": 0.14486238073829863, + "grad_norm": 0.6440607309341431, + "learning_rate": 9.658983887699359e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.9028624594211578, + "num_tokens": 384954462.0, + "step": 1450 + }, + { + "epoch": 0.14496228582846296, + "grad_norm": 0.5365544557571411, + "learning_rate": 9.658396386350157e-06, + "loss": 0.508, + "mean_token_accuracy": 0.902338981628418, + "num_tokens": 385214274.0, + "step": 1451 + }, + { + "epoch": 0.1450621909186273, + "grad_norm": 0.6281372904777527, + "learning_rate": 9.657808397262297e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9068315029144287, + "num_tokens": 385473622.0, + "step": 1452 + }, + { + "epoch": 0.14516209600879165, + "grad_norm": 0.7755463123321533, + "learning_rate": 9.657219920497343e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9035310447216034, + "num_tokens": 385743225.0, + "step": 1453 + }, + { + "epoch": 0.14526200109895598, + "grad_norm": 0.7854356169700623, + "learning_rate": 9.656630956116905e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9029616713523865, + "num_tokens": 386012413.0, + "step": 1454 + }, + { + "epoch": 0.14536190618912034, + "grad_norm": 0.4932137131690979, + "learning_rate": 9.656041504182651e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.9032808542251587, + "num_tokens": 386266826.0, + "step": 1455 + }, + { + "epoch": 0.14546181127928468, + "grad_norm": 0.5647332072257996, + "learning_rate": 9.655451564756299e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9053909778594971, + "num_tokens": 386528878.0, + "step": 1456 + }, + { + "epoch": 0.14556171636944903, + "grad_norm": 0.5880170464515686, + "learning_rate": 9.654861137899613e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.9018560945987701, + "num_tokens": 386796163.0, + "step": 1457 + }, + { + "epoch": 0.14566162145961337, + "grad_norm": 0.5747405290603638, + "learning_rate": 9.654270223674411e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9036469459533691, + "num_tokens": 387064242.0, + "step": 1458 + }, + { + "epoch": 0.1457615265497777, + "grad_norm": 0.7772547602653503, + "learning_rate": 9.653678822142564e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.9037244617938995, + "num_tokens": 387330262.0, + "step": 1459 + }, + { + "epoch": 0.14586143163994206, + "grad_norm": 0.5586439371109009, + "learning_rate": 9.653086933365994e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.9048254787921906, + "num_tokens": 387596508.0, + "step": 1460 + }, + { + "epoch": 0.1459613367301064, + "grad_norm": 0.5059940814971924, + "learning_rate": 9.652494557406666e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9054936468601227, + "num_tokens": 387860056.0, + "step": 1461 + }, + { + "epoch": 0.14606124182027075, + "grad_norm": 0.49470290541648865, + "learning_rate": 9.651901694326611e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9041278660297394, + "num_tokens": 388118005.0, + "step": 1462 + }, + { + "epoch": 0.14616114691043508, + "grad_norm": 0.7476475834846497, + "learning_rate": 9.651308344187895e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.9004609882831573, + "num_tokens": 388378957.0, + "step": 1463 + }, + { + "epoch": 0.14626105200059944, + "grad_norm": 0.6196343302726746, + "learning_rate": 9.650714507052646e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.903377503156662, + "num_tokens": 388642674.0, + "step": 1464 + }, + { + "epoch": 0.14636095709076377, + "grad_norm": 0.6176442503929138, + "learning_rate": 9.650120182983038e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9036211669445038, + "num_tokens": 388908647.0, + "step": 1465 + }, + { + "epoch": 0.14646086218092813, + "grad_norm": 0.5320133566856384, + "learning_rate": 9.6495253720413e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.9035174548625946, + "num_tokens": 389179713.0, + "step": 1466 + }, + { + "epoch": 0.14656076727109246, + "grad_norm": 0.5404263138771057, + "learning_rate": 9.648930074289704e-06, + "loss": 0.509, + "mean_token_accuracy": 0.9051808714866638, + "num_tokens": 389447634.0, + "step": 1467 + }, + { + "epoch": 0.1466606723612568, + "grad_norm": 0.6720655560493469, + "learning_rate": 9.648334289790585e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9041742980480194, + "num_tokens": 389709864.0, + "step": 1468 + }, + { + "epoch": 0.14676057745142115, + "grad_norm": 0.6427189111709595, + "learning_rate": 9.647738018606315e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.9011449217796326, + "num_tokens": 389983385.0, + "step": 1469 + }, + { + "epoch": 0.14686048254158549, + "grad_norm": 0.5418072938919067, + "learning_rate": 9.64714126079933e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9027393758296967, + "num_tokens": 390254167.0, + "step": 1470 + }, + { + "epoch": 0.14696038763174984, + "grad_norm": 0.49934592843055725, + "learning_rate": 9.646544016432109e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.9032570719718933, + "num_tokens": 390516120.0, + "step": 1471 + }, + { + "epoch": 0.14706029272191418, + "grad_norm": 0.6593673825263977, + "learning_rate": 9.645946285567183e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.9035212099552155, + "num_tokens": 390780490.0, + "step": 1472 + }, + { + "epoch": 0.14716019781207854, + "grad_norm": 0.5229738354682922, + "learning_rate": 9.645348068267136e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.905113697052002, + "num_tokens": 391057785.0, + "step": 1473 + }, + { + "epoch": 0.14726010290224287, + "grad_norm": 0.9526671171188354, + "learning_rate": 9.644749364594604e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.9020603597164154, + "num_tokens": 391317208.0, + "step": 1474 + }, + { + "epoch": 0.1473600079924072, + "grad_norm": 0.5660941004753113, + "learning_rate": 9.644150174612267e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.9031522274017334, + "num_tokens": 391583130.0, + "step": 1475 + }, + { + "epoch": 0.14745991308257156, + "grad_norm": 1.4324390888214111, + "learning_rate": 9.643550498382865e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.9050728380680084, + "num_tokens": 391849010.0, + "step": 1476 + }, + { + "epoch": 0.1475598181727359, + "grad_norm": 0.5756385326385498, + "learning_rate": 9.642950335969183e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.9037899672985077, + "num_tokens": 392115918.0, + "step": 1477 + }, + { + "epoch": 0.14765972326290025, + "grad_norm": 2.701481580734253, + "learning_rate": 9.642349687434059e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.9054371416568756, + "num_tokens": 392390313.0, + "step": 1478 + }, + { + "epoch": 0.14775962835306458, + "grad_norm": 0.5249820351600647, + "learning_rate": 9.64174855284038e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.9043957591056824, + "num_tokens": 392656385.0, + "step": 1479 + }, + { + "epoch": 0.14785953344322894, + "grad_norm": 0.6177316904067993, + "learning_rate": 9.641146932251088e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9023524224758148, + "num_tokens": 392916116.0, + "step": 1480 + }, + { + "epoch": 0.14795943853339327, + "grad_norm": 0.5763983726501465, + "learning_rate": 9.640544825729173e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9019996225833893, + "num_tokens": 393186732.0, + "step": 1481 + }, + { + "epoch": 0.14805934362355763, + "grad_norm": 0.7432951331138611, + "learning_rate": 9.639942233337674e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.902596116065979, + "num_tokens": 393460920.0, + "step": 1482 + }, + { + "epoch": 0.14815924871372196, + "grad_norm": 0.5877786874771118, + "learning_rate": 9.639339155139684e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.9027770757675171, + "num_tokens": 393715776.0, + "step": 1483 + }, + { + "epoch": 0.1482591538038863, + "grad_norm": 0.515677273273468, + "learning_rate": 9.638735591198347e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.9038309156894684, + "num_tokens": 393979284.0, + "step": 1484 + }, + { + "epoch": 0.14835905889405065, + "grad_norm": 0.4980180859565735, + "learning_rate": 9.638131541576854e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.901726633310318, + "num_tokens": 394254200.0, + "step": 1485 + }, + { + "epoch": 0.148458963984215, + "grad_norm": 0.5900108218193054, + "learning_rate": 9.637527006338454e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9049893915653229, + "num_tokens": 394526598.0, + "step": 1486 + }, + { + "epoch": 0.14855886907437935, + "grad_norm": 0.7910662889480591, + "learning_rate": 9.636921985546438e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.9016940891742706, + "num_tokens": 394791057.0, + "step": 1487 + }, + { + "epoch": 0.14865877416454368, + "grad_norm": 0.5103524327278137, + "learning_rate": 9.636316479264154e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9038230180740356, + "num_tokens": 395058878.0, + "step": 1488 + }, + { + "epoch": 0.14875867925470804, + "grad_norm": 0.5710228085517883, + "learning_rate": 9.635710487555e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.9047639667987823, + "num_tokens": 395334316.0, + "step": 1489 + }, + { + "epoch": 0.14885858434487237, + "grad_norm": 0.6927698850631714, + "learning_rate": 9.635104010482422e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.9027532637119293, + "num_tokens": 395606187.0, + "step": 1490 + }, + { + "epoch": 0.14895848943503673, + "grad_norm": 0.6006709337234497, + "learning_rate": 9.63449704810992e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.9025911390781403, + "num_tokens": 395865972.0, + "step": 1491 + }, + { + "epoch": 0.14905839452520106, + "grad_norm": 0.6185697317123413, + "learning_rate": 9.633889600501043e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.9033693373203278, + "num_tokens": 396123258.0, + "step": 1492 + }, + { + "epoch": 0.1491582996153654, + "grad_norm": 0.5498040914535522, + "learning_rate": 9.633281667719394e-06, + "loss": 0.507, + "mean_token_accuracy": 0.9049420058727264, + "num_tokens": 396391660.0, + "step": 1493 + }, + { + "epoch": 0.14925820470552975, + "grad_norm": 0.6483862996101379, + "learning_rate": 9.632673249828618e-06, + "loss": 0.502, + "mean_token_accuracy": 0.904196560382843, + "num_tokens": 396663895.0, + "step": 1494 + }, + { + "epoch": 0.14935810979569408, + "grad_norm": 0.5295635461807251, + "learning_rate": 9.632064346892425e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.9046507477760315, + "num_tokens": 396927558.0, + "step": 1495 + }, + { + "epoch": 0.14945801488585844, + "grad_norm": 0.5918886065483093, + "learning_rate": 9.631454958974562e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.9022669792175293, + "num_tokens": 397197161.0, + "step": 1496 + }, + { + "epoch": 0.14955791997602277, + "grad_norm": 0.7849302291870117, + "learning_rate": 9.630845086138833e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.9031147658824921, + "num_tokens": 397465211.0, + "step": 1497 + }, + { + "epoch": 0.14965782506618713, + "grad_norm": 0.5059962272644043, + "learning_rate": 9.630234728449095e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9051655232906342, + "num_tokens": 397736525.0, + "step": 1498 + }, + { + "epoch": 0.14975773015635147, + "grad_norm": 0.6154648661613464, + "learning_rate": 9.62962388596925e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.903945654630661, + "num_tokens": 398000866.0, + "step": 1499 + }, + { + "epoch": 0.1498576352465158, + "grad_norm": 0.7144339084625244, + "learning_rate": 9.629012558763256e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.901612788438797, + "num_tokens": 398255992.0, + "step": 1500 + }, + { + "epoch": 0.14995754033668016, + "grad_norm": 0.5115087032318115, + "learning_rate": 9.628400746895119e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.9049118757247925, + "num_tokens": 398528959.0, + "step": 1501 + }, + { + "epoch": 0.1500574454268445, + "grad_norm": 0.5288372039794922, + "learning_rate": 9.627788450428896e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.9047894775867462, + "num_tokens": 398800081.0, + "step": 1502 + }, + { + "epoch": 0.15015735051700885, + "grad_norm": 0.5839836001396179, + "learning_rate": 9.627175669428695e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.9033406376838684, + "num_tokens": 399073106.0, + "step": 1503 + }, + { + "epoch": 0.15025725560717318, + "grad_norm": 0.5992316603660583, + "learning_rate": 9.626562403958674e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.9019407033920288, + "num_tokens": 399332941.0, + "step": 1504 + }, + { + "epoch": 0.15035716069733754, + "grad_norm": 0.5589104294776917, + "learning_rate": 9.625948654083043e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.9040060937404633, + "num_tokens": 399606420.0, + "step": 1505 + }, + { + "epoch": 0.15045706578750187, + "grad_norm": 0.6241596341133118, + "learning_rate": 9.625334419866064e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.901340514421463, + "num_tokens": 399878179.0, + "step": 1506 + }, + { + "epoch": 0.15055697087766623, + "grad_norm": 1.7007522583007812, + "learning_rate": 9.624719701372045e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9050316512584686, + "num_tokens": 400139826.0, + "step": 1507 + }, + { + "epoch": 0.15065687596783056, + "grad_norm": 0.7061233520507812, + "learning_rate": 9.624104498665353e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.902744472026825, + "num_tokens": 400397899.0, + "step": 1508 + }, + { + "epoch": 0.1507567810579949, + "grad_norm": 0.49984824657440186, + "learning_rate": 9.623488811810392e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9060215055942535, + "num_tokens": 400655350.0, + "step": 1509 + }, + { + "epoch": 0.15085668614815925, + "grad_norm": 0.5452098250389099, + "learning_rate": 9.622872640871632e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9036427140235901, + "num_tokens": 400915318.0, + "step": 1510 + }, + { + "epoch": 0.15095659123832358, + "grad_norm": 0.5423904061317444, + "learning_rate": 9.622255985913584e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.9042918682098389, + "num_tokens": 401176615.0, + "step": 1511 + }, + { + "epoch": 0.15105649632848794, + "grad_norm": 0.5750455260276794, + "learning_rate": 9.621638847000811e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.9025480151176453, + "num_tokens": 401449505.0, + "step": 1512 + }, + { + "epoch": 0.15115640141865228, + "grad_norm": 0.5485971570014954, + "learning_rate": 9.621021224197931e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.9026840329170227, + "num_tokens": 401710909.0, + "step": 1513 + }, + { + "epoch": 0.15125630650881663, + "grad_norm": 0.6629571318626404, + "learning_rate": 9.620403117569608e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9028066396713257, + "num_tokens": 401979035.0, + "step": 1514 + }, + { + "epoch": 0.15135621159898097, + "grad_norm": 0.9036003947257996, + "learning_rate": 9.619784527180559e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.9033937752246857, + "num_tokens": 402241578.0, + "step": 1515 + }, + { + "epoch": 0.1514561166891453, + "grad_norm": 0.9210726618766785, + "learning_rate": 9.619165453095549e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.9044755399227142, + "num_tokens": 402503200.0, + "step": 1516 + }, + { + "epoch": 0.15155602177930966, + "grad_norm": 1.2520782947540283, + "learning_rate": 9.618545895379398e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.9051457643508911, + "num_tokens": 402771954.0, + "step": 1517 + }, + { + "epoch": 0.151655926869474, + "grad_norm": 0.6077268719673157, + "learning_rate": 9.617925854096975e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.9036800563335419, + "num_tokens": 403040429.0, + "step": 1518 + }, + { + "epoch": 0.15175583195963835, + "grad_norm": 0.6945447325706482, + "learning_rate": 9.617305329313198e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9035133123397827, + "num_tokens": 403309572.0, + "step": 1519 + }, + { + "epoch": 0.15185573704980268, + "grad_norm": 0.6637132167816162, + "learning_rate": 9.616684321093035e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.9026598930358887, + "num_tokens": 403570546.0, + "step": 1520 + }, + { + "epoch": 0.15195564213996704, + "grad_norm": 0.6209186315536499, + "learning_rate": 9.616062829501507e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.9056476056575775, + "num_tokens": 403835903.0, + "step": 1521 + }, + { + "epoch": 0.15205554723013137, + "grad_norm": 0.6864840984344482, + "learning_rate": 9.615440854603686e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8992824852466583, + "num_tokens": 404103579.0, + "step": 1522 + }, + { + "epoch": 0.15215545232029573, + "grad_norm": 0.5145554542541504, + "learning_rate": 9.614818396464692e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9039113521575928, + "num_tokens": 404366611.0, + "step": 1523 + }, + { + "epoch": 0.15225535741046006, + "grad_norm": 0.8678667545318604, + "learning_rate": 9.614195455149698e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9021332859992981, + "num_tokens": 404628381.0, + "step": 1524 + }, + { + "epoch": 0.1523552625006244, + "grad_norm": 0.7589884400367737, + "learning_rate": 9.613572030723924e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9049051403999329, + "num_tokens": 404895853.0, + "step": 1525 + }, + { + "epoch": 0.15245516759078875, + "grad_norm": 0.8338485956192017, + "learning_rate": 9.612948123252647e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.9021118581295013, + "num_tokens": 405161516.0, + "step": 1526 + }, + { + "epoch": 0.15255507268095309, + "grad_norm": 0.501793622970581, + "learning_rate": 9.612323732801187e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.9059396684169769, + "num_tokens": 405425376.0, + "step": 1527 + }, + { + "epoch": 0.15265497777111744, + "grad_norm": 0.6815211772918701, + "learning_rate": 9.611698859434923e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9064254462718964, + "num_tokens": 405688925.0, + "step": 1528 + }, + { + "epoch": 0.15275488286128178, + "grad_norm": 0.7643570899963379, + "learning_rate": 9.611073503219275e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.9022575616836548, + "num_tokens": 405952702.0, + "step": 1529 + }, + { + "epoch": 0.15285478795144614, + "grad_norm": 0.7101548910140991, + "learning_rate": 9.610447664219722e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.9037668406963348, + "num_tokens": 406214437.0, + "step": 1530 + }, + { + "epoch": 0.15295469304161047, + "grad_norm": 0.5639755129814148, + "learning_rate": 9.609821342501787e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.9031048119068146, + "num_tokens": 406486375.0, + "step": 1531 + }, + { + "epoch": 0.15305459813177483, + "grad_norm": 0.5666201710700989, + "learning_rate": 9.609194538131048e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9034311175346375, + "num_tokens": 406751617.0, + "step": 1532 + }, + { + "epoch": 0.15315450322193916, + "grad_norm": 1.3425062894821167, + "learning_rate": 9.608567251173132e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.901758223772049, + "num_tokens": 407021011.0, + "step": 1533 + }, + { + "epoch": 0.1532544083121035, + "grad_norm": 0.6262737512588501, + "learning_rate": 9.607939481693717e-06, + "loss": 0.507, + "mean_token_accuracy": 0.9007153511047363, + "num_tokens": 407299456.0, + "step": 1534 + }, + { + "epoch": 0.15335431340226785, + "grad_norm": 0.5797330141067505, + "learning_rate": 9.607311229758531e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.9054293930530548, + "num_tokens": 407555691.0, + "step": 1535 + }, + { + "epoch": 0.15345421849243218, + "grad_norm": 0.7053268551826477, + "learning_rate": 9.606682495433352e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9032306373119354, + "num_tokens": 407821156.0, + "step": 1536 + }, + { + "epoch": 0.15355412358259654, + "grad_norm": 0.6278219223022461, + "learning_rate": 9.606053278784009e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9030491709709167, + "num_tokens": 408088109.0, + "step": 1537 + }, + { + "epoch": 0.15365402867276087, + "grad_norm": 0.7081257700920105, + "learning_rate": 9.605423579876381e-06, + "loss": 0.505, + "mean_token_accuracy": 0.9028661251068115, + "num_tokens": 408355488.0, + "step": 1538 + }, + { + "epoch": 0.15375393376292523, + "grad_norm": 0.8667905330657959, + "learning_rate": 9.6047933987764e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.9043941795825958, + "num_tokens": 408615521.0, + "step": 1539 + }, + { + "epoch": 0.15385383885308956, + "grad_norm": 0.6659677028656006, + "learning_rate": 9.604162735550045e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.9013002812862396, + "num_tokens": 408880828.0, + "step": 1540 + }, + { + "epoch": 0.1539537439432539, + "grad_norm": 0.6297799944877625, + "learning_rate": 9.603531590263348e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.9043773412704468, + "num_tokens": 409147306.0, + "step": 1541 + }, + { + "epoch": 0.15405364903341826, + "grad_norm": 0.6863802075386047, + "learning_rate": 9.60289996298239e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.9060810208320618, + "num_tokens": 409400838.0, + "step": 1542 + }, + { + "epoch": 0.1541535541235826, + "grad_norm": 0.5698356628417969, + "learning_rate": 9.602267853773301e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.9005958735942841, + "num_tokens": 409670187.0, + "step": 1543 + }, + { + "epoch": 0.15425345921374695, + "grad_norm": 0.7526476383209229, + "learning_rate": 9.60163526270227e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.9046715795993805, + "num_tokens": 409935590.0, + "step": 1544 + }, + { + "epoch": 0.15435336430391128, + "grad_norm": 0.8003020882606506, + "learning_rate": 9.601002189835522e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.9044642448425293, + "num_tokens": 410186145.0, + "step": 1545 + }, + { + "epoch": 0.15445326939407564, + "grad_norm": 0.5800787210464478, + "learning_rate": 9.600368635239343e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.9055214822292328, + "num_tokens": 410449782.0, + "step": 1546 + }, + { + "epoch": 0.15455317448423997, + "grad_norm": 0.5375522375106812, + "learning_rate": 9.59973459898007e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.9035554826259613, + "num_tokens": 410709902.0, + "step": 1547 + }, + { + "epoch": 0.15465307957440433, + "grad_norm": 0.7646605372428894, + "learning_rate": 9.599100081124083e-06, + "loss": 0.509, + "mean_token_accuracy": 0.9021894633769989, + "num_tokens": 410963953.0, + "step": 1548 + }, + { + "epoch": 0.15475298466456866, + "grad_norm": 0.5797030329704285, + "learning_rate": 9.59846508173782e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.9042819440364838, + "num_tokens": 411224488.0, + "step": 1549 + }, + { + "epoch": 0.154852889754733, + "grad_norm": 0.5622846484184265, + "learning_rate": 9.597829600887766e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.9043388664722443, + "num_tokens": 411485902.0, + "step": 1550 + }, + { + "epoch": 0.15495279484489735, + "grad_norm": 0.740879476070404, + "learning_rate": 9.597193638640451e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9031819403171539, + "num_tokens": 411737039.0, + "step": 1551 + }, + { + "epoch": 0.15505269993506168, + "grad_norm": 0.5903467535972595, + "learning_rate": 9.596557195062468e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.9043676853179932, + "num_tokens": 412009550.0, + "step": 1552 + }, + { + "epoch": 0.15515260502522604, + "grad_norm": 0.6382765173912048, + "learning_rate": 9.59592027022045e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.9034768640995026, + "num_tokens": 412275817.0, + "step": 1553 + }, + { + "epoch": 0.15525251011539037, + "grad_norm": 0.5693243741989136, + "learning_rate": 9.595282864181082e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9052755534648895, + "num_tokens": 412538770.0, + "step": 1554 + }, + { + "epoch": 0.15535241520555473, + "grad_norm": 0.9872763156890869, + "learning_rate": 9.594644977011103e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.9052473306655884, + "num_tokens": 412803588.0, + "step": 1555 + }, + { + "epoch": 0.15545232029571907, + "grad_norm": 0.5625317692756653, + "learning_rate": 9.5940066087773e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.904242604970932, + "num_tokens": 413070273.0, + "step": 1556 + }, + { + "epoch": 0.1555522253858834, + "grad_norm": 0.5439552664756775, + "learning_rate": 9.59336775954651e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9037364721298218, + "num_tokens": 413323582.0, + "step": 1557 + }, + { + "epoch": 0.15565213047604776, + "grad_norm": 0.5443049669265747, + "learning_rate": 9.592728429385625e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.9017732441425323, + "num_tokens": 413588498.0, + "step": 1558 + }, + { + "epoch": 0.1557520355662121, + "grad_norm": 0.7603972554206848, + "learning_rate": 9.59208861836158e-06, + "loss": 0.512, + "mean_token_accuracy": 0.9022661447525024, + "num_tokens": 413857587.0, + "step": 1559 + }, + { + "epoch": 0.15585194065637645, + "grad_norm": 0.7607426047325134, + "learning_rate": 9.591448326541365e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.9042834639549255, + "num_tokens": 414120444.0, + "step": 1560 + }, + { + "epoch": 0.15595184574654078, + "grad_norm": 0.6554730534553528, + "learning_rate": 9.590807553992017e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.9033361375331879, + "num_tokens": 414390301.0, + "step": 1561 + }, + { + "epoch": 0.15605175083670514, + "grad_norm": 0.6943422555923462, + "learning_rate": 9.590166300780628e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.9031800329685211, + "num_tokens": 414642361.0, + "step": 1562 + }, + { + "epoch": 0.15615165592686947, + "grad_norm": 0.5568698048591614, + "learning_rate": 9.589524566974335e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9037858545780182, + "num_tokens": 414908061.0, + "step": 1563 + }, + { + "epoch": 0.15625156101703383, + "grad_norm": 0.5100056529045105, + "learning_rate": 9.588882352640332e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.9033072888851166, + "num_tokens": 415175637.0, + "step": 1564 + }, + { + "epoch": 0.15635146610719816, + "grad_norm": 0.5016840100288391, + "learning_rate": 9.588239657845857e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.9057670831680298, + "num_tokens": 415443749.0, + "step": 1565 + }, + { + "epoch": 0.1564513711973625, + "grad_norm": 0.5692324638366699, + "learning_rate": 9.587596482658201e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9068531095981598, + "num_tokens": 415713417.0, + "step": 1566 + }, + { + "epoch": 0.15655127628752685, + "grad_norm": 0.7529345750808716, + "learning_rate": 9.586952827144707e-06, + "loss": 0.504, + "mean_token_accuracy": 0.9023576378822327, + "num_tokens": 415976573.0, + "step": 1567 + }, + { + "epoch": 0.15665118137769118, + "grad_norm": 0.7895249724388123, + "learning_rate": 9.586308691372763e-06, + "loss": 0.502, + "mean_token_accuracy": 0.9045182764530182, + "num_tokens": 416242805.0, + "step": 1568 + }, + { + "epoch": 0.15675108646785554, + "grad_norm": 0.7351782917976379, + "learning_rate": 9.585664075409815e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.9007662236690521, + "num_tokens": 416512086.0, + "step": 1569 + }, + { + "epoch": 0.15685099155801988, + "grad_norm": 0.6061886548995972, + "learning_rate": 9.58501897932335e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9026832580566406, + "num_tokens": 416784127.0, + "step": 1570 + }, + { + "epoch": 0.15695089664818423, + "grad_norm": 0.6551501154899597, + "learning_rate": 9.584373403180914e-06, + "loss": 0.51, + "mean_token_accuracy": 0.9010500311851501, + "num_tokens": 417052874.0, + "step": 1571 + }, + { + "epoch": 0.15705080173834857, + "grad_norm": 0.6508028507232666, + "learning_rate": 9.583727347050098e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.9025426506996155, + "num_tokens": 417322777.0, + "step": 1572 + }, + { + "epoch": 0.15715070682851293, + "grad_norm": 0.6015965938568115, + "learning_rate": 9.583080810998545e-06, + "loss": 0.51, + "mean_token_accuracy": 0.9041687548160553, + "num_tokens": 417589307.0, + "step": 1573 + }, + { + "epoch": 0.15725061191867726, + "grad_norm": 0.6875624060630798, + "learning_rate": 9.582433795093944e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.9034349918365479, + "num_tokens": 417849163.0, + "step": 1574 + }, + { + "epoch": 0.1573505170088416, + "grad_norm": 0.5373759269714355, + "learning_rate": 9.581786299404046e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9037543535232544, + "num_tokens": 418117539.0, + "step": 1575 + }, + { + "epoch": 0.15745042209900595, + "grad_norm": 0.7458522915840149, + "learning_rate": 9.581138323996639e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.9045245051383972, + "num_tokens": 418388178.0, + "step": 1576 + }, + { + "epoch": 0.15755032718917028, + "grad_norm": 0.7486932277679443, + "learning_rate": 9.580489868939568e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.9019014835357666, + "num_tokens": 418668878.0, + "step": 1577 + }, + { + "epoch": 0.15765023227933464, + "grad_norm": 0.5179007649421692, + "learning_rate": 9.579840934300728e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.9034174680709839, + "num_tokens": 418935938.0, + "step": 1578 + }, + { + "epoch": 0.15775013736949897, + "grad_norm": 0.8290794491767883, + "learning_rate": 9.57919152014806e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.9008879959583282, + "num_tokens": 419205270.0, + "step": 1579 + }, + { + "epoch": 0.15785004245966333, + "grad_norm": 0.8048126101493835, + "learning_rate": 9.578541626549562e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9058973789215088, + "num_tokens": 419467039.0, + "step": 1580 + }, + { + "epoch": 0.15794994754982766, + "grad_norm": 0.7182457447052002, + "learning_rate": 9.577891253573274e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9041440188884735, + "num_tokens": 419734268.0, + "step": 1581 + }, + { + "epoch": 0.158049852639992, + "grad_norm": 2.4498038291931152, + "learning_rate": 9.577240401287297e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.90632364153862, + "num_tokens": 420005496.0, + "step": 1582 + }, + { + "epoch": 0.15814975773015635, + "grad_norm": 0.6336144208908081, + "learning_rate": 9.576589069759769e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.903453528881073, + "num_tokens": 420272988.0, + "step": 1583 + }, + { + "epoch": 0.15824966282032069, + "grad_norm": 0.8049970865249634, + "learning_rate": 9.575937259058891e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9043312668800354, + "num_tokens": 420533798.0, + "step": 1584 + }, + { + "epoch": 0.15834956791048505, + "grad_norm": 1.0126502513885498, + "learning_rate": 9.575284969252904e-06, + "loss": 0.514, + "mean_token_accuracy": 0.903589129447937, + "num_tokens": 420802751.0, + "step": 1585 + }, + { + "epoch": 0.15844947300064938, + "grad_norm": 0.5984976887702942, + "learning_rate": 9.574632200410105e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.9029607772827148, + "num_tokens": 421065402.0, + "step": 1586 + }, + { + "epoch": 0.15854937809081374, + "grad_norm": 1.0624079704284668, + "learning_rate": 9.573978952598841e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.9026586413383484, + "num_tokens": 421333290.0, + "step": 1587 + }, + { + "epoch": 0.15864928318097807, + "grad_norm": 0.663805365562439, + "learning_rate": 9.573325225887506e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.9012669920921326, + "num_tokens": 421600043.0, + "step": 1588 + }, + { + "epoch": 0.15874918827114243, + "grad_norm": 0.598788321018219, + "learning_rate": 9.572671020344545e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.9026013016700745, + "num_tokens": 421874004.0, + "step": 1589 + }, + { + "epoch": 0.15884909336130676, + "grad_norm": 0.7077288031578064, + "learning_rate": 9.572016336038454e-06, + "loss": 0.507, + "mean_token_accuracy": 0.9041629731655121, + "num_tokens": 422129943.0, + "step": 1590 + }, + { + "epoch": 0.1589489984514711, + "grad_norm": 0.7457950115203857, + "learning_rate": 9.571361173037782e-06, + "loss": 0.509, + "mean_token_accuracy": 0.9025976955890656, + "num_tokens": 422401444.0, + "step": 1591 + }, + { + "epoch": 0.15904890354163545, + "grad_norm": 0.627871572971344, + "learning_rate": 9.570705531411122e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.9056496322154999, + "num_tokens": 422670899.0, + "step": 1592 + }, + { + "epoch": 0.15914880863179978, + "grad_norm": 0.7252890467643738, + "learning_rate": 9.570049411227122e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.9041589796543121, + "num_tokens": 422933938.0, + "step": 1593 + }, + { + "epoch": 0.15924871372196414, + "grad_norm": 0.5560283064842224, + "learning_rate": 9.56939281255448e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.9057256281375885, + "num_tokens": 423197816.0, + "step": 1594 + }, + { + "epoch": 0.15934861881212847, + "grad_norm": 0.5484156608581543, + "learning_rate": 9.568735735461938e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.901011735200882, + "num_tokens": 423458878.0, + "step": 1595 + }, + { + "epoch": 0.15944852390229283, + "grad_norm": 0.589509129524231, + "learning_rate": 9.568078180018295e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.8994538187980652, + "num_tokens": 423739031.0, + "step": 1596 + }, + { + "epoch": 0.15954842899245716, + "grad_norm": 0.6699275970458984, + "learning_rate": 9.5674201462924e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9021377563476562, + "num_tokens": 423988833.0, + "step": 1597 + }, + { + "epoch": 0.1596483340826215, + "grad_norm": 0.6398295164108276, + "learning_rate": 9.566761634353145e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.9050806760787964, + "num_tokens": 424251573.0, + "step": 1598 + }, + { + "epoch": 0.15974823917278586, + "grad_norm": 0.740921139717102, + "learning_rate": 9.56610264426948e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.9020445644855499, + "num_tokens": 424514956.0, + "step": 1599 + }, + { + "epoch": 0.1598481442629502, + "grad_norm": 0.6259545087814331, + "learning_rate": 9.565443176110402e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.9067873656749725, + "num_tokens": 424779024.0, + "step": 1600 + }, + { + "epoch": 0.15994804935311455, + "grad_norm": 0.611455500125885, + "learning_rate": 9.564783229944958e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.9045741856098175, + "num_tokens": 425031927.0, + "step": 1601 + }, + { + "epoch": 0.16004795444327888, + "grad_norm": 1.818587064743042, + "learning_rate": 9.564122805842244e-06, + "loss": 0.51, + "mean_token_accuracy": 0.9036681354045868, + "num_tokens": 425292897.0, + "step": 1602 + }, + { + "epoch": 0.16014785953344324, + "grad_norm": 0.7964921593666077, + "learning_rate": 9.563461903871407e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9027300179004669, + "num_tokens": 425566449.0, + "step": 1603 + }, + { + "epoch": 0.16024776462360757, + "grad_norm": 1.4149465560913086, + "learning_rate": 9.562800524101644e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.9007247984409332, + "num_tokens": 425829213.0, + "step": 1604 + }, + { + "epoch": 0.16034766971377193, + "grad_norm": 0.8085068464279175, + "learning_rate": 9.562138666602204e-06, + "loss": 0.503, + "mean_token_accuracy": 0.9055664241313934, + "num_tokens": 426089645.0, + "step": 1605 + }, + { + "epoch": 0.16044757480393626, + "grad_norm": 0.7232611179351807, + "learning_rate": 9.56147633144238e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.9044082164764404, + "num_tokens": 426356879.0, + "step": 1606 + }, + { + "epoch": 0.1605474798941006, + "grad_norm": 0.8532090783119202, + "learning_rate": 9.560813518691524e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.9026918113231659, + "num_tokens": 426620342.0, + "step": 1607 + }, + { + "epoch": 0.16064738498426495, + "grad_norm": 0.6758686900138855, + "learning_rate": 9.560150228419031e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.8999322652816772, + "num_tokens": 426883343.0, + "step": 1608 + }, + { + "epoch": 0.16074729007442928, + "grad_norm": 0.6668797731399536, + "learning_rate": 9.559486460694348e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.9043004214763641, + "num_tokens": 427150924.0, + "step": 1609 + }, + { + "epoch": 0.16084719516459364, + "grad_norm": 0.8283735513687134, + "learning_rate": 9.55882221558697e-06, + "loss": 0.503, + "mean_token_accuracy": 0.9038105010986328, + "num_tokens": 427418246.0, + "step": 1610 + }, + { + "epoch": 0.16094710025475797, + "grad_norm": 0.7971392869949341, + "learning_rate": 9.55815749316645e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.9044902920722961, + "num_tokens": 427676778.0, + "step": 1611 + }, + { + "epoch": 0.16104700534492233, + "grad_norm": 0.7245474457740784, + "learning_rate": 9.557492293502379e-06, + "loss": 0.508, + "mean_token_accuracy": 0.9051124751567841, + "num_tokens": 427934499.0, + "step": 1612 + }, + { + "epoch": 0.16114691043508667, + "grad_norm": 0.6747304797172546, + "learning_rate": 9.556826616664408e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.904435932636261, + "num_tokens": 428194904.0, + "step": 1613 + }, + { + "epoch": 0.16124681552525102, + "grad_norm": 0.5995769500732422, + "learning_rate": 9.556160462722231e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.906059205532074, + "num_tokens": 428450818.0, + "step": 1614 + }, + { + "epoch": 0.16134672061541536, + "grad_norm": 0.5886339545249939, + "learning_rate": 9.555493831745598e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.9034746885299683, + "num_tokens": 428718851.0, + "step": 1615 + }, + { + "epoch": 0.1614466257055797, + "grad_norm": 0.6095187664031982, + "learning_rate": 9.554826723804304e-06, + "loss": 0.516, + "mean_token_accuracy": 0.9012976586818695, + "num_tokens": 428980398.0, + "step": 1616 + }, + { + "epoch": 0.16154653079574405, + "grad_norm": 0.5832844376564026, + "learning_rate": 9.554159138968195e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.9043067395687103, + "num_tokens": 429243783.0, + "step": 1617 + }, + { + "epoch": 0.16164643588590838, + "grad_norm": 0.6553124785423279, + "learning_rate": 9.55349107730717e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.9017988741397858, + "num_tokens": 429505100.0, + "step": 1618 + }, + { + "epoch": 0.16174634097607274, + "grad_norm": 0.6644808650016785, + "learning_rate": 9.552822538891175e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9056373536586761, + "num_tokens": 429774885.0, + "step": 1619 + }, + { + "epoch": 0.16184624606623707, + "grad_norm": 0.5235426425933838, + "learning_rate": 9.552153523790207e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.9034063816070557, + "num_tokens": 430047619.0, + "step": 1620 + }, + { + "epoch": 0.16194615115640143, + "grad_norm": 0.6214118599891663, + "learning_rate": 9.551484032074312e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.9053822457790375, + "num_tokens": 430311200.0, + "step": 1621 + }, + { + "epoch": 0.16204605624656576, + "grad_norm": 0.6034170985221863, + "learning_rate": 9.550814063813585e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9045069813728333, + "num_tokens": 430569314.0, + "step": 1622 + }, + { + "epoch": 0.1621459613367301, + "grad_norm": 0.796087384223938, + "learning_rate": 9.550143619078175e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.9023241996765137, + "num_tokens": 430831208.0, + "step": 1623 + }, + { + "epoch": 0.16224586642689445, + "grad_norm": 0.5319629907608032, + "learning_rate": 9.549472697938275e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.903765857219696, + "num_tokens": 431092906.0, + "step": 1624 + }, + { + "epoch": 0.16234577151705878, + "grad_norm": 0.920511782169342, + "learning_rate": 9.548801300464135e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9060468077659607, + "num_tokens": 431366780.0, + "step": 1625 + }, + { + "epoch": 0.16244567660722314, + "grad_norm": 0.6824195981025696, + "learning_rate": 9.548129426726048e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.9044918119907379, + "num_tokens": 431628376.0, + "step": 1626 + }, + { + "epoch": 0.16254558169738748, + "grad_norm": 0.662686824798584, + "learning_rate": 9.54745707679436e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9035419225692749, + "num_tokens": 431900690.0, + "step": 1627 + }, + { + "epoch": 0.16264548678755184, + "grad_norm": 0.8338550925254822, + "learning_rate": 9.546784250739468e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.9016840755939484, + "num_tokens": 432153766.0, + "step": 1628 + }, + { + "epoch": 0.16274539187771617, + "grad_norm": 0.6369472146034241, + "learning_rate": 9.546110948631817e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.9046359360218048, + "num_tokens": 432423734.0, + "step": 1629 + }, + { + "epoch": 0.16284529696788053, + "grad_norm": 0.7888449430465698, + "learning_rate": 9.545437170541903e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.9024972319602966, + "num_tokens": 432686336.0, + "step": 1630 + }, + { + "epoch": 0.16294520205804486, + "grad_norm": 0.558550238609314, + "learning_rate": 9.544762916540271e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.9010066092014313, + "num_tokens": 432945457.0, + "step": 1631 + }, + { + "epoch": 0.1630451071482092, + "grad_norm": 0.6831156611442566, + "learning_rate": 9.544088186697515e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.9005651772022247, + "num_tokens": 433218998.0, + "step": 1632 + }, + { + "epoch": 0.16314501223837355, + "grad_norm": 0.5831497311592102, + "learning_rate": 9.543412981084282e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9049779772758484, + "num_tokens": 433478644.0, + "step": 1633 + }, + { + "epoch": 0.16324491732853788, + "grad_norm": 0.5579458475112915, + "learning_rate": 9.542737299771262e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9063190221786499, + "num_tokens": 433741201.0, + "step": 1634 + }, + { + "epoch": 0.16334482241870224, + "grad_norm": 0.5146346092224121, + "learning_rate": 9.542061142829206e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.9066999554634094, + "num_tokens": 434017043.0, + "step": 1635 + }, + { + "epoch": 0.16344472750886657, + "grad_norm": 0.8437780737876892, + "learning_rate": 9.541384510328905e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.9000266492366791, + "num_tokens": 434276443.0, + "step": 1636 + }, + { + "epoch": 0.16354463259903093, + "grad_norm": 0.6479038596153259, + "learning_rate": 9.540707402341203e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.9028315544128418, + "num_tokens": 434536057.0, + "step": 1637 + }, + { + "epoch": 0.16364453768919526, + "grad_norm": 0.5933903455734253, + "learning_rate": 9.540029818936993e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8987687230110168, + "num_tokens": 434790539.0, + "step": 1638 + }, + { + "epoch": 0.1637444427793596, + "grad_norm": 0.6367395520210266, + "learning_rate": 9.539351760187218e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.9004695117473602, + "num_tokens": 435045345.0, + "step": 1639 + }, + { + "epoch": 0.16384434786952395, + "grad_norm": 0.5646086931228638, + "learning_rate": 9.538673226162878e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.9037920534610748, + "num_tokens": 435312916.0, + "step": 1640 + }, + { + "epoch": 0.16394425295968829, + "grad_norm": 0.5656902194023132, + "learning_rate": 9.537994216935007e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9039512574672699, + "num_tokens": 435576234.0, + "step": 1641 + }, + { + "epoch": 0.16404415804985265, + "grad_norm": 0.6453832387924194, + "learning_rate": 9.537314732574702e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.904319703578949, + "num_tokens": 435843148.0, + "step": 1642 + }, + { + "epoch": 0.16414406314001698, + "grad_norm": 0.564923882484436, + "learning_rate": 9.536634773153108e-06, + "loss": 0.513, + "mean_token_accuracy": 0.9048894047737122, + "num_tokens": 436110123.0, + "step": 1643 + }, + { + "epoch": 0.16424396823018134, + "grad_norm": 0.5627858638763428, + "learning_rate": 9.535954338741416e-06, + "loss": 0.505, + "mean_token_accuracy": 0.9024398326873779, + "num_tokens": 436377051.0, + "step": 1644 + }, + { + "epoch": 0.16434387332034567, + "grad_norm": 0.6840916872024536, + "learning_rate": 9.535273429410865e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.9040327072143555, + "num_tokens": 436646006.0, + "step": 1645 + }, + { + "epoch": 0.16444377841051003, + "grad_norm": 0.5469236373901367, + "learning_rate": 9.534592045232752e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.9029740393161774, + "num_tokens": 436902032.0, + "step": 1646 + }, + { + "epoch": 0.16454368350067436, + "grad_norm": 0.6535437107086182, + "learning_rate": 9.533910186278413e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.9032817780971527, + "num_tokens": 437170512.0, + "step": 1647 + }, + { + "epoch": 0.1646435885908387, + "grad_norm": 0.6609862446784973, + "learning_rate": 9.533227852619244e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.903153270483017, + "num_tokens": 437433099.0, + "step": 1648 + }, + { + "epoch": 0.16474349368100305, + "grad_norm": 0.593184232711792, + "learning_rate": 9.532545044326685e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9048750102519989, + "num_tokens": 437692011.0, + "step": 1649 + }, + { + "epoch": 0.16484339877116738, + "grad_norm": 0.5035969614982605, + "learning_rate": 9.531861761472222e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.9041532576084137, + "num_tokens": 437950727.0, + "step": 1650 + }, + { + "epoch": 0.16494330386133174, + "grad_norm": 0.5166203379631042, + "learning_rate": 9.531178004127404e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9018945097923279, + "num_tokens": 438219524.0, + "step": 1651 + }, + { + "epoch": 0.16504320895149607, + "grad_norm": 0.7452768683433533, + "learning_rate": 9.530493772363814e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.9030871093273163, + "num_tokens": 438493141.0, + "step": 1652 + }, + { + "epoch": 0.16514311404166043, + "grad_norm": 0.504246711730957, + "learning_rate": 9.529809066253095e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9028056859970093, + "num_tokens": 438758736.0, + "step": 1653 + }, + { + "epoch": 0.16524301913182476, + "grad_norm": 0.5967943072319031, + "learning_rate": 9.529123885866934e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.903816431760788, + "num_tokens": 439028286.0, + "step": 1654 + }, + { + "epoch": 0.16534292422198912, + "grad_norm": 0.5770803093910217, + "learning_rate": 9.528438231277073e-06, + "loss": 0.509, + "mean_token_accuracy": 0.9014593660831451, + "num_tokens": 439287947.0, + "step": 1655 + }, + { + "epoch": 0.16544282931215346, + "grad_norm": 0.502758264541626, + "learning_rate": 9.5277521025553e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.901127427816391, + "num_tokens": 439556982.0, + "step": 1656 + }, + { + "epoch": 0.1655427344023178, + "grad_norm": 0.7474815845489502, + "learning_rate": 9.527065499773449e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.900395542383194, + "num_tokens": 439820068.0, + "step": 1657 + }, + { + "epoch": 0.16564263949248215, + "grad_norm": 0.6270552277565002, + "learning_rate": 9.526378423003415e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.9036577641963959, + "num_tokens": 440089837.0, + "step": 1658 + }, + { + "epoch": 0.16574254458264648, + "grad_norm": 0.5051000118255615, + "learning_rate": 9.52569087231713e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.9047114849090576, + "num_tokens": 440358497.0, + "step": 1659 + }, + { + "epoch": 0.16584244967281084, + "grad_norm": 0.6046449542045593, + "learning_rate": 9.525002847786585e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.9072101712226868, + "num_tokens": 440619497.0, + "step": 1660 + }, + { + "epoch": 0.16594235476297517, + "grad_norm": 0.6692368388175964, + "learning_rate": 9.524314349483815e-06, + "loss": 0.506, + "mean_token_accuracy": 0.9011781811714172, + "num_tokens": 440885610.0, + "step": 1661 + }, + { + "epoch": 0.16604225985313953, + "grad_norm": 0.7369250059127808, + "learning_rate": 9.523625377480907e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.9027471840381622, + "num_tokens": 441150172.0, + "step": 1662 + }, + { + "epoch": 0.16614216494330386, + "grad_norm": 0.6510202288627625, + "learning_rate": 9.522935931849996e-06, + "loss": 0.505, + "mean_token_accuracy": 0.9029288589954376, + "num_tokens": 441420357.0, + "step": 1663 + }, + { + "epoch": 0.1662420700334682, + "grad_norm": 0.6267367601394653, + "learning_rate": 9.522246012663267e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.9040212631225586, + "num_tokens": 441689649.0, + "step": 1664 + }, + { + "epoch": 0.16634197512363255, + "grad_norm": 0.5127782225608826, + "learning_rate": 9.52155561999296e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9060147106647491, + "num_tokens": 441948213.0, + "step": 1665 + }, + { + "epoch": 0.16644188021379688, + "grad_norm": 0.5437791347503662, + "learning_rate": 9.520864753911353e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9031147658824921, + "num_tokens": 442229755.0, + "step": 1666 + }, + { + "epoch": 0.16654178530396124, + "grad_norm": 0.5746214985847473, + "learning_rate": 9.520173414490787e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.9033672511577606, + "num_tokens": 442499138.0, + "step": 1667 + }, + { + "epoch": 0.16664169039412557, + "grad_norm": 0.5380088090896606, + "learning_rate": 9.51948160180364e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.9057815670967102, + "num_tokens": 442758769.0, + "step": 1668 + }, + { + "epoch": 0.16674159548428993, + "grad_norm": 0.576178789138794, + "learning_rate": 9.51878931592235e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.904695063829422, + "num_tokens": 443025470.0, + "step": 1669 + }, + { + "epoch": 0.16684150057445427, + "grad_norm": 0.7054939866065979, + "learning_rate": 9.518096556919396e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.9003466367721558, + "num_tokens": 443286798.0, + "step": 1670 + }, + { + "epoch": 0.16694140566461863, + "grad_norm": 1.4372934103012085, + "learning_rate": 9.517403324867313e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.9028944373130798, + "num_tokens": 443552334.0, + "step": 1671 + }, + { + "epoch": 0.16704131075478296, + "grad_norm": 0.5404810309410095, + "learning_rate": 9.516709619838685e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.9022538661956787, + "num_tokens": 443818589.0, + "step": 1672 + }, + { + "epoch": 0.1671412158449473, + "grad_norm": 0.6744565367698669, + "learning_rate": 9.51601544190614e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.9019111394882202, + "num_tokens": 444088620.0, + "step": 1673 + }, + { + "epoch": 0.16724112093511165, + "grad_norm": 0.5118709802627563, + "learning_rate": 9.51532079114236e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.9053635001182556, + "num_tokens": 444351339.0, + "step": 1674 + }, + { + "epoch": 0.16734102602527598, + "grad_norm": 0.5878137946128845, + "learning_rate": 9.514625667620077e-06, + "loss": 0.506, + "mean_token_accuracy": 0.9028836488723755, + "num_tokens": 444611830.0, + "step": 1675 + }, + { + "epoch": 0.16744093111544034, + "grad_norm": 0.49859973788261414, + "learning_rate": 9.51393007141207e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.9058897793292999, + "num_tokens": 444880124.0, + "step": 1676 + }, + { + "epoch": 0.16754083620560467, + "grad_norm": 0.6987383961677551, + "learning_rate": 9.513234002591167e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9052127003669739, + "num_tokens": 445148517.0, + "step": 1677 + }, + { + "epoch": 0.16764074129576903, + "grad_norm": 0.5811188220977783, + "learning_rate": 9.512537461230252e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9058873355388641, + "num_tokens": 445420020.0, + "step": 1678 + }, + { + "epoch": 0.16774064638593336, + "grad_norm": 0.7014575600624084, + "learning_rate": 9.511840447402247e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9025618433952332, + "num_tokens": 445677924.0, + "step": 1679 + }, + { + "epoch": 0.1678405514760977, + "grad_norm": 0.5937491655349731, + "learning_rate": 9.511142961180135e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9039745628833771, + "num_tokens": 445945491.0, + "step": 1680 + }, + { + "epoch": 0.16794045656626205, + "grad_norm": 0.4882948100566864, + "learning_rate": 9.510445002636943e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.9055314660072327, + "num_tokens": 446212175.0, + "step": 1681 + }, + { + "epoch": 0.16804036165642638, + "grad_norm": 0.5847341418266296, + "learning_rate": 9.509746571845747e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.9026099741458893, + "num_tokens": 446475130.0, + "step": 1682 + }, + { + "epoch": 0.16814026674659074, + "grad_norm": 0.555514395236969, + "learning_rate": 9.509047668879672e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.9028255939483643, + "num_tokens": 446734500.0, + "step": 1683 + }, + { + "epoch": 0.16824017183675508, + "grad_norm": 0.6119492053985596, + "learning_rate": 9.508348293811895e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9040518701076508, + "num_tokens": 447021040.0, + "step": 1684 + }, + { + "epoch": 0.16834007692691944, + "grad_norm": 0.6967895030975342, + "learning_rate": 9.507648446715642e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9020030498504639, + "num_tokens": 447290009.0, + "step": 1685 + }, + { + "epoch": 0.16843998201708377, + "grad_norm": 0.739327609539032, + "learning_rate": 9.506948127664186e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9044288098812103, + "num_tokens": 447556823.0, + "step": 1686 + }, + { + "epoch": 0.16853988710724813, + "grad_norm": 0.5470688343048096, + "learning_rate": 9.506247336730854e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.9043063223361969, + "num_tokens": 447822561.0, + "step": 1687 + }, + { + "epoch": 0.16863979219741246, + "grad_norm": 0.5588088035583496, + "learning_rate": 9.505546073989016e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.9030115902423859, + "num_tokens": 448092036.0, + "step": 1688 + }, + { + "epoch": 0.1687396972875768, + "grad_norm": 0.5436162352561951, + "learning_rate": 9.504844339512096e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.9064910411834717, + "num_tokens": 448355665.0, + "step": 1689 + }, + { + "epoch": 0.16883960237774115, + "grad_norm": 0.6045195460319519, + "learning_rate": 9.504142133373568e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.9038142561912537, + "num_tokens": 448619010.0, + "step": 1690 + }, + { + "epoch": 0.16893950746790548, + "grad_norm": 0.5515959858894348, + "learning_rate": 9.503439455646952e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.906343549489975, + "num_tokens": 448889551.0, + "step": 1691 + }, + { + "epoch": 0.16903941255806984, + "grad_norm": 0.6139813661575317, + "learning_rate": 9.50273630640582e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.904435396194458, + "num_tokens": 449175761.0, + "step": 1692 + }, + { + "epoch": 0.16913931764823417, + "grad_norm": 0.6351615786552429, + "learning_rate": 9.502032685723792e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.9024999737739563, + "num_tokens": 449442777.0, + "step": 1693 + }, + { + "epoch": 0.16923922273839853, + "grad_norm": 0.5780846476554871, + "learning_rate": 9.501328593674537e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.9061670005321503, + "num_tokens": 449709071.0, + "step": 1694 + }, + { + "epoch": 0.16933912782856286, + "grad_norm": 0.7270721793174744, + "learning_rate": 9.500624030331775e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.90597864985466, + "num_tokens": 449972915.0, + "step": 1695 + }, + { + "epoch": 0.16943903291872722, + "grad_norm": 0.5678133368492126, + "learning_rate": 9.499918995769274e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.9026512503623962, + "num_tokens": 450244175.0, + "step": 1696 + }, + { + "epoch": 0.16953893800889155, + "grad_norm": 0.6171008348464966, + "learning_rate": 9.499213490060853e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.9019734859466553, + "num_tokens": 450505180.0, + "step": 1697 + }, + { + "epoch": 0.16963884309905589, + "grad_norm": 0.5307679176330566, + "learning_rate": 9.498507513280378e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9062852263450623, + "num_tokens": 450766724.0, + "step": 1698 + }, + { + "epoch": 0.16973874818922025, + "grad_norm": 0.5393680334091187, + "learning_rate": 9.497801065501766e-06, + "loss": 0.502, + "mean_token_accuracy": 0.9038625657558441, + "num_tokens": 451027186.0, + "step": 1699 + }, + { + "epoch": 0.16983865327938458, + "grad_norm": 0.5983613729476929, + "learning_rate": 9.497094146798981e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.9028242528438568, + "num_tokens": 451287489.0, + "step": 1700 + }, + { + "epoch": 0.16993855836954894, + "grad_norm": 0.8433669805526733, + "learning_rate": 9.496386757246041e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9023908078670502, + "num_tokens": 451548831.0, + "step": 1701 + }, + { + "epoch": 0.17003846345971327, + "grad_norm": 0.5222576856613159, + "learning_rate": 9.495678896917009e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9054237008094788, + "num_tokens": 451814595.0, + "step": 1702 + }, + { + "epoch": 0.17013836854987763, + "grad_norm": 0.682066023349762, + "learning_rate": 9.494970565885998e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.9026994109153748, + "num_tokens": 452082333.0, + "step": 1703 + }, + { + "epoch": 0.17023827364004196, + "grad_norm": 0.5175467133522034, + "learning_rate": 9.494261764227172e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.9046328663825989, + "num_tokens": 452353189.0, + "step": 1704 + }, + { + "epoch": 0.1703381787302063, + "grad_norm": 0.5963051319122314, + "learning_rate": 9.493552492014743e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.9033039212226868, + "num_tokens": 452623998.0, + "step": 1705 + }, + { + "epoch": 0.17043808382037065, + "grad_norm": 0.7331396341323853, + "learning_rate": 9.492842749322972e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.9043768644332886, + "num_tokens": 452890672.0, + "step": 1706 + }, + { + "epoch": 0.17053798891053498, + "grad_norm": 0.5556407570838928, + "learning_rate": 9.492132536226168e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9037508964538574, + "num_tokens": 453158360.0, + "step": 1707 + }, + { + "epoch": 0.17063789400069934, + "grad_norm": 0.6313567161560059, + "learning_rate": 9.491421852798695e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9025244116783142, + "num_tokens": 453420473.0, + "step": 1708 + }, + { + "epoch": 0.17073779909086367, + "grad_norm": 0.5246389508247375, + "learning_rate": 9.49071069911496e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.903459221124649, + "num_tokens": 453683307.0, + "step": 1709 + }, + { + "epoch": 0.17083770418102803, + "grad_norm": 0.6346522569656372, + "learning_rate": 9.489999075249422e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.9049836099147797, + "num_tokens": 453954325.0, + "step": 1710 + }, + { + "epoch": 0.17093760927119236, + "grad_norm": 0.6031825542449951, + "learning_rate": 9.48928698127659e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.903427392244339, + "num_tokens": 454216059.0, + "step": 1711 + }, + { + "epoch": 0.17103751436135672, + "grad_norm": 0.580390453338623, + "learning_rate": 9.488574417271017e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.905235081911087, + "num_tokens": 454489880.0, + "step": 1712 + }, + { + "epoch": 0.17113741945152106, + "grad_norm": 0.7173709273338318, + "learning_rate": 9.487861383307312e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9055802524089813, + "num_tokens": 454757605.0, + "step": 1713 + }, + { + "epoch": 0.1712373245416854, + "grad_norm": 0.5317500233650208, + "learning_rate": 9.48714787946013e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9075916111469269, + "num_tokens": 455023958.0, + "step": 1714 + }, + { + "epoch": 0.17133722963184975, + "grad_norm": 0.5097466707229614, + "learning_rate": 9.486433905804176e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9076012670993805, + "num_tokens": 455281001.0, + "step": 1715 + }, + { + "epoch": 0.17143713472201408, + "grad_norm": 0.6453192830085754, + "learning_rate": 9.485719462414202e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.9054496884346008, + "num_tokens": 455542208.0, + "step": 1716 + }, + { + "epoch": 0.17153703981217844, + "grad_norm": 0.519590437412262, + "learning_rate": 9.485004549365013e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.9034575521945953, + "num_tokens": 455808060.0, + "step": 1717 + }, + { + "epoch": 0.17163694490234277, + "grad_norm": 1.2718651294708252, + "learning_rate": 9.484289166731461e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.9026791453361511, + "num_tokens": 456069577.0, + "step": 1718 + }, + { + "epoch": 0.17173684999250713, + "grad_norm": 0.5829778909683228, + "learning_rate": 9.483573314588446e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9061062037944794, + "num_tokens": 456327889.0, + "step": 1719 + }, + { + "epoch": 0.17183675508267146, + "grad_norm": 0.546359658241272, + "learning_rate": 9.482856993010919e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.9044662415981293, + "num_tokens": 456592362.0, + "step": 1720 + }, + { + "epoch": 0.1719366601728358, + "grad_norm": 0.7697861790657043, + "learning_rate": 9.48214020207388e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.8995614051818848, + "num_tokens": 456859283.0, + "step": 1721 + }, + { + "epoch": 0.17203656526300015, + "grad_norm": 0.6118437647819519, + "learning_rate": 9.481422941852376e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.9034380912780762, + "num_tokens": 457134280.0, + "step": 1722 + }, + { + "epoch": 0.17213647035316448, + "grad_norm": 0.8786156177520752, + "learning_rate": 9.480705212421505e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.9019449949264526, + "num_tokens": 457396831.0, + "step": 1723 + }, + { + "epoch": 0.17223637544332884, + "grad_norm": 0.5002433657646179, + "learning_rate": 9.479987013856417e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9042560160160065, + "num_tokens": 457659292.0, + "step": 1724 + }, + { + "epoch": 0.17233628053349317, + "grad_norm": 0.5057740807533264, + "learning_rate": 9.479268346232307e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.9039151072502136, + "num_tokens": 457923244.0, + "step": 1725 + }, + { + "epoch": 0.17243618562365753, + "grad_norm": 0.49227455258369446, + "learning_rate": 9.478549209624417e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.9051358699798584, + "num_tokens": 458188862.0, + "step": 1726 + }, + { + "epoch": 0.17253609071382187, + "grad_norm": 0.6189920902252197, + "learning_rate": 9.477829604108044e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9054086208343506, + "num_tokens": 458457035.0, + "step": 1727 + }, + { + "epoch": 0.17263599580398623, + "grad_norm": 0.6524850130081177, + "learning_rate": 9.477109529758533e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.9030501246452332, + "num_tokens": 458719046.0, + "step": 1728 + }, + { + "epoch": 0.17273590089415056, + "grad_norm": 0.939970850944519, + "learning_rate": 9.476388986651272e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.9049280285835266, + "num_tokens": 458986360.0, + "step": 1729 + }, + { + "epoch": 0.1728358059843149, + "grad_norm": 0.5737977623939514, + "learning_rate": 9.475667974861706e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.9027558863162994, + "num_tokens": 459252130.0, + "step": 1730 + }, + { + "epoch": 0.17293571107447925, + "grad_norm": 0.6568558812141418, + "learning_rate": 9.474946494465324e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.9029998183250427, + "num_tokens": 459511315.0, + "step": 1731 + }, + { + "epoch": 0.17303561616464358, + "grad_norm": 0.7441205382347107, + "learning_rate": 9.474224545537669e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.903083086013794, + "num_tokens": 459780276.0, + "step": 1732 + }, + { + "epoch": 0.17313552125480794, + "grad_norm": 0.6320019960403442, + "learning_rate": 9.473502128154324e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.9034376442432404, + "num_tokens": 460044398.0, + "step": 1733 + }, + { + "epoch": 0.17323542634497227, + "grad_norm": 0.7480111718177795, + "learning_rate": 9.472779242390932e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.9044786989688873, + "num_tokens": 460303401.0, + "step": 1734 + }, + { + "epoch": 0.17333533143513663, + "grad_norm": 0.49870938062667847, + "learning_rate": 9.472055888323177e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.9041434526443481, + "num_tokens": 460578882.0, + "step": 1735 + }, + { + "epoch": 0.17343523652530096, + "grad_norm": 0.6024656891822815, + "learning_rate": 9.471332066026795e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.9011503458023071, + "num_tokens": 460835628.0, + "step": 1736 + }, + { + "epoch": 0.17353514161546532, + "grad_norm": 0.7140057682991028, + "learning_rate": 9.470607775577574e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.9029259383678436, + "num_tokens": 461110718.0, + "step": 1737 + }, + { + "epoch": 0.17363504670562965, + "grad_norm": 0.5148012042045593, + "learning_rate": 9.469883017051345e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.904869794845581, + "num_tokens": 461383297.0, + "step": 1738 + }, + { + "epoch": 0.17373495179579398, + "grad_norm": 0.5433271527290344, + "learning_rate": 9.46915779052399e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.9072708487510681, + "num_tokens": 461646218.0, + "step": 1739 + }, + { + "epoch": 0.17383485688595834, + "grad_norm": 0.6706197261810303, + "learning_rate": 9.468432096071442e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.9022555649280548, + "num_tokens": 461916731.0, + "step": 1740 + }, + { + "epoch": 0.17393476197612268, + "grad_norm": 1.0411388874053955, + "learning_rate": 9.467705933769685e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.9052059054374695, + "num_tokens": 462188831.0, + "step": 1741 + }, + { + "epoch": 0.17403466706628704, + "grad_norm": 0.6398762464523315, + "learning_rate": 9.466979303694743e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.901685357093811, + "num_tokens": 462448670.0, + "step": 1742 + }, + { + "epoch": 0.17413457215645137, + "grad_norm": 0.4784257113933563, + "learning_rate": 9.4662522059227e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.9040701389312744, + "num_tokens": 462718457.0, + "step": 1743 + }, + { + "epoch": 0.17423447724661573, + "grad_norm": 0.6450126767158508, + "learning_rate": 9.465524640529681e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.9060291349887848, + "num_tokens": 462989245.0, + "step": 1744 + }, + { + "epoch": 0.17433438233678006, + "grad_norm": 0.8440563082695007, + "learning_rate": 9.464796607591865e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.9053738713264465, + "num_tokens": 463257344.0, + "step": 1745 + }, + { + "epoch": 0.1744342874269444, + "grad_norm": 0.8129132986068726, + "learning_rate": 9.464068107185476e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.9059290289878845, + "num_tokens": 463514260.0, + "step": 1746 + }, + { + "epoch": 0.17453419251710875, + "grad_norm": 0.7763200998306274, + "learning_rate": 9.463339139386788e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9042078256607056, + "num_tokens": 463792348.0, + "step": 1747 + }, + { + "epoch": 0.17463409760727308, + "grad_norm": 0.6096160411834717, + "learning_rate": 9.462609704272127e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9043519198894501, + "num_tokens": 464058985.0, + "step": 1748 + }, + { + "epoch": 0.17473400269743744, + "grad_norm": 0.7090389132499695, + "learning_rate": 9.461879801917864e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.9035322964191437, + "num_tokens": 464329137.0, + "step": 1749 + }, + { + "epoch": 0.17483390778760177, + "grad_norm": 2.718484878540039, + "learning_rate": 9.46114943240042e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.9022543132305145, + "num_tokens": 464595266.0, + "step": 1750 + }, + { + "epoch": 0.17493381287776613, + "grad_norm": 0.6396251916885376, + "learning_rate": 9.460418595796268e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.9014605581760406, + "num_tokens": 464869655.0, + "step": 1751 + }, + { + "epoch": 0.17503371796793046, + "grad_norm": 0.6035601496696472, + "learning_rate": 9.459687292181924e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.9040043354034424, + "num_tokens": 465141810.0, + "step": 1752 + }, + { + "epoch": 0.17513362305809482, + "grad_norm": 0.9304218888282776, + "learning_rate": 9.45895552163396e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9074162244796753, + "num_tokens": 465415493.0, + "step": 1753 + }, + { + "epoch": 0.17523352814825915, + "grad_norm": 0.5602255463600159, + "learning_rate": 9.45822328422899e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.9026830494403839, + "num_tokens": 465676736.0, + "step": 1754 + }, + { + "epoch": 0.1753334332384235, + "grad_norm": 0.6595318913459778, + "learning_rate": 9.45749058004368e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.905442625284195, + "num_tokens": 465940946.0, + "step": 1755 + }, + { + "epoch": 0.17543333832858785, + "grad_norm": 0.6172067523002625, + "learning_rate": 9.456757409154747e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.9023919701576233, + "num_tokens": 466198639.0, + "step": 1756 + }, + { + "epoch": 0.17553324341875218, + "grad_norm": 0.5898210406303406, + "learning_rate": 9.456023771638953e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.904460996389389, + "num_tokens": 466466045.0, + "step": 1757 + }, + { + "epoch": 0.17563314850891654, + "grad_norm": 0.5841765999794006, + "learning_rate": 9.45528966757311e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.9044845402240753, + "num_tokens": 466719855.0, + "step": 1758 + }, + { + "epoch": 0.17573305359908087, + "grad_norm": 0.8055115342140198, + "learning_rate": 9.454555097034081e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.9028651416301727, + "num_tokens": 466974534.0, + "step": 1759 + }, + { + "epoch": 0.17583295868924523, + "grad_norm": 0.6298359632492065, + "learning_rate": 9.453820060098777e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9022411406040192, + "num_tokens": 467234554.0, + "step": 1760 + }, + { + "epoch": 0.17593286377940956, + "grad_norm": 0.7779732346534729, + "learning_rate": 9.453084556844154e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.9064112901687622, + "num_tokens": 467500219.0, + "step": 1761 + }, + { + "epoch": 0.1760327688695739, + "grad_norm": 0.7688563466072083, + "learning_rate": 9.452348587347224e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.9033164381980896, + "num_tokens": 467763465.0, + "step": 1762 + }, + { + "epoch": 0.17613267395973825, + "grad_norm": 0.5880594253540039, + "learning_rate": 9.45161215168504e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.9050682187080383, + "num_tokens": 468032279.0, + "step": 1763 + }, + { + "epoch": 0.17623257904990258, + "grad_norm": 0.7406613230705261, + "learning_rate": 9.450875249934708e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9038872122764587, + "num_tokens": 468298476.0, + "step": 1764 + }, + { + "epoch": 0.17633248414006694, + "grad_norm": 0.6638922095298767, + "learning_rate": 9.450137882173385e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9037511050701141, + "num_tokens": 468560866.0, + "step": 1765 + }, + { + "epoch": 0.17643238923023127, + "grad_norm": 0.85225909948349, + "learning_rate": 9.44940004847827e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.9029715657234192, + "num_tokens": 468825006.0, + "step": 1766 + }, + { + "epoch": 0.17653229432039563, + "grad_norm": 0.7684223055839539, + "learning_rate": 9.44866174892662e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.9035623967647552, + "num_tokens": 469090312.0, + "step": 1767 + }, + { + "epoch": 0.17663219941055996, + "grad_norm": 0.5849438905715942, + "learning_rate": 9.44792298359573e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9039220213890076, + "num_tokens": 469345160.0, + "step": 1768 + }, + { + "epoch": 0.17673210450072432, + "grad_norm": 1.1023225784301758, + "learning_rate": 9.447183752562954e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.9024760127067566, + "num_tokens": 469607042.0, + "step": 1769 + }, + { + "epoch": 0.17683200959088866, + "grad_norm": 0.792046844959259, + "learning_rate": 9.446444055905691e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9055384397506714, + "num_tokens": 469875867.0, + "step": 1770 + }, + { + "epoch": 0.176931914681053, + "grad_norm": 0.6323318481445312, + "learning_rate": 9.445703893701383e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9002969264984131, + "num_tokens": 470138082.0, + "step": 1771 + }, + { + "epoch": 0.17703181977121735, + "grad_norm": 0.6325379014015198, + "learning_rate": 9.444963266027528e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.9002887606620789, + "num_tokens": 470405539.0, + "step": 1772 + }, + { + "epoch": 0.17713172486138168, + "grad_norm": 0.740299642086029, + "learning_rate": 9.444222172961672e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9063725471496582, + "num_tokens": 470674228.0, + "step": 1773 + }, + { + "epoch": 0.17723162995154604, + "grad_norm": 0.8266692757606506, + "learning_rate": 9.443480614581406e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.904046356678009, + "num_tokens": 470946215.0, + "step": 1774 + }, + { + "epoch": 0.17733153504171037, + "grad_norm": 0.7388534545898438, + "learning_rate": 9.442738590964373e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.9056152403354645, + "num_tokens": 471222589.0, + "step": 1775 + }, + { + "epoch": 0.17743144013187473, + "grad_norm": 0.6190120577812195, + "learning_rate": 9.441996102188265e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.903448611497879, + "num_tokens": 471481677.0, + "step": 1776 + }, + { + "epoch": 0.17753134522203906, + "grad_norm": 0.8591283559799194, + "learning_rate": 9.441253148330818e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.9018056392669678, + "num_tokens": 471743426.0, + "step": 1777 + }, + { + "epoch": 0.17763125031220342, + "grad_norm": 0.6166785955429077, + "learning_rate": 9.440509729469823e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.9040938913822174, + "num_tokens": 472006608.0, + "step": 1778 + }, + { + "epoch": 0.17773115540236775, + "grad_norm": 1.0211715698242188, + "learning_rate": 9.439765845683114e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9061250984668732, + "num_tokens": 472282927.0, + "step": 1779 + }, + { + "epoch": 0.17783106049253208, + "grad_norm": 0.6761077642440796, + "learning_rate": 9.439021497048577e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.9000135958194733, + "num_tokens": 472559662.0, + "step": 1780 + }, + { + "epoch": 0.17793096558269644, + "grad_norm": 0.6726570725440979, + "learning_rate": 9.43827668364415e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.90571328997612, + "num_tokens": 472820301.0, + "step": 1781 + }, + { + "epoch": 0.17803087067286077, + "grad_norm": 0.9986647963523865, + "learning_rate": 9.43753140554781e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.9018749594688416, + "num_tokens": 473088501.0, + "step": 1782 + }, + { + "epoch": 0.17813077576302513, + "grad_norm": 0.535144031047821, + "learning_rate": 9.436785662837591e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.9046727120876312, + "num_tokens": 473352719.0, + "step": 1783 + }, + { + "epoch": 0.17823068085318947, + "grad_norm": 0.8333587646484375, + "learning_rate": 9.436039455591574e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.9043475389480591, + "num_tokens": 473622086.0, + "step": 1784 + }, + { + "epoch": 0.17833058594335383, + "grad_norm": 0.6313154697418213, + "learning_rate": 9.435292783887885e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.9034178853034973, + "num_tokens": 473886489.0, + "step": 1785 + }, + { + "epoch": 0.17843049103351816, + "grad_norm": 0.5844412446022034, + "learning_rate": 9.434545647804703e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.903112143278122, + "num_tokens": 474146305.0, + "step": 1786 + }, + { + "epoch": 0.1785303961236825, + "grad_norm": 0.6520765423774719, + "learning_rate": 9.433798047420256e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.905561238527298, + "num_tokens": 474408584.0, + "step": 1787 + }, + { + "epoch": 0.17863030121384685, + "grad_norm": 0.9320226311683655, + "learning_rate": 9.433049982812813e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.9052580893039703, + "num_tokens": 474674913.0, + "step": 1788 + }, + { + "epoch": 0.17873020630401118, + "grad_norm": 1.4006402492523193, + "learning_rate": 9.432301454060702e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.9037557542324066, + "num_tokens": 474940482.0, + "step": 1789 + }, + { + "epoch": 0.17883011139417554, + "grad_norm": 0.6519148945808411, + "learning_rate": 9.431552461242291e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.9002265930175781, + "num_tokens": 475201831.0, + "step": 1790 + }, + { + "epoch": 0.17893001648433987, + "grad_norm": 0.7402968406677246, + "learning_rate": 9.430803004436004e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9028676748275757, + "num_tokens": 475472721.0, + "step": 1791 + }, + { + "epoch": 0.17902992157450423, + "grad_norm": 0.6305089592933655, + "learning_rate": 9.430053083720307e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.9068587124347687, + "num_tokens": 475743780.0, + "step": 1792 + }, + { + "epoch": 0.17912982666466856, + "grad_norm": 0.7213061451911926, + "learning_rate": 9.429302699173719e-06, + "loss": 0.506, + "mean_token_accuracy": 0.9051795601844788, + "num_tokens": 476009069.0, + "step": 1793 + }, + { + "epoch": 0.17922973175483292, + "grad_norm": 0.8056860566139221, + "learning_rate": 9.428551850874805e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9018847644329071, + "num_tokens": 476283112.0, + "step": 1794 + }, + { + "epoch": 0.17932963684499725, + "grad_norm": 0.7794078588485718, + "learning_rate": 9.42780053890218e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9041489958763123, + "num_tokens": 476555333.0, + "step": 1795 + }, + { + "epoch": 0.17942954193516159, + "grad_norm": 0.6161383986473083, + "learning_rate": 9.427048763334507e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.9035521447658539, + "num_tokens": 476820960.0, + "step": 1796 + }, + { + "epoch": 0.17952944702532594, + "grad_norm": 1.2568327188491821, + "learning_rate": 9.426296524250498e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.904815286397934, + "num_tokens": 477085441.0, + "step": 1797 + }, + { + "epoch": 0.17962935211549028, + "grad_norm": 0.7765632271766663, + "learning_rate": 9.425543821728913e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9031216204166412, + "num_tokens": 477355534.0, + "step": 1798 + }, + { + "epoch": 0.17972925720565464, + "grad_norm": 1.2424813508987427, + "learning_rate": 9.42479065584856e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.901507169008255, + "num_tokens": 477620679.0, + "step": 1799 + }, + { + "epoch": 0.17982916229581897, + "grad_norm": 0.6939692497253418, + "learning_rate": 9.424037026688298e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.9049741923809052, + "num_tokens": 477883501.0, + "step": 1800 + }, + { + "epoch": 0.17992906738598333, + "grad_norm": 0.6300768852233887, + "learning_rate": 9.42328293432703e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.9037237167358398, + "num_tokens": 478154685.0, + "step": 1801 + }, + { + "epoch": 0.18002897247614766, + "grad_norm": 0.7714188694953918, + "learning_rate": 9.422528378843714e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9041953682899475, + "num_tokens": 478414056.0, + "step": 1802 + }, + { + "epoch": 0.180128877566312, + "grad_norm": 1.697234034538269, + "learning_rate": 9.421773360317348e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.9036962687969208, + "num_tokens": 478678096.0, + "step": 1803 + }, + { + "epoch": 0.18022878265647635, + "grad_norm": 0.8642975091934204, + "learning_rate": 9.421017878826986e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.9019030630588531, + "num_tokens": 478940888.0, + "step": 1804 + }, + { + "epoch": 0.18032868774664068, + "grad_norm": 0.9598732590675354, + "learning_rate": 9.420261934451728e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.9056201875209808, + "num_tokens": 479211002.0, + "step": 1805 + }, + { + "epoch": 0.18042859283680504, + "grad_norm": 0.7732366919517517, + "learning_rate": 9.41950552727072e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.9009326696395874, + "num_tokens": 479480258.0, + "step": 1806 + }, + { + "epoch": 0.18052849792696937, + "grad_norm": 0.6688029766082764, + "learning_rate": 9.418748657363161e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.9060971736907959, + "num_tokens": 479745351.0, + "step": 1807 + }, + { + "epoch": 0.18062840301713373, + "grad_norm": 0.8235188722610474, + "learning_rate": 9.417991324808296e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9015281796455383, + "num_tokens": 480002313.0, + "step": 1808 + }, + { + "epoch": 0.18072830810729806, + "grad_norm": 1.649667739868164, + "learning_rate": 9.417233529685417e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.9001340270042419, + "num_tokens": 480267413.0, + "step": 1809 + }, + { + "epoch": 0.18082821319746242, + "grad_norm": 0.8142596483230591, + "learning_rate": 9.416475272073864e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.9027431309223175, + "num_tokens": 480529129.0, + "step": 1810 + }, + { + "epoch": 0.18092811828762675, + "grad_norm": 0.5662294030189514, + "learning_rate": 9.415716552053031e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.9041255712509155, + "num_tokens": 480795273.0, + "step": 1811 + }, + { + "epoch": 0.1810280233777911, + "grad_norm": 0.7401605248451233, + "learning_rate": 9.414957369702356e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.9032481610774994, + "num_tokens": 481058896.0, + "step": 1812 + }, + { + "epoch": 0.18112792846795545, + "grad_norm": 0.7736427187919617, + "learning_rate": 9.414197725101327e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.9032937586307526, + "num_tokens": 481321718.0, + "step": 1813 + }, + { + "epoch": 0.18122783355811978, + "grad_norm": 0.6320231556892395, + "learning_rate": 9.413437618329476e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.9041429460048676, + "num_tokens": 481584832.0, + "step": 1814 + }, + { + "epoch": 0.18132773864828414, + "grad_norm": 0.614247739315033, + "learning_rate": 9.412677049466388e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9084599316120148, + "num_tokens": 481847807.0, + "step": 1815 + }, + { + "epoch": 0.18142764373844847, + "grad_norm": 1.268647313117981, + "learning_rate": 9.411916018591696e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9035007953643799, + "num_tokens": 482118011.0, + "step": 1816 + }, + { + "epoch": 0.18152754882861283, + "grad_norm": 0.6791312098503113, + "learning_rate": 9.411154525785082e-06, + "loss": 0.504, + "mean_token_accuracy": 0.9019083976745605, + "num_tokens": 482384450.0, + "step": 1817 + }, + { + "epoch": 0.18162745391877716, + "grad_norm": 0.7571622133255005, + "learning_rate": 9.410392571126275e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.9027359485626221, + "num_tokens": 482657601.0, + "step": 1818 + }, + { + "epoch": 0.18172735900894152, + "grad_norm": 0.6714386940002441, + "learning_rate": 9.40963015469505e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.9053019881248474, + "num_tokens": 482914756.0, + "step": 1819 + }, + { + "epoch": 0.18182726409910585, + "grad_norm": 0.7131423950195312, + "learning_rate": 9.408867276571235e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9027461409568787, + "num_tokens": 483178361.0, + "step": 1820 + }, + { + "epoch": 0.18192716918927018, + "grad_norm": 0.7346880435943604, + "learning_rate": 9.408103936834703e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.9034725427627563, + "num_tokens": 483430448.0, + "step": 1821 + }, + { + "epoch": 0.18202707427943454, + "grad_norm": 1.4119391441345215, + "learning_rate": 9.407340135565375e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.9023167788982391, + "num_tokens": 483691784.0, + "step": 1822 + }, + { + "epoch": 0.18212697936959887, + "grad_norm": 0.656622588634491, + "learning_rate": 9.406575872843224e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.9049022793769836, + "num_tokens": 483959538.0, + "step": 1823 + }, + { + "epoch": 0.18222688445976323, + "grad_norm": 0.7793489694595337, + "learning_rate": 9.40581114874827e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9027813374996185, + "num_tokens": 484221533.0, + "step": 1824 + }, + { + "epoch": 0.18232678954992756, + "grad_norm": 1.2440807819366455, + "learning_rate": 9.405045963360577e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.9029967486858368, + "num_tokens": 484484419.0, + "step": 1825 + }, + { + "epoch": 0.18242669464009192, + "grad_norm": 1.0954803228378296, + "learning_rate": 9.404280316760264e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.9036293029785156, + "num_tokens": 484750002.0, + "step": 1826 + }, + { + "epoch": 0.18252659973025626, + "grad_norm": 1.2360029220581055, + "learning_rate": 9.403514209027491e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.9007436335086823, + "num_tokens": 485020683.0, + "step": 1827 + }, + { + "epoch": 0.1826265048204206, + "grad_norm": 0.8126938939094543, + "learning_rate": 9.402747640242475e-06, + "loss": 0.506, + "mean_token_accuracy": 0.9041915833950043, + "num_tokens": 485294433.0, + "step": 1828 + }, + { + "epoch": 0.18272640991058495, + "grad_norm": 0.6576655507087708, + "learning_rate": 9.401980610485472e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.9046714305877686, + "num_tokens": 485564291.0, + "step": 1829 + }, + { + "epoch": 0.18282631500074928, + "grad_norm": 1.0011577606201172, + "learning_rate": 9.401213119836795e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.903494656085968, + "num_tokens": 485832639.0, + "step": 1830 + }, + { + "epoch": 0.18292622009091364, + "grad_norm": 1.2083721160888672, + "learning_rate": 9.400445168376798e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.9038940072059631, + "num_tokens": 486099550.0, + "step": 1831 + }, + { + "epoch": 0.18302612518107797, + "grad_norm": 0.9350221753120422, + "learning_rate": 9.399676756185887e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.9037570059299469, + "num_tokens": 486361245.0, + "step": 1832 + }, + { + "epoch": 0.18312603027124233, + "grad_norm": 0.6759036779403687, + "learning_rate": 9.398907883344514e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.9029037654399872, + "num_tokens": 486626280.0, + "step": 1833 + }, + { + "epoch": 0.18322593536140666, + "grad_norm": 1.1211169958114624, + "learning_rate": 9.398138549933184e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.9016587734222412, + "num_tokens": 486895306.0, + "step": 1834 + }, + { + "epoch": 0.18332584045157102, + "grad_norm": 0.7002373337745667, + "learning_rate": 9.397368756032445e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.9028621017932892, + "num_tokens": 487164217.0, + "step": 1835 + }, + { + "epoch": 0.18342574554173535, + "grad_norm": 0.8139476180076599, + "learning_rate": 9.396598501722897e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.9057653248310089, + "num_tokens": 487426508.0, + "step": 1836 + }, + { + "epoch": 0.18352565063189968, + "grad_norm": 0.6332756280899048, + "learning_rate": 9.395827787085183e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.906907320022583, + "num_tokens": 487684646.0, + "step": 1837 + }, + { + "epoch": 0.18362555572206404, + "grad_norm": 0.8495187163352966, + "learning_rate": 9.3950566122e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.9063364863395691, + "num_tokens": 487946872.0, + "step": 1838 + }, + { + "epoch": 0.18372546081222837, + "grad_norm": 0.6511660218238831, + "learning_rate": 9.394284977148091e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.9035096764564514, + "num_tokens": 488218830.0, + "step": 1839 + }, + { + "epoch": 0.18382536590239273, + "grad_norm": 0.6510178446769714, + "learning_rate": 9.393512882010246e-06, + "loss": 0.508, + "mean_token_accuracy": 0.9038873016834259, + "num_tokens": 488493978.0, + "step": 1840 + }, + { + "epoch": 0.18392527099255707, + "grad_norm": 1.13258957862854, + "learning_rate": 9.392740326867304e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.9035498797893524, + "num_tokens": 488761461.0, + "step": 1841 + }, + { + "epoch": 0.18402517608272143, + "grad_norm": 0.7331169247627258, + "learning_rate": 9.391967311800154e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8996517360210419, + "num_tokens": 489021337.0, + "step": 1842 + }, + { + "epoch": 0.18412508117288576, + "grad_norm": 0.5459000468254089, + "learning_rate": 9.391193836889728e-06, + "loss": 0.506, + "mean_token_accuracy": 0.9036667346954346, + "num_tokens": 489292136.0, + "step": 1843 + }, + { + "epoch": 0.1842249862630501, + "grad_norm": 0.9680888652801514, + "learning_rate": 9.390419902217011e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.9047720730304718, + "num_tokens": 489560504.0, + "step": 1844 + }, + { + "epoch": 0.18432489135321445, + "grad_norm": 0.5151711106300354, + "learning_rate": 9.389645507863036e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.9045321643352509, + "num_tokens": 489832766.0, + "step": 1845 + }, + { + "epoch": 0.18442479644337878, + "grad_norm": 0.8158555626869202, + "learning_rate": 9.388870653908883e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.9060832262039185, + "num_tokens": 490094755.0, + "step": 1846 + }, + { + "epoch": 0.18452470153354314, + "grad_norm": 0.64750075340271, + "learning_rate": 9.38809534043568e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.9017360508441925, + "num_tokens": 490363768.0, + "step": 1847 + }, + { + "epoch": 0.18462460662370747, + "grad_norm": 0.5445069074630737, + "learning_rate": 9.387319567524602e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9039370119571686, + "num_tokens": 490635282.0, + "step": 1848 + }, + { + "epoch": 0.18472451171387183, + "grad_norm": 0.63569575548172, + "learning_rate": 9.38654333525687e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.905972421169281, + "num_tokens": 490902894.0, + "step": 1849 + }, + { + "epoch": 0.18482441680403616, + "grad_norm": 1.5490987300872803, + "learning_rate": 9.385766643713764e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.9029092192649841, + "num_tokens": 491168315.0, + "step": 1850 + }, + { + "epoch": 0.18492432189420052, + "grad_norm": 0.6718323230743408, + "learning_rate": 9.384989492976598e-06, + "loss": 0.502, + "mean_token_accuracy": 0.9038454592227936, + "num_tokens": 491431878.0, + "step": 1851 + }, + { + "epoch": 0.18502422698436485, + "grad_norm": 1.0541181564331055, + "learning_rate": 9.384211883126741e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.903600811958313, + "num_tokens": 491694964.0, + "step": 1852 + }, + { + "epoch": 0.18512413207452919, + "grad_norm": 0.6818791031837463, + "learning_rate": 9.383433814245612e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.9068031013011932, + "num_tokens": 491958559.0, + "step": 1853 + }, + { + "epoch": 0.18522403716469354, + "grad_norm": 0.6047778129577637, + "learning_rate": 9.382655286414677e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.9020286202430725, + "num_tokens": 492234757.0, + "step": 1854 + }, + { + "epoch": 0.18532394225485788, + "grad_norm": 0.5983203649520874, + "learning_rate": 9.381876299715444e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.9018416106700897, + "num_tokens": 492488594.0, + "step": 1855 + }, + { + "epoch": 0.18542384734502224, + "grad_norm": 0.5271109938621521, + "learning_rate": 9.381096854229476e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.9035864174365997, + "num_tokens": 492757465.0, + "step": 1856 + }, + { + "epoch": 0.18552375243518657, + "grad_norm": 0.6384602189064026, + "learning_rate": 9.380316950038382e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.9046818912029266, + "num_tokens": 493023243.0, + "step": 1857 + }, + { + "epoch": 0.18562365752535093, + "grad_norm": 0.49045616388320923, + "learning_rate": 9.379536587223818e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.9033565521240234, + "num_tokens": 493289124.0, + "step": 1858 + }, + { + "epoch": 0.18572356261551526, + "grad_norm": 0.4995969235897064, + "learning_rate": 9.378755765867488e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.9048930406570435, + "num_tokens": 493549955.0, + "step": 1859 + }, + { + "epoch": 0.18582346770567962, + "grad_norm": 0.6576818823814392, + "learning_rate": 9.377974486051149e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9077730178833008, + "num_tokens": 493809888.0, + "step": 1860 + }, + { + "epoch": 0.18592337279584395, + "grad_norm": 0.6557890772819519, + "learning_rate": 9.377192747856596e-06, + "loss": 0.505, + "mean_token_accuracy": 0.9025789797306061, + "num_tokens": 494091496.0, + "step": 1861 + }, + { + "epoch": 0.18602327788600828, + "grad_norm": 0.6161521673202515, + "learning_rate": 9.37641055136568e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.9038001596927643, + "num_tokens": 494349652.0, + "step": 1862 + }, + { + "epoch": 0.18612318297617264, + "grad_norm": 0.6039844155311584, + "learning_rate": 9.375627896660299e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.9078842401504517, + "num_tokens": 494616158.0, + "step": 1863 + }, + { + "epoch": 0.18622308806633697, + "grad_norm": 0.7750139832496643, + "learning_rate": 9.374844783822396e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9056781232357025, + "num_tokens": 494884092.0, + "step": 1864 + }, + { + "epoch": 0.18632299315650133, + "grad_norm": 0.6075642704963684, + "learning_rate": 9.374061212933965e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.903397262096405, + "num_tokens": 495148979.0, + "step": 1865 + }, + { + "epoch": 0.18642289824666566, + "grad_norm": 0.507577121257782, + "learning_rate": 9.373277184077047e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.9078173935413361, + "num_tokens": 495423992.0, + "step": 1866 + }, + { + "epoch": 0.18652280333683002, + "grad_norm": 0.885535478591919, + "learning_rate": 9.372492697333728e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.9047930240631104, + "num_tokens": 495695637.0, + "step": 1867 + }, + { + "epoch": 0.18662270842699435, + "grad_norm": 0.5577073693275452, + "learning_rate": 9.371707752786147e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.902817040681839, + "num_tokens": 495954980.0, + "step": 1868 + }, + { + "epoch": 0.1867226135171587, + "grad_norm": 0.4819889962673187, + "learning_rate": 9.370922350516486e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.9046104848384857, + "num_tokens": 496226123.0, + "step": 1869 + }, + { + "epoch": 0.18682251860732305, + "grad_norm": 0.5280167460441589, + "learning_rate": 9.370136490606982e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.9038742184638977, + "num_tokens": 496495409.0, + "step": 1870 + }, + { + "epoch": 0.18692242369748738, + "grad_norm": 0.6025378704071045, + "learning_rate": 9.369350173139911e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.9062499701976776, + "num_tokens": 496749894.0, + "step": 1871 + }, + { + "epoch": 0.18702232878765174, + "grad_norm": 0.5872483849525452, + "learning_rate": 9.368563398197603e-06, + "loss": 0.498, + "mean_token_accuracy": 0.9058083593845367, + "num_tokens": 497021810.0, + "step": 1872 + }, + { + "epoch": 0.18712223387781607, + "grad_norm": 0.46633973717689514, + "learning_rate": 9.367776165862434e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.9053482115268707, + "num_tokens": 497283548.0, + "step": 1873 + }, + { + "epoch": 0.18722213896798043, + "grad_norm": 0.5482932925224304, + "learning_rate": 9.366988476216826e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9040248692035675, + "num_tokens": 497541826.0, + "step": 1874 + }, + { + "epoch": 0.18732204405814476, + "grad_norm": 0.6083416938781738, + "learning_rate": 9.366200329343254e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.9045588672161102, + "num_tokens": 497809767.0, + "step": 1875 + }, + { + "epoch": 0.18742194914830912, + "grad_norm": 0.6521367430686951, + "learning_rate": 9.365411725324237e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.9031912088394165, + "num_tokens": 498085260.0, + "step": 1876 + }, + { + "epoch": 0.18752185423847345, + "grad_norm": 0.5763635039329529, + "learning_rate": 9.36462266424234e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.9024290144443512, + "num_tokens": 498350729.0, + "step": 1877 + }, + { + "epoch": 0.18762175932863778, + "grad_norm": 0.6006721258163452, + "learning_rate": 9.36383314618018e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.903380274772644, + "num_tokens": 498615935.0, + "step": 1878 + }, + { + "epoch": 0.18772166441880214, + "grad_norm": 0.5953575968742371, + "learning_rate": 9.363043171220423e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.9043267965316772, + "num_tokens": 498880291.0, + "step": 1879 + }, + { + "epoch": 0.18782156950896647, + "grad_norm": 0.7807033658027649, + "learning_rate": 9.362252739445776e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.9063130617141724, + "num_tokens": 499143107.0, + "step": 1880 + }, + { + "epoch": 0.18792147459913083, + "grad_norm": 0.5874289274215698, + "learning_rate": 9.361461850938999e-06, + "loss": 0.509, + "mean_token_accuracy": 0.904294341802597, + "num_tokens": 499410960.0, + "step": 1881 + }, + { + "epoch": 0.18802137968929516, + "grad_norm": 0.6057317852973938, + "learning_rate": 9.360670505782903e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9064527153968811, + "num_tokens": 499683804.0, + "step": 1882 + }, + { + "epoch": 0.18812128477945952, + "grad_norm": 0.7865630388259888, + "learning_rate": 9.359878704060336e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.9014391005039215, + "num_tokens": 499947964.0, + "step": 1883 + }, + { + "epoch": 0.18822118986962386, + "grad_norm": 0.6600625514984131, + "learning_rate": 9.359086445854206e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9036059975624084, + "num_tokens": 500213863.0, + "step": 1884 + }, + { + "epoch": 0.1883210949597882, + "grad_norm": 0.5751078128814697, + "learning_rate": 9.358293731247459e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.9035585820674896, + "num_tokens": 500482152.0, + "step": 1885 + }, + { + "epoch": 0.18842100004995255, + "grad_norm": 0.5663090348243713, + "learning_rate": 9.357500560323096e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9066228270530701, + "num_tokens": 500749182.0, + "step": 1886 + }, + { + "epoch": 0.18852090514011688, + "grad_norm": 0.7552834749221802, + "learning_rate": 9.356706933164161e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.9020602405071259, + "num_tokens": 500998122.0, + "step": 1887 + }, + { + "epoch": 0.18862081023028124, + "grad_norm": 0.641635537147522, + "learning_rate": 9.355912849853747e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.9047819375991821, + "num_tokens": 501259464.0, + "step": 1888 + }, + { + "epoch": 0.18872071532044557, + "grad_norm": 0.45978665351867676, + "learning_rate": 9.355118310475e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.9037047922611237, + "num_tokens": 501526036.0, + "step": 1889 + }, + { + "epoch": 0.18882062041060993, + "grad_norm": 0.5384812951087952, + "learning_rate": 9.354323315111102e-06, + "loss": 0.507, + "mean_token_accuracy": 0.9032436609268188, + "num_tokens": 501794513.0, + "step": 1890 + }, + { + "epoch": 0.18892052550077426, + "grad_norm": 0.5480051636695862, + "learning_rate": 9.353527863845296e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.903631180524826, + "num_tokens": 502062781.0, + "step": 1891 + }, + { + "epoch": 0.18902043059093862, + "grad_norm": 0.5439882874488831, + "learning_rate": 9.35273195676086e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.901974618434906, + "num_tokens": 502331635.0, + "step": 1892 + }, + { + "epoch": 0.18912033568110295, + "grad_norm": 0.46865805983543396, + "learning_rate": 9.351935593941134e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.9044225215911865, + "num_tokens": 502606629.0, + "step": 1893 + }, + { + "epoch": 0.18922024077126728, + "grad_norm": 0.6341946721076965, + "learning_rate": 9.351138775469493e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.9023724496364594, + "num_tokens": 502882338.0, + "step": 1894 + }, + { + "epoch": 0.18932014586143164, + "grad_norm": 0.4879814088344574, + "learning_rate": 9.350341501429366e-06, + "loss": 0.501, + "mean_token_accuracy": 0.90638667345047, + "num_tokens": 503146875.0, + "step": 1895 + }, + { + "epoch": 0.18942005095159598, + "grad_norm": 0.49912044405937195, + "learning_rate": 9.349543771904225e-06, + "loss": 0.503, + "mean_token_accuracy": 0.9039722383022308, + "num_tokens": 503415858.0, + "step": 1896 + }, + { + "epoch": 0.18951995604176033, + "grad_norm": 0.4864227771759033, + "learning_rate": 9.348745586977599e-06, + "loss": 0.507, + "mean_token_accuracy": 0.9033427834510803, + "num_tokens": 503680554.0, + "step": 1897 + }, + { + "epoch": 0.18961986113192467, + "grad_norm": 0.6056549549102783, + "learning_rate": 9.347946946733055e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9070384204387665, + "num_tokens": 503947650.0, + "step": 1898 + }, + { + "epoch": 0.18971976622208903, + "grad_norm": 0.5170502066612244, + "learning_rate": 9.347147851254213e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.9052523076534271, + "num_tokens": 504212869.0, + "step": 1899 + }, + { + "epoch": 0.18981967131225336, + "grad_norm": 0.5494750142097473, + "learning_rate": 9.34634830062474e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.9029579162597656, + "num_tokens": 504469727.0, + "step": 1900 + }, + { + "epoch": 0.18991957640241772, + "grad_norm": 0.5157909989356995, + "learning_rate": 9.345548294928344e-06, + "loss": 0.511, + "mean_token_accuracy": 0.9067108035087585, + "num_tokens": 504734198.0, + "step": 1901 + }, + { + "epoch": 0.19001948149258205, + "grad_norm": 0.514491081237793, + "learning_rate": 9.344747834248793e-06, + "loss": 0.499, + "mean_token_accuracy": 0.9058763980865479, + "num_tokens": 504999408.0, + "step": 1902 + }, + { + "epoch": 0.19011938658274638, + "grad_norm": 0.47165170311927795, + "learning_rate": 9.343946918669893e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.904710441827774, + "num_tokens": 505275772.0, + "step": 1903 + }, + { + "epoch": 0.19021929167291074, + "grad_norm": 0.48760947585105896, + "learning_rate": 9.343145548275503e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.9045685827732086, + "num_tokens": 505539906.0, + "step": 1904 + }, + { + "epoch": 0.19031919676307507, + "grad_norm": 0.530010998249054, + "learning_rate": 9.342343723149523e-06, + "loss": 0.506, + "mean_token_accuracy": 0.9042829871177673, + "num_tokens": 505803226.0, + "step": 1905 + }, + { + "epoch": 0.19041910185323943, + "grad_norm": 0.5183342695236206, + "learning_rate": 9.341541443375907e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.902922660112381, + "num_tokens": 506062306.0, + "step": 1906 + }, + { + "epoch": 0.19051900694340376, + "grad_norm": 0.6388257145881653, + "learning_rate": 9.340738709038657e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.9035120606422424, + "num_tokens": 506327171.0, + "step": 1907 + }, + { + "epoch": 0.19061891203356812, + "grad_norm": 0.5807130336761475, + "learning_rate": 9.339935520221816e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.9032579362392426, + "num_tokens": 506594762.0, + "step": 1908 + }, + { + "epoch": 0.19071881712373245, + "grad_norm": 0.6342124342918396, + "learning_rate": 9.339131877009482e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.90461665391922, + "num_tokens": 506864685.0, + "step": 1909 + }, + { + "epoch": 0.19081872221389679, + "grad_norm": 0.5490785241127014, + "learning_rate": 9.338327779485794e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.905794620513916, + "num_tokens": 507137807.0, + "step": 1910 + }, + { + "epoch": 0.19091862730406114, + "grad_norm": 0.5354376435279846, + "learning_rate": 9.337523227734945e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.9041210114955902, + "num_tokens": 507398449.0, + "step": 1911 + }, + { + "epoch": 0.19101853239422548, + "grad_norm": 1.377945899963379, + "learning_rate": 9.33671822184117e-06, + "loss": 0.5, + "mean_token_accuracy": 0.9045880436897278, + "num_tokens": 507668103.0, + "step": 1912 + }, + { + "epoch": 0.19111843748438984, + "grad_norm": 0.573306679725647, + "learning_rate": 9.335912761888754e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.901680201292038, + "num_tokens": 507926068.0, + "step": 1913 + }, + { + "epoch": 0.19121834257455417, + "grad_norm": 0.5867418646812439, + "learning_rate": 9.335106847962032e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.9050920307636261, + "num_tokens": 508190651.0, + "step": 1914 + }, + { + "epoch": 0.19131824766471853, + "grad_norm": 0.6609401702880859, + "learning_rate": 9.334300480145381e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.9047488570213318, + "num_tokens": 508452842.0, + "step": 1915 + }, + { + "epoch": 0.19141815275488286, + "grad_norm": 0.6523907780647278, + "learning_rate": 9.333493658523231e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.9054960012435913, + "num_tokens": 508713924.0, + "step": 1916 + }, + { + "epoch": 0.19151805784504722, + "grad_norm": 0.6786147952079773, + "learning_rate": 9.332686383180055e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.9024700820446014, + "num_tokens": 508972075.0, + "step": 1917 + }, + { + "epoch": 0.19161796293521155, + "grad_norm": 0.6314734816551208, + "learning_rate": 9.331878654200377e-06, + "loss": 0.512, + "mean_token_accuracy": 0.9033898711204529, + "num_tokens": 509243258.0, + "step": 1918 + }, + { + "epoch": 0.19171786802537588, + "grad_norm": 0.6604753732681274, + "learning_rate": 9.331070471668764e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.9020237624645233, + "num_tokens": 509519827.0, + "step": 1919 + }, + { + "epoch": 0.19181777311554024, + "grad_norm": 0.5614109635353088, + "learning_rate": 9.330261835669839e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.903258204460144, + "num_tokens": 509781991.0, + "step": 1920 + }, + { + "epoch": 0.19191767820570457, + "grad_norm": 0.565757155418396, + "learning_rate": 9.329452746288261e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.9041323065757751, + "num_tokens": 510043538.0, + "step": 1921 + }, + { + "epoch": 0.19201758329586893, + "grad_norm": 0.4895612895488739, + "learning_rate": 9.328643203608747e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.905462384223938, + "num_tokens": 510307194.0, + "step": 1922 + }, + { + "epoch": 0.19211748838603326, + "grad_norm": 0.5517895221710205, + "learning_rate": 9.327833207716053e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.9036737978458405, + "num_tokens": 510585245.0, + "step": 1923 + }, + { + "epoch": 0.19221739347619762, + "grad_norm": 0.748272716999054, + "learning_rate": 9.327022758694991e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.9059540927410126, + "num_tokens": 510858167.0, + "step": 1924 + }, + { + "epoch": 0.19231729856636195, + "grad_norm": 1.7596632242202759, + "learning_rate": 9.32621185663041e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.9057161211967468, + "num_tokens": 511117792.0, + "step": 1925 + }, + { + "epoch": 0.1924172036565263, + "grad_norm": 0.7338852286338806, + "learning_rate": 9.325400501607218e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.9017881453037262, + "num_tokens": 511378033.0, + "step": 1926 + }, + { + "epoch": 0.19251710874669065, + "grad_norm": 0.5887309908866882, + "learning_rate": 9.32458869371036e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.9041813313961029, + "num_tokens": 511643749.0, + "step": 1927 + }, + { + "epoch": 0.19261701383685498, + "grad_norm": 0.5683996677398682, + "learning_rate": 9.323776433024838e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.9027931392192841, + "num_tokens": 511906591.0, + "step": 1928 + }, + { + "epoch": 0.19271691892701934, + "grad_norm": 0.6401326656341553, + "learning_rate": 9.322963719635693e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.9064330160617828, + "num_tokens": 512169689.0, + "step": 1929 + }, + { + "epoch": 0.19281682401718367, + "grad_norm": 1.3686660528182983, + "learning_rate": 9.322150553628017e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.9063427448272705, + "num_tokens": 512427916.0, + "step": 1930 + }, + { + "epoch": 0.19291672910734803, + "grad_norm": 0.6854103207588196, + "learning_rate": 9.32133693508695e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.9037925899028778, + "num_tokens": 512690982.0, + "step": 1931 + }, + { + "epoch": 0.19301663419751236, + "grad_norm": 0.6006134748458862, + "learning_rate": 9.320522864097678e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.9054137766361237, + "num_tokens": 512959150.0, + "step": 1932 + }, + { + "epoch": 0.19311653928767672, + "grad_norm": 0.5619436502456665, + "learning_rate": 9.319708340745437e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.9046031534671783, + "num_tokens": 513222120.0, + "step": 1933 + }, + { + "epoch": 0.19321644437784105, + "grad_norm": 1.1936899423599243, + "learning_rate": 9.318893365115506e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.901681125164032, + "num_tokens": 513485776.0, + "step": 1934 + }, + { + "epoch": 0.19331634946800538, + "grad_norm": 0.6519607305526733, + "learning_rate": 9.318077937293215e-06, + "loss": 0.513, + "mean_token_accuracy": 0.9015476703643799, + "num_tokens": 513752296.0, + "step": 1935 + }, + { + "epoch": 0.19341625455816974, + "grad_norm": 0.5780264735221863, + "learning_rate": 9.31726205736394e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9030801057815552, + "num_tokens": 514015138.0, + "step": 1936 + }, + { + "epoch": 0.19351615964833407, + "grad_norm": 0.4976091980934143, + "learning_rate": 9.316445725413103e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.9051942527294159, + "num_tokens": 514282942.0, + "step": 1937 + }, + { + "epoch": 0.19361606473849843, + "grad_norm": 0.6057147979736328, + "learning_rate": 9.315628941526179e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.9047718048095703, + "num_tokens": 514549322.0, + "step": 1938 + }, + { + "epoch": 0.19371596982866277, + "grad_norm": 0.7381507158279419, + "learning_rate": 9.31481170578868e-06, + "loss": 0.501, + "mean_token_accuracy": 0.9049597084522247, + "num_tokens": 514817530.0, + "step": 1939 + }, + { + "epoch": 0.19381587491882712, + "grad_norm": 0.5402387976646423, + "learning_rate": 9.313994018286175e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.9047084748744965, + "num_tokens": 515087645.0, + "step": 1940 + }, + { + "epoch": 0.19391578000899146, + "grad_norm": 0.6947450637817383, + "learning_rate": 9.313175879104277e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.9013444185256958, + "num_tokens": 515361981.0, + "step": 1941 + }, + { + "epoch": 0.19401568509915582, + "grad_norm": 0.4972838759422302, + "learning_rate": 9.312357288328645e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.9072962403297424, + "num_tokens": 515641964.0, + "step": 1942 + }, + { + "epoch": 0.19411559018932015, + "grad_norm": 0.4842279255390167, + "learning_rate": 9.311538246044987e-06, + "loss": 0.506, + "mean_token_accuracy": 0.9014309346675873, + "num_tokens": 515895456.0, + "step": 1943 + }, + { + "epoch": 0.19421549527948448, + "grad_norm": 0.6081920266151428, + "learning_rate": 9.310718752339054e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.900009274482727, + "num_tokens": 516160289.0, + "step": 1944 + }, + { + "epoch": 0.19431540036964884, + "grad_norm": 0.5910548567771912, + "learning_rate": 9.309898807296653e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.9020135402679443, + "num_tokens": 516425248.0, + "step": 1945 + }, + { + "epoch": 0.19441530545981317, + "grad_norm": 0.7410874962806702, + "learning_rate": 9.309078411003632e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.9023971855640411, + "num_tokens": 516681522.0, + "step": 1946 + }, + { + "epoch": 0.19451521054997753, + "grad_norm": 0.5293632745742798, + "learning_rate": 9.308257563545885e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.9080946445465088, + "num_tokens": 516937678.0, + "step": 1947 + }, + { + "epoch": 0.19461511564014186, + "grad_norm": 0.6637295484542847, + "learning_rate": 9.307436265009354e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.9028376340866089, + "num_tokens": 517193340.0, + "step": 1948 + }, + { + "epoch": 0.19471502073030622, + "grad_norm": 0.50786954164505, + "learning_rate": 9.306614515480035e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.902174323797226, + "num_tokens": 517462610.0, + "step": 1949 + }, + { + "epoch": 0.19481492582047055, + "grad_norm": 0.6312168836593628, + "learning_rate": 9.305792315043962e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.9051772058010101, + "num_tokens": 517712380.0, + "step": 1950 + }, + { + "epoch": 0.19491483091063488, + "grad_norm": 0.5126204490661621, + "learning_rate": 9.304969663787222e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.9027182161808014, + "num_tokens": 517986355.0, + "step": 1951 + }, + { + "epoch": 0.19501473600079924, + "grad_norm": 0.5198445320129395, + "learning_rate": 9.304146561795946e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.9038743674755096, + "num_tokens": 518258152.0, + "step": 1952 + }, + { + "epoch": 0.19511464109096358, + "grad_norm": 0.6003000736236572, + "learning_rate": 9.303323009156315e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.9006552696228027, + "num_tokens": 518520575.0, + "step": 1953 + }, + { + "epoch": 0.19521454618112793, + "grad_norm": 0.5324742197990417, + "learning_rate": 9.302499005954557e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.9015927016735077, + "num_tokens": 518784775.0, + "step": 1954 + }, + { + "epoch": 0.19531445127129227, + "grad_norm": 0.5648677349090576, + "learning_rate": 9.301674552276942e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.9016366899013519, + "num_tokens": 519052742.0, + "step": 1955 + }, + { + "epoch": 0.19541435636145663, + "grad_norm": 0.5095251202583313, + "learning_rate": 9.300849648209794e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.9049032926559448, + "num_tokens": 519323216.0, + "step": 1956 + }, + { + "epoch": 0.19551426145162096, + "grad_norm": 1.0308281183242798, + "learning_rate": 9.30002429383948e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.9032728672027588, + "num_tokens": 519583032.0, + "step": 1957 + }, + { + "epoch": 0.19561416654178532, + "grad_norm": 0.96681809425354, + "learning_rate": 9.299198489252417e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.9059643447399139, + "num_tokens": 519844634.0, + "step": 1958 + }, + { + "epoch": 0.19571407163194965, + "grad_norm": 0.5539917945861816, + "learning_rate": 9.298372234535067e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.9047166407108307, + "num_tokens": 520110452.0, + "step": 1959 + }, + { + "epoch": 0.19581397672211398, + "grad_norm": 0.5165465474128723, + "learning_rate": 9.297545529773936e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.905598372220993, + "num_tokens": 520373356.0, + "step": 1960 + }, + { + "epoch": 0.19591388181227834, + "grad_norm": 0.5591888427734375, + "learning_rate": 9.296718375055587e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.901160329580307, + "num_tokens": 520637425.0, + "step": 1961 + }, + { + "epoch": 0.19601378690244267, + "grad_norm": 0.5625268816947937, + "learning_rate": 9.29589077046662e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.9048480093479156, + "num_tokens": 520893354.0, + "step": 1962 + }, + { + "epoch": 0.19611369199260703, + "grad_norm": 0.6057774424552917, + "learning_rate": 9.295062716093688e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.9041980504989624, + "num_tokens": 521160987.0, + "step": 1963 + }, + { + "epoch": 0.19621359708277136, + "grad_norm": 0.5518869757652283, + "learning_rate": 9.294234212023485e-06, + "loss": 0.504, + "mean_token_accuracy": 0.9029980003833771, + "num_tokens": 521422178.0, + "step": 1964 + }, + { + "epoch": 0.19631350217293572, + "grad_norm": 1.0165293216705322, + "learning_rate": 9.293405258342762e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.9062220752239227, + "num_tokens": 521690323.0, + "step": 1965 + }, + { + "epoch": 0.19641340726310005, + "grad_norm": 0.673354983329773, + "learning_rate": 9.292575855138307e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.9006896018981934, + "num_tokens": 521947339.0, + "step": 1966 + }, + { + "epoch": 0.19651331235326439, + "grad_norm": 0.6119635701179504, + "learning_rate": 9.291746002496962e-06, + "loss": 0.501, + "mean_token_accuracy": 0.9044865369796753, + "num_tokens": 522214751.0, + "step": 1967 + }, + { + "epoch": 0.19661321744342874, + "grad_norm": 0.4289535582065582, + "learning_rate": 9.290915700505611e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.9035583436489105, + "num_tokens": 522481980.0, + "step": 1968 + }, + { + "epoch": 0.19671312253359308, + "grad_norm": 0.47522810101509094, + "learning_rate": 9.29008494925119e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9040274918079376, + "num_tokens": 522739835.0, + "step": 1969 + }, + { + "epoch": 0.19681302762375744, + "grad_norm": 0.5721302628517151, + "learning_rate": 9.289253748820675e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9024990797042847, + "num_tokens": 523006194.0, + "step": 1970 + }, + { + "epoch": 0.19691293271392177, + "grad_norm": 0.5585421323776245, + "learning_rate": 9.2884220993011e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.9045890867710114, + "num_tokens": 523263839.0, + "step": 1971 + }, + { + "epoch": 0.19701283780408613, + "grad_norm": 0.5399455428123474, + "learning_rate": 9.287590000779535e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9033178389072418, + "num_tokens": 523521610.0, + "step": 1972 + }, + { + "epoch": 0.19711274289425046, + "grad_norm": 0.493659108877182, + "learning_rate": 9.2867574533431e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.9023455381393433, + "num_tokens": 523774476.0, + "step": 1973 + }, + { + "epoch": 0.19721264798441482, + "grad_norm": 0.48153209686279297, + "learning_rate": 9.28592445707897e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.9031641483306885, + "num_tokens": 524037446.0, + "step": 1974 + }, + { + "epoch": 0.19731255307457915, + "grad_norm": 0.580401599407196, + "learning_rate": 9.285091012074354e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9042889475822449, + "num_tokens": 524301827.0, + "step": 1975 + }, + { + "epoch": 0.19741245816474348, + "grad_norm": 0.5957124829292297, + "learning_rate": 9.284257118416518e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.905668169260025, + "num_tokens": 524566181.0, + "step": 1976 + }, + { + "epoch": 0.19751236325490784, + "grad_norm": 0.5598387718200684, + "learning_rate": 9.283422776192772e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9018264412879944, + "num_tokens": 524843029.0, + "step": 1977 + }, + { + "epoch": 0.19761226834507217, + "grad_norm": 0.7986255288124084, + "learning_rate": 9.282587985490468e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.9036961495876312, + "num_tokens": 525107374.0, + "step": 1978 + }, + { + "epoch": 0.19771217343523653, + "grad_norm": 0.6918619275093079, + "learning_rate": 9.281752746397015e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9046569764614105, + "num_tokens": 525367599.0, + "step": 1979 + }, + { + "epoch": 0.19781207852540086, + "grad_norm": 0.6637462973594666, + "learning_rate": 9.28091705899986e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.9016998708248138, + "num_tokens": 525633161.0, + "step": 1980 + }, + { + "epoch": 0.19791198361556522, + "grad_norm": 0.6145539879798889, + "learning_rate": 9.280080923386501e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.9048923254013062, + "num_tokens": 525900657.0, + "step": 1981 + }, + { + "epoch": 0.19801188870572956, + "grad_norm": 1.2534252405166626, + "learning_rate": 9.279244339644484e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.9057965576648712, + "num_tokens": 526165929.0, + "step": 1982 + }, + { + "epoch": 0.19811179379589391, + "grad_norm": 0.4774090051651001, + "learning_rate": 9.278407307861397e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.9039596915245056, + "num_tokens": 526431991.0, + "step": 1983 + }, + { + "epoch": 0.19821169888605825, + "grad_norm": 0.5436484217643738, + "learning_rate": 9.277569828124879e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.9032017886638641, + "num_tokens": 526704745.0, + "step": 1984 + }, + { + "epoch": 0.19831160397622258, + "grad_norm": 0.533840537071228, + "learning_rate": 9.276731900522616e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.9061298072338104, + "num_tokens": 526973215.0, + "step": 1985 + }, + { + "epoch": 0.19841150906638694, + "grad_norm": 0.5021746754646301, + "learning_rate": 9.27589352514234e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.902717798948288, + "num_tokens": 527245583.0, + "step": 1986 + }, + { + "epoch": 0.19851141415655127, + "grad_norm": 0.7344016432762146, + "learning_rate": 9.275054702071828e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.9024772942066193, + "num_tokens": 527497089.0, + "step": 1987 + }, + { + "epoch": 0.19861131924671563, + "grad_norm": 0.5538991093635559, + "learning_rate": 9.274215431398906e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9025543630123138, + "num_tokens": 527761190.0, + "step": 1988 + }, + { + "epoch": 0.19871122433687996, + "grad_norm": 0.5457239151000977, + "learning_rate": 9.273375713211447e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.9044439196586609, + "num_tokens": 528022975.0, + "step": 1989 + }, + { + "epoch": 0.19881112942704432, + "grad_norm": 0.5381529331207275, + "learning_rate": 9.272535547597372e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.9056429266929626, + "num_tokens": 528289084.0, + "step": 1990 + }, + { + "epoch": 0.19891103451720865, + "grad_norm": 0.7824201583862305, + "learning_rate": 9.271694934644646e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.9051600694656372, + "num_tokens": 528552155.0, + "step": 1991 + }, + { + "epoch": 0.19901093960737298, + "grad_norm": 0.6430003643035889, + "learning_rate": 9.270853874441281e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.9049120247364044, + "num_tokens": 528819822.0, + "step": 1992 + }, + { + "epoch": 0.19911084469753734, + "grad_norm": 0.5992310047149658, + "learning_rate": 9.270012367075337e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.9024639427661896, + "num_tokens": 529087337.0, + "step": 1993 + }, + { + "epoch": 0.19921074978770167, + "grad_norm": 0.7230057716369629, + "learning_rate": 9.26917041263492e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.9065532684326172, + "num_tokens": 529348258.0, + "step": 1994 + }, + { + "epoch": 0.19931065487786603, + "grad_norm": 0.7473909258842468, + "learning_rate": 9.268328011208186e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.902056872844696, + "num_tokens": 529612189.0, + "step": 1995 + }, + { + "epoch": 0.19941055996803037, + "grad_norm": 0.5342884659767151, + "learning_rate": 9.267485162883334e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.9064548313617706, + "num_tokens": 529879206.0, + "step": 1996 + }, + { + "epoch": 0.19951046505819472, + "grad_norm": 0.5708054304122925, + "learning_rate": 9.26664186774861e-06, + "loss": 0.506, + "mean_token_accuracy": 0.9013229608535767, + "num_tokens": 530139387.0, + "step": 1997 + }, + { + "epoch": 0.19961037014835906, + "grad_norm": 0.5495632886886597, + "learning_rate": 9.26579812589231e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.9039883613586426, + "num_tokens": 530407518.0, + "step": 1998 + }, + { + "epoch": 0.19971027523852342, + "grad_norm": 1.1600337028503418, + "learning_rate": 9.26495393740277e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9060654938220978, + "num_tokens": 530671605.0, + "step": 1999 + }, + { + "epoch": 0.19981018032868775, + "grad_norm": 0.6684672236442566, + "learning_rate": 9.264109302368383e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9034871757030487, + "num_tokens": 530936759.0, + "step": 2000 + } + ], + "logging_steps": 1, + "max_steps": 10010, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.9517152990036754e+19, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}