{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 30.0, "eval_steps": 500, "global_step": 23130, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0311284046692607, "grad_norm": 11.856830596923828, "learning_rate": 3.314121037463977e-06, "loss": 2.0132, "step": 24 }, { "epoch": 0.0622568093385214, "grad_norm": 1.1055241823196411, "learning_rate": 6.7723342939481265e-06, "loss": 1.0751, "step": 48 }, { "epoch": 0.0933852140077821, "grad_norm": 1.4284316301345825, "learning_rate": 1.0230547550432277e-05, "loss": 1.038, "step": 72 }, { "epoch": 0.1245136186770428, "grad_norm": 0.6491210460662842, "learning_rate": 1.3688760806916426e-05, "loss": 0.9988, "step": 96 }, { "epoch": 0.1556420233463035, "grad_norm": 0.39960888028144836, "learning_rate": 1.7146974063400578e-05, "loss": 0.9849, "step": 120 }, { "epoch": 0.1867704280155642, "grad_norm": 0.678686797618866, "learning_rate": 2.060518731988473e-05, "loss": 0.9767, "step": 144 }, { "epoch": 0.2178988326848249, "grad_norm": 0.5720846056938171, "learning_rate": 2.406340057636888e-05, "loss": 0.9684, "step": 168 }, { "epoch": 0.2490272373540856, "grad_norm": 0.6077919602394104, "learning_rate": 2.7521613832853026e-05, "loss": 0.967, "step": 192 }, { "epoch": 0.2801556420233463, "grad_norm": 0.7459629774093628, "learning_rate": 3.097982708933718e-05, "loss": 0.9553, "step": 216 }, { "epoch": 0.311284046692607, "grad_norm": 0.8133582472801208, "learning_rate": 3.443804034582133e-05, "loss": 0.9474, "step": 240 }, { "epoch": 0.3424124513618677, "grad_norm": 0.7748175263404846, "learning_rate": 3.7896253602305474e-05, "loss": 0.9404, "step": 264 }, { "epoch": 0.3735408560311284, "grad_norm": 0.9244363903999329, "learning_rate": 4.135446685878963e-05, "loss": 0.9326, "step": 288 }, { "epoch": 0.4046692607003891, "grad_norm": 1.0879474878311157, "learning_rate": 4.4812680115273775e-05, "loss": 0.9112, "step": 312 }, { "epoch": 0.4357976653696498, "grad_norm": 1.3521575927734375, "learning_rate": 4.827089337175792e-05, "loss": 0.9033, "step": 336 }, { "epoch": 0.4669260700389105, "grad_norm": 1.4220598936080933, "learning_rate": 5.1729106628242076e-05, "loss": 0.8877, "step": 360 }, { "epoch": 0.4980544747081712, "grad_norm": 1.345544457435608, "learning_rate": 5.518731988472623e-05, "loss": 0.8726, "step": 384 }, { "epoch": 0.5291828793774319, "grad_norm": 1.7283053398132324, "learning_rate": 5.864553314121038e-05, "loss": 0.8553, "step": 408 }, { "epoch": 0.5603112840466926, "grad_norm": 1.2822779417037964, "learning_rate": 6.210374639769453e-05, "loss": 0.841, "step": 432 }, { "epoch": 0.5914396887159533, "grad_norm": 2.8578877449035645, "learning_rate": 6.556195965417868e-05, "loss": 0.8262, "step": 456 }, { "epoch": 0.622568093385214, "grad_norm": 1.603874683380127, "learning_rate": 6.902017291066282e-05, "loss": 0.7989, "step": 480 }, { "epoch": 0.6536964980544747, "grad_norm": 2.2428903579711914, "learning_rate": 7.247838616714697e-05, "loss": 0.7958, "step": 504 }, { "epoch": 0.6848249027237354, "grad_norm": 1.6760625839233398, "learning_rate": 7.593659942363113e-05, "loss": 0.7799, "step": 528 }, { "epoch": 0.7159533073929961, "grad_norm": 2.1145055294036865, "learning_rate": 7.939481268011528e-05, "loss": 0.7671, "step": 552 }, { "epoch": 0.7470817120622568, "grad_norm": 1.3805097341537476, "learning_rate": 8.285302593659943e-05, "loss": 0.7563, "step": 576 }, { "epoch": 0.7782101167315175, "grad_norm": 2.1005349159240723, "learning_rate": 8.631123919308359e-05, "loss": 0.7396, "step": 600 }, { "epoch": 0.8093385214007782, "grad_norm": 1.6995466947555542, "learning_rate": 8.976945244956772e-05, "loss": 0.738, "step": 624 }, { "epoch": 0.8404669260700389, "grad_norm": 1.5165631771087646, "learning_rate": 9.322766570605188e-05, "loss": 0.7208, "step": 648 }, { "epoch": 0.8715953307392996, "grad_norm": 1.4923312664031982, "learning_rate": 9.668587896253603e-05, "loss": 0.7126, "step": 672 }, { "epoch": 0.9027237354085603, "grad_norm": 1.7333067655563354, "learning_rate": 9.999999950982757e-05, "loss": 0.6998, "step": 696 }, { "epoch": 0.933852140077821, "grad_norm": 1.5790561437606812, "learning_rate": 9.999969364253642e-05, "loss": 0.6943, "step": 720 }, { "epoch": 0.9649805447470817, "grad_norm": 2.4895715713500977, "learning_rate": 9.999882310058304e-05, "loss": 0.6887, "step": 744 }, { "epoch": 0.9961089494163424, "grad_norm": 1.2938724756240845, "learning_rate": 9.999738789379896e-05, "loss": 0.6728, "step": 768 }, { "epoch": 1.027237354085603, "grad_norm": 2.313992738723755, "learning_rate": 9.999538803839277e-05, "loss": 0.6704, "step": 792 }, { "epoch": 1.0583657587548638, "grad_norm": 2.3841969966888428, "learning_rate": 9.999282355694997e-05, "loss": 0.6683, "step": 816 }, { "epoch": 1.0894941634241244, "grad_norm": 1.7238163948059082, "learning_rate": 9.998969447843267e-05, "loss": 0.6598, "step": 840 }, { "epoch": 1.1206225680933852, "grad_norm": 1.889561653137207, "learning_rate": 9.998600083817934e-05, "loss": 0.6469, "step": 864 }, { "epoch": 1.1517509727626458, "grad_norm": 2.078350305557251, "learning_rate": 9.998174267790433e-05, "loss": 0.6394, "step": 888 }, { "epoch": 1.1828793774319066, "grad_norm": 3.088223934173584, "learning_rate": 9.99769200456974e-05, "loss": 0.642, "step": 912 }, { "epoch": 1.2140077821011672, "grad_norm": 1.8867013454437256, "learning_rate": 9.997153299602332e-05, "loss": 0.6365, "step": 936 }, { "epoch": 1.245136186770428, "grad_norm": 2.187405586242676, "learning_rate": 9.9965581589721e-05, "loss": 0.6216, "step": 960 }, { "epoch": 1.2762645914396886, "grad_norm": 1.5248736143112183, "learning_rate": 9.995906589400307e-05, "loss": 0.6208, "step": 984 }, { "epoch": 1.3073929961089494, "grad_norm": 1.3533403873443604, "learning_rate": 9.995198598245492e-05, "loss": 0.6143, "step": 1008 }, { "epoch": 1.3385214007782102, "grad_norm": 1.9436872005462646, "learning_rate": 9.994434193503399e-05, "loss": 0.6101, "step": 1032 }, { "epoch": 1.3696498054474708, "grad_norm": 1.5890527963638306, "learning_rate": 9.993613383806879e-05, "loss": 0.6011, "step": 1056 }, { "epoch": 1.4007782101167314, "grad_norm": 1.6845647096633911, "learning_rate": 9.9927361784258e-05, "loss": 0.6022, "step": 1080 }, { "epoch": 1.4319066147859922, "grad_norm": 1.5048511028289795, "learning_rate": 9.991802587266932e-05, "loss": 0.6078, "step": 1104 }, { "epoch": 1.463035019455253, "grad_norm": 1.8788032531738281, "learning_rate": 9.990812620873848e-05, "loss": 0.6014, "step": 1128 }, { "epoch": 1.4941634241245136, "grad_norm": 2.0226938724517822, "learning_rate": 9.989766290426795e-05, "loss": 0.5912, "step": 1152 }, { "epoch": 1.5252918287937742, "grad_norm": 1.9385308027267456, "learning_rate": 9.98866360774257e-05, "loss": 0.5812, "step": 1176 }, { "epoch": 1.556420233463035, "grad_norm": 1.2753961086273193, "learning_rate": 9.98750458527439e-05, "loss": 0.5825, "step": 1200 }, { "epoch": 1.5875486381322959, "grad_norm": 1.6889104843139648, "learning_rate": 9.986289236111747e-05, "loss": 0.58, "step": 1224 }, { "epoch": 1.6186770428015564, "grad_norm": 2.130415916442871, "learning_rate": 9.985017573980262e-05, "loss": 0.5853, "step": 1248 }, { "epoch": 1.649805447470817, "grad_norm": 1.8879446983337402, "learning_rate": 9.983689613241531e-05, "loss": 0.5806, "step": 1272 }, { "epoch": 1.6809338521400778, "grad_norm": 1.2330986261367798, "learning_rate": 9.982305368892964e-05, "loss": 0.574, "step": 1296 }, { "epoch": 1.7120622568093387, "grad_norm": 1.389142632484436, "learning_rate": 9.980864856567606e-05, "loss": 0.5743, "step": 1320 }, { "epoch": 1.7431906614785992, "grad_norm": 1.184691309928894, "learning_rate": 9.979368092533978e-05, "loss": 0.5691, "step": 1344 }, { "epoch": 1.7743190661478598, "grad_norm": 1.3246943950653076, "learning_rate": 9.977815093695875e-05, "loss": 0.5669, "step": 1368 }, { "epoch": 1.8054474708171206, "grad_norm": 1.5033084154129028, "learning_rate": 9.976205877592189e-05, "loss": 0.5636, "step": 1392 }, { "epoch": 1.8365758754863815, "grad_norm": 2.675381660461426, "learning_rate": 9.974540462396697e-05, "loss": 0.5554, "step": 1416 }, { "epoch": 1.867704280155642, "grad_norm": 1.4676384925842285, "learning_rate": 9.972818866917877e-05, "loss": 0.5526, "step": 1440 }, { "epoch": 1.8988326848249026, "grad_norm": 2.269249200820923, "learning_rate": 9.971041110598669e-05, "loss": 0.556, "step": 1464 }, { "epoch": 1.9299610894941635, "grad_norm": 1.7739601135253906, "learning_rate": 9.969207213516279e-05, "loss": 0.5546, "step": 1488 }, { "epoch": 1.9610894941634243, "grad_norm": 1.2574249505996704, "learning_rate": 9.967317196381936e-05, "loss": 0.549, "step": 1512 }, { "epoch": 1.9922178988326849, "grad_norm": 1.65413236618042, "learning_rate": 9.965371080540666e-05, "loss": 0.5537, "step": 1536 }, { "epoch": 2.0233463035019454, "grad_norm": 1.6155718564987183, "learning_rate": 9.96336888797105e-05, "loss": 0.5424, "step": 1560 }, { "epoch": 2.054474708171206, "grad_norm": 1.556755542755127, "learning_rate": 9.961310641284977e-05, "loss": 0.5396, "step": 1584 }, { "epoch": 2.085603112840467, "grad_norm": 1.5641894340515137, "learning_rate": 9.959196363727383e-05, "loss": 0.5465, "step": 1608 }, { "epoch": 2.1167315175097277, "grad_norm": 1.4483375549316406, "learning_rate": 9.957026079175996e-05, "loss": 0.5401, "step": 1632 }, { "epoch": 2.1478599221789882, "grad_norm": 1.8051731586456299, "learning_rate": 9.954799812141054e-05, "loss": 0.541, "step": 1656 }, { "epoch": 2.178988326848249, "grad_norm": 2.337942361831665, "learning_rate": 9.952517587765049e-05, "loss": 0.5359, "step": 1680 }, { "epoch": 2.21011673151751, "grad_norm": 1.5796310901641846, "learning_rate": 9.950179431822421e-05, "loss": 0.5361, "step": 1704 }, { "epoch": 2.2412451361867705, "grad_norm": 1.3433961868286133, "learning_rate": 9.947785370719281e-05, "loss": 0.5254, "step": 1728 }, { "epoch": 2.272373540856031, "grad_norm": 1.8424466848373413, "learning_rate": 9.945335431493108e-05, "loss": 0.5278, "step": 1752 }, { "epoch": 2.3035019455252916, "grad_norm": 1.280912160873413, "learning_rate": 9.942829641812445e-05, "loss": 0.5314, "step": 1776 }, { "epoch": 2.3346303501945527, "grad_norm": 2.389176368713379, "learning_rate": 9.94026802997658e-05, "loss": 0.5272, "step": 1800 }, { "epoch": 2.3657587548638133, "grad_norm": 1.804115653038025, "learning_rate": 9.93765062491524e-05, "loss": 0.5214, "step": 1824 }, { "epoch": 2.396887159533074, "grad_norm": 2.4799587726593018, "learning_rate": 9.934977456188253e-05, "loss": 0.5228, "step": 1848 }, { "epoch": 2.4280155642023344, "grad_norm": 1.3502540588378906, "learning_rate": 9.932248553985213e-05, "loss": 0.5269, "step": 1872 }, { "epoch": 2.4591439688715955, "grad_norm": 1.9639521837234497, "learning_rate": 9.929463949125151e-05, "loss": 0.5244, "step": 1896 }, { "epoch": 2.490272373540856, "grad_norm": 1.5300196409225464, "learning_rate": 9.926623673056173e-05, "loss": 0.5163, "step": 1920 }, { "epoch": 2.5214007782101167, "grad_norm": 1.3195667266845703, "learning_rate": 9.923727757855117e-05, "loss": 0.5155, "step": 1944 }, { "epoch": 2.5525291828793772, "grad_norm": 1.3704023361206055, "learning_rate": 9.920776236227181e-05, "loss": 0.5164, "step": 1968 }, { "epoch": 2.5836575875486383, "grad_norm": 1.2443211078643799, "learning_rate": 9.917769141505557e-05, "loss": 0.5119, "step": 1992 }, { "epoch": 2.614785992217899, "grad_norm": 1.7005102634429932, "learning_rate": 9.91470650765106e-05, "loss": 0.5191, "step": 2016 }, { "epoch": 2.6459143968871595, "grad_norm": 1.742263674736023, "learning_rate": 9.911588369251736e-05, "loss": 0.5207, "step": 2040 }, { "epoch": 2.6770428015564205, "grad_norm": 2.342224597930908, "learning_rate": 9.908414761522473e-05, "loss": 0.5116, "step": 2064 }, { "epoch": 2.708171206225681, "grad_norm": 1.481919765472412, "learning_rate": 9.905185720304612e-05, "loss": 0.5169, "step": 2088 }, { "epoch": 2.7392996108949417, "grad_norm": 2.477743148803711, "learning_rate": 9.901901282065529e-05, "loss": 0.5125, "step": 2112 }, { "epoch": 2.7704280155642023, "grad_norm": 1.231108546257019, "learning_rate": 9.898561483898233e-05, "loss": 0.5119, "step": 2136 }, { "epoch": 2.801556420233463, "grad_norm": 1.6876734495162964, "learning_rate": 9.895166363520943e-05, "loss": 0.5098, "step": 2160 }, { "epoch": 2.832684824902724, "grad_norm": 1.886053442955017, "learning_rate": 9.891715959276664e-05, "loss": 0.509, "step": 2184 }, { "epoch": 2.8638132295719845, "grad_norm": 2.044147253036499, "learning_rate": 9.88821031013275e-05, "loss": 0.5081, "step": 2208 }, { "epoch": 2.894941634241245, "grad_norm": 1.8339983224868774, "learning_rate": 9.88464945568047e-05, "loss": 0.5031, "step": 2232 }, { "epoch": 2.926070038910506, "grad_norm": 2.0237860679626465, "learning_rate": 9.881033436134555e-05, "loss": 0.5026, "step": 2256 }, { "epoch": 2.9571984435797667, "grad_norm": 1.222092866897583, "learning_rate": 9.877362292332749e-05, "loss": 0.4922, "step": 2280 }, { "epoch": 2.9883268482490273, "grad_norm": 2.8668859004974365, "learning_rate": 9.873636065735343e-05, "loss": 0.4978, "step": 2304 }, { "epoch": 3.019455252918288, "grad_norm": 1.8704198598861694, "learning_rate": 9.869854798424709e-05, "loss": 0.4999, "step": 2328 }, { "epoch": 3.0505836575875485, "grad_norm": 1.3280694484710693, "learning_rate": 9.866018533104826e-05, "loss": 0.4979, "step": 2352 }, { "epoch": 3.0817120622568095, "grad_norm": 1.6099941730499268, "learning_rate": 9.862127313100795e-05, "loss": 0.4966, "step": 2376 }, { "epoch": 3.11284046692607, "grad_norm": 1.797253131866455, "learning_rate": 9.858181182358355e-05, "loss": 0.4913, "step": 2400 }, { "epoch": 3.1439688715953307, "grad_norm": 1.4523372650146484, "learning_rate": 9.854180185443378e-05, "loss": 0.494, "step": 2424 }, { "epoch": 3.1750972762645913, "grad_norm": 1.665285587310791, "learning_rate": 9.850124367541371e-05, "loss": 0.495, "step": 2448 }, { "epoch": 3.2062256809338523, "grad_norm": 1.2931227684020996, "learning_rate": 9.84601377445697e-05, "loss": 0.4949, "step": 2472 }, { "epoch": 3.237354085603113, "grad_norm": 2.0045413970947266, "learning_rate": 9.841848452613412e-05, "loss": 0.4901, "step": 2496 }, { "epoch": 3.2684824902723735, "grad_norm": 1.2784613370895386, "learning_rate": 9.83762844905202e-05, "loss": 0.4967, "step": 2520 }, { "epoch": 3.299610894941634, "grad_norm": 1.485795497894287, "learning_rate": 9.833353811431669e-05, "loss": 0.4921, "step": 2544 }, { "epoch": 3.330739299610895, "grad_norm": 2.1288626194000244, "learning_rate": 9.829024588028244e-05, "loss": 0.4912, "step": 2568 }, { "epoch": 3.3618677042801557, "grad_norm": 1.5102566480636597, "learning_rate": 9.824640827734102e-05, "loss": 0.4938, "step": 2592 }, { "epoch": 3.3929961089494163, "grad_norm": 2.126574993133545, "learning_rate": 9.820202580057512e-05, "loss": 0.4881, "step": 2616 }, { "epoch": 3.424124513618677, "grad_norm": 1.1427215337753296, "learning_rate": 9.8157098951221e-05, "loss": 0.4956, "step": 2640 }, { "epoch": 3.455252918287938, "grad_norm": 1.847524881362915, "learning_rate": 9.811162823666287e-05, "loss": 0.4883, "step": 2664 }, { "epoch": 3.4863813229571985, "grad_norm": 1.3941086530685425, "learning_rate": 9.806561417042706e-05, "loss": 0.488, "step": 2688 }, { "epoch": 3.517509727626459, "grad_norm": 1.7835474014282227, "learning_rate": 9.801905727217631e-05, "loss": 0.4796, "step": 2712 }, { "epoch": 3.5486381322957197, "grad_norm": 2.4145917892456055, "learning_rate": 9.797195806770387e-05, "loss": 0.4856, "step": 2736 }, { "epoch": 3.5797665369649807, "grad_norm": 1.6567249298095703, "learning_rate": 9.792431708892752e-05, "loss": 0.4799, "step": 2760 }, { "epoch": 3.6108949416342413, "grad_norm": 1.7985295057296753, "learning_rate": 9.787613487388365e-05, "loss": 0.4886, "step": 2784 }, { "epoch": 3.642023346303502, "grad_norm": 1.7581013441085815, "learning_rate": 9.78274119667211e-05, "loss": 0.4835, "step": 2808 }, { "epoch": 3.673151750972763, "grad_norm": 1.6254545450210571, "learning_rate": 9.777814891769507e-05, "loss": 0.4841, "step": 2832 }, { "epoch": 3.7042801556420235, "grad_norm": 1.745969295501709, "learning_rate": 9.772834628316087e-05, "loss": 0.4848, "step": 2856 }, { "epoch": 3.735408560311284, "grad_norm": 1.762830138206482, "learning_rate": 9.767800462556769e-05, "loss": 0.476, "step": 2880 }, { "epoch": 3.7665369649805447, "grad_norm": 1.6283063888549805, "learning_rate": 9.762712451345217e-05, "loss": 0.48, "step": 2904 }, { "epoch": 3.7976653696498053, "grad_norm": 1.7204512357711792, "learning_rate": 9.757570652143202e-05, "loss": 0.4746, "step": 2928 }, { "epoch": 3.8287937743190663, "grad_norm": 2.6043598651885986, "learning_rate": 9.752375123019956e-05, "loss": 0.4805, "step": 2952 }, { "epoch": 3.859922178988327, "grad_norm": 2.134938955307007, "learning_rate": 9.74712592265151e-05, "loss": 0.4776, "step": 2976 }, { "epoch": 3.8910505836575875, "grad_norm": 1.4748331308364868, "learning_rate": 9.741823110320037e-05, "loss": 0.4725, "step": 3000 }, { "epoch": 3.9221789883268485, "grad_norm": 2.2188987731933594, "learning_rate": 9.73646674591318e-05, "loss": 0.4781, "step": 3024 }, { "epoch": 3.953307392996109, "grad_norm": 1.2936460971832275, "learning_rate": 9.731056889923374e-05, "loss": 0.4808, "step": 3048 }, { "epoch": 3.9844357976653697, "grad_norm": 2.5133862495422363, "learning_rate": 9.725593603447166e-05, "loss": 0.4839, "step": 3072 }, { "epoch": 4.01556420233463, "grad_norm": 2.2660224437713623, "learning_rate": 9.720076948184522e-05, "loss": 0.4709, "step": 3096 }, { "epoch": 4.046692607003891, "grad_norm": 1.573203444480896, "learning_rate": 9.714506986438134e-05, "loss": 0.4762, "step": 3120 }, { "epoch": 4.0778210116731515, "grad_norm": 1.9054023027420044, "learning_rate": 9.70888378111271e-05, "loss": 0.4796, "step": 3144 }, { "epoch": 4.108949416342412, "grad_norm": 2.2776753902435303, "learning_rate": 9.703207395714274e-05, "loss": 0.4705, "step": 3168 }, { "epoch": 4.1400778210116735, "grad_norm": 1.614623785018921, "learning_rate": 9.697477894349438e-05, "loss": 0.4713, "step": 3192 }, { "epoch": 4.171206225680934, "grad_norm": 2.478569269180298, "learning_rate": 9.691695341724681e-05, "loss": 0.4719, "step": 3216 }, { "epoch": 4.202334630350195, "grad_norm": 1.3797364234924316, "learning_rate": 9.685859803145625e-05, "loss": 0.4663, "step": 3240 }, { "epoch": 4.233463035019455, "grad_norm": 2.49601674079895, "learning_rate": 9.679971344516288e-05, "loss": 0.4827, "step": 3264 }, { "epoch": 4.264591439688716, "grad_norm": 1.5913656949996948, "learning_rate": 9.674030032338346e-05, "loss": 0.4869, "step": 3288 }, { "epoch": 4.2957198443579765, "grad_norm": 1.5114320516586304, "learning_rate": 9.668035933710378e-05, "loss": 0.4794, "step": 3312 }, { "epoch": 4.326848249027237, "grad_norm": 1.905714750289917, "learning_rate": 9.661989116327112e-05, "loss": 0.4702, "step": 3336 }, { "epoch": 4.357976653696498, "grad_norm": 1.6932348012924194, "learning_rate": 9.655889648478657e-05, "loss": 0.4693, "step": 3360 }, { "epoch": 4.389105058365759, "grad_norm": 1.9976513385772705, "learning_rate": 9.649737599049736e-05, "loss": 0.4705, "step": 3384 }, { "epoch": 4.42023346303502, "grad_norm": 1.4826905727386475, "learning_rate": 9.643533037518899e-05, "loss": 0.4697, "step": 3408 }, { "epoch": 4.45136186770428, "grad_norm": 1.617922306060791, "learning_rate": 9.637276033957755e-05, "loss": 0.4684, "step": 3432 }, { "epoch": 4.482490272373541, "grad_norm": 2.4124162197113037, "learning_rate": 9.630966659030158e-05, "loss": 0.462, "step": 3456 }, { "epoch": 4.5136186770428015, "grad_norm": 1.8999947309494019, "learning_rate": 9.624604983991434e-05, "loss": 0.4614, "step": 3480 }, { "epoch": 4.544747081712062, "grad_norm": 2.2038631439208984, "learning_rate": 9.618191080687552e-05, "loss": 0.473, "step": 3504 }, { "epoch": 4.575875486381323, "grad_norm": 1.5659903287887573, "learning_rate": 9.611725021554333e-05, "loss": 0.4632, "step": 3528 }, { "epoch": 4.607003891050583, "grad_norm": 2.38783597946167, "learning_rate": 9.605206879616617e-05, "loss": 0.4547, "step": 3552 }, { "epoch": 4.638132295719844, "grad_norm": 1.5512051582336426, "learning_rate": 9.59863672848745e-05, "loss": 0.4623, "step": 3576 }, { "epoch": 4.669260700389105, "grad_norm": 3.2371737957000732, "learning_rate": 9.592014642367243e-05, "loss": 0.4635, "step": 3600 }, { "epoch": 4.700389105058366, "grad_norm": 1.7594435214996338, "learning_rate": 9.585340696042935e-05, "loss": 0.4674, "step": 3624 }, { "epoch": 4.7315175097276265, "grad_norm": 1.3836287260055542, "learning_rate": 9.57861496488716e-05, "loss": 0.4611, "step": 3648 }, { "epoch": 4.762645914396887, "grad_norm": 1.7907147407531738, "learning_rate": 9.571837524857384e-05, "loss": 0.4609, "step": 3672 }, { "epoch": 4.793774319066148, "grad_norm": 1.7246521711349487, "learning_rate": 9.565008452495046e-05, "loss": 0.4588, "step": 3696 }, { "epoch": 4.824902723735408, "grad_norm": 2.1095101833343506, "learning_rate": 9.558127824924701e-05, "loss": 0.4623, "step": 3720 }, { "epoch": 4.856031128404669, "grad_norm": 1.1277464628219604, "learning_rate": 9.551195719853147e-05, "loss": 0.4568, "step": 3744 }, { "epoch": 4.88715953307393, "grad_norm": 1.2232158184051514, "learning_rate": 9.544212215568547e-05, "loss": 0.459, "step": 3768 }, { "epoch": 4.918287937743191, "grad_norm": 1.9220589399337769, "learning_rate": 9.53717739093954e-05, "loss": 0.4539, "step": 3792 }, { "epoch": 4.9494163424124515, "grad_norm": 1.6076886653900146, "learning_rate": 9.530091325414359e-05, "loss": 0.4583, "step": 3816 }, { "epoch": 4.980544747081712, "grad_norm": 1.2246028184890747, "learning_rate": 9.522954099019927e-05, "loss": 0.4567, "step": 3840 }, { "epoch": 5.011673151750973, "grad_norm": 1.4004205465316772, "learning_rate": 9.515765792360955e-05, "loss": 0.4535, "step": 3864 }, { "epoch": 5.042801556420233, "grad_norm": 1.30203378200531, "learning_rate": 9.508526486619036e-05, "loss": 0.452, "step": 3888 }, { "epoch": 5.073929961089494, "grad_norm": 1.538682222366333, "learning_rate": 9.501236263551719e-05, "loss": 0.4511, "step": 3912 }, { "epoch": 5.1050583657587545, "grad_norm": 1.3054372072219849, "learning_rate": 9.493895205491595e-05, "loss": 0.4489, "step": 3936 }, { "epoch": 5.136186770428016, "grad_norm": 1.4001922607421875, "learning_rate": 9.486503395345358e-05, "loss": 0.4577, "step": 3960 }, { "epoch": 5.167315175097277, "grad_norm": 1.3580487966537476, "learning_rate": 9.47906091659288e-05, "loss": 0.4519, "step": 3984 }, { "epoch": 5.198443579766537, "grad_norm": 1.929853081703186, "learning_rate": 9.47156785328626e-05, "loss": 0.4562, "step": 4008 }, { "epoch": 5.229571984435798, "grad_norm": 1.9883568286895752, "learning_rate": 9.464024290048879e-05, "loss": 0.4573, "step": 4032 }, { "epoch": 5.260700389105058, "grad_norm": 1.7795013189315796, "learning_rate": 9.456430312074432e-05, "loss": 0.4513, "step": 4056 }, { "epoch": 5.291828793774319, "grad_norm": 1.1019718647003174, "learning_rate": 9.44878600512599e-05, "loss": 0.4475, "step": 4080 }, { "epoch": 5.3229571984435795, "grad_norm": 1.604556918144226, "learning_rate": 9.441091455535007e-05, "loss": 0.4466, "step": 4104 }, { "epoch": 5.35408560311284, "grad_norm": 1.8707294464111328, "learning_rate": 9.433346750200363e-05, "loss": 0.4501, "step": 4128 }, { "epoch": 5.385214007782102, "grad_norm": 1.6021867990493774, "learning_rate": 9.425551976587366e-05, "loss": 0.4443, "step": 4152 }, { "epoch": 5.416342412451362, "grad_norm": 1.7186486721038818, "learning_rate": 9.417707222726784e-05, "loss": 0.4374, "step": 4176 }, { "epoch": 5.447470817120623, "grad_norm": 2.0640745162963867, "learning_rate": 9.409812577213833e-05, "loss": 0.4468, "step": 4200 }, { "epoch": 5.478599221789883, "grad_norm": 2.1669087409973145, "learning_rate": 9.401868129207181e-05, "loss": 0.4501, "step": 4224 }, { "epoch": 5.509727626459144, "grad_norm": 2.237527847290039, "learning_rate": 9.393873968427953e-05, "loss": 0.4469, "step": 4248 }, { "epoch": 5.5408560311284045, "grad_norm": 1.5120989084243774, "learning_rate": 9.385830185158701e-05, "loss": 0.4425, "step": 4272 }, { "epoch": 5.571984435797665, "grad_norm": 2.029425621032715, "learning_rate": 9.377736870242393e-05, "loss": 0.4509, "step": 4296 }, { "epoch": 5.603112840466926, "grad_norm": 1.609480857849121, "learning_rate": 9.369594115081386e-05, "loss": 0.4528, "step": 4320 }, { "epoch": 5.634241245136186, "grad_norm": 1.126060128211975, "learning_rate": 9.361402011636395e-05, "loss": 0.4435, "step": 4344 }, { "epoch": 5.665369649805448, "grad_norm": 3.637361526489258, "learning_rate": 9.353160652425452e-05, "loss": 0.4466, "step": 4368 }, { "epoch": 5.696498054474708, "grad_norm": 3.3293521404266357, "learning_rate": 9.344870130522863e-05, "loss": 0.4495, "step": 4392 }, { "epoch": 5.727626459143969, "grad_norm": 1.1623419523239136, "learning_rate": 9.33653053955815e-05, "loss": 0.4362, "step": 4416 }, { "epoch": 5.7587548638132295, "grad_norm": 1.3908815383911133, "learning_rate": 9.328141973715008e-05, "loss": 0.445, "step": 4440 }, { "epoch": 5.78988326848249, "grad_norm": 1.1905103921890259, "learning_rate": 9.31970452773023e-05, "loss": 0.4399, "step": 4464 }, { "epoch": 5.821011673151751, "grad_norm": 1.7141236066818237, "learning_rate": 9.311218296892636e-05, "loss": 0.4396, "step": 4488 }, { "epoch": 5.852140077821011, "grad_norm": 1.5528429746627808, "learning_rate": 9.302683377042007e-05, "loss": 0.4369, "step": 4512 }, { "epoch": 5.883268482490273, "grad_norm": 1.206060528755188, "learning_rate": 9.29409986456799e-05, "loss": 0.443, "step": 4536 }, { "epoch": 5.914396887159533, "grad_norm": 1.2948627471923828, "learning_rate": 9.285467856409023e-05, "loss": 0.4421, "step": 4560 }, { "epoch": 5.945525291828794, "grad_norm": 1.6690411567687988, "learning_rate": 9.276787450051225e-05, "loss": 0.4393, "step": 4584 }, { "epoch": 5.976653696498055, "grad_norm": 1.4727965593338013, "learning_rate": 9.26805874352731e-05, "loss": 0.443, "step": 4608 }, { "epoch": 6.007782101167315, "grad_norm": 2.1878299713134766, "learning_rate": 9.25928183541547e-05, "loss": 0.4359, "step": 4632 }, { "epoch": 6.038910505836576, "grad_norm": 1.5079774856567383, "learning_rate": 9.250456824838263e-05, "loss": 0.438, "step": 4656 }, { "epoch": 6.070038910505836, "grad_norm": 1.5700092315673828, "learning_rate": 9.241583811461498e-05, "loss": 0.4355, "step": 4680 }, { "epoch": 6.101167315175097, "grad_norm": 1.0717116594314575, "learning_rate": 9.232662895493107e-05, "loss": 0.4337, "step": 4704 }, { "epoch": 6.132295719844358, "grad_norm": 1.775414228439331, "learning_rate": 9.223694177682009e-05, "loss": 0.4398, "step": 4728 }, { "epoch": 6.163424124513619, "grad_norm": 3.057781457901001, "learning_rate": 9.214677759316982e-05, "loss": 0.4367, "step": 4752 }, { "epoch": 6.19455252918288, "grad_norm": 1.2848880290985107, "learning_rate": 9.205613742225507e-05, "loss": 0.433, "step": 4776 }, { "epoch": 6.22568093385214, "grad_norm": 1.5465294122695923, "learning_rate": 9.196502228772626e-05, "loss": 0.442, "step": 4800 }, { "epoch": 6.256809338521401, "grad_norm": 1.1864486932754517, "learning_rate": 9.18734332185979e-05, "loss": 0.4356, "step": 4824 }, { "epoch": 6.287937743190661, "grad_norm": 1.6817840337753296, "learning_rate": 9.17813712492368e-05, "loss": 0.4386, "step": 4848 }, { "epoch": 6.319066147859922, "grad_norm": 1.285474181175232, "learning_rate": 9.16888374193506e-05, "loss": 0.4306, "step": 4872 }, { "epoch": 6.3501945525291825, "grad_norm": 1.5364230871200562, "learning_rate": 9.159583277397587e-05, "loss": 0.4333, "step": 4896 }, { "epoch": 6.381322957198444, "grad_norm": 1.8164541721343994, "learning_rate": 9.150235836346639e-05, "loss": 0.4285, "step": 4920 }, { "epoch": 6.412451361867705, "grad_norm": 1.5146026611328125, "learning_rate": 9.140841524348125e-05, "loss": 0.4354, "step": 4944 }, { "epoch": 6.443579766536965, "grad_norm": 1.238393783569336, "learning_rate": 9.131400447497294e-05, "loss": 0.4257, "step": 4968 }, { "epoch": 6.474708171206226, "grad_norm": 1.4109466075897217, "learning_rate": 9.121912712417536e-05, "loss": 0.43, "step": 4992 }, { "epoch": 6.505836575875486, "grad_norm": 1.8265984058380127, "learning_rate": 9.11237842625918e-05, "loss": 0.4373, "step": 5016 }, { "epoch": 6.536964980544747, "grad_norm": 1.5519527196884155, "learning_rate": 9.102797696698284e-05, "loss": 0.4347, "step": 5040 }, { "epoch": 6.5680933852140075, "grad_norm": 1.314172387123108, "learning_rate": 9.093170631935412e-05, "loss": 0.4348, "step": 5064 }, { "epoch": 6.599221789883268, "grad_norm": 1.7968671321868896, "learning_rate": 9.083497340694425e-05, "loss": 0.4379, "step": 5088 }, { "epoch": 6.630350194552529, "grad_norm": 1.166242003440857, "learning_rate": 9.073777932221239e-05, "loss": 0.4313, "step": 5112 }, { "epoch": 6.66147859922179, "grad_norm": 1.9698489904403687, "learning_rate": 9.064012516282601e-05, "loss": 0.441, "step": 5136 }, { "epoch": 6.692607003891051, "grad_norm": 1.2938389778137207, "learning_rate": 9.054201203164845e-05, "loss": 0.4301, "step": 5160 }, { "epoch": 6.723735408560311, "grad_norm": 5.220723628997803, "learning_rate": 9.044344103672651e-05, "loss": 0.4232, "step": 5184 }, { "epoch": 6.754863813229572, "grad_norm": 1.7442070245742798, "learning_rate": 9.034441329127783e-05, "loss": 0.4343, "step": 5208 }, { "epoch": 6.785992217898833, "grad_norm": 4.927098274230957, "learning_rate": 9.024492991367848e-05, "loss": 0.4279, "step": 5232 }, { "epoch": 6.817120622568093, "grad_norm": 1.1979647874832153, "learning_rate": 9.014499202745019e-05, "loss": 0.4312, "step": 5256 }, { "epoch": 6.848249027237354, "grad_norm": 1.6905076503753662, "learning_rate": 9.004460076124768e-05, "loss": 0.432, "step": 5280 }, { "epoch": 6.879377431906615, "grad_norm": 1.388134241104126, "learning_rate": 8.994375724884604e-05, "loss": 0.4314, "step": 5304 }, { "epoch": 6.910505836575876, "grad_norm": 2.4431025981903076, "learning_rate": 8.984246262912774e-05, "loss": 0.4341, "step": 5328 }, { "epoch": 6.941634241245136, "grad_norm": 2.5521421432495117, "learning_rate": 8.974071804606989e-05, "loss": 0.4251, "step": 5352 }, { "epoch": 6.972762645914397, "grad_norm": 1.6180981397628784, "learning_rate": 8.96385246487313e-05, "loss": 0.4332, "step": 5376 }, { "epoch": 7.003891050583658, "grad_norm": 1.673168659210205, "learning_rate": 8.95358835912395e-05, "loss": 0.4258, "step": 5400 }, { "epoch": 7.035019455252918, "grad_norm": 2.032773733139038, "learning_rate": 8.943279603277767e-05, "loss": 0.4337, "step": 5424 }, { "epoch": 7.066147859922179, "grad_norm": 1.7290483713150024, "learning_rate": 8.932926313757157e-05, "loss": 0.4312, "step": 5448 }, { "epoch": 7.097276264591439, "grad_norm": 4.685028076171875, "learning_rate": 8.922528607487645e-05, "loss": 0.4416, "step": 5472 }, { "epoch": 7.1284046692607, "grad_norm": 1.5580335855484009, "learning_rate": 8.912086601896372e-05, "loss": 0.4358, "step": 5496 }, { "epoch": 7.159533073929961, "grad_norm": 1.332607388496399, "learning_rate": 8.901600414910785e-05, "loss": 0.4288, "step": 5520 }, { "epoch": 7.190661478599222, "grad_norm": 1.2149999141693115, "learning_rate": 8.891070164957288e-05, "loss": 0.4238, "step": 5544 }, { "epoch": 7.221789883268483, "grad_norm": 1.4633874893188477, "learning_rate": 8.880495970959917e-05, "loss": 0.4278, "step": 5568 }, { "epoch": 7.252918287937743, "grad_norm": 1.4801607131958008, "learning_rate": 8.869877952338991e-05, "loss": 0.4227, "step": 5592 }, { "epoch": 7.284046692607004, "grad_norm": 1.8194708824157715, "learning_rate": 8.85921622900977e-05, "loss": 0.4192, "step": 5616 }, { "epoch": 7.315175097276264, "grad_norm": 1.111076831817627, "learning_rate": 8.848510921381089e-05, "loss": 0.4231, "step": 5640 }, { "epoch": 7.346303501945525, "grad_norm": 1.4320513010025024, "learning_rate": 8.83776215035401e-05, "loss": 0.4224, "step": 5664 }, { "epoch": 7.377431906614786, "grad_norm": 1.80966317653656, "learning_rate": 8.826970037320448e-05, "loss": 0.4183, "step": 5688 }, { "epoch": 7.408560311284047, "grad_norm": 1.843509554862976, "learning_rate": 8.816134704161807e-05, "loss": 0.417, "step": 5712 }, { "epoch": 7.439688715953308, "grad_norm": 1.2015341520309448, "learning_rate": 8.805256273247598e-05, "loss": 0.4177, "step": 5736 }, { "epoch": 7.470817120622568, "grad_norm": 1.6432462930679321, "learning_rate": 8.794334867434059e-05, "loss": 0.4236, "step": 5760 }, { "epoch": 7.501945525291829, "grad_norm": 1.354224443435669, "learning_rate": 8.783370610062769e-05, "loss": 0.4142, "step": 5784 }, { "epoch": 7.533073929961089, "grad_norm": 1.6838608980178833, "learning_rate": 8.772363624959255e-05, "loss": 0.4173, "step": 5808 }, { "epoch": 7.56420233463035, "grad_norm": 1.8743314743041992, "learning_rate": 8.761314036431588e-05, "loss": 0.4248, "step": 5832 }, { "epoch": 7.595330739299611, "grad_norm": 1.4311802387237549, "learning_rate": 8.750221969268985e-05, "loss": 0.4204, "step": 5856 }, { "epoch": 7.626459143968871, "grad_norm": 1.4219359159469604, "learning_rate": 8.739087548740404e-05, "loss": 0.4201, "step": 5880 }, { "epoch": 7.657587548638133, "grad_norm": 2.070533275604248, "learning_rate": 8.727910900593114e-05, "loss": 0.4229, "step": 5904 }, { "epoch": 7.688715953307393, "grad_norm": 1.4531338214874268, "learning_rate": 8.716692151051293e-05, "loss": 0.42, "step": 5928 }, { "epoch": 7.719844357976654, "grad_norm": 2.2621729373931885, "learning_rate": 8.705431426814585e-05, "loss": 0.4171, "step": 5952 }, { "epoch": 7.750972762645914, "grad_norm": 1.242394208908081, "learning_rate": 8.694128855056683e-05, "loss": 0.4133, "step": 5976 }, { "epoch": 7.782101167315175, "grad_norm": 1.2939616441726685, "learning_rate": 8.68278456342389e-05, "loss": 0.4185, "step": 6000 }, { "epoch": 7.813229571984436, "grad_norm": 2.0788450241088867, "learning_rate": 8.671398680033668e-05, "loss": 0.4183, "step": 6024 }, { "epoch": 7.844357976653696, "grad_norm": 2.538680076599121, "learning_rate": 8.659971333473206e-05, "loss": 0.4246, "step": 6048 }, { "epoch": 7.875486381322958, "grad_norm": 2.1128950119018555, "learning_rate": 8.648502652797954e-05, "loss": 0.4156, "step": 6072 }, { "epoch": 7.906614785992218, "grad_norm": 2.2612478733062744, "learning_rate": 8.636992767530171e-05, "loss": 0.409, "step": 6096 }, { "epoch": 7.937743190661479, "grad_norm": 2.0751936435699463, "learning_rate": 8.625441807657471e-05, "loss": 0.4264, "step": 6120 }, { "epoch": 7.968871595330739, "grad_norm": 2.009459972381592, "learning_rate": 8.613849903631334e-05, "loss": 0.4255, "step": 6144 }, { "epoch": 8.0, "grad_norm": 1.8576405048370361, "learning_rate": 8.602217186365655e-05, "loss": 0.4211, "step": 6168 }, { "epoch": 8.03112840466926, "grad_norm": 2.817073345184326, "learning_rate": 8.590543787235252e-05, "loss": 0.4156, "step": 6192 }, { "epoch": 8.062256809338521, "grad_norm": 1.5011825561523438, "learning_rate": 8.578829838074389e-05, "loss": 0.41, "step": 6216 }, { "epoch": 8.093385214007782, "grad_norm": 1.2293556928634644, "learning_rate": 8.567075471175281e-05, "loss": 0.417, "step": 6240 }, { "epoch": 8.124513618677042, "grad_norm": 1.415345549583435, "learning_rate": 8.555280819286603e-05, "loss": 0.4148, "step": 6264 }, { "epoch": 8.155642023346303, "grad_norm": 2.2379307746887207, "learning_rate": 8.543446015611995e-05, "loss": 0.4104, "step": 6288 }, { "epoch": 8.186770428015564, "grad_norm": 1.0670602321624756, "learning_rate": 8.531571193808549e-05, "loss": 0.4131, "step": 6312 }, { "epoch": 8.217898832684824, "grad_norm": 1.0915449857711792, "learning_rate": 8.519656487985309e-05, "loss": 0.4073, "step": 6336 }, { "epoch": 8.249027237354085, "grad_norm": 1.4844944477081299, "learning_rate": 8.507702032701748e-05, "loss": 0.4109, "step": 6360 }, { "epoch": 8.280155642023347, "grad_norm": 1.1173604726791382, "learning_rate": 8.495707962966253e-05, "loss": 0.4145, "step": 6384 }, { "epoch": 8.311284046692608, "grad_norm": 1.5978012084960938, "learning_rate": 8.4836744142346e-05, "loss": 0.4108, "step": 6408 }, { "epoch": 8.342412451361868, "grad_norm": 1.7912710905075073, "learning_rate": 8.471601522408422e-05, "loss": 0.4155, "step": 6432 }, { "epoch": 8.373540856031129, "grad_norm": 2.182061195373535, "learning_rate": 8.459489423833678e-05, "loss": 0.4117, "step": 6456 }, { "epoch": 8.40466926070039, "grad_norm": 1.8379067182540894, "learning_rate": 8.447338255299106e-05, "loss": 0.4104, "step": 6480 }, { "epoch": 8.43579766536965, "grad_norm": 1.4474197626113892, "learning_rate": 8.435148154034694e-05, "loss": 0.4142, "step": 6504 }, { "epoch": 8.46692607003891, "grad_norm": 2.309518575668335, "learning_rate": 8.422919257710104e-05, "loss": 0.4079, "step": 6528 }, { "epoch": 8.498054474708171, "grad_norm": 1.2606794834136963, "learning_rate": 8.410651704433146e-05, "loss": 0.4125, "step": 6552 }, { "epoch": 8.529182879377432, "grad_norm": 1.683693766593933, "learning_rate": 8.398345632748194e-05, "loss": 0.4132, "step": 6576 }, { "epoch": 8.560311284046692, "grad_norm": 2.342796802520752, "learning_rate": 8.386001181634642e-05, "loss": 0.4125, "step": 6600 }, { "epoch": 8.591439688715953, "grad_norm": 0.9687896370887756, "learning_rate": 8.373618490505315e-05, "loss": 0.4082, "step": 6624 }, { "epoch": 8.622568093385214, "grad_norm": 1.2769346237182617, "learning_rate": 8.361197699204911e-05, "loss": 0.413, "step": 6648 }, { "epoch": 8.653696498054474, "grad_norm": 1.4064596891403198, "learning_rate": 8.348738948008413e-05, "loss": 0.4172, "step": 6672 }, { "epoch": 8.684824902723735, "grad_norm": 1.0059700012207031, "learning_rate": 8.336242377619501e-05, "loss": 0.4132, "step": 6696 }, { "epoch": 8.715953307392995, "grad_norm": 1.5852705240249634, "learning_rate": 8.323708129168979e-05, "loss": 0.4129, "step": 6720 }, { "epoch": 8.747081712062258, "grad_norm": 1.879469394683838, "learning_rate": 8.31113634421316e-05, "loss": 0.4104, "step": 6744 }, { "epoch": 8.778210116731518, "grad_norm": 1.1461695432662964, "learning_rate": 8.298527164732283e-05, "loss": 0.4068, "step": 6768 }, { "epoch": 8.809338521400779, "grad_norm": 1.1254854202270508, "learning_rate": 8.285880733128907e-05, "loss": 0.4118, "step": 6792 }, { "epoch": 8.84046692607004, "grad_norm": 1.7840899229049683, "learning_rate": 8.273197192226294e-05, "loss": 0.4113, "step": 6816 }, { "epoch": 8.8715953307393, "grad_norm": 1.618880271911621, "learning_rate": 8.260476685266807e-05, "loss": 0.4065, "step": 6840 }, { "epoch": 8.90272373540856, "grad_norm": 1.2630411386489868, "learning_rate": 8.247719355910284e-05, "loss": 0.4029, "step": 6864 }, { "epoch": 8.933852140077821, "grad_norm": 1.138664960861206, "learning_rate": 8.234925348232421e-05, "loss": 0.4012, "step": 6888 }, { "epoch": 8.964980544747082, "grad_norm": 1.4435471296310425, "learning_rate": 8.222094806723143e-05, "loss": 0.4068, "step": 6912 }, { "epoch": 8.996108949416342, "grad_norm": 1.9499974250793457, "learning_rate": 8.209227876284972e-05, "loss": 0.4092, "step": 6936 }, { "epoch": 9.027237354085603, "grad_norm": 2.3621513843536377, "learning_rate": 8.196324702231389e-05, "loss": 0.4048, "step": 6960 }, { "epoch": 9.058365758754864, "grad_norm": 1.2890691757202148, "learning_rate": 8.183385430285197e-05, "loss": 0.3996, "step": 6984 }, { "epoch": 9.089494163424124, "grad_norm": 1.3257933855056763, "learning_rate": 8.170410206576872e-05, "loss": 0.3985, "step": 7008 }, { "epoch": 9.120622568093385, "grad_norm": 1.485418677330017, "learning_rate": 8.157399177642914e-05, "loss": 0.3994, "step": 7032 }, { "epoch": 9.151750972762645, "grad_norm": 1.115235686302185, "learning_rate": 8.144352490424187e-05, "loss": 0.3997, "step": 7056 }, { "epoch": 9.182879377431906, "grad_norm": 1.565184473991394, "learning_rate": 8.131270292264272e-05, "loss": 0.4059, "step": 7080 }, { "epoch": 9.214007782101167, "grad_norm": 1.3453902006149292, "learning_rate": 8.118152730907788e-05, "loss": 0.406, "step": 7104 }, { "epoch": 9.245136186770427, "grad_norm": 1.4093341827392578, "learning_rate": 8.104999954498734e-05, "loss": 0.4029, "step": 7128 }, { "epoch": 9.27626459143969, "grad_norm": 1.1250804662704468, "learning_rate": 8.091812111578812e-05, "loss": 0.4097, "step": 7152 }, { "epoch": 9.30739299610895, "grad_norm": 1.6016291379928589, "learning_rate": 8.07858935108575e-05, "loss": 0.4078, "step": 7176 }, { "epoch": 9.33852140077821, "grad_norm": 1.8599820137023926, "learning_rate": 8.065331822351618e-05, "loss": 0.4029, "step": 7200 }, { "epoch": 9.369649805447471, "grad_norm": 1.2994579076766968, "learning_rate": 8.052039675101143e-05, "loss": 0.4079, "step": 7224 }, { "epoch": 9.400778210116732, "grad_norm": 1.200239896774292, "learning_rate": 8.038713059450026e-05, "loss": 0.4017, "step": 7248 }, { "epoch": 9.431906614785992, "grad_norm": 3.8246068954467773, "learning_rate": 8.025352125903227e-05, "loss": 0.4006, "step": 7272 }, { "epoch": 9.463035019455253, "grad_norm": 1.4172035455703735, "learning_rate": 8.011957025353287e-05, "loss": 0.4028, "step": 7296 }, { "epoch": 9.494163424124514, "grad_norm": 2.0654618740081787, "learning_rate": 7.998527909078607e-05, "loss": 0.4014, "step": 7320 }, { "epoch": 9.525291828793774, "grad_norm": 1.3547816276550293, "learning_rate": 7.985064928741754e-05, "loss": 0.3981, "step": 7344 }, { "epoch": 9.556420233463035, "grad_norm": 1.3812025785446167, "learning_rate": 7.971568236387734e-05, "loss": 0.406, "step": 7368 }, { "epoch": 9.587548638132295, "grad_norm": 1.438240885734558, "learning_rate": 7.958037984442285e-05, "loss": 0.4011, "step": 7392 }, { "epoch": 9.618677042801556, "grad_norm": 1.7840272188186646, "learning_rate": 7.944474325710154e-05, "loss": 0.401, "step": 7416 }, { "epoch": 9.649805447470817, "grad_norm": 1.251658320426941, "learning_rate": 7.930877413373367e-05, "loss": 0.3969, "step": 7440 }, { "epoch": 9.680933852140077, "grad_norm": 2.252761125564575, "learning_rate": 7.917247400989505e-05, "loss": 0.4049, "step": 7464 }, { "epoch": 9.712062256809338, "grad_norm": 1.476012110710144, "learning_rate": 7.903584442489958e-05, "loss": 0.401, "step": 7488 }, { "epoch": 9.7431906614786, "grad_norm": 2.692723035812378, "learning_rate": 7.889888692178207e-05, "loss": 0.4017, "step": 7512 }, { "epoch": 9.77431906614786, "grad_norm": 3.0412638187408447, "learning_rate": 7.87616030472806e-05, "loss": 0.4093, "step": 7536 }, { "epoch": 9.805447470817121, "grad_norm": 1.527076244354248, "learning_rate": 7.862399435181917e-05, "loss": 0.3988, "step": 7560 }, { "epoch": 9.836575875486382, "grad_norm": 1.2038588523864746, "learning_rate": 7.848606238949021e-05, "loss": 0.4058, "step": 7584 }, { "epoch": 9.867704280155642, "grad_norm": 1.9050565958023071, "learning_rate": 7.834780871803693e-05, "loss": 0.3943, "step": 7608 }, { "epoch": 9.898832684824903, "grad_norm": 1.483185887336731, "learning_rate": 7.82092348988358e-05, "loss": 0.3992, "step": 7632 }, { "epoch": 9.929961089494164, "grad_norm": 1.5043606758117676, "learning_rate": 7.80703424968789e-05, "loss": 0.3989, "step": 7656 }, { "epoch": 9.961089494163424, "grad_norm": 1.194094181060791, "learning_rate": 7.793113308075626e-05, "loss": 0.4007, "step": 7680 }, { "epoch": 9.992217898832685, "grad_norm": 1.5360095500946045, "learning_rate": 7.77916082226381e-05, "loss": 0.395, "step": 7704 }, { "epoch": 10.023346303501945, "grad_norm": 1.1073459386825562, "learning_rate": 7.76517694982571e-05, "loss": 0.3989, "step": 7728 }, { "epoch": 10.054474708171206, "grad_norm": 1.4059771299362183, "learning_rate": 7.751161848689063e-05, "loss": 0.3964, "step": 7752 }, { "epoch": 10.085603112840467, "grad_norm": 1.8619714975357056, "learning_rate": 7.737115677134294e-05, "loss": 0.3964, "step": 7776 }, { "epoch": 10.116731517509727, "grad_norm": 0.8621863722801208, "learning_rate": 7.723038593792712e-05, "loss": 0.4019, "step": 7800 }, { "epoch": 10.147859922178988, "grad_norm": 1.542912483215332, "learning_rate": 7.708930757644739e-05, "loss": 0.3957, "step": 7824 }, { "epoch": 10.178988326848248, "grad_norm": 1.8078597784042358, "learning_rate": 7.694792328018106e-05, "loss": 0.3991, "step": 7848 }, { "epoch": 10.210116731517509, "grad_norm": 1.4210093021392822, "learning_rate": 7.680623464586048e-05, "loss": 0.3925, "step": 7872 }, { "epoch": 10.24124513618677, "grad_norm": 1.6985816955566406, "learning_rate": 7.66642432736551e-05, "loss": 0.3984, "step": 7896 }, { "epoch": 10.272373540856032, "grad_norm": 1.4291504621505737, "learning_rate": 7.652195076715332e-05, "loss": 0.4016, "step": 7920 }, { "epoch": 10.303501945525293, "grad_norm": 1.3934870958328247, "learning_rate": 7.637935873334448e-05, "loss": 0.3992, "step": 7944 }, { "epoch": 10.334630350194553, "grad_norm": 1.5841765403747559, "learning_rate": 7.623646878260062e-05, "loss": 0.3989, "step": 7968 }, { "epoch": 10.365758754863814, "grad_norm": 1.1344020366668701, "learning_rate": 7.60932825286583e-05, "loss": 0.3934, "step": 7992 }, { "epoch": 10.396887159533074, "grad_norm": 1.1252238750457764, "learning_rate": 7.594980158860043e-05, "loss": 0.3947, "step": 8016 }, { "epoch": 10.428015564202335, "grad_norm": 1.5455870628356934, "learning_rate": 7.580602758283796e-05, "loss": 0.3897, "step": 8040 }, { "epoch": 10.459143968871595, "grad_norm": 2.1351683139801025, "learning_rate": 7.566196213509163e-05, "loss": 0.3911, "step": 8064 }, { "epoch": 10.490272373540856, "grad_norm": 1.9759098291397095, "learning_rate": 7.551760687237351e-05, "loss": 0.3973, "step": 8088 }, { "epoch": 10.521400778210117, "grad_norm": 1.0132018327713013, "learning_rate": 7.537296342496884e-05, "loss": 0.3957, "step": 8112 }, { "epoch": 10.552529182879377, "grad_norm": 2.219759464263916, "learning_rate": 7.522803342641737e-05, "loss": 0.3887, "step": 8136 }, { "epoch": 10.583657587548638, "grad_norm": 2.361774206161499, "learning_rate": 7.508281851349512e-05, "loss": 0.3975, "step": 8160 }, { "epoch": 10.614785992217898, "grad_norm": 1.4584128856658936, "learning_rate": 7.493732032619578e-05, "loss": 0.4, "step": 8184 }, { "epoch": 10.645914396887159, "grad_norm": 1.375190019607544, "learning_rate": 7.47915405077122e-05, "loss": 0.4021, "step": 8208 }, { "epoch": 10.67704280155642, "grad_norm": 1.5501540899276733, "learning_rate": 7.464548070441785e-05, "loss": 0.3943, "step": 8232 }, { "epoch": 10.70817120622568, "grad_norm": 1.5805977582931519, "learning_rate": 7.449914256584828e-05, "loss": 0.3915, "step": 8256 }, { "epoch": 10.739299610894943, "grad_norm": 1.0127402544021606, "learning_rate": 7.435252774468237e-05, "loss": 0.3899, "step": 8280 }, { "epoch": 10.770428015564203, "grad_norm": 1.5114730596542358, "learning_rate": 7.420563789672375e-05, "loss": 0.3922, "step": 8304 }, { "epoch": 10.801556420233464, "grad_norm": 1.1805211305618286, "learning_rate": 7.405847468088209e-05, "loss": 0.3951, "step": 8328 }, { "epoch": 10.832684824902724, "grad_norm": 1.1337734460830688, "learning_rate": 7.391103975915436e-05, "loss": 0.3954, "step": 8352 }, { "epoch": 10.863813229571985, "grad_norm": 1.024134874343872, "learning_rate": 7.376333479660607e-05, "loss": 0.3829, "step": 8376 }, { "epoch": 10.894941634241246, "grad_norm": 1.2885181903839111, "learning_rate": 7.361536146135243e-05, "loss": 0.3904, "step": 8400 }, { "epoch": 10.926070038910506, "grad_norm": 1.2240935564041138, "learning_rate": 7.346712142453954e-05, "loss": 0.3904, "step": 8424 }, { "epoch": 10.957198443579767, "grad_norm": 1.2982319593429565, "learning_rate": 7.33186163603255e-05, "loss": 0.3944, "step": 8448 }, { "epoch": 10.988326848249027, "grad_norm": 1.0359567403793335, "learning_rate": 7.316984794586155e-05, "loss": 0.3989, "step": 8472 }, { "epoch": 11.019455252918288, "grad_norm": 2.0623931884765625, "learning_rate": 7.302081786127304e-05, "loss": 0.3853, "step": 8496 }, { "epoch": 11.050583657587548, "grad_norm": 1.2377070188522339, "learning_rate": 7.287152778964055e-05, "loss": 0.3913, "step": 8520 }, { "epoch": 11.081712062256809, "grad_norm": 1.016614556312561, "learning_rate": 7.272197941698084e-05, "loss": 0.3882, "step": 8544 }, { "epoch": 11.11284046692607, "grad_norm": 1.5649337768554688, "learning_rate": 7.257217443222777e-05, "loss": 0.378, "step": 8568 }, { "epoch": 11.14396887159533, "grad_norm": 1.4619653224945068, "learning_rate": 7.242211452721331e-05, "loss": 0.3874, "step": 8592 }, { "epoch": 11.17509727626459, "grad_norm": 1.6870439052581787, "learning_rate": 7.227180139664836e-05, "loss": 0.3867, "step": 8616 }, { "epoch": 11.206225680933851, "grad_norm": 1.0460180044174194, "learning_rate": 7.212123673810363e-05, "loss": 0.394, "step": 8640 }, { "epoch": 11.237354085603112, "grad_norm": 1.0444591045379639, "learning_rate": 7.19704222519905e-05, "loss": 0.3877, "step": 8664 }, { "epoch": 11.268482490272374, "grad_norm": 1.3924522399902344, "learning_rate": 7.181935964154182e-05, "loss": 0.3836, "step": 8688 }, { "epoch": 11.299610894941635, "grad_norm": 2.0957131385803223, "learning_rate": 7.166805061279257e-05, "loss": 0.3879, "step": 8712 }, { "epoch": 11.330739299610896, "grad_norm": 1.5147196054458618, "learning_rate": 7.151649687456074e-05, "loss": 0.3888, "step": 8736 }, { "epoch": 11.361867704280156, "grad_norm": 1.5958192348480225, "learning_rate": 7.136470013842791e-05, "loss": 0.3883, "step": 8760 }, { "epoch": 11.392996108949417, "grad_norm": 1.494354248046875, "learning_rate": 7.121266211872004e-05, "loss": 0.3847, "step": 8784 }, { "epoch": 11.424124513618677, "grad_norm": 1.3116648197174072, "learning_rate": 7.106038453248794e-05, "loss": 0.3913, "step": 8808 }, { "epoch": 11.455252918287938, "grad_norm": 2.947636842727661, "learning_rate": 7.090786909948809e-05, "loss": 0.3837, "step": 8832 }, { "epoch": 11.486381322957198, "grad_norm": 1.8480781316757202, "learning_rate": 7.075511754216304e-05, "loss": 0.3816, "step": 8856 }, { "epoch": 11.517509727626459, "grad_norm": 1.5083237886428833, "learning_rate": 7.060213158562205e-05, "loss": 0.3856, "step": 8880 }, { "epoch": 11.54863813229572, "grad_norm": 1.2127504348754883, "learning_rate": 7.044891295762154e-05, "loss": 0.3861, "step": 8904 }, { "epoch": 11.57976653696498, "grad_norm": 1.0090476274490356, "learning_rate": 7.029546338854569e-05, "loss": 0.3894, "step": 8928 }, { "epoch": 11.61089494163424, "grad_norm": 0.9990460872650146, "learning_rate": 7.014178461138676e-05, "loss": 0.388, "step": 8952 }, { "epoch": 11.642023346303501, "grad_norm": 1.7229726314544678, "learning_rate": 6.998787836172564e-05, "loss": 0.3883, "step": 8976 }, { "epoch": 11.673151750972762, "grad_norm": 1.0046260356903076, "learning_rate": 6.983374637771217e-05, "loss": 0.3853, "step": 9000 }, { "epoch": 11.704280155642023, "grad_norm": 1.4152393341064453, "learning_rate": 6.967939040004551e-05, "loss": 0.3829, "step": 9024 }, { "epoch": 11.735408560311285, "grad_norm": 1.2723467350006104, "learning_rate": 6.952481217195456e-05, "loss": 0.3879, "step": 9048 }, { "epoch": 11.766536964980546, "grad_norm": 1.7674216032028198, "learning_rate": 6.937001343917818e-05, "loss": 0.3909, "step": 9072 }, { "epoch": 11.797665369649806, "grad_norm": 1.4604827165603638, "learning_rate": 6.92149959499455e-05, "loss": 0.3878, "step": 9096 }, { "epoch": 11.828793774319067, "grad_norm": 1.5532753467559814, "learning_rate": 6.905976145495628e-05, "loss": 0.3884, "step": 9120 }, { "epoch": 11.859922178988327, "grad_norm": 1.1423866748809814, "learning_rate": 6.890431170736091e-05, "loss": 0.3861, "step": 9144 }, { "epoch": 11.891050583657588, "grad_norm": 1.350380778312683, "learning_rate": 6.874864846274087e-05, "loss": 0.3813, "step": 9168 }, { "epoch": 11.922178988326849, "grad_norm": 1.2758312225341797, "learning_rate": 6.85927734790887e-05, "loss": 0.3877, "step": 9192 }, { "epoch": 11.95330739299611, "grad_norm": 1.970986247062683, "learning_rate": 6.843668851678831e-05, "loss": 0.3828, "step": 9216 }, { "epoch": 11.98443579766537, "grad_norm": 1.340889811515808, "learning_rate": 6.828039533859489e-05, "loss": 0.3875, "step": 9240 }, { "epoch": 12.01556420233463, "grad_norm": 1.2335118055343628, "learning_rate": 6.812389570961525e-05, "loss": 0.3809, "step": 9264 }, { "epoch": 12.04669260700389, "grad_norm": 1.2043426036834717, "learning_rate": 6.796719139728777e-05, "loss": 0.3835, "step": 9288 }, { "epoch": 12.077821011673151, "grad_norm": 1.197809100151062, "learning_rate": 6.781028417136231e-05, "loss": 0.3792, "step": 9312 }, { "epoch": 12.108949416342412, "grad_norm": 1.2524584531784058, "learning_rate": 6.765317580388046e-05, "loss": 0.3842, "step": 9336 }, { "epoch": 12.140077821011673, "grad_norm": 1.082410454750061, "learning_rate": 6.749586806915535e-05, "loss": 0.3827, "step": 9360 }, { "epoch": 12.171206225680933, "grad_norm": 1.2853772640228271, "learning_rate": 6.733836274375176e-05, "loss": 0.3755, "step": 9384 }, { "epoch": 12.202334630350194, "grad_norm": 1.6849515438079834, "learning_rate": 6.718066160646585e-05, "loss": 0.38, "step": 9408 }, { "epoch": 12.233463035019454, "grad_norm": 2.0715172290802, "learning_rate": 6.702276643830531e-05, "loss": 0.3799, "step": 9432 }, { "epoch": 12.264591439688717, "grad_norm": 1.7511128187179565, "learning_rate": 6.686467902246909e-05, "loss": 0.3752, "step": 9456 }, { "epoch": 12.295719844357977, "grad_norm": 1.1407638788223267, "learning_rate": 6.670640114432724e-05, "loss": 0.3834, "step": 9480 }, { "epoch": 12.326848249027238, "grad_norm": 1.0695194005966187, "learning_rate": 6.654793459140089e-05, "loss": 0.3835, "step": 9504 }, { "epoch": 12.357976653696499, "grad_norm": 1.285834789276123, "learning_rate": 6.638928115334196e-05, "loss": 0.3904, "step": 9528 }, { "epoch": 12.38910505836576, "grad_norm": 1.508699893951416, "learning_rate": 6.623044262191293e-05, "loss": 0.3964, "step": 9552 }, { "epoch": 12.42023346303502, "grad_norm": 1.287642002105713, "learning_rate": 6.607142079096668e-05, "loss": 0.3819, "step": 9576 }, { "epoch": 12.45136186770428, "grad_norm": 2.893951892852783, "learning_rate": 6.591221745642621e-05, "loss": 0.3805, "step": 9600 }, { "epoch": 12.482490272373541, "grad_norm": 1.4402974843978882, "learning_rate": 6.575283441626433e-05, "loss": 0.376, "step": 9624 }, { "epoch": 12.513618677042802, "grad_norm": 1.156258225440979, "learning_rate": 6.559327347048331e-05, "loss": 0.3778, "step": 9648 }, { "epoch": 12.544747081712062, "grad_norm": 1.5183446407318115, "learning_rate": 6.543353642109469e-05, "loss": 0.382, "step": 9672 }, { "epoch": 12.575875486381323, "grad_norm": 1.611879825592041, "learning_rate": 6.527362507209879e-05, "loss": 0.3791, "step": 9696 }, { "epoch": 12.607003891050583, "grad_norm": 1.3625446557998657, "learning_rate": 6.511354122946443e-05, "loss": 0.379, "step": 9720 }, { "epoch": 12.638132295719844, "grad_norm": 1.2298206090927124, "learning_rate": 6.495328670110848e-05, "loss": 0.3773, "step": 9744 }, { "epoch": 12.669260700389104, "grad_norm": 1.0427093505859375, "learning_rate": 6.479286329687543e-05, "loss": 0.3752, "step": 9768 }, { "epoch": 12.700389105058365, "grad_norm": 1.6555167436599731, "learning_rate": 6.463227282851708e-05, "loss": 0.3771, "step": 9792 }, { "epoch": 12.731517509727626, "grad_norm": 1.3086024522781372, "learning_rate": 6.447151710967187e-05, "loss": 0.377, "step": 9816 }, { "epoch": 12.762645914396888, "grad_norm": 1.3003504276275635, "learning_rate": 6.431059795584453e-05, "loss": 0.3812, "step": 9840 }, { "epoch": 12.793774319066149, "grad_norm": 1.4847590923309326, "learning_rate": 6.414951718438561e-05, "loss": 0.3778, "step": 9864 }, { "epoch": 12.82490272373541, "grad_norm": 1.3426965475082397, "learning_rate": 6.398827661447084e-05, "loss": 0.3794, "step": 9888 }, { "epoch": 12.85603112840467, "grad_norm": 1.2530086040496826, "learning_rate": 6.382687806708067e-05, "loss": 0.3728, "step": 9912 }, { "epoch": 12.88715953307393, "grad_norm": 1.8029588460922241, "learning_rate": 6.366532336497968e-05, "loss": 0.3795, "step": 9936 }, { "epoch": 12.918287937743191, "grad_norm": 1.9585580825805664, "learning_rate": 6.350361433269599e-05, "loss": 0.3769, "step": 9960 }, { "epoch": 12.949416342412452, "grad_norm": 1.7418956756591797, "learning_rate": 6.334175279650062e-05, "loss": 0.3778, "step": 9984 }, { "epoch": 12.980544747081712, "grad_norm": 1.6264042854309082, "learning_rate": 6.317974058438697e-05, "loss": 0.3821, "step": 10008 }, { "epoch": 13.011673151750973, "grad_norm": 0.9489176869392395, "learning_rate": 6.301757952605007e-05, "loss": 0.374, "step": 10032 }, { "epoch": 13.042801556420233, "grad_norm": 2.183706045150757, "learning_rate": 6.285527145286594e-05, "loss": 0.3736, "step": 10056 }, { "epoch": 13.073929961089494, "grad_norm": 1.3998112678527832, "learning_rate": 6.269281819787095e-05, "loss": 0.3726, "step": 10080 }, { "epoch": 13.105058365758754, "grad_norm": 1.5030006170272827, "learning_rate": 6.253022159574108e-05, "loss": 0.3741, "step": 10104 }, { "epoch": 13.136186770428015, "grad_norm": 2.579502820968628, "learning_rate": 6.23674834827712e-05, "loss": 0.373, "step": 10128 }, { "epoch": 13.167315175097276, "grad_norm": 1.5349212884902954, "learning_rate": 6.220460569685437e-05, "loss": 0.3739, "step": 10152 }, { "epoch": 13.198443579766536, "grad_norm": 1.6323474645614624, "learning_rate": 6.204159007746103e-05, "loss": 0.3729, "step": 10176 }, { "epoch": 13.229571984435797, "grad_norm": 1.1729427576065063, "learning_rate": 6.187843846561824e-05, "loss": 0.3759, "step": 10200 }, { "epoch": 13.26070038910506, "grad_norm": 2.276395320892334, "learning_rate": 6.171515270388892e-05, "loss": 0.3657, "step": 10224 }, { "epoch": 13.29182879377432, "grad_norm": 0.9925207495689392, "learning_rate": 6.155173463635103e-05, "loss": 0.3724, "step": 10248 }, { "epoch": 13.32295719844358, "grad_norm": 0.9079545140266418, "learning_rate": 6.13881861085767e-05, "loss": 0.3675, "step": 10272 }, { "epoch": 13.354085603112841, "grad_norm": 2.5486135482788086, "learning_rate": 6.122450896761147e-05, "loss": 0.3684, "step": 10296 }, { "epoch": 13.385214007782102, "grad_norm": 1.5650309324264526, "learning_rate": 6.106070506195332e-05, "loss": 0.3765, "step": 10320 }, { "epoch": 13.416342412451362, "grad_norm": 0.9130122065544128, "learning_rate": 6.0896776241531916e-05, "loss": 0.3788, "step": 10344 }, { "epoch": 13.447470817120623, "grad_norm": 1.1227184534072876, "learning_rate": 6.073272435768761e-05, "loss": 0.3717, "step": 10368 }, { "epoch": 13.478599221789883, "grad_norm": 2.312488079071045, "learning_rate": 6.0568551263150606e-05, "loss": 0.3775, "step": 10392 }, { "epoch": 13.509727626459144, "grad_norm": 1.1797654628753662, "learning_rate": 6.040425881201998e-05, "loss": 0.3721, "step": 10416 }, { "epoch": 13.540856031128405, "grad_norm": 3.0446395874023438, "learning_rate": 6.0239848859742795e-05, "loss": 0.3698, "step": 10440 }, { "epoch": 13.571984435797665, "grad_norm": 1.0386089086532593, "learning_rate": 6.007532326309313e-05, "loss": 0.3724, "step": 10464 }, { "epoch": 13.603112840466926, "grad_norm": 1.4335585832595825, "learning_rate": 5.9910683880151064e-05, "loss": 0.3749, "step": 10488 }, { "epoch": 13.634241245136186, "grad_norm": 1.4243568181991577, "learning_rate": 5.974593257028176e-05, "loss": 0.3714, "step": 10512 }, { "epoch": 13.665369649805447, "grad_norm": 1.3887135982513428, "learning_rate": 5.958107119411441e-05, "loss": 0.3763, "step": 10536 }, { "epoch": 13.696498054474707, "grad_norm": 1.4939093589782715, "learning_rate": 5.941610161352128e-05, "loss": 0.3689, "step": 10560 }, { "epoch": 13.727626459143968, "grad_norm": 1.3950523138046265, "learning_rate": 5.925102569159661e-05, "loss": 0.3721, "step": 10584 }, { "epoch": 13.75875486381323, "grad_norm": 1.5457286834716797, "learning_rate": 5.9085845292635645e-05, "loss": 0.3736, "step": 10608 }, { "epoch": 13.789883268482491, "grad_norm": 1.7134722471237183, "learning_rate": 5.8920562282113534e-05, "loss": 0.3705, "step": 10632 }, { "epoch": 13.821011673151752, "grad_norm": 1.9264869689941406, "learning_rate": 5.875517852666428e-05, "loss": 0.3731, "step": 10656 }, { "epoch": 13.852140077821012, "grad_norm": 1.9957599639892578, "learning_rate": 5.8589695894059626e-05, "loss": 0.3727, "step": 10680 }, { "epoch": 13.883268482490273, "grad_norm": 1.0721269845962524, "learning_rate": 5.842411625318805e-05, "loss": 0.3717, "step": 10704 }, { "epoch": 13.914396887159533, "grad_norm": 1.339650273323059, "learning_rate": 5.825844147403353e-05, "loss": 0.3781, "step": 10728 }, { "epoch": 13.945525291828794, "grad_norm": 1.0256425142288208, "learning_rate": 5.809267342765456e-05, "loss": 0.3743, "step": 10752 }, { "epoch": 13.976653696498055, "grad_norm": 1.1623256206512451, "learning_rate": 5.792681398616293e-05, "loss": 0.372, "step": 10776 }, { "epoch": 14.007782101167315, "grad_norm": 2.1772332191467285, "learning_rate": 5.776086502270258e-05, "loss": 0.3768, "step": 10800 }, { "epoch": 14.038910505836576, "grad_norm": 1.4126263856887817, "learning_rate": 5.759482841142848e-05, "loss": 0.3689, "step": 10824 }, { "epoch": 14.070038910505836, "grad_norm": 1.1903387308120728, "learning_rate": 5.742870602748547e-05, "loss": 0.3667, "step": 10848 }, { "epoch": 14.101167315175097, "grad_norm": 1.1915792226791382, "learning_rate": 5.7262499746987094e-05, "loss": 0.372, "step": 10872 }, { "epoch": 14.132295719844358, "grad_norm": 1.3118023872375488, "learning_rate": 5.7096211446994344e-05, "loss": 0.3673, "step": 10896 }, { "epoch": 14.163424124513618, "grad_norm": 1.0034823417663574, "learning_rate": 5.692984300549451e-05, "loss": 0.3743, "step": 10920 }, { "epoch": 14.194552529182879, "grad_norm": 1.1173166036605835, "learning_rate": 5.6763396301379976e-05, "loss": 0.3722, "step": 10944 }, { "epoch": 14.22568093385214, "grad_norm": 1.1479343175888062, "learning_rate": 5.659687321442701e-05, "loss": 0.3691, "step": 10968 }, { "epoch": 14.2568093385214, "grad_norm": 1.3507132530212402, "learning_rate": 5.6430275625274456e-05, "loss": 0.3655, "step": 10992 }, { "epoch": 14.287937743190662, "grad_norm": 1.1012446880340576, "learning_rate": 5.626360541540261e-05, "loss": 0.366, "step": 11016 }, { "epoch": 14.319066147859923, "grad_norm": 1.2122224569320679, "learning_rate": 5.609686446711191e-05, "loss": 0.3608, "step": 11040 }, { "epoch": 14.350194552529183, "grad_norm": 0.9675916433334351, "learning_rate": 5.593005466350164e-05, "loss": 0.3677, "step": 11064 }, { "epoch": 14.381322957198444, "grad_norm": 1.0538902282714844, "learning_rate": 5.576317788844875e-05, "loss": 0.369, "step": 11088 }, { "epoch": 14.412451361867705, "grad_norm": 2.077829122543335, "learning_rate": 5.55962360265865e-05, "loss": 0.3642, "step": 11112 }, { "epoch": 14.443579766536965, "grad_norm": 1.2885998487472534, "learning_rate": 5.542923096328325e-05, "loss": 0.3685, "step": 11136 }, { "epoch": 14.474708171206226, "grad_norm": 2.953463077545166, "learning_rate": 5.526216458462111e-05, "loss": 0.3683, "step": 11160 }, { "epoch": 14.505836575875486, "grad_norm": 1.336449384689331, "learning_rate": 5.509503877737465e-05, "loss": 0.3627, "step": 11184 }, { "epoch": 14.536964980544747, "grad_norm": 4.623841762542725, "learning_rate": 5.4927855428989624e-05, "loss": 0.3738, "step": 11208 }, { "epoch": 14.568093385214008, "grad_norm": 1.4652122259140015, "learning_rate": 5.476061642756161e-05, "loss": 0.3722, "step": 11232 }, { "epoch": 14.599221789883268, "grad_norm": 1.3524249792099, "learning_rate": 5.4593323661814686e-05, "loss": 0.3586, "step": 11256 }, { "epoch": 14.630350194552529, "grad_norm": 1.833708643913269, "learning_rate": 5.442597902108019e-05, "loss": 0.3568, "step": 11280 }, { "epoch": 14.66147859922179, "grad_norm": 1.4893455505371094, "learning_rate": 5.425858439527525e-05, "loss": 0.3698, "step": 11304 }, { "epoch": 14.69260700389105, "grad_norm": 1.7463867664337158, "learning_rate": 5.409114167488152e-05, "loss": 0.3726, "step": 11328 }, { "epoch": 14.72373540856031, "grad_norm": 1.5364842414855957, "learning_rate": 5.392365275092383e-05, "loss": 0.3656, "step": 11352 }, { "epoch": 14.754863813229573, "grad_norm": 1.4161092042922974, "learning_rate": 5.37561195149488e-05, "loss": 0.3636, "step": 11376 }, { "epoch": 14.785992217898833, "grad_norm": 1.125667691230774, "learning_rate": 5.358854385900348e-05, "loss": 0.3636, "step": 11400 }, { "epoch": 14.817120622568094, "grad_norm": 1.9482998847961426, "learning_rate": 5.342092767561402e-05, "loss": 0.3646, "step": 11424 }, { "epoch": 14.848249027237355, "grad_norm": 1.8707369565963745, "learning_rate": 5.325327285776425e-05, "loss": 0.3657, "step": 11448 }, { "epoch": 14.879377431906615, "grad_norm": 1.7567267417907715, "learning_rate": 5.308558129887431e-05, "loss": 0.3628, "step": 11472 }, { "epoch": 14.910505836575876, "grad_norm": 1.5714308023452759, "learning_rate": 5.2917854892779304e-05, "loss": 0.3667, "step": 11496 }, { "epoch": 14.941634241245136, "grad_norm": 2.1905322074890137, "learning_rate": 5.275009553370788e-05, "loss": 0.371, "step": 11520 }, { "epoch": 14.972762645914397, "grad_norm": 2.8119211196899414, "learning_rate": 5.2582305116260835e-05, "loss": 0.3704, "step": 11544 }, { "epoch": 15.003891050583658, "grad_norm": 1.1872552633285522, "learning_rate": 5.241448553538968e-05, "loss": 0.3755, "step": 11568 }, { "epoch": 15.035019455252918, "grad_norm": 1.4244314432144165, "learning_rate": 5.224663868637538e-05, "loss": 0.3599, "step": 11592 }, { "epoch": 15.066147859922179, "grad_norm": 1.2808740139007568, "learning_rate": 5.2078766464806796e-05, "loss": 0.3683, "step": 11616 }, { "epoch": 15.09727626459144, "grad_norm": 1.0528135299682617, "learning_rate": 5.191087076655935e-05, "loss": 0.3598, "step": 11640 }, { "epoch": 15.1284046692607, "grad_norm": 1.8377207517623901, "learning_rate": 5.174295348777357e-05, "loss": 0.3553, "step": 11664 }, { "epoch": 15.15953307392996, "grad_norm": 1.7853907346725464, "learning_rate": 5.1575016524833754e-05, "loss": 0.3614, "step": 11688 }, { "epoch": 15.190661478599221, "grad_norm": 1.7978260517120361, "learning_rate": 5.140706177434645e-05, "loss": 0.3608, "step": 11712 }, { "epoch": 15.221789883268482, "grad_norm": 1.1315481662750244, "learning_rate": 5.123909113311915e-05, "loss": 0.3635, "step": 11736 }, { "epoch": 15.252918287937742, "grad_norm": 1.6177383661270142, "learning_rate": 5.1071106498138764e-05, "loss": 0.3624, "step": 11760 }, { "epoch": 15.284046692607005, "grad_norm": 1.2278454303741455, "learning_rate": 5.0903109766550264e-05, "loss": 0.3658, "step": 11784 }, { "epoch": 15.315175097276265, "grad_norm": 1.3733409643173218, "learning_rate": 5.073510283563523e-05, "loss": 0.3612, "step": 11808 }, { "epoch": 15.346303501945526, "grad_norm": 1.3404691219329834, "learning_rate": 5.05670876027904e-05, "loss": 0.3629, "step": 11832 }, { "epoch": 15.377431906614786, "grad_norm": 1.2201738357543945, "learning_rate": 5.039906596550633e-05, "loss": 0.3666, "step": 11856 }, { "epoch": 15.408560311284047, "grad_norm": 2.0148181915283203, "learning_rate": 5.023103982134586e-05, "loss": 0.3665, "step": 11880 }, { "epoch": 15.439688715953308, "grad_norm": 1.249961256980896, "learning_rate": 5.006301106792274e-05, "loss": 0.3647, "step": 11904 }, { "epoch": 15.470817120622568, "grad_norm": 1.5822800397872925, "learning_rate": 4.989498160288019e-05, "loss": 0.3659, "step": 11928 }, { "epoch": 15.501945525291829, "grad_norm": 1.1686407327651978, "learning_rate": 4.9726953323869456e-05, "loss": 0.363, "step": 11952 }, { "epoch": 15.53307392996109, "grad_norm": 1.8801552057266235, "learning_rate": 4.9558928128528414e-05, "loss": 0.3623, "step": 11976 }, { "epoch": 15.56420233463035, "grad_norm": 1.2335692644119263, "learning_rate": 4.9390907914460105e-05, "loss": 0.3664, "step": 12000 }, { "epoch": 15.59533073929961, "grad_norm": 1.496955156326294, "learning_rate": 4.9222894579211276e-05, "loss": 0.3644, "step": 12024 }, { "epoch": 15.626459143968871, "grad_norm": 1.6293377876281738, "learning_rate": 4.905489002025106e-05, "loss": 0.3605, "step": 12048 }, { "epoch": 15.657587548638132, "grad_norm": 1.2555320262908936, "learning_rate": 4.8886896134949415e-05, "loss": 0.3594, "step": 12072 }, { "epoch": 15.688715953307392, "grad_norm": 1.2741057872772217, "learning_rate": 4.871891482055575e-05, "loss": 0.3622, "step": 12096 }, { "epoch": 15.719844357976653, "grad_norm": 2.100410223007202, "learning_rate": 4.855094797417758e-05, "loss": 0.3612, "step": 12120 }, { "epoch": 15.750972762645915, "grad_norm": 0.88619464635849, "learning_rate": 4.8382997492758936e-05, "loss": 0.3589, "step": 12144 }, { "epoch": 15.782101167315176, "grad_norm": 1.5951071977615356, "learning_rate": 4.8215065273059085e-05, "loss": 0.3613, "step": 12168 }, { "epoch": 15.813229571984436, "grad_norm": 1.1034135818481445, "learning_rate": 4.8047153211631e-05, "loss": 0.3609, "step": 12192 }, { "epoch": 15.844357976653697, "grad_norm": 1.9069421291351318, "learning_rate": 4.787926320480009e-05, "loss": 0.3617, "step": 12216 }, { "epoch": 15.875486381322958, "grad_norm": 2.139292001724243, "learning_rate": 4.7711397148642583e-05, "loss": 0.3582, "step": 12240 }, { "epoch": 15.906614785992218, "grad_norm": 1.134293556213379, "learning_rate": 4.7543556938964275e-05, "loss": 0.361, "step": 12264 }, { "epoch": 15.937743190661479, "grad_norm": 1.2520484924316406, "learning_rate": 4.7375744471279084e-05, "loss": 0.3613, "step": 12288 }, { "epoch": 15.96887159533074, "grad_norm": 1.2001314163208008, "learning_rate": 4.720796164078755e-05, "loss": 0.363, "step": 12312 }, { "epoch": 16.0, "grad_norm": 1.0038580894470215, "learning_rate": 4.7040210342355584e-05, "loss": 0.3566, "step": 12336 }, { "epoch": 16.03112840466926, "grad_norm": 1.0586698055267334, "learning_rate": 4.6872492470492914e-05, "loss": 0.3554, "step": 12360 }, { "epoch": 16.06225680933852, "grad_norm": 1.4238923788070679, "learning_rate": 4.670480991933182e-05, "loss": 0.3598, "step": 12384 }, { "epoch": 16.09338521400778, "grad_norm": 1.7448209524154663, "learning_rate": 4.6537164582605674e-05, "loss": 0.3523, "step": 12408 }, { "epoch": 16.124513618677042, "grad_norm": 0.9236373901367188, "learning_rate": 4.6369558353627517e-05, "loss": 0.3556, "step": 12432 }, { "epoch": 16.155642023346303, "grad_norm": 1.2013592720031738, "learning_rate": 4.6201993125268804e-05, "loss": 0.352, "step": 12456 }, { "epoch": 16.186770428015564, "grad_norm": 1.267756700515747, "learning_rate": 4.603447078993788e-05, "loss": 0.3578, "step": 12480 }, { "epoch": 16.217898832684824, "grad_norm": 1.0369305610656738, "learning_rate": 4.586699323955871e-05, "loss": 0.3476, "step": 12504 }, { "epoch": 16.249027237354085, "grad_norm": 1.4075908660888672, "learning_rate": 4.569956236554945e-05, "loss": 0.3544, "step": 12528 }, { "epoch": 16.280155642023345, "grad_norm": 1.3998584747314453, "learning_rate": 4.5532180058801145e-05, "loss": 0.3596, "step": 12552 }, { "epoch": 16.311284046692606, "grad_norm": 1.5231702327728271, "learning_rate": 4.5364848209656336e-05, "loss": 0.3542, "step": 12576 }, { "epoch": 16.342412451361866, "grad_norm": 1.283345103263855, "learning_rate": 4.5197568707887675e-05, "loss": 0.3526, "step": 12600 }, { "epoch": 16.373540856031127, "grad_norm": 1.3944894075393677, "learning_rate": 4.503034344267671e-05, "loss": 0.357, "step": 12624 }, { "epoch": 16.404669260700388, "grad_norm": 1.9900680780410767, "learning_rate": 4.486317430259238e-05, "loss": 0.3603, "step": 12648 }, { "epoch": 16.43579766536965, "grad_norm": 0.9823328852653503, "learning_rate": 4.4696063175569804e-05, "loss": 0.3545, "step": 12672 }, { "epoch": 16.46692607003891, "grad_norm": 1.634529709815979, "learning_rate": 4.452901194888897e-05, "loss": 0.3543, "step": 12696 }, { "epoch": 16.49805447470817, "grad_norm": 1.4010380506515503, "learning_rate": 4.436202250915329e-05, "loss": 0.3524, "step": 12720 }, { "epoch": 16.529182879377434, "grad_norm": 1.239943504333496, "learning_rate": 4.419509674226846e-05, "loss": 0.3648, "step": 12744 }, { "epoch": 16.560311284046694, "grad_norm": 3.315246820449829, "learning_rate": 4.4028236533421016e-05, "loss": 0.3624, "step": 12768 }, { "epoch": 16.591439688715955, "grad_norm": 1.0445722341537476, "learning_rate": 4.3861443767057205e-05, "loss": 0.3536, "step": 12792 }, { "epoch": 16.622568093385215, "grad_norm": 1.154893398284912, "learning_rate": 4.369472032686149e-05, "loss": 0.3608, "step": 12816 }, { "epoch": 16.653696498054476, "grad_norm": 2.0033769607543945, "learning_rate": 4.352806809573547e-05, "loss": 0.3511, "step": 12840 }, { "epoch": 16.684824902723737, "grad_norm": 1.4693876504898071, "learning_rate": 4.336148895577656e-05, "loss": 0.3531, "step": 12864 }, { "epoch": 16.715953307392997, "grad_norm": 1.8765549659729004, "learning_rate": 4.319498478825663e-05, "loss": 0.3563, "step": 12888 }, { "epoch": 16.747081712062258, "grad_norm": 1.6893914937973022, "learning_rate": 4.302855747360092e-05, "loss": 0.3579, "step": 12912 }, { "epoch": 16.77821011673152, "grad_norm": 1.183452844619751, "learning_rate": 4.286220889136668e-05, "loss": 0.3637, "step": 12936 }, { "epoch": 16.80933852140078, "grad_norm": 1.102815866470337, "learning_rate": 4.269594092022203e-05, "loss": 0.3561, "step": 12960 }, { "epoch": 16.84046692607004, "grad_norm": 0.9764434695243835, "learning_rate": 4.252975543792468e-05, "loss": 0.3581, "step": 12984 }, { "epoch": 16.8715953307393, "grad_norm": 2.3779425621032715, "learning_rate": 4.2363654321300735e-05, "loss": 0.3531, "step": 13008 }, { "epoch": 16.90272373540856, "grad_norm": 1.463118076324463, "learning_rate": 4.219763944622356e-05, "loss": 0.3562, "step": 13032 }, { "epoch": 16.93385214007782, "grad_norm": 1.756101369857788, "learning_rate": 4.203171268759248e-05, "loss": 0.3566, "step": 13056 }, { "epoch": 16.964980544747082, "grad_norm": 1.5917153358459473, "learning_rate": 4.1865875919311726e-05, "loss": 0.3504, "step": 13080 }, { "epoch": 16.996108949416342, "grad_norm": 2.404031753540039, "learning_rate": 4.170013101426917e-05, "loss": 0.3581, "step": 13104 }, { "epoch": 17.027237354085603, "grad_norm": 1.3285900354385376, "learning_rate": 4.153447984431527e-05, "loss": 0.3499, "step": 13128 }, { "epoch": 17.058365758754864, "grad_norm": 1.0520793199539185, "learning_rate": 4.136892428024187e-05, "loss": 0.3547, "step": 13152 }, { "epoch": 17.089494163424124, "grad_norm": 1.0784560441970825, "learning_rate": 4.120346619176102e-05, "loss": 0.3525, "step": 13176 }, { "epoch": 17.120622568093385, "grad_norm": 1.9099761247634888, "learning_rate": 4.103810744748403e-05, "loss": 0.3531, "step": 13200 }, { "epoch": 17.151750972762645, "grad_norm": 1.4144366979599, "learning_rate": 4.0872849914900175e-05, "loss": 0.3431, "step": 13224 }, { "epoch": 17.182879377431906, "grad_norm": 1.078682541847229, "learning_rate": 4.070769546035571e-05, "loss": 0.3563, "step": 13248 }, { "epoch": 17.214007782101167, "grad_norm": 2.5183982849121094, "learning_rate": 4.054264594903281e-05, "loss": 0.3534, "step": 13272 }, { "epoch": 17.245136186770427, "grad_norm": 1.3110893964767456, "learning_rate": 4.037770324492841e-05, "loss": 0.351, "step": 13296 }, { "epoch": 17.276264591439688, "grad_norm": 1.4684545993804932, "learning_rate": 4.021286921083326e-05, "loss": 0.3525, "step": 13320 }, { "epoch": 17.30739299610895, "grad_norm": 1.3898323774337769, "learning_rate": 4.004814570831078e-05, "loss": 0.353, "step": 13344 }, { "epoch": 17.33852140077821, "grad_norm": 1.7565838098526, "learning_rate": 3.9883534597676177e-05, "loss": 0.3566, "step": 13368 }, { "epoch": 17.36964980544747, "grad_norm": 1.3672667741775513, "learning_rate": 3.971903773797528e-05, "loss": 0.3502, "step": 13392 }, { "epoch": 17.40077821011673, "grad_norm": 1.2242878675460815, "learning_rate": 3.955465698696363e-05, "loss": 0.3518, "step": 13416 }, { "epoch": 17.43190661478599, "grad_norm": 2.410991907119751, "learning_rate": 3.939039420108556e-05, "loss": 0.3503, "step": 13440 }, { "epoch": 17.46303501945525, "grad_norm": 1.4282727241516113, "learning_rate": 3.922625123545305e-05, "loss": 0.3488, "step": 13464 }, { "epoch": 17.494163424124515, "grad_norm": 1.5992825031280518, "learning_rate": 3.906222994382495e-05, "loss": 0.3567, "step": 13488 }, { "epoch": 17.525291828793776, "grad_norm": 2.398169994354248, "learning_rate": 3.889833217858594e-05, "loss": 0.3542, "step": 13512 }, { "epoch": 17.556420233463037, "grad_norm": 1.140195608139038, "learning_rate": 3.873455979072569e-05, "loss": 0.3493, "step": 13536 }, { "epoch": 17.587548638132297, "grad_norm": 1.305156946182251, "learning_rate": 3.8570914629817886e-05, "loss": 0.3504, "step": 13560 }, { "epoch": 17.618677042801558, "grad_norm": 9.382534980773926, "learning_rate": 3.840739854399934e-05, "loss": 0.3534, "step": 13584 }, { "epoch": 17.64980544747082, "grad_norm": 1.1403177976608276, "learning_rate": 3.824401337994923e-05, "loss": 0.3461, "step": 13608 }, { "epoch": 17.68093385214008, "grad_norm": 2.1274640560150146, "learning_rate": 3.808076098286806e-05, "loss": 0.3521, "step": 13632 }, { "epoch": 17.71206225680934, "grad_norm": 1.9969298839569092, "learning_rate": 3.7917643196457e-05, "loss": 0.3521, "step": 13656 }, { "epoch": 17.7431906614786, "grad_norm": 1.2433438301086426, "learning_rate": 3.775466186289693e-05, "loss": 0.3565, "step": 13680 }, { "epoch": 17.77431906614786, "grad_norm": 1.7864729166030884, "learning_rate": 3.7591818822827745e-05, "loss": 0.3508, "step": 13704 }, { "epoch": 17.80544747081712, "grad_norm": 1.7596447467803955, "learning_rate": 3.7429115915327484e-05, "loss": 0.3533, "step": 13728 }, { "epoch": 17.836575875486382, "grad_norm": 1.7605047225952148, "learning_rate": 3.726655497789156e-05, "loss": 0.3553, "step": 13752 }, { "epoch": 17.867704280155642, "grad_norm": 1.5380836725234985, "learning_rate": 3.710413784641212e-05, "loss": 0.3526, "step": 13776 }, { "epoch": 17.898832684824903, "grad_norm": 1.448866844177246, "learning_rate": 3.694186635515714e-05, "loss": 0.3516, "step": 13800 }, { "epoch": 17.929961089494164, "grad_norm": 1.527550458908081, "learning_rate": 3.677974233674983e-05, "loss": 0.3438, "step": 13824 }, { "epoch": 17.961089494163424, "grad_norm": 1.3250521421432495, "learning_rate": 3.661776762214797e-05, "loss": 0.3551, "step": 13848 }, { "epoch": 17.992217898832685, "grad_norm": 1.4741333723068237, "learning_rate": 3.6455944040623075e-05, "loss": 0.3529, "step": 13872 }, { "epoch": 18.023346303501945, "grad_norm": 2.2234058380126953, "learning_rate": 3.6294273419739874e-05, "loss": 0.3486, "step": 13896 }, { "epoch": 18.054474708171206, "grad_norm": 1.4099419116973877, "learning_rate": 3.613275758533561e-05, "loss": 0.3473, "step": 13920 }, { "epoch": 18.085603112840467, "grad_norm": 1.9094316959381104, "learning_rate": 3.5971398361499466e-05, "loss": 0.3548, "step": 13944 }, { "epoch": 18.116731517509727, "grad_norm": 1.2845815420150757, "learning_rate": 3.581019757055188e-05, "loss": 0.345, "step": 13968 }, { "epoch": 18.147859922178988, "grad_norm": 2.0491998195648193, "learning_rate": 3.564915703302407e-05, "loss": 0.3474, "step": 13992 }, { "epoch": 18.17898832684825, "grad_norm": 1.3620078563690186, "learning_rate": 3.5488278567637426e-05, "loss": 0.3452, "step": 14016 }, { "epoch": 18.21011673151751, "grad_norm": 4.295355796813965, "learning_rate": 3.53275639912829e-05, "loss": 0.3474, "step": 14040 }, { "epoch": 18.24124513618677, "grad_norm": 2.150200366973877, "learning_rate": 3.516701511900062e-05, "loss": 0.3465, "step": 14064 }, { "epoch": 18.27237354085603, "grad_norm": 1.407614827156067, "learning_rate": 3.500663376395927e-05, "loss": 0.3453, "step": 14088 }, { "epoch": 18.30350194552529, "grad_norm": 1.2066164016723633, "learning_rate": 3.484642173743575e-05, "loss": 0.3477, "step": 14112 }, { "epoch": 18.33463035019455, "grad_norm": 1.1473839282989502, "learning_rate": 3.4686380848794544e-05, "loss": 0.3448, "step": 14136 }, { "epoch": 18.365758754863812, "grad_norm": 2.0838565826416016, "learning_rate": 3.452651290546742e-05, "loss": 0.3451, "step": 14160 }, { "epoch": 18.396887159533073, "grad_norm": 1.3917421102523804, "learning_rate": 3.436681971293301e-05, "loss": 0.3442, "step": 14184 }, { "epoch": 18.428015564202333, "grad_norm": 1.2915924787521362, "learning_rate": 3.420730307469632e-05, "loss": 0.3409, "step": 14208 }, { "epoch": 18.459143968871594, "grad_norm": 2.337096691131592, "learning_rate": 3.404796479226852e-05, "loss": 0.3471, "step": 14232 }, { "epoch": 18.490272373540854, "grad_norm": 1.732359528541565, "learning_rate": 3.3888806665146374e-05, "loss": 0.3478, "step": 14256 }, { "epoch": 18.52140077821012, "grad_norm": 1.1314399242401123, "learning_rate": 3.3729830490792166e-05, "loss": 0.345, "step": 14280 }, { "epoch": 18.55252918287938, "grad_norm": 1.5127285718917847, "learning_rate": 3.357103806461328e-05, "loss": 0.3405, "step": 14304 }, { "epoch": 18.58365758754864, "grad_norm": 1.306648850440979, "learning_rate": 3.3412431179941847e-05, "loss": 0.3443, "step": 14328 }, { "epoch": 18.6147859922179, "grad_norm": 1.189726710319519, "learning_rate": 3.3254011628014656e-05, "loss": 0.3447, "step": 14352 }, { "epoch": 18.64591439688716, "grad_norm": 1.2058913707733154, "learning_rate": 3.309578119795278e-05, "loss": 0.347, "step": 14376 }, { "epoch": 18.67704280155642, "grad_norm": 1.702572226524353, "learning_rate": 3.293774167674149e-05, "loss": 0.3496, "step": 14400 }, { "epoch": 18.708171206225682, "grad_norm": 1.8515872955322266, "learning_rate": 3.277989484920996e-05, "loss": 0.344, "step": 14424 }, { "epoch": 18.739299610894943, "grad_norm": 1.8190243244171143, "learning_rate": 3.26222424980112e-05, "loss": 0.3499, "step": 14448 }, { "epoch": 18.770428015564203, "grad_norm": 1.261648416519165, "learning_rate": 3.246478640360191e-05, "loss": 0.345, "step": 14472 }, { "epoch": 18.801556420233464, "grad_norm": 1.3052914142608643, "learning_rate": 3.2307528344222296e-05, "loss": 0.3505, "step": 14496 }, { "epoch": 18.832684824902724, "grad_norm": 1.5217386484146118, "learning_rate": 3.215047009587609e-05, "loss": 0.3507, "step": 14520 }, { "epoch": 18.863813229571985, "grad_norm": 1.2934740781784058, "learning_rate": 3.1993613432310384e-05, "loss": 0.3459, "step": 14544 }, { "epoch": 18.894941634241246, "grad_norm": 1.5978559255599976, "learning_rate": 3.183696012499574e-05, "loss": 0.3464, "step": 14568 }, { "epoch": 18.926070038910506, "grad_norm": 1.2306820154190063, "learning_rate": 3.168051194310609e-05, "loss": 0.3446, "step": 14592 }, { "epoch": 18.957198443579767, "grad_norm": 1.1488240957260132, "learning_rate": 3.152427065349867e-05, "loss": 0.3475, "step": 14616 }, { "epoch": 18.988326848249027, "grad_norm": 3.1832704544067383, "learning_rate": 3.1368238020694316e-05, "loss": 0.3437, "step": 14640 }, { "epoch": 19.019455252918288, "grad_norm": 2.3371617794036865, "learning_rate": 3.121241580685727e-05, "loss": 0.3465, "step": 14664 }, { "epoch": 19.05058365758755, "grad_norm": 2.816099166870117, "learning_rate": 3.1056805771775436e-05, "loss": 0.3435, "step": 14688 }, { "epoch": 19.08171206225681, "grad_norm": 1.3421522378921509, "learning_rate": 3.090140967284046e-05, "loss": 0.3418, "step": 14712 }, { "epoch": 19.11284046692607, "grad_norm": 1.8488672971725464, "learning_rate": 3.07462292650279e-05, "loss": 0.348, "step": 14736 }, { "epoch": 19.14396887159533, "grad_norm": 1.2293037176132202, "learning_rate": 3.05912663008774e-05, "loss": 0.342, "step": 14760 }, { "epoch": 19.17509727626459, "grad_norm": 1.7620015144348145, "learning_rate": 3.043652253047281e-05, "loss": 0.3454, "step": 14784 }, { "epoch": 19.20622568093385, "grad_norm": 1.6479402780532837, "learning_rate": 3.0281999701422637e-05, "loss": 0.3427, "step": 14808 }, { "epoch": 19.237354085603112, "grad_norm": 1.5058902502059937, "learning_rate": 3.012769955884005e-05, "loss": 0.3328, "step": 14832 }, { "epoch": 19.268482490272373, "grad_norm": 1.6616445779800415, "learning_rate": 2.9973623845323347e-05, "loss": 0.3441, "step": 14856 }, { "epoch": 19.299610894941633, "grad_norm": 1.5390020608901978, "learning_rate": 2.9819774300936255e-05, "loss": 0.3434, "step": 14880 }, { "epoch": 19.330739299610894, "grad_norm": 1.7172026634216309, "learning_rate": 2.9666152663188172e-05, "loss": 0.3439, "step": 14904 }, { "epoch": 19.361867704280154, "grad_norm": 1.134320855140686, "learning_rate": 2.9512760667014682e-05, "loss": 0.3431, "step": 14928 }, { "epoch": 19.392996108949415, "grad_norm": 4.418805122375488, "learning_rate": 2.935960004475784e-05, "loss": 0.344, "step": 14952 }, { "epoch": 19.424124513618676, "grad_norm": 1.3951141834259033, "learning_rate": 2.920667252614674e-05, "loss": 0.3334, "step": 14976 }, { "epoch": 19.455252918287936, "grad_norm": 2.0081377029418945, "learning_rate": 2.9053979838277834e-05, "loss": 0.3413, "step": 15000 }, { "epoch": 19.486381322957197, "grad_norm": 1.0862860679626465, "learning_rate": 2.890152370559552e-05, "loss": 0.3406, "step": 15024 }, { "epoch": 19.51750972762646, "grad_norm": 1.3487762212753296, "learning_rate": 2.8749305849872686e-05, "loss": 0.3335, "step": 15048 }, { "epoch": 19.54863813229572, "grad_norm": 1.122753381729126, "learning_rate": 2.8597327990191146e-05, "loss": 0.3491, "step": 15072 }, { "epoch": 19.579766536964982, "grad_norm": 1.518355131149292, "learning_rate": 2.844559184292239e-05, "loss": 0.3405, "step": 15096 }, { "epoch": 19.610894941634243, "grad_norm": 1.0469350814819336, "learning_rate": 2.829409912170806e-05, "loss": 0.3395, "step": 15120 }, { "epoch": 19.642023346303503, "grad_norm": 1.915490984916687, "learning_rate": 2.814285153744064e-05, "loss": 0.3426, "step": 15144 }, { "epoch": 19.673151750972764, "grad_norm": 1.477184772491455, "learning_rate": 2.7991850798244197e-05, "loss": 0.3463, "step": 15168 }, { "epoch": 19.704280155642024, "grad_norm": 1.3598774671554565, "learning_rate": 2.7841098609454976e-05, "loss": 0.3454, "step": 15192 }, { "epoch": 19.735408560311285, "grad_norm": 2.6406991481781006, "learning_rate": 2.769059667360227e-05, "loss": 0.3422, "step": 15216 }, { "epoch": 19.766536964980546, "grad_norm": 1.2698395252227783, "learning_rate": 2.754034669038905e-05, "loss": 0.3473, "step": 15240 }, { "epoch": 19.797665369649806, "grad_norm": 1.3700004816055298, "learning_rate": 2.7390350356672934e-05, "loss": 0.3434, "step": 15264 }, { "epoch": 19.828793774319067, "grad_norm": 1.1726247072219849, "learning_rate": 2.7240609366446845e-05, "loss": 0.3421, "step": 15288 }, { "epoch": 19.859922178988327, "grad_norm": 1.5183639526367188, "learning_rate": 2.709112541082e-05, "loss": 0.3418, "step": 15312 }, { "epoch": 19.891050583657588, "grad_norm": 1.1311919689178467, "learning_rate": 2.6941900177998824e-05, "loss": 0.3411, "step": 15336 }, { "epoch": 19.92217898832685, "grad_norm": 1.6014869213104248, "learning_rate": 2.6792935353267757e-05, "loss": 0.339, "step": 15360 }, { "epoch": 19.95330739299611, "grad_norm": 1.8378218412399292, "learning_rate": 2.6644232618970382e-05, "loss": 0.3464, "step": 15384 }, { "epoch": 19.98443579766537, "grad_norm": 2.1291933059692383, "learning_rate": 2.6495793654490292e-05, "loss": 0.3409, "step": 15408 }, { "epoch": 20.01556420233463, "grad_norm": 1.1774524450302124, "learning_rate": 2.6347620136232232e-05, "loss": 0.339, "step": 15432 }, { "epoch": 20.04669260700389, "grad_norm": 1.3319616317749023, "learning_rate": 2.6199713737603055e-05, "loss": 0.3376, "step": 15456 }, { "epoch": 20.07782101167315, "grad_norm": 1.488239049911499, "learning_rate": 2.60520761289929e-05, "loss": 0.3379, "step": 15480 }, { "epoch": 20.108949416342412, "grad_norm": 1.2733827829360962, "learning_rate": 2.590470897775636e-05, "loss": 0.3352, "step": 15504 }, { "epoch": 20.140077821011673, "grad_norm": 2.291374921798706, "learning_rate": 2.575761394819351e-05, "loss": 0.3395, "step": 15528 }, { "epoch": 20.171206225680933, "grad_norm": 1.3169567584991455, "learning_rate": 2.5610792701531298e-05, "loss": 0.3365, "step": 15552 }, { "epoch": 20.202334630350194, "grad_norm": 1.0463300943374634, "learning_rate": 2.54642468959046e-05, "loss": 0.337, "step": 15576 }, { "epoch": 20.233463035019454, "grad_norm": 1.5346705913543701, "learning_rate": 2.5317978186337664e-05, "loss": 0.3394, "step": 15600 }, { "epoch": 20.264591439688715, "grad_norm": 1.6092703342437744, "learning_rate": 2.5171988224725267e-05, "loss": 0.3308, "step": 15624 }, { "epoch": 20.295719844357976, "grad_norm": 1.3011606931686401, "learning_rate": 2.5026278659814144e-05, "loss": 0.339, "step": 15648 }, { "epoch": 20.326848249027236, "grad_norm": 1.2459102869033813, "learning_rate": 2.4880851137184403e-05, "loss": 0.3308, "step": 15672 }, { "epoch": 20.357976653696497, "grad_norm": 1.4810408353805542, "learning_rate": 2.4735707299230808e-05, "loss": 0.3376, "step": 15696 }, { "epoch": 20.389105058365757, "grad_norm": 1.2645267248153687, "learning_rate": 2.4590848785144386e-05, "loss": 0.3402, "step": 15720 }, { "epoch": 20.420233463035018, "grad_norm": 2.001779556274414, "learning_rate": 2.4446277230893823e-05, "loss": 0.3358, "step": 15744 }, { "epoch": 20.45136186770428, "grad_norm": 3.0970067977905273, "learning_rate": 2.4301994269206968e-05, "loss": 0.334, "step": 15768 }, { "epoch": 20.48249027237354, "grad_norm": 1.4983640909194946, "learning_rate": 2.415800152955247e-05, "loss": 0.3424, "step": 15792 }, { "epoch": 20.5136186770428, "grad_norm": 1.3392024040222168, "learning_rate": 2.40143006381213e-05, "loss": 0.3463, "step": 15816 }, { "epoch": 20.544747081712064, "grad_norm": 1.4383450746536255, "learning_rate": 2.3870893217808495e-05, "loss": 0.3354, "step": 15840 }, { "epoch": 20.575875486381324, "grad_norm": 1.4223530292510986, "learning_rate": 2.3727780888194658e-05, "loss": 0.333, "step": 15864 }, { "epoch": 20.607003891050585, "grad_norm": 1.5441044569015503, "learning_rate": 2.3584965265527847e-05, "loss": 0.3335, "step": 15888 }, { "epoch": 20.638132295719846, "grad_norm": 0.8291170597076416, "learning_rate": 2.344244796270524e-05, "loss": 0.3389, "step": 15912 }, { "epoch": 20.669260700389106, "grad_norm": 2.7805609703063965, "learning_rate": 2.330023058925486e-05, "loss": 0.3353, "step": 15936 }, { "epoch": 20.700389105058367, "grad_norm": 1.6097582578659058, "learning_rate": 2.3158314751317513e-05, "loss": 0.339, "step": 15960 }, { "epoch": 20.731517509727627, "grad_norm": 1.4149878025054932, "learning_rate": 2.3016702051628547e-05, "loss": 0.3375, "step": 15984 }, { "epoch": 20.762645914396888, "grad_norm": 1.2236443758010864, "learning_rate": 2.2875394089499847e-05, "loss": 0.3358, "step": 16008 }, { "epoch": 20.79377431906615, "grad_norm": 1.0645393133163452, "learning_rate": 2.2734392460801727e-05, "loss": 0.3377, "step": 16032 }, { "epoch": 20.82490272373541, "grad_norm": 1.2843340635299683, "learning_rate": 2.259369875794485e-05, "loss": 0.3332, "step": 16056 }, { "epoch": 20.85603112840467, "grad_norm": 1.735514760017395, "learning_rate": 2.2453314569862366e-05, "loss": 0.3364, "step": 16080 }, { "epoch": 20.88715953307393, "grad_norm": 1.3856208324432373, "learning_rate": 2.2313241481991855e-05, "loss": 0.3389, "step": 16104 }, { "epoch": 20.91828793774319, "grad_norm": 1.7546725273132324, "learning_rate": 2.217348107625748e-05, "loss": 0.3373, "step": 16128 }, { "epoch": 20.94941634241245, "grad_norm": 1.3664530515670776, "learning_rate": 2.2034034931052096e-05, "loss": 0.3398, "step": 16152 }, { "epoch": 20.980544747081712, "grad_norm": 5.165532112121582, "learning_rate": 2.1894904621219463e-05, "loss": 0.3372, "step": 16176 }, { "epoch": 21.011673151750973, "grad_norm": 1.3261635303497314, "learning_rate": 2.175609171803644e-05, "loss": 0.3381, "step": 16200 }, { "epoch": 21.042801556420233, "grad_norm": 1.8854881525039673, "learning_rate": 2.1617597789195193e-05, "loss": 0.3347, "step": 16224 }, { "epoch": 21.073929961089494, "grad_norm": 1.3904035091400146, "learning_rate": 2.1479424398785573e-05, "loss": 0.3346, "step": 16248 }, { "epoch": 21.105058365758754, "grad_norm": 1.318601369857788, "learning_rate": 2.1341573107277392e-05, "loss": 0.3347, "step": 16272 }, { "epoch": 21.136186770428015, "grad_norm": 1.0564274787902832, "learning_rate": 2.1204045471502803e-05, "loss": 0.3295, "step": 16296 }, { "epoch": 21.167315175097276, "grad_norm": 0.9953235387802124, "learning_rate": 2.106684304463874e-05, "loss": 0.3339, "step": 16320 }, { "epoch": 21.198443579766536, "grad_norm": 1.0253063440322876, "learning_rate": 2.092996737618939e-05, "loss": 0.3271, "step": 16344 }, { "epoch": 21.229571984435797, "grad_norm": 1.5001134872436523, "learning_rate": 2.079342001196869e-05, "loss": 0.3359, "step": 16368 }, { "epoch": 21.260700389105057, "grad_norm": 1.1106650829315186, "learning_rate": 2.0657202494082773e-05, "loss": 0.327, "step": 16392 }, { "epoch": 21.291828793774318, "grad_norm": 1.0053423643112183, "learning_rate": 2.052131636091273e-05, "loss": 0.3398, "step": 16416 }, { "epoch": 21.32295719844358, "grad_norm": 1.3083621263504028, "learning_rate": 2.038576314709707e-05, "loss": 0.3306, "step": 16440 }, { "epoch": 21.35408560311284, "grad_norm": 1.4561755657196045, "learning_rate": 2.0250544383514457e-05, "loss": 0.3364, "step": 16464 }, { "epoch": 21.3852140077821, "grad_norm": 1.0885835886001587, "learning_rate": 2.0115661597266476e-05, "loss": 0.3355, "step": 16488 }, { "epoch": 21.41634241245136, "grad_norm": 1.3506430387496948, "learning_rate": 1.998111631166027e-05, "loss": 0.3334, "step": 16512 }, { "epoch": 21.44747081712062, "grad_norm": 1.0331530570983887, "learning_rate": 1.9846910046191446e-05, "loss": 0.3303, "step": 16536 }, { "epoch": 21.47859922178988, "grad_norm": 1.0616254806518555, "learning_rate": 1.9713044316526813e-05, "loss": 0.3348, "step": 16560 }, { "epoch": 21.509727626459146, "grad_norm": 2.5577657222747803, "learning_rate": 1.9579520634487386e-05, "loss": 0.335, "step": 16584 }, { "epoch": 21.540856031128406, "grad_norm": 1.5290476083755493, "learning_rate": 1.9446340508031185e-05, "loss": 0.3382, "step": 16608 }, { "epoch": 21.571984435797667, "grad_norm": 0.8804724216461182, "learning_rate": 1.931350544123627e-05, "loss": 0.3257, "step": 16632 }, { "epoch": 21.603112840466927, "grad_norm": 1.1799284219741821, "learning_rate": 1.918101693428379e-05, "loss": 0.3298, "step": 16656 }, { "epoch": 21.634241245136188, "grad_norm": 1.3328742980957031, "learning_rate": 1.9048876483440942e-05, "loss": 0.3373, "step": 16680 }, { "epoch": 21.66536964980545, "grad_norm": 0.9985073208808899, "learning_rate": 1.8917085581044193e-05, "loss": 0.3313, "step": 16704 }, { "epoch": 21.69649805447471, "grad_norm": 1.498244047164917, "learning_rate": 1.8785645715482285e-05, "loss": 0.3303, "step": 16728 }, { "epoch": 21.72762645914397, "grad_norm": 1.6468580961227417, "learning_rate": 1.8654558371179583e-05, "loss": 0.3252, "step": 16752 }, { "epoch": 21.75875486381323, "grad_norm": 1.6541725397109985, "learning_rate": 1.8523825028579212e-05, "loss": 0.3299, "step": 16776 }, { "epoch": 21.78988326848249, "grad_norm": 0.9805202484130859, "learning_rate": 1.8393447164126282e-05, "loss": 0.3342, "step": 16800 }, { "epoch": 21.82101167315175, "grad_norm": 0.9097315073013306, "learning_rate": 1.8263426250251388e-05, "loss": 0.3309, "step": 16824 }, { "epoch": 21.852140077821012, "grad_norm": 1.2603996992111206, "learning_rate": 1.8133763755353816e-05, "loss": 0.3387, "step": 16848 }, { "epoch": 21.883268482490273, "grad_norm": 1.0283710956573486, "learning_rate": 1.800446114378508e-05, "loss": 0.3325, "step": 16872 }, { "epoch": 21.914396887159533, "grad_norm": 2.601137399673462, "learning_rate": 1.7875519875832254e-05, "loss": 0.3356, "step": 16896 }, { "epoch": 21.945525291828794, "grad_norm": 1.0405902862548828, "learning_rate": 1.774694140770163e-05, "loss": 0.3339, "step": 16920 }, { "epoch": 21.976653696498055, "grad_norm": 1.504928708076477, "learning_rate": 1.7618727191502188e-05, "loss": 0.3329, "step": 16944 }, { "epoch": 22.007782101167315, "grad_norm": 1.1356394290924072, "learning_rate": 1.749087867522912e-05, "loss": 0.331, "step": 16968 }, { "epoch": 22.038910505836576, "grad_norm": 1.3053059577941895, "learning_rate": 1.7363397302747687e-05, "loss": 0.3316, "step": 16992 }, { "epoch": 22.070038910505836, "grad_norm": 1.8512986898422241, "learning_rate": 1.723628451377669e-05, "loss": 0.3286, "step": 17016 }, { "epoch": 22.101167315175097, "grad_norm": 1.1379419565200806, "learning_rate": 1.7109541743872366e-05, "loss": 0.3311, "step": 17040 }, { "epoch": 22.132295719844358, "grad_norm": 1.0137568712234497, "learning_rate": 1.698317042441211e-05, "loss": 0.3294, "step": 17064 }, { "epoch": 22.163424124513618, "grad_norm": 1.1163158416748047, "learning_rate": 1.6857171982578286e-05, "loss": 0.3247, "step": 17088 }, { "epoch": 22.19455252918288, "grad_norm": 0.992064893245697, "learning_rate": 1.6731547841342193e-05, "loss": 0.3331, "step": 17112 }, { "epoch": 22.22568093385214, "grad_norm": 1.2021843194961548, "learning_rate": 1.6606299419447894e-05, "loss": 0.3284, "step": 17136 }, { "epoch": 22.2568093385214, "grad_norm": 2.352348566055298, "learning_rate": 1.6481428131396275e-05, "loss": 0.3315, "step": 17160 }, { "epoch": 22.28793774319066, "grad_norm": 1.283078908920288, "learning_rate": 1.6356935387428996e-05, "loss": 0.3262, "step": 17184 }, { "epoch": 22.31906614785992, "grad_norm": 1.2125391960144043, "learning_rate": 1.6232822593512654e-05, "loss": 0.3312, "step": 17208 }, { "epoch": 22.35019455252918, "grad_norm": 1.2397364377975464, "learning_rate": 1.610909115132286e-05, "loss": 0.3268, "step": 17232 }, { "epoch": 22.381322957198442, "grad_norm": 1.4817135334014893, "learning_rate": 1.5985742458228338e-05, "loss": 0.3283, "step": 17256 }, { "epoch": 22.412451361867703, "grad_norm": 2.0548017024993896, "learning_rate": 1.58627779072753e-05, "loss": 0.3249, "step": 17280 }, { "epoch": 22.443579766536963, "grad_norm": 1.4913387298583984, "learning_rate": 1.574019888717155e-05, "loss": 0.3277, "step": 17304 }, { "epoch": 22.474708171206224, "grad_norm": 1.2476876974105835, "learning_rate": 1.5618006782270904e-05, "loss": 0.3298, "step": 17328 }, { "epoch": 22.505836575875485, "grad_norm": 1.2181342840194702, "learning_rate": 1.5496202972557556e-05, "loss": 0.329, "step": 17352 }, { "epoch": 22.53696498054475, "grad_norm": 1.3082391023635864, "learning_rate": 1.5374788833630404e-05, "loss": 0.328, "step": 17376 }, { "epoch": 22.56809338521401, "grad_norm": 1.217458963394165, "learning_rate": 1.5253765736687636e-05, "loss": 0.3273, "step": 17400 }, { "epoch": 22.59922178988327, "grad_norm": 1.1426113843917847, "learning_rate": 1.5133135048511127e-05, "loss": 0.3314, "step": 17424 }, { "epoch": 22.63035019455253, "grad_norm": 1.8684285879135132, "learning_rate": 1.5012898131451114e-05, "loss": 0.3301, "step": 17448 }, { "epoch": 22.66147859922179, "grad_norm": 1.1370235681533813, "learning_rate": 1.489305634341071e-05, "loss": 0.3315, "step": 17472 }, { "epoch": 22.69260700389105, "grad_norm": 1.1359672546386719, "learning_rate": 1.4773611037830626e-05, "loss": 0.3283, "step": 17496 }, { "epoch": 22.723735408560312, "grad_norm": 1.3090800046920776, "learning_rate": 1.4654563563673901e-05, "loss": 0.3282, "step": 17520 }, { "epoch": 22.754863813229573, "grad_norm": 1.2736905813217163, "learning_rate": 1.4535915265410593e-05, "loss": 0.33, "step": 17544 }, { "epoch": 22.785992217898833, "grad_norm": 1.189782977104187, "learning_rate": 1.4417667483002688e-05, "loss": 0.3267, "step": 17568 }, { "epoch": 22.817120622568094, "grad_norm": 2.092562437057495, "learning_rate": 1.4299821551888881e-05, "loss": 0.3276, "step": 17592 }, { "epoch": 22.848249027237355, "grad_norm": 1.8085280656814575, "learning_rate": 1.4182378802969582e-05, "loss": 0.3267, "step": 17616 }, { "epoch": 22.879377431906615, "grad_norm": 1.2389247417449951, "learning_rate": 1.4065340562591784e-05, "loss": 0.3322, "step": 17640 }, { "epoch": 22.910505836575876, "grad_norm": 2.3639073371887207, "learning_rate": 1.3948708152534162e-05, "loss": 0.3286, "step": 17664 }, { "epoch": 22.941634241245136, "grad_norm": 1.4584684371948242, "learning_rate": 1.3832482889992138e-05, "loss": 0.3275, "step": 17688 }, { "epoch": 22.972762645914397, "grad_norm": 1.2135454416275024, "learning_rate": 1.3716666087562951e-05, "loss": 0.3331, "step": 17712 }, { "epoch": 23.003891050583658, "grad_norm": 1.1459728479385376, "learning_rate": 1.3601259053230924e-05, "loss": 0.3259, "step": 17736 }, { "epoch": 23.035019455252918, "grad_norm": 1.1459057331085205, "learning_rate": 1.3486263090352563e-05, "loss": 0.3229, "step": 17760 }, { "epoch": 23.06614785992218, "grad_norm": 1.3186362981796265, "learning_rate": 1.3371679497641997e-05, "loss": 0.3242, "step": 17784 }, { "epoch": 23.09727626459144, "grad_norm": 0.9882354736328125, "learning_rate": 1.3257509569156162e-05, "loss": 0.3263, "step": 17808 }, { "epoch": 23.1284046692607, "grad_norm": 1.146543264389038, "learning_rate": 1.3143754594280266e-05, "loss": 0.3239, "step": 17832 }, { "epoch": 23.15953307392996, "grad_norm": 1.5829049348831177, "learning_rate": 1.3030415857713246e-05, "loss": 0.3274, "step": 17856 }, { "epoch": 23.19066147859922, "grad_norm": 1.1690993309020996, "learning_rate": 1.2917494639453171e-05, "loss": 0.3266, "step": 17880 }, { "epoch": 23.22178988326848, "grad_norm": 2.0189902782440186, "learning_rate": 1.280499221478289e-05, "loss": 0.3277, "step": 17904 }, { "epoch": 23.252918287937742, "grad_norm": 2.8502254486083984, "learning_rate": 1.269290985425557e-05, "loss": 0.3309, "step": 17928 }, { "epoch": 23.284046692607003, "grad_norm": 1.144399881362915, "learning_rate": 1.2581248823680336e-05, "loss": 0.3302, "step": 17952 }, { "epoch": 23.315175097276263, "grad_norm": 1.0023480653762817, "learning_rate": 1.2470010384108012e-05, "loss": 0.3259, "step": 17976 }, { "epoch": 23.346303501945524, "grad_norm": 1.0780220031738281, "learning_rate": 1.2359195791816841e-05, "loss": 0.3274, "step": 18000 }, { "epoch": 23.377431906614785, "grad_norm": 1.4481017589569092, "learning_rate": 1.2248806298298372e-05, "loss": 0.3191, "step": 18024 }, { "epoch": 23.408560311284045, "grad_norm": 0.9282727837562561, "learning_rate": 1.2138843150243212e-05, "loss": 0.326, "step": 18048 }, { "epoch": 23.439688715953306, "grad_norm": 1.2329308986663818, "learning_rate": 1.2029307589527062e-05, "loss": 0.3245, "step": 18072 }, { "epoch": 23.470817120622566, "grad_norm": 1.535043478012085, "learning_rate": 1.1920200853196623e-05, "loss": 0.3273, "step": 18096 }, { "epoch": 23.50194552529183, "grad_norm": 1.5993396043777466, "learning_rate": 1.1811524173455618e-05, "loss": 0.3242, "step": 18120 }, { "epoch": 23.53307392996109, "grad_norm": 2.646594762802124, "learning_rate": 1.1703278777650929e-05, "loss": 0.3323, "step": 18144 }, { "epoch": 23.56420233463035, "grad_norm": 1.254061222076416, "learning_rate": 1.1595465888258661e-05, "loss": 0.3238, "step": 18168 }, { "epoch": 23.595330739299612, "grad_norm": 1.3275645971298218, "learning_rate": 1.1488086722870439e-05, "loss": 0.328, "step": 18192 }, { "epoch": 23.626459143968873, "grad_norm": 1.366665244102478, "learning_rate": 1.1381142494179586e-05, "loss": 0.3275, "step": 18216 }, { "epoch": 23.657587548638134, "grad_norm": 1.2128342390060425, "learning_rate": 1.1274634409967389e-05, "loss": 0.3247, "step": 18240 }, { "epoch": 23.688715953307394, "grad_norm": 1.168764591217041, "learning_rate": 1.1168563673089589e-05, "loss": 0.3239, "step": 18264 }, { "epoch": 23.719844357976655, "grad_norm": 1.2446372509002686, "learning_rate": 1.1062931481462647e-05, "loss": 0.32, "step": 18288 }, { "epoch": 23.750972762645915, "grad_norm": 1.4571527242660522, "learning_rate": 1.095773902805033e-05, "loss": 0.3272, "step": 18312 }, { "epoch": 23.782101167315176, "grad_norm": 1.1576392650604248, "learning_rate": 1.0852987500850148e-05, "loss": 0.3251, "step": 18336 }, { "epoch": 23.813229571984436, "grad_norm": 1.3691147565841675, "learning_rate": 1.0748678082880049e-05, "loss": 0.3253, "step": 18360 }, { "epoch": 23.844357976653697, "grad_norm": 1.859039068222046, "learning_rate": 1.0644811952164957e-05, "loss": 0.3293, "step": 18384 }, { "epoch": 23.875486381322958, "grad_norm": 1.2036535739898682, "learning_rate": 1.0541390281723478e-05, "loss": 0.3269, "step": 18408 }, { "epoch": 23.90661478599222, "grad_norm": 1.459100365638733, "learning_rate": 1.043841423955474e-05, "loss": 0.3276, "step": 18432 }, { "epoch": 23.93774319066148, "grad_norm": 1.2927861213684082, "learning_rate": 1.0335884988625084e-05, "loss": 0.3263, "step": 18456 }, { "epoch": 23.96887159533074, "grad_norm": 1.4151058197021484, "learning_rate": 1.0233803686855014e-05, "loss": 0.321, "step": 18480 }, { "epoch": 24.0, "grad_norm": 1.434226393699646, "learning_rate": 1.0132171487106068e-05, "loss": 0.3202, "step": 18504 }, { "epoch": 24.03112840466926, "grad_norm": 1.2331753969192505, "learning_rate": 1.0030989537167857e-05, "loss": 0.3242, "step": 18528 }, { "epoch": 24.06225680933852, "grad_norm": 1.6305173635482788, "learning_rate": 9.930258979745055e-06, "loss": 0.3221, "step": 18552 }, { "epoch": 24.09338521400778, "grad_norm": 1.1515713930130005, "learning_rate": 9.82998095244449e-06, "loss": 0.3217, "step": 18576 }, { "epoch": 24.124513618677042, "grad_norm": 1.1086283922195435, "learning_rate": 9.730156587762335e-06, "loss": 0.3225, "step": 18600 }, { "epoch": 24.155642023346303, "grad_norm": 1.256364107131958, "learning_rate": 9.630787013071286e-06, "loss": 0.3218, "step": 18624 }, { "epoch": 24.186770428015564, "grad_norm": 1.2893520593643188, "learning_rate": 9.531873350607823e-06, "loss": 0.3285, "step": 18648 }, { "epoch": 24.217898832684824, "grad_norm": 1.1564453840255737, "learning_rate": 9.433416717459592e-06, "loss": 0.3234, "step": 18672 }, { "epoch": 24.249027237354085, "grad_norm": 1.6299091577529907, "learning_rate": 9.3354182255527e-06, "loss": 0.3237, "step": 18696 }, { "epoch": 24.280155642023345, "grad_norm": 0.9497871994972229, "learning_rate": 9.237878981639264e-06, "loss": 0.3226, "step": 18720 }, { "epoch": 24.311284046692606, "grad_norm": 1.3882777690887451, "learning_rate": 9.140800087284801e-06, "loss": 0.322, "step": 18744 }, { "epoch": 24.342412451361866, "grad_norm": 1.1506375074386597, "learning_rate": 9.044182638855891e-06, "loss": 0.3274, "step": 18768 }, { "epoch": 24.373540856031127, "grad_norm": 0.8968532681465149, "learning_rate": 8.948027727507708e-06, "loss": 0.319, "step": 18792 }, { "epoch": 24.404669260700388, "grad_norm": 1.5157815217971802, "learning_rate": 8.852336439171733e-06, "loss": 0.3254, "step": 18816 }, { "epoch": 24.43579766536965, "grad_norm": 0.9984537959098816, "learning_rate": 8.757109854543533e-06, "loss": 0.3244, "step": 18840 }, { "epoch": 24.46692607003891, "grad_norm": 1.8151588439941406, "learning_rate": 8.662349049070463e-06, "loss": 0.3198, "step": 18864 }, { "epoch": 24.49805447470817, "grad_norm": 1.1167311668395996, "learning_rate": 8.568055092939615e-06, "loss": 0.3179, "step": 18888 }, { "epoch": 24.529182879377434, "grad_norm": 1.3895347118377686, "learning_rate": 8.474229051065657e-06, "loss": 0.3211, "step": 18912 }, { "epoch": 24.560311284046694, "grad_norm": 1.2524361610412598, "learning_rate": 8.38087198307887e-06, "loss": 0.32, "step": 18936 }, { "epoch": 24.591439688715955, "grad_norm": 1.389087200164795, "learning_rate": 8.287984943313114e-06, "loss": 0.3251, "step": 18960 }, { "epoch": 24.622568093385215, "grad_norm": 1.6150294542312622, "learning_rate": 8.195568980793967e-06, "loss": 0.3275, "step": 18984 }, { "epoch": 24.653696498054476, "grad_norm": 1.6251153945922852, "learning_rate": 8.103625139226895e-06, "loss": 0.3225, "step": 19008 }, { "epoch": 24.684824902723737, "grad_norm": 1.5373034477233887, "learning_rate": 8.012154456985388e-06, "loss": 0.3253, "step": 19032 }, { "epoch": 24.715953307392997, "grad_norm": 0.9456262588500977, "learning_rate": 7.921157967099336e-06, "loss": 0.3151, "step": 19056 }, { "epoch": 24.747081712062258, "grad_norm": 0.9828768372535706, "learning_rate": 7.830636697243254e-06, "loss": 0.3252, "step": 19080 }, { "epoch": 24.77821011673152, "grad_norm": 1.8610461950302124, "learning_rate": 7.740591669724772e-06, "loss": 0.325, "step": 19104 }, { "epoch": 24.80933852140078, "grad_norm": 1.8049260377883911, "learning_rate": 7.651023901473032e-06, "loss": 0.3204, "step": 19128 }, { "epoch": 24.84046692607004, "grad_norm": 1.1601166725158691, "learning_rate": 7.561934404027193e-06, "loss": 0.3231, "step": 19152 }, { "epoch": 24.8715953307393, "grad_norm": 1.2389658689498901, "learning_rate": 7.473324183525088e-06, "loss": 0.329, "step": 19176 }, { "epoch": 24.90272373540856, "grad_norm": 1.0001511573791504, "learning_rate": 7.385194240691751e-06, "loss": 0.319, "step": 19200 }, { "epoch": 24.93385214007782, "grad_norm": 1.7757816314697266, "learning_rate": 7.297545570828207e-06, "loss": 0.3267, "step": 19224 }, { "epoch": 24.964980544747082, "grad_norm": 1.1014970541000366, "learning_rate": 7.210379163800185e-06, "loss": 0.3223, "step": 19248 }, { "epoch": 24.996108949416342, "grad_norm": 1.6188836097717285, "learning_rate": 7.123696004026947e-06, "loss": 0.3227, "step": 19272 }, { "epoch": 25.027237354085603, "grad_norm": 1.2841421365737915, "learning_rate": 7.037497070470167e-06, "loss": 0.32, "step": 19296 }, { "epoch": 25.058365758754864, "grad_norm": 1.2222139835357666, "learning_rate": 6.951783336622864e-06, "loss": 0.3217, "step": 19320 }, { "epoch": 25.089494163424124, "grad_norm": 1.0179907083511353, "learning_rate": 6.866555770498473e-06, "loss": 0.3182, "step": 19344 }, { "epoch": 25.120622568093385, "grad_norm": 0.9595916271209717, "learning_rate": 6.781815334619812e-06, "loss": 0.3195, "step": 19368 }, { "epoch": 25.151750972762645, "grad_norm": 1.2857320308685303, "learning_rate": 6.6975629860082935e-06, "loss": 0.3177, "step": 19392 }, { "epoch": 25.182879377431906, "grad_norm": 1.7358510494232178, "learning_rate": 6.613799676173088e-06, "loss": 0.3208, "step": 19416 }, { "epoch": 25.214007782101167, "grad_norm": 1.8369121551513672, "learning_rate": 6.530526351100347e-06, "loss": 0.3196, "step": 19440 }, { "epoch": 25.245136186770427, "grad_norm": 2.4744224548339844, "learning_rate": 6.447743951242591e-06, "loss": 0.3239, "step": 19464 }, { "epoch": 25.276264591439688, "grad_norm": 1.2925540208816528, "learning_rate": 6.3654534115079936e-06, "loss": 0.3157, "step": 19488 }, { "epoch": 25.30739299610895, "grad_norm": 1.1039607524871826, "learning_rate": 6.28365566124991e-06, "loss": 0.3229, "step": 19512 }, { "epoch": 25.33852140077821, "grad_norm": 0.8712733387947083, "learning_rate": 6.202351624256359e-06, "loss": 0.3181, "step": 19536 }, { "epoch": 25.36964980544747, "grad_norm": 1.236718773841858, "learning_rate": 6.1215422187395345e-06, "loss": 0.3172, "step": 19560 }, { "epoch": 25.40077821011673, "grad_norm": 1.4729557037353516, "learning_rate": 6.041228357325529e-06, "loss": 0.3244, "step": 19584 }, { "epoch": 25.43190661478599, "grad_norm": 1.1015067100524902, "learning_rate": 5.961410947043927e-06, "loss": 0.3227, "step": 19608 }, { "epoch": 25.46303501945525, "grad_norm": 1.4798215627670288, "learning_rate": 5.882090889317671e-06, "loss": 0.3208, "step": 19632 }, { "epoch": 25.494163424124515, "grad_norm": 1.9315009117126465, "learning_rate": 5.803269079952739e-06, "loss": 0.3158, "step": 19656 }, { "epoch": 25.525291828793776, "grad_norm": 1.1661323308944702, "learning_rate": 5.724946409128179e-06, "loss": 0.3194, "step": 19680 }, { "epoch": 25.556420233463037, "grad_norm": 1.796525239944458, "learning_rate": 5.647123761385975e-06, "loss": 0.3236, "step": 19704 }, { "epoch": 25.587548638132297, "grad_norm": 1.251969814300537, "learning_rate": 5.569802015621039e-06, "loss": 0.3228, "step": 19728 }, { "epoch": 25.618677042801558, "grad_norm": 1.9998018741607666, "learning_rate": 5.492982045071355e-06, "loss": 0.3248, "step": 19752 }, { "epoch": 25.64980544747082, "grad_norm": 1.0044583082199097, "learning_rate": 5.4166647173080345e-06, "loss": 0.3246, "step": 19776 }, { "epoch": 25.68093385214008, "grad_norm": 1.0275497436523438, "learning_rate": 5.340850894225607e-06, "loss": 0.3253, "step": 19800 }, { "epoch": 25.71206225680934, "grad_norm": 1.0156971216201782, "learning_rate": 5.265541432032212e-06, "loss": 0.3171, "step": 19824 }, { "epoch": 25.7431906614786, "grad_norm": 1.4596341848373413, "learning_rate": 5.190737181239941e-06, "loss": 0.3212, "step": 19848 }, { "epoch": 25.77431906614786, "grad_norm": 1.2357956171035767, "learning_rate": 5.116438986655303e-06, "loss": 0.3268, "step": 19872 }, { "epoch": 25.80544747081712, "grad_norm": 1.335877537727356, "learning_rate": 5.042647687369573e-06, "loss": 0.3218, "step": 19896 }, { "epoch": 25.836575875486382, "grad_norm": 1.5729907751083374, "learning_rate": 4.969364116749414e-06, "loss": 0.3205, "step": 19920 }, { "epoch": 25.867704280155642, "grad_norm": 1.5255457162857056, "learning_rate": 4.89658910242739e-06, "loss": 0.3165, "step": 19944 }, { "epoch": 25.898832684824903, "grad_norm": 1.195453405380249, "learning_rate": 4.8243234662926905e-06, "loss": 0.323, "step": 19968 }, { "epoch": 25.929961089494164, "grad_norm": 1.1830676794052124, "learning_rate": 4.75256802448178e-06, "loss": 0.3173, "step": 19992 }, { "epoch": 25.961089494163424, "grad_norm": 0.9383173584938049, "learning_rate": 4.681323587369213e-06, "loss": 0.3159, "step": 20016 }, { "epoch": 25.992217898832685, "grad_norm": 1.3204113245010376, "learning_rate": 4.610590959558497e-06, "loss": 0.3217, "step": 20040 }, { "epoch": 26.023346303501945, "grad_norm": 1.1940529346466064, "learning_rate": 4.540370939872974e-06, "loss": 0.3188, "step": 20064 }, { "epoch": 26.054474708171206, "grad_norm": 1.7250840663909912, "learning_rate": 4.470664321346829e-06, "loss": 0.3192, "step": 20088 }, { "epoch": 26.085603112840467, "grad_norm": 0.9612188339233398, "learning_rate": 4.401471891216114e-06, "loss": 0.3183, "step": 20112 }, { "epoch": 26.116731517509727, "grad_norm": 1.175308108329773, "learning_rate": 4.332794430909854e-06, "loss": 0.3162, "step": 20136 }, { "epoch": 26.147859922178988, "grad_norm": 1.3628140687942505, "learning_rate": 4.264632716041234e-06, "loss": 0.3173, "step": 20160 }, { "epoch": 26.17898832684825, "grad_norm": 0.9504318237304688, "learning_rate": 4.196987516398831e-06, "loss": 0.3259, "step": 20184 }, { "epoch": 26.21011673151751, "grad_norm": 1.6836086511611938, "learning_rate": 4.129859595937946e-06, "loss": 0.3188, "step": 20208 }, { "epoch": 26.24124513618677, "grad_norm": 1.2717008590698242, "learning_rate": 4.063249712771922e-06, "loss": 0.321, "step": 20232 }, { "epoch": 26.27237354085603, "grad_norm": 1.989966869354248, "learning_rate": 3.997158619163644e-06, "loss": 0.3215, "step": 20256 }, { "epoch": 26.30350194552529, "grad_norm": 1.1739614009857178, "learning_rate": 3.931587061517011e-06, "loss": 0.3193, "step": 20280 }, { "epoch": 26.33463035019455, "grad_norm": 1.1167713403701782, "learning_rate": 3.8665357803685025e-06, "loss": 0.3174, "step": 20304 }, { "epoch": 26.365758754863812, "grad_norm": 1.379565715789795, "learning_rate": 3.8020055103788144e-06, "loss": 0.3218, "step": 20328 }, { "epoch": 26.396887159533073, "grad_norm": 1.4840023517608643, "learning_rate": 3.7379969803245763e-06, "loss": 0.3213, "step": 20352 }, { "epoch": 26.428015564202333, "grad_norm": 1.1443723440170288, "learning_rate": 3.6745109130901288e-06, "loss": 0.3141, "step": 20376 }, { "epoch": 26.459143968871594, "grad_norm": 1.090888500213623, "learning_rate": 3.6115480256593394e-06, "loss": 0.3212, "step": 20400 }, { "epoch": 26.490272373540854, "grad_norm": 1.472679615020752, "learning_rate": 3.5491090291075004e-06, "loss": 0.3151, "step": 20424 }, { "epoch": 26.52140077821012, "grad_norm": 0.9774566292762756, "learning_rate": 3.487194628593332e-06, "loss": 0.3214, "step": 20448 }, { "epoch": 26.55252918287938, "grad_norm": 2.1687231063842773, "learning_rate": 3.4258055233509665e-06, "loss": 0.324, "step": 20472 }, { "epoch": 26.58365758754864, "grad_norm": 1.2352170944213867, "learning_rate": 3.364942406682109e-06, "loss": 0.3101, "step": 20496 }, { "epoch": 26.6147859922179, "grad_norm": 2.996083974838257, "learning_rate": 3.304605965948149e-06, "loss": 0.3141, "step": 20520 }, { "epoch": 26.64591439688716, "grad_norm": 1.5926743745803833, "learning_rate": 3.244796882562462e-06, "loss": 0.3229, "step": 20544 }, { "epoch": 26.67704280155642, "grad_norm": 1.1748905181884766, "learning_rate": 3.1855158319826774e-06, "loss": 0.3213, "step": 20568 }, { "epoch": 26.708171206225682, "grad_norm": 1.1093063354492188, "learning_rate": 3.126763483703016e-06, "loss": 0.3178, "step": 20592 }, { "epoch": 26.739299610894943, "grad_norm": 1.1090799570083618, "learning_rate": 3.0685405012468137e-06, "loss": 0.3198, "step": 20616 }, { "epoch": 26.770428015564203, "grad_norm": 1.0905050039291382, "learning_rate": 3.010847542158951e-06, "loss": 0.3192, "step": 20640 }, { "epoch": 26.801556420233464, "grad_norm": 1.8493279218673706, "learning_rate": 2.953685257998451e-06, "loss": 0.3204, "step": 20664 }, { "epoch": 26.832684824902724, "grad_norm": 1.2924058437347412, "learning_rate": 2.8970542943311583e-06, "loss": 0.3261, "step": 20688 }, { "epoch": 26.863813229571985, "grad_norm": 0.9771651029586792, "learning_rate": 2.8409552907223804e-06, "loss": 0.3132, "step": 20712 }, { "epoch": 26.894941634241246, "grad_norm": 1.0269138813018799, "learning_rate": 2.785388880729739e-06, "loss": 0.3199, "step": 20736 }, { "epoch": 26.926070038910506, "grad_norm": 1.309114933013916, "learning_rate": 2.7303556918959305e-06, "loss": 0.3145, "step": 20760 }, { "epoch": 26.957198443579767, "grad_norm": 1.0709702968597412, "learning_rate": 2.6758563457417286e-06, "loss": 0.3192, "step": 20784 }, { "epoch": 26.988326848249027, "grad_norm": 1.4049859046936035, "learning_rate": 2.621891457758896e-06, "loss": 0.3206, "step": 20808 }, { "epoch": 27.019455252918288, "grad_norm": 1.3224713802337646, "learning_rate": 2.568461637403252e-06, "loss": 0.312, "step": 20832 }, { "epoch": 27.05058365758755, "grad_norm": 1.3082164525985718, "learning_rate": 2.5155674880878334e-06, "loss": 0.3108, "step": 20856 }, { "epoch": 27.08171206225681, "grad_norm": 0.991944432258606, "learning_rate": 2.4632096071759925e-06, "loss": 0.3188, "step": 20880 }, { "epoch": 27.11284046692607, "grad_norm": 1.2203731536865234, "learning_rate": 2.4113885859747497e-06, "loss": 0.3108, "step": 20904 }, { "epoch": 27.14396887159533, "grad_norm": 1.203995704650879, "learning_rate": 2.360105009728025e-06, "loss": 0.3102, "step": 20928 }, { "epoch": 27.17509727626459, "grad_norm": 1.6264797449111938, "learning_rate": 2.3093594576101107e-06, "loss": 0.3174, "step": 20952 }, { "epoch": 27.20622568093385, "grad_norm": 1.3530755043029785, "learning_rate": 2.2591525027190473e-06, "loss": 0.3252, "step": 20976 }, { "epoch": 27.237354085603112, "grad_norm": 2.048307418823242, "learning_rate": 2.20948471207022e-06, "loss": 0.3184, "step": 21000 }, { "epoch": 27.268482490272373, "grad_norm": 1.320873737335205, "learning_rate": 2.160356646589934e-06, "loss": 0.3191, "step": 21024 }, { "epoch": 27.299610894941633, "grad_norm": 1.1831213235855103, "learning_rate": 2.111768861109048e-06, "loss": 0.3183, "step": 21048 }, { "epoch": 27.330739299610894, "grad_norm": 1.0811506509780884, "learning_rate": 2.0637219043567636e-06, "loss": 0.3177, "step": 21072 }, { "epoch": 27.361867704280154, "grad_norm": 1.1472513675689697, "learning_rate": 2.0162163189543838e-06, "loss": 0.3171, "step": 21096 }, { "epoch": 27.392996108949415, "grad_norm": 1.6906425952911377, "learning_rate": 1.9692526414092084e-06, "loss": 0.3223, "step": 21120 }, { "epoch": 27.424124513618676, "grad_norm": 1.600865364074707, "learning_rate": 1.9228314021084548e-06, "loss": 0.3151, "step": 21144 }, { "epoch": 27.455252918287936, "grad_norm": 1.7052664756774902, "learning_rate": 1.8769531253132854e-06, "loss": 0.3172, "step": 21168 }, { "epoch": 27.486381322957197, "grad_norm": 1.2754665613174438, "learning_rate": 1.83161832915289e-06, "loss": 0.3181, "step": 21192 }, { "epoch": 27.51750972762646, "grad_norm": 0.9670736193656921, "learning_rate": 1.7868275256186174e-06, "loss": 0.3209, "step": 21216 }, { "epoch": 27.54863813229572, "grad_norm": 1.7570668458938599, "learning_rate": 1.7425812205582147e-06, "loss": 0.3151, "step": 21240 }, { "epoch": 27.579766536964982, "grad_norm": 1.1468702554702759, "learning_rate": 1.6988799136700706e-06, "loss": 0.32, "step": 21264 }, { "epoch": 27.610894941634243, "grad_norm": 1.837241768836975, "learning_rate": 1.6557240984976408e-06, "loss": 0.3176, "step": 21288 }, { "epoch": 27.642023346303503, "grad_norm": 1.050024151802063, "learning_rate": 1.613114262423815e-06, "loss": 0.3169, "step": 21312 }, { "epoch": 27.673151750972764, "grad_norm": 1.0731110572814941, "learning_rate": 1.5710508866654261e-06, "loss": 0.3204, "step": 21336 }, { "epoch": 27.704280155642024, "grad_norm": 1.2539221048355103, "learning_rate": 1.5295344462678495e-06, "loss": 0.3168, "step": 21360 }, { "epoch": 27.735408560311285, "grad_norm": 1.4090372323989868, "learning_rate": 1.488565410099585e-06, "loss": 0.3164, "step": 21384 }, { "epoch": 27.766536964980546, "grad_norm": 1.5965330600738525, "learning_rate": 1.4481442408470047e-06, "loss": 0.3216, "step": 21408 }, { "epoch": 27.797665369649806, "grad_norm": 1.1138761043548584, "learning_rate": 1.4082713950091198e-06, "loss": 0.3206, "step": 21432 }, { "epoch": 27.828793774319067, "grad_norm": 1.1677641868591309, "learning_rate": 1.3689473228923944e-06, "loss": 0.3241, "step": 21456 }, { "epoch": 27.859922178988327, "grad_norm": 2.1310067176818848, "learning_rate": 1.3301724686056894e-06, "loss": 0.3187, "step": 21480 }, { "epoch": 27.891050583657588, "grad_norm": 1.3181018829345703, "learning_rate": 1.2919472700552382e-06, "loss": 0.3164, "step": 21504 }, { "epoch": 27.92217898832685, "grad_norm": 1.476120114326477, "learning_rate": 1.2542721589397234e-06, "loss": 0.3184, "step": 21528 }, { "epoch": 27.95330739299611, "grad_norm": 1.1621023416519165, "learning_rate": 1.217147560745352e-06, "loss": 0.319, "step": 21552 }, { "epoch": 27.98443579766537, "grad_norm": 1.1426842212677002, "learning_rate": 1.1805738947410938e-06, "loss": 0.3155, "step": 21576 }, { "epoch": 28.01556420233463, "grad_norm": 2.4093399047851562, "learning_rate": 1.1445515739739399e-06, "loss": 0.3135, "step": 21600 }, { "epoch": 28.04669260700389, "grad_norm": 1.5340672731399536, "learning_rate": 1.1090810052642064e-06, "loss": 0.3181, "step": 21624 }, { "epoch": 28.07782101167315, "grad_norm": 1.0847253799438477, "learning_rate": 1.0741625892009833e-06, "loss": 0.3165, "step": 21648 }, { "epoch": 28.108949416342412, "grad_norm": 1.3261409997940063, "learning_rate": 1.0397967201375814e-06, "loss": 0.3204, "step": 21672 }, { "epoch": 28.140077821011673, "grad_norm": 1.0757031440734863, "learning_rate": 1.0059837861870812e-06, "loss": 0.3187, "step": 21696 }, { "epoch": 28.171206225680933, "grad_norm": 1.2534974813461304, "learning_rate": 9.727241692179756e-07, "loss": 0.3096, "step": 21720 }, { "epoch": 28.202334630350194, "grad_norm": 1.2287142276763916, "learning_rate": 9.400182448498163e-07, "loss": 0.3169, "step": 21744 }, { "epoch": 28.233463035019454, "grad_norm": 0.9463332891464233, "learning_rate": 9.078663824490131e-07, "loss": 0.3185, "step": 21768 }, { "epoch": 28.264591439688715, "grad_norm": 2.7430317401885986, "learning_rate": 8.762689451246198e-07, "loss": 0.3178, "step": 21792 }, { "epoch": 28.295719844357976, "grad_norm": 1.1905908584594727, "learning_rate": 8.452262897242768e-07, "loss": 0.3197, "step": 21816 }, { "epoch": 28.326848249027236, "grad_norm": 0.894260823726654, "learning_rate": 8.147387668301421e-07, "loss": 0.3201, "step": 21840 }, { "epoch": 28.357976653696497, "grad_norm": 1.122759222984314, "learning_rate": 7.848067207549603e-07, "loss": 0.3102, "step": 21864 }, { "epoch": 28.389105058365757, "grad_norm": 1.454839825630188, "learning_rate": 7.554304895381781e-07, "loss": 0.3156, "step": 21888 }, { "epoch": 28.420233463035018, "grad_norm": 1.348819613456726, "learning_rate": 7.266104049420797e-07, "loss": 0.3173, "step": 21912 }, { "epoch": 28.45136186770428, "grad_norm": 1.397900104522705, "learning_rate": 6.983467924480957e-07, "loss": 0.3206, "step": 21936 }, { "epoch": 28.48249027237354, "grad_norm": 2.4935896396636963, "learning_rate": 6.706399712531009e-07, "loss": 0.3227, "step": 21960 }, { "epoch": 28.5136186770428, "grad_norm": 1.3364354372024536, "learning_rate": 6.434902542658106e-07, "loss": 0.3143, "step": 21984 }, { "epoch": 28.544747081712064, "grad_norm": 1.0415703058242798, "learning_rate": 6.168979481032455e-07, "loss": 0.3204, "step": 22008 }, { "epoch": 28.575875486381324, "grad_norm": 1.0268234014511108, "learning_rate": 5.908633530872732e-07, "loss": 0.3163, "step": 22032 }, { "epoch": 28.607003891050585, "grad_norm": 1.0088456869125366, "learning_rate": 5.653867632412269e-07, "loss": 0.3118, "step": 22056 }, { "epoch": 28.638132295719846, "grad_norm": 1.52815842628479, "learning_rate": 5.404684662865589e-07, "loss": 0.3166, "step": 22080 }, { "epoch": 28.669260700389106, "grad_norm": 1.0740587711334229, "learning_rate": 5.161087436396095e-07, "loss": 0.3157, "step": 22104 }, { "epoch": 28.700389105058367, "grad_norm": 1.263934850692749, "learning_rate": 4.923078704084372e-07, "loss": 0.3169, "step": 22128 }, { "epoch": 28.731517509727627, "grad_norm": 1.1837375164031982, "learning_rate": 4.690661153896825e-07, "loss": 0.3177, "step": 22152 }, { "epoch": 28.762645914396888, "grad_norm": 1.1407973766326904, "learning_rate": 4.463837410655536e-07, "loss": 0.3161, "step": 22176 }, { "epoch": 28.79377431906615, "grad_norm": 1.019492268562317, "learning_rate": 4.242610036008676e-07, "loss": 0.3135, "step": 22200 }, { "epoch": 28.82490272373541, "grad_norm": 1.7875498533248901, "learning_rate": 4.026981528401419e-07, "loss": 0.3213, "step": 22224 }, { "epoch": 28.85603112840467, "grad_norm": 0.9684593677520752, "learning_rate": 3.8169543230477387e-07, "loss": 0.3151, "step": 22248 }, { "epoch": 28.88715953307393, "grad_norm": 1.086421012878418, "learning_rate": 3.612530791903046e-07, "loss": 0.3172, "step": 22272 }, { "epoch": 28.91828793774319, "grad_norm": 1.9420697689056396, "learning_rate": 3.4137132436372064e-07, "loss": 0.3181, "step": 22296 }, { "epoch": 28.94941634241245, "grad_norm": 1.217786192893982, "learning_rate": 3.2205039236086197e-07, "loss": 0.3151, "step": 22320 }, { "epoch": 28.980544747081712, "grad_norm": 1.1275442838668823, "learning_rate": 3.0329050138388494e-07, "loss": 0.3193, "step": 22344 }, { "epoch": 29.011673151750973, "grad_norm": 0.9701781272888184, "learning_rate": 2.850918632987809e-07, "loss": 0.316, "step": 22368 }, { "epoch": 29.042801556420233, "grad_norm": 1.0859931707382202, "learning_rate": 2.674546836330172e-07, "loss": 0.3169, "step": 22392 }, { "epoch": 29.073929961089494, "grad_norm": 0.9976264834403992, "learning_rate": 2.503791615731721e-07, "loss": 0.3172, "step": 22416 }, { "epoch": 29.105058365758754, "grad_norm": 2.1112818717956543, "learning_rate": 2.3386548996272572e-07, "loss": 0.3202, "step": 22440 }, { "epoch": 29.136186770428015, "grad_norm": 1.3070718050003052, "learning_rate": 2.1791385529986163e-07, "loss": 0.3163, "step": 22464 }, { "epoch": 29.167315175097276, "grad_norm": 1.5637389421463013, "learning_rate": 2.02524437735363e-07, "loss": 0.3183, "step": 22488 }, { "epoch": 29.198443579766536, "grad_norm": 1.19569730758667, "learning_rate": 1.876974110705698e-07, "loss": 0.3176, "step": 22512 }, { "epoch": 29.229571984435797, "grad_norm": 2.7948904037475586, "learning_rate": 1.7343294275543599e-07, "loss": 0.3181, "step": 22536 }, { "epoch": 29.260700389105057, "grad_norm": 2.1853528022766113, "learning_rate": 1.597311938866308e-07, "loss": 0.3144, "step": 22560 }, { "epoch": 29.291828793774318, "grad_norm": 1.4694305658340454, "learning_rate": 1.4659231920571282e-07, "loss": 0.318, "step": 22584 }, { "epoch": 29.32295719844358, "grad_norm": 1.037607192993164, "learning_rate": 1.3401646709736983e-07, "loss": 0.3142, "step": 22608 }, { "epoch": 29.35408560311284, "grad_norm": 0.9353266358375549, "learning_rate": 1.2200377958778708e-07, "loss": 0.3133, "step": 22632 }, { "epoch": 29.3852140077821, "grad_norm": 1.4458966255187988, "learning_rate": 1.1055439234299858e-07, "loss": 0.3164, "step": 22656 }, { "epoch": 29.41634241245136, "grad_norm": 0.9110085368156433, "learning_rate": 9.966843466736597e-08, "loss": 0.3157, "step": 22680 }, { "epoch": 29.44747081712062, "grad_norm": 1.0257847309112549, "learning_rate": 8.934602950213533e-08, "loss": 0.319, "step": 22704 }, { "epoch": 29.47859922178988, "grad_norm": 1.2331140041351318, "learning_rate": 7.958729342403826e-08, "loss": 0.3177, "step": 22728 }, { "epoch": 29.509727626459146, "grad_norm": 2.199601650238037, "learning_rate": 7.039233664396516e-08, "loss": 0.3164, "step": 22752 }, { "epoch": 29.540856031128406, "grad_norm": 1.1412527561187744, "learning_rate": 6.176126300573848e-08, "loss": 0.3127, "step": 22776 }, { "epoch": 29.571984435797667, "grad_norm": 1.556688904762268, "learning_rate": 5.369416998492471e-08, "loss": 0.3181, "step": 22800 }, { "epoch": 29.603112840466927, "grad_norm": 1.2471084594726562, "learning_rate": 4.619114868774643e-08, "loss": 0.3152, "step": 22824 }, { "epoch": 29.634241245136188, "grad_norm": 1.3103766441345215, "learning_rate": 3.92522838500331e-08, "loss": 0.3171, "step": 22848 }, { "epoch": 29.66536964980545, "grad_norm": 1.0881154537200928, "learning_rate": 3.2877653836299594e-08, "loss": 0.3162, "step": 22872 }, { "epoch": 29.69649805447471, "grad_norm": 0.981332004070282, "learning_rate": 2.7067330638824718e-08, "loss": 0.3152, "step": 22896 }, { "epoch": 29.72762645914397, "grad_norm": 2.1748950481414795, "learning_rate": 2.1821379876851845e-08, "loss": 0.3138, "step": 22920 }, { "epoch": 29.75875486381323, "grad_norm": 1.0983901023864746, "learning_rate": 1.7139860795861717e-08, "loss": 0.3194, "step": 22944 }, { "epoch": 29.78988326848249, "grad_norm": 0.9180955290794373, "learning_rate": 1.3022826266873012e-08, "loss": 0.3155, "step": 22968 }, { "epoch": 29.82101167315175, "grad_norm": 4.426241397857666, "learning_rate": 9.470322785881668e-09, "loss": 0.3176, "step": 22992 }, { "epoch": 29.852140077821012, "grad_norm": 1.521730661392212, "learning_rate": 6.482390473294686e-09, "loss": 0.3179, "step": 23016 }, { "epoch": 29.883268482490273, "grad_norm": 1.1130119562149048, "learning_rate": 4.059063073524882e-09, "loss": 0.3199, "step": 23040 }, { "epoch": 29.914396887159533, "grad_norm": 1.0622695684432983, "learning_rate": 2.2003679545690158e-09, "loss": 0.3167, "step": 23064 }, { "epoch": 29.945525291828794, "grad_norm": 1.495850920677185, "learning_rate": 9.063261077080221e-10, "loss": 0.3201, "step": 23088 }, { "epoch": 29.976653696498055, "grad_norm": 1.2298061847686768, "learning_rate": 1.7695214729607224e-10, "loss": 0.3134, "step": 23112 }, { "epoch": 30.0, "step": 23130, "total_flos": 9.11148472281858e+17, "train_loss": 0.3991138265909079, "train_runtime": 54856.7027, "train_samples_per_second": 107.912, "train_steps_per_second": 0.422 } ], "logging_steps": 24, "max_steps": 23130, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 1157, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.11148472281858e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }