diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9844 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 2451, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0024479804161566705, + "grad_norm": 5.113167762756348, + "learning_rate": 4.0650406504065046e-08, + "loss": 1.1539, + "num_tokens": 579147.0, + "step": 2 + }, + { + "epoch": 0.004895960832313341, + "grad_norm": 4.9927592277526855, + "learning_rate": 1.2195121951219514e-07, + "loss": 1.1356, + "num_tokens": 1171666.0, + "step": 4 + }, + { + "epoch": 0.0073439412484700125, + "grad_norm": 5.046677112579346, + "learning_rate": 2.0325203252032523e-07, + "loss": 1.1527, + "num_tokens": 1782835.0, + "step": 6 + }, + { + "epoch": 0.009791921664626682, + "grad_norm": 5.062073707580566, + "learning_rate": 2.845528455284553e-07, + "loss": 1.1503, + "num_tokens": 2373836.0, + "step": 8 + }, + { + "epoch": 0.012239902080783354, + "grad_norm": 5.033382892608643, + "learning_rate": 3.6585365853658536e-07, + "loss": 1.1475, + "num_tokens": 2949918.0, + "step": 10 + }, + { + "epoch": 0.014687882496940025, + "grad_norm": 4.7532267570495605, + "learning_rate": 4.471544715447155e-07, + "loss": 1.1358, + "num_tokens": 3550879.0, + "step": 12 + }, + { + "epoch": 0.017135862913096694, + "grad_norm": 4.71107292175293, + "learning_rate": 5.284552845528456e-07, + "loss": 1.1428, + "num_tokens": 4150487.0, + "step": 14 + }, + { + "epoch": 0.019583843329253364, + "grad_norm": 4.236279487609863, + "learning_rate": 6.097560975609757e-07, + "loss": 1.0976, + "num_tokens": 4732419.0, + "step": 16 + }, + { + "epoch": 0.022031823745410038, + "grad_norm": 4.217196941375732, + "learning_rate": 6.910569105691058e-07, + "loss": 1.104, + "num_tokens": 5323835.0, + "step": 18 + }, + { + "epoch": 0.02447980416156671, + "grad_norm": 4.150773525238037, + "learning_rate": 7.723577235772359e-07, + "loss": 1.0909, + "num_tokens": 5907508.0, + "step": 20 + }, + { + "epoch": 0.02692778457772338, + "grad_norm": 3.1003658771514893, + "learning_rate": 8.53658536585366e-07, + "loss": 1.0317, + "num_tokens": 6502779.0, + "step": 22 + }, + { + "epoch": 0.02937576499388005, + "grad_norm": 2.9581403732299805, + "learning_rate": 9.349593495934959e-07, + "loss": 0.9689, + "num_tokens": 7084297.0, + "step": 24 + }, + { + "epoch": 0.03182374541003672, + "grad_norm": 2.856872797012329, + "learning_rate": 1.0162601626016261e-06, + "loss": 0.9612, + "num_tokens": 7677074.0, + "step": 26 + }, + { + "epoch": 0.03427172582619339, + "grad_norm": 2.546175241470337, + "learning_rate": 1.0975609756097562e-06, + "loss": 0.9467, + "num_tokens": 8262795.0, + "step": 28 + }, + { + "epoch": 0.03671970624235006, + "grad_norm": 1.7229831218719482, + "learning_rate": 1.1788617886178863e-06, + "loss": 0.8588, + "num_tokens": 8856597.0, + "step": 30 + }, + { + "epoch": 0.03916768665850673, + "grad_norm": 1.508795976638794, + "learning_rate": 1.2601626016260162e-06, + "loss": 0.8236, + "num_tokens": 9443865.0, + "step": 32 + }, + { + "epoch": 0.0416156670746634, + "grad_norm": 1.2896347045898438, + "learning_rate": 1.3414634146341465e-06, + "loss": 0.798, + "num_tokens": 10028572.0, + "step": 34 + }, + { + "epoch": 0.044063647490820076, + "grad_norm": 1.0740002393722534, + "learning_rate": 1.4227642276422766e-06, + "loss": 0.7936, + "num_tokens": 10622144.0, + "step": 36 + }, + { + "epoch": 0.046511627906976744, + "grad_norm": 0.8853735327720642, + "learning_rate": 1.5040650406504067e-06, + "loss": 0.7573, + "num_tokens": 11195500.0, + "step": 38 + }, + { + "epoch": 0.04895960832313342, + "grad_norm": 0.6570454239845276, + "learning_rate": 1.5853658536585368e-06, + "loss": 0.7249, + "num_tokens": 11804642.0, + "step": 40 + }, + { + "epoch": 0.051407588739290085, + "grad_norm": 0.7068105340003967, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.7, + "num_tokens": 12412613.0, + "step": 42 + }, + { + "epoch": 0.05385556915544676, + "grad_norm": 0.6152868270874023, + "learning_rate": 1.747967479674797e-06, + "loss": 0.6867, + "num_tokens": 12980451.0, + "step": 44 + }, + { + "epoch": 0.056303549571603426, + "grad_norm": 0.5061647891998291, + "learning_rate": 1.8292682926829268e-06, + "loss": 0.6777, + "num_tokens": 13573410.0, + "step": 46 + }, + { + "epoch": 0.0587515299877601, + "grad_norm": 0.4394634962081909, + "learning_rate": 1.9105691056910574e-06, + "loss": 0.6849, + "num_tokens": 14151346.0, + "step": 48 + }, + { + "epoch": 0.06119951040391677, + "grad_norm": 0.38909780979156494, + "learning_rate": 1.991869918699187e-06, + "loss": 0.6814, + "num_tokens": 14772803.0, + "step": 50 + }, + { + "epoch": 0.06364749082007344, + "grad_norm": 0.3801971673965454, + "learning_rate": 2.073170731707317e-06, + "loss": 0.6561, + "num_tokens": 15354869.0, + "step": 52 + }, + { + "epoch": 0.06609547123623011, + "grad_norm": 0.3451838493347168, + "learning_rate": 2.154471544715447e-06, + "loss": 0.6419, + "num_tokens": 15944249.0, + "step": 54 + }, + { + "epoch": 0.06854345165238677, + "grad_norm": 0.30912837386131287, + "learning_rate": 2.2357723577235773e-06, + "loss": 0.6106, + "num_tokens": 16523611.0, + "step": 56 + }, + { + "epoch": 0.07099143206854346, + "grad_norm": 0.30823254585266113, + "learning_rate": 2.317073170731708e-06, + "loss": 0.6246, + "num_tokens": 17110353.0, + "step": 58 + }, + { + "epoch": 0.07343941248470012, + "grad_norm": 0.29659801721572876, + "learning_rate": 2.3983739837398375e-06, + "loss": 0.6197, + "num_tokens": 17682600.0, + "step": 60 + }, + { + "epoch": 0.07588739290085679, + "grad_norm": 0.3088859021663666, + "learning_rate": 2.4796747967479676e-06, + "loss": 0.6052, + "num_tokens": 18274385.0, + "step": 62 + }, + { + "epoch": 0.07833537331701346, + "grad_norm": 0.2870602309703827, + "learning_rate": 2.5609756097560977e-06, + "loss": 0.6243, + "num_tokens": 18864027.0, + "step": 64 + }, + { + "epoch": 0.08078335373317014, + "grad_norm": 0.2588765323162079, + "learning_rate": 2.6422764227642278e-06, + "loss": 0.5855, + "num_tokens": 19450167.0, + "step": 66 + }, + { + "epoch": 0.0832313341493268, + "grad_norm": 0.2649621367454529, + "learning_rate": 2.723577235772358e-06, + "loss": 0.5827, + "num_tokens": 20027616.0, + "step": 68 + }, + { + "epoch": 0.08567931456548347, + "grad_norm": 0.26677006483078003, + "learning_rate": 2.8048780487804884e-06, + "loss": 0.5919, + "num_tokens": 20641209.0, + "step": 70 + }, + { + "epoch": 0.08812729498164015, + "grad_norm": 0.25061097741127014, + "learning_rate": 2.8861788617886185e-06, + "loss": 0.5694, + "num_tokens": 21231993.0, + "step": 72 + }, + { + "epoch": 0.09057527539779682, + "grad_norm": 0.2478611171245575, + "learning_rate": 2.967479674796748e-06, + "loss": 0.561, + "num_tokens": 21822237.0, + "step": 74 + }, + { + "epoch": 0.09302325581395349, + "grad_norm": 0.2571170926094055, + "learning_rate": 3.0487804878048782e-06, + "loss": 0.5809, + "num_tokens": 22406364.0, + "step": 76 + }, + { + "epoch": 0.09547123623011015, + "grad_norm": 0.24517571926116943, + "learning_rate": 3.1300813008130083e-06, + "loss": 0.571, + "num_tokens": 23012261.0, + "step": 78 + }, + { + "epoch": 0.09791921664626684, + "grad_norm": 0.25413069128990173, + "learning_rate": 3.211382113821139e-06, + "loss": 0.5671, + "num_tokens": 23609152.0, + "step": 80 + }, + { + "epoch": 0.1003671970624235, + "grad_norm": 0.2391115128993988, + "learning_rate": 3.292682926829269e-06, + "loss": 0.56, + "num_tokens": 24195467.0, + "step": 82 + }, + { + "epoch": 0.10281517747858017, + "grad_norm": 0.2625662088394165, + "learning_rate": 3.3739837398373986e-06, + "loss": 0.5457, + "num_tokens": 24776287.0, + "step": 84 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 0.25236576795578003, + "learning_rate": 3.4552845528455287e-06, + "loss": 0.5555, + "num_tokens": 25353355.0, + "step": 86 + }, + { + "epoch": 0.10771113831089352, + "grad_norm": 0.2561672627925873, + "learning_rate": 3.5365853658536588e-06, + "loss": 0.5514, + "num_tokens": 25910479.0, + "step": 88 + }, + { + "epoch": 0.11015911872705018, + "grad_norm": 0.25256532430648804, + "learning_rate": 3.6178861788617893e-06, + "loss": 0.5572, + "num_tokens": 26499675.0, + "step": 90 + }, + { + "epoch": 0.11260709914320685, + "grad_norm": 0.24565160274505615, + "learning_rate": 3.699186991869919e-06, + "loss": 0.5365, + "num_tokens": 27077382.0, + "step": 92 + }, + { + "epoch": 0.11505507955936352, + "grad_norm": 0.2521428167819977, + "learning_rate": 3.780487804878049e-06, + "loss": 0.5465, + "num_tokens": 27670497.0, + "step": 94 + }, + { + "epoch": 0.1175030599755202, + "grad_norm": 0.24958781898021698, + "learning_rate": 3.861788617886179e-06, + "loss": 0.5379, + "num_tokens": 28263901.0, + "step": 96 + }, + { + "epoch": 0.11995104039167687, + "grad_norm": 0.2590058147907257, + "learning_rate": 3.943089430894309e-06, + "loss": 0.5183, + "num_tokens": 28845275.0, + "step": 98 + }, + { + "epoch": 0.12239902080783353, + "grad_norm": 0.24741683900356293, + "learning_rate": 4.024390243902439e-06, + "loss": 0.5358, + "num_tokens": 29436166.0, + "step": 100 + }, + { + "epoch": 0.12484700122399021, + "grad_norm": 0.24634818732738495, + "learning_rate": 4.10569105691057e-06, + "loss": 0.5311, + "num_tokens": 30044583.0, + "step": 102 + }, + { + "epoch": 0.12729498164014688, + "grad_norm": 0.243186354637146, + "learning_rate": 4.1869918699186995e-06, + "loss": 0.5312, + "num_tokens": 30645726.0, + "step": 104 + }, + { + "epoch": 0.12974296205630356, + "grad_norm": 0.26044467091560364, + "learning_rate": 4.268292682926829e-06, + "loss": 0.544, + "num_tokens": 31251475.0, + "step": 106 + }, + { + "epoch": 0.13219094247246022, + "grad_norm": 0.2474851757287979, + "learning_rate": 4.34959349593496e-06, + "loss": 0.5167, + "num_tokens": 31830636.0, + "step": 108 + }, + { + "epoch": 0.1346389228886169, + "grad_norm": 0.2415020912885666, + "learning_rate": 4.43089430894309e-06, + "loss": 0.5371, + "num_tokens": 32435998.0, + "step": 110 + }, + { + "epoch": 0.13708690330477355, + "grad_norm": 0.2419433742761612, + "learning_rate": 4.51219512195122e-06, + "loss": 0.5163, + "num_tokens": 33041535.0, + "step": 112 + }, + { + "epoch": 0.13953488372093023, + "grad_norm": 0.2523370087146759, + "learning_rate": 4.59349593495935e-06, + "loss": 0.53, + "num_tokens": 33617226.0, + "step": 114 + }, + { + "epoch": 0.1419828641370869, + "grad_norm": 0.2544844448566437, + "learning_rate": 4.67479674796748e-06, + "loss": 0.5212, + "num_tokens": 34184157.0, + "step": 116 + }, + { + "epoch": 0.14443084455324356, + "grad_norm": 0.2378045618534088, + "learning_rate": 4.75609756097561e-06, + "loss": 0.5107, + "num_tokens": 34779714.0, + "step": 118 + }, + { + "epoch": 0.14687882496940025, + "grad_norm": 0.25506970286369324, + "learning_rate": 4.83739837398374e-06, + "loss": 0.5081, + "num_tokens": 35361759.0, + "step": 120 + }, + { + "epoch": 0.14932680538555693, + "grad_norm": 0.24085764586925507, + "learning_rate": 4.918699186991871e-06, + "loss": 0.5101, + "num_tokens": 35960992.0, + "step": 122 + }, + { + "epoch": 0.15177478580171358, + "grad_norm": 0.24073615670204163, + "learning_rate": 5e-06, + "loss": 0.5242, + "num_tokens": 36577873.0, + "step": 124 + }, + { + "epoch": 0.15422276621787026, + "grad_norm": 0.2359231561422348, + "learning_rate": 5.081300813008131e-06, + "loss": 0.525, + "num_tokens": 37173083.0, + "step": 126 + }, + { + "epoch": 0.15667074663402691, + "grad_norm": 0.24386054277420044, + "learning_rate": 5.162601626016261e-06, + "loss": 0.5222, + "num_tokens": 37772321.0, + "step": 128 + }, + { + "epoch": 0.1591187270501836, + "grad_norm": 0.24544130265712738, + "learning_rate": 5.243902439024391e-06, + "loss": 0.5259, + "num_tokens": 38366280.0, + "step": 130 + }, + { + "epoch": 0.16156670746634028, + "grad_norm": 0.2467457354068756, + "learning_rate": 5.32520325203252e-06, + "loss": 0.5357, + "num_tokens": 38963886.0, + "step": 132 + }, + { + "epoch": 0.16401468788249693, + "grad_norm": 0.2387515753507614, + "learning_rate": 5.4065040650406504e-06, + "loss": 0.5047, + "num_tokens": 39590863.0, + "step": 134 + }, + { + "epoch": 0.1664626682986536, + "grad_norm": 0.24072597920894623, + "learning_rate": 5.487804878048781e-06, + "loss": 0.5076, + "num_tokens": 40182071.0, + "step": 136 + }, + { + "epoch": 0.1689106487148103, + "grad_norm": 0.24588756263256073, + "learning_rate": 5.569105691056911e-06, + "loss": 0.5018, + "num_tokens": 40758834.0, + "step": 138 + }, + { + "epoch": 0.17135862913096694, + "grad_norm": 0.24266333878040314, + "learning_rate": 5.650406504065041e-06, + "loss": 0.5331, + "num_tokens": 41366991.0, + "step": 140 + }, + { + "epoch": 0.17380660954712362, + "grad_norm": 0.2442614734172821, + "learning_rate": 5.731707317073171e-06, + "loss": 0.5212, + "num_tokens": 41970776.0, + "step": 142 + }, + { + "epoch": 0.1762545899632803, + "grad_norm": 0.24988345801830292, + "learning_rate": 5.813008130081301e-06, + "loss": 0.5022, + "num_tokens": 42545064.0, + "step": 144 + }, + { + "epoch": 0.17870257037943696, + "grad_norm": 0.250379741191864, + "learning_rate": 5.894308943089432e-06, + "loss": 0.4936, + "num_tokens": 43097646.0, + "step": 146 + }, + { + "epoch": 0.18115055079559364, + "grad_norm": 0.27457740902900696, + "learning_rate": 5.9756097560975615e-06, + "loss": 0.5212, + "num_tokens": 43704720.0, + "step": 148 + }, + { + "epoch": 0.1835985312117503, + "grad_norm": 0.24359893798828125, + "learning_rate": 6.056910569105692e-06, + "loss": 0.4802, + "num_tokens": 44291468.0, + "step": 150 + }, + { + "epoch": 0.18604651162790697, + "grad_norm": 0.2580820918083191, + "learning_rate": 6.138211382113821e-06, + "loss": 0.5089, + "num_tokens": 44889809.0, + "step": 152 + }, + { + "epoch": 0.18849449204406366, + "grad_norm": 0.26753556728363037, + "learning_rate": 6.219512195121951e-06, + "loss": 0.4957, + "num_tokens": 45471363.0, + "step": 154 + }, + { + "epoch": 0.1909424724602203, + "grad_norm": 0.2733646631240845, + "learning_rate": 6.300813008130082e-06, + "loss": 0.4918, + "num_tokens": 46044686.0, + "step": 156 + }, + { + "epoch": 0.193390452876377, + "grad_norm": 0.23967885971069336, + "learning_rate": 6.3821138211382115e-06, + "loss": 0.4793, + "num_tokens": 46638282.0, + "step": 158 + }, + { + "epoch": 0.19583843329253367, + "grad_norm": 0.25125283002853394, + "learning_rate": 6.463414634146342e-06, + "loss": 0.5065, + "num_tokens": 47254725.0, + "step": 160 + }, + { + "epoch": 0.19828641370869032, + "grad_norm": 0.2450665384531021, + "learning_rate": 6.544715447154472e-06, + "loss": 0.4916, + "num_tokens": 47825874.0, + "step": 162 + }, + { + "epoch": 0.200734394124847, + "grad_norm": 0.25482243299484253, + "learning_rate": 6.626016260162602e-06, + "loss": 0.4967, + "num_tokens": 48404000.0, + "step": 164 + }, + { + "epoch": 0.20318237454100369, + "grad_norm": 0.24947668612003326, + "learning_rate": 6.707317073170733e-06, + "loss": 0.5079, + "num_tokens": 49020620.0, + "step": 166 + }, + { + "epoch": 0.20563035495716034, + "grad_norm": 0.2664734721183777, + "learning_rate": 6.788617886178862e-06, + "loss": 0.4939, + "num_tokens": 49610138.0, + "step": 168 + }, + { + "epoch": 0.20807833537331702, + "grad_norm": 0.24819302558898926, + "learning_rate": 6.869918699186993e-06, + "loss": 0.4976, + "num_tokens": 50239651.0, + "step": 170 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.25770121812820435, + "learning_rate": 6.951219512195122e-06, + "loss": 0.4838, + "num_tokens": 50821539.0, + "step": 172 + }, + { + "epoch": 0.21297429620563035, + "grad_norm": 0.2596096694469452, + "learning_rate": 7.032520325203252e-06, + "loss": 0.4922, + "num_tokens": 51392202.0, + "step": 174 + }, + { + "epoch": 0.21542227662178703, + "grad_norm": 0.24805551767349243, + "learning_rate": 7.113821138211383e-06, + "loss": 0.5006, + "num_tokens": 51973577.0, + "step": 176 + }, + { + "epoch": 0.2178702570379437, + "grad_norm": 0.26363953948020935, + "learning_rate": 7.1951219512195125e-06, + "loss": 0.4848, + "num_tokens": 52557644.0, + "step": 178 + }, + { + "epoch": 0.22031823745410037, + "grad_norm": 0.2537650167942047, + "learning_rate": 7.276422764227643e-06, + "loss": 0.4836, + "num_tokens": 53136591.0, + "step": 180 + }, + { + "epoch": 0.22276621787025705, + "grad_norm": 0.2626267671585083, + "learning_rate": 7.357723577235773e-06, + "loss": 0.5021, + "num_tokens": 53723609.0, + "step": 182 + }, + { + "epoch": 0.2252141982864137, + "grad_norm": 0.2569217085838318, + "learning_rate": 7.439024390243903e-06, + "loss": 0.5023, + "num_tokens": 54317296.0, + "step": 184 + }, + { + "epoch": 0.22766217870257038, + "grad_norm": 0.25913605093955994, + "learning_rate": 7.520325203252034e-06, + "loss": 0.4829, + "num_tokens": 54898930.0, + "step": 186 + }, + { + "epoch": 0.23011015911872704, + "grad_norm": 0.25674960017204285, + "learning_rate": 7.601626016260163e-06, + "loss": 0.4927, + "num_tokens": 55520199.0, + "step": 188 + }, + { + "epoch": 0.23255813953488372, + "grad_norm": 0.23637175559997559, + "learning_rate": 7.682926829268293e-06, + "loss": 0.4817, + "num_tokens": 56148692.0, + "step": 190 + }, + { + "epoch": 0.2350061199510404, + "grad_norm": 0.26663970947265625, + "learning_rate": 7.764227642276424e-06, + "loss": 0.4894, + "num_tokens": 56744145.0, + "step": 192 + }, + { + "epoch": 0.23745410036719705, + "grad_norm": 0.2444421350955963, + "learning_rate": 7.845528455284554e-06, + "loss": 0.4858, + "num_tokens": 57372074.0, + "step": 194 + }, + { + "epoch": 0.23990208078335373, + "grad_norm": 0.2536911964416504, + "learning_rate": 7.926829268292685e-06, + "loss": 0.5046, + "num_tokens": 57970018.0, + "step": 196 + }, + { + "epoch": 0.2423500611995104, + "grad_norm": 0.25137463212013245, + "learning_rate": 8.008130081300813e-06, + "loss": 0.4995, + "num_tokens": 58616468.0, + "step": 198 + }, + { + "epoch": 0.24479804161566707, + "grad_norm": 0.27120441198349, + "learning_rate": 8.089430894308944e-06, + "loss": 0.4862, + "num_tokens": 59212867.0, + "step": 200 + }, + { + "epoch": 0.24724602203182375, + "grad_norm": 0.2675727605819702, + "learning_rate": 8.170731707317073e-06, + "loss": 0.4877, + "num_tokens": 59831819.0, + "step": 202 + }, + { + "epoch": 0.24969400244798043, + "grad_norm": 0.25941142439842224, + "learning_rate": 8.252032520325203e-06, + "loss": 0.4744, + "num_tokens": 60406569.0, + "step": 204 + }, + { + "epoch": 0.2521419828641371, + "grad_norm": 0.25482282042503357, + "learning_rate": 8.333333333333334e-06, + "loss": 0.4897, + "num_tokens": 60992960.0, + "step": 206 + }, + { + "epoch": 0.25458996328029376, + "grad_norm": 0.25704023241996765, + "learning_rate": 8.414634146341464e-06, + "loss": 0.4815, + "num_tokens": 61582502.0, + "step": 208 + }, + { + "epoch": 0.25703794369645044, + "grad_norm": 0.28427180647850037, + "learning_rate": 8.495934959349595e-06, + "loss": 0.4815, + "num_tokens": 62170270.0, + "step": 210 + }, + { + "epoch": 0.2594859241126071, + "grad_norm": 0.26738202571868896, + "learning_rate": 8.577235772357724e-06, + "loss": 0.4752, + "num_tokens": 62762518.0, + "step": 212 + }, + { + "epoch": 0.26193390452876375, + "grad_norm": 0.2779182493686676, + "learning_rate": 8.658536585365854e-06, + "loss": 0.4835, + "num_tokens": 63338317.0, + "step": 214 + }, + { + "epoch": 0.26438188494492043, + "grad_norm": 0.25400421023368835, + "learning_rate": 8.739837398373985e-06, + "loss": 0.4682, + "num_tokens": 63946534.0, + "step": 216 + }, + { + "epoch": 0.2668298653610771, + "grad_norm": 0.2691749930381775, + "learning_rate": 8.821138211382113e-06, + "loss": 0.4686, + "num_tokens": 64529245.0, + "step": 218 + }, + { + "epoch": 0.2692778457772338, + "grad_norm": 0.30239883065223694, + "learning_rate": 8.902439024390244e-06, + "loss": 0.4931, + "num_tokens": 65101588.0, + "step": 220 + }, + { + "epoch": 0.2717258261933905, + "grad_norm": 0.2659878134727478, + "learning_rate": 8.983739837398374e-06, + "loss": 0.4809, + "num_tokens": 65693459.0, + "step": 222 + }, + { + "epoch": 0.2741738066095471, + "grad_norm": 0.29131749272346497, + "learning_rate": 9.065040650406505e-06, + "loss": 0.484, + "num_tokens": 66264575.0, + "step": 224 + }, + { + "epoch": 0.2766217870257038, + "grad_norm": 0.26120057702064514, + "learning_rate": 9.146341463414635e-06, + "loss": 0.4621, + "num_tokens": 66836693.0, + "step": 226 + }, + { + "epoch": 0.27906976744186046, + "grad_norm": 0.25684523582458496, + "learning_rate": 9.227642276422764e-06, + "loss": 0.4846, + "num_tokens": 67435169.0, + "step": 228 + }, + { + "epoch": 0.28151774785801714, + "grad_norm": 0.2609543204307556, + "learning_rate": 9.308943089430895e-06, + "loss": 0.4892, + "num_tokens": 68047327.0, + "step": 230 + }, + { + "epoch": 0.2839657282741738, + "grad_norm": 0.25355297327041626, + "learning_rate": 9.390243902439025e-06, + "loss": 0.481, + "num_tokens": 68652895.0, + "step": 232 + }, + { + "epoch": 0.2864137086903305, + "grad_norm": 0.2718547284603119, + "learning_rate": 9.471544715447156e-06, + "loss": 0.4826, + "num_tokens": 69234680.0, + "step": 234 + }, + { + "epoch": 0.28886168910648713, + "grad_norm": 0.2677147388458252, + "learning_rate": 9.552845528455286e-06, + "loss": 0.4832, + "num_tokens": 69826413.0, + "step": 236 + }, + { + "epoch": 0.2913096695226438, + "grad_norm": 0.26797816157341003, + "learning_rate": 9.634146341463415e-06, + "loss": 0.4756, + "num_tokens": 70441126.0, + "step": 238 + }, + { + "epoch": 0.2937576499388005, + "grad_norm": 0.2443031668663025, + "learning_rate": 9.715447154471546e-06, + "loss": 0.4724, + "num_tokens": 71026297.0, + "step": 240 + }, + { + "epoch": 0.2962056303549572, + "grad_norm": 0.23899759352207184, + "learning_rate": 9.796747967479675e-06, + "loss": 0.4632, + "num_tokens": 71615415.0, + "step": 242 + }, + { + "epoch": 0.29865361077111385, + "grad_norm": 0.2585891783237457, + "learning_rate": 9.878048780487805e-06, + "loss": 0.4768, + "num_tokens": 72187205.0, + "step": 244 + }, + { + "epoch": 0.3011015911872705, + "grad_norm": 0.25309988856315613, + "learning_rate": 9.959349593495936e-06, + "loss": 0.4695, + "num_tokens": 72766789.0, + "step": 246 + }, + { + "epoch": 0.30354957160342716, + "grad_norm": 0.2485942840576172, + "learning_rate": 1e-05, + "loss": 0.4791, + "num_tokens": 73367195.0, + "step": 248 + }, + { + "epoch": 0.30599755201958384, + "grad_norm": 0.2777419686317444, + "learning_rate": 1e-05, + "loss": 0.4839, + "num_tokens": 73931883.0, + "step": 250 + }, + { + "epoch": 0.3084455324357405, + "grad_norm": 0.27165666222572327, + "learning_rate": 1e-05, + "loss": 0.4806, + "num_tokens": 74512785.0, + "step": 252 + }, + { + "epoch": 0.3108935128518972, + "grad_norm": 0.2516685426235199, + "learning_rate": 1e-05, + "loss": 0.4693, + "num_tokens": 75103604.0, + "step": 254 + }, + { + "epoch": 0.31334149326805383, + "grad_norm": 0.2623477280139923, + "learning_rate": 1e-05, + "loss": 0.4859, + "num_tokens": 75691456.0, + "step": 256 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 0.26603803038597107, + "learning_rate": 1e-05, + "loss": 0.4723, + "num_tokens": 76296514.0, + "step": 258 + }, + { + "epoch": 0.3182374541003672, + "grad_norm": 0.2537820637226105, + "learning_rate": 1e-05, + "loss": 0.4746, + "num_tokens": 76901295.0, + "step": 260 + }, + { + "epoch": 0.32068543451652387, + "grad_norm": 0.2758727967739105, + "learning_rate": 1e-05, + "loss": 0.4663, + "num_tokens": 77473398.0, + "step": 262 + }, + { + "epoch": 0.32313341493268055, + "grad_norm": 0.2774087190628052, + "learning_rate": 1e-05, + "loss": 0.4829, + "num_tokens": 78068399.0, + "step": 264 + }, + { + "epoch": 0.32558139534883723, + "grad_norm": 0.26797741651535034, + "learning_rate": 1e-05, + "loss": 0.4892, + "num_tokens": 78683018.0, + "step": 266 + }, + { + "epoch": 0.32802937576499386, + "grad_norm": 0.2684287428855896, + "learning_rate": 1e-05, + "loss": 0.466, + "num_tokens": 79280661.0, + "step": 268 + }, + { + "epoch": 0.33047735618115054, + "grad_norm": 0.27204152941703796, + "learning_rate": 1e-05, + "loss": 0.4674, + "num_tokens": 79894114.0, + "step": 270 + }, + { + "epoch": 0.3329253365973072, + "grad_norm": 0.26379671692848206, + "learning_rate": 1e-05, + "loss": 0.4703, + "num_tokens": 80498761.0, + "step": 272 + }, + { + "epoch": 0.3353733170134639, + "grad_norm": 0.26917415857315063, + "learning_rate": 1e-05, + "loss": 0.46, + "num_tokens": 81090664.0, + "step": 274 + }, + { + "epoch": 0.3378212974296206, + "grad_norm": 0.2546355128288269, + "learning_rate": 1e-05, + "loss": 0.4684, + "num_tokens": 81682310.0, + "step": 276 + }, + { + "epoch": 0.3402692778457772, + "grad_norm": 0.24116742610931396, + "learning_rate": 1e-05, + "loss": 0.453, + "num_tokens": 82257211.0, + "step": 278 + }, + { + "epoch": 0.3427172582619339, + "grad_norm": 0.3105870485305786, + "learning_rate": 1e-05, + "loss": 0.4552, + "num_tokens": 82873665.0, + "step": 280 + }, + { + "epoch": 0.34516523867809057, + "grad_norm": 0.2542496919631958, + "learning_rate": 1e-05, + "loss": 0.4651, + "num_tokens": 83459915.0, + "step": 282 + }, + { + "epoch": 0.34761321909424725, + "grad_norm": 0.2633056640625, + "learning_rate": 1e-05, + "loss": 0.4743, + "num_tokens": 84064495.0, + "step": 284 + }, + { + "epoch": 0.35006119951040393, + "grad_norm": 0.27345481514930725, + "learning_rate": 1e-05, + "loss": 0.4522, + "num_tokens": 84650583.0, + "step": 286 + }, + { + "epoch": 0.3525091799265606, + "grad_norm": 0.2693355083465576, + "learning_rate": 1e-05, + "loss": 0.4648, + "num_tokens": 85253276.0, + "step": 288 + }, + { + "epoch": 0.35495716034271724, + "grad_norm": 0.2904321253299713, + "learning_rate": 1e-05, + "loss": 0.4627, + "num_tokens": 85833675.0, + "step": 290 + }, + { + "epoch": 0.3574051407588739, + "grad_norm": 0.2525518834590912, + "learning_rate": 1e-05, + "loss": 0.4558, + "num_tokens": 86421330.0, + "step": 292 + }, + { + "epoch": 0.3598531211750306, + "grad_norm": 0.2589590847492218, + "learning_rate": 1e-05, + "loss": 0.4674, + "num_tokens": 87030151.0, + "step": 294 + }, + { + "epoch": 0.3623011015911873, + "grad_norm": 0.26387012004852295, + "learning_rate": 1e-05, + "loss": 0.4457, + "num_tokens": 87567827.0, + "step": 296 + }, + { + "epoch": 0.36474908200734396, + "grad_norm": 0.25372225046157837, + "learning_rate": 1e-05, + "loss": 0.4531, + "num_tokens": 88157730.0, + "step": 298 + }, + { + "epoch": 0.3671970624235006, + "grad_norm": 0.2606695890426636, + "learning_rate": 1e-05, + "loss": 0.4652, + "num_tokens": 88759382.0, + "step": 300 + }, + { + "epoch": 0.36964504283965727, + "grad_norm": 0.28036609292030334, + "learning_rate": 1e-05, + "loss": 0.4617, + "num_tokens": 89342429.0, + "step": 302 + }, + { + "epoch": 0.37209302325581395, + "grad_norm": 0.2817690968513489, + "learning_rate": 1e-05, + "loss": 0.4612, + "num_tokens": 89934089.0, + "step": 304 + }, + { + "epoch": 0.37454100367197063, + "grad_norm": 0.2582840919494629, + "learning_rate": 1e-05, + "loss": 0.457, + "num_tokens": 90525609.0, + "step": 306 + }, + { + "epoch": 0.3769889840881273, + "grad_norm": 0.23353107273578644, + "learning_rate": 1e-05, + "loss": 0.4361, + "num_tokens": 91116420.0, + "step": 308 + }, + { + "epoch": 0.379436964504284, + "grad_norm": 0.2614266872406006, + "learning_rate": 1e-05, + "loss": 0.4613, + "num_tokens": 91706872.0, + "step": 310 + }, + { + "epoch": 0.3818849449204406, + "grad_norm": 0.25390151143074036, + "learning_rate": 1e-05, + "loss": 0.4563, + "num_tokens": 92273116.0, + "step": 312 + }, + { + "epoch": 0.3843329253365973, + "grad_norm": 0.2604464590549469, + "learning_rate": 1e-05, + "loss": 0.4658, + "num_tokens": 92877700.0, + "step": 314 + }, + { + "epoch": 0.386780905752754, + "grad_norm": 0.2540639638900757, + "learning_rate": 1e-05, + "loss": 0.467, + "num_tokens": 93448212.0, + "step": 316 + }, + { + "epoch": 0.38922888616891066, + "grad_norm": 0.30420681834220886, + "learning_rate": 1e-05, + "loss": 0.4569, + "num_tokens": 94015732.0, + "step": 318 + }, + { + "epoch": 0.39167686658506734, + "grad_norm": 0.26026180386543274, + "learning_rate": 1e-05, + "loss": 0.452, + "num_tokens": 94583212.0, + "step": 320 + }, + { + "epoch": 0.39412484700122397, + "grad_norm": 0.24830417335033417, + "learning_rate": 1e-05, + "loss": 0.4429, + "num_tokens": 95154747.0, + "step": 322 + }, + { + "epoch": 0.39657282741738065, + "grad_norm": 0.25844183564186096, + "learning_rate": 1e-05, + "loss": 0.4698, + "num_tokens": 95736389.0, + "step": 324 + }, + { + "epoch": 0.3990208078335373, + "grad_norm": 0.2400529533624649, + "learning_rate": 1e-05, + "loss": 0.4662, + "num_tokens": 96331517.0, + "step": 326 + }, + { + "epoch": 0.401468788249694, + "grad_norm": 0.2508002519607544, + "learning_rate": 1e-05, + "loss": 0.4559, + "num_tokens": 96919369.0, + "step": 328 + }, + { + "epoch": 0.4039167686658507, + "grad_norm": 0.3863252103328705, + "learning_rate": 1e-05, + "loss": 0.4508, + "num_tokens": 97509288.0, + "step": 330 + }, + { + "epoch": 0.40636474908200737, + "grad_norm": 0.23996925354003906, + "learning_rate": 1e-05, + "loss": 0.464, + "num_tokens": 98114408.0, + "step": 332 + }, + { + "epoch": 0.408812729498164, + "grad_norm": 0.2341168373823166, + "learning_rate": 1e-05, + "loss": 0.4449, + "num_tokens": 98713441.0, + "step": 334 + }, + { + "epoch": 0.4112607099143207, + "grad_norm": 0.25438442826271057, + "learning_rate": 1e-05, + "loss": 0.4525, + "num_tokens": 99327738.0, + "step": 336 + }, + { + "epoch": 0.41370869033047736, + "grad_norm": 0.25638943910598755, + "learning_rate": 1e-05, + "loss": 0.456, + "num_tokens": 99919049.0, + "step": 338 + }, + { + "epoch": 0.41615667074663404, + "grad_norm": 0.249691903591156, + "learning_rate": 1e-05, + "loss": 0.4451, + "num_tokens": 100516376.0, + "step": 340 + }, + { + "epoch": 0.4186046511627907, + "grad_norm": 0.23924164474010468, + "learning_rate": 1e-05, + "loss": 0.4594, + "num_tokens": 101133012.0, + "step": 342 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.2472023069858551, + "learning_rate": 1e-05, + "loss": 0.4475, + "num_tokens": 101700842.0, + "step": 344 + }, + { + "epoch": 0.423500611995104, + "grad_norm": 0.25458183884620667, + "learning_rate": 1e-05, + "loss": 0.4636, + "num_tokens": 102301821.0, + "step": 346 + }, + { + "epoch": 0.4259485924112607, + "grad_norm": 0.24029487371444702, + "learning_rate": 1e-05, + "loss": 0.4543, + "num_tokens": 102911147.0, + "step": 348 + }, + { + "epoch": 0.4283965728274174, + "grad_norm": 0.24597443640232086, + "learning_rate": 1e-05, + "loss": 0.4327, + "num_tokens": 103494243.0, + "step": 350 + }, + { + "epoch": 0.43084455324357407, + "grad_norm": 0.24597881734371185, + "learning_rate": 1e-05, + "loss": 0.4544, + "num_tokens": 104100107.0, + "step": 352 + }, + { + "epoch": 0.43329253365973075, + "grad_norm": 0.251849889755249, + "learning_rate": 1e-05, + "loss": 0.4518, + "num_tokens": 104696583.0, + "step": 354 + }, + { + "epoch": 0.4357405140758874, + "grad_norm": 0.26897719502449036, + "learning_rate": 1e-05, + "loss": 0.4542, + "num_tokens": 105266973.0, + "step": 356 + }, + { + "epoch": 0.43818849449204406, + "grad_norm": 0.25020480155944824, + "learning_rate": 1e-05, + "loss": 0.4428, + "num_tokens": 105855583.0, + "step": 358 + }, + { + "epoch": 0.44063647490820074, + "grad_norm": 0.25249218940734863, + "learning_rate": 1e-05, + "loss": 0.451, + "num_tokens": 106413444.0, + "step": 360 + }, + { + "epoch": 0.4430844553243574, + "grad_norm": 0.25864851474761963, + "learning_rate": 1e-05, + "loss": 0.4602, + "num_tokens": 107001653.0, + "step": 362 + }, + { + "epoch": 0.4455324357405141, + "grad_norm": 0.24207186698913574, + "learning_rate": 1e-05, + "loss": 0.4523, + "num_tokens": 107595819.0, + "step": 364 + }, + { + "epoch": 0.4479804161566707, + "grad_norm": 0.2500375509262085, + "learning_rate": 1e-05, + "loss": 0.4261, + "num_tokens": 108198205.0, + "step": 366 + }, + { + "epoch": 0.4504283965728274, + "grad_norm": 0.27041998505592346, + "learning_rate": 1e-05, + "loss": 0.4331, + "num_tokens": 108768930.0, + "step": 368 + }, + { + "epoch": 0.4528763769889841, + "grad_norm": 0.257564902305603, + "learning_rate": 1e-05, + "loss": 0.4606, + "num_tokens": 109393449.0, + "step": 370 + }, + { + "epoch": 0.45532435740514077, + "grad_norm": 0.2373887002468109, + "learning_rate": 1e-05, + "loss": 0.4421, + "num_tokens": 109998048.0, + "step": 372 + }, + { + "epoch": 0.45777233782129745, + "grad_norm": 0.24258120357990265, + "learning_rate": 1e-05, + "loss": 0.4532, + "num_tokens": 110602780.0, + "step": 374 + }, + { + "epoch": 0.4602203182374541, + "grad_norm": 0.2412237524986267, + "learning_rate": 1e-05, + "loss": 0.4438, + "num_tokens": 111196271.0, + "step": 376 + }, + { + "epoch": 0.46266829865361075, + "grad_norm": 0.2580074965953827, + "learning_rate": 1e-05, + "loss": 0.4393, + "num_tokens": 111762398.0, + "step": 378 + }, + { + "epoch": 0.46511627906976744, + "grad_norm": 0.24817973375320435, + "learning_rate": 1e-05, + "loss": 0.4473, + "num_tokens": 112344276.0, + "step": 380 + }, + { + "epoch": 0.4675642594859241, + "grad_norm": 0.24946565926074982, + "learning_rate": 1e-05, + "loss": 0.4407, + "num_tokens": 112929852.0, + "step": 382 + }, + { + "epoch": 0.4700122399020808, + "grad_norm": 0.25697141885757446, + "learning_rate": 1e-05, + "loss": 0.4426, + "num_tokens": 113524769.0, + "step": 384 + }, + { + "epoch": 0.4724602203182375, + "grad_norm": 0.2628220319747925, + "learning_rate": 1e-05, + "loss": 0.4315, + "num_tokens": 114097175.0, + "step": 386 + }, + { + "epoch": 0.4749082007343941, + "grad_norm": 0.252259224653244, + "learning_rate": 1e-05, + "loss": 0.4536, + "num_tokens": 114681195.0, + "step": 388 + }, + { + "epoch": 0.4773561811505508, + "grad_norm": 0.24542003870010376, + "learning_rate": 1e-05, + "loss": 0.4575, + "num_tokens": 115275959.0, + "step": 390 + }, + { + "epoch": 0.47980416156670747, + "grad_norm": 0.2483384609222412, + "learning_rate": 1e-05, + "loss": 0.4316, + "num_tokens": 115836837.0, + "step": 392 + }, + { + "epoch": 0.48225214198286415, + "grad_norm": 0.25170162320137024, + "learning_rate": 1e-05, + "loss": 0.4537, + "num_tokens": 116449305.0, + "step": 394 + }, + { + "epoch": 0.4847001223990208, + "grad_norm": 0.2621166706085205, + "learning_rate": 1e-05, + "loss": 0.4362, + "num_tokens": 117045707.0, + "step": 396 + }, + { + "epoch": 0.48714810281517745, + "grad_norm": 0.2502780854701996, + "learning_rate": 1e-05, + "loss": 0.4454, + "num_tokens": 117635477.0, + "step": 398 + }, + { + "epoch": 0.48959608323133413, + "grad_norm": 0.2551596164703369, + "learning_rate": 1e-05, + "loss": 0.4481, + "num_tokens": 118231246.0, + "step": 400 + }, + { + "epoch": 0.4920440636474908, + "grad_norm": 0.24621985852718353, + "learning_rate": 1e-05, + "loss": 0.4422, + "num_tokens": 118844143.0, + "step": 402 + }, + { + "epoch": 0.4944920440636475, + "grad_norm": 0.2507868707180023, + "learning_rate": 1e-05, + "loss": 0.4372, + "num_tokens": 119416215.0, + "step": 404 + }, + { + "epoch": 0.4969400244798042, + "grad_norm": 0.2631942331790924, + "learning_rate": 1e-05, + "loss": 0.4452, + "num_tokens": 120009624.0, + "step": 406 + }, + { + "epoch": 0.49938800489596086, + "grad_norm": 0.2675624489784241, + "learning_rate": 1e-05, + "loss": 0.4305, + "num_tokens": 120601206.0, + "step": 408 + }, + { + "epoch": 0.5018359853121175, + "grad_norm": 0.2466759979724884, + "learning_rate": 1e-05, + "loss": 0.4527, + "num_tokens": 121202335.0, + "step": 410 + }, + { + "epoch": 0.5042839657282742, + "grad_norm": 0.2538067400455475, + "learning_rate": 1e-05, + "loss": 0.4442, + "num_tokens": 121787505.0, + "step": 412 + }, + { + "epoch": 0.5067319461444308, + "grad_norm": 0.25116246938705444, + "learning_rate": 1e-05, + "loss": 0.4425, + "num_tokens": 122383008.0, + "step": 414 + }, + { + "epoch": 0.5091799265605875, + "grad_norm": 0.236452117562294, + "learning_rate": 1e-05, + "loss": 0.4537, + "num_tokens": 122980153.0, + "step": 416 + }, + { + "epoch": 0.5116279069767442, + "grad_norm": 0.24228782951831818, + "learning_rate": 1e-05, + "loss": 0.4302, + "num_tokens": 123564055.0, + "step": 418 + }, + { + "epoch": 0.5140758873929009, + "grad_norm": 0.25363457202911377, + "learning_rate": 1e-05, + "loss": 0.4413, + "num_tokens": 124154714.0, + "step": 420 + }, + { + "epoch": 0.5165238678090576, + "grad_norm": 0.2742238938808441, + "learning_rate": 1e-05, + "loss": 0.4492, + "num_tokens": 124725604.0, + "step": 422 + }, + { + "epoch": 0.5189718482252142, + "grad_norm": 0.24658766388893127, + "learning_rate": 1e-05, + "loss": 0.4429, + "num_tokens": 125329669.0, + "step": 424 + }, + { + "epoch": 0.5214198286413708, + "grad_norm": 0.2611011862754822, + "learning_rate": 1e-05, + "loss": 0.4339, + "num_tokens": 125871390.0, + "step": 426 + }, + { + "epoch": 0.5238678090575275, + "grad_norm": 0.25249847769737244, + "learning_rate": 1e-05, + "loss": 0.4476, + "num_tokens": 126485091.0, + "step": 428 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.25424760580062866, + "learning_rate": 1e-05, + "loss": 0.4394, + "num_tokens": 127070743.0, + "step": 430 + }, + { + "epoch": 0.5287637698898409, + "grad_norm": 0.26382750272750854, + "learning_rate": 1e-05, + "loss": 0.451, + "num_tokens": 127643183.0, + "step": 432 + }, + { + "epoch": 0.5312117503059975, + "grad_norm": 0.264977365732193, + "learning_rate": 1e-05, + "loss": 0.4471, + "num_tokens": 128238869.0, + "step": 434 + }, + { + "epoch": 0.5336597307221542, + "grad_norm": 0.24802319705486298, + "learning_rate": 1e-05, + "loss": 0.4312, + "num_tokens": 128833166.0, + "step": 436 + }, + { + "epoch": 0.5361077111383109, + "grad_norm": 0.2559887766838074, + "learning_rate": 1e-05, + "loss": 0.4513, + "num_tokens": 129432358.0, + "step": 438 + }, + { + "epoch": 0.5385556915544676, + "grad_norm": 0.2669244706630707, + "learning_rate": 1e-05, + "loss": 0.425, + "num_tokens": 130005589.0, + "step": 440 + }, + { + "epoch": 0.5410036719706243, + "grad_norm": 0.26476767659187317, + "learning_rate": 1e-05, + "loss": 0.4596, + "num_tokens": 130617968.0, + "step": 442 + }, + { + "epoch": 0.543451652386781, + "grad_norm": 0.24924980103969574, + "learning_rate": 1e-05, + "loss": 0.4275, + "num_tokens": 131211641.0, + "step": 444 + }, + { + "epoch": 0.5458996328029376, + "grad_norm": 0.2561533451080322, + "learning_rate": 1e-05, + "loss": 0.4389, + "num_tokens": 131815751.0, + "step": 446 + }, + { + "epoch": 0.5483476132190942, + "grad_norm": 0.2370700240135193, + "learning_rate": 1e-05, + "loss": 0.4442, + "num_tokens": 132435428.0, + "step": 448 + }, + { + "epoch": 0.5507955936352509, + "grad_norm": 0.259915828704834, + "learning_rate": 1e-05, + "loss": 0.4521, + "num_tokens": 133026844.0, + "step": 450 + }, + { + "epoch": 0.5532435740514076, + "grad_norm": 0.2539651095867157, + "learning_rate": 1e-05, + "loss": 0.4513, + "num_tokens": 133610279.0, + "step": 452 + }, + { + "epoch": 0.5556915544675642, + "grad_norm": 0.2514914274215698, + "learning_rate": 1e-05, + "loss": 0.4359, + "num_tokens": 134186783.0, + "step": 454 + }, + { + "epoch": 0.5581395348837209, + "grad_norm": 0.23980778455734253, + "learning_rate": 1e-05, + "loss": 0.4217, + "num_tokens": 134766199.0, + "step": 456 + }, + { + "epoch": 0.5605875152998776, + "grad_norm": 0.246282696723938, + "learning_rate": 1e-05, + "loss": 0.4346, + "num_tokens": 135355631.0, + "step": 458 + }, + { + "epoch": 0.5630354957160343, + "grad_norm": 0.24391327798366547, + "learning_rate": 1e-05, + "loss": 0.455, + "num_tokens": 135973241.0, + "step": 460 + }, + { + "epoch": 0.565483476132191, + "grad_norm": 0.2361239790916443, + "learning_rate": 1e-05, + "loss": 0.4288, + "num_tokens": 136579751.0, + "step": 462 + }, + { + "epoch": 0.5679314565483476, + "grad_norm": 0.2682438790798187, + "learning_rate": 1e-05, + "loss": 0.4407, + "num_tokens": 137187474.0, + "step": 464 + }, + { + "epoch": 0.5703794369645043, + "grad_norm": 0.24559003114700317, + "learning_rate": 1e-05, + "loss": 0.4571, + "num_tokens": 137796342.0, + "step": 466 + }, + { + "epoch": 0.572827417380661, + "grad_norm": 0.2480727732181549, + "learning_rate": 1e-05, + "loss": 0.4341, + "num_tokens": 138415346.0, + "step": 468 + }, + { + "epoch": 0.5752753977968176, + "grad_norm": 0.24036051332950592, + "learning_rate": 1e-05, + "loss": 0.4519, + "num_tokens": 139036676.0, + "step": 470 + }, + { + "epoch": 0.5777233782129743, + "grad_norm": 0.25144311785697937, + "learning_rate": 1e-05, + "loss": 0.4485, + "num_tokens": 139647226.0, + "step": 472 + }, + { + "epoch": 0.5801713586291309, + "grad_norm": 0.2437385618686676, + "learning_rate": 1e-05, + "loss": 0.4292, + "num_tokens": 140222429.0, + "step": 474 + }, + { + "epoch": 0.5826193390452876, + "grad_norm": 0.25468119978904724, + "learning_rate": 1e-05, + "loss": 0.4488, + "num_tokens": 140846203.0, + "step": 476 + }, + { + "epoch": 0.5850673194614443, + "grad_norm": 0.23626460134983063, + "learning_rate": 1e-05, + "loss": 0.4305, + "num_tokens": 141448874.0, + "step": 478 + }, + { + "epoch": 0.587515299877601, + "grad_norm": 0.266257643699646, + "learning_rate": 1e-05, + "loss": 0.4268, + "num_tokens": 142029277.0, + "step": 480 + }, + { + "epoch": 0.5899632802937577, + "grad_norm": 0.2561860680580139, + "learning_rate": 1e-05, + "loss": 0.4263, + "num_tokens": 142624816.0, + "step": 482 + }, + { + "epoch": 0.5924112607099143, + "grad_norm": 0.266886830329895, + "learning_rate": 1e-05, + "loss": 0.4297, + "num_tokens": 143212355.0, + "step": 484 + }, + { + "epoch": 0.594859241126071, + "grad_norm": 0.257272869348526, + "learning_rate": 1e-05, + "loss": 0.4468, + "num_tokens": 143820960.0, + "step": 486 + }, + { + "epoch": 0.5973072215422277, + "grad_norm": 0.24327340722084045, + "learning_rate": 1e-05, + "loss": 0.4448, + "num_tokens": 144394056.0, + "step": 488 + }, + { + "epoch": 0.5997552019583844, + "grad_norm": 0.2501513361930847, + "learning_rate": 1e-05, + "loss": 0.4322, + "num_tokens": 144976355.0, + "step": 490 + }, + { + "epoch": 0.602203182374541, + "grad_norm": 0.2472490519285202, + "learning_rate": 1e-05, + "loss": 0.4349, + "num_tokens": 145575990.0, + "step": 492 + }, + { + "epoch": 0.6046511627906976, + "grad_norm": 0.24121037125587463, + "learning_rate": 1e-05, + "loss": 0.4341, + "num_tokens": 146180118.0, + "step": 494 + }, + { + "epoch": 0.6070991432068543, + "grad_norm": 0.25340262055397034, + "learning_rate": 1e-05, + "loss": 0.444, + "num_tokens": 146763569.0, + "step": 496 + }, + { + "epoch": 0.609547123623011, + "grad_norm": 0.28607383370399475, + "learning_rate": 1e-05, + "loss": 0.4319, + "num_tokens": 147348892.0, + "step": 498 + }, + { + "epoch": 0.6119951040391677, + "grad_norm": 0.2469649612903595, + "learning_rate": 1e-05, + "loss": 0.4351, + "num_tokens": 147934106.0, + "step": 500 + }, + { + "epoch": 0.6144430844553244, + "grad_norm": 0.24661080539226532, + "learning_rate": 1e-05, + "loss": 0.4513, + "num_tokens": 148526284.0, + "step": 502 + }, + { + "epoch": 0.616891064871481, + "grad_norm": 0.2517259120941162, + "learning_rate": 1e-05, + "loss": 0.4491, + "num_tokens": 149116759.0, + "step": 504 + }, + { + "epoch": 0.6193390452876377, + "grad_norm": 0.2508615255355835, + "learning_rate": 1e-05, + "loss": 0.4464, + "num_tokens": 149733718.0, + "step": 506 + }, + { + "epoch": 0.6217870257037944, + "grad_norm": 0.24546414613723755, + "learning_rate": 1e-05, + "loss": 0.4387, + "num_tokens": 150325552.0, + "step": 508 + }, + { + "epoch": 0.6242350061199511, + "grad_norm": 0.24201036989688873, + "learning_rate": 1e-05, + "loss": 0.4373, + "num_tokens": 150919640.0, + "step": 510 + }, + { + "epoch": 0.6266829865361077, + "grad_norm": 0.26162493228912354, + "learning_rate": 1e-05, + "loss": 0.4106, + "num_tokens": 151479810.0, + "step": 512 + }, + { + "epoch": 0.6291309669522643, + "grad_norm": 0.248540461063385, + "learning_rate": 1e-05, + "loss": 0.4402, + "num_tokens": 152069098.0, + "step": 514 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.24691417813301086, + "learning_rate": 1e-05, + "loss": 0.429, + "num_tokens": 152650689.0, + "step": 516 + }, + { + "epoch": 0.6340269277845777, + "grad_norm": 0.23826554417610168, + "learning_rate": 1e-05, + "loss": 0.4397, + "num_tokens": 153238050.0, + "step": 518 + }, + { + "epoch": 0.6364749082007344, + "grad_norm": 0.2382456660270691, + "learning_rate": 1e-05, + "loss": 0.4355, + "num_tokens": 153848706.0, + "step": 520 + }, + { + "epoch": 0.6389228886168911, + "grad_norm": 0.24490463733673096, + "learning_rate": 1e-05, + "loss": 0.4531, + "num_tokens": 154435999.0, + "step": 522 + }, + { + "epoch": 0.6413708690330477, + "grad_norm": 0.2386292815208435, + "learning_rate": 1e-05, + "loss": 0.4202, + "num_tokens": 155012325.0, + "step": 524 + }, + { + "epoch": 0.6438188494492044, + "grad_norm": 0.25578925013542175, + "learning_rate": 1e-05, + "loss": 0.4096, + "num_tokens": 155577911.0, + "step": 526 + }, + { + "epoch": 0.6462668298653611, + "grad_norm": 0.2566823959350586, + "learning_rate": 1e-05, + "loss": 0.4397, + "num_tokens": 156159354.0, + "step": 528 + }, + { + "epoch": 0.6487148102815178, + "grad_norm": 0.2507897615432739, + "learning_rate": 1e-05, + "loss": 0.4328, + "num_tokens": 156734371.0, + "step": 530 + }, + { + "epoch": 0.6511627906976745, + "grad_norm": 0.2468583583831787, + "learning_rate": 1e-05, + "loss": 0.4524, + "num_tokens": 157342481.0, + "step": 532 + }, + { + "epoch": 0.653610771113831, + "grad_norm": 0.22932949662208557, + "learning_rate": 1e-05, + "loss": 0.4217, + "num_tokens": 157940378.0, + "step": 534 + }, + { + "epoch": 0.6560587515299877, + "grad_norm": 0.24993668496608734, + "learning_rate": 1e-05, + "loss": 0.4397, + "num_tokens": 158529483.0, + "step": 536 + }, + { + "epoch": 0.6585067319461444, + "grad_norm": 0.2605837285518646, + "learning_rate": 1e-05, + "loss": 0.438, + "num_tokens": 159116553.0, + "step": 538 + }, + { + "epoch": 0.6609547123623011, + "grad_norm": 0.2537282407283783, + "learning_rate": 1e-05, + "loss": 0.4316, + "num_tokens": 159704424.0, + "step": 540 + }, + { + "epoch": 0.6634026927784578, + "grad_norm": 0.24413853883743286, + "learning_rate": 1e-05, + "loss": 0.4416, + "num_tokens": 160315337.0, + "step": 542 + }, + { + "epoch": 0.6658506731946144, + "grad_norm": 0.24804916977882385, + "learning_rate": 1e-05, + "loss": 0.4388, + "num_tokens": 160906136.0, + "step": 544 + }, + { + "epoch": 0.6682986536107711, + "grad_norm": 0.2709481418132782, + "learning_rate": 1e-05, + "loss": 0.4155, + "num_tokens": 161482176.0, + "step": 546 + }, + { + "epoch": 0.6707466340269278, + "grad_norm": 0.2590528428554535, + "learning_rate": 1e-05, + "loss": 0.4217, + "num_tokens": 162066339.0, + "step": 548 + }, + { + "epoch": 0.6731946144430845, + "grad_norm": 0.22846968472003937, + "learning_rate": 1e-05, + "loss": 0.4235, + "num_tokens": 162686859.0, + "step": 550 + }, + { + "epoch": 0.6756425948592412, + "grad_norm": 0.2419133186340332, + "learning_rate": 1e-05, + "loss": 0.4215, + "num_tokens": 163261743.0, + "step": 552 + }, + { + "epoch": 0.6780905752753978, + "grad_norm": 0.2427610456943512, + "learning_rate": 1e-05, + "loss": 0.4264, + "num_tokens": 163864744.0, + "step": 554 + }, + { + "epoch": 0.6805385556915544, + "grad_norm": 0.23300205171108246, + "learning_rate": 1e-05, + "loss": 0.4151, + "num_tokens": 164424223.0, + "step": 556 + }, + { + "epoch": 0.6829865361077111, + "grad_norm": 0.23740941286087036, + "learning_rate": 1e-05, + "loss": 0.4361, + "num_tokens": 165031113.0, + "step": 558 + }, + { + "epoch": 0.6854345165238678, + "grad_norm": 0.23929090797901154, + "learning_rate": 1e-05, + "loss": 0.4312, + "num_tokens": 165645826.0, + "step": 560 + }, + { + "epoch": 0.6878824969400245, + "grad_norm": 0.25261008739471436, + "learning_rate": 1e-05, + "loss": 0.4312, + "num_tokens": 166270973.0, + "step": 562 + }, + { + "epoch": 0.6903304773561811, + "grad_norm": 0.24025267362594604, + "learning_rate": 1e-05, + "loss": 0.4448, + "num_tokens": 166855785.0, + "step": 564 + }, + { + "epoch": 0.6927784577723378, + "grad_norm": 0.24211308360099792, + "learning_rate": 1e-05, + "loss": 0.432, + "num_tokens": 167481909.0, + "step": 566 + }, + { + "epoch": 0.6952264381884945, + "grad_norm": 0.24173016846179962, + "learning_rate": 1e-05, + "loss": 0.4232, + "num_tokens": 168078039.0, + "step": 568 + }, + { + "epoch": 0.6976744186046512, + "grad_norm": 0.2316238284111023, + "learning_rate": 1e-05, + "loss": 0.4285, + "num_tokens": 168668586.0, + "step": 570 + }, + { + "epoch": 0.7001223990208079, + "grad_norm": 0.25232845544815063, + "learning_rate": 1e-05, + "loss": 0.4244, + "num_tokens": 169280779.0, + "step": 572 + }, + { + "epoch": 0.7025703794369645, + "grad_norm": 0.2419183850288391, + "learning_rate": 1e-05, + "loss": 0.4085, + "num_tokens": 169869109.0, + "step": 574 + }, + { + "epoch": 0.7050183598531212, + "grad_norm": 0.25120463967323303, + "learning_rate": 1e-05, + "loss": 0.4369, + "num_tokens": 170459382.0, + "step": 576 + }, + { + "epoch": 0.7074663402692778, + "grad_norm": 0.2427869439125061, + "learning_rate": 1e-05, + "loss": 0.4425, + "num_tokens": 171059799.0, + "step": 578 + }, + { + "epoch": 0.7099143206854345, + "grad_norm": 0.2565186023712158, + "learning_rate": 1e-05, + "loss": 0.4333, + "num_tokens": 171616638.0, + "step": 580 + }, + { + "epoch": 0.7123623011015912, + "grad_norm": 0.26279616355895996, + "learning_rate": 1e-05, + "loss": 0.4375, + "num_tokens": 172215523.0, + "step": 582 + }, + { + "epoch": 0.7148102815177478, + "grad_norm": 0.24042481184005737, + "learning_rate": 1e-05, + "loss": 0.4258, + "num_tokens": 172816261.0, + "step": 584 + }, + { + "epoch": 0.7172582619339045, + "grad_norm": 0.2408638894557953, + "learning_rate": 1e-05, + "loss": 0.4179, + "num_tokens": 173418219.0, + "step": 586 + }, + { + "epoch": 0.7197062423500612, + "grad_norm": 0.2416829615831375, + "learning_rate": 1e-05, + "loss": 0.43, + "num_tokens": 174001845.0, + "step": 588 + }, + { + "epoch": 0.7221542227662179, + "grad_norm": 0.25662803649902344, + "learning_rate": 1e-05, + "loss": 0.4055, + "num_tokens": 174573841.0, + "step": 590 + }, + { + "epoch": 0.7246022031823746, + "grad_norm": 0.2541915476322174, + "learning_rate": 1e-05, + "loss": 0.4271, + "num_tokens": 175157025.0, + "step": 592 + }, + { + "epoch": 0.7270501835985312, + "grad_norm": 0.23873859643936157, + "learning_rate": 1e-05, + "loss": 0.4134, + "num_tokens": 175763839.0, + "step": 594 + }, + { + "epoch": 0.7294981640146879, + "grad_norm": 0.2606620192527771, + "learning_rate": 1e-05, + "loss": 0.4215, + "num_tokens": 176340303.0, + "step": 596 + }, + { + "epoch": 0.7319461444308446, + "grad_norm": 0.2397637516260147, + "learning_rate": 1e-05, + "loss": 0.4355, + "num_tokens": 176933693.0, + "step": 598 + }, + { + "epoch": 0.7343941248470012, + "grad_norm": 0.24392768740653992, + "learning_rate": 1e-05, + "loss": 0.4267, + "num_tokens": 177551417.0, + "step": 600 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.251198947429657, + "learning_rate": 1e-05, + "loss": 0.4202, + "num_tokens": 178108215.0, + "step": 602 + }, + { + "epoch": 0.7392900856793145, + "grad_norm": 0.24769917130470276, + "learning_rate": 1e-05, + "loss": 0.4262, + "num_tokens": 178678758.0, + "step": 604 + }, + { + "epoch": 0.7417380660954712, + "grad_norm": 0.2392667829990387, + "learning_rate": 1e-05, + "loss": 0.4187, + "num_tokens": 179263018.0, + "step": 606 + }, + { + "epoch": 0.7441860465116279, + "grad_norm": 0.25212791562080383, + "learning_rate": 1e-05, + "loss": 0.4276, + "num_tokens": 179877385.0, + "step": 608 + }, + { + "epoch": 0.7466340269277846, + "grad_norm": 0.24665768444538116, + "learning_rate": 1e-05, + "loss": 0.4255, + "num_tokens": 180464341.0, + "step": 610 + }, + { + "epoch": 0.7490820073439413, + "grad_norm": 0.2613765597343445, + "learning_rate": 1e-05, + "loss": 0.4342, + "num_tokens": 181070260.0, + "step": 612 + }, + { + "epoch": 0.7515299877600979, + "grad_norm": 0.27524396777153015, + "learning_rate": 1e-05, + "loss": 0.4142, + "num_tokens": 181650173.0, + "step": 614 + }, + { + "epoch": 0.7539779681762546, + "grad_norm": 0.2363055795431137, + "learning_rate": 1e-05, + "loss": 0.4199, + "num_tokens": 182238618.0, + "step": 616 + }, + { + "epoch": 0.7564259485924113, + "grad_norm": 0.2546214759349823, + "learning_rate": 1e-05, + "loss": 0.4119, + "num_tokens": 182798204.0, + "step": 618 + }, + { + "epoch": 0.758873929008568, + "grad_norm": 0.2432137429714203, + "learning_rate": 1e-05, + "loss": 0.4195, + "num_tokens": 183380029.0, + "step": 620 + }, + { + "epoch": 0.7613219094247246, + "grad_norm": 0.258290559053421, + "learning_rate": 1e-05, + "loss": 0.4282, + "num_tokens": 183966255.0, + "step": 622 + }, + { + "epoch": 0.7637698898408812, + "grad_norm": 0.23586836457252502, + "learning_rate": 1e-05, + "loss": 0.4352, + "num_tokens": 184583415.0, + "step": 624 + }, + { + "epoch": 0.7662178702570379, + "grad_norm": 0.23846642673015594, + "learning_rate": 1e-05, + "loss": 0.4328, + "num_tokens": 185197955.0, + "step": 626 + }, + { + "epoch": 0.7686658506731946, + "grad_norm": 0.24626384675502777, + "learning_rate": 1e-05, + "loss": 0.4109, + "num_tokens": 185785557.0, + "step": 628 + }, + { + "epoch": 0.7711138310893513, + "grad_norm": 0.24848531186580658, + "learning_rate": 1e-05, + "loss": 0.4223, + "num_tokens": 186356034.0, + "step": 630 + }, + { + "epoch": 0.773561811505508, + "grad_norm": 0.26634329557418823, + "learning_rate": 1e-05, + "loss": 0.4297, + "num_tokens": 186944825.0, + "step": 632 + }, + { + "epoch": 0.7760097919216646, + "grad_norm": 0.23417183756828308, + "learning_rate": 1e-05, + "loss": 0.4297, + "num_tokens": 187555120.0, + "step": 634 + }, + { + "epoch": 0.7784577723378213, + "grad_norm": 0.2514715790748596, + "learning_rate": 1e-05, + "loss": 0.4315, + "num_tokens": 188158111.0, + "step": 636 + }, + { + "epoch": 0.780905752753978, + "grad_norm": 0.2464294582605362, + "learning_rate": 1e-05, + "loss": 0.4341, + "num_tokens": 188752896.0, + "step": 638 + }, + { + "epoch": 0.7833537331701347, + "grad_norm": 0.2839301526546478, + "learning_rate": 1e-05, + "loss": 0.4315, + "num_tokens": 189348219.0, + "step": 640 + }, + { + "epoch": 0.7858017135862914, + "grad_norm": 0.2621495723724365, + "learning_rate": 1e-05, + "loss": 0.4151, + "num_tokens": 189894564.0, + "step": 642 + }, + { + "epoch": 0.7882496940024479, + "grad_norm": 0.23994684219360352, + "learning_rate": 1e-05, + "loss": 0.4329, + "num_tokens": 190529171.0, + "step": 644 + }, + { + "epoch": 0.7906976744186046, + "grad_norm": 0.23574483394622803, + "learning_rate": 1e-05, + "loss": 0.4215, + "num_tokens": 191131915.0, + "step": 646 + }, + { + "epoch": 0.7931456548347613, + "grad_norm": 0.2408006489276886, + "learning_rate": 1e-05, + "loss": 0.435, + "num_tokens": 191724725.0, + "step": 648 + }, + { + "epoch": 0.795593635250918, + "grad_norm": 0.23889918625354767, + "learning_rate": 1e-05, + "loss": 0.43, + "num_tokens": 192336245.0, + "step": 650 + }, + { + "epoch": 0.7980416156670747, + "grad_norm": 0.23778203129768372, + "learning_rate": 1e-05, + "loss": 0.4207, + "num_tokens": 192923779.0, + "step": 652 + }, + { + "epoch": 0.8004895960832313, + "grad_norm": 0.2521960437297821, + "learning_rate": 1e-05, + "loss": 0.4057, + "num_tokens": 193499166.0, + "step": 654 + }, + { + "epoch": 0.802937576499388, + "grad_norm": 0.2429361194372177, + "learning_rate": 1e-05, + "loss": 0.4242, + "num_tokens": 194104195.0, + "step": 656 + }, + { + "epoch": 0.8053855569155447, + "grad_norm": 0.23911674320697784, + "learning_rate": 1e-05, + "loss": 0.4326, + "num_tokens": 194700389.0, + "step": 658 + }, + { + "epoch": 0.8078335373317014, + "grad_norm": 0.24030958116054535, + "learning_rate": 1e-05, + "loss": 0.432, + "num_tokens": 195293054.0, + "step": 660 + }, + { + "epoch": 0.8102815177478581, + "grad_norm": 0.23353174328804016, + "learning_rate": 1e-05, + "loss": 0.4122, + "num_tokens": 195898402.0, + "step": 662 + }, + { + "epoch": 0.8127294981640147, + "grad_norm": 0.2420521378517151, + "learning_rate": 1e-05, + "loss": 0.4191, + "num_tokens": 196479623.0, + "step": 664 + }, + { + "epoch": 0.8151774785801713, + "grad_norm": 0.24901549518108368, + "learning_rate": 1e-05, + "loss": 0.4169, + "num_tokens": 197025340.0, + "step": 666 + }, + { + "epoch": 0.817625458996328, + "grad_norm": 0.24542152881622314, + "learning_rate": 1e-05, + "loss": 0.4157, + "num_tokens": 197594800.0, + "step": 668 + }, + { + "epoch": 0.8200734394124847, + "grad_norm": 0.24356816709041595, + "learning_rate": 1e-05, + "loss": 0.4236, + "num_tokens": 198193576.0, + "step": 670 + }, + { + "epoch": 0.8225214198286414, + "grad_norm": 0.24126183986663818, + "learning_rate": 1e-05, + "loss": 0.4176, + "num_tokens": 198803995.0, + "step": 672 + }, + { + "epoch": 0.824969400244798, + "grad_norm": 0.24375128746032715, + "learning_rate": 1e-05, + "loss": 0.4215, + "num_tokens": 199416611.0, + "step": 674 + }, + { + "epoch": 0.8274173806609547, + "grad_norm": 0.25136980414390564, + "learning_rate": 1e-05, + "loss": 0.4129, + "num_tokens": 200007315.0, + "step": 676 + }, + { + "epoch": 0.8298653610771114, + "grad_norm": 0.23422685265541077, + "learning_rate": 1e-05, + "loss": 0.4179, + "num_tokens": 200608406.0, + "step": 678 + }, + { + "epoch": 0.8323133414932681, + "grad_norm": 0.24444159865379333, + "learning_rate": 1e-05, + "loss": 0.4317, + "num_tokens": 201208706.0, + "step": 680 + }, + { + "epoch": 0.8347613219094248, + "grad_norm": 0.23103386163711548, + "learning_rate": 1e-05, + "loss": 0.3971, + "num_tokens": 201783927.0, + "step": 682 + }, + { + "epoch": 0.8372093023255814, + "grad_norm": 0.25669893622398376, + "learning_rate": 1e-05, + "loss": 0.4295, + "num_tokens": 202380174.0, + "step": 684 + }, + { + "epoch": 0.8396572827417381, + "grad_norm": 0.2663831114768982, + "learning_rate": 1e-05, + "loss": 0.4274, + "num_tokens": 202985735.0, + "step": 686 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.2565767467021942, + "learning_rate": 1e-05, + "loss": 0.4352, + "num_tokens": 203582111.0, + "step": 688 + }, + { + "epoch": 0.8445532435740514, + "grad_norm": 0.2389577478170395, + "learning_rate": 1e-05, + "loss": 0.4138, + "num_tokens": 204153573.0, + "step": 690 + }, + { + "epoch": 0.847001223990208, + "grad_norm": 0.2350880652666092, + "learning_rate": 1e-05, + "loss": 0.4168, + "num_tokens": 204763271.0, + "step": 692 + }, + { + "epoch": 0.8494492044063647, + "grad_norm": 0.25765883922576904, + "learning_rate": 1e-05, + "loss": 0.4185, + "num_tokens": 205341916.0, + "step": 694 + }, + { + "epoch": 0.8518971848225214, + "grad_norm": 0.2440985143184662, + "learning_rate": 1e-05, + "loss": 0.4362, + "num_tokens": 205924333.0, + "step": 696 + }, + { + "epoch": 0.8543451652386781, + "grad_norm": 0.2437448501586914, + "learning_rate": 1e-05, + "loss": 0.4238, + "num_tokens": 206489074.0, + "step": 698 + }, + { + "epoch": 0.8567931456548348, + "grad_norm": 0.3054177165031433, + "learning_rate": 1e-05, + "loss": 0.4179, + "num_tokens": 207076635.0, + "step": 700 + }, + { + "epoch": 0.8592411260709915, + "grad_norm": 0.2562294006347656, + "learning_rate": 1e-05, + "loss": 0.4297, + "num_tokens": 207679550.0, + "step": 702 + }, + { + "epoch": 0.8616891064871481, + "grad_norm": 0.25215739011764526, + "learning_rate": 1e-05, + "loss": 0.4262, + "num_tokens": 208285797.0, + "step": 704 + }, + { + "epoch": 0.8641370869033048, + "grad_norm": 0.25319892168045044, + "learning_rate": 1e-05, + "loss": 0.4268, + "num_tokens": 208900402.0, + "step": 706 + }, + { + "epoch": 0.8665850673194615, + "grad_norm": 0.22704288363456726, + "learning_rate": 1e-05, + "loss": 0.4133, + "num_tokens": 209478996.0, + "step": 708 + }, + { + "epoch": 0.8690330477356181, + "grad_norm": 0.23838220536708832, + "learning_rate": 1e-05, + "loss": 0.413, + "num_tokens": 210061792.0, + "step": 710 + }, + { + "epoch": 0.8714810281517748, + "grad_norm": 0.25101152062416077, + "learning_rate": 1e-05, + "loss": 0.4195, + "num_tokens": 210619894.0, + "step": 712 + }, + { + "epoch": 0.8739290085679314, + "grad_norm": 0.2533038258552551, + "learning_rate": 1e-05, + "loss": 0.4198, + "num_tokens": 211195236.0, + "step": 714 + }, + { + "epoch": 0.8763769889840881, + "grad_norm": 0.24252592027187347, + "learning_rate": 1e-05, + "loss": 0.4141, + "num_tokens": 211757983.0, + "step": 716 + }, + { + "epoch": 0.8788249694002448, + "grad_norm": 0.2420939952135086, + "learning_rate": 1e-05, + "loss": 0.4213, + "num_tokens": 212332438.0, + "step": 718 + }, + { + "epoch": 0.8812729498164015, + "grad_norm": 0.23020204901695251, + "learning_rate": 1e-05, + "loss": 0.4111, + "num_tokens": 212936652.0, + "step": 720 + }, + { + "epoch": 0.8837209302325582, + "grad_norm": 0.24056395888328552, + "learning_rate": 1e-05, + "loss": 0.4047, + "num_tokens": 213505407.0, + "step": 722 + }, + { + "epoch": 0.8861689106487148, + "grad_norm": 0.24601121246814728, + "learning_rate": 1e-05, + "loss": 0.4197, + "num_tokens": 214094524.0, + "step": 724 + }, + { + "epoch": 0.8886168910648715, + "grad_norm": 0.23350679874420166, + "learning_rate": 1e-05, + "loss": 0.398, + "num_tokens": 214670619.0, + "step": 726 + }, + { + "epoch": 0.8910648714810282, + "grad_norm": 0.23900729417800903, + "learning_rate": 1e-05, + "loss": 0.4114, + "num_tokens": 215236751.0, + "step": 728 + }, + { + "epoch": 0.8935128518971848, + "grad_norm": 0.243704155087471, + "learning_rate": 1e-05, + "loss": 0.4131, + "num_tokens": 215828751.0, + "step": 730 + }, + { + "epoch": 0.8959608323133414, + "grad_norm": 0.23456545174121857, + "learning_rate": 1e-05, + "loss": 0.4158, + "num_tokens": 216449250.0, + "step": 732 + }, + { + "epoch": 0.8984088127294981, + "grad_norm": 0.23414072394371033, + "learning_rate": 1e-05, + "loss": 0.4121, + "num_tokens": 217044781.0, + "step": 734 + }, + { + "epoch": 0.9008567931456548, + "grad_norm": 0.2433299720287323, + "learning_rate": 1e-05, + "loss": 0.4114, + "num_tokens": 217611996.0, + "step": 736 + }, + { + "epoch": 0.9033047735618115, + "grad_norm": 0.2463146448135376, + "learning_rate": 1e-05, + "loss": 0.4068, + "num_tokens": 218232059.0, + "step": 738 + }, + { + "epoch": 0.9057527539779682, + "grad_norm": 0.7880977392196655, + "learning_rate": 1e-05, + "loss": 0.4441, + "num_tokens": 218810653.0, + "step": 740 + }, + { + "epoch": 0.9082007343941249, + "grad_norm": 0.241953507065773, + "learning_rate": 1e-05, + "loss": 0.4087, + "num_tokens": 219409815.0, + "step": 742 + }, + { + "epoch": 0.9106487148102815, + "grad_norm": 0.2762121260166168, + "learning_rate": 1e-05, + "loss": 0.4228, + "num_tokens": 220016419.0, + "step": 744 + }, + { + "epoch": 0.9130966952264382, + "grad_norm": 0.2605133354663849, + "learning_rate": 1e-05, + "loss": 0.4168, + "num_tokens": 220599687.0, + "step": 746 + }, + { + "epoch": 0.9155446756425949, + "grad_norm": 0.24054831266403198, + "learning_rate": 1e-05, + "loss": 0.4181, + "num_tokens": 221170683.0, + "step": 748 + }, + { + "epoch": 0.9179926560587516, + "grad_norm": 0.2439662516117096, + "learning_rate": 1e-05, + "loss": 0.4175, + "num_tokens": 221764282.0, + "step": 750 + }, + { + "epoch": 0.9204406364749081, + "grad_norm": 0.23831577599048615, + "learning_rate": 1e-05, + "loss": 0.4228, + "num_tokens": 222383973.0, + "step": 752 + }, + { + "epoch": 0.9228886168910648, + "grad_norm": 0.24441011250019073, + "learning_rate": 1e-05, + "loss": 0.4143, + "num_tokens": 222981657.0, + "step": 754 + }, + { + "epoch": 0.9253365973072215, + "grad_norm": 0.2541545033454895, + "learning_rate": 1e-05, + "loss": 0.4263, + "num_tokens": 223580164.0, + "step": 756 + }, + { + "epoch": 0.9277845777233782, + "grad_norm": 0.23410865664482117, + "learning_rate": 1e-05, + "loss": 0.4142, + "num_tokens": 224181708.0, + "step": 758 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 0.24794194102287292, + "learning_rate": 1e-05, + "loss": 0.424, + "num_tokens": 224750929.0, + "step": 760 + }, + { + "epoch": 0.9326805385556916, + "grad_norm": 0.23957248032093048, + "learning_rate": 1e-05, + "loss": 0.4192, + "num_tokens": 225353399.0, + "step": 762 + }, + { + "epoch": 0.9351285189718482, + "grad_norm": 0.2275751829147339, + "learning_rate": 1e-05, + "loss": 0.4283, + "num_tokens": 225946912.0, + "step": 764 + }, + { + "epoch": 0.9375764993880049, + "grad_norm": 0.24257154762744904, + "learning_rate": 1e-05, + "loss": 0.4088, + "num_tokens": 226513122.0, + "step": 766 + }, + { + "epoch": 0.9400244798041616, + "grad_norm": 0.2594261169433594, + "learning_rate": 1e-05, + "loss": 0.4149, + "num_tokens": 227097275.0, + "step": 768 + }, + { + "epoch": 0.9424724602203183, + "grad_norm": 0.24188987910747528, + "learning_rate": 1e-05, + "loss": 0.4173, + "num_tokens": 227685155.0, + "step": 770 + }, + { + "epoch": 0.944920440636475, + "grad_norm": 0.2449246346950531, + "learning_rate": 1e-05, + "loss": 0.41, + "num_tokens": 228268933.0, + "step": 772 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.2591334581375122, + "learning_rate": 1e-05, + "loss": 0.4098, + "num_tokens": 228844822.0, + "step": 774 + }, + { + "epoch": 0.9498164014687882, + "grad_norm": 0.25259289145469666, + "learning_rate": 1e-05, + "loss": 0.4149, + "num_tokens": 229413402.0, + "step": 776 + }, + { + "epoch": 0.9522643818849449, + "grad_norm": 0.24534189701080322, + "learning_rate": 1e-05, + "loss": 0.4095, + "num_tokens": 229957992.0, + "step": 778 + }, + { + "epoch": 0.9547123623011016, + "grad_norm": 0.25992926955223083, + "learning_rate": 1e-05, + "loss": 0.4259, + "num_tokens": 230534148.0, + "step": 780 + }, + { + "epoch": 0.9571603427172583, + "grad_norm": 0.23857857286930084, + "learning_rate": 1e-05, + "loss": 0.4054, + "num_tokens": 231127999.0, + "step": 782 + }, + { + "epoch": 0.9596083231334149, + "grad_norm": 0.2835080921649933, + "learning_rate": 1e-05, + "loss": 0.4205, + "num_tokens": 231731253.0, + "step": 784 + }, + { + "epoch": 0.9620563035495716, + "grad_norm": 0.2432568073272705, + "learning_rate": 1e-05, + "loss": 0.4165, + "num_tokens": 232342748.0, + "step": 786 + }, + { + "epoch": 0.9645042839657283, + "grad_norm": 0.23912744224071503, + "learning_rate": 1e-05, + "loss": 0.4089, + "num_tokens": 232934775.0, + "step": 788 + }, + { + "epoch": 0.966952264381885, + "grad_norm": 0.2454313039779663, + "learning_rate": 1e-05, + "loss": 0.419, + "num_tokens": 233502064.0, + "step": 790 + }, + { + "epoch": 0.9694002447980417, + "grad_norm": 0.2457619458436966, + "learning_rate": 1e-05, + "loss": 0.4112, + "num_tokens": 234073172.0, + "step": 792 + }, + { + "epoch": 0.9718482252141983, + "grad_norm": 0.2537059485912323, + "learning_rate": 1e-05, + "loss": 0.4272, + "num_tokens": 234661532.0, + "step": 794 + }, + { + "epoch": 0.9742962056303549, + "grad_norm": 0.24286150932312012, + "learning_rate": 1e-05, + "loss": 0.4301, + "num_tokens": 235274908.0, + "step": 796 + }, + { + "epoch": 0.9767441860465116, + "grad_norm": 0.2509307265281677, + "learning_rate": 1e-05, + "loss": 0.4278, + "num_tokens": 235855663.0, + "step": 798 + }, + { + "epoch": 0.9791921664626683, + "grad_norm": 0.2525811493396759, + "learning_rate": 1e-05, + "loss": 0.399, + "num_tokens": 236428558.0, + "step": 800 + }, + { + "epoch": 0.981640146878825, + "grad_norm": 0.23528246581554413, + "learning_rate": 1e-05, + "loss": 0.4187, + "num_tokens": 237023018.0, + "step": 802 + }, + { + "epoch": 0.9840881272949816, + "grad_norm": 0.25735700130462646, + "learning_rate": 1e-05, + "loss": 0.4158, + "num_tokens": 237611598.0, + "step": 804 + }, + { + "epoch": 0.9865361077111383, + "grad_norm": 0.23932790756225586, + "learning_rate": 1e-05, + "loss": 0.4072, + "num_tokens": 238214543.0, + "step": 806 + }, + { + "epoch": 0.988984088127295, + "grad_norm": 0.2567075788974762, + "learning_rate": 1e-05, + "loss": 0.4149, + "num_tokens": 238786408.0, + "step": 808 + }, + { + "epoch": 0.9914320685434517, + "grad_norm": 0.25353989005088806, + "learning_rate": 1e-05, + "loss": 0.4115, + "num_tokens": 239363151.0, + "step": 810 + }, + { + "epoch": 0.9938800489596084, + "grad_norm": 0.2540046274662018, + "learning_rate": 1e-05, + "loss": 0.4177, + "num_tokens": 239967829.0, + "step": 812 + }, + { + "epoch": 0.996328029375765, + "grad_norm": 0.24277007579803467, + "learning_rate": 1e-05, + "loss": 0.405, + "num_tokens": 240551191.0, + "step": 814 + }, + { + "epoch": 0.9987760097919217, + "grad_norm": 0.24569077789783478, + "learning_rate": 1e-05, + "loss": 0.4205, + "num_tokens": 241153107.0, + "step": 816 + }, + { + "epoch": 1.0012239902080784, + "grad_norm": 0.2544793486595154, + "learning_rate": 1e-05, + "loss": 0.3973, + "num_tokens": 241769492.0, + "step": 818 + }, + { + "epoch": 1.003671970624235, + "grad_norm": 0.2589578330516815, + "learning_rate": 1e-05, + "loss": 0.3779, + "num_tokens": 242359313.0, + "step": 820 + }, + { + "epoch": 1.0061199510403918, + "grad_norm": 0.27613627910614014, + "learning_rate": 1e-05, + "loss": 0.3709, + "num_tokens": 242911445.0, + "step": 822 + }, + { + "epoch": 1.0085679314565483, + "grad_norm": 0.3414134085178375, + "learning_rate": 1e-05, + "loss": 0.3848, + "num_tokens": 243515905.0, + "step": 824 + }, + { + "epoch": 1.0110159118727051, + "grad_norm": 0.26070353388786316, + "learning_rate": 1e-05, + "loss": 0.3819, + "num_tokens": 244085094.0, + "step": 826 + }, + { + "epoch": 1.0134638922888617, + "grad_norm": 0.2581327259540558, + "learning_rate": 1e-05, + "loss": 0.3962, + "num_tokens": 244710189.0, + "step": 828 + }, + { + "epoch": 1.0159118727050183, + "grad_norm": 0.23694124817848206, + "learning_rate": 1e-05, + "loss": 0.3735, + "num_tokens": 245302719.0, + "step": 830 + }, + { + "epoch": 1.018359853121175, + "grad_norm": 0.2756808400154114, + "learning_rate": 1e-05, + "loss": 0.3949, + "num_tokens": 245899907.0, + "step": 832 + }, + { + "epoch": 1.0208078335373316, + "grad_norm": 0.2444760948419571, + "learning_rate": 1e-05, + "loss": 0.3905, + "num_tokens": 246514507.0, + "step": 834 + }, + { + "epoch": 1.0232558139534884, + "grad_norm": 0.2670097053050995, + "learning_rate": 1e-05, + "loss": 0.3762, + "num_tokens": 247121076.0, + "step": 836 + }, + { + "epoch": 1.025703794369645, + "grad_norm": 0.24394531548023224, + "learning_rate": 1e-05, + "loss": 0.3718, + "num_tokens": 247731957.0, + "step": 838 + }, + { + "epoch": 1.0281517747858018, + "grad_norm": 0.2526130676269531, + "learning_rate": 1e-05, + "loss": 0.3777, + "num_tokens": 248333325.0, + "step": 840 + }, + { + "epoch": 1.0305997552019583, + "grad_norm": 0.25735199451446533, + "learning_rate": 1e-05, + "loss": 0.3771, + "num_tokens": 248929993.0, + "step": 842 + }, + { + "epoch": 1.0330477356181151, + "grad_norm": 0.2488754689693451, + "learning_rate": 1e-05, + "loss": 0.3633, + "num_tokens": 249530707.0, + "step": 844 + }, + { + "epoch": 1.0354957160342717, + "grad_norm": 0.238496333360672, + "learning_rate": 1e-05, + "loss": 0.3827, + "num_tokens": 250144988.0, + "step": 846 + }, + { + "epoch": 1.0379436964504285, + "grad_norm": 0.24877411127090454, + "learning_rate": 1e-05, + "loss": 0.3853, + "num_tokens": 250726834.0, + "step": 848 + }, + { + "epoch": 1.040391676866585, + "grad_norm": 0.24590681493282318, + "learning_rate": 1e-05, + "loss": 0.3866, + "num_tokens": 251338286.0, + "step": 850 + }, + { + "epoch": 1.0428396572827416, + "grad_norm": 0.2483719140291214, + "learning_rate": 1e-05, + "loss": 0.3809, + "num_tokens": 251938256.0, + "step": 852 + }, + { + "epoch": 1.0452876376988984, + "grad_norm": 0.23960165679454803, + "learning_rate": 1e-05, + "loss": 0.3699, + "num_tokens": 252528052.0, + "step": 854 + }, + { + "epoch": 1.047735618115055, + "grad_norm": 0.2561052739620209, + "learning_rate": 1e-05, + "loss": 0.3716, + "num_tokens": 253131838.0, + "step": 856 + }, + { + "epoch": 1.0501835985312118, + "grad_norm": 0.2708950638771057, + "learning_rate": 1e-05, + "loss": 0.368, + "num_tokens": 253683217.0, + "step": 858 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.2559908926486969, + "learning_rate": 1e-05, + "loss": 0.3847, + "num_tokens": 254305603.0, + "step": 860 + }, + { + "epoch": 1.0550795593635252, + "grad_norm": 0.26056236028671265, + "learning_rate": 1e-05, + "loss": 0.3747, + "num_tokens": 277568.0, + "step": 862 + }, + { + "epoch": 1.0575275397796817, + "grad_norm": 0.2658675014972687, + "learning_rate": 1e-05, + "loss": 0.3823, + "num_tokens": 848750.0, + "step": 864 + }, + { + "epoch": 1.0599755201958385, + "grad_norm": 0.249778613448143, + "learning_rate": 1e-05, + "loss": 0.3693, + "num_tokens": 1452635.0, + "step": 866 + }, + { + "epoch": 1.062423500611995, + "grad_norm": 0.2459205985069275, + "learning_rate": 1e-05, + "loss": 0.3741, + "num_tokens": 2029158.0, + "step": 868 + }, + { + "epoch": 1.0648714810281519, + "grad_norm": 0.2944379448890686, + "learning_rate": 1e-05, + "loss": 0.3849, + "num_tokens": 2637054.0, + "step": 870 + }, + { + "epoch": 1.0673194614443084, + "grad_norm": 0.2451840043067932, + "learning_rate": 1e-05, + "loss": 0.3775, + "num_tokens": 3232747.0, + "step": 872 + }, + { + "epoch": 1.069767441860465, + "grad_norm": 0.26428672671318054, + "learning_rate": 1e-05, + "loss": 0.3711, + "num_tokens": 3822434.0, + "step": 874 + }, + { + "epoch": 1.0722154222766218, + "grad_norm": 0.2651713192462921, + "learning_rate": 1e-05, + "loss": 0.3816, + "num_tokens": 4391500.0, + "step": 876 + }, + { + "epoch": 1.0746634026927784, + "grad_norm": 0.2486201673746109, + "learning_rate": 1e-05, + "loss": 0.3824, + "num_tokens": 4979531.0, + "step": 878 + }, + { + "epoch": 1.0771113831089352, + "grad_norm": 0.25607433915138245, + "learning_rate": 1e-05, + "loss": 0.384, + "num_tokens": 5584377.0, + "step": 880 + }, + { + "epoch": 1.0795593635250917, + "grad_norm": 0.24247251451015472, + "learning_rate": 1e-05, + "loss": 0.3906, + "num_tokens": 6190943.0, + "step": 882 + }, + { + "epoch": 1.0820073439412485, + "grad_norm": 0.25798070430755615, + "learning_rate": 1e-05, + "loss": 0.3754, + "num_tokens": 6789570.0, + "step": 884 + }, + { + "epoch": 1.084455324357405, + "grad_norm": 0.25158044695854187, + "learning_rate": 1e-05, + "loss": 0.3723, + "num_tokens": 7373742.0, + "step": 886 + }, + { + "epoch": 1.086903304773562, + "grad_norm": 0.2608836889266968, + "learning_rate": 1e-05, + "loss": 0.3776, + "num_tokens": 7972262.0, + "step": 888 + }, + { + "epoch": 1.0893512851897185, + "grad_norm": 0.2598704397678375, + "learning_rate": 1e-05, + "loss": 0.3771, + "num_tokens": 8547155.0, + "step": 890 + }, + { + "epoch": 1.091799265605875, + "grad_norm": 0.24557508528232574, + "learning_rate": 1e-05, + "loss": 0.3902, + "num_tokens": 9120559.0, + "step": 892 + }, + { + "epoch": 1.0942472460220318, + "grad_norm": 0.2631266117095947, + "learning_rate": 1e-05, + "loss": 0.3795, + "num_tokens": 9683298.0, + "step": 894 + }, + { + "epoch": 1.0966952264381884, + "grad_norm": 0.24435891211032867, + "learning_rate": 1e-05, + "loss": 0.3777, + "num_tokens": 10279511.0, + "step": 896 + }, + { + "epoch": 1.0991432068543452, + "grad_norm": 0.249556764960289, + "learning_rate": 1e-05, + "loss": 0.3775, + "num_tokens": 10880149.0, + "step": 898 + }, + { + "epoch": 1.1015911872705018, + "grad_norm": 0.2669646739959717, + "learning_rate": 1e-05, + "loss": 0.393, + "num_tokens": 11490560.0, + "step": 900 + }, + { + "epoch": 1.1040391676866586, + "grad_norm": 0.2627948224544525, + "learning_rate": 1e-05, + "loss": 0.4063, + "num_tokens": 12087747.0, + "step": 902 + }, + { + "epoch": 1.1064871481028151, + "grad_norm": 0.2501852214336395, + "learning_rate": 1e-05, + "loss": 0.3807, + "num_tokens": 12661934.0, + "step": 904 + }, + { + "epoch": 1.108935128518972, + "grad_norm": 0.26337432861328125, + "learning_rate": 1e-05, + "loss": 0.3849, + "num_tokens": 13261711.0, + "step": 906 + }, + { + "epoch": 1.1113831089351285, + "grad_norm": 0.25716322660446167, + "learning_rate": 1e-05, + "loss": 0.3869, + "num_tokens": 13872528.0, + "step": 908 + }, + { + "epoch": 1.1138310893512853, + "grad_norm": 0.25698763132095337, + "learning_rate": 1e-05, + "loss": 0.3818, + "num_tokens": 14445102.0, + "step": 910 + }, + { + "epoch": 1.1162790697674418, + "grad_norm": 0.25583845376968384, + "learning_rate": 1e-05, + "loss": 0.3695, + "num_tokens": 15037648.0, + "step": 912 + }, + { + "epoch": 1.1187270501835984, + "grad_norm": 0.2765620946884155, + "learning_rate": 1e-05, + "loss": 0.3632, + "num_tokens": 15631668.0, + "step": 914 + }, + { + "epoch": 1.1211750305997552, + "grad_norm": 0.2591513991355896, + "learning_rate": 1e-05, + "loss": 0.377, + "num_tokens": 16207356.0, + "step": 916 + }, + { + "epoch": 1.1236230110159118, + "grad_norm": 0.2652893364429474, + "learning_rate": 1e-05, + "loss": 0.3572, + "num_tokens": 16776274.0, + "step": 918 + }, + { + "epoch": 1.1260709914320686, + "grad_norm": 0.2598043978214264, + "learning_rate": 1e-05, + "loss": 0.3803, + "num_tokens": 17370079.0, + "step": 920 + }, + { + "epoch": 1.1285189718482251, + "grad_norm": 0.2437954545021057, + "learning_rate": 1e-05, + "loss": 0.3718, + "num_tokens": 17947221.0, + "step": 922 + }, + { + "epoch": 1.130966952264382, + "grad_norm": 0.2446569800376892, + "learning_rate": 1e-05, + "loss": 0.3641, + "num_tokens": 18544345.0, + "step": 924 + }, + { + "epoch": 1.1334149326805385, + "grad_norm": 0.24647963047027588, + "learning_rate": 1e-05, + "loss": 0.3663, + "num_tokens": 19118029.0, + "step": 926 + }, + { + "epoch": 1.1358629130966953, + "grad_norm": 0.2526357173919678, + "learning_rate": 1e-05, + "loss": 0.3664, + "num_tokens": 19699461.0, + "step": 928 + }, + { + "epoch": 1.1383108935128519, + "grad_norm": 0.2365649789571762, + "learning_rate": 1e-05, + "loss": 0.3709, + "num_tokens": 20296793.0, + "step": 930 + }, + { + "epoch": 1.1407588739290087, + "grad_norm": 0.24821361899375916, + "learning_rate": 1e-05, + "loss": 0.3739, + "num_tokens": 20880960.0, + "step": 932 + }, + { + "epoch": 1.1432068543451652, + "grad_norm": 0.2682252824306488, + "learning_rate": 1e-05, + "loss": 0.3803, + "num_tokens": 21478802.0, + "step": 934 + }, + { + "epoch": 1.1456548347613218, + "grad_norm": 0.24566704034805298, + "learning_rate": 1e-05, + "loss": 0.3709, + "num_tokens": 22078568.0, + "step": 936 + }, + { + "epoch": 1.1481028151774786, + "grad_norm": 0.2511333227157593, + "learning_rate": 1e-05, + "loss": 0.3849, + "num_tokens": 22668790.0, + "step": 938 + }, + { + "epoch": 1.1505507955936352, + "grad_norm": 0.24181029200553894, + "learning_rate": 1e-05, + "loss": 0.3667, + "num_tokens": 23247142.0, + "step": 940 + }, + { + "epoch": 1.152998776009792, + "grad_norm": 0.24866792559623718, + "learning_rate": 1e-05, + "loss": 0.3793, + "num_tokens": 23816149.0, + "step": 942 + }, + { + "epoch": 1.1554467564259485, + "grad_norm": 0.2545630633831024, + "learning_rate": 1e-05, + "loss": 0.3809, + "num_tokens": 24404560.0, + "step": 944 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 0.25114840269088745, + "learning_rate": 1e-05, + "loss": 0.3847, + "num_tokens": 24979236.0, + "step": 946 + }, + { + "epoch": 1.1603427172582619, + "grad_norm": 0.2634119391441345, + "learning_rate": 1e-05, + "loss": 0.3842, + "num_tokens": 25563695.0, + "step": 948 + }, + { + "epoch": 1.1627906976744187, + "grad_norm": 0.255024254322052, + "learning_rate": 1e-05, + "loss": 0.3926, + "num_tokens": 26161113.0, + "step": 950 + }, + { + "epoch": 1.1652386780905752, + "grad_norm": 0.24087129533290863, + "learning_rate": 1e-05, + "loss": 0.3871, + "num_tokens": 26767525.0, + "step": 952 + }, + { + "epoch": 1.167686658506732, + "grad_norm": 0.2511006295681, + "learning_rate": 1e-05, + "loss": 0.385, + "num_tokens": 27347454.0, + "step": 954 + }, + { + "epoch": 1.1701346389228886, + "grad_norm": 0.2362564653158188, + "learning_rate": 1e-05, + "loss": 0.3751, + "num_tokens": 27955641.0, + "step": 956 + }, + { + "epoch": 1.1725826193390452, + "grad_norm": 0.24775467813014984, + "learning_rate": 1e-05, + "loss": 0.3759, + "num_tokens": 28559267.0, + "step": 958 + }, + { + "epoch": 1.175030599755202, + "grad_norm": 0.24708232283592224, + "learning_rate": 1e-05, + "loss": 0.3797, + "num_tokens": 29152270.0, + "step": 960 + }, + { + "epoch": 1.1774785801713585, + "grad_norm": 0.2411796748638153, + "learning_rate": 1e-05, + "loss": 0.382, + "num_tokens": 29749611.0, + "step": 962 + }, + { + "epoch": 1.1799265605875153, + "grad_norm": 0.24558380246162415, + "learning_rate": 1e-05, + "loss": 0.3836, + "num_tokens": 30348108.0, + "step": 964 + }, + { + "epoch": 1.182374541003672, + "grad_norm": 0.24171082675457, + "learning_rate": 1e-05, + "loss": 0.3871, + "num_tokens": 30934012.0, + "step": 966 + }, + { + "epoch": 1.1848225214198287, + "grad_norm": 0.254991352558136, + "learning_rate": 1e-05, + "loss": 0.3846, + "num_tokens": 31530616.0, + "step": 968 + }, + { + "epoch": 1.1872705018359853, + "grad_norm": 0.2404201775789261, + "learning_rate": 1e-05, + "loss": 0.3771, + "num_tokens": 32106431.0, + "step": 970 + }, + { + "epoch": 1.189718482252142, + "grad_norm": 0.2498648315668106, + "learning_rate": 1e-05, + "loss": 0.3743, + "num_tokens": 32719129.0, + "step": 972 + }, + { + "epoch": 1.1921664626682986, + "grad_norm": 0.32168230414390564, + "learning_rate": 1e-05, + "loss": 0.3615, + "num_tokens": 33274739.0, + "step": 974 + }, + { + "epoch": 1.1946144430844554, + "grad_norm": 0.24494768679141998, + "learning_rate": 1e-05, + "loss": 0.3742, + "num_tokens": 33851163.0, + "step": 976 + }, + { + "epoch": 1.197062423500612, + "grad_norm": 0.24181753396987915, + "learning_rate": 1e-05, + "loss": 0.3779, + "num_tokens": 34434320.0, + "step": 978 + }, + { + "epoch": 1.1995104039167686, + "grad_norm": 0.2651110291481018, + "learning_rate": 1e-05, + "loss": 0.3792, + "num_tokens": 35003003.0, + "step": 980 + }, + { + "epoch": 1.2019583843329253, + "grad_norm": 0.26116904616355896, + "learning_rate": 1e-05, + "loss": 0.3747, + "num_tokens": 35550284.0, + "step": 982 + }, + { + "epoch": 1.204406364749082, + "grad_norm": 0.24539689719676971, + "learning_rate": 1e-05, + "loss": 0.3575, + "num_tokens": 36114596.0, + "step": 984 + }, + { + "epoch": 1.2068543451652387, + "grad_norm": 0.2678145170211792, + "learning_rate": 1e-05, + "loss": 0.3888, + "num_tokens": 36694709.0, + "step": 986 + }, + { + "epoch": 1.2093023255813953, + "grad_norm": 0.25375595688819885, + "learning_rate": 1e-05, + "loss": 0.3894, + "num_tokens": 37270293.0, + "step": 988 + }, + { + "epoch": 1.211750305997552, + "grad_norm": 0.23707690834999084, + "learning_rate": 1e-05, + "loss": 0.3794, + "num_tokens": 37871657.0, + "step": 990 + }, + { + "epoch": 1.2141982864137086, + "grad_norm": 0.2559983730316162, + "learning_rate": 1e-05, + "loss": 0.3762, + "num_tokens": 38425602.0, + "step": 992 + }, + { + "epoch": 1.2166462668298654, + "grad_norm": 0.2463446706533432, + "learning_rate": 1e-05, + "loss": 0.3641, + "num_tokens": 39013285.0, + "step": 994 + }, + { + "epoch": 1.219094247246022, + "grad_norm": 0.2542133629322052, + "learning_rate": 1e-05, + "loss": 0.3786, + "num_tokens": 39609316.0, + "step": 996 + }, + { + "epoch": 1.2215422276621788, + "grad_norm": 0.24676287174224854, + "learning_rate": 1e-05, + "loss": 0.3767, + "num_tokens": 40176004.0, + "step": 998 + }, + { + "epoch": 1.2239902080783354, + "grad_norm": 0.24902845919132233, + "learning_rate": 1e-05, + "loss": 0.3767, + "num_tokens": 40766442.0, + "step": 1000 + }, + { + "epoch": 1.226438188494492, + "grad_norm": 0.24813127517700195, + "learning_rate": 1e-05, + "loss": 0.3741, + "num_tokens": 41349662.0, + "step": 1002 + }, + { + "epoch": 1.2288861689106487, + "grad_norm": 0.25595715641975403, + "learning_rate": 1e-05, + "loss": 0.3772, + "num_tokens": 41921723.0, + "step": 1004 + }, + { + "epoch": 1.2313341493268053, + "grad_norm": 0.2417302131652832, + "learning_rate": 1e-05, + "loss": 0.3814, + "num_tokens": 42502360.0, + "step": 1006 + }, + { + "epoch": 1.233782129742962, + "grad_norm": 0.24199765920639038, + "learning_rate": 1e-05, + "loss": 0.3849, + "num_tokens": 43104367.0, + "step": 1008 + }, + { + "epoch": 1.2362301101591187, + "grad_norm": 0.2543700635433197, + "learning_rate": 1e-05, + "loss": 0.3792, + "num_tokens": 43685299.0, + "step": 1010 + }, + { + "epoch": 1.2386780905752754, + "grad_norm": 0.23722825944423676, + "learning_rate": 1e-05, + "loss": 0.3746, + "num_tokens": 44282353.0, + "step": 1012 + }, + { + "epoch": 1.241126070991432, + "grad_norm": 0.2463102787733078, + "learning_rate": 1e-05, + "loss": 0.3758, + "num_tokens": 44876385.0, + "step": 1014 + }, + { + "epoch": 1.2435740514075888, + "grad_norm": 0.25006935000419617, + "learning_rate": 1e-05, + "loss": 0.3739, + "num_tokens": 45464003.0, + "step": 1016 + }, + { + "epoch": 1.2460220318237454, + "grad_norm": 0.24640695750713348, + "learning_rate": 1e-05, + "loss": 0.3807, + "num_tokens": 46050560.0, + "step": 1018 + }, + { + "epoch": 1.2484700122399022, + "grad_norm": 0.2511467933654785, + "learning_rate": 1e-05, + "loss": 0.3999, + "num_tokens": 46640792.0, + "step": 1020 + }, + { + "epoch": 1.2509179926560587, + "grad_norm": 0.2527099549770355, + "learning_rate": 1e-05, + "loss": 0.3894, + "num_tokens": 47241084.0, + "step": 1022 + }, + { + "epoch": 1.2533659730722153, + "grad_norm": 0.24509671330451965, + "learning_rate": 1e-05, + "loss": 0.3703, + "num_tokens": 47813100.0, + "step": 1024 + }, + { + "epoch": 1.255813953488372, + "grad_norm": 0.23858234286308289, + "learning_rate": 1e-05, + "loss": 0.379, + "num_tokens": 48415520.0, + "step": 1026 + }, + { + "epoch": 1.258261933904529, + "grad_norm": 0.2405681014060974, + "learning_rate": 1e-05, + "loss": 0.3771, + "num_tokens": 49009965.0, + "step": 1028 + }, + { + "epoch": 1.2607099143206855, + "grad_norm": 0.37645822763442993, + "learning_rate": 1e-05, + "loss": 0.3809, + "num_tokens": 49627546.0, + "step": 1030 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.24870316684246063, + "learning_rate": 1e-05, + "loss": 0.377, + "num_tokens": 50236156.0, + "step": 1032 + }, + { + "epoch": 1.2656058751529988, + "grad_norm": 0.2347888946533203, + "learning_rate": 1e-05, + "loss": 0.3831, + "num_tokens": 50826554.0, + "step": 1034 + }, + { + "epoch": 1.2680538555691554, + "grad_norm": 0.24506457149982452, + "learning_rate": 1e-05, + "loss": 0.3774, + "num_tokens": 51428333.0, + "step": 1036 + }, + { + "epoch": 1.2705018359853122, + "grad_norm": 0.2655375599861145, + "learning_rate": 1e-05, + "loss": 0.3837, + "num_tokens": 52042985.0, + "step": 1038 + }, + { + "epoch": 1.2729498164014688, + "grad_norm": 0.24918022751808167, + "learning_rate": 1e-05, + "loss": 0.3776, + "num_tokens": 52631339.0, + "step": 1040 + }, + { + "epoch": 1.2753977968176256, + "grad_norm": 0.2505210041999817, + "learning_rate": 1e-05, + "loss": 0.3798, + "num_tokens": 53217729.0, + "step": 1042 + }, + { + "epoch": 1.2778457772337821, + "grad_norm": 0.23858347535133362, + "learning_rate": 1e-05, + "loss": 0.3601, + "num_tokens": 53801689.0, + "step": 1044 + }, + { + "epoch": 1.2802937576499387, + "grad_norm": 0.25308915972709656, + "learning_rate": 1e-05, + "loss": 0.3807, + "num_tokens": 54404896.0, + "step": 1046 + }, + { + "epoch": 1.2827417380660955, + "grad_norm": 0.23880726099014282, + "learning_rate": 1e-05, + "loss": 0.381, + "num_tokens": 54994715.0, + "step": 1048 + }, + { + "epoch": 1.2851897184822523, + "grad_norm": 0.2413705289363861, + "learning_rate": 1e-05, + "loss": 0.3763, + "num_tokens": 55583693.0, + "step": 1050 + }, + { + "epoch": 1.2876376988984088, + "grad_norm": 0.24790863692760468, + "learning_rate": 1e-05, + "loss": 0.3759, + "num_tokens": 56166986.0, + "step": 1052 + }, + { + "epoch": 1.2900856793145654, + "grad_norm": 0.2571425139904022, + "learning_rate": 1e-05, + "loss": 0.3721, + "num_tokens": 56739683.0, + "step": 1054 + }, + { + "epoch": 1.2925336597307222, + "grad_norm": 0.237641379237175, + "learning_rate": 1e-05, + "loss": 0.3902, + "num_tokens": 57354184.0, + "step": 1056 + }, + { + "epoch": 1.2949816401468788, + "grad_norm": 0.23018361628055573, + "learning_rate": 1e-05, + "loss": 0.3568, + "num_tokens": 57940330.0, + "step": 1058 + }, + { + "epoch": 1.2974296205630356, + "grad_norm": 0.2509154677391052, + "learning_rate": 1e-05, + "loss": 0.362, + "num_tokens": 58521951.0, + "step": 1060 + }, + { + "epoch": 1.2998776009791921, + "grad_norm": 0.255787193775177, + "learning_rate": 1e-05, + "loss": 0.3845, + "num_tokens": 59128476.0, + "step": 1062 + }, + { + "epoch": 1.302325581395349, + "grad_norm": 0.2486552596092224, + "learning_rate": 1e-05, + "loss": 0.3724, + "num_tokens": 59717196.0, + "step": 1064 + }, + { + "epoch": 1.3047735618115055, + "grad_norm": 0.260206937789917, + "learning_rate": 1e-05, + "loss": 0.3548, + "num_tokens": 60280879.0, + "step": 1066 + }, + { + "epoch": 1.307221542227662, + "grad_norm": 0.250387966632843, + "learning_rate": 1e-05, + "loss": 0.3753, + "num_tokens": 60859876.0, + "step": 1068 + }, + { + "epoch": 1.3096695226438189, + "grad_norm": 0.24388471245765686, + "learning_rate": 1e-05, + "loss": 0.3718, + "num_tokens": 61438497.0, + "step": 1070 + }, + { + "epoch": 1.3121175030599757, + "grad_norm": 0.23518991470336914, + "learning_rate": 1e-05, + "loss": 0.3693, + "num_tokens": 62038958.0, + "step": 1072 + }, + { + "epoch": 1.3145654834761322, + "grad_norm": 0.24329505860805511, + "learning_rate": 1e-05, + "loss": 0.3885, + "num_tokens": 62608639.0, + "step": 1074 + }, + { + "epoch": 1.3170134638922888, + "grad_norm": 0.24493998289108276, + "learning_rate": 1e-05, + "loss": 0.375, + "num_tokens": 63182731.0, + "step": 1076 + }, + { + "epoch": 1.3194614443084456, + "grad_norm": 0.23012055456638336, + "learning_rate": 1e-05, + "loss": 0.3774, + "num_tokens": 63784383.0, + "step": 1078 + }, + { + "epoch": 1.3219094247246022, + "grad_norm": 0.230576291680336, + "learning_rate": 1e-05, + "loss": 0.3767, + "num_tokens": 64371986.0, + "step": 1080 + }, + { + "epoch": 1.324357405140759, + "grad_norm": 0.23564326763153076, + "learning_rate": 1e-05, + "loss": 0.3727, + "num_tokens": 64966281.0, + "step": 1082 + }, + { + "epoch": 1.3268053855569155, + "grad_norm": 0.23679161071777344, + "learning_rate": 1e-05, + "loss": 0.3672, + "num_tokens": 65550336.0, + "step": 1084 + }, + { + "epoch": 1.3292533659730723, + "grad_norm": 0.25614285469055176, + "learning_rate": 1e-05, + "loss": 0.3809, + "num_tokens": 66125746.0, + "step": 1086 + }, + { + "epoch": 1.3317013463892289, + "grad_norm": 0.24231094121932983, + "learning_rate": 1e-05, + "loss": 0.3975, + "num_tokens": 66726281.0, + "step": 1088 + }, + { + "epoch": 1.3341493268053854, + "grad_norm": 0.25783106684684753, + "learning_rate": 1e-05, + "loss": 0.3792, + "num_tokens": 67286102.0, + "step": 1090 + }, + { + "epoch": 1.3365973072215422, + "grad_norm": 0.2526884078979492, + "learning_rate": 1e-05, + "loss": 0.3654, + "num_tokens": 67863433.0, + "step": 1092 + }, + { + "epoch": 1.339045287637699, + "grad_norm": 0.2526870369911194, + "learning_rate": 1e-05, + "loss": 0.3721, + "num_tokens": 68449351.0, + "step": 1094 + }, + { + "epoch": 1.3414932680538556, + "grad_norm": 0.24224893748760223, + "learning_rate": 1e-05, + "loss": 0.3756, + "num_tokens": 69068729.0, + "step": 1096 + }, + { + "epoch": 1.3439412484700122, + "grad_norm": 0.24491116404533386, + "learning_rate": 1e-05, + "loss": 0.3874, + "num_tokens": 69656409.0, + "step": 1098 + }, + { + "epoch": 1.346389228886169, + "grad_norm": 0.2297855019569397, + "learning_rate": 1e-05, + "loss": 0.3763, + "num_tokens": 70284891.0, + "step": 1100 + }, + { + "epoch": 1.3488372093023255, + "grad_norm": 0.25858911871910095, + "learning_rate": 1e-05, + "loss": 0.3737, + "num_tokens": 70859349.0, + "step": 1102 + }, + { + "epoch": 1.3512851897184823, + "grad_norm": 0.24717983603477478, + "learning_rate": 1e-05, + "loss": 0.3862, + "num_tokens": 71445796.0, + "step": 1104 + }, + { + "epoch": 1.353733170134639, + "grad_norm": 0.2568610608577728, + "learning_rate": 1e-05, + "loss": 0.379, + "num_tokens": 72031706.0, + "step": 1106 + }, + { + "epoch": 1.3561811505507957, + "grad_norm": 0.24025028944015503, + "learning_rate": 1e-05, + "loss": 0.3727, + "num_tokens": 72618510.0, + "step": 1108 + }, + { + "epoch": 1.3586291309669523, + "grad_norm": 0.24287337064743042, + "learning_rate": 1e-05, + "loss": 0.389, + "num_tokens": 73203297.0, + "step": 1110 + }, + { + "epoch": 1.3610771113831088, + "grad_norm": 0.25672075152397156, + "learning_rate": 1e-05, + "loss": 0.3901, + "num_tokens": 73812815.0, + "step": 1112 + }, + { + "epoch": 1.3635250917992656, + "grad_norm": 0.23002919554710388, + "learning_rate": 1e-05, + "loss": 0.3606, + "num_tokens": 74428959.0, + "step": 1114 + }, + { + "epoch": 1.3659730722154224, + "grad_norm": 0.24714474380016327, + "learning_rate": 1e-05, + "loss": 0.3614, + "num_tokens": 75008016.0, + "step": 1116 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 0.24221962690353394, + "learning_rate": 1e-05, + "loss": 0.3843, + "num_tokens": 75628706.0, + "step": 1118 + }, + { + "epoch": 1.3708690330477356, + "grad_norm": 0.2576131522655487, + "learning_rate": 1e-05, + "loss": 0.3867, + "num_tokens": 76195451.0, + "step": 1120 + }, + { + "epoch": 1.3733170134638923, + "grad_norm": 0.2453685849905014, + "learning_rate": 1e-05, + "loss": 0.3873, + "num_tokens": 76761969.0, + "step": 1122 + }, + { + "epoch": 1.375764993880049, + "grad_norm": 0.24041421711444855, + "learning_rate": 1e-05, + "loss": 0.3701, + "num_tokens": 77344091.0, + "step": 1124 + }, + { + "epoch": 1.3782129742962057, + "grad_norm": 0.25494855642318726, + "learning_rate": 1e-05, + "loss": 0.3708, + "num_tokens": 77933968.0, + "step": 1126 + }, + { + "epoch": 1.3806609547123623, + "grad_norm": 0.24914491176605225, + "learning_rate": 1e-05, + "loss": 0.3676, + "num_tokens": 78506872.0, + "step": 1128 + }, + { + "epoch": 1.383108935128519, + "grad_norm": 0.24509696662425995, + "learning_rate": 1e-05, + "loss": 0.3922, + "num_tokens": 79115121.0, + "step": 1130 + }, + { + "epoch": 1.3855569155446756, + "grad_norm": 0.24067234992980957, + "learning_rate": 1e-05, + "loss": 0.3719, + "num_tokens": 79689670.0, + "step": 1132 + }, + { + "epoch": 1.3880048959608322, + "grad_norm": 0.2415025383234024, + "learning_rate": 1e-05, + "loss": 0.3842, + "num_tokens": 80296309.0, + "step": 1134 + }, + { + "epoch": 1.390452876376989, + "grad_norm": 0.23817619681358337, + "learning_rate": 1e-05, + "loss": 0.3763, + "num_tokens": 80893156.0, + "step": 1136 + }, + { + "epoch": 1.3929008567931458, + "grad_norm": 0.24730853736400604, + "learning_rate": 1e-05, + "loss": 0.3825, + "num_tokens": 81501032.0, + "step": 1138 + }, + { + "epoch": 1.3953488372093024, + "grad_norm": 0.2538658082485199, + "learning_rate": 1e-05, + "loss": 0.3728, + "num_tokens": 82063275.0, + "step": 1140 + }, + { + "epoch": 1.397796817625459, + "grad_norm": 0.2547858655452728, + "learning_rate": 1e-05, + "loss": 0.3658, + "num_tokens": 82647178.0, + "step": 1142 + }, + { + "epoch": 1.4002447980416157, + "grad_norm": 0.24213729798793793, + "learning_rate": 1e-05, + "loss": 0.3709, + "num_tokens": 83253938.0, + "step": 1144 + }, + { + "epoch": 1.4026927784577723, + "grad_norm": 0.24049073457717896, + "learning_rate": 1e-05, + "loss": 0.3727, + "num_tokens": 83843402.0, + "step": 1146 + }, + { + "epoch": 1.405140758873929, + "grad_norm": 0.2472056746482849, + "learning_rate": 1e-05, + "loss": 0.3714, + "num_tokens": 84445702.0, + "step": 1148 + }, + { + "epoch": 1.4075887392900857, + "grad_norm": 0.23540234565734863, + "learning_rate": 1e-05, + "loss": 0.3841, + "num_tokens": 85061854.0, + "step": 1150 + }, + { + "epoch": 1.4100367197062424, + "grad_norm": 0.24163338541984558, + "learning_rate": 1e-05, + "loss": 0.3707, + "num_tokens": 85656497.0, + "step": 1152 + }, + { + "epoch": 1.412484700122399, + "grad_norm": 0.2464989274740219, + "learning_rate": 1e-05, + "loss": 0.3739, + "num_tokens": 86251427.0, + "step": 1154 + }, + { + "epoch": 1.4149326805385556, + "grad_norm": 0.2482248991727829, + "learning_rate": 1e-05, + "loss": 0.3705, + "num_tokens": 86832067.0, + "step": 1156 + }, + { + "epoch": 1.4173806609547124, + "grad_norm": 0.24530856311321259, + "learning_rate": 1e-05, + "loss": 0.3753, + "num_tokens": 87430733.0, + "step": 1158 + }, + { + "epoch": 1.4198286413708692, + "grad_norm": 0.2416103333234787, + "learning_rate": 1e-05, + "loss": 0.3788, + "num_tokens": 88052112.0, + "step": 1160 + }, + { + "epoch": 1.4222766217870257, + "grad_norm": 0.2397005707025528, + "learning_rate": 1e-05, + "loss": 0.3734, + "num_tokens": 88650677.0, + "step": 1162 + }, + { + "epoch": 1.4247246022031823, + "grad_norm": 0.2345299869775772, + "learning_rate": 1e-05, + "loss": 0.3721, + "num_tokens": 89245448.0, + "step": 1164 + }, + { + "epoch": 1.427172582619339, + "grad_norm": 0.23692870140075684, + "learning_rate": 1e-05, + "loss": 0.3663, + "num_tokens": 89834942.0, + "step": 1166 + }, + { + "epoch": 1.4296205630354957, + "grad_norm": 0.25026100873947144, + "learning_rate": 1e-05, + "loss": 0.3917, + "num_tokens": 90433936.0, + "step": 1168 + }, + { + "epoch": 1.4320685434516525, + "grad_norm": 0.24824434518814087, + "learning_rate": 1e-05, + "loss": 0.3806, + "num_tokens": 91061342.0, + "step": 1170 + }, + { + "epoch": 1.434516523867809, + "grad_norm": 0.24724173545837402, + "learning_rate": 1e-05, + "loss": 0.3605, + "num_tokens": 91652152.0, + "step": 1172 + }, + { + "epoch": 1.4369645042839658, + "grad_norm": 0.26517099142074585, + "learning_rate": 1e-05, + "loss": 0.3898, + "num_tokens": 92245127.0, + "step": 1174 + }, + { + "epoch": 1.4394124847001224, + "grad_norm": 0.26708370447158813, + "learning_rate": 1e-05, + "loss": 0.3797, + "num_tokens": 92838369.0, + "step": 1176 + }, + { + "epoch": 1.441860465116279, + "grad_norm": 0.2398887723684311, + "learning_rate": 1e-05, + "loss": 0.3794, + "num_tokens": 93420778.0, + "step": 1178 + }, + { + "epoch": 1.4443084455324358, + "grad_norm": 0.267572820186615, + "learning_rate": 1e-05, + "loss": 0.3766, + "num_tokens": 93997138.0, + "step": 1180 + }, + { + "epoch": 1.4467564259485923, + "grad_norm": 0.24645371735095978, + "learning_rate": 1e-05, + "loss": 0.3638, + "num_tokens": 94586885.0, + "step": 1182 + }, + { + "epoch": 1.4492044063647491, + "grad_norm": 0.2596088647842407, + "learning_rate": 1e-05, + "loss": 0.3739, + "num_tokens": 95158567.0, + "step": 1184 + }, + { + "epoch": 1.4516523867809057, + "grad_norm": 0.24318157136440277, + "learning_rate": 1e-05, + "loss": 0.3708, + "num_tokens": 95755615.0, + "step": 1186 + }, + { + "epoch": 1.4541003671970625, + "grad_norm": 0.25732532143592834, + "learning_rate": 1e-05, + "loss": 0.3733, + "num_tokens": 96326941.0, + "step": 1188 + }, + { + "epoch": 1.456548347613219, + "grad_norm": 0.24617703258991241, + "learning_rate": 1e-05, + "loss": 0.3749, + "num_tokens": 96934565.0, + "step": 1190 + }, + { + "epoch": 1.4589963280293758, + "grad_norm": 0.24249835312366486, + "learning_rate": 1e-05, + "loss": 0.3709, + "num_tokens": 97544628.0, + "step": 1192 + }, + { + "epoch": 1.4614443084455324, + "grad_norm": 0.25143083930015564, + "learning_rate": 1e-05, + "loss": 0.3716, + "num_tokens": 98129011.0, + "step": 1194 + }, + { + "epoch": 1.4638922888616892, + "grad_norm": 0.2425592988729477, + "learning_rate": 1e-05, + "loss": 0.3823, + "num_tokens": 98750360.0, + "step": 1196 + }, + { + "epoch": 1.4663402692778458, + "grad_norm": 0.24293585121631622, + "learning_rate": 1e-05, + "loss": 0.3621, + "num_tokens": 99341765.0, + "step": 1198 + }, + { + "epoch": 1.4687882496940023, + "grad_norm": 0.23716874420642853, + "learning_rate": 1e-05, + "loss": 0.381, + "num_tokens": 99932679.0, + "step": 1200 + }, + { + "epoch": 1.4712362301101591, + "grad_norm": 0.22967226803302765, + "learning_rate": 1e-05, + "loss": 0.3641, + "num_tokens": 100527728.0, + "step": 1202 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.24329859018325806, + "learning_rate": 1e-05, + "loss": 0.3757, + "num_tokens": 101140712.0, + "step": 1204 + }, + { + "epoch": 1.4761321909424725, + "grad_norm": 0.2402748316526413, + "learning_rate": 1e-05, + "loss": 0.3706, + "num_tokens": 101749103.0, + "step": 1206 + }, + { + "epoch": 1.478580171358629, + "grad_norm": 0.2467920184135437, + "learning_rate": 1e-05, + "loss": 0.3844, + "num_tokens": 102349686.0, + "step": 1208 + }, + { + "epoch": 1.4810281517747859, + "grad_norm": 0.23824624717235565, + "learning_rate": 1e-05, + "loss": 0.3801, + "num_tokens": 102935794.0, + "step": 1210 + }, + { + "epoch": 1.4834761321909424, + "grad_norm": 0.22843535244464874, + "learning_rate": 1e-05, + "loss": 0.3798, + "num_tokens": 103565230.0, + "step": 1212 + }, + { + "epoch": 1.4859241126070992, + "grad_norm": 0.251880019903183, + "learning_rate": 1e-05, + "loss": 0.3792, + "num_tokens": 104127436.0, + "step": 1214 + }, + { + "epoch": 1.4883720930232558, + "grad_norm": 0.23985151946544647, + "learning_rate": 1e-05, + "loss": 0.3752, + "num_tokens": 104719703.0, + "step": 1216 + }, + { + "epoch": 1.4908200734394126, + "grad_norm": 0.24645771086215973, + "learning_rate": 1e-05, + "loss": 0.3723, + "num_tokens": 105292775.0, + "step": 1218 + }, + { + "epoch": 1.4932680538555692, + "grad_norm": 0.2515164613723755, + "learning_rate": 1e-05, + "loss": 0.3781, + "num_tokens": 105898653.0, + "step": 1220 + }, + { + "epoch": 1.4957160342717257, + "grad_norm": 0.2437610626220703, + "learning_rate": 1e-05, + "loss": 0.3762, + "num_tokens": 106506854.0, + "step": 1222 + }, + { + "epoch": 1.4981640146878825, + "grad_norm": 0.2478041648864746, + "learning_rate": 1e-05, + "loss": 0.3695, + "num_tokens": 107066302.0, + "step": 1224 + }, + { + "epoch": 1.5006119951040393, + "grad_norm": 0.24896089732646942, + "learning_rate": 1e-05, + "loss": 0.3687, + "num_tokens": 107646336.0, + "step": 1226 + }, + { + "epoch": 1.5030599755201959, + "grad_norm": 0.24831615388393402, + "learning_rate": 1e-05, + "loss": 0.3864, + "num_tokens": 108256194.0, + "step": 1228 + }, + { + "epoch": 1.5055079559363524, + "grad_norm": 0.2522716522216797, + "learning_rate": 1e-05, + "loss": 0.3687, + "num_tokens": 108833203.0, + "step": 1230 + }, + { + "epoch": 1.5079559363525092, + "grad_norm": 0.2509858310222626, + "learning_rate": 1e-05, + "loss": 0.3786, + "num_tokens": 109423517.0, + "step": 1232 + }, + { + "epoch": 1.5104039167686658, + "grad_norm": 0.236283540725708, + "learning_rate": 1e-05, + "loss": 0.3572, + "num_tokens": 110000996.0, + "step": 1234 + }, + { + "epoch": 1.5128518971848224, + "grad_norm": 0.24623404443264008, + "learning_rate": 1e-05, + "loss": 0.3674, + "num_tokens": 110581988.0, + "step": 1236 + }, + { + "epoch": 1.5152998776009792, + "grad_norm": 0.24854424595832825, + "learning_rate": 1e-05, + "loss": 0.3744, + "num_tokens": 111174046.0, + "step": 1238 + }, + { + "epoch": 1.517747858017136, + "grad_norm": 0.2561953365802765, + "learning_rate": 1e-05, + "loss": 0.3777, + "num_tokens": 111790906.0, + "step": 1240 + }, + { + "epoch": 1.5201958384332925, + "grad_norm": 0.2569839358329773, + "learning_rate": 1e-05, + "loss": 0.3751, + "num_tokens": 112378426.0, + "step": 1242 + }, + { + "epoch": 1.522643818849449, + "grad_norm": 0.24260175228118896, + "learning_rate": 1e-05, + "loss": 0.3755, + "num_tokens": 112972211.0, + "step": 1244 + }, + { + "epoch": 1.525091799265606, + "grad_norm": 0.2720998227596283, + "learning_rate": 1e-05, + "loss": 0.3644, + "num_tokens": 113560888.0, + "step": 1246 + }, + { + "epoch": 1.5275397796817627, + "grad_norm": 0.24294328689575195, + "learning_rate": 1e-05, + "loss": 0.3652, + "num_tokens": 114131545.0, + "step": 1248 + }, + { + "epoch": 1.5299877600979193, + "grad_norm": 0.24424785375595093, + "learning_rate": 1e-05, + "loss": 0.3853, + "num_tokens": 114724758.0, + "step": 1250 + }, + { + "epoch": 1.5324357405140758, + "grad_norm": 0.2621991038322449, + "learning_rate": 1e-05, + "loss": 0.3671, + "num_tokens": 115299206.0, + "step": 1252 + }, + { + "epoch": 1.5348837209302326, + "grad_norm": 0.2541300356388092, + "learning_rate": 1e-05, + "loss": 0.3799, + "num_tokens": 115885530.0, + "step": 1254 + }, + { + "epoch": 1.5373317013463892, + "grad_norm": 0.26225703954696655, + "learning_rate": 1e-05, + "loss": 0.3613, + "num_tokens": 116448812.0, + "step": 1256 + }, + { + "epoch": 1.5397796817625458, + "grad_norm": 0.23766973614692688, + "learning_rate": 1e-05, + "loss": 0.3748, + "num_tokens": 117055396.0, + "step": 1258 + }, + { + "epoch": 1.5422276621787026, + "grad_norm": 0.24437591433525085, + "learning_rate": 1e-05, + "loss": 0.3619, + "num_tokens": 117646869.0, + "step": 1260 + }, + { + "epoch": 1.5446756425948593, + "grad_norm": 0.2565830647945404, + "learning_rate": 1e-05, + "loss": 0.3723, + "num_tokens": 118208889.0, + "step": 1262 + }, + { + "epoch": 1.547123623011016, + "grad_norm": 0.24650581181049347, + "learning_rate": 1e-05, + "loss": 0.3749, + "num_tokens": 118791050.0, + "step": 1264 + }, + { + "epoch": 1.5495716034271725, + "grad_norm": 0.24814392626285553, + "learning_rate": 1e-05, + "loss": 0.3684, + "num_tokens": 119365749.0, + "step": 1266 + }, + { + "epoch": 1.5520195838433293, + "grad_norm": 0.24582888185977936, + "learning_rate": 1e-05, + "loss": 0.3555, + "num_tokens": 119950880.0, + "step": 1268 + }, + { + "epoch": 1.554467564259486, + "grad_norm": 0.24036677181720734, + "learning_rate": 1e-05, + "loss": 0.3766, + "num_tokens": 120565071.0, + "step": 1270 + }, + { + "epoch": 1.5569155446756426, + "grad_norm": 0.23649543523788452, + "learning_rate": 1e-05, + "loss": 0.3727, + "num_tokens": 121183680.0, + "step": 1272 + }, + { + "epoch": 1.5593635250917992, + "grad_norm": 0.23108918964862823, + "learning_rate": 1e-05, + "loss": 0.3665, + "num_tokens": 121800058.0, + "step": 1274 + }, + { + "epoch": 1.561811505507956, + "grad_norm": 0.25551703572273254, + "learning_rate": 1e-05, + "loss": 0.373, + "num_tokens": 122392911.0, + "step": 1276 + }, + { + "epoch": 1.5642594859241126, + "grad_norm": 0.2403259128332138, + "learning_rate": 1e-05, + "loss": 0.371, + "num_tokens": 122994876.0, + "step": 1278 + }, + { + "epoch": 1.5667074663402691, + "grad_norm": 0.23855362832546234, + "learning_rate": 1e-05, + "loss": 0.3758, + "num_tokens": 123607985.0, + "step": 1280 + }, + { + "epoch": 1.569155446756426, + "grad_norm": 0.24700097739696503, + "learning_rate": 1e-05, + "loss": 0.3805, + "num_tokens": 124174095.0, + "step": 1282 + }, + { + "epoch": 1.5716034271725827, + "grad_norm": 0.2535829246044159, + "learning_rate": 1e-05, + "loss": 0.3706, + "num_tokens": 124758917.0, + "step": 1284 + }, + { + "epoch": 1.5740514075887393, + "grad_norm": 0.24376705288887024, + "learning_rate": 1e-05, + "loss": 0.371, + "num_tokens": 125354644.0, + "step": 1286 + }, + { + "epoch": 1.5764993880048959, + "grad_norm": 0.24517321586608887, + "learning_rate": 1e-05, + "loss": 0.3811, + "num_tokens": 125955891.0, + "step": 1288 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.24353429675102234, + "learning_rate": 1e-05, + "loss": 0.3672, + "num_tokens": 126576567.0, + "step": 1290 + }, + { + "epoch": 1.5813953488372094, + "grad_norm": 0.24050772190093994, + "learning_rate": 1e-05, + "loss": 0.3746, + "num_tokens": 127142553.0, + "step": 1292 + }, + { + "epoch": 1.583843329253366, + "grad_norm": 0.24689637124538422, + "learning_rate": 1e-05, + "loss": 0.3817, + "num_tokens": 127758123.0, + "step": 1294 + }, + { + "epoch": 1.5862913096695226, + "grad_norm": 0.24711348116397858, + "learning_rate": 1e-05, + "loss": 0.3575, + "num_tokens": 128370312.0, + "step": 1296 + }, + { + "epoch": 1.5887392900856794, + "grad_norm": 0.23984448611736298, + "learning_rate": 1e-05, + "loss": 0.3677, + "num_tokens": 128972638.0, + "step": 1298 + }, + { + "epoch": 1.591187270501836, + "grad_norm": 0.2484259158372879, + "learning_rate": 1e-05, + "loss": 0.3755, + "num_tokens": 129572913.0, + "step": 1300 + }, + { + "epoch": 1.5936352509179925, + "grad_norm": 0.2364276647567749, + "learning_rate": 1e-05, + "loss": 0.3747, + "num_tokens": 130195765.0, + "step": 1302 + }, + { + "epoch": 1.5960832313341493, + "grad_norm": 0.24387337267398834, + "learning_rate": 1e-05, + "loss": 0.3745, + "num_tokens": 130789536.0, + "step": 1304 + }, + { + "epoch": 1.598531211750306, + "grad_norm": 0.23871919512748718, + "learning_rate": 1e-05, + "loss": 0.3714, + "num_tokens": 131403745.0, + "step": 1306 + }, + { + "epoch": 1.6009791921664627, + "grad_norm": 0.25457867980003357, + "learning_rate": 1e-05, + "loss": 0.3686, + "num_tokens": 131969389.0, + "step": 1308 + }, + { + "epoch": 1.6034271725826192, + "grad_norm": 0.2390459179878235, + "learning_rate": 1e-05, + "loss": 0.3736, + "num_tokens": 132571128.0, + "step": 1310 + }, + { + "epoch": 1.605875152998776, + "grad_norm": 0.24587389826774597, + "learning_rate": 1e-05, + "loss": 0.3733, + "num_tokens": 133160154.0, + "step": 1312 + }, + { + "epoch": 1.6083231334149328, + "grad_norm": 0.2370455414056778, + "learning_rate": 1e-05, + "loss": 0.3665, + "num_tokens": 133735784.0, + "step": 1314 + }, + { + "epoch": 1.6107711138310894, + "grad_norm": 0.24686893820762634, + "learning_rate": 1e-05, + "loss": 0.3815, + "num_tokens": 134327883.0, + "step": 1316 + }, + { + "epoch": 1.613219094247246, + "grad_norm": 0.24591784179210663, + "learning_rate": 1e-05, + "loss": 0.3794, + "num_tokens": 134922195.0, + "step": 1318 + }, + { + "epoch": 1.6156670746634028, + "grad_norm": 0.24312755465507507, + "learning_rate": 1e-05, + "loss": 0.373, + "num_tokens": 135497704.0, + "step": 1320 + }, + { + "epoch": 1.6181150550795593, + "grad_norm": 0.24310767650604248, + "learning_rate": 1e-05, + "loss": 0.3702, + "num_tokens": 136090481.0, + "step": 1322 + }, + { + "epoch": 1.620563035495716, + "grad_norm": 0.2369573563337326, + "learning_rate": 1e-05, + "loss": 0.3681, + "num_tokens": 136691129.0, + "step": 1324 + }, + { + "epoch": 1.6230110159118727, + "grad_norm": 0.2429567277431488, + "learning_rate": 1e-05, + "loss": 0.3672, + "num_tokens": 137299855.0, + "step": 1326 + }, + { + "epoch": 1.6254589963280295, + "grad_norm": 0.271967351436615, + "learning_rate": 1e-05, + "loss": 0.362, + "num_tokens": 137882026.0, + "step": 1328 + }, + { + "epoch": 1.627906976744186, + "grad_norm": 0.2503095269203186, + "learning_rate": 1e-05, + "loss": 0.371, + "num_tokens": 138449440.0, + "step": 1330 + }, + { + "epoch": 1.6303549571603426, + "grad_norm": 0.23528578877449036, + "learning_rate": 1e-05, + "loss": 0.3693, + "num_tokens": 139040161.0, + "step": 1332 + }, + { + "epoch": 1.6328029375764994, + "grad_norm": 0.2523113489151001, + "learning_rate": 1e-05, + "loss": 0.3852, + "num_tokens": 139625197.0, + "step": 1334 + }, + { + "epoch": 1.6352509179926562, + "grad_norm": 0.23514311015605927, + "learning_rate": 1e-05, + "loss": 0.3643, + "num_tokens": 140210445.0, + "step": 1336 + }, + { + "epoch": 1.6376988984088128, + "grad_norm": 0.2515263557434082, + "learning_rate": 1e-05, + "loss": 0.3786, + "num_tokens": 140794059.0, + "step": 1338 + }, + { + "epoch": 1.6401468788249693, + "grad_norm": 0.23876678943634033, + "learning_rate": 1e-05, + "loss": 0.3682, + "num_tokens": 141373540.0, + "step": 1340 + }, + { + "epoch": 1.6425948592411261, + "grad_norm": 0.24048742651939392, + "learning_rate": 1e-05, + "loss": 0.3635, + "num_tokens": 141952741.0, + "step": 1342 + }, + { + "epoch": 1.6450428396572827, + "grad_norm": 0.24254868924617767, + "learning_rate": 1e-05, + "loss": 0.3712, + "num_tokens": 142536772.0, + "step": 1344 + }, + { + "epoch": 1.6474908200734393, + "grad_norm": 0.2360367327928543, + "learning_rate": 1e-05, + "loss": 0.3584, + "num_tokens": 143109958.0, + "step": 1346 + }, + { + "epoch": 1.649938800489596, + "grad_norm": 0.23876917362213135, + "learning_rate": 1e-05, + "loss": 0.372, + "num_tokens": 143683599.0, + "step": 1348 + }, + { + "epoch": 1.6523867809057529, + "grad_norm": 0.24206481873989105, + "learning_rate": 1e-05, + "loss": 0.366, + "num_tokens": 144275321.0, + "step": 1350 + }, + { + "epoch": 1.6548347613219094, + "grad_norm": 0.2546513080596924, + "learning_rate": 1e-05, + "loss": 0.3589, + "num_tokens": 144841340.0, + "step": 1352 + }, + { + "epoch": 1.657282741738066, + "grad_norm": 0.25866273045539856, + "learning_rate": 1e-05, + "loss": 0.3581, + "num_tokens": 145428192.0, + "step": 1354 + }, + { + "epoch": 1.6597307221542228, + "grad_norm": 0.244053915143013, + "learning_rate": 1e-05, + "loss": 0.3881, + "num_tokens": 146015954.0, + "step": 1356 + }, + { + "epoch": 1.6621787025703796, + "grad_norm": 0.24577046930789948, + "learning_rate": 1e-05, + "loss": 0.3563, + "num_tokens": 146597823.0, + "step": 1358 + }, + { + "epoch": 1.6646266829865362, + "grad_norm": 0.23734845221042633, + "learning_rate": 1e-05, + "loss": 0.3627, + "num_tokens": 147191387.0, + "step": 1360 + }, + { + "epoch": 1.6670746634026927, + "grad_norm": 0.2565036416053772, + "learning_rate": 1e-05, + "loss": 0.3883, + "num_tokens": 147783308.0, + "step": 1362 + }, + { + "epoch": 1.6695226438188495, + "grad_norm": 0.2334887683391571, + "learning_rate": 1e-05, + "loss": 0.3743, + "num_tokens": 148375364.0, + "step": 1364 + }, + { + "epoch": 1.671970624235006, + "grad_norm": 0.24507057666778564, + "learning_rate": 1e-05, + "loss": 0.3708, + "num_tokens": 148967447.0, + "step": 1366 + }, + { + "epoch": 1.6744186046511627, + "grad_norm": 0.2379869818687439, + "learning_rate": 1e-05, + "loss": 0.3764, + "num_tokens": 149559323.0, + "step": 1368 + }, + { + "epoch": 1.6768665850673194, + "grad_norm": 0.2732064127922058, + "learning_rate": 1e-05, + "loss": 0.3711, + "num_tokens": 150158225.0, + "step": 1370 + }, + { + "epoch": 1.6793145654834762, + "grad_norm": 0.23710450530052185, + "learning_rate": 1e-05, + "loss": 0.3713, + "num_tokens": 150736029.0, + "step": 1372 + }, + { + "epoch": 1.6817625458996328, + "grad_norm": 0.24757219851016998, + "learning_rate": 1e-05, + "loss": 0.3787, + "num_tokens": 151320035.0, + "step": 1374 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.24102835357189178, + "learning_rate": 1e-05, + "loss": 0.3784, + "num_tokens": 151877797.0, + "step": 1376 + }, + { + "epoch": 1.6866585067319462, + "grad_norm": 0.2291739284992218, + "learning_rate": 1e-05, + "loss": 0.3744, + "num_tokens": 152494821.0, + "step": 1378 + }, + { + "epoch": 1.689106487148103, + "grad_norm": 0.2373509258031845, + "learning_rate": 1e-05, + "loss": 0.3699, + "num_tokens": 153103512.0, + "step": 1380 + }, + { + "epoch": 1.6915544675642595, + "grad_norm": 0.23789846897125244, + "learning_rate": 1e-05, + "loss": 0.3752, + "num_tokens": 153698813.0, + "step": 1382 + }, + { + "epoch": 1.694002447980416, + "grad_norm": 0.2423292100429535, + "learning_rate": 1e-05, + "loss": 0.3645, + "num_tokens": 154272685.0, + "step": 1384 + }, + { + "epoch": 1.696450428396573, + "grad_norm": 0.23416705429553986, + "learning_rate": 1e-05, + "loss": 0.3538, + "num_tokens": 154812070.0, + "step": 1386 + }, + { + "epoch": 1.6988984088127295, + "grad_norm": 0.25672298669815063, + "learning_rate": 1e-05, + "loss": 0.3666, + "num_tokens": 155392151.0, + "step": 1388 + }, + { + "epoch": 1.701346389228886, + "grad_norm": 0.23592020571231842, + "learning_rate": 1e-05, + "loss": 0.372, + "num_tokens": 156034757.0, + "step": 1390 + }, + { + "epoch": 1.7037943696450428, + "grad_norm": 0.2403942346572876, + "learning_rate": 1e-05, + "loss": 0.3644, + "num_tokens": 156628474.0, + "step": 1392 + }, + { + "epoch": 1.7062423500611996, + "grad_norm": 0.2440410554409027, + "learning_rate": 1e-05, + "loss": 0.3675, + "num_tokens": 157204254.0, + "step": 1394 + }, + { + "epoch": 1.7086903304773562, + "grad_norm": 0.2393968254327774, + "learning_rate": 1e-05, + "loss": 0.3774, + "num_tokens": 157793450.0, + "step": 1396 + }, + { + "epoch": 1.7111383108935128, + "grad_norm": 0.24635456502437592, + "learning_rate": 1e-05, + "loss": 0.3611, + "num_tokens": 158389128.0, + "step": 1398 + }, + { + "epoch": 1.7135862913096696, + "grad_norm": 0.2437032163143158, + "learning_rate": 1e-05, + "loss": 0.3726, + "num_tokens": 158958965.0, + "step": 1400 + }, + { + "epoch": 1.7160342717258263, + "grad_norm": 0.24211743474006653, + "learning_rate": 1e-05, + "loss": 0.363, + "num_tokens": 159568000.0, + "step": 1402 + }, + { + "epoch": 1.718482252141983, + "grad_norm": 0.23698055744171143, + "learning_rate": 1e-05, + "loss": 0.3695, + "num_tokens": 160156477.0, + "step": 1404 + }, + { + "epoch": 1.7209302325581395, + "grad_norm": 0.24990005791187286, + "learning_rate": 1e-05, + "loss": 0.3733, + "num_tokens": 160754592.0, + "step": 1406 + }, + { + "epoch": 1.7233782129742963, + "grad_norm": 0.2389305830001831, + "learning_rate": 1e-05, + "loss": 0.3503, + "num_tokens": 161339326.0, + "step": 1408 + }, + { + "epoch": 1.7258261933904528, + "grad_norm": 0.23870104551315308, + "learning_rate": 1e-05, + "loss": 0.3547, + "num_tokens": 161958750.0, + "step": 1410 + }, + { + "epoch": 1.7282741738066094, + "grad_norm": 0.24630923569202423, + "learning_rate": 1e-05, + "loss": 0.3695, + "num_tokens": 162547514.0, + "step": 1412 + }, + { + "epoch": 1.7307221542227662, + "grad_norm": 0.24691331386566162, + "learning_rate": 1e-05, + "loss": 0.3755, + "num_tokens": 163145389.0, + "step": 1414 + }, + { + "epoch": 1.733170134638923, + "grad_norm": 0.2594568431377411, + "learning_rate": 1e-05, + "loss": 0.3538, + "num_tokens": 163745928.0, + "step": 1416 + }, + { + "epoch": 1.7356181150550796, + "grad_norm": 0.24012570083141327, + "learning_rate": 1e-05, + "loss": 0.3791, + "num_tokens": 164340009.0, + "step": 1418 + }, + { + "epoch": 1.7380660954712361, + "grad_norm": 0.2581995725631714, + "learning_rate": 1e-05, + "loss": 0.3626, + "num_tokens": 164941110.0, + "step": 1420 + }, + { + "epoch": 1.740514075887393, + "grad_norm": 0.25723937153816223, + "learning_rate": 1e-05, + "loss": 0.3748, + "num_tokens": 165514409.0, + "step": 1422 + }, + { + "epoch": 1.7429620563035497, + "grad_norm": 0.2481413334608078, + "learning_rate": 1e-05, + "loss": 0.3854, + "num_tokens": 166119773.0, + "step": 1424 + }, + { + "epoch": 1.7454100367197063, + "grad_norm": 0.22994013130664825, + "learning_rate": 1e-05, + "loss": 0.3858, + "num_tokens": 166747342.0, + "step": 1426 + }, + { + "epoch": 1.7478580171358629, + "grad_norm": 0.23820635676383972, + "learning_rate": 1e-05, + "loss": 0.3678, + "num_tokens": 167339222.0, + "step": 1428 + }, + { + "epoch": 1.7503059975520197, + "grad_norm": 0.24489940702915192, + "learning_rate": 1e-05, + "loss": 0.3644, + "num_tokens": 167907963.0, + "step": 1430 + }, + { + "epoch": 1.7527539779681762, + "grad_norm": 0.24379844963550568, + "learning_rate": 1e-05, + "loss": 0.3646, + "num_tokens": 168490594.0, + "step": 1432 + }, + { + "epoch": 1.7552019583843328, + "grad_norm": 0.23336175084114075, + "learning_rate": 1e-05, + "loss": 0.3716, + "num_tokens": 169098140.0, + "step": 1434 + }, + { + "epoch": 1.7576499388004896, + "grad_norm": 0.24837198853492737, + "learning_rate": 1e-05, + "loss": 0.3601, + "num_tokens": 169673058.0, + "step": 1436 + }, + { + "epoch": 1.7600979192166464, + "grad_norm": 0.24629239737987518, + "learning_rate": 1e-05, + "loss": 0.3711, + "num_tokens": 170258149.0, + "step": 1438 + }, + { + "epoch": 1.762545899632803, + "grad_norm": 0.2353641837835312, + "learning_rate": 1e-05, + "loss": 0.3727, + "num_tokens": 170855087.0, + "step": 1440 + }, + { + "epoch": 1.7649938800489595, + "grad_norm": 0.25406232476234436, + "learning_rate": 1e-05, + "loss": 0.3829, + "num_tokens": 171460109.0, + "step": 1442 + }, + { + "epoch": 1.7674418604651163, + "grad_norm": 0.24506038427352905, + "learning_rate": 1e-05, + "loss": 0.3675, + "num_tokens": 172064062.0, + "step": 1444 + }, + { + "epoch": 1.769889840881273, + "grad_norm": 0.24228806793689728, + "learning_rate": 1e-05, + "loss": 0.3794, + "num_tokens": 172650152.0, + "step": 1446 + }, + { + "epoch": 1.7723378212974297, + "grad_norm": 0.2572336494922638, + "learning_rate": 1e-05, + "loss": 0.3768, + "num_tokens": 173241018.0, + "step": 1448 + }, + { + "epoch": 1.7747858017135862, + "grad_norm": 0.23749330639839172, + "learning_rate": 1e-05, + "loss": 0.3773, + "num_tokens": 173857068.0, + "step": 1450 + }, + { + "epoch": 1.777233782129743, + "grad_norm": 0.24634265899658203, + "learning_rate": 1e-05, + "loss": 0.3739, + "num_tokens": 174469774.0, + "step": 1452 + }, + { + "epoch": 1.7796817625458996, + "grad_norm": 0.22762316465377808, + "learning_rate": 1e-05, + "loss": 0.3761, + "num_tokens": 175084326.0, + "step": 1454 + }, + { + "epoch": 1.7821297429620562, + "grad_norm": 0.24187412858009338, + "learning_rate": 1e-05, + "loss": 0.3613, + "num_tokens": 175650143.0, + "step": 1456 + }, + { + "epoch": 1.784577723378213, + "grad_norm": 0.24004510045051575, + "learning_rate": 1e-05, + "loss": 0.3675, + "num_tokens": 176248086.0, + "step": 1458 + }, + { + "epoch": 1.7870257037943698, + "grad_norm": 0.24396033585071564, + "learning_rate": 1e-05, + "loss": 0.3627, + "num_tokens": 176851627.0, + "step": 1460 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 0.2826113998889923, + "learning_rate": 1e-05, + "loss": 0.3886, + "num_tokens": 177453579.0, + "step": 1462 + }, + { + "epoch": 1.791921664626683, + "grad_norm": 0.250384658575058, + "learning_rate": 1e-05, + "loss": 0.3781, + "num_tokens": 178065458.0, + "step": 1464 + }, + { + "epoch": 1.7943696450428397, + "grad_norm": 0.2513806223869324, + "learning_rate": 1e-05, + "loss": 0.3667, + "num_tokens": 178644461.0, + "step": 1466 + }, + { + "epoch": 1.7968176254589965, + "grad_norm": 0.25463229417800903, + "learning_rate": 1e-05, + "loss": 0.3703, + "num_tokens": 179220166.0, + "step": 1468 + }, + { + "epoch": 1.799265605875153, + "grad_norm": 0.2267390787601471, + "learning_rate": 1e-05, + "loss": 0.3628, + "num_tokens": 179824923.0, + "step": 1470 + }, + { + "epoch": 1.8017135862913096, + "grad_norm": 0.2407565414905548, + "learning_rate": 1e-05, + "loss": 0.3667, + "num_tokens": 180430779.0, + "step": 1472 + }, + { + "epoch": 1.8041615667074664, + "grad_norm": 0.2538115978240967, + "learning_rate": 1e-05, + "loss": 0.3713, + "num_tokens": 181015860.0, + "step": 1474 + }, + { + "epoch": 1.806609547123623, + "grad_norm": 0.2532234489917755, + "learning_rate": 1e-05, + "loss": 0.3662, + "num_tokens": 181597275.0, + "step": 1476 + }, + { + "epoch": 1.8090575275397796, + "grad_norm": 0.24419713020324707, + "learning_rate": 1e-05, + "loss": 0.3673, + "num_tokens": 182201474.0, + "step": 1478 + }, + { + "epoch": 1.8115055079559363, + "grad_norm": 0.2511015236377716, + "learning_rate": 1e-05, + "loss": 0.3765, + "num_tokens": 182770589.0, + "step": 1480 + }, + { + "epoch": 1.8139534883720931, + "grad_norm": 0.27226755023002625, + "learning_rate": 1e-05, + "loss": 0.3628, + "num_tokens": 183366383.0, + "step": 1482 + }, + { + "epoch": 1.8164014687882497, + "grad_norm": 0.2558925449848175, + "learning_rate": 1e-05, + "loss": 0.3724, + "num_tokens": 183965838.0, + "step": 1484 + }, + { + "epoch": 1.8188494492044063, + "grad_norm": 0.2294636219739914, + "learning_rate": 1e-05, + "loss": 0.3692, + "num_tokens": 184558859.0, + "step": 1486 + }, + { + "epoch": 1.821297429620563, + "grad_norm": 0.26806262135505676, + "learning_rate": 1e-05, + "loss": 0.3619, + "num_tokens": 185170502.0, + "step": 1488 + }, + { + "epoch": 1.8237454100367199, + "grad_norm": 0.255285382270813, + "learning_rate": 1e-05, + "loss": 0.3525, + "num_tokens": 185773232.0, + "step": 1490 + }, + { + "epoch": 1.8261933904528764, + "grad_norm": 0.23820726573467255, + "learning_rate": 1e-05, + "loss": 0.367, + "num_tokens": 186363042.0, + "step": 1492 + }, + { + "epoch": 1.828641370869033, + "grad_norm": 0.2516627013683319, + "learning_rate": 1e-05, + "loss": 0.3726, + "num_tokens": 186970258.0, + "step": 1494 + }, + { + "epoch": 1.8310893512851898, + "grad_norm": 0.24903468787670135, + "learning_rate": 1e-05, + "loss": 0.3735, + "num_tokens": 187562607.0, + "step": 1496 + }, + { + "epoch": 1.8335373317013464, + "grad_norm": 0.2585706114768982, + "learning_rate": 1e-05, + "loss": 0.3631, + "num_tokens": 188147440.0, + "step": 1498 + }, + { + "epoch": 1.835985312117503, + "grad_norm": 0.23948891460895538, + "learning_rate": 1e-05, + "loss": 0.3707, + "num_tokens": 188760737.0, + "step": 1500 + }, + { + "epoch": 1.8384332925336597, + "grad_norm": 0.23523059487342834, + "learning_rate": 1e-05, + "loss": 0.3668, + "num_tokens": 189331744.0, + "step": 1502 + }, + { + "epoch": 1.8408812729498165, + "grad_norm": 0.23200415074825287, + "learning_rate": 1e-05, + "loss": 0.3563, + "num_tokens": 189912006.0, + "step": 1504 + }, + { + "epoch": 1.843329253365973, + "grad_norm": 0.24164701998233795, + "learning_rate": 1e-05, + "loss": 0.3678, + "num_tokens": 190507560.0, + "step": 1506 + }, + { + "epoch": 1.8457772337821297, + "grad_norm": 0.24801412224769592, + "learning_rate": 1e-05, + "loss": 0.3637, + "num_tokens": 191119916.0, + "step": 1508 + }, + { + "epoch": 1.8482252141982864, + "grad_norm": 0.2424629032611847, + "learning_rate": 1e-05, + "loss": 0.3846, + "num_tokens": 191696881.0, + "step": 1510 + }, + { + "epoch": 1.8506731946144432, + "grad_norm": 0.2434435933828354, + "learning_rate": 1e-05, + "loss": 0.3485, + "num_tokens": 192250894.0, + "step": 1512 + }, + { + "epoch": 1.8531211750305998, + "grad_norm": 0.22857359051704407, + "learning_rate": 1e-05, + "loss": 0.3679, + "num_tokens": 192868098.0, + "step": 1514 + }, + { + "epoch": 1.8555691554467564, + "grad_norm": 0.2418743371963501, + "learning_rate": 1e-05, + "loss": 0.3452, + "num_tokens": 193432178.0, + "step": 1516 + }, + { + "epoch": 1.8580171358629132, + "grad_norm": 0.25751784443855286, + "learning_rate": 1e-05, + "loss": 0.3564, + "num_tokens": 193997484.0, + "step": 1518 + }, + { + "epoch": 1.8604651162790697, + "grad_norm": 0.23958885669708252, + "learning_rate": 1e-05, + "loss": 0.3717, + "num_tokens": 194552018.0, + "step": 1520 + }, + { + "epoch": 1.8629130966952263, + "grad_norm": 0.2377503365278244, + "learning_rate": 1e-05, + "loss": 0.3622, + "num_tokens": 195142270.0, + "step": 1522 + }, + { + "epoch": 1.865361077111383, + "grad_norm": 0.24225211143493652, + "learning_rate": 1e-05, + "loss": 0.3794, + "num_tokens": 195742052.0, + "step": 1524 + }, + { + "epoch": 1.86780905752754, + "grad_norm": 0.23690791428089142, + "learning_rate": 1e-05, + "loss": 0.3857, + "num_tokens": 196359878.0, + "step": 1526 + }, + { + "epoch": 1.8702570379436965, + "grad_norm": 0.23414862155914307, + "learning_rate": 1e-05, + "loss": 0.3689, + "num_tokens": 196973431.0, + "step": 1528 + }, + { + "epoch": 1.872705018359853, + "grad_norm": 0.23633243143558502, + "learning_rate": 1e-05, + "loss": 0.3593, + "num_tokens": 197557990.0, + "step": 1530 + }, + { + "epoch": 1.8751529987760098, + "grad_norm": 0.24126210808753967, + "learning_rate": 1e-05, + "loss": 0.3838, + "num_tokens": 198162137.0, + "step": 1532 + }, + { + "epoch": 1.8776009791921666, + "grad_norm": 0.24134069681167603, + "learning_rate": 1e-05, + "loss": 0.3589, + "num_tokens": 198764078.0, + "step": 1534 + }, + { + "epoch": 1.880048959608323, + "grad_norm": 0.25478363037109375, + "learning_rate": 1e-05, + "loss": 0.3653, + "num_tokens": 199331133.0, + "step": 1536 + }, + { + "epoch": 1.8824969400244798, + "grad_norm": 0.24682196974754333, + "learning_rate": 1e-05, + "loss": 0.3779, + "num_tokens": 199936917.0, + "step": 1538 + }, + { + "epoch": 1.8849449204406366, + "grad_norm": 0.24788504838943481, + "learning_rate": 1e-05, + "loss": 0.3676, + "num_tokens": 200517595.0, + "step": 1540 + }, + { + "epoch": 1.8873929008567931, + "grad_norm": 0.23016424477100372, + "learning_rate": 1e-05, + "loss": 0.3654, + "num_tokens": 201124880.0, + "step": 1542 + }, + { + "epoch": 1.8898408812729497, + "grad_norm": 0.24088110029697418, + "learning_rate": 1e-05, + "loss": 0.3627, + "num_tokens": 201716008.0, + "step": 1544 + }, + { + "epoch": 1.8922888616891065, + "grad_norm": 0.25025540590286255, + "learning_rate": 1e-05, + "loss": 0.3726, + "num_tokens": 202315279.0, + "step": 1546 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.2488720566034317, + "learning_rate": 1e-05, + "loss": 0.3773, + "num_tokens": 202876589.0, + "step": 1548 + }, + { + "epoch": 1.8971848225214198, + "grad_norm": 0.24178265035152435, + "learning_rate": 1e-05, + "loss": 0.3653, + "num_tokens": 203467310.0, + "step": 1550 + }, + { + "epoch": 1.8996328029375764, + "grad_norm": 0.23375409841537476, + "learning_rate": 1e-05, + "loss": 0.3683, + "num_tokens": 204058668.0, + "step": 1552 + }, + { + "epoch": 1.9020807833537332, + "grad_norm": 0.24636210501194, + "learning_rate": 1e-05, + "loss": 0.3716, + "num_tokens": 204627426.0, + "step": 1554 + }, + { + "epoch": 1.90452876376989, + "grad_norm": 0.22735662758350372, + "learning_rate": 1e-05, + "loss": 0.379, + "num_tokens": 205252881.0, + "step": 1556 + }, + { + "epoch": 1.9069767441860463, + "grad_norm": 0.2275749295949936, + "learning_rate": 1e-05, + "loss": 0.3667, + "num_tokens": 205833656.0, + "step": 1558 + }, + { + "epoch": 1.9094247246022031, + "grad_norm": 0.23169246315956116, + "learning_rate": 1e-05, + "loss": 0.3688, + "num_tokens": 206439536.0, + "step": 1560 + }, + { + "epoch": 1.91187270501836, + "grad_norm": 0.24443915486335754, + "learning_rate": 1e-05, + "loss": 0.3779, + "num_tokens": 207024878.0, + "step": 1562 + }, + { + "epoch": 1.9143206854345165, + "grad_norm": 0.2206883579492569, + "learning_rate": 1e-05, + "loss": 0.3691, + "num_tokens": 207630287.0, + "step": 1564 + }, + { + "epoch": 1.916768665850673, + "grad_norm": 0.23586352169513702, + "learning_rate": 1e-05, + "loss": 0.3675, + "num_tokens": 208197538.0, + "step": 1566 + }, + { + "epoch": 1.9192166462668299, + "grad_norm": 0.245608851313591, + "learning_rate": 1e-05, + "loss": 0.3704, + "num_tokens": 208789057.0, + "step": 1568 + }, + { + "epoch": 1.9216646266829867, + "grad_norm": 0.2444605678319931, + "learning_rate": 1e-05, + "loss": 0.3701, + "num_tokens": 209369955.0, + "step": 1570 + }, + { + "epoch": 1.9241126070991432, + "grad_norm": 0.23837168514728546, + "learning_rate": 1e-05, + "loss": 0.3734, + "num_tokens": 209961210.0, + "step": 1572 + }, + { + "epoch": 1.9265605875152998, + "grad_norm": 0.24565783143043518, + "learning_rate": 1e-05, + "loss": 0.3595, + "num_tokens": 210538072.0, + "step": 1574 + }, + { + "epoch": 1.9290085679314566, + "grad_norm": 0.2464468628168106, + "learning_rate": 1e-05, + "loss": 0.3758, + "num_tokens": 211099323.0, + "step": 1576 + }, + { + "epoch": 1.9314565483476134, + "grad_norm": 0.23516030609607697, + "learning_rate": 1e-05, + "loss": 0.373, + "num_tokens": 211721276.0, + "step": 1578 + }, + { + "epoch": 1.9339045287637697, + "grad_norm": 0.2265416979789734, + "learning_rate": 1e-05, + "loss": 0.3702, + "num_tokens": 212324805.0, + "step": 1580 + }, + { + "epoch": 1.9363525091799265, + "grad_norm": 0.2524133324623108, + "learning_rate": 1e-05, + "loss": 0.3804, + "num_tokens": 212953032.0, + "step": 1582 + }, + { + "epoch": 1.9388004895960833, + "grad_norm": 0.2479136735200882, + "learning_rate": 1e-05, + "loss": 0.3771, + "num_tokens": 213557991.0, + "step": 1584 + }, + { + "epoch": 1.9412484700122399, + "grad_norm": 0.25407838821411133, + "learning_rate": 1e-05, + "loss": 0.3775, + "num_tokens": 214170512.0, + "step": 1586 + }, + { + "epoch": 1.9436964504283964, + "grad_norm": 0.23423869907855988, + "learning_rate": 1e-05, + "loss": 0.3796, + "num_tokens": 214778571.0, + "step": 1588 + }, + { + "epoch": 1.9461444308445532, + "grad_norm": 0.2350510060787201, + "learning_rate": 1e-05, + "loss": 0.3767, + "num_tokens": 215370495.0, + "step": 1590 + }, + { + "epoch": 1.94859241126071, + "grad_norm": 0.2388015240430832, + "learning_rate": 1e-05, + "loss": 0.3621, + "num_tokens": 215962798.0, + "step": 1592 + }, + { + "epoch": 1.9510403916768666, + "grad_norm": 0.24582920968532562, + "learning_rate": 1e-05, + "loss": 0.3785, + "num_tokens": 216535476.0, + "step": 1594 + }, + { + "epoch": 1.9534883720930232, + "grad_norm": 0.22780659794807434, + "learning_rate": 1e-05, + "loss": 0.3516, + "num_tokens": 217157260.0, + "step": 1596 + }, + { + "epoch": 1.95593635250918, + "grad_norm": 0.24787680804729462, + "learning_rate": 1e-05, + "loss": 0.3719, + "num_tokens": 217759614.0, + "step": 1598 + }, + { + "epoch": 1.9583843329253368, + "grad_norm": 0.22913233935832977, + "learning_rate": 1e-05, + "loss": 0.3724, + "num_tokens": 218354172.0, + "step": 1600 + }, + { + "epoch": 1.960832313341493, + "grad_norm": 0.256535142660141, + "learning_rate": 1e-05, + "loss": 0.3625, + "num_tokens": 218928228.0, + "step": 1602 + }, + { + "epoch": 1.96328029375765, + "grad_norm": 0.35804998874664307, + "learning_rate": 1e-05, + "loss": 0.3726, + "num_tokens": 219538782.0, + "step": 1604 + }, + { + "epoch": 1.9657282741738067, + "grad_norm": 0.24214068055152893, + "learning_rate": 1e-05, + "loss": 0.3662, + "num_tokens": 220109485.0, + "step": 1606 + }, + { + "epoch": 1.9681762545899633, + "grad_norm": 0.2461036592721939, + "learning_rate": 1e-05, + "loss": 0.3644, + "num_tokens": 220702847.0, + "step": 1608 + }, + { + "epoch": 1.9706242350061198, + "grad_norm": 0.22849348187446594, + "learning_rate": 1e-05, + "loss": 0.3697, + "num_tokens": 221307344.0, + "step": 1610 + }, + { + "epoch": 1.9730722154222766, + "grad_norm": 0.24049215018749237, + "learning_rate": 1e-05, + "loss": 0.3726, + "num_tokens": 221897755.0, + "step": 1612 + }, + { + "epoch": 1.9755201958384334, + "grad_norm": 0.22864216566085815, + "learning_rate": 1e-05, + "loss": 0.3755, + "num_tokens": 222502665.0, + "step": 1614 + }, + { + "epoch": 1.97796817625459, + "grad_norm": 0.24188441038131714, + "learning_rate": 1e-05, + "loss": 0.3713, + "num_tokens": 223105542.0, + "step": 1616 + }, + { + "epoch": 1.9804161566707466, + "grad_norm": 0.2464837282896042, + "learning_rate": 1e-05, + "loss": 0.3683, + "num_tokens": 223664313.0, + "step": 1618 + }, + { + "epoch": 1.9828641370869033, + "grad_norm": 0.25046202540397644, + "learning_rate": 1e-05, + "loss": 0.3715, + "num_tokens": 224249672.0, + "step": 1620 + }, + { + "epoch": 1.9853121175030601, + "grad_norm": 0.2353782206773758, + "learning_rate": 1e-05, + "loss": 0.3673, + "num_tokens": 224831961.0, + "step": 1622 + }, + { + "epoch": 1.9877600979192165, + "grad_norm": 0.24083220958709717, + "learning_rate": 1e-05, + "loss": 0.351, + "num_tokens": 225377557.0, + "step": 1624 + }, + { + "epoch": 1.9902080783353733, + "grad_norm": 0.22877903282642365, + "learning_rate": 1e-05, + "loss": 0.361, + "num_tokens": 225983898.0, + "step": 1626 + }, + { + "epoch": 1.99265605875153, + "grad_norm": 0.23952309787273407, + "learning_rate": 1e-05, + "loss": 0.357, + "num_tokens": 226571294.0, + "step": 1628 + }, + { + "epoch": 1.9951040391676866, + "grad_norm": 0.2325475960969925, + "learning_rate": 1e-05, + "loss": 0.3581, + "num_tokens": 227158234.0, + "step": 1630 + }, + { + "epoch": 1.9975520195838432, + "grad_norm": 0.24239014089107513, + "learning_rate": 1e-05, + "loss": 0.3603, + "num_tokens": 227756083.0, + "step": 1632 + }, + { + "epoch": 2.0, + "grad_norm": 0.2470468282699585, + "learning_rate": 1e-05, + "loss": 0.3632, + "num_tokens": 228332108.0, + "step": 1634 + }, + { + "epoch": 2.002447980416157, + "grad_norm": 0.26383376121520996, + "learning_rate": 1e-05, + "loss": 0.3307, + "num_tokens": 228926091.0, + "step": 1636 + }, + { + "epoch": 2.004895960832313, + "grad_norm": 0.2823599576950073, + "learning_rate": 1e-05, + "loss": 0.3223, + "num_tokens": 229508232.0, + "step": 1638 + }, + { + "epoch": 2.00734394124847, + "grad_norm": 0.2819001078605652, + "learning_rate": 1e-05, + "loss": 0.3233, + "num_tokens": 230109053.0, + "step": 1640 + }, + { + "epoch": 2.0097919216646267, + "grad_norm": 0.2523273229598999, + "learning_rate": 1e-05, + "loss": 0.3339, + "num_tokens": 230732787.0, + "step": 1642 + }, + { + "epoch": 2.0122399020807835, + "grad_norm": 0.25086861848831177, + "learning_rate": 1e-05, + "loss": 0.32, + "num_tokens": 231324249.0, + "step": 1644 + }, + { + "epoch": 2.01468788249694, + "grad_norm": 0.25949275493621826, + "learning_rate": 1e-05, + "loss": 0.3149, + "num_tokens": 231904953.0, + "step": 1646 + }, + { + "epoch": 2.0171358629130967, + "grad_norm": 0.2776775360107422, + "learning_rate": 1e-05, + "loss": 0.3173, + "num_tokens": 232473855.0, + "step": 1648 + }, + { + "epoch": 2.0195838433292534, + "grad_norm": 0.26470237970352173, + "learning_rate": 1e-05, + "loss": 0.3242, + "num_tokens": 233076215.0, + "step": 1650 + }, + { + "epoch": 2.0220318237454102, + "grad_norm": 0.26274335384368896, + "learning_rate": 1e-05, + "loss": 0.3218, + "num_tokens": 233693874.0, + "step": 1652 + }, + { + "epoch": 2.0244798041615666, + "grad_norm": 0.2498115599155426, + "learning_rate": 1e-05, + "loss": 0.3168, + "num_tokens": 234300479.0, + "step": 1654 + }, + { + "epoch": 2.0269277845777234, + "grad_norm": 0.2655656635761261, + "learning_rate": 1e-05, + "loss": 0.3162, + "num_tokens": 234870651.0, + "step": 1656 + }, + { + "epoch": 2.02937576499388, + "grad_norm": 0.2704806923866272, + "learning_rate": 1e-05, + "loss": 0.326, + "num_tokens": 235462004.0, + "step": 1658 + }, + { + "epoch": 2.0318237454100365, + "grad_norm": 0.2563600242137909, + "learning_rate": 1e-05, + "loss": 0.326, + "num_tokens": 236065795.0, + "step": 1660 + }, + { + "epoch": 2.0342717258261933, + "grad_norm": 0.2734050452709198, + "learning_rate": 1e-05, + "loss": 0.3122, + "num_tokens": 236638043.0, + "step": 1662 + }, + { + "epoch": 2.03671970624235, + "grad_norm": 0.26698222756385803, + "learning_rate": 1e-05, + "loss": 0.3129, + "num_tokens": 237209148.0, + "step": 1664 + }, + { + "epoch": 2.039167686658507, + "grad_norm": 0.27800020575523376, + "learning_rate": 1e-05, + "loss": 0.3288, + "num_tokens": 237782297.0, + "step": 1666 + }, + { + "epoch": 2.0416156670746632, + "grad_norm": 0.2683752179145813, + "learning_rate": 1e-05, + "loss": 0.3337, + "num_tokens": 238370815.0, + "step": 1668 + }, + { + "epoch": 2.04406364749082, + "grad_norm": 0.2758134603500366, + "learning_rate": 1e-05, + "loss": 0.3203, + "num_tokens": 238974735.0, + "step": 1670 + }, + { + "epoch": 2.046511627906977, + "grad_norm": 0.2527683675289154, + "learning_rate": 1e-05, + "loss": 0.3131, + "num_tokens": 239582812.0, + "step": 1672 + }, + { + "epoch": 2.0489596083231336, + "grad_norm": 0.2617241442203522, + "learning_rate": 1e-05, + "loss": 0.3156, + "num_tokens": 240153131.0, + "step": 1674 + }, + { + "epoch": 2.05140758873929, + "grad_norm": 0.24523000419139862, + "learning_rate": 1e-05, + "loss": 0.318, + "num_tokens": 240743051.0, + "step": 1676 + }, + { + "epoch": 2.0538555691554468, + "grad_norm": 0.24731133878231049, + "learning_rate": 1e-05, + "loss": 0.3186, + "num_tokens": 241332255.0, + "step": 1678 + }, + { + "epoch": 2.0563035495716036, + "grad_norm": 0.26008081436157227, + "learning_rate": 1e-05, + "loss": 0.32, + "num_tokens": 241925955.0, + "step": 1680 + }, + { + "epoch": 2.05875152998776, + "grad_norm": 0.24686473608016968, + "learning_rate": 1e-05, + "loss": 0.3214, + "num_tokens": 242559460.0, + "step": 1682 + }, + { + "epoch": 2.0611995104039167, + "grad_norm": 0.2397872358560562, + "learning_rate": 1e-05, + "loss": 0.3135, + "num_tokens": 243147912.0, + "step": 1684 + }, + { + "epoch": 2.0636474908200735, + "grad_norm": 0.2412048876285553, + "learning_rate": 1e-05, + "loss": 0.3187, + "num_tokens": 243737102.0, + "step": 1686 + }, + { + "epoch": 2.0660954712362303, + "grad_norm": 0.2491815984249115, + "learning_rate": 1e-05, + "loss": 0.3268, + "num_tokens": 244379777.0, + "step": 1688 + }, + { + "epoch": 2.0685434516523866, + "grad_norm": 0.2565453350543976, + "learning_rate": 1e-05, + "loss": 0.3305, + "num_tokens": 244978647.0, + "step": 1690 + }, + { + "epoch": 2.0709914320685434, + "grad_norm": 0.24665790796279907, + "learning_rate": 1e-05, + "loss": 0.3157, + "num_tokens": 245555548.0, + "step": 1692 + }, + { + "epoch": 2.0734394124847, + "grad_norm": 0.2575874924659729, + "learning_rate": 1e-05, + "loss": 0.32, + "num_tokens": 246135906.0, + "step": 1694 + }, + { + "epoch": 2.075887392900857, + "grad_norm": 0.2599528133869171, + "learning_rate": 1e-05, + "loss": 0.331, + "num_tokens": 246714507.0, + "step": 1696 + }, + { + "epoch": 2.0783353733170133, + "grad_norm": 0.246963232755661, + "learning_rate": 1e-05, + "loss": 0.3178, + "num_tokens": 247312796.0, + "step": 1698 + }, + { + "epoch": 2.08078335373317, + "grad_norm": 0.2495131939649582, + "learning_rate": 1e-05, + "loss": 0.3156, + "num_tokens": 247899696.0, + "step": 1700 + }, + { + "epoch": 2.083231334149327, + "grad_norm": 0.24859756231307983, + "learning_rate": 1e-05, + "loss": 0.3216, + "num_tokens": 248478103.0, + "step": 1702 + }, + { + "epoch": 2.0856793145654833, + "grad_norm": 0.2642371356487274, + "learning_rate": 1e-05, + "loss": 0.3244, + "num_tokens": 249093078.0, + "step": 1704 + }, + { + "epoch": 2.08812729498164, + "grad_norm": 0.25307220220565796, + "learning_rate": 1e-05, + "loss": 0.3161, + "num_tokens": 249671000.0, + "step": 1706 + }, + { + "epoch": 2.090575275397797, + "grad_norm": 0.25887933373451233, + "learning_rate": 1e-05, + "loss": 0.3085, + "num_tokens": 250228631.0, + "step": 1708 + }, + { + "epoch": 2.0930232558139537, + "grad_norm": 0.25659000873565674, + "learning_rate": 1e-05, + "loss": 0.3319, + "num_tokens": 250830771.0, + "step": 1710 + }, + { + "epoch": 2.09547123623011, + "grad_norm": 0.27465444803237915, + "learning_rate": 1e-05, + "loss": 0.326, + "num_tokens": 251414177.0, + "step": 1712 + }, + { + "epoch": 2.097919216646267, + "grad_norm": 0.26044049859046936, + "learning_rate": 1e-05, + "loss": 0.3165, + "num_tokens": 251980472.0, + "step": 1714 + }, + { + "epoch": 2.1003671970624236, + "grad_norm": 0.26657673716545105, + "learning_rate": 1e-05, + "loss": 0.3195, + "num_tokens": 252563419.0, + "step": 1716 + }, + { + "epoch": 2.1028151774785804, + "grad_norm": 0.2445669025182724, + "learning_rate": 1e-05, + "loss": 0.3169, + "num_tokens": 253156291.0, + "step": 1718 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.26667121052742004, + "learning_rate": 1e-05, + "loss": 0.326, + "num_tokens": 253748542.0, + "step": 1720 + }, + { + "epoch": 2.1077111383108935, + "grad_norm": 0.26954278349876404, + "learning_rate": 1e-05, + "loss": 0.3291, + "num_tokens": 254355454.0, + "step": 1722 + }, + { + "epoch": 2.1101591187270503, + "grad_norm": 0.26194727420806885, + "learning_rate": 1e-05, + "loss": 0.3193, + "num_tokens": 254981618.0, + "step": 1724 + }, + { + "epoch": 2.1126070991432067, + "grad_norm": 0.26653769612312317, + "learning_rate": 1e-05, + "loss": 0.3249, + "num_tokens": 255580818.0, + "step": 1726 + }, + { + "epoch": 2.1150550795593634, + "grad_norm": 0.2530732750892639, + "learning_rate": 1e-05, + "loss": 0.3231, + "num_tokens": 256175654.0, + "step": 1728 + }, + { + "epoch": 2.1175030599755202, + "grad_norm": 0.26466599106788635, + "learning_rate": 1e-05, + "loss": 0.3149, + "num_tokens": 256769129.0, + "step": 1730 + }, + { + "epoch": 2.119951040391677, + "grad_norm": 0.2696734666824341, + "learning_rate": 1e-05, + "loss": 0.3253, + "num_tokens": 257368163.0, + "step": 1732 + }, + { + "epoch": 2.1223990208078334, + "grad_norm": 0.2791430354118347, + "learning_rate": 1e-05, + "loss": 0.3276, + "num_tokens": 257979865.0, + "step": 1734 + }, + { + "epoch": 2.12484700122399, + "grad_norm": 0.25057294964790344, + "learning_rate": 1e-05, + "loss": 0.3208, + "num_tokens": 258574921.0, + "step": 1736 + }, + { + "epoch": 2.127294981640147, + "grad_norm": 0.250307559967041, + "learning_rate": 1e-05, + "loss": 0.3161, + "num_tokens": 259188694.0, + "step": 1738 + }, + { + "epoch": 2.1297429620563038, + "grad_norm": 0.2579153776168823, + "learning_rate": 1e-05, + "loss": 0.3215, + "num_tokens": 259788088.0, + "step": 1740 + }, + { + "epoch": 2.13219094247246, + "grad_norm": 0.25432637333869934, + "learning_rate": 1e-05, + "loss": 0.3263, + "num_tokens": 260355939.0, + "step": 1742 + }, + { + "epoch": 2.134638922888617, + "grad_norm": 0.25369200110435486, + "learning_rate": 1e-05, + "loss": 0.3245, + "num_tokens": 260952386.0, + "step": 1744 + }, + { + "epoch": 2.1370869033047737, + "grad_norm": 0.26250943541526794, + "learning_rate": 1e-05, + "loss": 0.3149, + "num_tokens": 261542760.0, + "step": 1746 + }, + { + "epoch": 2.13953488372093, + "grad_norm": 0.2528480887413025, + "learning_rate": 1e-05, + "loss": 0.3177, + "num_tokens": 262121722.0, + "step": 1748 + }, + { + "epoch": 2.141982864137087, + "grad_norm": 0.2538909316062927, + "learning_rate": 1e-05, + "loss": 0.3291, + "num_tokens": 262729764.0, + "step": 1750 + }, + { + "epoch": 2.1444308445532436, + "grad_norm": 0.27961215376853943, + "learning_rate": 1e-05, + "loss": 0.3355, + "num_tokens": 263343702.0, + "step": 1752 + }, + { + "epoch": 2.1468788249694004, + "grad_norm": 0.2544754445552826, + "learning_rate": 1e-05, + "loss": 0.3193, + "num_tokens": 263930203.0, + "step": 1754 + }, + { + "epoch": 2.1493268053855568, + "grad_norm": 0.2512654960155487, + "learning_rate": 1e-05, + "loss": 0.3213, + "num_tokens": 264517859.0, + "step": 1756 + }, + { + "epoch": 2.1517747858017136, + "grad_norm": 0.24453192949295044, + "learning_rate": 1e-05, + "loss": 0.3316, + "num_tokens": 265135649.0, + "step": 1758 + }, + { + "epoch": 2.1542227662178703, + "grad_norm": 0.2528865337371826, + "learning_rate": 1e-05, + "loss": 0.3193, + "num_tokens": 265724746.0, + "step": 1760 + }, + { + "epoch": 2.1566707466340267, + "grad_norm": 0.25134533643722534, + "learning_rate": 1e-05, + "loss": 0.3275, + "num_tokens": 266324867.0, + "step": 1762 + }, + { + "epoch": 2.1591187270501835, + "grad_norm": 0.25282108783721924, + "learning_rate": 1e-05, + "loss": 0.3229, + "num_tokens": 266919065.0, + "step": 1764 + }, + { + "epoch": 2.1615667074663403, + "grad_norm": 0.25280824303627014, + "learning_rate": 1e-05, + "loss": 0.3249, + "num_tokens": 267495602.0, + "step": 1766 + }, + { + "epoch": 2.164014687882497, + "grad_norm": 0.2632092535495758, + "learning_rate": 1e-05, + "loss": 0.3296, + "num_tokens": 268081542.0, + "step": 1768 + }, + { + "epoch": 2.1664626682986534, + "grad_norm": 0.258695513010025, + "learning_rate": 1e-05, + "loss": 0.3271, + "num_tokens": 268691744.0, + "step": 1770 + }, + { + "epoch": 2.16891064871481, + "grad_norm": 0.26054874062538147, + "learning_rate": 1e-05, + "loss": 0.3359, + "num_tokens": 269267045.0, + "step": 1772 + }, + { + "epoch": 2.171358629130967, + "grad_norm": 0.2614051401615143, + "learning_rate": 1e-05, + "loss": 0.3218, + "num_tokens": 269852138.0, + "step": 1774 + }, + { + "epoch": 2.173806609547124, + "grad_norm": 0.25378215312957764, + "learning_rate": 1e-05, + "loss": 0.3138, + "num_tokens": 270447909.0, + "step": 1776 + }, + { + "epoch": 2.17625458996328, + "grad_norm": 0.25152331590652466, + "learning_rate": 1e-05, + "loss": 0.326, + "num_tokens": 271039752.0, + "step": 1778 + }, + { + "epoch": 2.178702570379437, + "grad_norm": 0.2568153440952301, + "learning_rate": 1e-05, + "loss": 0.3192, + "num_tokens": 271624236.0, + "step": 1780 + }, + { + "epoch": 2.1811505507955937, + "grad_norm": 0.24978622794151306, + "learning_rate": 1e-05, + "loss": 0.3301, + "num_tokens": 272229137.0, + "step": 1782 + }, + { + "epoch": 2.18359853121175, + "grad_norm": 0.25673535466194153, + "learning_rate": 1e-05, + "loss": 0.3275, + "num_tokens": 272804913.0, + "step": 1784 + }, + { + "epoch": 2.186046511627907, + "grad_norm": 0.26895347237586975, + "learning_rate": 1e-05, + "loss": 0.3206, + "num_tokens": 273380069.0, + "step": 1786 + }, + { + "epoch": 2.1884944920440637, + "grad_norm": 0.2613040804862976, + "learning_rate": 1e-05, + "loss": 0.3187, + "num_tokens": 273956503.0, + "step": 1788 + }, + { + "epoch": 2.1909424724602204, + "grad_norm": 0.2516239285469055, + "learning_rate": 1e-05, + "loss": 0.3172, + "num_tokens": 274541930.0, + "step": 1790 + }, + { + "epoch": 2.193390452876377, + "grad_norm": 0.24175681173801422, + "learning_rate": 1e-05, + "loss": 0.3082, + "num_tokens": 275133982.0, + "step": 1792 + }, + { + "epoch": 2.1958384332925336, + "grad_norm": 0.2464313507080078, + "learning_rate": 1e-05, + "loss": 0.3362, + "num_tokens": 275738202.0, + "step": 1794 + }, + { + "epoch": 2.1982864137086904, + "grad_norm": 0.2737571597099304, + "learning_rate": 1e-05, + "loss": 0.3106, + "num_tokens": 276299705.0, + "step": 1796 + }, + { + "epoch": 2.200734394124847, + "grad_norm": 0.26766151189804077, + "learning_rate": 1e-05, + "loss": 0.3172, + "num_tokens": 276904098.0, + "step": 1798 + }, + { + "epoch": 2.2031823745410035, + "grad_norm": 0.25508153438568115, + "learning_rate": 1e-05, + "loss": 0.3174, + "num_tokens": 277491075.0, + "step": 1800 + }, + { + "epoch": 2.2056303549571603, + "grad_norm": 0.2565786838531494, + "learning_rate": 1e-05, + "loss": 0.3186, + "num_tokens": 278068282.0, + "step": 1802 + }, + { + "epoch": 2.208078335373317, + "grad_norm": 0.25016945600509644, + "learning_rate": 1e-05, + "loss": 0.3138, + "num_tokens": 278674326.0, + "step": 1804 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 0.26891186833381653, + "learning_rate": 1e-05, + "loss": 0.3207, + "num_tokens": 279249277.0, + "step": 1806 + }, + { + "epoch": 2.2129742962056302, + "grad_norm": 0.2475995570421219, + "learning_rate": 1e-05, + "loss": 0.3134, + "num_tokens": 279825692.0, + "step": 1808 + }, + { + "epoch": 2.215422276621787, + "grad_norm": 0.2526877820491791, + "learning_rate": 1e-05, + "loss": 0.3239, + "num_tokens": 280411199.0, + "step": 1810 + }, + { + "epoch": 2.217870257037944, + "grad_norm": 0.29060930013656616, + "learning_rate": 1e-05, + "loss": 0.3202, + "num_tokens": 281024327.0, + "step": 1812 + }, + { + "epoch": 2.2203182374541, + "grad_norm": 0.2517792582511902, + "learning_rate": 1e-05, + "loss": 0.3201, + "num_tokens": 281603472.0, + "step": 1814 + }, + { + "epoch": 2.222766217870257, + "grad_norm": 0.2607707381248474, + "learning_rate": 1e-05, + "loss": 0.3131, + "num_tokens": 282183690.0, + "step": 1816 + }, + { + "epoch": 2.2252141982864138, + "grad_norm": 0.2490987777709961, + "learning_rate": 1e-05, + "loss": 0.3304, + "num_tokens": 282764121.0, + "step": 1818 + }, + { + "epoch": 2.2276621787025706, + "grad_norm": 0.25851231813430786, + "learning_rate": 1e-05, + "loss": 0.326, + "num_tokens": 283347033.0, + "step": 1820 + }, + { + "epoch": 2.230110159118727, + "grad_norm": 0.2487228363752365, + "learning_rate": 1e-05, + "loss": 0.3178, + "num_tokens": 283931247.0, + "step": 1822 + }, + { + "epoch": 2.2325581395348837, + "grad_norm": 0.25673067569732666, + "learning_rate": 1e-05, + "loss": 0.315, + "num_tokens": 284503256.0, + "step": 1824 + }, + { + "epoch": 2.2350061199510405, + "grad_norm": 0.2713610827922821, + "learning_rate": 1e-05, + "loss": 0.3281, + "num_tokens": 285105100.0, + "step": 1826 + }, + { + "epoch": 2.237454100367197, + "grad_norm": 0.2791447639465332, + "learning_rate": 1e-05, + "loss": 0.3349, + "num_tokens": 285674535.0, + "step": 1828 + }, + { + "epoch": 2.2399020807833536, + "grad_norm": 0.26290756464004517, + "learning_rate": 1e-05, + "loss": 0.3185, + "num_tokens": 286267445.0, + "step": 1830 + }, + { + "epoch": 2.2423500611995104, + "grad_norm": 0.2563938796520233, + "learning_rate": 1e-05, + "loss": 0.3259, + "num_tokens": 286847377.0, + "step": 1832 + }, + { + "epoch": 2.244798041615667, + "grad_norm": 0.2585829496383667, + "learning_rate": 1e-05, + "loss": 0.3271, + "num_tokens": 287405979.0, + "step": 1834 + }, + { + "epoch": 2.2472460220318236, + "grad_norm": 0.33251309394836426, + "learning_rate": 1e-05, + "loss": 0.3305, + "num_tokens": 288011470.0, + "step": 1836 + }, + { + "epoch": 2.2496940024479803, + "grad_norm": 0.2520661950111389, + "learning_rate": 1e-05, + "loss": 0.316, + "num_tokens": 288599550.0, + "step": 1838 + }, + { + "epoch": 2.252141982864137, + "grad_norm": 0.27090510725975037, + "learning_rate": 1e-05, + "loss": 0.3261, + "num_tokens": 289194997.0, + "step": 1840 + }, + { + "epoch": 2.254589963280294, + "grad_norm": 0.24881038069725037, + "learning_rate": 1e-05, + "loss": 0.3136, + "num_tokens": 289794688.0, + "step": 1842 + }, + { + "epoch": 2.2570379436964503, + "grad_norm": 0.25386103987693787, + "learning_rate": 1e-05, + "loss": 0.3289, + "num_tokens": 290400833.0, + "step": 1844 + }, + { + "epoch": 2.259485924112607, + "grad_norm": 0.24282298982143402, + "learning_rate": 1e-05, + "loss": 0.3304, + "num_tokens": 291039254.0, + "step": 1846 + }, + { + "epoch": 2.261933904528764, + "grad_norm": 0.2525377869606018, + "learning_rate": 1e-05, + "loss": 0.3128, + "num_tokens": 291621991.0, + "step": 1848 + }, + { + "epoch": 2.26438188494492, + "grad_norm": 0.2528608441352844, + "learning_rate": 1e-05, + "loss": 0.3258, + "num_tokens": 292214020.0, + "step": 1850 + }, + { + "epoch": 2.266829865361077, + "grad_norm": 0.2423013150691986, + "learning_rate": 1e-05, + "loss": 0.3294, + "num_tokens": 292851314.0, + "step": 1852 + }, + { + "epoch": 2.269277845777234, + "grad_norm": 0.24104289710521698, + "learning_rate": 1e-05, + "loss": 0.3135, + "num_tokens": 293454455.0, + "step": 1854 + }, + { + "epoch": 2.2717258261933906, + "grad_norm": 0.24933366477489471, + "learning_rate": 1e-05, + "loss": 0.3201, + "num_tokens": 294037926.0, + "step": 1856 + }, + { + "epoch": 2.274173806609547, + "grad_norm": 0.25847917795181274, + "learning_rate": 1e-05, + "loss": 0.3157, + "num_tokens": 294636937.0, + "step": 1858 + }, + { + "epoch": 2.2766217870257037, + "grad_norm": 0.2618774473667145, + "learning_rate": 1e-05, + "loss": 0.3258, + "num_tokens": 295229198.0, + "step": 1860 + }, + { + "epoch": 2.2790697674418605, + "grad_norm": 0.2543027400970459, + "learning_rate": 1e-05, + "loss": 0.3182, + "num_tokens": 295829418.0, + "step": 1862 + }, + { + "epoch": 2.2815177478580173, + "grad_norm": 0.2600286304950714, + "learning_rate": 1e-05, + "loss": 0.3277, + "num_tokens": 296419808.0, + "step": 1864 + }, + { + "epoch": 2.2839657282741737, + "grad_norm": 0.2631028890609741, + "learning_rate": 1e-05, + "loss": 0.331, + "num_tokens": 297031243.0, + "step": 1866 + }, + { + "epoch": 2.2864137086903304, + "grad_norm": 0.2603526711463928, + "learning_rate": 1e-05, + "loss": 0.3295, + "num_tokens": 297626941.0, + "step": 1868 + }, + { + "epoch": 2.2888616891064872, + "grad_norm": 0.25479280948638916, + "learning_rate": 1e-05, + "loss": 0.3217, + "num_tokens": 298178534.0, + "step": 1870 + }, + { + "epoch": 2.2913096695226436, + "grad_norm": 0.25323933362960815, + "learning_rate": 1e-05, + "loss": 0.3248, + "num_tokens": 298770355.0, + "step": 1872 + }, + { + "epoch": 2.2937576499388004, + "grad_norm": 0.26648545265197754, + "learning_rate": 1e-05, + "loss": 0.3197, + "num_tokens": 299340175.0, + "step": 1874 + }, + { + "epoch": 2.296205630354957, + "grad_norm": 0.2429247945547104, + "learning_rate": 1e-05, + "loss": 0.3125, + "num_tokens": 299919779.0, + "step": 1876 + }, + { + "epoch": 2.298653610771114, + "grad_norm": 0.25432199239730835, + "learning_rate": 1e-05, + "loss": 0.3125, + "num_tokens": 300496223.0, + "step": 1878 + }, + { + "epoch": 2.3011015911872703, + "grad_norm": 0.2576068341732025, + "learning_rate": 1e-05, + "loss": 0.3266, + "num_tokens": 301103197.0, + "step": 1880 + }, + { + "epoch": 2.303549571603427, + "grad_norm": 0.25816693902015686, + "learning_rate": 1e-05, + "loss": 0.3226, + "num_tokens": 301681339.0, + "step": 1882 + }, + { + "epoch": 2.305997552019584, + "grad_norm": 0.25362440943717957, + "learning_rate": 1e-05, + "loss": 0.328, + "num_tokens": 302307086.0, + "step": 1884 + }, + { + "epoch": 2.3084455324357407, + "grad_norm": 0.26144906878471375, + "learning_rate": 1e-05, + "loss": 0.333, + "num_tokens": 302937116.0, + "step": 1886 + }, + { + "epoch": 2.310893512851897, + "grad_norm": 0.25918832421302795, + "learning_rate": 1e-05, + "loss": 0.3161, + "num_tokens": 303516988.0, + "step": 1888 + }, + { + "epoch": 2.313341493268054, + "grad_norm": 0.2761925756931305, + "learning_rate": 1e-05, + "loss": 0.3313, + "num_tokens": 304132182.0, + "step": 1890 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.2471666783094406, + "learning_rate": 1e-05, + "loss": 0.3123, + "num_tokens": 304705289.0, + "step": 1892 + }, + { + "epoch": 2.318237454100367, + "grad_norm": 0.27031853795051575, + "learning_rate": 1e-05, + "loss": 0.3368, + "num_tokens": 305325892.0, + "step": 1894 + }, + { + "epoch": 2.3206854345165238, + "grad_norm": 0.253797322511673, + "learning_rate": 1e-05, + "loss": 0.3177, + "num_tokens": 305921736.0, + "step": 1896 + }, + { + "epoch": 2.3231334149326806, + "grad_norm": 0.25007933378219604, + "learning_rate": 1e-05, + "loss": 0.3302, + "num_tokens": 306526398.0, + "step": 1898 + }, + { + "epoch": 2.3255813953488373, + "grad_norm": 0.2611125409603119, + "learning_rate": 1e-05, + "loss": 0.3234, + "num_tokens": 307121655.0, + "step": 1900 + }, + { + "epoch": 2.3280293757649937, + "grad_norm": 0.2409556806087494, + "learning_rate": 1e-05, + "loss": 0.3168, + "num_tokens": 307691871.0, + "step": 1902 + }, + { + "epoch": 2.3304773561811505, + "grad_norm": 0.26317453384399414, + "learning_rate": 1e-05, + "loss": 0.3242, + "num_tokens": 308295238.0, + "step": 1904 + }, + { + "epoch": 2.3329253365973073, + "grad_norm": 0.2561883330345154, + "learning_rate": 1e-05, + "loss": 0.3249, + "num_tokens": 308901215.0, + "step": 1906 + }, + { + "epoch": 2.335373317013464, + "grad_norm": 0.2611881196498871, + "learning_rate": 1e-05, + "loss": 0.3201, + "num_tokens": 309525875.0, + "step": 1908 + }, + { + "epoch": 2.3378212974296204, + "grad_norm": 0.24518193304538727, + "learning_rate": 1e-05, + "loss": 0.3311, + "num_tokens": 310111880.0, + "step": 1910 + }, + { + "epoch": 2.340269277845777, + "grad_norm": 0.2470950484275818, + "learning_rate": 1e-05, + "loss": 0.32, + "num_tokens": 310703795.0, + "step": 1912 + }, + { + "epoch": 2.342717258261934, + "grad_norm": 0.2571375072002411, + "learning_rate": 1e-05, + "loss": 0.3278, + "num_tokens": 311292013.0, + "step": 1914 + }, + { + "epoch": 2.3451652386780903, + "grad_norm": 0.25326406955718994, + "learning_rate": 1e-05, + "loss": 0.3274, + "num_tokens": 311872266.0, + "step": 1916 + }, + { + "epoch": 2.347613219094247, + "grad_norm": 0.26152241230010986, + "learning_rate": 1e-05, + "loss": 0.3336, + "num_tokens": 312443087.0, + "step": 1918 + }, + { + "epoch": 2.350061199510404, + "grad_norm": 0.24850670993328094, + "learning_rate": 1e-05, + "loss": 0.3195, + "num_tokens": 313014118.0, + "step": 1920 + }, + { + "epoch": 2.3525091799265607, + "grad_norm": 0.2560129761695862, + "learning_rate": 1e-05, + "loss": 0.3157, + "num_tokens": 313595591.0, + "step": 1922 + }, + { + "epoch": 2.354957160342717, + "grad_norm": 0.25367817282676697, + "learning_rate": 1e-05, + "loss": 0.3213, + "num_tokens": 314199972.0, + "step": 1924 + }, + { + "epoch": 2.357405140758874, + "grad_norm": 0.25712472200393677, + "learning_rate": 1e-05, + "loss": 0.3211, + "num_tokens": 314795639.0, + "step": 1926 + }, + { + "epoch": 2.3598531211750307, + "grad_norm": 0.2516370713710785, + "learning_rate": 1e-05, + "loss": 0.3215, + "num_tokens": 315398116.0, + "step": 1928 + }, + { + "epoch": 2.3623011015911874, + "grad_norm": 0.2571152448654175, + "learning_rate": 1e-05, + "loss": 0.3212, + "num_tokens": 315967279.0, + "step": 1930 + }, + { + "epoch": 2.364749082007344, + "grad_norm": 0.25274693965911865, + "learning_rate": 1e-05, + "loss": 0.3297, + "num_tokens": 316551418.0, + "step": 1932 + }, + { + "epoch": 2.3671970624235006, + "grad_norm": 0.24968615174293518, + "learning_rate": 1e-05, + "loss": 0.3246, + "num_tokens": 317161199.0, + "step": 1934 + }, + { + "epoch": 2.3696450428396574, + "grad_norm": 0.25102177262306213, + "learning_rate": 1e-05, + "loss": 0.3351, + "num_tokens": 317772288.0, + "step": 1936 + }, + { + "epoch": 2.3720930232558137, + "grad_norm": 0.256553590297699, + "learning_rate": 1e-05, + "loss": 0.3128, + "num_tokens": 318324063.0, + "step": 1938 + }, + { + "epoch": 2.3745410036719705, + "grad_norm": 0.2520389258861542, + "learning_rate": 1e-05, + "loss": 0.3305, + "num_tokens": 318926620.0, + "step": 1940 + }, + { + "epoch": 2.3769889840881273, + "grad_norm": 0.2571202516555786, + "learning_rate": 1e-05, + "loss": 0.3196, + "num_tokens": 319515709.0, + "step": 1942 + }, + { + "epoch": 2.379436964504284, + "grad_norm": 0.2587038576602936, + "learning_rate": 1e-05, + "loss": 0.326, + "num_tokens": 320113312.0, + "step": 1944 + }, + { + "epoch": 2.3818849449204405, + "grad_norm": 0.24635723233222961, + "learning_rate": 1e-05, + "loss": 0.3225, + "num_tokens": 320710619.0, + "step": 1946 + }, + { + "epoch": 2.3843329253365972, + "grad_norm": 0.25865438580513, + "learning_rate": 1e-05, + "loss": 0.3267, + "num_tokens": 321291492.0, + "step": 1948 + }, + { + "epoch": 2.386780905752754, + "grad_norm": 0.25598016381263733, + "learning_rate": 1e-05, + "loss": 0.3297, + "num_tokens": 321877027.0, + "step": 1950 + }, + { + "epoch": 2.389228886168911, + "grad_norm": 0.2425144612789154, + "learning_rate": 1e-05, + "loss": 0.32, + "num_tokens": 322481168.0, + "step": 1952 + }, + { + "epoch": 2.391676866585067, + "grad_norm": 0.2606021761894226, + "learning_rate": 1e-05, + "loss": 0.3265, + "num_tokens": 323075623.0, + "step": 1954 + }, + { + "epoch": 2.394124847001224, + "grad_norm": 0.2530352473258972, + "learning_rate": 1e-05, + "loss": 0.3353, + "num_tokens": 323690401.0, + "step": 1956 + }, + { + "epoch": 2.3965728274173808, + "grad_norm": 0.24680784344673157, + "learning_rate": 1e-05, + "loss": 0.3104, + "num_tokens": 324265813.0, + "step": 1958 + }, + { + "epoch": 2.399020807833537, + "grad_norm": 0.24906018376350403, + "learning_rate": 1e-05, + "loss": 0.3305, + "num_tokens": 324872597.0, + "step": 1960 + }, + { + "epoch": 2.401468788249694, + "grad_norm": 0.26364362239837646, + "learning_rate": 1e-05, + "loss": 0.3241, + "num_tokens": 325494495.0, + "step": 1962 + }, + { + "epoch": 2.4039167686658507, + "grad_norm": 0.2485814392566681, + "learning_rate": 1e-05, + "loss": 0.3218, + "num_tokens": 326085292.0, + "step": 1964 + }, + { + "epoch": 2.4063647490820075, + "grad_norm": 0.25182193517684937, + "learning_rate": 1e-05, + "loss": 0.3178, + "num_tokens": 326663046.0, + "step": 1966 + }, + { + "epoch": 2.408812729498164, + "grad_norm": 0.2674141526222229, + "learning_rate": 1e-05, + "loss": 0.3191, + "num_tokens": 327237445.0, + "step": 1968 + }, + { + "epoch": 2.4112607099143206, + "grad_norm": 0.24762989580631256, + "learning_rate": 1e-05, + "loss": 0.315, + "num_tokens": 327830560.0, + "step": 1970 + }, + { + "epoch": 2.4137086903304774, + "grad_norm": 0.2627556324005127, + "learning_rate": 1e-05, + "loss": 0.3285, + "num_tokens": 328430438.0, + "step": 1972 + }, + { + "epoch": 2.416156670746634, + "grad_norm": 0.26080411672592163, + "learning_rate": 1e-05, + "loss": 0.3231, + "num_tokens": 329025822.0, + "step": 1974 + }, + { + "epoch": 2.4186046511627906, + "grad_norm": 0.2651062309741974, + "learning_rate": 1e-05, + "loss": 0.3196, + "num_tokens": 329616416.0, + "step": 1976 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 0.2645553946495056, + "learning_rate": 1e-05, + "loss": 0.3278, + "num_tokens": 330223848.0, + "step": 1978 + }, + { + "epoch": 2.423500611995104, + "grad_norm": 0.24931900203227997, + "learning_rate": 1e-05, + "loss": 0.3281, + "num_tokens": 330823815.0, + "step": 1980 + }, + { + "epoch": 2.4259485924112605, + "grad_norm": 0.26821616291999817, + "learning_rate": 1e-05, + "loss": 0.3235, + "num_tokens": 331414716.0, + "step": 1982 + }, + { + "epoch": 2.4283965728274173, + "grad_norm": 0.2596832513809204, + "learning_rate": 1e-05, + "loss": 0.3259, + "num_tokens": 332022505.0, + "step": 1984 + }, + { + "epoch": 2.430844553243574, + "grad_norm": 0.261565625667572, + "learning_rate": 1e-05, + "loss": 0.3219, + "num_tokens": 332643682.0, + "step": 1986 + }, + { + "epoch": 2.433292533659731, + "grad_norm": 0.25323015451431274, + "learning_rate": 1e-05, + "loss": 0.3187, + "num_tokens": 333236054.0, + "step": 1988 + }, + { + "epoch": 2.435740514075887, + "grad_norm": 0.27590665221214294, + "learning_rate": 1e-05, + "loss": 0.328, + "num_tokens": 333780888.0, + "step": 1990 + }, + { + "epoch": 2.438188494492044, + "grad_norm": 0.26514706015586853, + "learning_rate": 1e-05, + "loss": 0.3244, + "num_tokens": 334367607.0, + "step": 1992 + }, + { + "epoch": 2.440636474908201, + "grad_norm": 0.2672777473926544, + "learning_rate": 1e-05, + "loss": 0.3248, + "num_tokens": 334949364.0, + "step": 1994 + }, + { + "epoch": 2.4430844553243576, + "grad_norm": 0.27689221501350403, + "learning_rate": 1e-05, + "loss": 0.3241, + "num_tokens": 335514748.0, + "step": 1996 + }, + { + "epoch": 2.445532435740514, + "grad_norm": 0.2605144679546356, + "learning_rate": 1e-05, + "loss": 0.3135, + "num_tokens": 336080476.0, + "step": 1998 + }, + { + "epoch": 2.4479804161566707, + "grad_norm": 0.2649420499801636, + "learning_rate": 1e-05, + "loss": 0.3279, + "num_tokens": 336632578.0, + "step": 2000 + }, + { + "epoch": 2.4504283965728275, + "grad_norm": 0.2612689137458801, + "learning_rate": 1e-05, + "loss": 0.347, + "num_tokens": 337221659.0, + "step": 2002 + }, + { + "epoch": 2.452876376988984, + "grad_norm": 0.25685855746269226, + "learning_rate": 1e-05, + "loss": 0.3322, + "num_tokens": 337798086.0, + "step": 2004 + }, + { + "epoch": 2.4553243574051407, + "grad_norm": 0.2578672468662262, + "learning_rate": 1e-05, + "loss": 0.324, + "num_tokens": 338400582.0, + "step": 2006 + }, + { + "epoch": 2.4577723378212974, + "grad_norm": 0.2598439157009125, + "learning_rate": 1e-05, + "loss": 0.3371, + "num_tokens": 338998073.0, + "step": 2008 + }, + { + "epoch": 2.4602203182374542, + "grad_norm": 0.26257333159446716, + "learning_rate": 1e-05, + "loss": 0.3224, + "num_tokens": 339594572.0, + "step": 2010 + }, + { + "epoch": 2.4626682986536106, + "grad_norm": 0.2520839273929596, + "learning_rate": 1e-05, + "loss": 0.3264, + "num_tokens": 340156779.0, + "step": 2012 + }, + { + "epoch": 2.4651162790697674, + "grad_norm": 0.25438588857650757, + "learning_rate": 1e-05, + "loss": 0.3309, + "num_tokens": 340758075.0, + "step": 2014 + }, + { + "epoch": 2.467564259485924, + "grad_norm": 0.2509094178676605, + "learning_rate": 1e-05, + "loss": 0.3342, + "num_tokens": 341384154.0, + "step": 2016 + }, + { + "epoch": 2.470012239902081, + "grad_norm": 0.2800988554954529, + "learning_rate": 1e-05, + "loss": 0.3302, + "num_tokens": 341973383.0, + "step": 2018 + }, + { + "epoch": 2.4724602203182373, + "grad_norm": 0.2561210095882416, + "learning_rate": 1e-05, + "loss": 0.3182, + "num_tokens": 342579481.0, + "step": 2020 + }, + { + "epoch": 2.474908200734394, + "grad_norm": 0.24735815823078156, + "learning_rate": 1e-05, + "loss": 0.3277, + "num_tokens": 343171803.0, + "step": 2022 + }, + { + "epoch": 2.477356181150551, + "grad_norm": 0.2473309189081192, + "learning_rate": 1e-05, + "loss": 0.3225, + "num_tokens": 343800177.0, + "step": 2024 + }, + { + "epoch": 2.4798041615667072, + "grad_norm": 0.2584695816040039, + "learning_rate": 1e-05, + "loss": 0.3215, + "num_tokens": 344357371.0, + "step": 2026 + }, + { + "epoch": 2.482252141982864, + "grad_norm": 0.34662726521492004, + "learning_rate": 1e-05, + "loss": 0.3398, + "num_tokens": 344941928.0, + "step": 2028 + }, + { + "epoch": 2.484700122399021, + "grad_norm": 0.2552769184112549, + "learning_rate": 1e-05, + "loss": 0.3238, + "num_tokens": 345509899.0, + "step": 2030 + }, + { + "epoch": 2.4871481028151776, + "grad_norm": 0.2522905170917511, + "learning_rate": 1e-05, + "loss": 0.3317, + "num_tokens": 346102331.0, + "step": 2032 + }, + { + "epoch": 2.489596083231334, + "grad_norm": 0.2538517713546753, + "learning_rate": 1e-05, + "loss": 0.3165, + "num_tokens": 346684689.0, + "step": 2034 + }, + { + "epoch": 2.4920440636474908, + "grad_norm": 0.2533273994922638, + "learning_rate": 1e-05, + "loss": 0.3309, + "num_tokens": 347297893.0, + "step": 2036 + }, + { + "epoch": 2.4944920440636476, + "grad_norm": 0.24895316362380981, + "learning_rate": 1e-05, + "loss": 0.316, + "num_tokens": 347897795.0, + "step": 2038 + }, + { + "epoch": 2.4969400244798043, + "grad_norm": 0.2465788722038269, + "learning_rate": 1e-05, + "loss": 0.3255, + "num_tokens": 348514676.0, + "step": 2040 + }, + { + "epoch": 2.4993880048959607, + "grad_norm": 0.2624768018722534, + "learning_rate": 1e-05, + "loss": 0.3258, + "num_tokens": 349096443.0, + "step": 2042 + }, + { + "epoch": 2.5018359853121175, + "grad_norm": 0.25287625193595886, + "learning_rate": 1e-05, + "loss": 0.3223, + "num_tokens": 349706030.0, + "step": 2044 + }, + { + "epoch": 2.5042839657282743, + "grad_norm": 0.26094719767570496, + "learning_rate": 1e-05, + "loss": 0.3238, + "num_tokens": 350291310.0, + "step": 2046 + }, + { + "epoch": 2.5067319461444306, + "grad_norm": 0.24862395226955414, + "learning_rate": 1e-05, + "loss": 0.3142, + "num_tokens": 350897628.0, + "step": 2048 + }, + { + "epoch": 2.5091799265605874, + "grad_norm": 0.2516186237335205, + "learning_rate": 1e-05, + "loss": 0.3158, + "num_tokens": 351488228.0, + "step": 2050 + }, + { + "epoch": 2.511627906976744, + "grad_norm": 0.24392171204090118, + "learning_rate": 1e-05, + "loss": 0.3309, + "num_tokens": 352088867.0, + "step": 2052 + }, + { + "epoch": 2.514075887392901, + "grad_norm": 0.2512110769748688, + "learning_rate": 1e-05, + "loss": 0.3147, + "num_tokens": 352685180.0, + "step": 2054 + }, + { + "epoch": 2.516523867809058, + "grad_norm": 0.25949329137802124, + "learning_rate": 1e-05, + "loss": 0.3234, + "num_tokens": 353277486.0, + "step": 2056 + }, + { + "epoch": 2.518971848225214, + "grad_norm": 0.2465786188840866, + "learning_rate": 1e-05, + "loss": 0.3168, + "num_tokens": 353860979.0, + "step": 2058 + }, + { + "epoch": 2.521419828641371, + "grad_norm": 0.2434110790491104, + "learning_rate": 1e-05, + "loss": 0.3305, + "num_tokens": 354468059.0, + "step": 2060 + }, + { + "epoch": 2.5238678090575277, + "grad_norm": 0.253316730260849, + "learning_rate": 1e-05, + "loss": 0.3222, + "num_tokens": 355075770.0, + "step": 2062 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.2552608549594879, + "learning_rate": 1e-05, + "loss": 0.3203, + "num_tokens": 355626769.0, + "step": 2064 + }, + { + "epoch": 2.528763769889841, + "grad_norm": 0.25336354970932007, + "learning_rate": 1e-05, + "loss": 0.3112, + "num_tokens": 356154850.0, + "step": 2066 + }, + { + "epoch": 2.5312117503059977, + "grad_norm": 0.26685798168182373, + "learning_rate": 1e-05, + "loss": 0.3359, + "num_tokens": 356765495.0, + "step": 2068 + }, + { + "epoch": 2.533659730722154, + "grad_norm": 0.25629910826683044, + "learning_rate": 1e-05, + "loss": 0.3314, + "num_tokens": 357343529.0, + "step": 2070 + }, + { + "epoch": 2.536107711138311, + "grad_norm": 0.2573475241661072, + "learning_rate": 1e-05, + "loss": 0.3336, + "num_tokens": 357938110.0, + "step": 2072 + }, + { + "epoch": 2.5385556915544676, + "grad_norm": 0.24889910221099854, + "learning_rate": 1e-05, + "loss": 0.3134, + "num_tokens": 358505809.0, + "step": 2074 + }, + { + "epoch": 2.5410036719706244, + "grad_norm": 0.24748258292675018, + "learning_rate": 1e-05, + "loss": 0.3355, + "num_tokens": 359093575.0, + "step": 2076 + }, + { + "epoch": 2.543451652386781, + "grad_norm": 0.25954684615135193, + "learning_rate": 1e-05, + "loss": 0.3362, + "num_tokens": 359684808.0, + "step": 2078 + }, + { + "epoch": 2.5458996328029375, + "grad_norm": 0.25065338611602783, + "learning_rate": 1e-05, + "loss": 0.3206, + "num_tokens": 360271198.0, + "step": 2080 + }, + { + "epoch": 2.5483476132190943, + "grad_norm": 0.2514982521533966, + "learning_rate": 1e-05, + "loss": 0.3371, + "num_tokens": 360883060.0, + "step": 2082 + }, + { + "epoch": 2.550795593635251, + "grad_norm": 0.25208184123039246, + "learning_rate": 1e-05, + "loss": 0.332, + "num_tokens": 361474796.0, + "step": 2084 + }, + { + "epoch": 2.5532435740514074, + "grad_norm": 0.2545960545539856, + "learning_rate": 1e-05, + "loss": 0.3192, + "num_tokens": 362049485.0, + "step": 2086 + }, + { + "epoch": 2.5556915544675642, + "grad_norm": 0.2432592362165451, + "learning_rate": 1e-05, + "loss": 0.3094, + "num_tokens": 362633769.0, + "step": 2088 + }, + { + "epoch": 2.558139534883721, + "grad_norm": 0.257764607667923, + "learning_rate": 1e-05, + "loss": 0.3247, + "num_tokens": 363244435.0, + "step": 2090 + }, + { + "epoch": 2.5605875152998774, + "grad_norm": 0.26873084902763367, + "learning_rate": 1e-05, + "loss": 0.3268, + "num_tokens": 363814594.0, + "step": 2092 + }, + { + "epoch": 2.563035495716034, + "grad_norm": 0.25509268045425415, + "learning_rate": 1e-05, + "loss": 0.315, + "num_tokens": 364412136.0, + "step": 2094 + }, + { + "epoch": 2.565483476132191, + "grad_norm": 0.26778125762939453, + "learning_rate": 1e-05, + "loss": 0.3307, + "num_tokens": 364999748.0, + "step": 2096 + }, + { + "epoch": 2.5679314565483478, + "grad_norm": 0.26534485816955566, + "learning_rate": 1e-05, + "loss": 0.3224, + "num_tokens": 365565328.0, + "step": 2098 + }, + { + "epoch": 2.5703794369645045, + "grad_norm": 0.2511390745639801, + "learning_rate": 1e-05, + "loss": 0.3352, + "num_tokens": 366166059.0, + "step": 2100 + }, + { + "epoch": 2.572827417380661, + "grad_norm": 0.2617775499820709, + "learning_rate": 1e-05, + "loss": 0.3157, + "num_tokens": 366767336.0, + "step": 2102 + }, + { + "epoch": 2.5752753977968177, + "grad_norm": 0.27579638361930847, + "learning_rate": 1e-05, + "loss": 0.325, + "num_tokens": 367336066.0, + "step": 2104 + }, + { + "epoch": 2.5777233782129745, + "grad_norm": 0.2548695206642151, + "learning_rate": 1e-05, + "loss": 0.3168, + "num_tokens": 367930420.0, + "step": 2106 + }, + { + "epoch": 2.580171358629131, + "grad_norm": 0.2536294162273407, + "learning_rate": 1e-05, + "loss": 0.3266, + "num_tokens": 368516286.0, + "step": 2108 + }, + { + "epoch": 2.5826193390452876, + "grad_norm": 0.24859356880187988, + "learning_rate": 1e-05, + "loss": 0.3145, + "num_tokens": 369101362.0, + "step": 2110 + }, + { + "epoch": 2.5850673194614444, + "grad_norm": 0.26407885551452637, + "learning_rate": 1e-05, + "loss": 0.3367, + "num_tokens": 369705566.0, + "step": 2112 + }, + { + "epoch": 2.5875152998776008, + "grad_norm": 0.255536824464798, + "learning_rate": 1e-05, + "loss": 0.3219, + "num_tokens": 370320305.0, + "step": 2114 + }, + { + "epoch": 2.5899632802937576, + "grad_norm": 0.2630644142627716, + "learning_rate": 1e-05, + "loss": 0.3275, + "num_tokens": 370910515.0, + "step": 2116 + }, + { + "epoch": 2.5924112607099143, + "grad_norm": 0.2648698091506958, + "learning_rate": 1e-05, + "loss": 0.326, + "num_tokens": 371503862.0, + "step": 2118 + }, + { + "epoch": 2.594859241126071, + "grad_norm": 0.24663522839546204, + "learning_rate": 1e-05, + "loss": 0.3182, + "num_tokens": 372108685.0, + "step": 2120 + }, + { + "epoch": 2.597307221542228, + "grad_norm": 0.25443825125694275, + "learning_rate": 1e-05, + "loss": 0.3295, + "num_tokens": 372708606.0, + "step": 2122 + }, + { + "epoch": 2.5997552019583843, + "grad_norm": 0.24800601601600647, + "learning_rate": 1e-05, + "loss": 0.3243, + "num_tokens": 373316363.0, + "step": 2124 + }, + { + "epoch": 2.602203182374541, + "grad_norm": 0.2647169530391693, + "learning_rate": 1e-05, + "loss": 0.3248, + "num_tokens": 373895091.0, + "step": 2126 + }, + { + "epoch": 2.604651162790698, + "grad_norm": 0.24779950082302094, + "learning_rate": 1e-05, + "loss": 0.3258, + "num_tokens": 374468123.0, + "step": 2128 + }, + { + "epoch": 2.607099143206854, + "grad_norm": 0.2565578818321228, + "learning_rate": 1e-05, + "loss": 0.3317, + "num_tokens": 375090517.0, + "step": 2130 + }, + { + "epoch": 2.609547123623011, + "grad_norm": 0.2583713233470917, + "learning_rate": 1e-05, + "loss": 0.317, + "num_tokens": 375682699.0, + "step": 2132 + }, + { + "epoch": 2.611995104039168, + "grad_norm": 0.2560901641845703, + "learning_rate": 1e-05, + "loss": 0.3251, + "num_tokens": 376267492.0, + "step": 2134 + }, + { + "epoch": 2.614443084455324, + "grad_norm": 0.2595535218715668, + "learning_rate": 1e-05, + "loss": 0.3218, + "num_tokens": 376865486.0, + "step": 2136 + }, + { + "epoch": 2.616891064871481, + "grad_norm": 0.251692533493042, + "learning_rate": 1e-05, + "loss": 0.3181, + "num_tokens": 377428181.0, + "step": 2138 + }, + { + "epoch": 2.6193390452876377, + "grad_norm": 0.25037339329719543, + "learning_rate": 1e-05, + "loss": 0.3297, + "num_tokens": 378030958.0, + "step": 2140 + }, + { + "epoch": 2.6217870257037945, + "grad_norm": 0.25543636083602905, + "learning_rate": 1e-05, + "loss": 0.3282, + "num_tokens": 378624733.0, + "step": 2142 + }, + { + "epoch": 2.6242350061199513, + "grad_norm": 0.24831168353557587, + "learning_rate": 1e-05, + "loss": 0.3212, + "num_tokens": 379202817.0, + "step": 2144 + }, + { + "epoch": 2.6266829865361077, + "grad_norm": 0.2579514682292938, + "learning_rate": 1e-05, + "loss": 0.329, + "num_tokens": 379806787.0, + "step": 2146 + }, + { + "epoch": 2.6291309669522644, + "grad_norm": 0.24643900990486145, + "learning_rate": 1e-05, + "loss": 0.3185, + "num_tokens": 380411339.0, + "step": 2148 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.25557950139045715, + "learning_rate": 1e-05, + "loss": 0.3276, + "num_tokens": 381016714.0, + "step": 2150 + }, + { + "epoch": 2.6340269277845776, + "grad_norm": 0.244479238986969, + "learning_rate": 1e-05, + "loss": 0.3175, + "num_tokens": 381601301.0, + "step": 2152 + }, + { + "epoch": 2.6364749082007344, + "grad_norm": 0.2623596787452698, + "learning_rate": 1e-05, + "loss": 0.3291, + "num_tokens": 382217241.0, + "step": 2154 + }, + { + "epoch": 2.638922888616891, + "grad_norm": 0.24378331005573273, + "learning_rate": 1e-05, + "loss": 0.3215, + "num_tokens": 382853897.0, + "step": 2156 + }, + { + "epoch": 2.6413708690330475, + "grad_norm": 0.24978433549404144, + "learning_rate": 1e-05, + "loss": 0.3172, + "num_tokens": 383439968.0, + "step": 2158 + }, + { + "epoch": 2.6438188494492043, + "grad_norm": 0.2559012174606323, + "learning_rate": 1e-05, + "loss": 0.3305, + "num_tokens": 384024107.0, + "step": 2160 + }, + { + "epoch": 2.646266829865361, + "grad_norm": 0.24494871497154236, + "learning_rate": 1e-05, + "loss": 0.3192, + "num_tokens": 384609051.0, + "step": 2162 + }, + { + "epoch": 2.648714810281518, + "grad_norm": 0.26301223039627075, + "learning_rate": 1e-05, + "loss": 0.3171, + "num_tokens": 385178869.0, + "step": 2164 + }, + { + "epoch": 2.6511627906976747, + "grad_norm": 0.2578657865524292, + "learning_rate": 1e-05, + "loss": 0.3348, + "num_tokens": 385754760.0, + "step": 2166 + }, + { + "epoch": 2.653610771113831, + "grad_norm": 0.26708459854125977, + "learning_rate": 1e-05, + "loss": 0.323, + "num_tokens": 386342145.0, + "step": 2168 + }, + { + "epoch": 2.656058751529988, + "grad_norm": 0.25751882791519165, + "learning_rate": 1e-05, + "loss": 0.3276, + "num_tokens": 386934129.0, + "step": 2170 + }, + { + "epoch": 2.6585067319461446, + "grad_norm": 0.2512890696525574, + "learning_rate": 1e-05, + "loss": 0.3291, + "num_tokens": 387537081.0, + "step": 2172 + }, + { + "epoch": 2.660954712362301, + "grad_norm": 0.2504841089248657, + "learning_rate": 1e-05, + "loss": 0.3254, + "num_tokens": 388133636.0, + "step": 2174 + }, + { + "epoch": 2.6634026927784578, + "grad_norm": 0.2503618597984314, + "learning_rate": 1e-05, + "loss": 0.3393, + "num_tokens": 388744350.0, + "step": 2176 + }, + { + "epoch": 2.6658506731946146, + "grad_norm": 0.25509852170944214, + "learning_rate": 1e-05, + "loss": 0.3191, + "num_tokens": 389326694.0, + "step": 2178 + }, + { + "epoch": 2.668298653610771, + "grad_norm": 0.26111486554145813, + "learning_rate": 1e-05, + "loss": 0.3325, + "num_tokens": 389912003.0, + "step": 2180 + }, + { + "epoch": 2.6707466340269277, + "grad_norm": 0.2661671042442322, + "learning_rate": 1e-05, + "loss": 0.3265, + "num_tokens": 390471156.0, + "step": 2182 + }, + { + "epoch": 2.6731946144430845, + "grad_norm": 0.24591811001300812, + "learning_rate": 1e-05, + "loss": 0.317, + "num_tokens": 391051907.0, + "step": 2184 + }, + { + "epoch": 2.6756425948592413, + "grad_norm": 0.25238117575645447, + "learning_rate": 1e-05, + "loss": 0.322, + "num_tokens": 391640182.0, + "step": 2186 + }, + { + "epoch": 2.678090575275398, + "grad_norm": 0.2388547956943512, + "learning_rate": 1e-05, + "loss": 0.3173, + "num_tokens": 392251983.0, + "step": 2188 + }, + { + "epoch": 2.6805385556915544, + "grad_norm": 0.25939592719078064, + "learning_rate": 1e-05, + "loss": 0.3251, + "num_tokens": 392813772.0, + "step": 2190 + }, + { + "epoch": 2.682986536107711, + "grad_norm": 0.2584627866744995, + "learning_rate": 1e-05, + "loss": 0.3182, + "num_tokens": 393419195.0, + "step": 2192 + }, + { + "epoch": 2.685434516523868, + "grad_norm": 0.25700339674949646, + "learning_rate": 1e-05, + "loss": 0.328, + "num_tokens": 394004044.0, + "step": 2194 + }, + { + "epoch": 2.6878824969400243, + "grad_norm": 0.26522722840309143, + "learning_rate": 1e-05, + "loss": 0.3282, + "num_tokens": 394595054.0, + "step": 2196 + }, + { + "epoch": 2.690330477356181, + "grad_norm": 0.25307518243789673, + "learning_rate": 1e-05, + "loss": 0.3283, + "num_tokens": 395163217.0, + "step": 2198 + }, + { + "epoch": 2.692778457772338, + "grad_norm": 0.25655147433280945, + "learning_rate": 1e-05, + "loss": 0.3308, + "num_tokens": 395758903.0, + "step": 2200 + }, + { + "epoch": 2.6952264381884943, + "grad_norm": 0.235391765832901, + "learning_rate": 1e-05, + "loss": 0.3, + "num_tokens": 396332634.0, + "step": 2202 + }, + { + "epoch": 2.697674418604651, + "grad_norm": 0.25023701786994934, + "learning_rate": 1e-05, + "loss": 0.3264, + "num_tokens": 396928536.0, + "step": 2204 + }, + { + "epoch": 2.700122399020808, + "grad_norm": 0.24907897412776947, + "learning_rate": 1e-05, + "loss": 0.3187, + "num_tokens": 397500400.0, + "step": 2206 + }, + { + "epoch": 2.7025703794369647, + "grad_norm": 0.2496294379234314, + "learning_rate": 1e-05, + "loss": 0.3259, + "num_tokens": 398092585.0, + "step": 2208 + }, + { + "epoch": 2.7050183598531214, + "grad_norm": 0.2585897147655487, + "learning_rate": 1e-05, + "loss": 0.3166, + "num_tokens": 398668492.0, + "step": 2210 + }, + { + "epoch": 2.707466340269278, + "grad_norm": 0.25619223713874817, + "learning_rate": 1e-05, + "loss": 0.3338, + "num_tokens": 399251194.0, + "step": 2212 + }, + { + "epoch": 2.7099143206854346, + "grad_norm": 0.31031566858291626, + "learning_rate": 1e-05, + "loss": 0.3329, + "num_tokens": 399837720.0, + "step": 2214 + }, + { + "epoch": 2.7123623011015914, + "grad_norm": 0.24954873323440552, + "learning_rate": 1e-05, + "loss": 0.3256, + "num_tokens": 400442822.0, + "step": 2216 + }, + { + "epoch": 2.7148102815177477, + "grad_norm": 0.2491050362586975, + "learning_rate": 1e-05, + "loss": 0.3194, + "num_tokens": 401012536.0, + "step": 2218 + }, + { + "epoch": 2.7172582619339045, + "grad_norm": 0.2543574571609497, + "learning_rate": 1e-05, + "loss": 0.3289, + "num_tokens": 401597084.0, + "step": 2220 + }, + { + "epoch": 2.7197062423500613, + "grad_norm": 0.2470719963312149, + "learning_rate": 1e-05, + "loss": 0.3113, + "num_tokens": 402180043.0, + "step": 2222 + }, + { + "epoch": 2.7221542227662177, + "grad_norm": 0.2668313682079315, + "learning_rate": 1e-05, + "loss": 0.324, + "num_tokens": 402779003.0, + "step": 2224 + }, + { + "epoch": 2.7246022031823744, + "grad_norm": 0.24889680743217468, + "learning_rate": 1e-05, + "loss": 0.326, + "num_tokens": 403408199.0, + "step": 2226 + }, + { + "epoch": 2.7270501835985312, + "grad_norm": 0.25317952036857605, + "learning_rate": 1e-05, + "loss": 0.3232, + "num_tokens": 403992124.0, + "step": 2228 + }, + { + "epoch": 2.729498164014688, + "grad_norm": 0.2511397898197174, + "learning_rate": 1e-05, + "loss": 0.3281, + "num_tokens": 404560380.0, + "step": 2230 + }, + { + "epoch": 2.731946144430845, + "grad_norm": 0.2410099059343338, + "learning_rate": 1e-05, + "loss": 0.3263, + "num_tokens": 405160188.0, + "step": 2232 + }, + { + "epoch": 2.734394124847001, + "grad_norm": 0.25235220789909363, + "learning_rate": 1e-05, + "loss": 0.3255, + "num_tokens": 405748237.0, + "step": 2234 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 0.2504369616508484, + "learning_rate": 1e-05, + "loss": 0.337, + "num_tokens": 406342136.0, + "step": 2236 + }, + { + "epoch": 2.7392900856793148, + "grad_norm": 0.25537410378456116, + "learning_rate": 1e-05, + "loss": 0.3247, + "num_tokens": 406914106.0, + "step": 2238 + }, + { + "epoch": 2.741738066095471, + "grad_norm": 0.25914260745048523, + "learning_rate": 1e-05, + "loss": 0.33, + "num_tokens": 407516938.0, + "step": 2240 + }, + { + "epoch": 2.744186046511628, + "grad_norm": 0.25355833768844604, + "learning_rate": 1e-05, + "loss": 0.3215, + "num_tokens": 408085444.0, + "step": 2242 + }, + { + "epoch": 2.7466340269277847, + "grad_norm": 0.24525900185108185, + "learning_rate": 1e-05, + "loss": 0.3195, + "num_tokens": 408660772.0, + "step": 2244 + }, + { + "epoch": 2.749082007343941, + "grad_norm": 0.25692498683929443, + "learning_rate": 1e-05, + "loss": 0.3283, + "num_tokens": 409235521.0, + "step": 2246 + }, + { + "epoch": 2.751529987760098, + "grad_norm": 0.2557971477508545, + "learning_rate": 1e-05, + "loss": 0.3242, + "num_tokens": 409774999.0, + "step": 2248 + }, + { + "epoch": 2.7539779681762546, + "grad_norm": 0.25260117650032043, + "learning_rate": 1e-05, + "loss": 0.3156, + "num_tokens": 410367564.0, + "step": 2250 + }, + { + "epoch": 2.7564259485924114, + "grad_norm": 0.24852988123893738, + "learning_rate": 1e-05, + "loss": 0.3209, + "num_tokens": 410969022.0, + "step": 2252 + }, + { + "epoch": 2.758873929008568, + "grad_norm": 0.2457083761692047, + "learning_rate": 1e-05, + "loss": 0.3289, + "num_tokens": 411548719.0, + "step": 2254 + }, + { + "epoch": 2.7613219094247246, + "grad_norm": 0.25295954942703247, + "learning_rate": 1e-05, + "loss": 0.3246, + "num_tokens": 412130956.0, + "step": 2256 + }, + { + "epoch": 2.7637698898408813, + "grad_norm": 0.2472776174545288, + "learning_rate": 1e-05, + "loss": 0.3349, + "num_tokens": 412736440.0, + "step": 2258 + }, + { + "epoch": 2.766217870257038, + "grad_norm": 0.2581010162830353, + "learning_rate": 1e-05, + "loss": 0.3176, + "num_tokens": 413318066.0, + "step": 2260 + }, + { + "epoch": 2.7686658506731945, + "grad_norm": 0.2525770962238312, + "learning_rate": 1e-05, + "loss": 0.3177, + "num_tokens": 413917917.0, + "step": 2262 + }, + { + "epoch": 2.7711138310893513, + "grad_norm": 0.2548413872718811, + "learning_rate": 1e-05, + "loss": 0.3175, + "num_tokens": 414520002.0, + "step": 2264 + }, + { + "epoch": 2.773561811505508, + "grad_norm": 0.25579121708869934, + "learning_rate": 1e-05, + "loss": 0.336, + "num_tokens": 415107892.0, + "step": 2266 + }, + { + "epoch": 2.7760097919216644, + "grad_norm": 0.25981447100639343, + "learning_rate": 1e-05, + "loss": 0.3275, + "num_tokens": 415667389.0, + "step": 2268 + }, + { + "epoch": 2.778457772337821, + "grad_norm": 0.254893958568573, + "learning_rate": 1e-05, + "loss": 0.3406, + "num_tokens": 416255130.0, + "step": 2270 + }, + { + "epoch": 2.780905752753978, + "grad_norm": 0.24784359335899353, + "learning_rate": 1e-05, + "loss": 0.3257, + "num_tokens": 416839340.0, + "step": 2272 + }, + { + "epoch": 2.783353733170135, + "grad_norm": 0.2368021309375763, + "learning_rate": 1e-05, + "loss": 0.3222, + "num_tokens": 417443118.0, + "step": 2274 + }, + { + "epoch": 2.7858017135862916, + "grad_norm": 0.24467357993125916, + "learning_rate": 1e-05, + "loss": 0.333, + "num_tokens": 418042727.0, + "step": 2276 + }, + { + "epoch": 2.788249694002448, + "grad_norm": 0.257119357585907, + "learning_rate": 1e-05, + "loss": 0.3249, + "num_tokens": 418649730.0, + "step": 2278 + }, + { + "epoch": 2.7906976744186047, + "grad_norm": 0.24344298243522644, + "learning_rate": 1e-05, + "loss": 0.3217, + "num_tokens": 419233737.0, + "step": 2280 + }, + { + "epoch": 2.7931456548347615, + "grad_norm": 0.25380992889404297, + "learning_rate": 1e-05, + "loss": 0.3306, + "num_tokens": 419811330.0, + "step": 2282 + }, + { + "epoch": 2.795593635250918, + "grad_norm": 0.25424936413764954, + "learning_rate": 1e-05, + "loss": 0.3332, + "num_tokens": 420410428.0, + "step": 2284 + }, + { + "epoch": 2.7980416156670747, + "grad_norm": 0.26502135396003723, + "learning_rate": 1e-05, + "loss": 0.3351, + "num_tokens": 420991985.0, + "step": 2286 + }, + { + "epoch": 2.8004895960832314, + "grad_norm": 0.24788039922714233, + "learning_rate": 1e-05, + "loss": 0.3253, + "num_tokens": 421601039.0, + "step": 2288 + }, + { + "epoch": 2.802937576499388, + "grad_norm": 0.2538328468799591, + "learning_rate": 1e-05, + "loss": 0.3322, + "num_tokens": 422188161.0, + "step": 2290 + }, + { + "epoch": 2.8053855569155446, + "grad_norm": 0.23793523013591766, + "learning_rate": 1e-05, + "loss": 0.3219, + "num_tokens": 422803635.0, + "step": 2292 + }, + { + "epoch": 2.8078335373317014, + "grad_norm": 0.24384064972400665, + "learning_rate": 1e-05, + "loss": 0.3285, + "num_tokens": 423413615.0, + "step": 2294 + }, + { + "epoch": 2.810281517747858, + "grad_norm": 0.24296848475933075, + "learning_rate": 1e-05, + "loss": 0.3225, + "num_tokens": 424000637.0, + "step": 2296 + }, + { + "epoch": 2.812729498164015, + "grad_norm": 0.2595270276069641, + "learning_rate": 1e-05, + "loss": 0.3231, + "num_tokens": 424553573.0, + "step": 2298 + }, + { + "epoch": 2.8151774785801713, + "grad_norm": 0.25373417139053345, + "learning_rate": 1e-05, + "loss": 0.3259, + "num_tokens": 425142063.0, + "step": 2300 + }, + { + "epoch": 2.817625458996328, + "grad_norm": 0.24285350739955902, + "learning_rate": 1e-05, + "loss": 0.3156, + "num_tokens": 425760616.0, + "step": 2302 + }, + { + "epoch": 2.820073439412485, + "grad_norm": 0.2502899467945099, + "learning_rate": 1e-05, + "loss": 0.3203, + "num_tokens": 426347703.0, + "step": 2304 + }, + { + "epoch": 2.8225214198286412, + "grad_norm": 0.2567707896232605, + "learning_rate": 1e-05, + "loss": 0.3181, + "num_tokens": 426941074.0, + "step": 2306 + }, + { + "epoch": 2.824969400244798, + "grad_norm": 0.2537354826927185, + "learning_rate": 1e-05, + "loss": 0.3184, + "num_tokens": 427538380.0, + "step": 2308 + }, + { + "epoch": 2.827417380660955, + "grad_norm": 0.3201362192630768, + "learning_rate": 1e-05, + "loss": 0.3258, + "num_tokens": 428145766.0, + "step": 2310 + }, + { + "epoch": 2.829865361077111, + "grad_norm": 0.25844812393188477, + "learning_rate": 1e-05, + "loss": 0.3221, + "num_tokens": 428713019.0, + "step": 2312 + }, + { + "epoch": 2.832313341493268, + "grad_norm": 0.2547036409378052, + "learning_rate": 1e-05, + "loss": 0.321, + "num_tokens": 429313757.0, + "step": 2314 + }, + { + "epoch": 2.8347613219094248, + "grad_norm": 0.2565271854400635, + "learning_rate": 1e-05, + "loss": 0.3156, + "num_tokens": 429881555.0, + "step": 2316 + }, + { + "epoch": 2.8372093023255816, + "grad_norm": 0.2533104717731476, + "learning_rate": 1e-05, + "loss": 0.3249, + "num_tokens": 430476119.0, + "step": 2318 + }, + { + "epoch": 2.8396572827417383, + "grad_norm": 0.2432134449481964, + "learning_rate": 1e-05, + "loss": 0.3286, + "num_tokens": 431085110.0, + "step": 2320 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.2589612901210785, + "learning_rate": 1e-05, + "loss": 0.3259, + "num_tokens": 431646542.0, + "step": 2322 + }, + { + "epoch": 2.8445532435740515, + "grad_norm": 0.25283631682395935, + "learning_rate": 1e-05, + "loss": 0.3169, + "num_tokens": 432238611.0, + "step": 2324 + }, + { + "epoch": 2.8470012239902083, + "grad_norm": 0.27747154235839844, + "learning_rate": 1e-05, + "loss": 0.3281, + "num_tokens": 432805736.0, + "step": 2326 + }, + { + "epoch": 2.8494492044063646, + "grad_norm": 0.24919773638248444, + "learning_rate": 1e-05, + "loss": 0.327, + "num_tokens": 433416751.0, + "step": 2328 + }, + { + "epoch": 2.8518971848225214, + "grad_norm": 0.254692405462265, + "learning_rate": 1e-05, + "loss": 0.3198, + "num_tokens": 434017692.0, + "step": 2330 + }, + { + "epoch": 2.854345165238678, + "grad_norm": 0.2547382414340973, + "learning_rate": 1e-05, + "loss": 0.3215, + "num_tokens": 434628565.0, + "step": 2332 + }, + { + "epoch": 2.8567931456548346, + "grad_norm": 0.3222307562828064, + "learning_rate": 1e-05, + "loss": 0.3131, + "num_tokens": 435187461.0, + "step": 2334 + }, + { + "epoch": 2.8592411260709913, + "grad_norm": 0.2559938132762909, + "learning_rate": 1e-05, + "loss": 0.3344, + "num_tokens": 435791429.0, + "step": 2336 + }, + { + "epoch": 2.861689106487148, + "grad_norm": 0.24854914844036102, + "learning_rate": 1e-05, + "loss": 0.3267, + "num_tokens": 436369357.0, + "step": 2338 + }, + { + "epoch": 2.864137086903305, + "grad_norm": 0.24382859468460083, + "learning_rate": 1e-05, + "loss": 0.3377, + "num_tokens": 436988615.0, + "step": 2340 + }, + { + "epoch": 2.8665850673194617, + "grad_norm": 0.24978092312812805, + "learning_rate": 1e-05, + "loss": 0.318, + "num_tokens": 437587075.0, + "step": 2342 + }, + { + "epoch": 2.869033047735618, + "grad_norm": 0.2609320282936096, + "learning_rate": 1e-05, + "loss": 0.3202, + "num_tokens": 438146930.0, + "step": 2344 + }, + { + "epoch": 2.871481028151775, + "grad_norm": 0.25630444288253784, + "learning_rate": 1e-05, + "loss": 0.3284, + "num_tokens": 438732548.0, + "step": 2346 + }, + { + "epoch": 2.8739290085679317, + "grad_norm": 0.24788805842399597, + "learning_rate": 1e-05, + "loss": 0.3333, + "num_tokens": 439328118.0, + "step": 2348 + }, + { + "epoch": 2.876376988984088, + "grad_norm": 0.24909254908561707, + "learning_rate": 1e-05, + "loss": 0.3258, + "num_tokens": 439919885.0, + "step": 2350 + }, + { + "epoch": 2.878824969400245, + "grad_norm": 0.2596440613269806, + "learning_rate": 1e-05, + "loss": 0.3231, + "num_tokens": 440495586.0, + "step": 2352 + }, + { + "epoch": 2.8812729498164016, + "grad_norm": 0.2636336386203766, + "learning_rate": 1e-05, + "loss": 0.3093, + "num_tokens": 441074305.0, + "step": 2354 + }, + { + "epoch": 2.883720930232558, + "grad_norm": 0.3469390869140625, + "learning_rate": 1e-05, + "loss": 0.3108, + "num_tokens": 441634567.0, + "step": 2356 + }, + { + "epoch": 2.8861689106487147, + "grad_norm": 0.26014935970306396, + "learning_rate": 1e-05, + "loss": 0.3225, + "num_tokens": 442213028.0, + "step": 2358 + }, + { + "epoch": 2.8886168910648715, + "grad_norm": 0.2676273286342621, + "learning_rate": 1e-05, + "loss": 0.3327, + "num_tokens": 442803814.0, + "step": 2360 + }, + { + "epoch": 2.8910648714810283, + "grad_norm": 0.24446800351142883, + "learning_rate": 1e-05, + "loss": 0.3238, + "num_tokens": 443397503.0, + "step": 2362 + }, + { + "epoch": 2.8935128518971847, + "grad_norm": 0.25436699390411377, + "learning_rate": 1e-05, + "loss": 0.3339, + "num_tokens": 444015938.0, + "step": 2364 + }, + { + "epoch": 2.8959608323133414, + "grad_norm": 0.24916264414787292, + "learning_rate": 1e-05, + "loss": 0.3331, + "num_tokens": 444605250.0, + "step": 2366 + }, + { + "epoch": 2.8984088127294982, + "grad_norm": 0.24253615736961365, + "learning_rate": 1e-05, + "loss": 0.3193, + "num_tokens": 445196912.0, + "step": 2368 + }, + { + "epoch": 2.900856793145655, + "grad_norm": 0.2454930543899536, + "learning_rate": 1e-05, + "loss": 0.3155, + "num_tokens": 445802496.0, + "step": 2370 + }, + { + "epoch": 2.9033047735618114, + "grad_norm": 0.24793769419193268, + "learning_rate": 1e-05, + "loss": 0.3285, + "num_tokens": 446401922.0, + "step": 2372 + }, + { + "epoch": 2.905752753977968, + "grad_norm": 0.24584895372390747, + "learning_rate": 1e-05, + "loss": 0.3211, + "num_tokens": 447001867.0, + "step": 2374 + }, + { + "epoch": 2.908200734394125, + "grad_norm": 0.26117268204689026, + "learning_rate": 1e-05, + "loss": 0.3211, + "num_tokens": 447551496.0, + "step": 2376 + }, + { + "epoch": 2.9106487148102813, + "grad_norm": 0.24407590925693512, + "learning_rate": 1e-05, + "loss": 0.3158, + "num_tokens": 448133543.0, + "step": 2378 + }, + { + "epoch": 2.913096695226438, + "grad_norm": 0.24445265531539917, + "learning_rate": 1e-05, + "loss": 0.3237, + "num_tokens": 448731691.0, + "step": 2380 + }, + { + "epoch": 2.915544675642595, + "grad_norm": 0.24581503868103027, + "learning_rate": 1e-05, + "loss": 0.3211, + "num_tokens": 449327173.0, + "step": 2382 + }, + { + "epoch": 2.9179926560587517, + "grad_norm": 0.2608600854873657, + "learning_rate": 1e-05, + "loss": 0.3281, + "num_tokens": 449920674.0, + "step": 2384 + }, + { + "epoch": 2.920440636474908, + "grad_norm": 0.24781474471092224, + "learning_rate": 1e-05, + "loss": 0.3138, + "num_tokens": 450507378.0, + "step": 2386 + }, + { + "epoch": 2.922888616891065, + "grad_norm": 0.24228855967521667, + "learning_rate": 1e-05, + "loss": 0.3183, + "num_tokens": 451081732.0, + "step": 2388 + }, + { + "epoch": 2.9253365973072216, + "grad_norm": 0.257331520318985, + "learning_rate": 1e-05, + "loss": 0.3198, + "num_tokens": 451660618.0, + "step": 2390 + }, + { + "epoch": 2.9277845777233784, + "grad_norm": 0.2509153485298157, + "learning_rate": 1e-05, + "loss": 0.3227, + "num_tokens": 452229573.0, + "step": 2392 + }, + { + "epoch": 2.9302325581395348, + "grad_norm": 0.25570929050445557, + "learning_rate": 1e-05, + "loss": 0.3223, + "num_tokens": 452841553.0, + "step": 2394 + }, + { + "epoch": 2.9326805385556916, + "grad_norm": 0.24994735419750214, + "learning_rate": 1e-05, + "loss": 0.3223, + "num_tokens": 453429653.0, + "step": 2396 + }, + { + "epoch": 2.9351285189718483, + "grad_norm": 0.25359705090522766, + "learning_rate": 1e-05, + "loss": 0.323, + "num_tokens": 454026873.0, + "step": 2398 + }, + { + "epoch": 2.9375764993880047, + "grad_norm": 0.24387480318546295, + "learning_rate": 1e-05, + "loss": 0.3261, + "num_tokens": 454620108.0, + "step": 2400 + }, + { + "epoch": 2.9400244798041615, + "grad_norm": 0.2454231232404709, + "learning_rate": 1e-05, + "loss": 0.3292, + "num_tokens": 455236527.0, + "step": 2402 + }, + { + "epoch": 2.9424724602203183, + "grad_norm": 0.2575424015522003, + "learning_rate": 1e-05, + "loss": 0.3285, + "num_tokens": 455862858.0, + "step": 2404 + }, + { + "epoch": 2.944920440636475, + "grad_norm": 0.2581271231174469, + "learning_rate": 1e-05, + "loss": 0.3318, + "num_tokens": 456431610.0, + "step": 2406 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 0.24646282196044922, + "learning_rate": 1e-05, + "loss": 0.3201, + "num_tokens": 457014496.0, + "step": 2408 + }, + { + "epoch": 2.949816401468788, + "grad_norm": 0.2476486712694168, + "learning_rate": 1e-05, + "loss": 0.327, + "num_tokens": 457624802.0, + "step": 2410 + }, + { + "epoch": 2.952264381884945, + "grad_norm": 0.2568027973175049, + "learning_rate": 1e-05, + "loss": 0.3165, + "num_tokens": 458184907.0, + "step": 2412 + }, + { + "epoch": 2.954712362301102, + "grad_norm": 0.27381864190101624, + "learning_rate": 1e-05, + "loss": 0.3163, + "num_tokens": 458756228.0, + "step": 2414 + }, + { + "epoch": 2.957160342717258, + "grad_norm": 0.2646235227584839, + "learning_rate": 1e-05, + "loss": 0.3248, + "num_tokens": 459326023.0, + "step": 2416 + }, + { + "epoch": 2.959608323133415, + "grad_norm": 0.2521771490573883, + "learning_rate": 1e-05, + "loss": 0.3436, + "num_tokens": 459922424.0, + "step": 2418 + }, + { + "epoch": 2.9620563035495717, + "grad_norm": 0.25192415714263916, + "learning_rate": 1e-05, + "loss": 0.3258, + "num_tokens": 460506985.0, + "step": 2420 + }, + { + "epoch": 2.964504283965728, + "grad_norm": 0.24735009670257568, + "learning_rate": 1e-05, + "loss": 0.32, + "num_tokens": 461089976.0, + "step": 2422 + }, + { + "epoch": 2.966952264381885, + "grad_norm": 0.25478076934814453, + "learning_rate": 1e-05, + "loss": 0.322, + "num_tokens": 461681056.0, + "step": 2424 + }, + { + "epoch": 2.9694002447980417, + "grad_norm": 0.24166902899742126, + "learning_rate": 1e-05, + "loss": 0.3199, + "num_tokens": 462278680.0, + "step": 2426 + }, + { + "epoch": 2.9718482252141984, + "grad_norm": 0.23424232006072998, + "learning_rate": 1e-05, + "loss": 0.3292, + "num_tokens": 462918784.0, + "step": 2428 + }, + { + "epoch": 2.974296205630355, + "grad_norm": 0.24685415625572205, + "learning_rate": 1e-05, + "loss": 0.3349, + "num_tokens": 463532263.0, + "step": 2430 + }, + { + "epoch": 2.9767441860465116, + "grad_norm": 0.24677598476409912, + "learning_rate": 1e-05, + "loss": 0.3235, + "num_tokens": 464170104.0, + "step": 2432 + }, + { + "epoch": 2.9791921664626684, + "grad_norm": 0.257145494222641, + "learning_rate": 1e-05, + "loss": 0.3254, + "num_tokens": 464754665.0, + "step": 2434 + }, + { + "epoch": 2.981640146878825, + "grad_norm": 0.2553233802318573, + "learning_rate": 1e-05, + "loss": 0.3316, + "num_tokens": 465349241.0, + "step": 2436 + }, + { + "epoch": 2.9840881272949815, + "grad_norm": 0.2508125305175781, + "learning_rate": 1e-05, + "loss": 0.3222, + "num_tokens": 465923272.0, + "step": 2438 + }, + { + "epoch": 2.9865361077111383, + "grad_norm": 0.2562168538570404, + "learning_rate": 1e-05, + "loss": 0.3223, + "num_tokens": 466496143.0, + "step": 2440 + }, + { + "epoch": 2.988984088127295, + "grad_norm": 0.24228590726852417, + "learning_rate": 1e-05, + "loss": 0.3196, + "num_tokens": 467109273.0, + "step": 2442 + }, + { + "epoch": 2.9914320685434515, + "grad_norm": 0.24678544700145721, + "learning_rate": 1e-05, + "loss": 0.3139, + "num_tokens": 467701007.0, + "step": 2444 + }, + { + "epoch": 2.9938800489596082, + "grad_norm": 0.2628517746925354, + "learning_rate": 1e-05, + "loss": 0.3247, + "num_tokens": 468293330.0, + "step": 2446 + }, + { + "epoch": 2.996328029375765, + "grad_norm": 0.2700228691101074, + "learning_rate": 1e-05, + "loss": 0.3248, + "num_tokens": 468896606.0, + "step": 2448 + }, + { + "epoch": 2.998776009791922, + "grad_norm": 0.24113397300243378, + "learning_rate": 1e-05, + "loss": 0.3309, + "num_tokens": 469499267.0, + "step": 2450 + }, + { + "epoch": 3.0, + "num_tokens": 469802937.0, + "step": 2451, + "total_flos": 2.993584104037312e+19, + "train_loss": 0.2255855570249001, + "train_runtime": 11813.0141, + "train_samples_per_second": 23.224, + "train_steps_per_second": 0.207 + } + ], + "logging_steps": 2, + "max_steps": 2451, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 123, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.993584104037312e+19, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}