{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 2451,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0024479804161566705,
      "grad_norm": 5.113167762756348,
      "learning_rate": 4.0650406504065046e-08,
      "loss": 1.1539,
      "num_tokens": 579147.0,
      "step": 2
    },
    {
      "epoch": 0.004895960832313341,
      "grad_norm": 4.9927592277526855,
      "learning_rate": 1.2195121951219514e-07,
      "loss": 1.1356,
      "num_tokens": 1171666.0,
      "step": 4
    },
    {
      "epoch": 0.0073439412484700125,
      "grad_norm": 5.046677112579346,
      "learning_rate": 2.0325203252032523e-07,
      "loss": 1.1527,
      "num_tokens": 1782835.0,
      "step": 6
    },
    {
      "epoch": 0.009791921664626682,
      "grad_norm": 5.062073707580566,
      "learning_rate": 2.845528455284553e-07,
      "loss": 1.1503,
      "num_tokens": 2373836.0,
      "step": 8
    },
    {
      "epoch": 0.012239902080783354,
      "grad_norm": 5.033382892608643,
      "learning_rate": 3.6585365853658536e-07,
      "loss": 1.1475,
      "num_tokens": 2949918.0,
      "step": 10
    },
    {
      "epoch": 0.014687882496940025,
      "grad_norm": 4.7532267570495605,
      "learning_rate": 4.471544715447155e-07,
      "loss": 1.1358,
      "num_tokens": 3550879.0,
      "step": 12
    },
    {
      "epoch": 0.017135862913096694,
      "grad_norm": 4.71107292175293,
      "learning_rate": 5.284552845528456e-07,
      "loss": 1.1428,
      "num_tokens": 4150487.0,
      "step": 14
    },
    {
      "epoch": 0.019583843329253364,
      "grad_norm": 4.236279487609863,
      "learning_rate": 6.097560975609757e-07,
      "loss": 1.0976,
      "num_tokens": 4732419.0,
      "step": 16
    },
    {
      "epoch": 0.022031823745410038,
      "grad_norm": 4.217196941375732,
      "learning_rate": 6.910569105691058e-07,
      "loss": 1.104,
      "num_tokens": 5323835.0,
      "step": 18
    },
    {
      "epoch": 0.02447980416156671,
      "grad_norm": 4.150773525238037,
      "learning_rate": 7.723577235772359e-07,
      "loss": 1.0909,
      "num_tokens": 5907508.0,
      "step": 20
    },
    {
      "epoch": 0.02692778457772338,
      "grad_norm": 3.1003658771514893,
      "learning_rate": 8.53658536585366e-07,
      "loss": 1.0317,
      "num_tokens": 6502779.0,
      "step": 22
    },
    {
      "epoch": 0.02937576499388005,
      "grad_norm": 2.9581403732299805,
      "learning_rate": 9.349593495934959e-07,
      "loss": 0.9689,
      "num_tokens": 7084297.0,
      "step": 24
    },
    {
      "epoch": 0.03182374541003672,
      "grad_norm": 2.856872797012329,
      "learning_rate": 1.0162601626016261e-06,
      "loss": 0.9612,
      "num_tokens": 7677074.0,
      "step": 26
    },
    {
      "epoch": 0.03427172582619339,
      "grad_norm": 2.546175241470337,
      "learning_rate": 1.0975609756097562e-06,
      "loss": 0.9467,
      "num_tokens": 8262795.0,
      "step": 28
    },
    {
      "epoch": 0.03671970624235006,
      "grad_norm": 1.7229831218719482,
      "learning_rate": 1.1788617886178863e-06,
      "loss": 0.8588,
      "num_tokens": 8856597.0,
      "step": 30
    },
    {
      "epoch": 0.03916768665850673,
      "grad_norm": 1.508795976638794,
      "learning_rate": 1.2601626016260162e-06,
      "loss": 0.8236,
      "num_tokens": 9443865.0,
      "step": 32
    },
    {
      "epoch": 0.0416156670746634,
      "grad_norm": 1.2896347045898438,
      "learning_rate": 1.3414634146341465e-06,
      "loss": 0.798,
      "num_tokens": 10028572.0,
      "step": 34
    },
    {
      "epoch": 0.044063647490820076,
      "grad_norm": 1.0740002393722534,
      "learning_rate": 1.4227642276422766e-06,
      "loss": 0.7936,
      "num_tokens": 10622144.0,
      "step": 36
    },
    {
      "epoch": 0.046511627906976744,
      "grad_norm": 0.8853735327720642,
      "learning_rate": 1.5040650406504067e-06,
      "loss": 0.7573,
      "num_tokens": 11195500.0,
      "step": 38
    },
    {
      "epoch": 0.04895960832313342,
      "grad_norm": 0.6570454239845276,
      "learning_rate": 1.5853658536585368e-06,
      "loss": 0.7249,
      "num_tokens": 11804642.0,
      "step": 40
    },
    {
      "epoch": 0.051407588739290085,
      "grad_norm": 0.7068105340003967,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 0.7,
      "num_tokens": 12412613.0,
      "step": 42
    },
    {
      "epoch": 0.05385556915544676,
      "grad_norm": 0.6152868270874023,
      "learning_rate": 1.747967479674797e-06,
      "loss": 0.6867,
      "num_tokens": 12980451.0,
      "step": 44
    },
    {
      "epoch": 0.056303549571603426,
      "grad_norm": 0.5061647891998291,
      "learning_rate": 1.8292682926829268e-06,
      "loss": 0.6777,
      "num_tokens": 13573410.0,
      "step": 46
    },
    {
      "epoch": 0.0587515299877601,
      "grad_norm": 0.4394634962081909,
      "learning_rate": 1.9105691056910574e-06,
      "loss": 0.6849,
      "num_tokens": 14151346.0,
      "step": 48
    },
    {
      "epoch": 0.06119951040391677,
      "grad_norm": 0.38909780979156494,
      "learning_rate": 1.991869918699187e-06,
      "loss": 0.6814,
      "num_tokens": 14772803.0,
      "step": 50
    },
    {
      "epoch": 0.06364749082007344,
      "grad_norm": 0.3801971673965454,
      "learning_rate": 2.073170731707317e-06,
      "loss": 0.6561,
      "num_tokens": 15354869.0,
      "step": 52
    },
    {
      "epoch": 0.06609547123623011,
      "grad_norm": 0.3451838493347168,
      "learning_rate": 2.154471544715447e-06,
      "loss": 0.6419,
      "num_tokens": 15944249.0,
      "step": 54
    },
    {
      "epoch": 0.06854345165238677,
      "grad_norm": 0.30912837386131287,
      "learning_rate": 2.2357723577235773e-06,
      "loss": 0.6106,
      "num_tokens": 16523611.0,
      "step": 56
    },
    {
      "epoch": 0.07099143206854346,
      "grad_norm": 0.30823254585266113,
      "learning_rate": 2.317073170731708e-06,
      "loss": 0.6246,
      "num_tokens": 17110353.0,
      "step": 58
    },
    {
      "epoch": 0.07343941248470012,
      "grad_norm": 0.29659801721572876,
      "learning_rate": 2.3983739837398375e-06,
      "loss": 0.6197,
      "num_tokens": 17682600.0,
      "step": 60
    },
    {
      "epoch": 0.07588739290085679,
      "grad_norm": 0.3088859021663666,
      "learning_rate": 2.4796747967479676e-06,
      "loss": 0.6052,
      "num_tokens": 18274385.0,
      "step": 62
    },
    {
      "epoch": 0.07833537331701346,
      "grad_norm": 0.2870602309703827,
      "learning_rate": 2.5609756097560977e-06,
      "loss": 0.6243,
      "num_tokens": 18864027.0,
      "step": 64
    },
    {
      "epoch": 0.08078335373317014,
      "grad_norm": 0.2588765323162079,
      "learning_rate": 2.6422764227642278e-06,
      "loss": 0.5855,
      "num_tokens": 19450167.0,
      "step": 66
    },
    {
      "epoch": 0.0832313341493268,
      "grad_norm": 0.2649621367454529,
      "learning_rate": 2.723577235772358e-06,
      "loss": 0.5827,
      "num_tokens": 20027616.0,
      "step": 68
    },
    {
      "epoch": 0.08567931456548347,
      "grad_norm": 0.26677006483078003,
      "learning_rate": 2.8048780487804884e-06,
      "loss": 0.5919,
      "num_tokens": 20641209.0,
      "step": 70
    },
    {
      "epoch": 0.08812729498164015,
      "grad_norm": 0.25061097741127014,
      "learning_rate": 2.8861788617886185e-06,
      "loss": 0.5694,
      "num_tokens": 21231993.0,
      "step": 72
    },
    {
      "epoch": 0.09057527539779682,
      "grad_norm": 0.2478611171245575,
      "learning_rate": 2.967479674796748e-06,
      "loss": 0.561,
      "num_tokens": 21822237.0,
      "step": 74
    },
    {
      "epoch": 0.09302325581395349,
      "grad_norm": 0.2571170926094055,
      "learning_rate": 3.0487804878048782e-06,
      "loss": 0.5809,
      "num_tokens": 22406364.0,
      "step": 76
    },
    {
      "epoch": 0.09547123623011015,
      "grad_norm": 0.24517571926116943,
      "learning_rate": 3.1300813008130083e-06,
      "loss": 0.571,
      "num_tokens": 23012261.0,
      "step": 78
    },
    {
      "epoch": 0.09791921664626684,
      "grad_norm": 0.25413069128990173,
      "learning_rate": 3.211382113821139e-06,
      "loss": 0.5671,
      "num_tokens": 23609152.0,
      "step": 80
    },
    {
      "epoch": 0.1003671970624235,
      "grad_norm": 0.2391115128993988,
      "learning_rate": 3.292682926829269e-06,
      "loss": 0.56,
      "num_tokens": 24195467.0,
      "step": 82
    },
    {
      "epoch": 0.10281517747858017,
      "grad_norm": 0.2625662088394165,
      "learning_rate": 3.3739837398373986e-06,
      "loss": 0.5457,
      "num_tokens": 24776287.0,
      "step": 84
    },
    {
      "epoch": 0.10526315789473684,
      "grad_norm": 0.25236576795578003,
      "learning_rate": 3.4552845528455287e-06,
      "loss": 0.5555,
      "num_tokens": 25353355.0,
      "step": 86
    },
    {
      "epoch": 0.10771113831089352,
      "grad_norm": 0.2561672627925873,
      "learning_rate": 3.5365853658536588e-06,
      "loss": 0.5514,
      "num_tokens": 25910479.0,
      "step": 88
    },
    {
      "epoch": 0.11015911872705018,
      "grad_norm": 0.25256532430648804,
      "learning_rate": 3.6178861788617893e-06,
      "loss": 0.5572,
      "num_tokens": 26499675.0,
      "step": 90
    },
    {
      "epoch": 0.11260709914320685,
      "grad_norm": 0.24565160274505615,
      "learning_rate": 3.699186991869919e-06,
      "loss": 0.5365,
      "num_tokens": 27077382.0,
      "step": 92
    },
    {
      "epoch": 0.11505507955936352,
      "grad_norm": 0.2521428167819977,
      "learning_rate": 3.780487804878049e-06,
      "loss": 0.5465,
      "num_tokens": 27670497.0,
      "step": 94
    },
    {
      "epoch": 0.1175030599755202,
      "grad_norm": 0.24958781898021698,
      "learning_rate": 3.861788617886179e-06,
      "loss": 0.5379,
      "num_tokens": 28263901.0,
      "step": 96
    },
    {
      "epoch": 0.11995104039167687,
      "grad_norm": 0.2590058147907257,
      "learning_rate": 3.943089430894309e-06,
      "loss": 0.5183,
      "num_tokens": 28845275.0,
      "step": 98
    },
    {
      "epoch": 0.12239902080783353,
      "grad_norm": 0.24741683900356293,
      "learning_rate": 4.024390243902439e-06,
      "loss": 0.5358,
      "num_tokens": 29436166.0,
      "step": 100
    },
    {
      "epoch": 0.12484700122399021,
      "grad_norm": 0.24634818732738495,
      "learning_rate": 4.10569105691057e-06,
      "loss": 0.5311,
      "num_tokens": 30044583.0,
      "step": 102
    },
    {
      "epoch": 0.12729498164014688,
      "grad_norm": 0.243186354637146,
      "learning_rate": 4.1869918699186995e-06,
      "loss": 0.5312,
      "num_tokens": 30645726.0,
      "step": 104
    },
    {
      "epoch": 0.12974296205630356,
      "grad_norm": 0.26044467091560364,
      "learning_rate": 4.268292682926829e-06,
      "loss": 0.544,
      "num_tokens": 31251475.0,
      "step": 106
    },
    {
      "epoch": 0.13219094247246022,
      "grad_norm": 0.2474851757287979,
      "learning_rate": 4.34959349593496e-06,
      "loss": 0.5167,
      "num_tokens": 31830636.0,
      "step": 108
    },
    {
      "epoch": 0.1346389228886169,
      "grad_norm": 0.2415020912885666,
      "learning_rate": 4.43089430894309e-06,
      "loss": 0.5371,
      "num_tokens": 32435998.0,
      "step": 110
    },
    {
      "epoch": 0.13708690330477355,
      "grad_norm": 0.2419433742761612,
      "learning_rate": 4.51219512195122e-06,
      "loss": 0.5163,
      "num_tokens": 33041535.0,
      "step": 112
    },
    {
      "epoch": 0.13953488372093023,
      "grad_norm": 0.2523370087146759,
      "learning_rate": 4.59349593495935e-06,
      "loss": 0.53,
      "num_tokens": 33617226.0,
      "step": 114
    },
    {
      "epoch": 0.1419828641370869,
      "grad_norm": 0.2544844448566437,
      "learning_rate": 4.67479674796748e-06,
      "loss": 0.5212,
      "num_tokens": 34184157.0,
      "step": 116
    },
    {
      "epoch": 0.14443084455324356,
      "grad_norm": 0.2378045618534088,
      "learning_rate": 4.75609756097561e-06,
      "loss": 0.5107,
      "num_tokens": 34779714.0,
      "step": 118
    },
    {
      "epoch": 0.14687882496940025,
      "grad_norm": 0.25506970286369324,
      "learning_rate": 4.83739837398374e-06,
      "loss": 0.5081,
      "num_tokens": 35361759.0,
      "step": 120
    },
    {
      "epoch": 0.14932680538555693,
      "grad_norm": 0.24085764586925507,
      "learning_rate": 4.918699186991871e-06,
      "loss": 0.5101,
      "num_tokens": 35960992.0,
      "step": 122
    },
    {
      "epoch": 0.15177478580171358,
      "grad_norm": 0.24073615670204163,
      "learning_rate": 5e-06,
      "loss": 0.5242,
      "num_tokens": 36577873.0,
      "step": 124
    },
    {
      "epoch": 0.15422276621787026,
      "grad_norm": 0.2359231561422348,
      "learning_rate": 5.081300813008131e-06,
      "loss": 0.525,
      "num_tokens": 37173083.0,
      "step": 126
    },
    {
      "epoch": 0.15667074663402691,
      "grad_norm": 0.24386054277420044,
      "learning_rate": 5.162601626016261e-06,
      "loss": 0.5222,
      "num_tokens": 37772321.0,
      "step": 128
    },
    {
      "epoch": 0.1591187270501836,
      "grad_norm": 0.24544130265712738,
      "learning_rate": 5.243902439024391e-06,
      "loss": 0.5259,
      "num_tokens": 38366280.0,
      "step": 130
    },
    {
      "epoch": 0.16156670746634028,
      "grad_norm": 0.2467457354068756,
      "learning_rate": 5.32520325203252e-06,
      "loss": 0.5357,
      "num_tokens": 38963886.0,
      "step": 132
    },
    {
      "epoch": 0.16401468788249693,
      "grad_norm": 0.2387515753507614,
      "learning_rate": 5.4065040650406504e-06,
      "loss": 0.5047,
      "num_tokens": 39590863.0,
      "step": 134
    },
    {
      "epoch": 0.1664626682986536,
      "grad_norm": 0.24072597920894623,
      "learning_rate": 5.487804878048781e-06,
      "loss": 0.5076,
      "num_tokens": 40182071.0,
      "step": 136
    },
    {
      "epoch": 0.1689106487148103,
      "grad_norm": 0.24588756263256073,
      "learning_rate": 5.569105691056911e-06,
      "loss": 0.5018,
      "num_tokens": 40758834.0,
      "step": 138
    },
    {
      "epoch": 0.17135862913096694,
      "grad_norm": 0.24266333878040314,
      "learning_rate": 5.650406504065041e-06,
      "loss": 0.5331,
      "num_tokens": 41366991.0,
      "step": 140
    },
    {
      "epoch": 0.17380660954712362,
      "grad_norm": 0.2442614734172821,
      "learning_rate": 5.731707317073171e-06,
      "loss": 0.5212,
      "num_tokens": 41970776.0,
      "step": 142
    },
    {
      "epoch": 0.1762545899632803,
      "grad_norm": 0.24988345801830292,
      "learning_rate": 5.813008130081301e-06,
      "loss": 0.5022,
      "num_tokens": 42545064.0,
      "step": 144
    },
    {
      "epoch": 0.17870257037943696,
      "grad_norm": 0.250379741191864,
      "learning_rate": 5.894308943089432e-06,
      "loss": 0.4936,
      "num_tokens": 43097646.0,
      "step": 146
    },
    {
      "epoch": 0.18115055079559364,
      "grad_norm": 0.27457740902900696,
      "learning_rate": 5.9756097560975615e-06,
      "loss": 0.5212,
      "num_tokens": 43704720.0,
      "step": 148
    },
    {
      "epoch": 0.1835985312117503,
      "grad_norm": 0.24359893798828125,
      "learning_rate": 6.056910569105692e-06,
      "loss": 0.4802,
      "num_tokens": 44291468.0,
      "step": 150
    },
    {
      "epoch": 0.18604651162790697,
      "grad_norm": 0.2580820918083191,
      "learning_rate": 6.138211382113821e-06,
      "loss": 0.5089,
      "num_tokens": 44889809.0,
      "step": 152
    },
    {
      "epoch": 0.18849449204406366,
      "grad_norm": 0.26753556728363037,
      "learning_rate": 6.219512195121951e-06,
      "loss": 0.4957,
      "num_tokens": 45471363.0,
      "step": 154
    },
    {
      "epoch": 0.1909424724602203,
      "grad_norm": 0.2733646631240845,
      "learning_rate": 6.300813008130082e-06,
      "loss": 0.4918,
      "num_tokens": 46044686.0,
      "step": 156
    },
    {
      "epoch": 0.193390452876377,
      "grad_norm": 0.23967885971069336,
      "learning_rate": 6.3821138211382115e-06,
      "loss": 0.4793,
      "num_tokens": 46638282.0,
      "step": 158
    },
    {
      "epoch": 0.19583843329253367,
      "grad_norm": 0.25125283002853394,
      "learning_rate": 6.463414634146342e-06,
      "loss": 0.5065,
      "num_tokens": 47254725.0,
      "step": 160
    },
    {
      "epoch": 0.19828641370869032,
      "grad_norm": 0.2450665384531021,
      "learning_rate": 6.544715447154472e-06,
      "loss": 0.4916,
      "num_tokens": 47825874.0,
      "step": 162
    },
    {
      "epoch": 0.200734394124847,
      "grad_norm": 0.25482243299484253,
      "learning_rate": 6.626016260162602e-06,
      "loss": 0.4967,
      "num_tokens": 48404000.0,
      "step": 164
    },
    {
      "epoch": 0.20318237454100369,
      "grad_norm": 0.24947668612003326,
      "learning_rate": 6.707317073170733e-06,
      "loss": 0.5079,
      "num_tokens": 49020620.0,
      "step": 166
    },
    {
      "epoch": 0.20563035495716034,
      "grad_norm": 0.2664734721183777,
      "learning_rate": 6.788617886178862e-06,
      "loss": 0.4939,
      "num_tokens": 49610138.0,
      "step": 168
    },
    {
      "epoch": 0.20807833537331702,
      "grad_norm": 0.24819302558898926,
      "learning_rate": 6.869918699186993e-06,
      "loss": 0.4976,
      "num_tokens": 50239651.0,
      "step": 170
    },
    {
      "epoch": 0.21052631578947367,
      "grad_norm": 0.25770121812820435,
      "learning_rate": 6.951219512195122e-06,
      "loss": 0.4838,
      "num_tokens": 50821539.0,
      "step": 172
    },
    {
      "epoch": 0.21297429620563035,
      "grad_norm": 0.2596096694469452,
      "learning_rate": 7.032520325203252e-06,
      "loss": 0.4922,
      "num_tokens": 51392202.0,
      "step": 174
    },
    {
      "epoch": 0.21542227662178703,
      "grad_norm": 0.24805551767349243,
      "learning_rate": 7.113821138211383e-06,
      "loss": 0.5006,
      "num_tokens": 51973577.0,
      "step": 176
    },
    {
      "epoch": 0.2178702570379437,
      "grad_norm": 0.26363953948020935,
      "learning_rate": 7.1951219512195125e-06,
      "loss": 0.4848,
      "num_tokens": 52557644.0,
      "step": 178
    },
    {
      "epoch": 0.22031823745410037,
      "grad_norm": 0.2537650167942047,
      "learning_rate": 7.276422764227643e-06,
      "loss": 0.4836,
      "num_tokens": 53136591.0,
      "step": 180
    },
    {
      "epoch": 0.22276621787025705,
      "grad_norm": 0.2626267671585083,
      "learning_rate": 7.357723577235773e-06,
      "loss": 0.5021,
      "num_tokens": 53723609.0,
      "step": 182
    },
    {
      "epoch": 0.2252141982864137,
      "grad_norm": 0.2569217085838318,
      "learning_rate": 7.439024390243903e-06,
      "loss": 0.5023,
      "num_tokens": 54317296.0,
      "step": 184
    },
    {
      "epoch": 0.22766217870257038,
      "grad_norm": 0.25913605093955994,
      "learning_rate": 7.520325203252034e-06,
      "loss": 0.4829,
      "num_tokens": 54898930.0,
      "step": 186
    },
    {
      "epoch": 0.23011015911872704,
      "grad_norm": 0.25674960017204285,
      "learning_rate": 7.601626016260163e-06,
      "loss": 0.4927,
      "num_tokens": 55520199.0,
      "step": 188
    },
    {
      "epoch": 0.23255813953488372,
      "grad_norm": 0.23637175559997559,
      "learning_rate": 7.682926829268293e-06,
      "loss": 0.4817,
      "num_tokens": 56148692.0,
      "step": 190
    },
    {
      "epoch": 0.2350061199510404,
      "grad_norm": 0.26663970947265625,
      "learning_rate": 7.764227642276424e-06,
      "loss": 0.4894,
      "num_tokens": 56744145.0,
      "step": 192
    },
    {
      "epoch": 0.23745410036719705,
      "grad_norm": 0.2444421350955963,
      "learning_rate": 7.845528455284554e-06,
      "loss": 0.4858,
      "num_tokens": 57372074.0,
      "step": 194
    },
    {
      "epoch": 0.23990208078335373,
      "grad_norm": 0.2536911964416504,
      "learning_rate": 7.926829268292685e-06,
      "loss": 0.5046,
      "num_tokens": 57970018.0,
      "step": 196
    },
    {
      "epoch": 0.2423500611995104,
      "grad_norm": 0.25137463212013245,
      "learning_rate": 8.008130081300813e-06,
      "loss": 0.4995,
      "num_tokens": 58616468.0,
      "step": 198
    },
    {
      "epoch": 0.24479804161566707,
      "grad_norm": 0.27120441198349,
      "learning_rate": 8.089430894308944e-06,
      "loss": 0.4862,
      "num_tokens": 59212867.0,
      "step": 200
    },
    {
      "epoch": 0.24724602203182375,
      "grad_norm": 0.2675727605819702,
      "learning_rate": 8.170731707317073e-06,
      "loss": 0.4877,
      "num_tokens": 59831819.0,
      "step": 202
    },
    {
      "epoch": 0.24969400244798043,
      "grad_norm": 0.25941142439842224,
      "learning_rate": 8.252032520325203e-06,
      "loss": 0.4744,
      "num_tokens": 60406569.0,
      "step": 204
    },
    {
      "epoch": 0.2521419828641371,
      "grad_norm": 0.25482282042503357,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.4897,
      "num_tokens": 60992960.0,
      "step": 206
    },
    {
      "epoch": 0.25458996328029376,
      "grad_norm": 0.25704023241996765,
      "learning_rate": 8.414634146341464e-06,
      "loss": 0.4815,
      "num_tokens": 61582502.0,
      "step": 208
    },
    {
      "epoch": 0.25703794369645044,
      "grad_norm": 0.28427180647850037,
      "learning_rate": 8.495934959349595e-06,
      "loss": 0.4815,
      "num_tokens": 62170270.0,
      "step": 210
    },
    {
      "epoch": 0.2594859241126071,
      "grad_norm": 0.26738202571868896,
      "learning_rate": 8.577235772357724e-06,
      "loss": 0.4752,
      "num_tokens": 62762518.0,
      "step": 212
    },
    {
      "epoch": 0.26193390452876375,
      "grad_norm": 0.2779182493686676,
      "learning_rate": 8.658536585365854e-06,
      "loss": 0.4835,
      "num_tokens": 63338317.0,
      "step": 214
    },
    {
      "epoch": 0.26438188494492043,
      "grad_norm": 0.25400421023368835,
      "learning_rate": 8.739837398373985e-06,
      "loss": 0.4682,
      "num_tokens": 63946534.0,
      "step": 216
    },
    {
      "epoch": 0.2668298653610771,
      "grad_norm": 0.2691749930381775,
      "learning_rate": 8.821138211382113e-06,
      "loss": 0.4686,
      "num_tokens": 64529245.0,
      "step": 218
    },
    {
      "epoch": 0.2692778457772338,
      "grad_norm": 0.30239883065223694,
      "learning_rate": 8.902439024390244e-06,
      "loss": 0.4931,
      "num_tokens": 65101588.0,
      "step": 220
    },
    {
      "epoch": 0.2717258261933905,
      "grad_norm": 0.2659878134727478,
      "learning_rate": 8.983739837398374e-06,
      "loss": 0.4809,
      "num_tokens": 65693459.0,
      "step": 222
    },
    {
      "epoch": 0.2741738066095471,
      "grad_norm": 0.29131749272346497,
      "learning_rate": 9.065040650406505e-06,
      "loss": 0.484,
      "num_tokens": 66264575.0,
      "step": 224
    },
    {
      "epoch": 0.2766217870257038,
      "grad_norm": 0.26120057702064514,
      "learning_rate": 9.146341463414635e-06,
      "loss": 0.4621,
      "num_tokens": 66836693.0,
      "step": 226
    },
    {
      "epoch": 0.27906976744186046,
      "grad_norm": 0.25684523582458496,
      "learning_rate": 9.227642276422764e-06,
      "loss": 0.4846,
      "num_tokens": 67435169.0,
      "step": 228
    },
    {
      "epoch": 0.28151774785801714,
      "grad_norm": 0.2609543204307556,
      "learning_rate": 9.308943089430895e-06,
      "loss": 0.4892,
      "num_tokens": 68047327.0,
      "step": 230
    },
    {
      "epoch": 0.2839657282741738,
      "grad_norm": 0.25355297327041626,
      "learning_rate": 9.390243902439025e-06,
      "loss": 0.481,
      "num_tokens": 68652895.0,
      "step": 232
    },
    {
      "epoch": 0.2864137086903305,
      "grad_norm": 0.2718547284603119,
      "learning_rate": 9.471544715447156e-06,
      "loss": 0.4826,
      "num_tokens": 69234680.0,
      "step": 234
    },
    {
      "epoch": 0.28886168910648713,
      "grad_norm": 0.2677147388458252,
      "learning_rate": 9.552845528455286e-06,
      "loss": 0.4832,
      "num_tokens": 69826413.0,
      "step": 236
    },
    {
      "epoch": 0.2913096695226438,
      "grad_norm": 0.26797816157341003,
      "learning_rate": 9.634146341463415e-06,
      "loss": 0.4756,
      "num_tokens": 70441126.0,
      "step": 238
    },
    {
      "epoch": 0.2937576499388005,
      "grad_norm": 0.2443031668663025,
      "learning_rate": 9.715447154471546e-06,
      "loss": 0.4724,
      "num_tokens": 71026297.0,
      "step": 240
    },
    {
      "epoch": 0.2962056303549572,
      "grad_norm": 0.23899759352207184,
      "learning_rate": 9.796747967479675e-06,
      "loss": 0.4632,
      "num_tokens": 71615415.0,
      "step": 242
    },
    {
      "epoch": 0.29865361077111385,
      "grad_norm": 0.2585891783237457,
      "learning_rate": 9.878048780487805e-06,
      "loss": 0.4768,
      "num_tokens": 72187205.0,
      "step": 244
    },
    {
      "epoch": 0.3011015911872705,
      "grad_norm": 0.25309988856315613,
      "learning_rate": 9.959349593495936e-06,
      "loss": 0.4695,
      "num_tokens": 72766789.0,
      "step": 246
    },
    {
      "epoch": 0.30354957160342716,
      "grad_norm": 0.2485942840576172,
      "learning_rate": 1e-05,
      "loss": 0.4791,
      "num_tokens": 73367195.0,
      "step": 248
    },
    {
      "epoch": 0.30599755201958384,
      "grad_norm": 0.2777419686317444,
      "learning_rate": 1e-05,
      "loss": 0.4839,
      "num_tokens": 73931883.0,
      "step": 250
    },
    {
      "epoch": 0.3084455324357405,
      "grad_norm": 0.27165666222572327,
      "learning_rate": 1e-05,
      "loss": 0.4806,
      "num_tokens": 74512785.0,
      "step": 252
    },
    {
      "epoch": 0.3108935128518972,
      "grad_norm": 0.2516685426235199,
      "learning_rate": 1e-05,
      "loss": 0.4693,
      "num_tokens": 75103604.0,
      "step": 254
    },
    {
      "epoch": 0.31334149326805383,
      "grad_norm": 0.2623477280139923,
      "learning_rate": 1e-05,
      "loss": 0.4859,
      "num_tokens": 75691456.0,
      "step": 256
    },
    {
      "epoch": 0.3157894736842105,
      "grad_norm": 0.26603803038597107,
      "learning_rate": 1e-05,
      "loss": 0.4723,
      "num_tokens": 76296514.0,
      "step": 258
    },
    {
      "epoch": 0.3182374541003672,
      "grad_norm": 0.2537820637226105,
      "learning_rate": 1e-05,
      "loss": 0.4746,
      "num_tokens": 76901295.0,
      "step": 260
    },
    {
      "epoch": 0.32068543451652387,
      "grad_norm": 0.2758727967739105,
      "learning_rate": 1e-05,
      "loss": 0.4663,
      "num_tokens": 77473398.0,
      "step": 262
    },
    {
      "epoch": 0.32313341493268055,
      "grad_norm": 0.2774087190628052,
      "learning_rate": 1e-05,
      "loss": 0.4829,
      "num_tokens": 78068399.0,
      "step": 264
    },
    {
      "epoch": 0.32558139534883723,
      "grad_norm": 0.26797741651535034,
      "learning_rate": 1e-05,
      "loss": 0.4892,
      "num_tokens": 78683018.0,
      "step": 266
    },
    {
      "epoch": 0.32802937576499386,
      "grad_norm": 0.2684287428855896,
      "learning_rate": 1e-05,
      "loss": 0.466,
      "num_tokens": 79280661.0,
      "step": 268
    },
    {
      "epoch": 0.33047735618115054,
      "grad_norm": 0.27204152941703796,
      "learning_rate": 1e-05,
      "loss": 0.4674,
      "num_tokens": 79894114.0,
      "step": 270
    },
    {
      "epoch": 0.3329253365973072,
      "grad_norm": 0.26379671692848206,
      "learning_rate": 1e-05,
      "loss": 0.4703,
      "num_tokens": 80498761.0,
      "step": 272
    },
    {
      "epoch": 0.3353733170134639,
      "grad_norm": 0.26917415857315063,
      "learning_rate": 1e-05,
      "loss": 0.46,
      "num_tokens": 81090664.0,
      "step": 274
    },
    {
      "epoch": 0.3378212974296206,
      "grad_norm": 0.2546355128288269,
      "learning_rate": 1e-05,
      "loss": 0.4684,
      "num_tokens": 81682310.0,
      "step": 276
    },
    {
      "epoch": 0.3402692778457772,
      "grad_norm": 0.24116742610931396,
      "learning_rate": 1e-05,
      "loss": 0.453,
      "num_tokens": 82257211.0,
      "step": 278
    },
    {
      "epoch": 0.3427172582619339,
      "grad_norm": 0.3105870485305786,
      "learning_rate": 1e-05,
      "loss": 0.4552,
      "num_tokens": 82873665.0,
      "step": 280
    },
    {
      "epoch": 0.34516523867809057,
      "grad_norm": 0.2542496919631958,
      "learning_rate": 1e-05,
      "loss": 0.4651,
      "num_tokens": 83459915.0,
      "step": 282
    },
    {
      "epoch": 0.34761321909424725,
      "grad_norm": 0.2633056640625,
      "learning_rate": 1e-05,
      "loss": 0.4743,
      "num_tokens": 84064495.0,
      "step": 284
    },
    {
      "epoch": 0.35006119951040393,
      "grad_norm": 0.27345481514930725,
      "learning_rate": 1e-05,
      "loss": 0.4522,
      "num_tokens": 84650583.0,
      "step": 286
    },
    {
      "epoch": 0.3525091799265606,
      "grad_norm": 0.2693355083465576,
      "learning_rate": 1e-05,
      "loss": 0.4648,
      "num_tokens": 85253276.0,
      "step": 288
    },
    {
      "epoch": 0.35495716034271724,
      "grad_norm": 0.2904321253299713,
      "learning_rate": 1e-05,
      "loss": 0.4627,
      "num_tokens": 85833675.0,
      "step": 290
    },
    {
      "epoch": 0.3574051407588739,
      "grad_norm": 0.2525518834590912,
      "learning_rate": 1e-05,
      "loss": 0.4558,
      "num_tokens": 86421330.0,
      "step": 292
    },
    {
      "epoch": 0.3598531211750306,
      "grad_norm": 0.2589590847492218,
      "learning_rate": 1e-05,
      "loss": 0.4674,
      "num_tokens": 87030151.0,
      "step": 294
    },
    {
      "epoch": 0.3623011015911873,
      "grad_norm": 0.26387012004852295,
      "learning_rate": 1e-05,
      "loss": 0.4457,
      "num_tokens": 87567827.0,
      "step": 296
    },
    {
      "epoch": 0.36474908200734396,
      "grad_norm": 0.25372225046157837,
      "learning_rate": 1e-05,
      "loss": 0.4531,
      "num_tokens": 88157730.0,
      "step": 298
    },
    {
      "epoch": 0.3671970624235006,
      "grad_norm": 0.2606695890426636,
      "learning_rate": 1e-05,
      "loss": 0.4652,
      "num_tokens": 88759382.0,
      "step": 300
    },
    {
      "epoch": 0.36964504283965727,
      "grad_norm": 0.28036609292030334,
      "learning_rate": 1e-05,
      "loss": 0.4617,
      "num_tokens": 89342429.0,
      "step": 302
    },
    {
      "epoch": 0.37209302325581395,
      "grad_norm": 0.2817690968513489,
      "learning_rate": 1e-05,
      "loss": 0.4612,
      "num_tokens": 89934089.0,
      "step": 304
    },
    {
      "epoch": 0.37454100367197063,
      "grad_norm": 0.2582840919494629,
      "learning_rate": 1e-05,
      "loss": 0.457,
      "num_tokens": 90525609.0,
      "step": 306
    },
    {
      "epoch": 0.3769889840881273,
      "grad_norm": 0.23353107273578644,
      "learning_rate": 1e-05,
      "loss": 0.4361,
      "num_tokens": 91116420.0,
      "step": 308
    },
    {
      "epoch": 0.379436964504284,
      "grad_norm": 0.2614266872406006,
      "learning_rate": 1e-05,
      "loss": 0.4613,
      "num_tokens": 91706872.0,
      "step": 310
    },
    {
      "epoch": 0.3818849449204406,
      "grad_norm": 0.25390151143074036,
      "learning_rate": 1e-05,
      "loss": 0.4563,
      "num_tokens": 92273116.0,
      "step": 312
    },
    {
      "epoch": 0.3843329253365973,
      "grad_norm": 0.2604464590549469,
      "learning_rate": 1e-05,
      "loss": 0.4658,
      "num_tokens": 92877700.0,
      "step": 314
    },
    {
      "epoch": 0.386780905752754,
      "grad_norm": 0.2540639638900757,
      "learning_rate": 1e-05,
      "loss": 0.467,
      "num_tokens": 93448212.0,
      "step": 316
    },
    {
      "epoch": 0.38922888616891066,
      "grad_norm": 0.30420681834220886,
      "learning_rate": 1e-05,
      "loss": 0.4569,
      "num_tokens": 94015732.0,
      "step": 318
    },
    {
      "epoch": 0.39167686658506734,
      "grad_norm": 0.26026180386543274,
      "learning_rate": 1e-05,
      "loss": 0.452,
      "num_tokens": 94583212.0,
      "step": 320
    },
    {
      "epoch": 0.39412484700122397,
      "grad_norm": 0.24830417335033417,
      "learning_rate": 1e-05,
      "loss": 0.4429,
      "num_tokens": 95154747.0,
      "step": 322
    },
    {
      "epoch": 0.39657282741738065,
      "grad_norm": 0.25844183564186096,
      "learning_rate": 1e-05,
      "loss": 0.4698,
      "num_tokens": 95736389.0,
      "step": 324
    },
    {
      "epoch": 0.3990208078335373,
      "grad_norm": 0.2400529533624649,
      "learning_rate": 1e-05,
      "loss": 0.4662,
      "num_tokens": 96331517.0,
      "step": 326
    },
    {
      "epoch": 0.401468788249694,
      "grad_norm": 0.2508002519607544,
      "learning_rate": 1e-05,
      "loss": 0.4559,
      "num_tokens": 96919369.0,
      "step": 328
    },
    {
      "epoch": 0.4039167686658507,
      "grad_norm": 0.3863252103328705,
      "learning_rate": 1e-05,
      "loss": 0.4508,
      "num_tokens": 97509288.0,
      "step": 330
    },
    {
      "epoch": 0.40636474908200737,
      "grad_norm": 0.23996925354003906,
      "learning_rate": 1e-05,
      "loss": 0.464,
      "num_tokens": 98114408.0,
      "step": 332
    },
    {
      "epoch": 0.408812729498164,
      "grad_norm": 0.2341168373823166,
      "learning_rate": 1e-05,
      "loss": 0.4449,
      "num_tokens": 98713441.0,
      "step": 334
    },
    {
      "epoch": 0.4112607099143207,
      "grad_norm": 0.25438442826271057,
      "learning_rate": 1e-05,
      "loss": 0.4525,
      "num_tokens": 99327738.0,
      "step": 336
    },
    {
      "epoch": 0.41370869033047736,
      "grad_norm": 0.25638943910598755,
      "learning_rate": 1e-05,
      "loss": 0.456,
      "num_tokens": 99919049.0,
      "step": 338
    },
    {
      "epoch": 0.41615667074663404,
      "grad_norm": 0.249691903591156,
      "learning_rate": 1e-05,
      "loss": 0.4451,
      "num_tokens": 100516376.0,
      "step": 340
    },
    {
      "epoch": 0.4186046511627907,
      "grad_norm": 0.23924164474010468,
      "learning_rate": 1e-05,
      "loss": 0.4594,
      "num_tokens": 101133012.0,
      "step": 342
    },
    {
      "epoch": 0.42105263157894735,
      "grad_norm": 0.2472023069858551,
      "learning_rate": 1e-05,
      "loss": 0.4475,
      "num_tokens": 101700842.0,
      "step": 344
    },
    {
      "epoch": 0.423500611995104,
      "grad_norm": 0.25458183884620667,
      "learning_rate": 1e-05,
      "loss": 0.4636,
      "num_tokens": 102301821.0,
      "step": 346
    },
    {
      "epoch": 0.4259485924112607,
      "grad_norm": 0.24029487371444702,
      "learning_rate": 1e-05,
      "loss": 0.4543,
      "num_tokens": 102911147.0,
      "step": 348
    },
    {
      "epoch": 0.4283965728274174,
      "grad_norm": 0.24597443640232086,
      "learning_rate": 1e-05,
      "loss": 0.4327,
      "num_tokens": 103494243.0,
      "step": 350
    },
    {
      "epoch": 0.43084455324357407,
      "grad_norm": 0.24597881734371185,
      "learning_rate": 1e-05,
      "loss": 0.4544,
      "num_tokens": 104100107.0,
      "step": 352
    },
    {
      "epoch": 0.43329253365973075,
      "grad_norm": 0.251849889755249,
      "learning_rate": 1e-05,
      "loss": 0.4518,
      "num_tokens": 104696583.0,
      "step": 354
    },
    {
      "epoch": 0.4357405140758874,
      "grad_norm": 0.26897719502449036,
      "learning_rate": 1e-05,
      "loss": 0.4542,
      "num_tokens": 105266973.0,
      "step": 356
    },
    {
      "epoch": 0.43818849449204406,
      "grad_norm": 0.25020480155944824,
      "learning_rate": 1e-05,
      "loss": 0.4428,
      "num_tokens": 105855583.0,
      "step": 358
    },
    {
      "epoch": 0.44063647490820074,
      "grad_norm": 0.25249218940734863,
      "learning_rate": 1e-05,
      "loss": 0.451,
      "num_tokens": 106413444.0,
      "step": 360
    },
    {
      "epoch": 0.4430844553243574,
      "grad_norm": 0.25864851474761963,
      "learning_rate": 1e-05,
      "loss": 0.4602,
      "num_tokens": 107001653.0,
      "step": 362
    },
    {
      "epoch": 0.4455324357405141,
      "grad_norm": 0.24207186698913574,
      "learning_rate": 1e-05,
      "loss": 0.4523,
      "num_tokens": 107595819.0,
      "step": 364
    },
    {
      "epoch": 0.4479804161566707,
      "grad_norm": 0.2500375509262085,
      "learning_rate": 1e-05,
      "loss": 0.4261,
      "num_tokens": 108198205.0,
      "step": 366
    },
    {
      "epoch": 0.4504283965728274,
      "grad_norm": 0.27041998505592346,
      "learning_rate": 1e-05,
      "loss": 0.4331,
      "num_tokens": 108768930.0,
      "step": 368
    },
    {
      "epoch": 0.4528763769889841,
      "grad_norm": 0.257564902305603,
      "learning_rate": 1e-05,
      "loss": 0.4606,
      "num_tokens": 109393449.0,
      "step": 370
    },
    {
      "epoch": 0.45532435740514077,
      "grad_norm": 0.2373887002468109,
      "learning_rate": 1e-05,
      "loss": 0.4421,
      "num_tokens": 109998048.0,
      "step": 372
    },
    {
      "epoch": 0.45777233782129745,
      "grad_norm": 0.24258120357990265,
      "learning_rate": 1e-05,
      "loss": 0.4532,
      "num_tokens": 110602780.0,
      "step": 374
    },
    {
      "epoch": 0.4602203182374541,
      "grad_norm": 0.2412237524986267,
      "learning_rate": 1e-05,
      "loss": 0.4438,
      "num_tokens": 111196271.0,
      "step": 376
    },
    {
      "epoch": 0.46266829865361075,
      "grad_norm": 0.2580074965953827,
      "learning_rate": 1e-05,
      "loss": 0.4393,
      "num_tokens": 111762398.0,
      "step": 378
    },
    {
      "epoch": 0.46511627906976744,
      "grad_norm": 0.24817973375320435,
      "learning_rate": 1e-05,
      "loss": 0.4473,
      "num_tokens": 112344276.0,
      "step": 380
    },
    {
      "epoch": 0.4675642594859241,
      "grad_norm": 0.24946565926074982,
      "learning_rate": 1e-05,
      "loss": 0.4407,
      "num_tokens": 112929852.0,
      "step": 382
    },
    {
      "epoch": 0.4700122399020808,
      "grad_norm": 0.25697141885757446,
      "learning_rate": 1e-05,
      "loss": 0.4426,
      "num_tokens": 113524769.0,
      "step": 384
    },
    {
      "epoch": 0.4724602203182375,
      "grad_norm": 0.2628220319747925,
      "learning_rate": 1e-05,
      "loss": 0.4315,
      "num_tokens": 114097175.0,
      "step": 386
    },
    {
      "epoch": 0.4749082007343941,
      "grad_norm": 0.252259224653244,
      "learning_rate": 1e-05,
      "loss": 0.4536,
      "num_tokens": 114681195.0,
      "step": 388
    },
    {
      "epoch": 0.4773561811505508,
      "grad_norm": 0.24542003870010376,
      "learning_rate": 1e-05,
      "loss": 0.4575,
      "num_tokens": 115275959.0,
      "step": 390
    },
    {
      "epoch": 0.47980416156670747,
      "grad_norm": 0.2483384609222412,
      "learning_rate": 1e-05,
      "loss": 0.4316,
      "num_tokens": 115836837.0,
      "step": 392
    },
    {
      "epoch": 0.48225214198286415,
      "grad_norm": 0.25170162320137024,
      "learning_rate": 1e-05,
      "loss": 0.4537,
      "num_tokens": 116449305.0,
      "step": 394
    },
    {
      "epoch": 0.4847001223990208,
      "grad_norm": 0.2621166706085205,
      "learning_rate": 1e-05,
      "loss": 0.4362,
      "num_tokens": 117045707.0,
      "step": 396
    },
    {
      "epoch": 0.48714810281517745,
      "grad_norm": 0.2502780854701996,
      "learning_rate": 1e-05,
      "loss": 0.4454,
      "num_tokens": 117635477.0,
      "step": 398
    },
    {
      "epoch": 0.48959608323133413,
      "grad_norm": 0.2551596164703369,
      "learning_rate": 1e-05,
      "loss": 0.4481,
      "num_tokens": 118231246.0,
      "step": 400
    },
    {
      "epoch": 0.4920440636474908,
      "grad_norm": 0.24621985852718353,
      "learning_rate": 1e-05,
      "loss": 0.4422,
      "num_tokens": 118844143.0,
      "step": 402
    },
    {
      "epoch": 0.4944920440636475,
      "grad_norm": 0.2507868707180023,
      "learning_rate": 1e-05,
      "loss": 0.4372,
      "num_tokens": 119416215.0,
      "step": 404
    },
    {
      "epoch": 0.4969400244798042,
      "grad_norm": 0.2631942331790924,
      "learning_rate": 1e-05,
      "loss": 0.4452,
      "num_tokens": 120009624.0,
      "step": 406
    },
    {
      "epoch": 0.49938800489596086,
      "grad_norm": 0.2675624489784241,
      "learning_rate": 1e-05,
      "loss": 0.4305,
      "num_tokens": 120601206.0,
      "step": 408
    },
    {
      "epoch": 0.5018359853121175,
      "grad_norm": 0.2466759979724884,
      "learning_rate": 1e-05,
      "loss": 0.4527,
      "num_tokens": 121202335.0,
      "step": 410
    },
    {
      "epoch": 0.5042839657282742,
      "grad_norm": 0.2538067400455475,
      "learning_rate": 1e-05,
      "loss": 0.4442,
      "num_tokens": 121787505.0,
      "step": 412
    },
    {
      "epoch": 0.5067319461444308,
      "grad_norm": 0.25116246938705444,
      "learning_rate": 1e-05,
      "loss": 0.4425,
      "num_tokens": 122383008.0,
      "step": 414
    },
    {
      "epoch": 0.5091799265605875,
      "grad_norm": 0.236452117562294,
      "learning_rate": 1e-05,
      "loss": 0.4537,
      "num_tokens": 122980153.0,
      "step": 416
    },
    {
      "epoch": 0.5116279069767442,
      "grad_norm": 0.24228782951831818,
      "learning_rate": 1e-05,
      "loss": 0.4302,
      "num_tokens": 123564055.0,
      "step": 418
    },
    {
      "epoch": 0.5140758873929009,
      "grad_norm": 0.25363457202911377,
      "learning_rate": 1e-05,
      "loss": 0.4413,
      "num_tokens": 124154714.0,
      "step": 420
    },
    {
      "epoch": 0.5165238678090576,
      "grad_norm": 0.2742238938808441,
      "learning_rate": 1e-05,
      "loss": 0.4492,
      "num_tokens": 124725604.0,
      "step": 422
    },
    {
      "epoch": 0.5189718482252142,
      "grad_norm": 0.24658766388893127,
      "learning_rate": 1e-05,
      "loss": 0.4429,
      "num_tokens": 125329669.0,
      "step": 424
    },
    {
      "epoch": 0.5214198286413708,
      "grad_norm": 0.2611011862754822,
      "learning_rate": 1e-05,
      "loss": 0.4339,
      "num_tokens": 125871390.0,
      "step": 426
    },
    {
      "epoch": 0.5238678090575275,
      "grad_norm": 0.25249847769737244,
      "learning_rate": 1e-05,
      "loss": 0.4476,
      "num_tokens": 126485091.0,
      "step": 428
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 0.25424760580062866,
      "learning_rate": 1e-05,
      "loss": 0.4394,
      "num_tokens": 127070743.0,
      "step": 430
    },
    {
      "epoch": 0.5287637698898409,
      "grad_norm": 0.26382750272750854,
      "learning_rate": 1e-05,
      "loss": 0.451,
      "num_tokens": 127643183.0,
      "step": 432
    },
    {
      "epoch": 0.5312117503059975,
      "grad_norm": 0.264977365732193,
      "learning_rate": 1e-05,
      "loss": 0.4471,
      "num_tokens": 128238869.0,
      "step": 434
    },
    {
      "epoch": 0.5336597307221542,
      "grad_norm": 0.24802319705486298,
      "learning_rate": 1e-05,
      "loss": 0.4312,
      "num_tokens": 128833166.0,
      "step": 436
    },
    {
      "epoch": 0.5361077111383109,
      "grad_norm": 0.2559887766838074,
      "learning_rate": 1e-05,
      "loss": 0.4513,
      "num_tokens": 129432358.0,
      "step": 438
    },
    {
      "epoch": 0.5385556915544676,
      "grad_norm": 0.2669244706630707,
      "learning_rate": 1e-05,
      "loss": 0.425,
      "num_tokens": 130005589.0,
      "step": 440
    },
    {
      "epoch": 0.5410036719706243,
      "grad_norm": 0.26476767659187317,
      "learning_rate": 1e-05,
      "loss": 0.4596,
      "num_tokens": 130617968.0,
      "step": 442
    },
    {
      "epoch": 0.543451652386781,
      "grad_norm": 0.24924980103969574,
      "learning_rate": 1e-05,
      "loss": 0.4275,
      "num_tokens": 131211641.0,
      "step": 444
    },
    {
      "epoch": 0.5458996328029376,
      "grad_norm": 0.2561533451080322,
      "learning_rate": 1e-05,
      "loss": 0.4389,
      "num_tokens": 131815751.0,
      "step": 446
    },
    {
      "epoch": 0.5483476132190942,
      "grad_norm": 0.2370700240135193,
      "learning_rate": 1e-05,
      "loss": 0.4442,
      "num_tokens": 132435428.0,
      "step": 448
    },
    {
      "epoch": 0.5507955936352509,
      "grad_norm": 0.259915828704834,
      "learning_rate": 1e-05,
      "loss": 0.4521,
      "num_tokens": 133026844.0,
      "step": 450
    },
    {
      "epoch": 0.5532435740514076,
      "grad_norm": 0.2539651095867157,
      "learning_rate": 1e-05,
      "loss": 0.4513,
      "num_tokens": 133610279.0,
      "step": 452
    },
    {
      "epoch": 0.5556915544675642,
      "grad_norm": 0.2514914274215698,
      "learning_rate": 1e-05,
      "loss": 0.4359,
      "num_tokens": 134186783.0,
      "step": 454
    },
    {
      "epoch": 0.5581395348837209,
      "grad_norm": 0.23980778455734253,
      "learning_rate": 1e-05,
      "loss": 0.4217,
      "num_tokens": 134766199.0,
      "step": 456
    },
    {
      "epoch": 0.5605875152998776,
      "grad_norm": 0.246282696723938,
      "learning_rate": 1e-05,
      "loss": 0.4346,
      "num_tokens": 135355631.0,
      "step": 458
    },
    {
      "epoch": 0.5630354957160343,
      "grad_norm": 0.24391327798366547,
      "learning_rate": 1e-05,
      "loss": 0.455,
      "num_tokens": 135973241.0,
      "step": 460
    },
    {
      "epoch": 0.565483476132191,
      "grad_norm": 0.2361239790916443,
      "learning_rate": 1e-05,
      "loss": 0.4288,
      "num_tokens": 136579751.0,
      "step": 462
    },
    {
      "epoch": 0.5679314565483476,
      "grad_norm": 0.2682438790798187,
      "learning_rate": 1e-05,
      "loss": 0.4407,
      "num_tokens": 137187474.0,
      "step": 464
    },
    {
      "epoch": 0.5703794369645043,
      "grad_norm": 0.24559003114700317,
      "learning_rate": 1e-05,
      "loss": 0.4571,
      "num_tokens": 137796342.0,
      "step": 466
    },
    {
      "epoch": 0.572827417380661,
      "grad_norm": 0.2480727732181549,
      "learning_rate": 1e-05,
      "loss": 0.4341,
      "num_tokens": 138415346.0,
      "step": 468
    },
    {
      "epoch": 0.5752753977968176,
      "grad_norm": 0.24036051332950592,
      "learning_rate": 1e-05,
      "loss": 0.4519,
      "num_tokens": 139036676.0,
      "step": 470
    },
    {
      "epoch": 0.5777233782129743,
      "grad_norm": 0.25144311785697937,
      "learning_rate": 1e-05,
      "loss": 0.4485,
      "num_tokens": 139647226.0,
      "step": 472
    },
    {
      "epoch": 0.5801713586291309,
      "grad_norm": 0.2437385618686676,
      "learning_rate": 1e-05,
      "loss": 0.4292,
      "num_tokens": 140222429.0,
      "step": 474
    },
    {
      "epoch": 0.5826193390452876,
      "grad_norm": 0.25468119978904724,
      "learning_rate": 1e-05,
      "loss": 0.4488,
      "num_tokens": 140846203.0,
      "step": 476
    },
    {
      "epoch": 0.5850673194614443,
      "grad_norm": 0.23626460134983063,
      "learning_rate": 1e-05,
      "loss": 0.4305,
      "num_tokens": 141448874.0,
      "step": 478
    },
    {
      "epoch": 0.587515299877601,
      "grad_norm": 0.266257643699646,
      "learning_rate": 1e-05,
      "loss": 0.4268,
      "num_tokens": 142029277.0,
      "step": 480
    },
    {
      "epoch": 0.5899632802937577,
      "grad_norm": 0.2561860680580139,
      "learning_rate": 1e-05,
      "loss": 0.4263,
      "num_tokens": 142624816.0,
      "step": 482
    },
    {
      "epoch": 0.5924112607099143,
      "grad_norm": 0.266886830329895,
      "learning_rate": 1e-05,
      "loss": 0.4297,
      "num_tokens": 143212355.0,
      "step": 484
    },
    {
      "epoch": 0.594859241126071,
      "grad_norm": 0.257272869348526,
      "learning_rate": 1e-05,
      "loss": 0.4468,
      "num_tokens": 143820960.0,
      "step": 486
    },
    {
      "epoch": 0.5973072215422277,
      "grad_norm": 0.24327340722084045,
      "learning_rate": 1e-05,
      "loss": 0.4448,
      "num_tokens": 144394056.0,
      "step": 488
    },
    {
      "epoch": 0.5997552019583844,
      "grad_norm": 0.2501513361930847,
      "learning_rate": 1e-05,
      "loss": 0.4322,
      "num_tokens": 144976355.0,
      "step": 490
    },
    {
      "epoch": 0.602203182374541,
      "grad_norm": 0.2472490519285202,
      "learning_rate": 1e-05,
      "loss": 0.4349,
      "num_tokens": 145575990.0,
      "step": 492
    },
    {
      "epoch": 0.6046511627906976,
      "grad_norm": 0.24121037125587463,
      "learning_rate": 1e-05,
      "loss": 0.4341,
      "num_tokens": 146180118.0,
      "step": 494
    },
    {
      "epoch": 0.6070991432068543,
      "grad_norm": 0.25340262055397034,
      "learning_rate": 1e-05,
      "loss": 0.444,
      "num_tokens": 146763569.0,
      "step": 496
    },
    {
      "epoch": 0.609547123623011,
      "grad_norm": 0.28607383370399475,
      "learning_rate": 1e-05,
      "loss": 0.4319,
      "num_tokens": 147348892.0,
      "step": 498
    },
    {
      "epoch": 0.6119951040391677,
      "grad_norm": 0.2469649612903595,
      "learning_rate": 1e-05,
      "loss": 0.4351,
      "num_tokens": 147934106.0,
      "step": 500
    },
    {
      "epoch": 0.6144430844553244,
      "grad_norm": 0.24661080539226532,
      "learning_rate": 1e-05,
      "loss": 0.4513,
      "num_tokens": 148526284.0,
      "step": 502
    },
    {
      "epoch": 0.616891064871481,
      "grad_norm": 0.2517259120941162,
      "learning_rate": 1e-05,
      "loss": 0.4491,
      "num_tokens": 149116759.0,
      "step": 504
    },
    {
      "epoch": 0.6193390452876377,
      "grad_norm": 0.2508615255355835,
      "learning_rate": 1e-05,
      "loss": 0.4464,
      "num_tokens": 149733718.0,
      "step": 506
    },
    {
      "epoch": 0.6217870257037944,
      "grad_norm": 0.24546414613723755,
      "learning_rate": 1e-05,
      "loss": 0.4387,
      "num_tokens": 150325552.0,
      "step": 508
    },
    {
      "epoch": 0.6242350061199511,
      "grad_norm": 0.24201036989688873,
      "learning_rate": 1e-05,
      "loss": 0.4373,
      "num_tokens": 150919640.0,
      "step": 510
    },
    {
      "epoch": 0.6266829865361077,
      "grad_norm": 0.26162493228912354,
      "learning_rate": 1e-05,
      "loss": 0.4106,
      "num_tokens": 151479810.0,
      "step": 512
    },
    {
      "epoch": 0.6291309669522643,
      "grad_norm": 0.248540461063385,
      "learning_rate": 1e-05,
      "loss": 0.4402,
      "num_tokens": 152069098.0,
      "step": 514
    },
    {
      "epoch": 0.631578947368421,
      "grad_norm": 0.24691417813301086,
      "learning_rate": 1e-05,
      "loss": 0.429,
      "num_tokens": 152650689.0,
      "step": 516
    },
    {
      "epoch": 0.6340269277845777,
      "grad_norm": 0.23826554417610168,
      "learning_rate": 1e-05,
      "loss": 0.4397,
      "num_tokens": 153238050.0,
      "step": 518
    },
    {
      "epoch": 0.6364749082007344,
      "grad_norm": 0.2382456660270691,
      "learning_rate": 1e-05,
      "loss": 0.4355,
      "num_tokens": 153848706.0,
      "step": 520
    },
    {
      "epoch": 0.6389228886168911,
      "grad_norm": 0.24490463733673096,
      "learning_rate": 1e-05,
      "loss": 0.4531,
      "num_tokens": 154435999.0,
      "step": 522
    },
    {
      "epoch": 0.6413708690330477,
      "grad_norm": 0.2386292815208435,
      "learning_rate": 1e-05,
      "loss": 0.4202,
      "num_tokens": 155012325.0,
      "step": 524
    },
    {
      "epoch": 0.6438188494492044,
      "grad_norm": 0.25578925013542175,
      "learning_rate": 1e-05,
      "loss": 0.4096,
      "num_tokens": 155577911.0,
      "step": 526
    },
    {
      "epoch": 0.6462668298653611,
      "grad_norm": 0.2566823959350586,
      "learning_rate": 1e-05,
      "loss": 0.4397,
      "num_tokens": 156159354.0,
      "step": 528
    },
    {
      "epoch": 0.6487148102815178,
      "grad_norm": 0.2507897615432739,
      "learning_rate": 1e-05,
      "loss": 0.4328,
      "num_tokens": 156734371.0,
      "step": 530
    },
    {
      "epoch": 0.6511627906976745,
      "grad_norm": 0.2468583583831787,
      "learning_rate": 1e-05,
      "loss": 0.4524,
      "num_tokens": 157342481.0,
      "step": 532
    },
    {
      "epoch": 0.653610771113831,
      "grad_norm": 0.22932949662208557,
      "learning_rate": 1e-05,
      "loss": 0.4217,
      "num_tokens": 157940378.0,
      "step": 534
    },
    {
      "epoch": 0.6560587515299877,
      "grad_norm": 0.24993668496608734,
      "learning_rate": 1e-05,
      "loss": 0.4397,
      "num_tokens": 158529483.0,
      "step": 536
    },
    {
      "epoch": 0.6585067319461444,
      "grad_norm": 0.2605837285518646,
      "learning_rate": 1e-05,
      "loss": 0.438,
      "num_tokens": 159116553.0,
      "step": 538
    },
    {
      "epoch": 0.6609547123623011,
      "grad_norm": 0.2537282407283783,
      "learning_rate": 1e-05,
      "loss": 0.4316,
      "num_tokens": 159704424.0,
      "step": 540
    },
    {
      "epoch": 0.6634026927784578,
      "grad_norm": 0.24413853883743286,
      "learning_rate": 1e-05,
      "loss": 0.4416,
      "num_tokens": 160315337.0,
      "step": 542
    },
    {
      "epoch": 0.6658506731946144,
      "grad_norm": 0.24804916977882385,
      "learning_rate": 1e-05,
      "loss": 0.4388,
      "num_tokens": 160906136.0,
      "step": 544
    },
    {
      "epoch": 0.6682986536107711,
      "grad_norm": 0.2709481418132782,
      "learning_rate": 1e-05,
      "loss": 0.4155,
      "num_tokens": 161482176.0,
      "step": 546
    },
    {
      "epoch": 0.6707466340269278,
      "grad_norm": 0.2590528428554535,
      "learning_rate": 1e-05,
      "loss": 0.4217,
      "num_tokens": 162066339.0,
      "step": 548
    },
    {
      "epoch": 0.6731946144430845,
      "grad_norm": 0.22846968472003937,
      "learning_rate": 1e-05,
      "loss": 0.4235,
      "num_tokens": 162686859.0,
      "step": 550
    },
    {
      "epoch": 0.6756425948592412,
      "grad_norm": 0.2419133186340332,
      "learning_rate": 1e-05,
      "loss": 0.4215,
      "num_tokens": 163261743.0,
      "step": 552
    },
    {
      "epoch": 0.6780905752753978,
      "grad_norm": 0.2427610456943512,
      "learning_rate": 1e-05,
      "loss": 0.4264,
      "num_tokens": 163864744.0,
      "step": 554
    },
    {
      "epoch": 0.6805385556915544,
      "grad_norm": 0.23300205171108246,
      "learning_rate": 1e-05,
      "loss": 0.4151,
      "num_tokens": 164424223.0,
      "step": 556
    },
    {
      "epoch": 0.6829865361077111,
      "grad_norm": 0.23740941286087036,
      "learning_rate": 1e-05,
      "loss": 0.4361,
      "num_tokens": 165031113.0,
      "step": 558
    },
    {
      "epoch": 0.6854345165238678,
      "grad_norm": 0.23929090797901154,
      "learning_rate": 1e-05,
      "loss": 0.4312,
      "num_tokens": 165645826.0,
      "step": 560
    },
    {
      "epoch": 0.6878824969400245,
      "grad_norm": 0.25261008739471436,
      "learning_rate": 1e-05,
      "loss": 0.4312,
      "num_tokens": 166270973.0,
      "step": 562
    },
    {
      "epoch": 0.6903304773561811,
      "grad_norm": 0.24025267362594604,
      "learning_rate": 1e-05,
      "loss": 0.4448,
      "num_tokens": 166855785.0,
      "step": 564
    },
    {
      "epoch": 0.6927784577723378,
      "grad_norm": 0.24211308360099792,
      "learning_rate": 1e-05,
      "loss": 0.432,
      "num_tokens": 167481909.0,
      "step": 566
    },
    {
      "epoch": 0.6952264381884945,
      "grad_norm": 0.24173016846179962,
      "learning_rate": 1e-05,
      "loss": 0.4232,
      "num_tokens": 168078039.0,
      "step": 568
    },
    {
      "epoch": 0.6976744186046512,
      "grad_norm": 0.2316238284111023,
      "learning_rate": 1e-05,
      "loss": 0.4285,
      "num_tokens": 168668586.0,
      "step": 570
    },
    {
      "epoch": 0.7001223990208079,
      "grad_norm": 0.25232845544815063,
      "learning_rate": 1e-05,
      "loss": 0.4244,
      "num_tokens": 169280779.0,
      "step": 572
    },
    {
      "epoch": 0.7025703794369645,
      "grad_norm": 0.2419183850288391,
      "learning_rate": 1e-05,
      "loss": 0.4085,
      "num_tokens": 169869109.0,
      "step": 574
    },
    {
      "epoch": 0.7050183598531212,
      "grad_norm": 0.25120463967323303,
      "learning_rate": 1e-05,
      "loss": 0.4369,
      "num_tokens": 170459382.0,
      "step": 576
    },
    {
      "epoch": 0.7074663402692778,
      "grad_norm": 0.2427869439125061,
      "learning_rate": 1e-05,
      "loss": 0.4425,
      "num_tokens": 171059799.0,
      "step": 578
    },
    {
      "epoch": 0.7099143206854345,
      "grad_norm": 0.2565186023712158,
      "learning_rate": 1e-05,
      "loss": 0.4333,
      "num_tokens": 171616638.0,
      "step": 580
    },
    {
      "epoch": 0.7123623011015912,
      "grad_norm": 0.26279616355895996,
      "learning_rate": 1e-05,
      "loss": 0.4375,
      "num_tokens": 172215523.0,
      "step": 582
    },
    {
      "epoch": 0.7148102815177478,
      "grad_norm": 0.24042481184005737,
      "learning_rate": 1e-05,
      "loss": 0.4258,
      "num_tokens": 172816261.0,
      "step": 584
    },
    {
      "epoch": 0.7172582619339045,
      "grad_norm": 0.2408638894557953,
      "learning_rate": 1e-05,
      "loss": 0.4179,
      "num_tokens": 173418219.0,
      "step": 586
    },
    {
      "epoch": 0.7197062423500612,
      "grad_norm": 0.2416829615831375,
      "learning_rate": 1e-05,
      "loss": 0.43,
      "num_tokens": 174001845.0,
      "step": 588
    },
    {
      "epoch": 0.7221542227662179,
      "grad_norm": 0.25662803649902344,
      "learning_rate": 1e-05,
      "loss": 0.4055,
      "num_tokens": 174573841.0,
      "step": 590
    },
    {
      "epoch": 0.7246022031823746,
      "grad_norm": 0.2541915476322174,
      "learning_rate": 1e-05,
      "loss": 0.4271,
      "num_tokens": 175157025.0,
      "step": 592
    },
    {
      "epoch": 0.7270501835985312,
      "grad_norm": 0.23873859643936157,
      "learning_rate": 1e-05,
      "loss": 0.4134,
      "num_tokens": 175763839.0,
      "step": 594
    },
    {
      "epoch": 0.7294981640146879,
      "grad_norm": 0.2606620192527771,
      "learning_rate": 1e-05,
      "loss": 0.4215,
      "num_tokens": 176340303.0,
      "step": 596
    },
    {
      "epoch": 0.7319461444308446,
      "grad_norm": 0.2397637516260147,
      "learning_rate": 1e-05,
      "loss": 0.4355,
      "num_tokens": 176933693.0,
      "step": 598
    },
    {
      "epoch": 0.7343941248470012,
      "grad_norm": 0.24392768740653992,
      "learning_rate": 1e-05,
      "loss": 0.4267,
      "num_tokens": 177551417.0,
      "step": 600
    },
| { | |
| "epoch": 0.7368421052631579, | |
| "grad_norm": 0.251198947429657, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4202, | |
| "num_tokens": 178108215.0, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 0.7392900856793145, | |
| "grad_norm": 0.24769917130470276, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4262, | |
| "num_tokens": 178678758.0, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 0.7417380660954712, | |
| "grad_norm": 0.2392667829990387, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4187, | |
| "num_tokens": 179263018.0, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 0.7441860465116279, | |
| "grad_norm": 0.25212791562080383, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4276, | |
| "num_tokens": 179877385.0, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 0.7466340269277846, | |
| "grad_norm": 0.24665768444538116, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4255, | |
| "num_tokens": 180464341.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.7490820073439413, | |
| "grad_norm": 0.2613765597343445, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4342, | |
| "num_tokens": 181070260.0, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 0.7515299877600979, | |
| "grad_norm": 0.27524396777153015, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4142, | |
| "num_tokens": 181650173.0, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 0.7539779681762546, | |
| "grad_norm": 0.2363055795431137, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4199, | |
| "num_tokens": 182238618.0, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 0.7564259485924113, | |
| "grad_norm": 0.2546214759349823, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4119, | |
| "num_tokens": 182798204.0, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 0.758873929008568, | |
| "grad_norm": 0.2432137429714203, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4195, | |
| "num_tokens": 183380029.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.7613219094247246, | |
| "grad_norm": 0.258290559053421, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4282, | |
| "num_tokens": 183966255.0, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 0.7637698898408812, | |
| "grad_norm": 0.23586836457252502, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4352, | |
| "num_tokens": 184583415.0, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 0.7662178702570379, | |
| "grad_norm": 0.23846642673015594, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4328, | |
| "num_tokens": 185197955.0, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 0.7686658506731946, | |
| "grad_norm": 0.24626384675502777, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4109, | |
| "num_tokens": 185785557.0, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 0.7711138310893513, | |
| "grad_norm": 0.24848531186580658, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4223, | |
| "num_tokens": 186356034.0, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.773561811505508, | |
| "grad_norm": 0.26634329557418823, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4297, | |
| "num_tokens": 186944825.0, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 0.7760097919216646, | |
| "grad_norm": 0.23417183756828308, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4297, | |
| "num_tokens": 187555120.0, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 0.7784577723378213, | |
| "grad_norm": 0.2514715790748596, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4315, | |
| "num_tokens": 188158111.0, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 0.780905752753978, | |
| "grad_norm": 0.2464294582605362, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4341, | |
| "num_tokens": 188752896.0, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 0.7833537331701347, | |
| "grad_norm": 0.2839301526546478, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4315, | |
| "num_tokens": 189348219.0, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.7858017135862914, | |
| "grad_norm": 0.2621495723724365, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4151, | |
| "num_tokens": 189894564.0, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 0.7882496940024479, | |
| "grad_norm": 0.23994684219360352, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4329, | |
| "num_tokens": 190529171.0, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 0.7906976744186046, | |
| "grad_norm": 0.23574483394622803, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4215, | |
| "num_tokens": 191131915.0, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 0.7931456548347613, | |
| "grad_norm": 0.2408006489276886, | |
| "learning_rate": 1e-05, | |
| "loss": 0.435, | |
| "num_tokens": 191724725.0, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 0.795593635250918, | |
| "grad_norm": 0.23889918625354767, | |
| "learning_rate": 1e-05, | |
| "loss": 0.43, | |
| "num_tokens": 192336245.0, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.7980416156670747, | |
| "grad_norm": 0.23778203129768372, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4207, | |
| "num_tokens": 192923779.0, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 0.8004895960832313, | |
| "grad_norm": 0.2521960437297821, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4057, | |
| "num_tokens": 193499166.0, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 0.802937576499388, | |
| "grad_norm": 0.2429361194372177, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4242, | |
| "num_tokens": 194104195.0, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 0.8053855569155447, | |
| "grad_norm": 0.23911674320697784, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4326, | |
| "num_tokens": 194700389.0, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 0.8078335373317014, | |
| "grad_norm": 0.24030958116054535, | |
| "learning_rate": 1e-05, | |
| "loss": 0.432, | |
| "num_tokens": 195293054.0, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.8102815177478581, | |
| "grad_norm": 0.23353174328804016, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4122, | |
| "num_tokens": 195898402.0, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 0.8127294981640147, | |
| "grad_norm": 0.2420521378517151, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4191, | |
| "num_tokens": 196479623.0, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 0.8151774785801713, | |
| "grad_norm": 0.24901549518108368, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4169, | |
| "num_tokens": 197025340.0, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 0.817625458996328, | |
| "grad_norm": 0.24542152881622314, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4157, | |
| "num_tokens": 197594800.0, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 0.8200734394124847, | |
| "grad_norm": 0.24356816709041595, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4236, | |
| "num_tokens": 198193576.0, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.8225214198286414, | |
| "grad_norm": 0.24126183986663818, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4176, | |
| "num_tokens": 198803995.0, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 0.824969400244798, | |
| "grad_norm": 0.24375128746032715, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4215, | |
| "num_tokens": 199416611.0, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 0.8274173806609547, | |
| "grad_norm": 0.25136980414390564, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4129, | |
| "num_tokens": 200007315.0, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 0.8298653610771114, | |
| "grad_norm": 0.23422685265541077, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4179, | |
| "num_tokens": 200608406.0, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 0.8323133414932681, | |
| "grad_norm": 0.24444159865379333, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4317, | |
| "num_tokens": 201208706.0, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.8347613219094248, | |
| "grad_norm": 0.23103386163711548, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3971, | |
| "num_tokens": 201783927.0, | |
| "step": 682 | |
| }, | |
| { | |
| "epoch": 0.8372093023255814, | |
| "grad_norm": 0.25669893622398376, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4295, | |
| "num_tokens": 202380174.0, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 0.8396572827417381, | |
| "grad_norm": 0.2663831114768982, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4274, | |
| "num_tokens": 202985735.0, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 0.8421052631578947, | |
| "grad_norm": 0.2565767467021942, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4352, | |
| "num_tokens": 203582111.0, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 0.8445532435740514, | |
| "grad_norm": 0.2389577478170395, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4138, | |
| "num_tokens": 204153573.0, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.847001223990208, | |
| "grad_norm": 0.2350880652666092, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4168, | |
| "num_tokens": 204763271.0, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 0.8494492044063647, | |
| "grad_norm": 0.25765883922576904, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4185, | |
| "num_tokens": 205341916.0, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 0.8518971848225214, | |
| "grad_norm": 0.2440985143184662, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4362, | |
| "num_tokens": 205924333.0, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 0.8543451652386781, | |
| "grad_norm": 0.2437448501586914, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4238, | |
| "num_tokens": 206489074.0, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 0.8567931456548348, | |
| "grad_norm": 0.3054177165031433, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4179, | |
| "num_tokens": 207076635.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.8592411260709915, | |
| "grad_norm": 0.2562294006347656, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4297, | |
| "num_tokens": 207679550.0, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 0.8616891064871481, | |
| "grad_norm": 0.25215739011764526, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4262, | |
| "num_tokens": 208285797.0, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 0.8641370869033048, | |
| "grad_norm": 0.25319892168045044, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4268, | |
| "num_tokens": 208900402.0, | |
| "step": 706 | |
| }, | |
| { | |
| "epoch": 0.8665850673194615, | |
| "grad_norm": 0.22704288363456726, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4133, | |
| "num_tokens": 209478996.0, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 0.8690330477356181, | |
| "grad_norm": 0.23838220536708832, | |
| "learning_rate": 1e-05, | |
| "loss": 0.413, | |
| "num_tokens": 210061792.0, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.8714810281517748, | |
| "grad_norm": 0.25101152062416077, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4195, | |
| "num_tokens": 210619894.0, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 0.8739290085679314, | |
| "grad_norm": 0.2533038258552551, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4198, | |
| "num_tokens": 211195236.0, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 0.8763769889840881, | |
| "grad_norm": 0.24252592027187347, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4141, | |
| "num_tokens": 211757983.0, | |
| "step": 716 | |
| }, | |
| { | |
| "epoch": 0.8788249694002448, | |
| "grad_norm": 0.2420939952135086, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4213, | |
| "num_tokens": 212332438.0, | |
| "step": 718 | |
| }, | |
| { | |
| "epoch": 0.8812729498164015, | |
| "grad_norm": 0.23020204901695251, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4111, | |
| "num_tokens": 212936652.0, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.8837209302325582, | |
| "grad_norm": 0.24056395888328552, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4047, | |
| "num_tokens": 213505407.0, | |
| "step": 722 | |
| }, | |
| { | |
| "epoch": 0.8861689106487148, | |
| "grad_norm": 0.24601121246814728, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4197, | |
| "num_tokens": 214094524.0, | |
| "step": 724 | |
| }, | |
| { | |
| "epoch": 0.8886168910648715, | |
| "grad_norm": 0.23350679874420166, | |
| "learning_rate": 1e-05, | |
| "loss": 0.398, | |
| "num_tokens": 214670619.0, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 0.8910648714810282, | |
| "grad_norm": 0.23900729417800903, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4114, | |
| "num_tokens": 215236751.0, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 0.8935128518971848, | |
| "grad_norm": 0.243704155087471, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4131, | |
| "num_tokens": 215828751.0, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.8959608323133414, | |
| "grad_norm": 0.23456545174121857, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4158, | |
| "num_tokens": 216449250.0, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 0.8984088127294981, | |
| "grad_norm": 0.23414072394371033, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4121, | |
| "num_tokens": 217044781.0, | |
| "step": 734 | |
| }, | |
| { | |
| "epoch": 0.9008567931456548, | |
| "grad_norm": 0.2433299720287323, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4114, | |
| "num_tokens": 217611996.0, | |
| "step": 736 | |
| }, | |
| { | |
| "epoch": 0.9033047735618115, | |
| "grad_norm": 0.2463146448135376, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4068, | |
| "num_tokens": 218232059.0, | |
| "step": 738 | |
| }, | |
| { | |
| "epoch": 0.9057527539779682, | |
| "grad_norm": 0.7880977392196655, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4441, | |
| "num_tokens": 218810653.0, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.9082007343941249, | |
| "grad_norm": 0.241953507065773, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4087, | |
| "num_tokens": 219409815.0, | |
| "step": 742 | |
| }, | |
| { | |
| "epoch": 0.9106487148102815, | |
| "grad_norm": 0.2762121260166168, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4228, | |
| "num_tokens": 220016419.0, | |
| "step": 744 | |
| }, | |
| { | |
| "epoch": 0.9130966952264382, | |
| "grad_norm": 0.2605133354663849, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4168, | |
| "num_tokens": 220599687.0, | |
| "step": 746 | |
| }, | |
| { | |
| "epoch": 0.9155446756425949, | |
| "grad_norm": 0.24054831266403198, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4181, | |
| "num_tokens": 221170683.0, | |
| "step": 748 | |
| }, | |
| { | |
| "epoch": 0.9179926560587516, | |
| "grad_norm": 0.2439662516117096, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4175, | |
| "num_tokens": 221764282.0, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.9204406364749081, | |
| "grad_norm": 0.23831577599048615, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4228, | |
| "num_tokens": 222383973.0, | |
| "step": 752 | |
| }, | |
| { | |
| "epoch": 0.9228886168910648, | |
| "grad_norm": 0.24441011250019073, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4143, | |
| "num_tokens": 222981657.0, | |
| "step": 754 | |
| }, | |
| { | |
| "epoch": 0.9253365973072215, | |
| "grad_norm": 0.2541545033454895, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4263, | |
| "num_tokens": 223580164.0, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 0.9277845777233782, | |
| "grad_norm": 0.23410865664482117, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4142, | |
| "num_tokens": 224181708.0, | |
| "step": 758 | |
| }, | |
| { | |
| "epoch": 0.9302325581395349, | |
| "grad_norm": 0.24794194102287292, | |
| "learning_rate": 1e-05, | |
| "loss": 0.424, | |
| "num_tokens": 224750929.0, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.9326805385556916, | |
| "grad_norm": 0.23957248032093048, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4192, | |
| "num_tokens": 225353399.0, | |
| "step": 762 | |
| }, | |
| { | |
| "epoch": 0.9351285189718482, | |
| "grad_norm": 0.2275751829147339, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4283, | |
| "num_tokens": 225946912.0, | |
| "step": 764 | |
| }, | |
| { | |
| "epoch": 0.9375764993880049, | |
| "grad_norm": 0.24257154762744904, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4088, | |
| "num_tokens": 226513122.0, | |
| "step": 766 | |
| }, | |
| { | |
| "epoch": 0.9400244798041616, | |
| "grad_norm": 0.2594261169433594, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4149, | |
| "num_tokens": 227097275.0, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 0.9424724602203183, | |
| "grad_norm": 0.24188987910747528, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4173, | |
| "num_tokens": 227685155.0, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.944920440636475, | |
| "grad_norm": 0.2449246346950531, | |
| "learning_rate": 1e-05, | |
| "loss": 0.41, | |
| "num_tokens": 228268933.0, | |
| "step": 772 | |
| }, | |
| { | |
| "epoch": 0.9473684210526315, | |
| "grad_norm": 0.2591334581375122, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4098, | |
| "num_tokens": 228844822.0, | |
| "step": 774 | |
| }, | |
| { | |
| "epoch": 0.9498164014687882, | |
| "grad_norm": 0.25259289145469666, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4149, | |
| "num_tokens": 229413402.0, | |
| "step": 776 | |
| }, | |
| { | |
| "epoch": 0.9522643818849449, | |
| "grad_norm": 0.24534189701080322, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4095, | |
| "num_tokens": 229957992.0, | |
| "step": 778 | |
| }, | |
| { | |
| "epoch": 0.9547123623011016, | |
| "grad_norm": 0.25992926955223083, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4259, | |
| "num_tokens": 230534148.0, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.9571603427172583, | |
| "grad_norm": 0.23857857286930084, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4054, | |
| "num_tokens": 231127999.0, | |
| "step": 782 | |
| }, | |
| { | |
| "epoch": 0.9596083231334149, | |
| "grad_norm": 0.2835080921649933, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4205, | |
| "num_tokens": 231731253.0, | |
| "step": 784 | |
| }, | |
| { | |
| "epoch": 0.9620563035495716, | |
| "grad_norm": 0.2432568073272705, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4165, | |
| "num_tokens": 232342748.0, | |
| "step": 786 | |
| }, | |
| { | |
| "epoch": 0.9645042839657283, | |
| "grad_norm": 0.23912744224071503, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4089, | |
| "num_tokens": 232934775.0, | |
| "step": 788 | |
| }, | |
| { | |
| "epoch": 0.966952264381885, | |
| "grad_norm": 0.2454313039779663, | |
| "learning_rate": 1e-05, | |
| "loss": 0.419, | |
| "num_tokens": 233502064.0, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.9694002447980417, | |
| "grad_norm": 0.2457619458436966, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4112, | |
| "num_tokens": 234073172.0, | |
| "step": 792 | |
| }, | |
| { | |
| "epoch": 0.9718482252141983, | |
| "grad_norm": 0.2537059485912323, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4272, | |
| "num_tokens": 234661532.0, | |
| "step": 794 | |
| }, | |
| { | |
| "epoch": 0.9742962056303549, | |
| "grad_norm": 0.24286150932312012, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4301, | |
| "num_tokens": 235274908.0, | |
| "step": 796 | |
| }, | |
| { | |
| "epoch": 0.9767441860465116, | |
| "grad_norm": 0.2509307265281677, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4278, | |
| "num_tokens": 235855663.0, | |
| "step": 798 | |
| }, | |
| { | |
| "epoch": 0.9791921664626683, | |
| "grad_norm": 0.2525811493396759, | |
| "learning_rate": 1e-05, | |
| "loss": 0.399, | |
| "num_tokens": 236428558.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.981640146878825, | |
| "grad_norm": 0.23528246581554413, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4187, | |
| "num_tokens": 237023018.0, | |
| "step": 802 | |
| }, | |
| { | |
| "epoch": 0.9840881272949816, | |
| "grad_norm": 0.25735700130462646, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4158, | |
| "num_tokens": 237611598.0, | |
| "step": 804 | |
| }, | |
| { | |
| "epoch": 0.9865361077111383, | |
| "grad_norm": 0.23932790756225586, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4072, | |
| "num_tokens": 238214543.0, | |
| "step": 806 | |
| }, | |
| { | |
| "epoch": 0.988984088127295, | |
| "grad_norm": 0.2567075788974762, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4149, | |
| "num_tokens": 238786408.0, | |
| "step": 808 | |
| }, | |
| { | |
| "epoch": 0.9914320685434517, | |
| "grad_norm": 0.25353989005088806, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4115, | |
| "num_tokens": 239363151.0, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.9938800489596084, | |
| "grad_norm": 0.2540046274662018, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4177, | |
| "num_tokens": 239967829.0, | |
| "step": 812 | |
| }, | |
| { | |
| "epoch": 0.996328029375765, | |
| "grad_norm": 0.24277007579803467, | |
| "learning_rate": 1e-05, | |
| "loss": 0.405, | |
| "num_tokens": 240551191.0, | |
| "step": 814 | |
| }, | |
| { | |
| "epoch": 0.9987760097919217, | |
| "grad_norm": 0.24569077789783478, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4205, | |
| "num_tokens": 241153107.0, | |
| "step": 816 | |
| }, | |
| { | |
| "epoch": 1.0012239902080784, | |
| "grad_norm": 0.2544793486595154, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3973, | |
| "num_tokens": 241769492.0, | |
| "step": 818 | |
| }, | |
| { | |
| "epoch": 1.003671970624235, | |
| "grad_norm": 0.2589578330516815, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3779, | |
| "num_tokens": 242359313.0, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.0061199510403918, | |
| "grad_norm": 0.27613627910614014, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3709, | |
| "num_tokens": 242911445.0, | |
| "step": 822 | |
| }, | |
| { | |
| "epoch": 1.0085679314565483, | |
| "grad_norm": 0.3414134085178375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3848, | |
| "num_tokens": 243515905.0, | |
| "step": 824 | |
| }, | |
| { | |
| "epoch": 1.0110159118727051, | |
| "grad_norm": 0.26070353388786316, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3819, | |
| "num_tokens": 244085094.0, | |
| "step": 826 | |
| }, | |
| { | |
| "epoch": 1.0134638922888617, | |
| "grad_norm": 0.2581327259540558, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3962, | |
| "num_tokens": 244710189.0, | |
| "step": 828 | |
| }, | |
| { | |
| "epoch": 1.0159118727050183, | |
| "grad_norm": 0.23694124817848206, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3735, | |
| "num_tokens": 245302719.0, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.018359853121175, | |
| "grad_norm": 0.2756808400154114, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3949, | |
| "num_tokens": 245899907.0, | |
| "step": 832 | |
| }, | |
| { | |
| "epoch": 1.0208078335373316, | |
| "grad_norm": 0.2444760948419571, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3905, | |
| "num_tokens": 246514507.0, | |
| "step": 834 | |
| }, | |
| { | |
| "epoch": 1.0232558139534884, | |
| "grad_norm": 0.2670097053050995, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3762, | |
| "num_tokens": 247121076.0, | |
| "step": 836 | |
| }, | |
| { | |
| "epoch": 1.025703794369645, | |
| "grad_norm": 0.24394531548023224, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3718, | |
| "num_tokens": 247731957.0, | |
| "step": 838 | |
| }, | |
| { | |
| "epoch": 1.0281517747858018, | |
| "grad_norm": 0.2526130676269531, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3777, | |
| "num_tokens": 248333325.0, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.0305997552019583, | |
| "grad_norm": 0.25735199451446533, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3771, | |
| "num_tokens": 248929993.0, | |
| "step": 842 | |
| }, | |
| { | |
| "epoch": 1.0330477356181151, | |
| "grad_norm": 0.2488754689693451, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3633, | |
| "num_tokens": 249530707.0, | |
| "step": 844 | |
| }, | |
| { | |
| "epoch": 1.0354957160342717, | |
| "grad_norm": 0.238496333360672, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3827, | |
| "num_tokens": 250144988.0, | |
| "step": 846 | |
| }, | |
| { | |
| "epoch": 1.0379436964504285, | |
| "grad_norm": 0.24877411127090454, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3853, | |
| "num_tokens": 250726834.0, | |
| "step": 848 | |
| }, | |
| { | |
| "epoch": 1.040391676866585, | |
| "grad_norm": 0.24590681493282318, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3866, | |
| "num_tokens": 251338286.0, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.0428396572827416, | |
| "grad_norm": 0.2483719140291214, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3809, | |
| "num_tokens": 251938256.0, | |
| "step": 852 | |
| }, | |
| { | |
| "epoch": 1.0452876376988984, | |
| "grad_norm": 0.23960165679454803, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3699, | |
| "num_tokens": 252528052.0, | |
| "step": 854 | |
| }, | |
| { | |
| "epoch": 1.047735618115055, | |
| "grad_norm": 0.2561052739620209, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3716, | |
| "num_tokens": 253131838.0, | |
| "step": 856 | |
| }, | |
| { | |
| "epoch": 1.0501835985312118, | |
| "grad_norm": 0.2708950638771057, | |
| "learning_rate": 1e-05, | |
| "loss": 0.368, | |
| "num_tokens": 253683217.0, | |
| "step": 858 | |
| }, | |
| { | |
| "epoch": 1.0526315789473684, | |
| "grad_norm": 0.2559908926486969, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3847, | |
| "num_tokens": 254305603.0, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.0550795593635252, | |
| "grad_norm": 0.26056236028671265, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3747, | |
| "num_tokens": 277568.0, | |
| "step": 862 | |
| }, | |
| { | |
| "epoch": 1.0575275397796817, | |
| "grad_norm": 0.2658675014972687, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3823, | |
| "num_tokens": 848750.0, | |
| "step": 864 | |
| }, | |
| { | |
| "epoch": 1.0599755201958385, | |
| "grad_norm": 0.249778613448143, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3693, | |
| "num_tokens": 1452635.0, | |
| "step": 866 | |
| }, | |
| { | |
| "epoch": 1.062423500611995, | |
| "grad_norm": 0.2459205985069275, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3741, | |
| "num_tokens": 2029158.0, | |
| "step": 868 | |
| }, | |
| { | |
| "epoch": 1.0648714810281519, | |
| "grad_norm": 0.2944379448890686, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3849, | |
| "num_tokens": 2637054.0, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.0673194614443084, | |
| "grad_norm": 0.2451840043067932, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3775, | |
| "num_tokens": 3232747.0, | |
| "step": 872 | |
| }, | |
| { | |
| "epoch": 1.069767441860465, | |
| "grad_norm": 0.26428672671318054, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3711, | |
| "num_tokens": 3822434.0, | |
| "step": 874 | |
| }, | |
| { | |
| "epoch": 1.0722154222766218, | |
| "grad_norm": 0.2651713192462921, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3816, | |
| "num_tokens": 4391500.0, | |
| "step": 876 | |
| }, | |
| { | |
| "epoch": 1.0746634026927784, | |
| "grad_norm": 0.2486201673746109, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3824, | |
| "num_tokens": 4979531.0, | |
| "step": 878 | |
| }, | |
| { | |
| "epoch": 1.0771113831089352, | |
| "grad_norm": 0.25607433915138245, | |
| "learning_rate": 1e-05, | |
| "loss": 0.384, | |
| "num_tokens": 5584377.0, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.0795593635250917, | |
| "grad_norm": 0.24247251451015472, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3906, | |
| "num_tokens": 6190943.0, | |
| "step": 882 | |
| }, | |
| { | |
| "epoch": 1.0820073439412485, | |
| "grad_norm": 0.25798070430755615, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3754, | |
| "num_tokens": 6789570.0, | |
| "step": 884 | |
| }, | |
| { | |
| "epoch": 1.084455324357405, | |
| "grad_norm": 0.25158044695854187, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3723, | |
| "num_tokens": 7373742.0, | |
| "step": 886 | |
| }, | |
| { | |
| "epoch": 1.086903304773562, | |
| "grad_norm": 0.2608836889266968, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3776, | |
| "num_tokens": 7972262.0, | |
| "step": 888 | |
| }, | |
| { | |
| "epoch": 1.0893512851897185, | |
| "grad_norm": 0.2598704397678375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3771, | |
| "num_tokens": 8547155.0, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.091799265605875, | |
| "grad_norm": 0.24557508528232574, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3902, | |
| "num_tokens": 9120559.0, | |
| "step": 892 | |
| }, | |
| { | |
| "epoch": 1.0942472460220318, | |
| "grad_norm": 0.2631266117095947, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3795, | |
| "num_tokens": 9683298.0, | |
| "step": 894 | |
| }, | |
| { | |
| "epoch": 1.0966952264381884, | |
| "grad_norm": 0.24435891211032867, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3777, | |
| "num_tokens": 10279511.0, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 1.0991432068543452, | |
| "grad_norm": 0.249556764960289, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3775, | |
| "num_tokens": 10880149.0, | |
| "step": 898 | |
| }, | |
| { | |
| "epoch": 1.1015911872705018, | |
| "grad_norm": 0.2669646739959717, | |
| "learning_rate": 1e-05, | |
| "loss": 0.393, | |
| "num_tokens": 11490560.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.1040391676866586, | |
| "grad_norm": 0.2627948224544525, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4063, | |
| "num_tokens": 12087747.0, | |
| "step": 902 | |
| }, | |
| { | |
| "epoch": 1.1064871481028151, | |
| "grad_norm": 0.2501852214336395, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3807, | |
| "num_tokens": 12661934.0, | |
| "step": 904 | |
| }, | |
| { | |
| "epoch": 1.108935128518972, | |
| "grad_norm": 0.26337432861328125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3849, | |
| "num_tokens": 13261711.0, | |
| "step": 906 | |
| }, | |
| { | |
| "epoch": 1.1113831089351285, | |
| "grad_norm": 0.25716322660446167, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3869, | |
| "num_tokens": 13872528.0, | |
| "step": 908 | |
| }, | |
| { | |
| "epoch": 1.1138310893512853, | |
| "grad_norm": 0.25698763132095337, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3818, | |
| "num_tokens": 14445102.0, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.1162790697674418, | |
| "grad_norm": 0.25583845376968384, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3695, | |
| "num_tokens": 15037648.0, | |
| "step": 912 | |
| }, | |
| { | |
| "epoch": 1.1187270501835984, | |
| "grad_norm": 0.2765620946884155, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3632, | |
| "num_tokens": 15631668.0, | |
| "step": 914 | |
| }, | |
| { | |
| "epoch": 1.1211750305997552, | |
| "grad_norm": 0.2591513991355896, | |
| "learning_rate": 1e-05, | |
| "loss": 0.377, | |
| "num_tokens": 16207356.0, | |
| "step": 916 | |
| }, | |
| { | |
| "epoch": 1.1236230110159118, | |
| "grad_norm": 0.2652893364429474, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3572, | |
| "num_tokens": 16776274.0, | |
| "step": 918 | |
| }, | |
| { | |
| "epoch": 1.1260709914320686, | |
| "grad_norm": 0.2598043978214264, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3803, | |
| "num_tokens": 17370079.0, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.1285189718482251, | |
| "grad_norm": 0.2437954545021057, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3718, | |
| "num_tokens": 17947221.0, | |
| "step": 922 | |
| }, | |
| { | |
| "epoch": 1.130966952264382, | |
| "grad_norm": 0.2446569800376892, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3641, | |
| "num_tokens": 18544345.0, | |
| "step": 924 | |
| }, | |
| { | |
| "epoch": 1.1334149326805385, | |
| "grad_norm": 0.24647963047027588, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3663, | |
| "num_tokens": 19118029.0, | |
| "step": 926 | |
| }, | |
| { | |
| "epoch": 1.1358629130966953, | |
| "grad_norm": 0.2526357173919678, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3664, | |
| "num_tokens": 19699461.0, | |
| "step": 928 | |
| }, | |
| { | |
| "epoch": 1.1383108935128519, | |
| "grad_norm": 0.2365649789571762, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3709, | |
| "num_tokens": 20296793.0, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.1407588739290087, | |
| "grad_norm": 0.24821361899375916, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3739, | |
| "num_tokens": 20880960.0, | |
| "step": 932 | |
| }, | |
| { | |
| "epoch": 1.1432068543451652, | |
| "grad_norm": 0.2682252824306488, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3803, | |
| "num_tokens": 21478802.0, | |
| "step": 934 | |
| }, | |
| { | |
| "epoch": 1.1456548347613218, | |
| "grad_norm": 0.24566704034805298, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3709, | |
| "num_tokens": 22078568.0, | |
| "step": 936 | |
| }, | |
| { | |
| "epoch": 1.1481028151774786, | |
| "grad_norm": 0.2511333227157593, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3849, | |
| "num_tokens": 22668790.0, | |
| "step": 938 | |
| }, | |
| { | |
| "epoch": 1.1505507955936352, | |
| "grad_norm": 0.24181029200553894, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3667, | |
| "num_tokens": 23247142.0, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.152998776009792, | |
| "grad_norm": 0.24866792559623718, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3793, | |
| "num_tokens": 23816149.0, | |
| "step": 942 | |
| }, | |
| { | |
| "epoch": 1.1554467564259485, | |
| "grad_norm": 0.2545630633831024, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3809, | |
| "num_tokens": 24404560.0, | |
| "step": 944 | |
| }, | |
| { | |
| "epoch": 1.1578947368421053, | |
| "grad_norm": 0.25114840269088745, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3847, | |
| "num_tokens": 24979236.0, | |
| "step": 946 | |
| }, | |
| { | |
| "epoch": 1.1603427172582619, | |
| "grad_norm": 0.2634119391441345, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3842, | |
| "num_tokens": 25563695.0, | |
| "step": 948 | |
| }, | |
| { | |
| "epoch": 1.1627906976744187, | |
| "grad_norm": 0.255024254322052, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3926, | |
| "num_tokens": 26161113.0, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.1652386780905752, | |
| "grad_norm": 0.24087129533290863, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3871, | |
| "num_tokens": 26767525.0, | |
| "step": 952 | |
| }, | |
| { | |
| "epoch": 1.167686658506732, | |
| "grad_norm": 0.2511006295681, | |
| "learning_rate": 1e-05, | |
| "loss": 0.385, | |
| "num_tokens": 27347454.0, | |
| "step": 954 | |
| }, | |
| { | |
| "epoch": 1.1701346389228886, | |
| "grad_norm": 0.2362564653158188, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3751, | |
| "num_tokens": 27955641.0, | |
| "step": 956 | |
| }, | |
| { | |
| "epoch": 1.1725826193390452, | |
| "grad_norm": 0.24775467813014984, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3759, | |
| "num_tokens": 28559267.0, | |
| "step": 958 | |
| }, | |
| { | |
| "epoch": 1.175030599755202, | |
| "grad_norm": 0.24708232283592224, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3797, | |
| "num_tokens": 29152270.0, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.1774785801713585, | |
| "grad_norm": 0.2411796748638153, | |
| "learning_rate": 1e-05, | |
| "loss": 0.382, | |
| "num_tokens": 29749611.0, | |
| "step": 962 | |
| }, | |
| { | |
| "epoch": 1.1799265605875153, | |
| "grad_norm": 0.24558380246162415, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3836, | |
| "num_tokens": 30348108.0, | |
| "step": 964 | |
| }, | |
| { | |
| "epoch": 1.182374541003672, | |
| "grad_norm": 0.24171082675457, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3871, | |
| "num_tokens": 30934012.0, | |
| "step": 966 | |
| }, | |
| { | |
| "epoch": 1.1848225214198287, | |
| "grad_norm": 0.254991352558136, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3846, | |
| "num_tokens": 31530616.0, | |
| "step": 968 | |
| }, | |
| { | |
| "epoch": 1.1872705018359853, | |
| "grad_norm": 0.2404201775789261, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3771, | |
| "num_tokens": 32106431.0, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.189718482252142, | |
| "grad_norm": 0.2498648315668106, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3743, | |
| "num_tokens": 32719129.0, | |
| "step": 972 | |
| }, | |
| { | |
| "epoch": 1.1921664626682986, | |
| "grad_norm": 0.32168230414390564, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3615, | |
| "num_tokens": 33274739.0, | |
| "step": 974 | |
| }, | |
| { | |
| "epoch": 1.1946144430844554, | |
| "grad_norm": 0.24494768679141998, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3742, | |
| "num_tokens": 33851163.0, | |
| "step": 976 | |
| }, | |
| { | |
| "epoch": 1.197062423500612, | |
| "grad_norm": 0.24181753396987915, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3779, | |
| "num_tokens": 34434320.0, | |
| "step": 978 | |
| }, | |
| { | |
| "epoch": 1.1995104039167686, | |
| "grad_norm": 0.2651110291481018, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3792, | |
| "num_tokens": 35003003.0, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.2019583843329253, | |
| "grad_norm": 0.26116904616355896, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3747, | |
| "num_tokens": 35550284.0, | |
| "step": 982 | |
| }, | |
| { | |
| "epoch": 1.204406364749082, | |
| "grad_norm": 0.24539689719676971, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3575, | |
| "num_tokens": 36114596.0, | |
| "step": 984 | |
| }, | |
| { | |
| "epoch": 1.2068543451652387, | |
| "grad_norm": 0.2678145170211792, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3888, | |
| "num_tokens": 36694709.0, | |
| "step": 986 | |
| }, | |
| { | |
| "epoch": 1.2093023255813953, | |
| "grad_norm": 0.25375595688819885, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3894, | |
| "num_tokens": 37270293.0, | |
| "step": 988 | |
| }, | |
| { | |
| "epoch": 1.211750305997552, | |
| "grad_norm": 0.23707690834999084, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3794, | |
| "num_tokens": 37871657.0, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.2141982864137086, | |
| "grad_norm": 0.2559983730316162, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3762, | |
| "num_tokens": 38425602.0, | |
| "step": 992 | |
| }, | |
| { | |
| "epoch": 1.2166462668298654, | |
| "grad_norm": 0.2463446706533432, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3641, | |
| "num_tokens": 39013285.0, | |
| "step": 994 | |
| }, | |
| { | |
| "epoch": 1.219094247246022, | |
| "grad_norm": 0.2542133629322052, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3786, | |
| "num_tokens": 39609316.0, | |
| "step": 996 | |
| }, | |
| { | |
| "epoch": 1.2215422276621788, | |
| "grad_norm": 0.24676287174224854, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3767, | |
| "num_tokens": 40176004.0, | |
| "step": 998 | |
| }, | |
| { | |
| "epoch": 1.2239902080783354, | |
| "grad_norm": 0.24902845919132233, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3767, | |
| "num_tokens": 40766442.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.226438188494492, | |
| "grad_norm": 0.24813127517700195, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3741, | |
| "num_tokens": 41349662.0, | |
| "step": 1002 | |
| }, | |
| { | |
| "epoch": 1.2288861689106487, | |
| "grad_norm": 0.25595715641975403, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3772, | |
| "num_tokens": 41921723.0, | |
| "step": 1004 | |
| }, | |
| { | |
| "epoch": 1.2313341493268053, | |
| "grad_norm": 0.2417302131652832, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3814, | |
| "num_tokens": 42502360.0, | |
| "step": 1006 | |
| }, | |
| { | |
| "epoch": 1.233782129742962, | |
| "grad_norm": 0.24199765920639038, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3849, | |
| "num_tokens": 43104367.0, | |
| "step": 1008 | |
| }, | |
| { | |
| "epoch": 1.2362301101591187, | |
| "grad_norm": 0.2543700635433197, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3792, | |
| "num_tokens": 43685299.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.2386780905752754, | |
| "grad_norm": 0.23722825944423676, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3746, | |
| "num_tokens": 44282353.0, | |
| "step": 1012 | |
| }, | |
| { | |
| "epoch": 1.241126070991432, | |
| "grad_norm": 0.2463102787733078, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3758, | |
| "num_tokens": 44876385.0, | |
| "step": 1014 | |
| }, | |
| { | |
| "epoch": 1.2435740514075888, | |
| "grad_norm": 0.25006935000419617, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3739, | |
| "num_tokens": 45464003.0, | |
| "step": 1016 | |
| }, | |
| { | |
| "epoch": 1.2460220318237454, | |
| "grad_norm": 0.24640695750713348, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3807, | |
| "num_tokens": 46050560.0, | |
| "step": 1018 | |
| }, | |
| { | |
| "epoch": 1.2484700122399022, | |
| "grad_norm": 0.2511467933654785, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3999, | |
| "num_tokens": 46640792.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.2509179926560587, | |
| "grad_norm": 0.2527099549770355, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3894, | |
| "num_tokens": 47241084.0, | |
| "step": 1022 | |
| }, | |
| { | |
| "epoch": 1.2533659730722153, | |
| "grad_norm": 0.24509671330451965, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3703, | |
| "num_tokens": 47813100.0, | |
| "step": 1024 | |
| }, | |
| { | |
| "epoch": 1.255813953488372, | |
| "grad_norm": 0.23858234286308289, | |
| "learning_rate": 1e-05, | |
| "loss": 0.379, | |
| "num_tokens": 48415520.0, | |
| "step": 1026 | |
| }, | |
| { | |
| "epoch": 1.258261933904529, | |
| "grad_norm": 0.2405681014060974, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3771, | |
| "num_tokens": 49009965.0, | |
| "step": 1028 | |
| }, | |
| { | |
| "epoch": 1.2607099143206855, | |
| "grad_norm": 0.37645822763442993, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3809, | |
| "num_tokens": 49627546.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.263157894736842, | |
| "grad_norm": 0.24870316684246063, | |
| "learning_rate": 1e-05, | |
| "loss": 0.377, | |
| "num_tokens": 50236156.0, | |
| "step": 1032 | |
| }, | |
| { | |
| "epoch": 1.2656058751529988, | |
| "grad_norm": 0.2347888946533203, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3831, | |
| "num_tokens": 50826554.0, | |
| "step": 1034 | |
| }, | |
| { | |
| "epoch": 1.2680538555691554, | |
| "grad_norm": 0.24506457149982452, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3774, | |
| "num_tokens": 51428333.0, | |
| "step": 1036 | |
| }, | |
| { | |
| "epoch": 1.2705018359853122, | |
| "grad_norm": 0.2655375599861145, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3837, | |
| "num_tokens": 52042985.0, | |
| "step": 1038 | |
| }, | |
| { | |
| "epoch": 1.2729498164014688, | |
| "grad_norm": 0.24918022751808167, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3776, | |
| "num_tokens": 52631339.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.2753977968176256, | |
| "grad_norm": 0.2505210041999817, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3798, | |
| "num_tokens": 53217729.0, | |
| "step": 1042 | |
| }, | |
| { | |
| "epoch": 1.2778457772337821, | |
| "grad_norm": 0.23858347535133362, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3601, | |
| "num_tokens": 53801689.0, | |
| "step": 1044 | |
| }, | |
| { | |
| "epoch": 1.2802937576499387, | |
| "grad_norm": 0.25308915972709656, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3807, | |
| "num_tokens": 54404896.0, | |
| "step": 1046 | |
| }, | |
| { | |
| "epoch": 1.2827417380660955, | |
| "grad_norm": 0.23880726099014282, | |
| "learning_rate": 1e-05, | |
| "loss": 0.381, | |
| "num_tokens": 54994715.0, | |
| "step": 1048 | |
| }, | |
| { | |
| "epoch": 1.2851897184822523, | |
| "grad_norm": 0.2413705289363861, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3763, | |
| "num_tokens": 55583693.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.2876376988984088, | |
| "grad_norm": 0.24790863692760468, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3759, | |
| "num_tokens": 56166986.0, | |
| "step": 1052 | |
| }, | |
| { | |
| "epoch": 1.2900856793145654, | |
| "grad_norm": 0.2571425139904022, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3721, | |
| "num_tokens": 56739683.0, | |
| "step": 1054 | |
| }, | |
| { | |
| "epoch": 1.2925336597307222, | |
| "grad_norm": 0.237641379237175, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3902, | |
| "num_tokens": 57354184.0, | |
| "step": 1056 | |
| }, | |
| { | |
| "epoch": 1.2949816401468788, | |
| "grad_norm": 0.23018361628055573, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3568, | |
| "num_tokens": 57940330.0, | |
| "step": 1058 | |
| }, | |
| { | |
| "epoch": 1.2974296205630356, | |
| "grad_norm": 0.2509154677391052, | |
| "learning_rate": 1e-05, | |
| "loss": 0.362, | |
| "num_tokens": 58521951.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.2998776009791921, | |
| "grad_norm": 0.255787193775177, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3845, | |
| "num_tokens": 59128476.0, | |
| "step": 1062 | |
| }, | |
| { | |
| "epoch": 1.302325581395349, | |
| "grad_norm": 0.2486552596092224, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3724, | |
| "num_tokens": 59717196.0, | |
| "step": 1064 | |
| }, | |
| { | |
| "epoch": 1.3047735618115055, | |
| "grad_norm": 0.260206937789917, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3548, | |
| "num_tokens": 60280879.0, | |
| "step": 1066 | |
| }, | |
| { | |
| "epoch": 1.307221542227662, | |
| "grad_norm": 0.250387966632843, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3753, | |
| "num_tokens": 60859876.0, | |
| "step": 1068 | |
| }, | |
| { | |
| "epoch": 1.3096695226438189, | |
| "grad_norm": 0.24388471245765686, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3718, | |
| "num_tokens": 61438497.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.3121175030599757, | |
| "grad_norm": 0.23518991470336914, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3693, | |
| "num_tokens": 62038958.0, | |
| "step": 1072 | |
| }, | |
| { | |
| "epoch": 1.3145654834761322, | |
| "grad_norm": 0.24329505860805511, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3885, | |
| "num_tokens": 62608639.0, | |
| "step": 1074 | |
| }, | |
| { | |
| "epoch": 1.3170134638922888, | |
| "grad_norm": 0.24493998289108276, | |
| "learning_rate": 1e-05, | |
| "loss": 0.375, | |
| "num_tokens": 63182731.0, | |
| "step": 1076 | |
| }, | |
| { | |
| "epoch": 1.3194614443084456, | |
| "grad_norm": 0.23012055456638336, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3774, | |
| "num_tokens": 63784383.0, | |
| "step": 1078 | |
| }, | |
| { | |
| "epoch": 1.3219094247246022, | |
| "grad_norm": 0.230576291680336, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3767, | |
| "num_tokens": 64371986.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.324357405140759, | |
| "grad_norm": 0.23564326763153076, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3727, | |
| "num_tokens": 64966281.0, | |
| "step": 1082 | |
| }, | |
| { | |
| "epoch": 1.3268053855569155, | |
| "grad_norm": 0.23679161071777344, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3672, | |
| "num_tokens": 65550336.0, | |
| "step": 1084 | |
| }, | |
| { | |
| "epoch": 1.3292533659730723, | |
| "grad_norm": 0.25614285469055176, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3809, | |
| "num_tokens": 66125746.0, | |
| "step": 1086 | |
| }, | |
| { | |
| "epoch": 1.3317013463892289, | |
| "grad_norm": 0.24231094121932983, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3975, | |
| "num_tokens": 66726281.0, | |
| "step": 1088 | |
| }, | |
| { | |
| "epoch": 1.3341493268053854, | |
| "grad_norm": 0.25783106684684753, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3792, | |
| "num_tokens": 67286102.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.3365973072215422, | |
| "grad_norm": 0.2526884078979492, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3654, | |
| "num_tokens": 67863433.0, | |
| "step": 1092 | |
| }, | |
| { | |
| "epoch": 1.339045287637699, | |
| "grad_norm": 0.2526870369911194, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3721, | |
| "num_tokens": 68449351.0, | |
| "step": 1094 | |
| }, | |
| { | |
| "epoch": 1.3414932680538556, | |
| "grad_norm": 0.24224893748760223, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3756, | |
| "num_tokens": 69068729.0, | |
| "step": 1096 | |
| }, | |
| { | |
| "epoch": 1.3439412484700122, | |
| "grad_norm": 0.24491116404533386, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3874, | |
| "num_tokens": 69656409.0, | |
| "step": 1098 | |
| }, | |
| { | |
| "epoch": 1.346389228886169, | |
| "grad_norm": 0.2297855019569397, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3763, | |
| "num_tokens": 70284891.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.3488372093023255, | |
| "grad_norm": 0.25858911871910095, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3737, | |
| "num_tokens": 70859349.0, | |
| "step": 1102 | |
| }, | |
| { | |
| "epoch": 1.3512851897184823, | |
| "grad_norm": 0.24717983603477478, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3862, | |
| "num_tokens": 71445796.0, | |
| "step": 1104 | |
| }, | |
| { | |
| "epoch": 1.353733170134639, | |
| "grad_norm": 0.2568610608577728, | |
| "learning_rate": 1e-05, | |
| "loss": 0.379, | |
| "num_tokens": 72031706.0, | |
| "step": 1106 | |
| }, | |
| { | |
| "epoch": 1.3561811505507957, | |
| "grad_norm": 0.24025028944015503, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3727, | |
| "num_tokens": 72618510.0, | |
| "step": 1108 | |
| }, | |
| { | |
| "epoch": 1.3586291309669523, | |
| "grad_norm": 0.24287337064743042, | |
| "learning_rate": 1e-05, | |
| "loss": 0.389, | |
| "num_tokens": 73203297.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.3610771113831088, | |
| "grad_norm": 0.25672075152397156, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3901, | |
| "num_tokens": 73812815.0, | |
| "step": 1112 | |
| }, | |
| { | |
| "epoch": 1.3635250917992656, | |
| "grad_norm": 0.23002919554710388, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3606, | |
| "num_tokens": 74428959.0, | |
| "step": 1114 | |
| }, | |
| { | |
| "epoch": 1.3659730722154224, | |
| "grad_norm": 0.24714474380016327, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3614, | |
| "num_tokens": 75008016.0, | |
| "step": 1116 | |
| }, | |
| { | |
| "epoch": 1.368421052631579, | |
| "grad_norm": 0.24221962690353394, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3843, | |
| "num_tokens": 75628706.0, | |
| "step": 1118 | |
| }, | |
| { | |
| "epoch": 1.3708690330477356, | |
| "grad_norm": 0.2576131522655487, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3867, | |
| "num_tokens": 76195451.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.3733170134638923, | |
| "grad_norm": 0.2453685849905014, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3873, | |
| "num_tokens": 76761969.0, | |
| "step": 1122 | |
| }, | |
| { | |
| "epoch": 1.375764993880049, | |
| "grad_norm": 0.24041421711444855, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3701, | |
| "num_tokens": 77344091.0, | |
| "step": 1124 | |
| }, | |
| { | |
| "epoch": 1.3782129742962057, | |
| "grad_norm": 0.25494855642318726, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3708, | |
| "num_tokens": 77933968.0, | |
| "step": 1126 | |
| }, | |
| { | |
| "epoch": 1.3806609547123623, | |
| "grad_norm": 0.24914491176605225, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3676, | |
| "num_tokens": 78506872.0, | |
| "step": 1128 | |
| }, | |
| { | |
| "epoch": 1.383108935128519, | |
| "grad_norm": 0.24509696662425995, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3922, | |
| "num_tokens": 79115121.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.3855569155446756, | |
| "grad_norm": 0.24067234992980957, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3719, | |
| "num_tokens": 79689670.0, | |
| "step": 1132 | |
| }, | |
| { | |
| "epoch": 1.3880048959608322, | |
| "grad_norm": 0.2415025383234024, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3842, | |
| "num_tokens": 80296309.0, | |
| "step": 1134 | |
| }, | |
| { | |
| "epoch": 1.390452876376989, | |
| "grad_norm": 0.23817619681358337, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3763, | |
| "num_tokens": 80893156.0, | |
| "step": 1136 | |
| }, | |
| { | |
| "epoch": 1.3929008567931458, | |
| "grad_norm": 0.24730853736400604, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3825, | |
| "num_tokens": 81501032.0, | |
| "step": 1138 | |
| }, | |
| { | |
| "epoch": 1.3953488372093024, | |
| "grad_norm": 0.2538658082485199, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3728, | |
| "num_tokens": 82063275.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.397796817625459, | |
| "grad_norm": 0.2547858655452728, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3658, | |
| "num_tokens": 82647178.0, | |
| "step": 1142 | |
| }, | |
| { | |
| "epoch": 1.4002447980416157, | |
| "grad_norm": 0.24213729798793793, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3709, | |
| "num_tokens": 83253938.0, | |
| "step": 1144 | |
| }, | |
| { | |
| "epoch": 1.4026927784577723, | |
| "grad_norm": 0.24049073457717896, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3727, | |
| "num_tokens": 83843402.0, | |
| "step": 1146 | |
| }, | |
| { | |
| "epoch": 1.405140758873929, | |
| "grad_norm": 0.2472056746482849, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3714, | |
| "num_tokens": 84445702.0, | |
| "step": 1148 | |
| }, | |
| { | |
| "epoch": 1.4075887392900857, | |
| "grad_norm": 0.23540234565734863, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3841, | |
| "num_tokens": 85061854.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.4100367197062424, | |
| "grad_norm": 0.24163338541984558, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3707, | |
| "num_tokens": 85656497.0, | |
| "step": 1152 | |
| }, | |
| { | |
| "epoch": 1.412484700122399, | |
| "grad_norm": 0.2464989274740219, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3739, | |
| "num_tokens": 86251427.0, | |
| "step": 1154 | |
| }, | |
| { | |
| "epoch": 1.4149326805385556, | |
| "grad_norm": 0.2482248991727829, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3705, | |
| "num_tokens": 86832067.0, | |
| "step": 1156 | |
| }, | |
| { | |
| "epoch": 1.4173806609547124, | |
| "grad_norm": 0.24530856311321259, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3753, | |
| "num_tokens": 87430733.0, | |
| "step": 1158 | |
| }, | |
| { | |
| "epoch": 1.4198286413708692, | |
| "grad_norm": 0.2416103333234787, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3788, | |
| "num_tokens": 88052112.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.4222766217870257, | |
| "grad_norm": 0.2397005707025528, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3734, | |
| "num_tokens": 88650677.0, | |
| "step": 1162 | |
| }, | |
| { | |
| "epoch": 1.4247246022031823, | |
| "grad_norm": 0.2345299869775772, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3721, | |
| "num_tokens": 89245448.0, | |
| "step": 1164 | |
| }, | |
| { | |
| "epoch": 1.427172582619339, | |
| "grad_norm": 0.23692870140075684, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3663, | |
| "num_tokens": 89834942.0, | |
| "step": 1166 | |
| }, | |
| { | |
| "epoch": 1.4296205630354957, | |
| "grad_norm": 0.25026100873947144, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3917, | |
| "num_tokens": 90433936.0, | |
| "step": 1168 | |
| }, | |
| { | |
| "epoch": 1.4320685434516525, | |
| "grad_norm": 0.24824434518814087, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3806, | |
| "num_tokens": 91061342.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.434516523867809, | |
| "grad_norm": 0.24724173545837402, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3605, | |
| "num_tokens": 91652152.0, | |
| "step": 1172 | |
| }, | |
| { | |
| "epoch": 1.4369645042839658, | |
| "grad_norm": 0.26517099142074585, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3898, | |
| "num_tokens": 92245127.0, | |
| "step": 1174 | |
| }, | |
| { | |
| "epoch": 1.4394124847001224, | |
| "grad_norm": 0.26708370447158813, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3797, | |
| "num_tokens": 92838369.0, | |
| "step": 1176 | |
| }, | |
| { | |
| "epoch": 1.441860465116279, | |
| "grad_norm": 0.2398887723684311, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3794, | |
| "num_tokens": 93420778.0, | |
| "step": 1178 | |
| }, | |
| { | |
| "epoch": 1.4443084455324358, | |
| "grad_norm": 0.267572820186615, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3766, | |
| "num_tokens": 93997138.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.4467564259485923, | |
| "grad_norm": 0.24645371735095978, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3638, | |
| "num_tokens": 94586885.0, | |
| "step": 1182 | |
| }, | |
| { | |
| "epoch": 1.4492044063647491, | |
| "grad_norm": 0.2596088647842407, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3739, | |
| "num_tokens": 95158567.0, | |
| "step": 1184 | |
| }, | |
| { | |
| "epoch": 1.4516523867809057, | |
| "grad_norm": 0.24318157136440277, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3708, | |
| "num_tokens": 95755615.0, | |
| "step": 1186 | |
| }, | |
| { | |
| "epoch": 1.4541003671970625, | |
| "grad_norm": 0.25732532143592834, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3733, | |
| "num_tokens": 96326941.0, | |
| "step": 1188 | |
| }, | |
| { | |
| "epoch": 1.456548347613219, | |
| "grad_norm": 0.24617703258991241, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3749, | |
| "num_tokens": 96934565.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.4589963280293758, | |
| "grad_norm": 0.24249835312366486, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3709, | |
| "num_tokens": 97544628.0, | |
| "step": 1192 | |
| }, | |
| { | |
| "epoch": 1.4614443084455324, | |
| "grad_norm": 0.25143083930015564, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3716, | |
| "num_tokens": 98129011.0, | |
| "step": 1194 | |
| }, | |
| { | |
| "epoch": 1.4638922888616892, | |
| "grad_norm": 0.2425592988729477, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3823, | |
| "num_tokens": 98750360.0, | |
| "step": 1196 | |
| }, | |
| { | |
| "epoch": 1.4663402692778458, | |
| "grad_norm": 0.24293585121631622, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3621, | |
| "num_tokens": 99341765.0, | |
| "step": 1198 | |
| }, | |
| { | |
| "epoch": 1.4687882496940023, | |
| "grad_norm": 0.23716874420642853, | |
| "learning_rate": 1e-05, | |
| "loss": 0.381, | |
| "num_tokens": 99932679.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.4712362301101591, | |
| "grad_norm": 0.22967226803302765, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3641, | |
| "num_tokens": 100527728.0, | |
| "step": 1202 | |
| }, | |
| { | |
| "epoch": 1.4736842105263157, | |
| "grad_norm": 0.24329859018325806, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3757, | |
| "num_tokens": 101140712.0, | |
| "step": 1204 | |
| }, | |
| { | |
| "epoch": 1.4761321909424725, | |
| "grad_norm": 0.2402748316526413, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3706, | |
| "num_tokens": 101749103.0, | |
| "step": 1206 | |
| }, | |
| { | |
| "epoch": 1.478580171358629, | |
| "grad_norm": 0.2467920184135437, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3844, | |
| "num_tokens": 102349686.0, | |
| "step": 1208 | |
| }, | |
| { | |
| "epoch": 1.4810281517747859, | |
| "grad_norm": 0.23824624717235565, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3801, | |
| "num_tokens": 102935794.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.4834761321909424, | |
| "grad_norm": 0.22843535244464874, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3798, | |
| "num_tokens": 103565230.0, | |
| "step": 1212 | |
| }, | |
| { | |
| "epoch": 1.4859241126070992, | |
| "grad_norm": 0.251880019903183, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3792, | |
| "num_tokens": 104127436.0, | |
| "step": 1214 | |
| }, | |
| { | |
| "epoch": 1.4883720930232558, | |
| "grad_norm": 0.23985151946544647, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3752, | |
| "num_tokens": 104719703.0, | |
| "step": 1216 | |
| }, | |
| { | |
| "epoch": 1.4908200734394126, | |
| "grad_norm": 0.24645771086215973, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3723, | |
| "num_tokens": 105292775.0, | |
| "step": 1218 | |
| }, | |
| { | |
| "epoch": 1.4932680538555692, | |
| "grad_norm": 0.2515164613723755, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3781, | |
| "num_tokens": 105898653.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.4957160342717257, | |
| "grad_norm": 0.2437610626220703, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3762, | |
| "num_tokens": 106506854.0, | |
| "step": 1222 | |
| }, | |
| { | |
| "epoch": 1.4981640146878825, | |
| "grad_norm": 0.2478041648864746, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3695, | |
| "num_tokens": 107066302.0, | |
| "step": 1224 | |
| }, | |
| { | |
| "epoch": 1.5006119951040393, | |
| "grad_norm": 0.24896089732646942, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3687, | |
| "num_tokens": 107646336.0, | |
| "step": 1226 | |
| }, | |
| { | |
| "epoch": 1.5030599755201959, | |
| "grad_norm": 0.24831615388393402, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3864, | |
| "num_tokens": 108256194.0, | |
| "step": 1228 | |
| }, | |
| { | |
| "epoch": 1.5055079559363524, | |
| "grad_norm": 0.2522716522216797, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3687, | |
| "num_tokens": 108833203.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.5079559363525092, | |
| "grad_norm": 0.2509858310222626, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3786, | |
| "num_tokens": 109423517.0, | |
| "step": 1232 | |
| }, | |
| { | |
| "epoch": 1.5104039167686658, | |
| "grad_norm": 0.236283540725708, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3572, | |
| "num_tokens": 110000996.0, | |
| "step": 1234 | |
| }, | |
| { | |
| "epoch": 1.5128518971848224, | |
| "grad_norm": 0.24623404443264008, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3674, | |
| "num_tokens": 110581988.0, | |
| "step": 1236 | |
| }, | |
| { | |
| "epoch": 1.5152998776009792, | |
| "grad_norm": 0.24854424595832825, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3744, | |
| "num_tokens": 111174046.0, | |
| "step": 1238 | |
| }, | |
| { | |
| "epoch": 1.517747858017136, | |
| "grad_norm": 0.2561953365802765, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3777, | |
| "num_tokens": 111790906.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.5201958384332925, | |
| "grad_norm": 0.2569839358329773, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3751, | |
| "num_tokens": 112378426.0, | |
| "step": 1242 | |
| }, | |
| { | |
| "epoch": 1.522643818849449, | |
| "grad_norm": 0.24260175228118896, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3755, | |
| "num_tokens": 112972211.0, | |
| "step": 1244 | |
| }, | |
| { | |
| "epoch": 1.525091799265606, | |
| "grad_norm": 0.2720998227596283, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3644, | |
| "num_tokens": 113560888.0, | |
| "step": 1246 | |
| }, | |
| { | |
| "epoch": 1.5275397796817627, | |
| "grad_norm": 0.24294328689575195, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3652, | |
| "num_tokens": 114131545.0, | |
| "step": 1248 | |
| }, | |
| { | |
| "epoch": 1.5299877600979193, | |
| "grad_norm": 0.24424785375595093, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3853, | |
| "num_tokens": 114724758.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.5324357405140758, | |
| "grad_norm": 0.2621991038322449, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3671, | |
| "num_tokens": 115299206.0, | |
| "step": 1252 | |
| }, | |
| { | |
| "epoch": 1.5348837209302326, | |
| "grad_norm": 0.2541300356388092, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3799, | |
| "num_tokens": 115885530.0, | |
| "step": 1254 | |
| }, | |
| { | |
| "epoch": 1.5373317013463892, | |
| "grad_norm": 0.26225703954696655, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3613, | |
| "num_tokens": 116448812.0, | |
| "step": 1256 | |
| }, | |
| { | |
| "epoch": 1.5397796817625458, | |
| "grad_norm": 0.23766973614692688, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3748, | |
| "num_tokens": 117055396.0, | |
| "step": 1258 | |
| }, | |
| { | |
| "epoch": 1.5422276621787026, | |
| "grad_norm": 0.24437591433525085, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3619, | |
| "num_tokens": 117646869.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.5446756425948593, | |
| "grad_norm": 0.2565830647945404, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3723, | |
| "num_tokens": 118208889.0, | |
| "step": 1262 | |
| }, | |
| { | |
| "epoch": 1.547123623011016, | |
| "grad_norm": 0.24650581181049347, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3749, | |
| "num_tokens": 118791050.0, | |
| "step": 1264 | |
| }, | |
| { | |
| "epoch": 1.5495716034271725, | |
| "grad_norm": 0.24814392626285553, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3684, | |
| "num_tokens": 119365749.0, | |
| "step": 1266 | |
| }, | |
| { | |
| "epoch": 1.5520195838433293, | |
| "grad_norm": 0.24582888185977936, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3555, | |
| "num_tokens": 119950880.0, | |
| "step": 1268 | |
| }, | |
| { | |
| "epoch": 1.554467564259486, | |
| "grad_norm": 0.24036677181720734, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3766, | |
| "num_tokens": 120565071.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.5569155446756426, | |
| "grad_norm": 0.23649543523788452, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3727, | |
| "num_tokens": 121183680.0, | |
| "step": 1272 | |
| }, | |
| { | |
| "epoch": 1.5593635250917992, | |
| "grad_norm": 0.23108918964862823, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3665, | |
| "num_tokens": 121800058.0, | |
| "step": 1274 | |
| }, | |
| { | |
| "epoch": 1.561811505507956, | |
| "grad_norm": 0.25551703572273254, | |
| "learning_rate": 1e-05, | |
| "loss": 0.373, | |
| "num_tokens": 122392911.0, | |
| "step": 1276 | |
| }, | |
| { | |
| "epoch": 1.5642594859241126, | |
| "grad_norm": 0.2403259128332138, | |
| "learning_rate": 1e-05, | |
| "loss": 0.371, | |
| "num_tokens": 122994876.0, | |
| "step": 1278 | |
| }, | |
| { | |
| "epoch": 1.5667074663402691, | |
| "grad_norm": 0.23855362832546234, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3758, | |
| "num_tokens": 123607985.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.569155446756426, | |
| "grad_norm": 0.24700097739696503, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3805, | |
| "num_tokens": 124174095.0, | |
| "step": 1282 | |
| }, | |
| { | |
| "epoch": 1.5716034271725827, | |
| "grad_norm": 0.2535829246044159, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3706, | |
| "num_tokens": 124758917.0, | |
| "step": 1284 | |
| }, | |
| { | |
| "epoch": 1.5740514075887393, | |
| "grad_norm": 0.24376705288887024, | |
| "learning_rate": 1e-05, | |
| "loss": 0.371, | |
| "num_tokens": 125354644.0, | |
| "step": 1286 | |
| }, | |
| { | |
| "epoch": 1.5764993880048959, | |
| "grad_norm": 0.24517321586608887, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3811, | |
| "num_tokens": 125955891.0, | |
| "step": 1288 | |
| }, | |
| { | |
| "epoch": 1.5789473684210527, | |
| "grad_norm": 0.24353429675102234, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3672, | |
| "num_tokens": 126576567.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.5813953488372094, | |
| "grad_norm": 0.24050772190093994, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3746, | |
| "num_tokens": 127142553.0, | |
| "step": 1292 | |
| }, | |
| { | |
| "epoch": 1.583843329253366, | |
| "grad_norm": 0.24689637124538422, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3817, | |
| "num_tokens": 127758123.0, | |
| "step": 1294 | |
| }, | |
| { | |
| "epoch": 1.5862913096695226, | |
| "grad_norm": 0.24711348116397858, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3575, | |
| "num_tokens": 128370312.0, | |
| "step": 1296 | |
| }, | |
| { | |
| "epoch": 1.5887392900856794, | |
| "grad_norm": 0.23984448611736298, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3677, | |
| "num_tokens": 128972638.0, | |
| "step": 1298 | |
| }, | |
| { | |
| "epoch": 1.591187270501836, | |
| "grad_norm": 0.2484259158372879, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3755, | |
| "num_tokens": 129572913.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.5936352509179925, | |
| "grad_norm": 0.2364276647567749, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3747, | |
| "num_tokens": 130195765.0, | |
| "step": 1302 | |
| }, | |
| { | |
| "epoch": 1.5960832313341493, | |
| "grad_norm": 0.24387337267398834, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3745, | |
| "num_tokens": 130789536.0, | |
| "step": 1304 | |
| }, | |
| { | |
| "epoch": 1.598531211750306, | |
| "grad_norm": 0.23871919512748718, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3714, | |
| "num_tokens": 131403745.0, | |
| "step": 1306 | |
| }, | |
| { | |
| "epoch": 1.6009791921664627, | |
| "grad_norm": 0.25457867980003357, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3686, | |
| "num_tokens": 131969389.0, | |
| "step": 1308 | |
| }, | |
| { | |
| "epoch": 1.6034271725826192, | |
| "grad_norm": 0.2390459179878235, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3736, | |
| "num_tokens": 132571128.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.605875152998776, | |
| "grad_norm": 0.24587389826774597, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3733, | |
| "num_tokens": 133160154.0, | |
| "step": 1312 | |
| }, | |
| { | |
| "epoch": 1.6083231334149328, | |
| "grad_norm": 0.2370455414056778, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3665, | |
| "num_tokens": 133735784.0, | |
| "step": 1314 | |
| }, | |
| { | |
| "epoch": 1.6107711138310894, | |
| "grad_norm": 0.24686893820762634, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3815, | |
| "num_tokens": 134327883.0, | |
| "step": 1316 | |
| }, | |
| { | |
| "epoch": 1.613219094247246, | |
| "grad_norm": 0.24591784179210663, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3794, | |
| "num_tokens": 134922195.0, | |
| "step": 1318 | |
| }, | |
| { | |
| "epoch": 1.6156670746634028, | |
| "grad_norm": 0.24312755465507507, | |
| "learning_rate": 1e-05, | |
| "loss": 0.373, | |
| "num_tokens": 135497704.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.6181150550795593, | |
| "grad_norm": 0.24310767650604248, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3702, | |
| "num_tokens": 136090481.0, | |
| "step": 1322 | |
| }, | |
| { | |
| "epoch": 1.620563035495716, | |
| "grad_norm": 0.2369573563337326, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3681, | |
| "num_tokens": 136691129.0, | |
| "step": 1324 | |
| }, | |
| { | |
| "epoch": 1.6230110159118727, | |
| "grad_norm": 0.2429567277431488, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3672, | |
| "num_tokens": 137299855.0, | |
| "step": 1326 | |
| }, | |
| { | |
| "epoch": 1.6254589963280295, | |
| "grad_norm": 0.271967351436615, | |
| "learning_rate": 1e-05, | |
| "loss": 0.362, | |
| "num_tokens": 137882026.0, | |
| "step": 1328 | |
| }, | |
| { | |
| "epoch": 1.627906976744186, | |
| "grad_norm": 0.2503095269203186, | |
| "learning_rate": 1e-05, | |
| "loss": 0.371, | |
| "num_tokens": 138449440.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.6303549571603426, | |
| "grad_norm": 0.23528578877449036, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3693, | |
| "num_tokens": 139040161.0, | |
| "step": 1332 | |
| }, | |
| { | |
| "epoch": 1.6328029375764994, | |
| "grad_norm": 0.2523113489151001, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3852, | |
| "num_tokens": 139625197.0, | |
| "step": 1334 | |
| }, | |
| { | |
| "epoch": 1.6352509179926562, | |
| "grad_norm": 0.23514311015605927, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3643, | |
| "num_tokens": 140210445.0, | |
| "step": 1336 | |
| }, | |
| { | |
| "epoch": 1.6376988984088128, | |
| "grad_norm": 0.2515263557434082, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3786, | |
| "num_tokens": 140794059.0, | |
| "step": 1338 | |
| }, | |
| { | |
| "epoch": 1.6401468788249693, | |
| "grad_norm": 0.23876678943634033, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3682, | |
| "num_tokens": 141373540.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.6425948592411261, | |
| "grad_norm": 0.24048742651939392, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3635, | |
| "num_tokens": 141952741.0, | |
| "step": 1342 | |
| }, | |
| { | |
| "epoch": 1.6450428396572827, | |
| "grad_norm": 0.24254868924617767, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3712, | |
| "num_tokens": 142536772.0, | |
| "step": 1344 | |
| }, | |
| { | |
| "epoch": 1.6474908200734393, | |
| "grad_norm": 0.2360367327928543, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3584, | |
| "num_tokens": 143109958.0, | |
| "step": 1346 | |
| }, | |
| { | |
| "epoch": 1.649938800489596, | |
| "grad_norm": 0.23876917362213135, | |
| "learning_rate": 1e-05, | |
| "loss": 0.372, | |
| "num_tokens": 143683599.0, | |
| "step": 1348 | |
| }, | |
| { | |
| "epoch": 1.6523867809057529, | |
| "grad_norm": 0.24206481873989105, | |
| "learning_rate": 1e-05, | |
| "loss": 0.366, | |
| "num_tokens": 144275321.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.6548347613219094, | |
| "grad_norm": 0.2546513080596924, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3589, | |
| "num_tokens": 144841340.0, | |
| "step": 1352 | |
| }, | |
| { | |
| "epoch": 1.657282741738066, | |
| "grad_norm": 0.25866273045539856, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3581, | |
| "num_tokens": 145428192.0, | |
| "step": 1354 | |
| }, | |
| { | |
| "epoch": 1.6597307221542228, | |
| "grad_norm": 0.244053915143013, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3881, | |
| "num_tokens": 146015954.0, | |
| "step": 1356 | |
| }, | |
| { | |
| "epoch": 1.6621787025703796, | |
| "grad_norm": 0.24577046930789948, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3563, | |
| "num_tokens": 146597823.0, | |
| "step": 1358 | |
| }, | |
| { | |
| "epoch": 1.6646266829865362, | |
| "grad_norm": 0.23734845221042633, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3627, | |
| "num_tokens": 147191387.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.6670746634026927, | |
| "grad_norm": 0.2565036416053772, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3883, | |
| "num_tokens": 147783308.0, | |
| "step": 1362 | |
| }, | |
| { | |
| "epoch": 1.6695226438188495, | |
| "grad_norm": 0.2334887683391571, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3743, | |
| "num_tokens": 148375364.0, | |
| "step": 1364 | |
| }, | |
| { | |
| "epoch": 1.671970624235006, | |
| "grad_norm": 0.24507057666778564, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3708, | |
| "num_tokens": 148967447.0, | |
| "step": 1366 | |
| }, | |
| { | |
| "epoch": 1.6744186046511627, | |
| "grad_norm": 0.2379869818687439, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3764, | |
| "num_tokens": 149559323.0, | |
| "step": 1368 | |
| }, | |
| { | |
| "epoch": 1.6768665850673194, | |
| "grad_norm": 0.2732064127922058, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3711, | |
| "num_tokens": 150158225.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.6793145654834762, | |
| "grad_norm": 0.23710450530052185, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3713, | |
| "num_tokens": 150736029.0, | |
| "step": 1372 | |
| }, | |
| { | |
| "epoch": 1.6817625458996328, | |
| "grad_norm": 0.24757219851016998, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3787, | |
| "num_tokens": 151320035.0, | |
| "step": 1374 | |
| }, | |
| { | |
| "epoch": 1.6842105263157894, | |
| "grad_norm": 0.24102835357189178, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3784, | |
| "num_tokens": 151877797.0, | |
| "step": 1376 | |
| }, | |
| { | |
| "epoch": 1.6866585067319462, | |
| "grad_norm": 0.2291739284992218, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3744, | |
| "num_tokens": 152494821.0, | |
| "step": 1378 | |
| }, | |
| { | |
| "epoch": 1.689106487148103, | |
| "grad_norm": 0.2373509258031845, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3699, | |
| "num_tokens": 153103512.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.6915544675642595, | |
| "grad_norm": 0.23789846897125244, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3752, | |
| "num_tokens": 153698813.0, | |
| "step": 1382 | |
| }, | |
| { | |
| "epoch": 1.694002447980416, | |
| "grad_norm": 0.2423292100429535, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3645, | |
| "num_tokens": 154272685.0, | |
| "step": 1384 | |
| }, | |
| { | |
| "epoch": 1.696450428396573, | |
| "grad_norm": 0.23416705429553986, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3538, | |
| "num_tokens": 154812070.0, | |
| "step": 1386 | |
| }, | |
| { | |
| "epoch": 1.6988984088127295, | |
| "grad_norm": 0.25672298669815063, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3666, | |
| "num_tokens": 155392151.0, | |
| "step": 1388 | |
| }, | |
| { | |
| "epoch": 1.701346389228886, | |
| "grad_norm": 0.23592020571231842, | |
| "learning_rate": 1e-05, | |
| "loss": 0.372, | |
| "num_tokens": 156034757.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.7037943696450428, | |
| "grad_norm": 0.2403942346572876, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3644, | |
| "num_tokens": 156628474.0, | |
| "step": 1392 | |
| }, | |
| { | |
| "epoch": 1.7062423500611996, | |
| "grad_norm": 0.2440410554409027, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3675, | |
| "num_tokens": 157204254.0, | |
| "step": 1394 | |
| }, | |
| { | |
| "epoch": 1.7086903304773562, | |
| "grad_norm": 0.2393968254327774, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3774, | |
| "num_tokens": 157793450.0, | |
| "step": 1396 | |
| }, | |
| { | |
| "epoch": 1.7111383108935128, | |
| "grad_norm": 0.24635456502437592, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3611, | |
| "num_tokens": 158389128.0, | |
| "step": 1398 | |
| }, | |
| { | |
| "epoch": 1.7135862913096696, | |
| "grad_norm": 0.2437032163143158, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3726, | |
| "num_tokens": 158958965.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.7160342717258263, | |
| "grad_norm": 0.24211743474006653, | |
| "learning_rate": 1e-05, | |
| "loss": 0.363, | |
| "num_tokens": 159568000.0, | |
| "step": 1402 | |
| }, | |
| { | |
| "epoch": 1.718482252141983, | |
| "grad_norm": 0.23698055744171143, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3695, | |
| "num_tokens": 160156477.0, | |
| "step": 1404 | |
| }, | |
| { | |
| "epoch": 1.7209302325581395, | |
| "grad_norm": 0.24990005791187286, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3733, | |
| "num_tokens": 160754592.0, | |
| "step": 1406 | |
| }, | |
| { | |
| "epoch": 1.7233782129742963, | |
| "grad_norm": 0.2389305830001831, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3503, | |
| "num_tokens": 161339326.0, | |
| "step": 1408 | |
| }, | |
| { | |
| "epoch": 1.7258261933904528, | |
| "grad_norm": 0.23870104551315308, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3547, | |
| "num_tokens": 161958750.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.7282741738066094, | |
| "grad_norm": 0.24630923569202423, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3695, | |
| "num_tokens": 162547514.0, | |
| "step": 1412 | |
| }, | |
| { | |
| "epoch": 1.7307221542227662, | |
| "grad_norm": 0.24691331386566162, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3755, | |
| "num_tokens": 163145389.0, | |
| "step": 1414 | |
| }, | |
| { | |
| "epoch": 1.733170134638923, | |
| "grad_norm": 0.2594568431377411, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3538, | |
| "num_tokens": 163745928.0, | |
| "step": 1416 | |
| }, | |
| { | |
| "epoch": 1.7356181150550796, | |
| "grad_norm": 0.24012570083141327, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3791, | |
| "num_tokens": 164340009.0, | |
| "step": 1418 | |
| }, | |
| { | |
| "epoch": 1.7380660954712361, | |
| "grad_norm": 0.2581995725631714, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3626, | |
| "num_tokens": 164941110.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.740514075887393, | |
| "grad_norm": 0.25723937153816223, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3748, | |
| "num_tokens": 165514409.0, | |
| "step": 1422 | |
| }, | |
| { | |
| "epoch": 1.7429620563035497, | |
| "grad_norm": 0.2481413334608078, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3854, | |
| "num_tokens": 166119773.0, | |
| "step": 1424 | |
| }, | |
| { | |
| "epoch": 1.7454100367197063, | |
| "grad_norm": 0.22994013130664825, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3858, | |
| "num_tokens": 166747342.0, | |
| "step": 1426 | |
| }, | |
| { | |
| "epoch": 1.7478580171358629, | |
| "grad_norm": 0.23820635676383972, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3678, | |
| "num_tokens": 167339222.0, | |
| "step": 1428 | |
| }, | |
| { | |
| "epoch": 1.7503059975520197, | |
| "grad_norm": 0.24489940702915192, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3644, | |
| "num_tokens": 167907963.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.7527539779681762, | |
| "grad_norm": 0.24379844963550568, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3646, | |
| "num_tokens": 168490594.0, | |
| "step": 1432 | |
| }, | |
| { | |
| "epoch": 1.7552019583843328, | |
| "grad_norm": 0.23336175084114075, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3716, | |
| "num_tokens": 169098140.0, | |
| "step": 1434 | |
| }, | |
| { | |
| "epoch": 1.7576499388004896, | |
| "grad_norm": 0.24837198853492737, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3601, | |
| "num_tokens": 169673058.0, | |
| "step": 1436 | |
| }, | |
| { | |
| "epoch": 1.7600979192166464, | |
| "grad_norm": 0.24629239737987518, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3711, | |
| "num_tokens": 170258149.0, | |
| "step": 1438 | |
| }, | |
| { | |
| "epoch": 1.762545899632803, | |
| "grad_norm": 0.2353641837835312, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3727, | |
| "num_tokens": 170855087.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.7649938800489595, | |
| "grad_norm": 0.25406232476234436, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3829, | |
| "num_tokens": 171460109.0, | |
| "step": 1442 | |
| }, | |
| { | |
| "epoch": 1.7674418604651163, | |
| "grad_norm": 0.24506038427352905, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3675, | |
| "num_tokens": 172064062.0, | |
| "step": 1444 | |
| }, | |
| { | |
| "epoch": 1.769889840881273, | |
| "grad_norm": 0.24228806793689728, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3794, | |
| "num_tokens": 172650152.0, | |
| "step": 1446 | |
| }, | |
| { | |
| "epoch": 1.7723378212974297, | |
| "grad_norm": 0.2572336494922638, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3768, | |
| "num_tokens": 173241018.0, | |
| "step": 1448 | |
| }, | |
| { | |
| "epoch": 1.7747858017135862, | |
| "grad_norm": 0.23749330639839172, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3773, | |
| "num_tokens": 173857068.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.777233782129743, | |
| "grad_norm": 0.24634265899658203, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3739, | |
| "num_tokens": 174469774.0, | |
| "step": 1452 | |
| }, | |
| { | |
| "epoch": 1.7796817625458996, | |
| "grad_norm": 0.22762316465377808, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3761, | |
| "num_tokens": 175084326.0, | |
| "step": 1454 | |
| }, | |
| { | |
| "epoch": 1.7821297429620562, | |
| "grad_norm": 0.24187412858009338, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3613, | |
| "num_tokens": 175650143.0, | |
| "step": 1456 | |
| }, | |
| { | |
| "epoch": 1.784577723378213, | |
| "grad_norm": 0.24004510045051575, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3675, | |
| "num_tokens": 176248086.0, | |
| "step": 1458 | |
| }, | |
| { | |
| "epoch": 1.7870257037943698, | |
| "grad_norm": 0.24396033585071564, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3627, | |
| "num_tokens": 176851627.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.7894736842105263, | |
| "grad_norm": 0.2826113998889923, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3886, | |
| "num_tokens": 177453579.0, | |
| "step": 1462 | |
| }, | |
| { | |
| "epoch": 1.791921664626683, | |
| "grad_norm": 0.250384658575058, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3781, | |
| "num_tokens": 178065458.0, | |
| "step": 1464 | |
| }, | |
| { | |
| "epoch": 1.7943696450428397, | |
| "grad_norm": 0.2513806223869324, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3667, | |
| "num_tokens": 178644461.0, | |
| "step": 1466 | |
| }, | |
| { | |
| "epoch": 1.7968176254589965, | |
| "grad_norm": 0.25463229417800903, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3703, | |
| "num_tokens": 179220166.0, | |
| "step": 1468 | |
| }, | |
| { | |
| "epoch": 1.799265605875153, | |
| "grad_norm": 0.2267390787601471, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3628, | |
| "num_tokens": 179824923.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.8017135862913096, | |
| "grad_norm": 0.2407565414905548, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3667, | |
| "num_tokens": 180430779.0, | |
| "step": 1472 | |
| }, | |
| { | |
| "epoch": 1.8041615667074664, | |
| "grad_norm": 0.2538115978240967, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3713, | |
| "num_tokens": 181015860.0, | |
| "step": 1474 | |
| }, | |
| { | |
| "epoch": 1.806609547123623, | |
| "grad_norm": 0.2532234489917755, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3662, | |
| "num_tokens": 181597275.0, | |
| "step": 1476 | |
| }, | |
| { | |
| "epoch": 1.8090575275397796, | |
| "grad_norm": 0.24419713020324707, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3673, | |
| "num_tokens": 182201474.0, | |
| "step": 1478 | |
| }, | |
| { | |
| "epoch": 1.8115055079559363, | |
| "grad_norm": 0.2511015236377716, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3765, | |
| "num_tokens": 182770589.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.8139534883720931, | |
| "grad_norm": 0.27226755023002625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3628, | |
| "num_tokens": 183366383.0, | |
| "step": 1482 | |
| }, | |
| { | |
| "epoch": 1.8164014687882497, | |
| "grad_norm": 0.2558925449848175, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3724, | |
| "num_tokens": 183965838.0, | |
| "step": 1484 | |
| }, | |
| { | |
| "epoch": 1.8188494492044063, | |
| "grad_norm": 0.2294636219739914, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3692, | |
| "num_tokens": 184558859.0, | |
| "step": 1486 | |
| }, | |
| { | |
| "epoch": 1.821297429620563, | |
| "grad_norm": 0.26806262135505676, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3619, | |
| "num_tokens": 185170502.0, | |
| "step": 1488 | |
| }, | |
| { | |
| "epoch": 1.8237454100367199, | |
| "grad_norm": 0.255285382270813, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3525, | |
| "num_tokens": 185773232.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.8261933904528764, | |
| "grad_norm": 0.23820726573467255, | |
| "learning_rate": 1e-05, | |
| "loss": 0.367, | |
| "num_tokens": 186363042.0, | |
| "step": 1492 | |
| }, | |
| { | |
| "epoch": 1.828641370869033, | |
| "grad_norm": 0.2516627013683319, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3726, | |
| "num_tokens": 186970258.0, | |
| "step": 1494 | |
| }, | |
| { | |
| "epoch": 1.8310893512851898, | |
| "grad_norm": 0.24903468787670135, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3735, | |
| "num_tokens": 187562607.0, | |
| "step": 1496 | |
| }, | |
| { | |
| "epoch": 1.8335373317013464, | |
| "grad_norm": 0.2585706114768982, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3631, | |
| "num_tokens": 188147440.0, | |
| "step": 1498 | |
| }, | |
| { | |
| "epoch": 1.835985312117503, | |
| "grad_norm": 0.23948891460895538, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3707, | |
| "num_tokens": 188760737.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.8384332925336597, | |
| "grad_norm": 0.23523059487342834, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3668, | |
| "num_tokens": 189331744.0, | |
| "step": 1502 | |
| }, | |
| { | |
| "epoch": 1.8408812729498165, | |
| "grad_norm": 0.23200415074825287, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3563, | |
| "num_tokens": 189912006.0, | |
| "step": 1504 | |
| }, | |
| { | |
| "epoch": 1.843329253365973, | |
| "grad_norm": 0.24164701998233795, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3678, | |
| "num_tokens": 190507560.0, | |
| "step": 1506 | |
| }, | |
| { | |
| "epoch": 1.8457772337821297, | |
| "grad_norm": 0.24801412224769592, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3637, | |
| "num_tokens": 191119916.0, | |
| "step": 1508 | |
| }, | |
| { | |
| "epoch": 1.8482252141982864, | |
| "grad_norm": 0.2424629032611847, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3846, | |
| "num_tokens": 191696881.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.8506731946144432, | |
| "grad_norm": 0.2434435933828354, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3485, | |
| "num_tokens": 192250894.0, | |
| "step": 1512 | |
| }, | |
| { | |
| "epoch": 1.8531211750305998, | |
| "grad_norm": 0.22857359051704407, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3679, | |
| "num_tokens": 192868098.0, | |
| "step": 1514 | |
| }, | |
| { | |
| "epoch": 1.8555691554467564, | |
| "grad_norm": 0.2418743371963501, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3452, | |
| "num_tokens": 193432178.0, | |
| "step": 1516 | |
| }, | |
| { | |
| "epoch": 1.8580171358629132, | |
| "grad_norm": 0.25751784443855286, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3564, | |
| "num_tokens": 193997484.0, | |
| "step": 1518 | |
| }, | |
| { | |
| "epoch": 1.8604651162790697, | |
| "grad_norm": 0.23958885669708252, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3717, | |
| "num_tokens": 194552018.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.8629130966952263, | |
| "grad_norm": 0.2377503365278244, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3622, | |
| "num_tokens": 195142270.0, | |
| "step": 1522 | |
| }, | |
| { | |
| "epoch": 1.865361077111383, | |
| "grad_norm": 0.24225211143493652, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3794, | |
| "num_tokens": 195742052.0, | |
| "step": 1524 | |
| }, | |
| { | |
| "epoch": 1.86780905752754, | |
| "grad_norm": 0.23690791428089142, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3857, | |
| "num_tokens": 196359878.0, | |
| "step": 1526 | |
| }, | |
| { | |
| "epoch": 1.8702570379436965, | |
| "grad_norm": 0.23414862155914307, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3689, | |
| "num_tokens": 196973431.0, | |
| "step": 1528 | |
| }, | |
| { | |
| "epoch": 1.872705018359853, | |
| "grad_norm": 0.23633243143558502, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3593, | |
| "num_tokens": 197557990.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.8751529987760098, | |
| "grad_norm": 0.24126210808753967, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3838, | |
| "num_tokens": 198162137.0, | |
| "step": 1532 | |
| }, | |
| { | |
| "epoch": 1.8776009791921666, | |
| "grad_norm": 0.24134069681167603, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3589, | |
| "num_tokens": 198764078.0, | |
| "step": 1534 | |
| }, | |
| { | |
| "epoch": 1.880048959608323, | |
| "grad_norm": 0.25478363037109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3653, | |
| "num_tokens": 199331133.0, | |
| "step": 1536 | |
| }, | |
| { | |
| "epoch": 1.8824969400244798, | |
| "grad_norm": 0.24682196974754333, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3779, | |
| "num_tokens": 199936917.0, | |
| "step": 1538 | |
| }, | |
| { | |
| "epoch": 1.8849449204406366, | |
| "grad_norm": 0.24788504838943481, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3676, | |
| "num_tokens": 200517595.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.8873929008567931, | |
| "grad_norm": 0.23016424477100372, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3654, | |
| "num_tokens": 201124880.0, | |
| "step": 1542 | |
| }, | |
| { | |
| "epoch": 1.8898408812729497, | |
| "grad_norm": 0.24088110029697418, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3627, | |
| "num_tokens": 201716008.0, | |
| "step": 1544 | |
| }, | |
| { | |
| "epoch": 1.8922888616891065, | |
| "grad_norm": 0.25025540590286255, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3726, | |
| "num_tokens": 202315279.0, | |
| "step": 1546 | |
| }, | |
| { | |
| "epoch": 1.8947368421052633, | |
| "grad_norm": 0.2488720566034317, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3773, | |
| "num_tokens": 202876589.0, | |
| "step": 1548 | |
| }, | |
| { | |
| "epoch": 1.8971848225214198, | |
| "grad_norm": 0.24178265035152435, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3653, | |
| "num_tokens": 203467310.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.8996328029375764, | |
| "grad_norm": 0.23375409841537476, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3683, | |
| "num_tokens": 204058668.0, | |
| "step": 1552 | |
| }, | |
| { | |
| "epoch": 1.9020807833537332, | |
| "grad_norm": 0.24636210501194, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3716, | |
| "num_tokens": 204627426.0, | |
| "step": 1554 | |
| }, | |
| { | |
| "epoch": 1.90452876376989, | |
| "grad_norm": 0.22735662758350372, | |
| "learning_rate": 1e-05, | |
| "loss": 0.379, | |
| "num_tokens": 205252881.0, | |
| "step": 1556 | |
| }, | |
| { | |
| "epoch": 1.9069767441860463, | |
| "grad_norm": 0.2275749295949936, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3667, | |
| "num_tokens": 205833656.0, | |
| "step": 1558 | |
| }, | |
| { | |
| "epoch": 1.9094247246022031, | |
| "grad_norm": 0.23169246315956116, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3688, | |
| "num_tokens": 206439536.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.91187270501836, | |
| "grad_norm": 0.24443915486335754, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3779, | |
| "num_tokens": 207024878.0, | |
| "step": 1562 | |
| }, | |
| { | |
| "epoch": 1.9143206854345165, | |
| "grad_norm": 0.2206883579492569, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3691, | |
| "num_tokens": 207630287.0, | |
| "step": 1564 | |
| }, | |
| { | |
| "epoch": 1.916768665850673, | |
| "grad_norm": 0.23586352169513702, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3675, | |
| "num_tokens": 208197538.0, | |
| "step": 1566 | |
| }, | |
| { | |
| "epoch": 1.9192166462668299, | |
| "grad_norm": 0.245608851313591, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3704, | |
| "num_tokens": 208789057.0, | |
| "step": 1568 | |
| }, | |
| { | |
| "epoch": 1.9216646266829867, | |
| "grad_norm": 0.2444605678319931, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3701, | |
| "num_tokens": 209369955.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.9241126070991432, | |
| "grad_norm": 0.23837168514728546, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3734, | |
| "num_tokens": 209961210.0, | |
| "step": 1572 | |
| }, | |
| { | |
| "epoch": 1.9265605875152998, | |
| "grad_norm": 0.24565783143043518, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3595, | |
| "num_tokens": 210538072.0, | |
| "step": 1574 | |
| }, | |
| { | |
| "epoch": 1.9290085679314566, | |
| "grad_norm": 0.2464468628168106, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3758, | |
| "num_tokens": 211099323.0, | |
| "step": 1576 | |
| }, | |
| { | |
| "epoch": 1.9314565483476134, | |
| "grad_norm": 0.23516030609607697, | |
| "learning_rate": 1e-05, | |
| "loss": 0.373, | |
| "num_tokens": 211721276.0, | |
| "step": 1578 | |
| }, | |
| { | |
| "epoch": 1.9339045287637697, | |
| "grad_norm": 0.2265416979789734, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3702, | |
| "num_tokens": 212324805.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.9363525091799265, | |
| "grad_norm": 0.2524133324623108, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3804, | |
| "num_tokens": 212953032.0, | |
| "step": 1582 | |
| }, | |
| { | |
| "epoch": 1.9388004895960833, | |
| "grad_norm": 0.2479136735200882, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3771, | |
| "num_tokens": 213557991.0, | |
| "step": 1584 | |
| }, | |
| { | |
| "epoch": 1.9412484700122399, | |
| "grad_norm": 0.25407838821411133, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3775, | |
| "num_tokens": 214170512.0, | |
| "step": 1586 | |
| }, | |
| { | |
| "epoch": 1.9436964504283964, | |
| "grad_norm": 0.23423869907855988, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3796, | |
| "num_tokens": 214778571.0, | |
| "step": 1588 | |
| }, | |
| { | |
| "epoch": 1.9461444308445532, | |
| "grad_norm": 0.2350510060787201, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3767, | |
| "num_tokens": 215370495.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.94859241126071, | |
| "grad_norm": 0.2388015240430832, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3621, | |
| "num_tokens": 215962798.0, | |
| "step": 1592 | |
| }, | |
| { | |
| "epoch": 1.9510403916768666, | |
| "grad_norm": 0.24582920968532562, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3785, | |
| "num_tokens": 216535476.0, | |
| "step": 1594 | |
| }, | |
| { | |
| "epoch": 1.9534883720930232, | |
| "grad_norm": 0.22780659794807434, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3516, | |
| "num_tokens": 217157260.0, | |
| "step": 1596 | |
| }, | |
| { | |
| "epoch": 1.95593635250918, | |
| "grad_norm": 0.24787680804729462, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3719, | |
| "num_tokens": 217759614.0, | |
| "step": 1598 | |
| }, | |
| { | |
| "epoch": 1.9583843329253368, | |
| "grad_norm": 0.22913233935832977, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3724, | |
| "num_tokens": 218354172.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.960832313341493, | |
| "grad_norm": 0.256535142660141, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3625, | |
| "num_tokens": 218928228.0, | |
| "step": 1602 | |
| }, | |
| { | |
| "epoch": 1.96328029375765, | |
| "grad_norm": 0.35804998874664307, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3726, | |
| "num_tokens": 219538782.0, | |
| "step": 1604 | |
| }, | |
| { | |
| "epoch": 1.9657282741738067, | |
| "grad_norm": 0.24214068055152893, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3662, | |
| "num_tokens": 220109485.0, | |
| "step": 1606 | |
| }, | |
| { | |
| "epoch": 1.9681762545899633, | |
| "grad_norm": 0.2461036592721939, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3644, | |
| "num_tokens": 220702847.0, | |
| "step": 1608 | |
| }, | |
| { | |
| "epoch": 1.9706242350061198, | |
| "grad_norm": 0.22849348187446594, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3697, | |
| "num_tokens": 221307344.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.9730722154222766, | |
| "grad_norm": 0.24049215018749237, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3726, | |
| "num_tokens": 221897755.0, | |
| "step": 1612 | |
| }, | |
| { | |
| "epoch": 1.9755201958384334, | |
| "grad_norm": 0.22864216566085815, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3755, | |
| "num_tokens": 222502665.0, | |
| "step": 1614 | |
| }, | |
| { | |
| "epoch": 1.97796817625459, | |
| "grad_norm": 0.24188441038131714, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3713, | |
| "num_tokens": 223105542.0, | |
| "step": 1616 | |
| }, | |
| { | |
| "epoch": 1.9804161566707466, | |
| "grad_norm": 0.2464837282896042, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3683, | |
| "num_tokens": 223664313.0, | |
| "step": 1618 | |
| }, | |
| { | |
| "epoch": 1.9828641370869033, | |
| "grad_norm": 0.25046202540397644, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3715, | |
| "num_tokens": 224249672.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.9853121175030601, | |
| "grad_norm": 0.2353782206773758, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3673, | |
| "num_tokens": 224831961.0, | |
| "step": 1622 | |
| }, | |
| { | |
| "epoch": 1.9877600979192165, | |
| "grad_norm": 0.24083220958709717, | |
| "learning_rate": 1e-05, | |
| "loss": 0.351, | |
| "num_tokens": 225377557.0, | |
| "step": 1624 | |
| }, | |
| { | |
| "epoch": 1.9902080783353733, | |
| "grad_norm": 0.22877903282642365, | |
| "learning_rate": 1e-05, | |
| "loss": 0.361, | |
| "num_tokens": 225983898.0, | |
| "step": 1626 | |
| }, | |
| { | |
| "epoch": 1.99265605875153, | |
| "grad_norm": 0.23952309787273407, | |
| "learning_rate": 1e-05, | |
| "loss": 0.357, | |
| "num_tokens": 226571294.0, | |
| "step": 1628 | |
| }, | |
| { | |
| "epoch": 1.9951040391676866, | |
| "grad_norm": 0.2325475960969925, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3581, | |
| "num_tokens": 227158234.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.9975520195838432, | |
| "grad_norm": 0.24239014089107513, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3603, | |
| "num_tokens": 227756083.0, | |
| "step": 1632 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.2470468282699585, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3632, | |
| "num_tokens": 228332108.0, | |
| "step": 1634 | |
| }, | |
| { | |
| "epoch": 2.002447980416157, | |
| "grad_norm": 0.26383376121520996, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3307, | |
| "num_tokens": 228926091.0, | |
| "step": 1636 | |
| }, | |
| { | |
| "epoch": 2.004895960832313, | |
| "grad_norm": 0.2823599576950073, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3223, | |
| "num_tokens": 229508232.0, | |
| "step": 1638 | |
| }, | |
| { | |
| "epoch": 2.00734394124847, | |
| "grad_norm": 0.2819001078605652, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3233, | |
| "num_tokens": 230109053.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 2.0097919216646267, | |
| "grad_norm": 0.2523273229598999, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3339, | |
| "num_tokens": 230732787.0, | |
| "step": 1642 | |
| }, | |
| { | |
| "epoch": 2.0122399020807835, | |
| "grad_norm": 0.25086861848831177, | |
| "learning_rate": 1e-05, | |
| "loss": 0.32, | |
| "num_tokens": 231324249.0, | |
| "step": 1644 | |
| }, | |
| { | |
| "epoch": 2.01468788249694, | |
| "grad_norm": 0.25949275493621826, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3149, | |
| "num_tokens": 231904953.0, | |
| "step": 1646 | |
| }, | |
| { | |
| "epoch": 2.0171358629130967, | |
| "grad_norm": 0.2776775360107422, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3173, | |
| "num_tokens": 232473855.0, | |
| "step": 1648 | |
| }, | |
| { | |
| "epoch": 2.0195838433292534, | |
| "grad_norm": 0.26470237970352173, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3242, | |
| "num_tokens": 233076215.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.0220318237454102, | |
| "grad_norm": 0.26274335384368896, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3218, | |
| "num_tokens": 233693874.0, | |
| "step": 1652 | |
| }, | |
| { | |
| "epoch": 2.0244798041615666, | |
| "grad_norm": 0.2498115599155426, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3168, | |
| "num_tokens": 234300479.0, | |
| "step": 1654 | |
| }, | |
| { | |
| "epoch": 2.0269277845777234, | |
| "grad_norm": 0.2655656635761261, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3162, | |
| "num_tokens": 234870651.0, | |
| "step": 1656 | |
| }, | |
| { | |
| "epoch": 2.02937576499388, | |
| "grad_norm": 0.2704806923866272, | |
| "learning_rate": 1e-05, | |
| "loss": 0.326, | |
| "num_tokens": 235462004.0, | |
| "step": 1658 | |
| }, | |
| { | |
| "epoch": 2.0318237454100365, | |
| "grad_norm": 0.2563600242137909, | |
| "learning_rate": 1e-05, | |
| "loss": 0.326, | |
| "num_tokens": 236065795.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 2.0342717258261933, | |
| "grad_norm": 0.2734050452709198, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3122, | |
| "num_tokens": 236638043.0, | |
| "step": 1662 | |
| }, | |
| { | |
| "epoch": 2.03671970624235, | |
| "grad_norm": 0.26698222756385803, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3129, | |
| "num_tokens": 237209148.0, | |
| "step": 1664 | |
| }, | |
| { | |
| "epoch": 2.039167686658507, | |
| "grad_norm": 0.27800020575523376, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3288, | |
| "num_tokens": 237782297.0, | |
| "step": 1666 | |
| }, | |
| { | |
| "epoch": 2.0416156670746632, | |
| "grad_norm": 0.2683752179145813, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3337, | |
| "num_tokens": 238370815.0, | |
| "step": 1668 | |
| }, | |
| { | |
| "epoch": 2.04406364749082, | |
| "grad_norm": 0.2758134603500366, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3203, | |
| "num_tokens": 238974735.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.046511627906977, | |
| "grad_norm": 0.2527683675289154, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3131, | |
| "num_tokens": 239582812.0, | |
| "step": 1672 | |
| }, | |
| { | |
| "epoch": 2.0489596083231336, | |
| "grad_norm": 0.2617241442203522, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3156, | |
| "num_tokens": 240153131.0, | |
| "step": 1674 | |
| }, | |
| { | |
| "epoch": 2.05140758873929, | |
| "grad_norm": 0.24523000419139862, | |
| "learning_rate": 1e-05, | |
| "loss": 0.318, | |
| "num_tokens": 240743051.0, | |
| "step": 1676 | |
| }, | |
| { | |
| "epoch": 2.0538555691554468, | |
| "grad_norm": 0.24731133878231049, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3186, | |
| "num_tokens": 241332255.0, | |
| "step": 1678 | |
| }, | |
| { | |
| "epoch": 2.0563035495716036, | |
| "grad_norm": 0.26008081436157227, | |
| "learning_rate": 1e-05, | |
| "loss": 0.32, | |
| "num_tokens": 241925955.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.05875152998776, | |
| "grad_norm": 0.24686473608016968, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3214, | |
| "num_tokens": 242559460.0, | |
| "step": 1682 | |
| }, | |
| { | |
| "epoch": 2.0611995104039167, | |
| "grad_norm": 0.2397872358560562, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3135, | |
| "num_tokens": 243147912.0, | |
| "step": 1684 | |
| }, | |
| { | |
| "epoch": 2.0636474908200735, | |
| "grad_norm": 0.2412048876285553, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3187, | |
| "num_tokens": 243737102.0, | |
| "step": 1686 | |
| }, | |
| { | |
| "epoch": 2.0660954712362303, | |
| "grad_norm": 0.2491815984249115, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3268, | |
| "num_tokens": 244379777.0, | |
| "step": 1688 | |
| }, | |
| { | |
| "epoch": 2.0685434516523866, | |
| "grad_norm": 0.2565453350543976, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3305, | |
| "num_tokens": 244978647.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 2.0709914320685434, | |
| "grad_norm": 0.24665790796279907, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3157, | |
| "num_tokens": 245555548.0, | |
| "step": 1692 | |
| }, | |
| { | |
| "epoch": 2.0734394124847, | |
| "grad_norm": 0.2575874924659729, | |
| "learning_rate": 1e-05, | |
| "loss": 0.32, | |
| "num_tokens": 246135906.0, | |
| "step": 1694 | |
| }, | |
| { | |
| "epoch": 2.075887392900857, | |
| "grad_norm": 0.2599528133869171, | |
| "learning_rate": 1e-05, | |
| "loss": 0.331, | |
| "num_tokens": 246714507.0, | |
| "step": 1696 | |
| }, | |
| { | |
| "epoch": 2.0783353733170133, | |
| "grad_norm": 0.246963232755661, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3178, | |
| "num_tokens": 247312796.0, | |
| "step": 1698 | |
| }, | |
| { | |
| "epoch": 2.08078335373317, | |
| "grad_norm": 0.2495131939649582, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3156, | |
| "num_tokens": 247899696.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.083231334149327, | |
| "grad_norm": 0.24859756231307983, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3216, | |
| "num_tokens": 248478103.0, | |
| "step": 1702 | |
| }, | |
| { | |
| "epoch": 2.0856793145654833, | |
| "grad_norm": 0.2642371356487274, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3244, | |
| "num_tokens": 249093078.0, | |
| "step": 1704 | |
| }, | |
| { | |
| "epoch": 2.08812729498164, | |
| "grad_norm": 0.25307220220565796, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3161, | |
| "num_tokens": 249671000.0, | |
| "step": 1706 | |
| }, | |
| { | |
| "epoch": 2.090575275397797, | |
| "grad_norm": 0.25887933373451233, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3085, | |
| "num_tokens": 250228631.0, | |
| "step": 1708 | |
| }, | |
| { | |
| "epoch": 2.0930232558139537, | |
| "grad_norm": 0.25659000873565674, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3319, | |
| "num_tokens": 250830771.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.09547123623011, | |
| "grad_norm": 0.27465444803237915, | |
| "learning_rate": 1e-05, | |
| "loss": 0.326, | |
| "num_tokens": 251414177.0, | |
| "step": 1712 | |
| }, | |
| { | |
| "epoch": 2.097919216646267, | |
| "grad_norm": 0.26044049859046936, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3165, | |
| "num_tokens": 251980472.0, | |
| "step": 1714 | |
| }, | |
| { | |
| "epoch": 2.1003671970624236, | |
| "grad_norm": 0.26657673716545105, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3195, | |
| "num_tokens": 252563419.0, | |
| "step": 1716 | |
| }, | |
| { | |
| "epoch": 2.1028151774785804, | |
| "grad_norm": 0.2445669025182724, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3169, | |
| "num_tokens": 253156291.0, | |
| "step": 1718 | |
| }, | |
| { | |
| "epoch": 2.1052631578947367, | |
| "grad_norm": 0.26667121052742004, | |
| "learning_rate": 1e-05, | |
| "loss": 0.326, | |
| "num_tokens": 253748542.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 2.1077111383108935, | |
| "grad_norm": 0.26954278349876404, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3291, | |
| "num_tokens": 254355454.0, | |
| "step": 1722 | |
| }, | |
| { | |
| "epoch": 2.1101591187270503, | |
| "grad_norm": 0.26194727420806885, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3193, | |
| "num_tokens": 254981618.0, | |
| "step": 1724 | |
| }, | |
| { | |
| "epoch": 2.1126070991432067, | |
| "grad_norm": 0.26653769612312317, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3249, | |
| "num_tokens": 255580818.0, | |
| "step": 1726 | |
| }, | |
| { | |
| "epoch": 2.1150550795593634, | |
| "grad_norm": 0.2530732750892639, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3231, | |
| "num_tokens": 256175654.0, | |
| "step": 1728 | |
| }, | |
| { | |
| "epoch": 2.1175030599755202, | |
| "grad_norm": 0.26466599106788635, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3149, | |
| "num_tokens": 256769129.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 2.119951040391677, | |
| "grad_norm": 0.2696734666824341, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3253, | |
| "num_tokens": 257368163.0, | |
| "step": 1732 | |
| }, | |
| { | |
| "epoch": 2.1223990208078334, | |
| "grad_norm": 0.2791430354118347, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3276, | |
| "num_tokens": 257979865.0, | |
| "step": 1734 | |
| }, | |
| { | |
| "epoch": 2.12484700122399, | |
| "grad_norm": 0.25057294964790344, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3208, | |
| "num_tokens": 258574921.0, | |
| "step": 1736 | |
| }, | |
| { | |
| "epoch": 2.127294981640147, | |
| "grad_norm": 0.250307559967041, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3161, | |
| "num_tokens": 259188694.0, | |
| "step": 1738 | |
| }, | |
| { | |
| "epoch": 2.1297429620563038, | |
| "grad_norm": 0.2579153776168823, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3215, | |
| "num_tokens": 259788088.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.13219094247246, | |
| "grad_norm": 0.25432637333869934, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3263, | |
| "num_tokens": 260355939.0, | |
| "step": 1742 | |
| }, | |
| { | |
| "epoch": 2.134638922888617, | |
| "grad_norm": 0.25369200110435486, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3245, | |
| "num_tokens": 260952386.0, | |
| "step": 1744 | |
| }, | |
| { | |
| "epoch": 2.1370869033047737, | |
| "grad_norm": 0.26250943541526794, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3149, | |
| "num_tokens": 261542760.0, | |
| "step": 1746 | |
| }, | |
| { | |
| "epoch": 2.13953488372093, | |
| "grad_norm": 0.2528480887413025, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3177, | |
| "num_tokens": 262121722.0, | |
| "step": 1748 | |
| }, | |
| { | |
| "epoch": 2.141982864137087, | |
| "grad_norm": 0.2538909316062927, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3291, | |
| "num_tokens": 262729764.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.1444308445532436, | |
| "grad_norm": 0.27961215376853943, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3355, | |
| "num_tokens": 263343702.0, | |
| "step": 1752 | |
| }, | |
| { | |
| "epoch": 2.1468788249694004, | |
| "grad_norm": 0.2544754445552826, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3193, | |
| "num_tokens": 263930203.0, | |
| "step": 1754 | |
| }, | |
| { | |
| "epoch": 2.1493268053855568, | |
| "grad_norm": 0.2512654960155487, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3213, | |
| "num_tokens": 264517859.0, | |
| "step": 1756 | |
| }, | |
| { | |
| "epoch": 2.1517747858017136, | |
| "grad_norm": 0.24453192949295044, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3316, | |
| "num_tokens": 265135649.0, | |
| "step": 1758 | |
| }, | |
| { | |
| "epoch": 2.1542227662178703, | |
| "grad_norm": 0.2528865337371826, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3193, | |
| "num_tokens": 265724746.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 2.1566707466340267, | |
| "grad_norm": 0.25134533643722534, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3275, | |
| "num_tokens": 266324867.0, | |
| "step": 1762 | |
| }, | |
| { | |
| "epoch": 2.1591187270501835, | |
| "grad_norm": 0.25282108783721924, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3229, | |
| "num_tokens": 266919065.0, | |
| "step": 1764 | |
| }, | |
| { | |
| "epoch": 2.1615667074663403, | |
| "grad_norm": 0.25280824303627014, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3249, | |
| "num_tokens": 267495602.0, | |
| "step": 1766 | |
| }, | |
| { | |
| "epoch": 2.164014687882497, | |
| "grad_norm": 0.2632092535495758, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3296, | |
| "num_tokens": 268081542.0, | |
| "step": 1768 | |
| }, | |
| { | |
| "epoch": 2.1664626682986534, | |
| "grad_norm": 0.258695513010025, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3271, | |
| "num_tokens": 268691744.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.16891064871481, | |
| "grad_norm": 0.26054874062538147, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3359, | |
| "num_tokens": 269267045.0, | |
| "step": 1772 | |
| }, | |
| { | |
| "epoch": 2.171358629130967, | |
| "grad_norm": 0.2614051401615143, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3218, | |
| "num_tokens": 269852138.0, | |
| "step": 1774 | |
| }, | |
| { | |
| "epoch": 2.173806609547124, | |
| "grad_norm": 0.25378215312957764, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3138, | |
| "num_tokens": 270447909.0, | |
| "step": 1776 | |
| }, | |
| { | |
| "epoch": 2.17625458996328, | |
| "grad_norm": 0.25152331590652466, | |
| "learning_rate": 1e-05, | |
| "loss": 0.326, | |
| "num_tokens": 271039752.0, | |
| "step": 1778 | |
| }, | |
| { | |
| "epoch": 2.178702570379437, | |
| "grad_norm": 0.2568153440952301, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3192, | |
| "num_tokens": 271624236.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 2.1811505507955937, | |
| "grad_norm": 0.24978622794151306, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3301, | |
| "num_tokens": 272229137.0, | |
| "step": 1782 | |
| }, | |
| { | |
| "epoch": 2.18359853121175, | |
| "grad_norm": 0.25673535466194153, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3275, | |
| "num_tokens": 272804913.0, | |
| "step": 1784 | |
| }, | |
| { | |
| "epoch": 2.186046511627907, | |
| "grad_norm": 0.26895347237586975, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3206, | |
| "num_tokens": 273380069.0, | |
| "step": 1786 | |
| }, | |
| { | |
| "epoch": 2.1884944920440637, | |
| "grad_norm": 0.2613040804862976, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3187, | |
| "num_tokens": 273956503.0, | |
| "step": 1788 | |
| }, | |
| { | |
| "epoch": 2.1909424724602204, | |
| "grad_norm": 0.2516239285469055, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3172, | |
| "num_tokens": 274541930.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 2.193390452876377, | |
| "grad_norm": 0.24175681173801422, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3082, | |
| "num_tokens": 275133982.0, | |
| "step": 1792 | |
| }, | |
| { | |
| "epoch": 2.1958384332925336, | |
| "grad_norm": 0.2464313507080078, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3362, | |
| "num_tokens": 275738202.0, | |
| "step": 1794 | |
| }, | |
| { | |
| "epoch": 2.1982864137086904, | |
| "grad_norm": 0.2737571597099304, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3106, | |
| "num_tokens": 276299705.0, | |
| "step": 1796 | |
| }, | |
| { | |
| "epoch": 2.200734394124847, | |
| "grad_norm": 0.26766151189804077, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3172, | |
| "num_tokens": 276904098.0, | |
| "step": 1798 | |
| }, | |
| { | |
| "epoch": 2.2031823745410035, | |
| "grad_norm": 0.25508153438568115, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3174, | |
| "num_tokens": 277491075.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.2056303549571603, | |
| "grad_norm": 0.2565786838531494, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3186, | |
| "num_tokens": 278068282.0, | |
| "step": 1802 | |
| }, | |
| { | |
| "epoch": 2.208078335373317, | |
| "grad_norm": 0.25016945600509644, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3138, | |
| "num_tokens": 278674326.0, | |
| "step": 1804 | |
| }, | |
| { | |
| "epoch": 2.2105263157894735, | |
| "grad_norm": 0.26891186833381653, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3207, | |
| "num_tokens": 279249277.0, | |
| "step": 1806 | |
| }, | |
| { | |
| "epoch": 2.2129742962056302, | |
| "grad_norm": 0.2475995570421219, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3134, | |
| "num_tokens": 279825692.0, | |
| "step": 1808 | |
| }, | |
| { | |
| "epoch": 2.215422276621787, | |
| "grad_norm": 0.2526877820491791, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3239, | |
| "num_tokens": 280411199.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 2.217870257037944, | |
| "grad_norm": 0.29060930013656616, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3202, | |
| "num_tokens": 281024327.0, | |
| "step": 1812 | |
| }, | |
| { | |
| "epoch": 2.2203182374541, | |
| "grad_norm": 0.2517792582511902, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3201, | |
| "num_tokens": 281603472.0, | |
| "step": 1814 | |
| }, | |
| { | |
| "epoch": 2.222766217870257, | |
| "grad_norm": 0.2607707381248474, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3131, | |
| "num_tokens": 282183690.0, | |
| "step": 1816 | |
| }, | |
| { | |
| "epoch": 2.2252141982864138, | |
| "grad_norm": 0.2490987777709961, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3304, | |
| "num_tokens": 282764121.0, | |
| "step": 1818 | |
| }, | |
| { | |
| "epoch": 2.2276621787025706, | |
| "grad_norm": 0.25851231813430786, | |
| "learning_rate": 1e-05, | |
| "loss": 0.326, | |
| "num_tokens": 283347033.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 2.230110159118727, | |
| "grad_norm": 0.2487228363752365, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3178, | |
| "num_tokens": 283931247.0, | |
| "step": 1822 | |
| }, | |
| { | |
| "epoch": 2.2325581395348837, | |
| "grad_norm": 0.25673067569732666, | |
| "learning_rate": 1e-05, | |
| "loss": 0.315, | |
| "num_tokens": 284503256.0, | |
| "step": 1824 | |
| }, | |
| { | |
| "epoch": 2.2350061199510405, | |
| "grad_norm": 0.2713610827922821, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3281, | |
| "num_tokens": 285105100.0, | |
| "step": 1826 | |
| }, | |
| { | |
| "epoch": 2.237454100367197, | |
| "grad_norm": 0.2791447639465332, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3349, | |
| "num_tokens": 285674535.0, | |
| "step": 1828 | |
| }, | |
| { | |
| "epoch": 2.2399020807833536, | |
| "grad_norm": 0.26290756464004517, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3185, | |
| "num_tokens": 286267445.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.2423500611995104, | |
| "grad_norm": 0.2563938796520233, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3259, | |
| "num_tokens": 286847377.0, | |
| "step": 1832 | |
| }, | |
| { | |
| "epoch": 2.244798041615667, | |
| "grad_norm": 0.2585829496383667, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3271, | |
| "num_tokens": 287405979.0, | |
| "step": 1834 | |
| }, | |
| { | |
| "epoch": 2.2472460220318236, | |
| "grad_norm": 0.33251309394836426, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3305, | |
| "num_tokens": 288011470.0, | |
| "step": 1836 | |
| }, | |
| { | |
| "epoch": 2.2496940024479803, | |
| "grad_norm": 0.2520661950111389, | |
| "learning_rate": 1e-05, | |
| "loss": 0.316, | |
| "num_tokens": 288599550.0, | |
| "step": 1838 | |
| }, | |
| { | |
| "epoch": 2.252141982864137, | |
| "grad_norm": 0.27090510725975037, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3261, | |
| "num_tokens": 289194997.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 2.254589963280294, | |
| "grad_norm": 0.24881038069725037, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3136, | |
| "num_tokens": 289794688.0, | |
| "step": 1842 | |
| }, | |
| { | |
| "epoch": 2.2570379436964503, | |
| "grad_norm": 0.25386103987693787, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3289, | |
| "num_tokens": 290400833.0, | |
| "step": 1844 | |
| }, | |
| { | |
| "epoch": 2.259485924112607, | |
| "grad_norm": 0.24282298982143402, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3304, | |
| "num_tokens": 291039254.0, | |
| "step": 1846 | |
| }, | |
| { | |
| "epoch": 2.261933904528764, | |
| "grad_norm": 0.2525377869606018, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3128, | |
| "num_tokens": 291621991.0, | |
| "step": 1848 | |
| }, | |
| { | |
| "epoch": 2.26438188494492, | |
| "grad_norm": 0.2528608441352844, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3258, | |
| "num_tokens": 292214020.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 2.266829865361077, | |
| "grad_norm": 0.2423013150691986, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3294, | |
| "num_tokens": 292851314.0, | |
| "step": 1852 | |
| }, | |
| { | |
| "epoch": 2.269277845777234, | |
| "grad_norm": 0.24104289710521698, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3135, | |
| "num_tokens": 293454455.0, | |
| "step": 1854 | |
| }, | |
| { | |
| "epoch": 2.2717258261933906, | |
| "grad_norm": 0.24933366477489471, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3201, | |
| "num_tokens": 294037926.0, | |
| "step": 1856 | |
| }, | |
| { | |
| "epoch": 2.274173806609547, | |
| "grad_norm": 0.25847917795181274, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3157, | |
| "num_tokens": 294636937.0, | |
| "step": 1858 | |
| }, | |
| { | |
| "epoch": 2.2766217870257037, | |
| "grad_norm": 0.2618774473667145, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3258, | |
| "num_tokens": 295229198.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.2790697674418605, | |
| "grad_norm": 0.2543027400970459, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3182, | |
| "num_tokens": 295829418.0, | |
| "step": 1862 | |
| }, | |
| { | |
| "epoch": 2.2815177478580173, | |
| "grad_norm": 0.2600286304950714, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3277, | |
| "num_tokens": 296419808.0, | |
| "step": 1864 | |
| }, | |
| { | |
| "epoch": 2.2839657282741737, | |
| "grad_norm": 0.2631028890609741, | |
| "learning_rate": 1e-05, | |
| "loss": 0.331, | |
| "num_tokens": 297031243.0, | |
| "step": 1866 | |
| }, | |
| { | |
| "epoch": 2.2864137086903304, | |
| "grad_norm": 0.2603526711463928, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3295, | |
| "num_tokens": 297626941.0, | |
| "step": 1868 | |
| }, | |
| { | |
| "epoch": 2.2888616891064872, | |
| "grad_norm": 0.25479280948638916, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3217, | |
| "num_tokens": 298178534.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 2.2913096695226436, | |
| "grad_norm": 0.25323933362960815, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3248, | |
| "num_tokens": 298770355.0, | |
| "step": 1872 | |
| }, | |
| { | |
| "epoch": 2.2937576499388004, | |
| "grad_norm": 0.26648545265197754, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3197, | |
| "num_tokens": 299340175.0, | |
| "step": 1874 | |
| }, | |
| { | |
| "epoch": 2.296205630354957, | |
| "grad_norm": 0.2429247945547104, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3125, | |
| "num_tokens": 299919779.0, | |
| "step": 1876 | |
| }, | |
| { | |
| "epoch": 2.298653610771114, | |
| "grad_norm": 0.25432199239730835, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3125, | |
| "num_tokens": 300496223.0, | |
| "step": 1878 | |
| }, | |
| { | |
| "epoch": 2.3011015911872703, | |
| "grad_norm": 0.2576068341732025, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3266, | |
| "num_tokens": 301103197.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 2.303549571603427, | |
| "grad_norm": 0.25816693902015686, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3226, | |
| "num_tokens": 301681339.0, | |
| "step": 1882 | |
| }, | |
| { | |
| "epoch": 2.305997552019584, | |
| "grad_norm": 0.25362440943717957, | |
| "learning_rate": 1e-05, | |
| "loss": 0.328, | |
| "num_tokens": 302307086.0, | |
| "step": 1884 | |
| }, | |
| { | |
| "epoch": 2.3084455324357407, | |
| "grad_norm": 0.26144906878471375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.333, | |
| "num_tokens": 302937116.0, | |
| "step": 1886 | |
| }, | |
| { | |
| "epoch": 2.310893512851897, | |
| "grad_norm": 0.25918832421302795, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3161, | |
| "num_tokens": 303516988.0, | |
| "step": 1888 | |
| }, | |
| { | |
| "epoch": 2.313341493268054, | |
| "grad_norm": 0.2761925756931305, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3313, | |
| "num_tokens": 304132182.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.3157894736842106, | |
| "grad_norm": 0.2471666783094406, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3123, | |
| "num_tokens": 304705289.0, | |
| "step": 1892 | |
| }, | |
| { | |
| "epoch": 2.318237454100367, | |
| "grad_norm": 0.27031853795051575, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3368, | |
| "num_tokens": 305325892.0, | |
| "step": 1894 | |
| }, | |
| { | |
| "epoch": 2.3206854345165238, | |
| "grad_norm": 0.253797322511673, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3177, | |
| "num_tokens": 305921736.0, | |
| "step": 1896 | |
| }, | |
| { | |
| "epoch": 2.3231334149326806, | |
| "grad_norm": 0.25007933378219604, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3302, | |
| "num_tokens": 306526398.0, | |
| "step": 1898 | |
| }, | |
| { | |
| "epoch": 2.3255813953488373, | |
| "grad_norm": 0.2611125409603119, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3234, | |
| "num_tokens": 307121655.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.3280293757649937, | |
| "grad_norm": 0.2409556806087494, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3168, | |
| "num_tokens": 307691871.0, | |
| "step": 1902 | |
| }, | |
| { | |
| "epoch": 2.3304773561811505, | |
| "grad_norm": 0.26317453384399414, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3242, | |
| "num_tokens": 308295238.0, | |
| "step": 1904 | |
| }, | |
| { | |
| "epoch": 2.3329253365973073, | |
| "grad_norm": 0.2561883330345154, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3249, | |
| "num_tokens": 308901215.0, | |
| "step": 1906 | |
| }, | |
| { | |
| "epoch": 2.335373317013464, | |
| "grad_norm": 0.2611881196498871, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3201, | |
| "num_tokens": 309525875.0, | |
| "step": 1908 | |
| }, | |
| { | |
| "epoch": 2.3378212974296204, | |
| "grad_norm": 0.24518193304538727, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3311, | |
| "num_tokens": 310111880.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 2.340269277845777, | |
| "grad_norm": 0.2470950484275818, | |
| "learning_rate": 1e-05, | |
| "loss": 0.32, | |
| "num_tokens": 310703795.0, | |
| "step": 1912 | |
| }, | |
| { | |
| "epoch": 2.342717258261934, | |
| "grad_norm": 0.2571375072002411, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3278, | |
| "num_tokens": 311292013.0, | |
| "step": 1914 | |
| }, | |
| { | |
| "epoch": 2.3451652386780903, | |
| "grad_norm": 0.25326406955718994, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3274, | |
| "num_tokens": 311872266.0, | |
| "step": 1916 | |
| }, | |
| { | |
| "epoch": 2.347613219094247, | |
| "grad_norm": 0.26152241230010986, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3336, | |
| "num_tokens": 312443087.0, | |
| "step": 1918 | |
| }, | |
| { | |
| "epoch": 2.350061199510404, | |
| "grad_norm": 0.24850670993328094, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3195, | |
| "num_tokens": 313014118.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.3525091799265607, | |
| "grad_norm": 0.2560129761695862, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3157, | |
| "num_tokens": 313595591.0, | |
| "step": 1922 | |
| }, | |
| { | |
| "epoch": 2.354957160342717, | |
| "grad_norm": 0.25367817282676697, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3213, | |
| "num_tokens": 314199972.0, | |
| "step": 1924 | |
| }, | |
| { | |
| "epoch": 2.357405140758874, | |
| "grad_norm": 0.25712472200393677, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3211, | |
| "num_tokens": 314795639.0, | |
| "step": 1926 | |
| }, | |
| { | |
| "epoch": 2.3598531211750307, | |
| "grad_norm": 0.2516370713710785, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3215, | |
| "num_tokens": 315398116.0, | |
| "step": 1928 | |
| }, | |
| { | |
| "epoch": 2.3623011015911874, | |
| "grad_norm": 0.2571152448654175, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3212, | |
| "num_tokens": 315967279.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 2.364749082007344, | |
| "grad_norm": 0.25274693965911865, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3297, | |
| "num_tokens": 316551418.0, | |
| "step": 1932 | |
| }, | |
| { | |
| "epoch": 2.3671970624235006, | |
| "grad_norm": 0.24968615174293518, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3246, | |
| "num_tokens": 317161199.0, | |
| "step": 1934 | |
| }, | |
| { | |
| "epoch": 2.3696450428396574, | |
| "grad_norm": 0.25102177262306213, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3351, | |
| "num_tokens": 317772288.0, | |
| "step": 1936 | |
| }, | |
| { | |
| "epoch": 2.3720930232558137, | |
| "grad_norm": 0.256553590297699, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3128, | |
| "num_tokens": 318324063.0, | |
| "step": 1938 | |
| }, | |
| { | |
| "epoch": 2.3745410036719705, | |
| "grad_norm": 0.2520389258861542, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3305, | |
| "num_tokens": 318926620.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 2.3769889840881273, | |
| "grad_norm": 0.2571202516555786, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3196, | |
| "num_tokens": 319515709.0, | |
| "step": 1942 | |
| }, | |
| { | |
| "epoch": 2.379436964504284, | |
| "grad_norm": 0.2587038576602936, | |
| "learning_rate": 1e-05, | |
| "loss": 0.326, | |
| "num_tokens": 320113312.0, | |
| "step": 1944 | |
| }, | |
| { | |
| "epoch": 2.3818849449204405, | |
| "grad_norm": 0.24635723233222961, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3225, | |
| "num_tokens": 320710619.0, | |
| "step": 1946 | |
| }, | |
| { | |
| "epoch": 2.3843329253365972, | |
| "grad_norm": 0.25865438580513, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3267, | |
| "num_tokens": 321291492.0, | |
| "step": 1948 | |
| }, | |
| { | |
| "epoch": 2.386780905752754, | |
| "grad_norm": 0.25598016381263733, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3297, | |
| "num_tokens": 321877027.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.389228886168911, | |
| "grad_norm": 0.2425144612789154, | |
| "learning_rate": 1e-05, | |
| "loss": 0.32, | |
| "num_tokens": 322481168.0, | |
| "step": 1952 | |
| }, | |
| { | |
| "epoch": 2.391676866585067, | |
| "grad_norm": 0.2606021761894226, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3265, | |
| "num_tokens": 323075623.0, | |
| "step": 1954 | |
| }, | |
| { | |
| "epoch": 2.394124847001224, | |
| "grad_norm": 0.2530352473258972, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3353, | |
| "num_tokens": 323690401.0, | |
| "step": 1956 | |
| }, | |
| { | |
| "epoch": 2.3965728274173808, | |
| "grad_norm": 0.24680784344673157, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3104, | |
| "num_tokens": 324265813.0, | |
| "step": 1958 | |
| }, | |
| { | |
| "epoch": 2.399020807833537, | |
| "grad_norm": 0.24906018376350403, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3305, | |
| "num_tokens": 324872597.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 2.401468788249694, | |
| "grad_norm": 0.26364362239837646, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3241, | |
| "num_tokens": 325494495.0, | |
| "step": 1962 | |
| }, | |
| { | |
| "epoch": 2.4039167686658507, | |
| "grad_norm": 0.2485814392566681, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3218, | |
| "num_tokens": 326085292.0, | |
| "step": 1964 | |
| }, | |
| { | |
| "epoch": 2.4063647490820075, | |
| "grad_norm": 0.25182193517684937, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3178, | |
| "num_tokens": 326663046.0, | |
| "step": 1966 | |
| }, | |
| { | |
| "epoch": 2.408812729498164, | |
| "grad_norm": 0.2674141526222229, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3191, | |
| "num_tokens": 327237445.0, | |
| "step": 1968 | |
| }, | |
| { | |
| "epoch": 2.4112607099143206, | |
| "grad_norm": 0.24762989580631256, | |
| "learning_rate": 1e-05, | |
| "loss": 0.315, | |
| "num_tokens": 327830560.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 2.4137086903304774, | |
| "grad_norm": 0.2627556324005127, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3285, | |
| "num_tokens": 328430438.0, | |
| "step": 1972 | |
| }, | |
| { | |
| "epoch": 2.416156670746634, | |
| "grad_norm": 0.26080411672592163, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3231, | |
| "num_tokens": 329025822.0, | |
| "step": 1974 | |
| }, | |
| { | |
| "epoch": 2.4186046511627906, | |
| "grad_norm": 0.2651062309741974, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3196, | |
| "num_tokens": 329616416.0, | |
| "step": 1976 | |
| }, | |
| { | |
| "epoch": 2.4210526315789473, | |
| "grad_norm": 0.2645553946495056, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3278, | |
| "num_tokens": 330223848.0, | |
| "step": 1978 | |
| }, | |
| { | |
| "epoch": 2.423500611995104, | |
| "grad_norm": 0.24931900203227997, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3281, | |
| "num_tokens": 330823815.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.4259485924112605, | |
| "grad_norm": 0.26821616291999817, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3235, | |
| "num_tokens": 331414716.0, | |
| "step": 1982 | |
| }, | |
| { | |
| "epoch": 2.4283965728274173, | |
| "grad_norm": 0.2596832513809204, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3259, | |
| "num_tokens": 332022505.0, | |
| "step": 1984 | |
| }, | |
| { | |
| "epoch": 2.430844553243574, | |
| "grad_norm": 0.261565625667572, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3219, | |
| "num_tokens": 332643682.0, | |
| "step": 1986 | |
| }, | |
| { | |
| "epoch": 2.433292533659731, | |
| "grad_norm": 0.25323015451431274, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3187, | |
| "num_tokens": 333236054.0, | |
| "step": 1988 | |
| }, | |
| { | |
| "epoch": 2.435740514075887, | |
| "grad_norm": 0.27590665221214294, | |
| "learning_rate": 1e-05, | |
| "loss": 0.328, | |
| "num_tokens": 333780888.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 2.438188494492044, | |
| "grad_norm": 0.26514706015586853, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3244, | |
| "num_tokens": 334367607.0, | |
| "step": 1992 | |
| }, | |
| { | |
| "epoch": 2.440636474908201, | |
| "grad_norm": 0.2672777473926544, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3248, | |
| "num_tokens": 334949364.0, | |
| "step": 1994 | |
| }, | |
| { | |
| "epoch": 2.4430844553243576, | |
| "grad_norm": 0.27689221501350403, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3241, | |
| "num_tokens": 335514748.0, | |
| "step": 1996 | |
| }, | |
| { | |
| "epoch": 2.445532435740514, | |
| "grad_norm": 0.2605144679546356, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3135, | |
| "num_tokens": 336080476.0, | |
| "step": 1998 | |
| }, | |
| { | |
| "epoch": 2.4479804161566707, | |
| "grad_norm": 0.2649420499801636, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3279, | |
| "num_tokens": 336632578.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.4504283965728275, | |
| "grad_norm": 0.2612689137458801, | |
| "learning_rate": 1e-05, | |
| "loss": 0.347, | |
| "num_tokens": 337221659.0, | |
| "step": 2002 | |
| }, | |
| { | |
| "epoch": 2.452876376988984, | |
| "grad_norm": 0.25685855746269226, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3322, | |
| "num_tokens": 337798086.0, | |
| "step": 2004 | |
| }, | |
| { | |
| "epoch": 2.4553243574051407, | |
| "grad_norm": 0.2578672468662262, | |
| "learning_rate": 1e-05, | |
| "loss": 0.324, | |
| "num_tokens": 338400582.0, | |
| "step": 2006 | |
| }, | |
| { | |
| "epoch": 2.4577723378212974, | |
| "grad_norm": 0.2598439157009125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3371, | |
| "num_tokens": 338998073.0, | |
| "step": 2008 | |
| }, | |
| { | |
| "epoch": 2.4602203182374542, | |
| "grad_norm": 0.26257333159446716, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3224, | |
| "num_tokens": 339594572.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 2.4626682986536106, | |
| "grad_norm": 0.2520839273929596, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3264, | |
| "num_tokens": 340156779.0, | |
| "step": 2012 | |
| }, | |
| { | |
| "epoch": 2.4651162790697674, | |
| "grad_norm": 0.25438588857650757, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3309, | |
| "num_tokens": 340758075.0, | |
| "step": 2014 | |
| }, | |
| { | |
| "epoch": 2.467564259485924, | |
| "grad_norm": 0.2509094178676605, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3342, | |
| "num_tokens": 341384154.0, | |
| "step": 2016 | |
| }, | |
| { | |
| "epoch": 2.470012239902081, | |
| "grad_norm": 0.2800988554954529, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3302, | |
| "num_tokens": 341973383.0, | |
| "step": 2018 | |
| }, | |
| { | |
| "epoch": 2.4724602203182373, | |
| "grad_norm": 0.2561210095882416, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3182, | |
| "num_tokens": 342579481.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 2.474908200734394, | |
| "grad_norm": 0.24735815823078156, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3277, | |
| "num_tokens": 343171803.0, | |
| "step": 2022 | |
| }, | |
| { | |
| "epoch": 2.477356181150551, | |
| "grad_norm": 0.2473309189081192, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3225, | |
| "num_tokens": 343800177.0, | |
| "step": 2024 | |
| }, | |
| { | |
| "epoch": 2.4798041615667072, | |
| "grad_norm": 0.2584695816040039, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3215, | |
| "num_tokens": 344357371.0, | |
| "step": 2026 | |
| }, | |
| { | |
| "epoch": 2.482252141982864, | |
| "grad_norm": 0.34662726521492004, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3398, | |
| "num_tokens": 344941928.0, | |
| "step": 2028 | |
| }, | |
| { | |
| "epoch": 2.484700122399021, | |
| "grad_norm": 0.2552769184112549, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3238, | |
| "num_tokens": 345509899.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 2.4871481028151776, | |
| "grad_norm": 0.2522905170917511, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3317, | |
| "num_tokens": 346102331.0, | |
| "step": 2032 | |
| }, | |
| { | |
| "epoch": 2.489596083231334, | |
| "grad_norm": 0.2538517713546753, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3165, | |
| "num_tokens": 346684689.0, | |
| "step": 2034 | |
| }, | |
| { | |
| "epoch": 2.4920440636474908, | |
| "grad_norm": 0.2533273994922638, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3309, | |
| "num_tokens": 347297893.0, | |
| "step": 2036 | |
| }, | |
| { | |
| "epoch": 2.4944920440636476, | |
| "grad_norm": 0.24895316362380981, | |
| "learning_rate": 1e-05, | |
| "loss": 0.316, | |
| "num_tokens": 347897795.0, | |
| "step": 2038 | |
| }, | |
| { | |
| "epoch": 2.4969400244798043, | |
| "grad_norm": 0.2465788722038269, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3255, | |
| "num_tokens": 348514676.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 2.4993880048959607, | |
| "grad_norm": 0.2624768018722534, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3258, | |
| "num_tokens": 349096443.0, | |
| "step": 2042 | |
| }, | |
| { | |
| "epoch": 2.5018359853121175, | |
| "grad_norm": 0.25287625193595886, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3223, | |
| "num_tokens": 349706030.0, | |
| "step": 2044 | |
| }, | |
| { | |
| "epoch": 2.5042839657282743, | |
| "grad_norm": 0.26094719767570496, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3238, | |
| "num_tokens": 350291310.0, | |
| "step": 2046 | |
| }, | |
| { | |
| "epoch": 2.5067319461444306, | |
| "grad_norm": 0.24862395226955414, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3142, | |
| "num_tokens": 350897628.0, | |
| "step": 2048 | |
| }, | |
| { | |
| "epoch": 2.5091799265605874, | |
| "grad_norm": 0.2516186237335205, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3158, | |
| "num_tokens": 351488228.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 2.511627906976744, | |
| "grad_norm": 0.24392171204090118, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3309, | |
| "num_tokens": 352088867.0, | |
| "step": 2052 | |
| }, | |
| { | |
| "epoch": 2.514075887392901, | |
| "grad_norm": 0.2512110769748688, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3147, | |
| "num_tokens": 352685180.0, | |
| "step": 2054 | |
| }, | |
| { | |
| "epoch": 2.516523867809058, | |
| "grad_norm": 0.25949329137802124, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3234, | |
| "num_tokens": 353277486.0, | |
| "step": 2056 | |
| }, | |
| { | |
| "epoch": 2.518971848225214, | |
| "grad_norm": 0.2465786188840866, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3168, | |
| "num_tokens": 353860979.0, | |
| "step": 2058 | |
| }, | |
| { | |
| "epoch": 2.521419828641371, | |
| "grad_norm": 0.2434110790491104, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3305, | |
| "num_tokens": 354468059.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 2.5238678090575277, | |
| "grad_norm": 0.253316730260849, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3222, | |
| "num_tokens": 355075770.0, | |
| "step": 2062 | |
| }, | |
| { | |
| "epoch": 2.526315789473684, | |
| "grad_norm": 0.2552608549594879, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3203, | |
| "num_tokens": 355626769.0, | |
| "step": 2064 | |
| }, | |
| { | |
| "epoch": 2.528763769889841, | |
| "grad_norm": 0.25336354970932007, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3112, | |
| "num_tokens": 356154850.0, | |
| "step": 2066 | |
| }, | |
| { | |
| "epoch": 2.5312117503059977, | |
| "grad_norm": 0.26685798168182373, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3359, | |
| "num_tokens": 356765495.0, | |
| "step": 2068 | |
| }, | |
| { | |
| "epoch": 2.533659730722154, | |
| "grad_norm": 0.25629910826683044, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3314, | |
| "num_tokens": 357343529.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 2.536107711138311, | |
| "grad_norm": 0.2573475241661072, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3336, | |
| "num_tokens": 357938110.0, | |
| "step": 2072 | |
| }, | |
| { | |
| "epoch": 2.5385556915544676, | |
| "grad_norm": 0.24889910221099854, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3134, | |
| "num_tokens": 358505809.0, | |
| "step": 2074 | |
| }, | |
| { | |
| "epoch": 2.5410036719706244, | |
| "grad_norm": 0.24748258292675018, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3355, | |
| "num_tokens": 359093575.0, | |
| "step": 2076 | |
| }, | |
| { | |
| "epoch": 2.543451652386781, | |
| "grad_norm": 0.25954684615135193, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3362, | |
| "num_tokens": 359684808.0, | |
| "step": 2078 | |
| }, | |
| { | |
| "epoch": 2.5458996328029375, | |
| "grad_norm": 0.25065338611602783, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3206, | |
| "num_tokens": 360271198.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 2.5483476132190943, | |
| "grad_norm": 0.2514982521533966, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3371, | |
| "num_tokens": 360883060.0, | |
| "step": 2082 | |
| }, | |
| { | |
| "epoch": 2.550795593635251, | |
| "grad_norm": 0.25208184123039246, | |
| "learning_rate": 1e-05, | |
| "loss": 0.332, | |
| "num_tokens": 361474796.0, | |
| "step": 2084 | |
| }, | |
| { | |
| "epoch": 2.5532435740514074, | |
| "grad_norm": 0.2545960545539856, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3192, | |
| "num_tokens": 362049485.0, | |
| "step": 2086 | |
| }, | |
| { | |
| "epoch": 2.5556915544675642, | |
| "grad_norm": 0.2432592362165451, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3094, | |
| "num_tokens": 362633769.0, | |
| "step": 2088 | |
| }, | |
| { | |
| "epoch": 2.558139534883721, | |
| "grad_norm": 0.257764607667923, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3247, | |
| "num_tokens": 363244435.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 2.5605875152998774, | |
| "grad_norm": 0.26873084902763367, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3268, | |
| "num_tokens": 363814594.0, | |
| "step": 2092 | |
| }, | |
| { | |
| "epoch": 2.563035495716034, | |
| "grad_norm": 0.25509268045425415, | |
| "learning_rate": 1e-05, | |
| "loss": 0.315, | |
| "num_tokens": 364412136.0, | |
| "step": 2094 | |
| }, | |
| { | |
| "epoch": 2.565483476132191, | |
| "grad_norm": 0.26778125762939453, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3307, | |
| "num_tokens": 364999748.0, | |
| "step": 2096 | |
| }, | |
| { | |
| "epoch": 2.5679314565483478, | |
| "grad_norm": 0.26534485816955566, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3224, | |
| "num_tokens": 365565328.0, | |
| "step": 2098 | |
| }, | |
| { | |
| "epoch": 2.5703794369645045, | |
| "grad_norm": 0.2511390745639801, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3352, | |
| "num_tokens": 366166059.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.572827417380661, | |
| "grad_norm": 0.2617775499820709, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3157, | |
| "num_tokens": 366767336.0, | |
| "step": 2102 | |
| }, | |
| { | |
| "epoch": 2.5752753977968177, | |
| "grad_norm": 0.27579638361930847, | |
| "learning_rate": 1e-05, | |
| "loss": 0.325, | |
| "num_tokens": 367336066.0, | |
| "step": 2104 | |
| }, | |
| { | |
| "epoch": 2.5777233782129745, | |
| "grad_norm": 0.2548695206642151, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3168, | |
| "num_tokens": 367930420.0, | |
| "step": 2106 | |
| }, | |
| { | |
| "epoch": 2.580171358629131, | |
| "grad_norm": 0.2536294162273407, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3266, | |
| "num_tokens": 368516286.0, | |
| "step": 2108 | |
| }, | |
| { | |
| "epoch": 2.5826193390452876, | |
| "grad_norm": 0.24859356880187988, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3145, | |
| "num_tokens": 369101362.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 2.5850673194614444, | |
| "grad_norm": 0.26407885551452637, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3367, | |
| "num_tokens": 369705566.0, | |
| "step": 2112 | |
| }, | |
| { | |
| "epoch": 2.5875152998776008, | |
| "grad_norm": 0.255536824464798, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3219, | |
| "num_tokens": 370320305.0, | |
| "step": 2114 | |
| }, | |
| { | |
| "epoch": 2.5899632802937576, | |
| "grad_norm": 0.2630644142627716, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3275, | |
| "num_tokens": 370910515.0, | |
| "step": 2116 | |
| }, | |
| { | |
| "epoch": 2.5924112607099143, | |
| "grad_norm": 0.2648698091506958, | |
| "learning_rate": 1e-05, | |
| "loss": 0.326, | |
| "num_tokens": 371503862.0, | |
| "step": 2118 | |
| }, | |
| { | |
| "epoch": 2.594859241126071, | |
| "grad_norm": 0.24663522839546204, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3182, | |
| "num_tokens": 372108685.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 2.597307221542228, | |
| "grad_norm": 0.25443825125694275, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3295, | |
| "num_tokens": 372708606.0, | |
| "step": 2122 | |
| }, | |
| { | |
| "epoch": 2.5997552019583843, | |
| "grad_norm": 0.24800601601600647, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3243, | |
| "num_tokens": 373316363.0, | |
| "step": 2124 | |
| }, | |
| { | |
| "epoch": 2.602203182374541, | |
| "grad_norm": 0.2647169530391693, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3248, | |
| "num_tokens": 373895091.0, | |
| "step": 2126 | |
| }, | |
| { | |
| "epoch": 2.604651162790698, | |
| "grad_norm": 0.24779950082302094, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3258, | |
| "num_tokens": 374468123.0, | |
| "step": 2128 | |
| }, | |
| { | |
| "epoch": 2.607099143206854, | |
| "grad_norm": 0.2565578818321228, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3317, | |
| "num_tokens": 375090517.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 2.609547123623011, | |
| "grad_norm": 0.2583713233470917, | |
| "learning_rate": 1e-05, | |
| "loss": 0.317, | |
| "num_tokens": 375682699.0, | |
| "step": 2132 | |
| }, | |
| { | |
| "epoch": 2.611995104039168, | |
| "grad_norm": 0.2560901641845703, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3251, | |
| "num_tokens": 376267492.0, | |
| "step": 2134 | |
| }, | |
| { | |
| "epoch": 2.614443084455324, | |
| "grad_norm": 0.2595535218715668, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3218, | |
| "num_tokens": 376865486.0, | |
| "step": 2136 | |
| }, | |
| { | |
| "epoch": 2.616891064871481, | |
| "grad_norm": 0.251692533493042, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3181, | |
| "num_tokens": 377428181.0, | |
| "step": 2138 | |
| }, | |
| { | |
| "epoch": 2.6193390452876377, | |
| "grad_norm": 0.25037339329719543, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3297, | |
| "num_tokens": 378030958.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 2.6217870257037945, | |
| "grad_norm": 0.25543636083602905, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3282, | |
| "num_tokens": 378624733.0, | |
| "step": 2142 | |
| }, | |
| { | |
| "epoch": 2.6242350061199513, | |
| "grad_norm": 0.24831168353557587, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3212, | |
| "num_tokens": 379202817.0, | |
| "step": 2144 | |
| }, | |
| { | |
| "epoch": 2.6266829865361077, | |
| "grad_norm": 0.2579514682292938, | |
| "learning_rate": 1e-05, | |
| "loss": 0.329, | |
| "num_tokens": 379806787.0, | |
| "step": 2146 | |
| }, | |
| { | |
| "epoch": 2.6291309669522644, | |
| "grad_norm": 0.24643900990486145, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3185, | |
| "num_tokens": 380411339.0, | |
| "step": 2148 | |
| }, | |
| { | |
| "epoch": 2.6315789473684212, | |
| "grad_norm": 0.25557950139045715, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3276, | |
| "num_tokens": 381016714.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 2.6340269277845776, | |
| "grad_norm": 0.244479238986969, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3175, | |
| "num_tokens": 381601301.0, | |
| "step": 2152 | |
| }, | |
| { | |
| "epoch": 2.6364749082007344, | |
| "grad_norm": 0.2623596787452698, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3291, | |
| "num_tokens": 382217241.0, | |
| "step": 2154 | |
| }, | |
| { | |
| "epoch": 2.638922888616891, | |
| "grad_norm": 0.24378331005573273, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3215, | |
| "num_tokens": 382853897.0, | |
| "step": 2156 | |
| }, | |
| { | |
| "epoch": 2.6413708690330475, | |
| "grad_norm": 0.24978433549404144, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3172, | |
| "num_tokens": 383439968.0, | |
| "step": 2158 | |
| }, | |
| { | |
| "epoch": 2.6438188494492043, | |
| "grad_norm": 0.2559012174606323, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3305, | |
| "num_tokens": 384024107.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 2.646266829865361, | |
| "grad_norm": 0.24494871497154236, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3192, | |
| "num_tokens": 384609051.0, | |
| "step": 2162 | |
| }, | |
| { | |
| "epoch": 2.648714810281518, | |
| "grad_norm": 0.26301223039627075, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3171, | |
| "num_tokens": 385178869.0, | |
| "step": 2164 | |
| }, | |
| { | |
| "epoch": 2.6511627906976747, | |
| "grad_norm": 0.2578657865524292, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3348, | |
| "num_tokens": 385754760.0, | |
| "step": 2166 | |
| }, | |
| { | |
| "epoch": 2.653610771113831, | |
| "grad_norm": 0.26708459854125977, | |
| "learning_rate": 1e-05, | |
| "loss": 0.323, | |
| "num_tokens": 386342145.0, | |
| "step": 2168 | |
| }, | |
| { | |
| "epoch": 2.656058751529988, | |
| "grad_norm": 0.25751882791519165, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3276, | |
| "num_tokens": 386934129.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 2.6585067319461446, | |
| "grad_norm": 0.2512890696525574, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3291, | |
| "num_tokens": 387537081.0, | |
| "step": 2172 | |
| }, | |
| { | |
| "epoch": 2.660954712362301, | |
| "grad_norm": 0.2504841089248657, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3254, | |
| "num_tokens": 388133636.0, | |
| "step": 2174 | |
| }, | |
| { | |
| "epoch": 2.6634026927784578, | |
| "grad_norm": 0.2503618597984314, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3393, | |
| "num_tokens": 388744350.0, | |
| "step": 2176 | |
| }, | |
| { | |
| "epoch": 2.6658506731946146, | |
| "grad_norm": 0.25509852170944214, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3191, | |
| "num_tokens": 389326694.0, | |
| "step": 2178 | |
| }, | |
| { | |
| "epoch": 2.668298653610771, | |
| "grad_norm": 0.26111486554145813, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3325, | |
| "num_tokens": 389912003.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 2.6707466340269277, | |
| "grad_norm": 0.2661671042442322, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3265, | |
| "num_tokens": 390471156.0, | |
| "step": 2182 | |
| }, | |
| { | |
| "epoch": 2.6731946144430845, | |
| "grad_norm": 0.24591811001300812, | |
| "learning_rate": 1e-05, | |
| "loss": 0.317, | |
| "num_tokens": 391051907.0, | |
| "step": 2184 | |
| }, | |
| { | |
| "epoch": 2.6756425948592413, | |
| "grad_norm": 0.25238117575645447, | |
| "learning_rate": 1e-05, | |
| "loss": 0.322, | |
| "num_tokens": 391640182.0, | |
| "step": 2186 | |
| }, | |
| { | |
| "epoch": 2.678090575275398, | |
| "grad_norm": 0.2388547956943512, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3173, | |
| "num_tokens": 392251983.0, | |
| "step": 2188 | |
| }, | |
| { | |
| "epoch": 2.6805385556915544, | |
| "grad_norm": 0.25939592719078064, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3251, | |
| "num_tokens": 392813772.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 2.682986536107711, | |
| "grad_norm": 0.2584627866744995, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3182, | |
| "num_tokens": 393419195.0, | |
| "step": 2192 | |
| }, | |
| { | |
| "epoch": 2.685434516523868, | |
| "grad_norm": 0.25700339674949646, | |
| "learning_rate": 1e-05, | |
| "loss": 0.328, | |
| "num_tokens": 394004044.0, | |
| "step": 2194 | |
| }, | |
| { | |
| "epoch": 2.6878824969400243, | |
| "grad_norm": 0.26522722840309143, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3282, | |
| "num_tokens": 394595054.0, | |
| "step": 2196 | |
| }, | |
| { | |
| "epoch": 2.690330477356181, | |
| "grad_norm": 0.25307518243789673, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3283, | |
| "num_tokens": 395163217.0, | |
| "step": 2198 | |
| }, | |
| { | |
| "epoch": 2.692778457772338, | |
| "grad_norm": 0.25655147433280945, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3308, | |
| "num_tokens": 395758903.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.6952264381884943, | |
| "grad_norm": 0.235391765832901, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3, | |
| "num_tokens": 396332634.0, | |
| "step": 2202 | |
| }, | |
| { | |
| "epoch": 2.697674418604651, | |
| "grad_norm": 0.25023701786994934, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3264, | |
| "num_tokens": 396928536.0, | |
| "step": 2204 | |
| }, | |
| { | |
| "epoch": 2.700122399020808, | |
| "grad_norm": 0.24907897412776947, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3187, | |
| "num_tokens": 397500400.0, | |
| "step": 2206 | |
| }, | |
| { | |
| "epoch": 2.7025703794369647, | |
| "grad_norm": 0.2496294379234314, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3259, | |
| "num_tokens": 398092585.0, | |
| "step": 2208 | |
| }, | |
| { | |
| "epoch": 2.7050183598531214, | |
| "grad_norm": 0.2585897147655487, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3166, | |
| "num_tokens": 398668492.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 2.707466340269278, | |
| "grad_norm": 0.25619223713874817, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3338, | |
| "num_tokens": 399251194.0, | |
| "step": 2212 | |
| }, | |
| { | |
| "epoch": 2.7099143206854346, | |
| "grad_norm": 0.31031566858291626, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3329, | |
| "num_tokens": 399837720.0, | |
| "step": 2214 | |
| }, | |
| { | |
| "epoch": 2.7123623011015914, | |
| "grad_norm": 0.24954873323440552, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3256, | |
| "num_tokens": 400442822.0, | |
| "step": 2216 | |
| }, | |
| { | |
| "epoch": 2.7148102815177477, | |
| "grad_norm": 0.2491050362586975, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3194, | |
| "num_tokens": 401012536.0, | |
| "step": 2218 | |
| }, | |
| { | |
| "epoch": 2.7172582619339045, | |
| "grad_norm": 0.2543574571609497, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3289, | |
| "num_tokens": 401597084.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 2.7197062423500613, | |
| "grad_norm": 0.2470719963312149, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3113, | |
| "num_tokens": 402180043.0, | |
| "step": 2222 | |
| }, | |
| { | |
| "epoch": 2.7221542227662177, | |
| "grad_norm": 0.2668313682079315, | |
| "learning_rate": 1e-05, | |
| "loss": 0.324, | |
| "num_tokens": 402779003.0, | |
| "step": 2224 | |
| }, | |
| { | |
| "epoch": 2.7246022031823744, | |
| "grad_norm": 0.24889680743217468, | |
| "learning_rate": 1e-05, | |
| "loss": 0.326, | |
| "num_tokens": 403408199.0, | |
| "step": 2226 | |
| }, | |
| { | |
| "epoch": 2.7270501835985312, | |
| "grad_norm": 0.25317952036857605, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3232, | |
| "num_tokens": 403992124.0, | |
| "step": 2228 | |
| }, | |
| { | |
| "epoch": 2.729498164014688, | |
| "grad_norm": 0.2511397898197174, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3281, | |
| "num_tokens": 404560380.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 2.731946144430845, | |
| "grad_norm": 0.2410099059343338, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3263, | |
| "num_tokens": 405160188.0, | |
| "step": 2232 | |
| }, | |
| { | |
| "epoch": 2.734394124847001, | |
| "grad_norm": 0.25235220789909363, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3255, | |
| "num_tokens": 405748237.0, | |
| "step": 2234 | |
| }, | |
| { | |
| "epoch": 2.736842105263158, | |
| "grad_norm": 0.2504369616508484, | |
| "learning_rate": 1e-05, | |
| "loss": 0.337, | |
| "num_tokens": 406342136.0, | |
| "step": 2236 | |
| }, | |
| { | |
| "epoch": 2.7392900856793148, | |
| "grad_norm": 0.25537410378456116, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3247, | |
| "num_tokens": 406914106.0, | |
| "step": 2238 | |
| }, | |
| { | |
| "epoch": 2.741738066095471, | |
| "grad_norm": 0.25914260745048523, | |
| "learning_rate": 1e-05, | |
| "loss": 0.33, | |
| "num_tokens": 407516938.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 2.744186046511628, | |
| "grad_norm": 0.25355833768844604, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3215, | |
| "num_tokens": 408085444.0, | |
| "step": 2242 | |
| }, | |
| { | |
| "epoch": 2.7466340269277847, | |
| "grad_norm": 0.24525900185108185, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3195, | |
| "num_tokens": 408660772.0, | |
| "step": 2244 | |
| }, | |
| { | |
| "epoch": 2.749082007343941, | |
| "grad_norm": 0.25692498683929443, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3283, | |
| "num_tokens": 409235521.0, | |
| "step": 2246 | |
| }, | |
| { | |
| "epoch": 2.751529987760098, | |
| "grad_norm": 0.2557971477508545, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3242, | |
| "num_tokens": 409774999.0, | |
| "step": 2248 | |
| }, | |
| { | |
| "epoch": 2.7539779681762546, | |
| "grad_norm": 0.25260117650032043, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3156, | |
| "num_tokens": 410367564.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.7564259485924114, | |
| "grad_norm": 0.24852988123893738, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3209, | |
| "num_tokens": 410969022.0, | |
| "step": 2252 | |
| }, | |
| { | |
| "epoch": 2.758873929008568, | |
| "grad_norm": 0.2457083761692047, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3289, | |
| "num_tokens": 411548719.0, | |
| "step": 2254 | |
| }, | |
| { | |
| "epoch": 2.7613219094247246, | |
| "grad_norm": 0.25295954942703247, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3246, | |
| "num_tokens": 412130956.0, | |
| "step": 2256 | |
| }, | |
| { | |
| "epoch": 2.7637698898408813, | |
| "grad_norm": 0.2472776174545288, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3349, | |
| "num_tokens": 412736440.0, | |
| "step": 2258 | |
| }, | |
| { | |
| "epoch": 2.766217870257038, | |
| "grad_norm": 0.2581010162830353, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3176, | |
| "num_tokens": 413318066.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 2.7686658506731945, | |
| "grad_norm": 0.2525770962238312, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3177, | |
| "num_tokens": 413917917.0, | |
| "step": 2262 | |
| }, | |
| { | |
| "epoch": 2.7711138310893513, | |
| "grad_norm": 0.2548413872718811, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3175, | |
| "num_tokens": 414520002.0, | |
| "step": 2264 | |
| }, | |
| { | |
| "epoch": 2.773561811505508, | |
| "grad_norm": 0.25579121708869934, | |
| "learning_rate": 1e-05, | |
| "loss": 0.336, | |
| "num_tokens": 415107892.0, | |
| "step": 2266 | |
| }, | |
| { | |
| "epoch": 2.7760097919216644, | |
| "grad_norm": 0.25981447100639343, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3275, | |
| "num_tokens": 415667389.0, | |
| "step": 2268 | |
| }, | |
| { | |
| "epoch": 2.778457772337821, | |
| "grad_norm": 0.254893958568573, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3406, | |
| "num_tokens": 416255130.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 2.780905752753978, | |
| "grad_norm": 0.24784359335899353, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3257, | |
| "num_tokens": 416839340.0, | |
| "step": 2272 | |
| }, | |
| { | |
| "epoch": 2.783353733170135, | |
| "grad_norm": 0.2368021309375763, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3222, | |
| "num_tokens": 417443118.0, | |
| "step": 2274 | |
| }, | |
| { | |
| "epoch": 2.7858017135862916, | |
| "grad_norm": 0.24467357993125916, | |
| "learning_rate": 1e-05, | |
| "loss": 0.333, | |
| "num_tokens": 418042727.0, | |
| "step": 2276 | |
| }, | |
| { | |
| "epoch": 2.788249694002448, | |
| "grad_norm": 0.257119357585907, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3249, | |
| "num_tokens": 418649730.0, | |
| "step": 2278 | |
| }, | |
| { | |
| "epoch": 2.7906976744186047, | |
| "grad_norm": 0.24344298243522644, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3217, | |
| "num_tokens": 419233737.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.7931456548347615, | |
| "grad_norm": 0.25380992889404297, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3306, | |
| "num_tokens": 419811330.0, | |
| "step": 2282 | |
| }, | |
| { | |
| "epoch": 2.795593635250918, | |
| "grad_norm": 0.25424936413764954, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3332, | |
| "num_tokens": 420410428.0, | |
| "step": 2284 | |
| }, | |
| { | |
| "epoch": 2.7980416156670747, | |
| "grad_norm": 0.26502135396003723, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3351, | |
| "num_tokens": 420991985.0, | |
| "step": 2286 | |
| }, | |
| { | |
| "epoch": 2.8004895960832314, | |
| "grad_norm": 0.24788039922714233, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3253, | |
| "num_tokens": 421601039.0, | |
| "step": 2288 | |
| }, | |
| { | |
| "epoch": 2.802937576499388, | |
| "grad_norm": 0.2538328468799591, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3322, | |
| "num_tokens": 422188161.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 2.8053855569155446, | |
| "grad_norm": 0.23793523013591766, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3219, | |
| "num_tokens": 422803635.0, | |
| "step": 2292 | |
| }, | |
| { | |
| "epoch": 2.8078335373317014, | |
| "grad_norm": 0.24384064972400665, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3285, | |
| "num_tokens": 423413615.0, | |
| "step": 2294 | |
| }, | |
| { | |
| "epoch": 2.810281517747858, | |
| "grad_norm": 0.24296848475933075, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3225, | |
| "num_tokens": 424000637.0, | |
| "step": 2296 | |
| }, | |
| { | |
| "epoch": 2.812729498164015, | |
| "grad_norm": 0.2595270276069641, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3231, | |
| "num_tokens": 424553573.0, | |
| "step": 2298 | |
| }, | |
| { | |
| "epoch": 2.8151774785801713, | |
| "grad_norm": 0.25373417139053345, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3259, | |
| "num_tokens": 425142063.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.817625458996328, | |
| "grad_norm": 0.24285350739955902, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3156, | |
| "num_tokens": 425760616.0, | |
| "step": 2302 | |
| }, | |
| { | |
| "epoch": 2.820073439412485, | |
| "grad_norm": 0.2502899467945099, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3203, | |
| "num_tokens": 426347703.0, | |
| "step": 2304 | |
| }, | |
| { | |
| "epoch": 2.8225214198286412, | |
| "grad_norm": 0.2567707896232605, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3181, | |
| "num_tokens": 426941074.0, | |
| "step": 2306 | |
| }, | |
| { | |
| "epoch": 2.824969400244798, | |
| "grad_norm": 0.2537354826927185, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3184, | |
| "num_tokens": 427538380.0, | |
| "step": 2308 | |
| }, | |
| { | |
| "epoch": 2.827417380660955, | |
| "grad_norm": 0.3201362192630768, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3258, | |
| "num_tokens": 428145766.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.829865361077111, | |
| "grad_norm": 0.25844812393188477, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3221, | |
| "num_tokens": 428713019.0, | |
| "step": 2312 | |
| }, | |
| { | |
| "epoch": 2.832313341493268, | |
| "grad_norm": 0.2547036409378052, | |
| "learning_rate": 1e-05, | |
| "loss": 0.321, | |
| "num_tokens": 429313757.0, | |
| "step": 2314 | |
| }, | |
| { | |
| "epoch": 2.8347613219094248, | |
| "grad_norm": 0.2565271854400635, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3156, | |
| "num_tokens": 429881555.0, | |
| "step": 2316 | |
| }, | |
| { | |
| "epoch": 2.8372093023255816, | |
| "grad_norm": 0.2533104717731476, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3249, | |
| "num_tokens": 430476119.0, | |
| "step": 2318 | |
| }, | |
| { | |
| "epoch": 2.8396572827417383, | |
| "grad_norm": 0.2432134449481964, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3286, | |
| "num_tokens": 431085110.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.8421052631578947, | |
| "grad_norm": 0.2589612901210785, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3259, | |
| "num_tokens": 431646542.0, | |
| "step": 2322 | |
| }, | |
| { | |
| "epoch": 2.8445532435740515, | |
| "grad_norm": 0.25283631682395935, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3169, | |
| "num_tokens": 432238611.0, | |
| "step": 2324 | |
| }, | |
| { | |
| "epoch": 2.8470012239902083, | |
| "grad_norm": 0.27747154235839844, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3281, | |
| "num_tokens": 432805736.0, | |
| "step": 2326 | |
| }, | |
| { | |
| "epoch": 2.8494492044063646, | |
| "grad_norm": 0.24919773638248444, | |
| "learning_rate": 1e-05, | |
| "loss": 0.327, | |
| "num_tokens": 433416751.0, | |
| "step": 2328 | |
| }, | |
| { | |
| "epoch": 2.8518971848225214, | |
| "grad_norm": 0.254692405462265, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3198, | |
| "num_tokens": 434017692.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.854345165238678, | |
| "grad_norm": 0.2547382414340973, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3215, | |
| "num_tokens": 434628565.0, | |
| "step": 2332 | |
| }, | |
| { | |
| "epoch": 2.8567931456548346, | |
| "grad_norm": 0.3222307562828064, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3131, | |
| "num_tokens": 435187461.0, | |
| "step": 2334 | |
| }, | |
| { | |
| "epoch": 2.8592411260709913, | |
| "grad_norm": 0.2559938132762909, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3344, | |
| "num_tokens": 435791429.0, | |
| "step": 2336 | |
| }, | |
| { | |
| "epoch": 2.861689106487148, | |
| "grad_norm": 0.24854914844036102, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3267, | |
| "num_tokens": 436369357.0, | |
| "step": 2338 | |
| }, | |
| { | |
| "epoch": 2.864137086903305, | |
| "grad_norm": 0.24382859468460083, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3377, | |
| "num_tokens": 436988615.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.8665850673194617, | |
| "grad_norm": 0.24978092312812805, | |
| "learning_rate": 1e-05, | |
| "loss": 0.318, | |
| "num_tokens": 437587075.0, | |
| "step": 2342 | |
| }, | |
| { | |
| "epoch": 2.869033047735618, | |
| "grad_norm": 0.2609320282936096, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3202, | |
| "num_tokens": 438146930.0, | |
| "step": 2344 | |
| }, | |
| { | |
| "epoch": 2.871481028151775, | |
| "grad_norm": 0.25630444288253784, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3284, | |
| "num_tokens": 438732548.0, | |
| "step": 2346 | |
| }, | |
| { | |
| "epoch": 2.8739290085679317, | |
| "grad_norm": 0.24788805842399597, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3333, | |
| "num_tokens": 439328118.0, | |
| "step": 2348 | |
| }, | |
| { | |
| "epoch": 2.876376988984088, | |
| "grad_norm": 0.24909254908561707, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3258, | |
| "num_tokens": 439919885.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.878824969400245, | |
| "grad_norm": 0.2596440613269806, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3231, | |
| "num_tokens": 440495586.0, | |
| "step": 2352 | |
| }, | |
| { | |
| "epoch": 2.8812729498164016, | |
| "grad_norm": 0.2636336386203766, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3093, | |
| "num_tokens": 441074305.0, | |
| "step": 2354 | |
| }, | |
| { | |
| "epoch": 2.883720930232558, | |
| "grad_norm": 0.3469390869140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3108, | |
| "num_tokens": 441634567.0, | |
| "step": 2356 | |
| }, | |
| { | |
| "epoch": 2.8861689106487147, | |
| "grad_norm": 0.26014935970306396, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3225, | |
| "num_tokens": 442213028.0, | |
| "step": 2358 | |
| }, | |
| { | |
| "epoch": 2.8886168910648715, | |
| "grad_norm": 0.2676273286342621, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3327, | |
| "num_tokens": 442803814.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.8910648714810283, | |
| "grad_norm": 0.24446800351142883, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3238, | |
| "num_tokens": 443397503.0, | |
| "step": 2362 | |
| }, | |
| { | |
| "epoch": 2.8935128518971847, | |
| "grad_norm": 0.25436699390411377, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3339, | |
| "num_tokens": 444015938.0, | |
| "step": 2364 | |
| }, | |
| { | |
| "epoch": 2.8959608323133414, | |
| "grad_norm": 0.24916264414787292, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3331, | |
| "num_tokens": 444605250.0, | |
| "step": 2366 | |
| }, | |
| { | |
| "epoch": 2.8984088127294982, | |
| "grad_norm": 0.24253615736961365, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3193, | |
| "num_tokens": 445196912.0, | |
| "step": 2368 | |
| }, | |
| { | |
| "epoch": 2.900856793145655, | |
| "grad_norm": 0.2454930543899536, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3155, | |
| "num_tokens": 445802496.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.9033047735618114, | |
| "grad_norm": 0.24793769419193268, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3285, | |
| "num_tokens": 446401922.0, | |
| "step": 2372 | |
| }, | |
| { | |
| "epoch": 2.905752753977968, | |
| "grad_norm": 0.24584895372390747, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3211, | |
| "num_tokens": 447001867.0, | |
| "step": 2374 | |
| }, | |
| { | |
| "epoch": 2.908200734394125, | |
| "grad_norm": 0.26117268204689026, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3211, | |
| "num_tokens": 447551496.0, | |
| "step": 2376 | |
| }, | |
| { | |
| "epoch": 2.9106487148102813, | |
| "grad_norm": 0.24407590925693512, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3158, | |
| "num_tokens": 448133543.0, | |
| "step": 2378 | |
| }, | |
| { | |
| "epoch": 2.913096695226438, | |
| "grad_norm": 0.24445265531539917, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3237, | |
| "num_tokens": 448731691.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.915544675642595, | |
| "grad_norm": 0.24581503868103027, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3211, | |
| "num_tokens": 449327173.0, | |
| "step": 2382 | |
| }, | |
| { | |
| "epoch": 2.9179926560587517, | |
| "grad_norm": 0.2608600854873657, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3281, | |
| "num_tokens": 449920674.0, | |
| "step": 2384 | |
| }, | |
| { | |
| "epoch": 2.920440636474908, | |
| "grad_norm": 0.24781474471092224, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3138, | |
| "num_tokens": 450507378.0, | |
| "step": 2386 | |
| }, | |
| { | |
| "epoch": 2.922888616891065, | |
| "grad_norm": 0.24228855967521667, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3183, | |
| "num_tokens": 451081732.0, | |
| "step": 2388 | |
| }, | |
| { | |
| "epoch": 2.9253365973072216, | |
| "grad_norm": 0.257331520318985, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3198, | |
| "num_tokens": 451660618.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.9277845777233784, | |
| "grad_norm": 0.2509153485298157, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3227, | |
| "num_tokens": 452229573.0, | |
| "step": 2392 | |
| }, | |
| { | |
| "epoch": 2.9302325581395348, | |
| "grad_norm": 0.25570929050445557, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3223, | |
| "num_tokens": 452841553.0, | |
| "step": 2394 | |
| }, | |
| { | |
| "epoch": 2.9326805385556916, | |
| "grad_norm": 0.24994735419750214, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3223, | |
| "num_tokens": 453429653.0, | |
| "step": 2396 | |
| }, | |
| { | |
| "epoch": 2.9351285189718483, | |
| "grad_norm": 0.25359705090522766, | |
| "learning_rate": 1e-05, | |
| "loss": 0.323, | |
| "num_tokens": 454026873.0, | |
| "step": 2398 | |
| }, | |
| { | |
| "epoch": 2.9375764993880047, | |
| "grad_norm": 0.24387480318546295, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3261, | |
| "num_tokens": 454620108.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.9400244798041615, | |
| "grad_norm": 0.2454231232404709, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3292, | |
| "num_tokens": 455236527.0, | |
| "step": 2402 | |
| }, | |
| { | |
| "epoch": 2.9424724602203183, | |
| "grad_norm": 0.2575424015522003, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3285, | |
| "num_tokens": 455862858.0, | |
| "step": 2404 | |
| }, | |
| { | |
| "epoch": 2.944920440636475, | |
| "grad_norm": 0.2581271231174469, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3318, | |
| "num_tokens": 456431610.0, | |
| "step": 2406 | |
| }, | |
| { | |
| "epoch": 2.9473684210526314, | |
| "grad_norm": 0.24646282196044922, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3201, | |
| "num_tokens": 457014496.0, | |
| "step": 2408 | |
| }, | |
| { | |
| "epoch": 2.949816401468788, | |
| "grad_norm": 0.2476486712694168, | |
| "learning_rate": 1e-05, | |
| "loss": 0.327, | |
| "num_tokens": 457624802.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 2.952264381884945, | |
| "grad_norm": 0.2568027973175049, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3165, | |
| "num_tokens": 458184907.0, | |
| "step": 2412 | |
| }, | |
| { | |
| "epoch": 2.954712362301102, | |
| "grad_norm": 0.27381864190101624, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3163, | |
| "num_tokens": 458756228.0, | |
| "step": 2414 | |
| }, | |
| { | |
| "epoch": 2.957160342717258, | |
| "grad_norm": 0.2646235227584839, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3248, | |
| "num_tokens": 459326023.0, | |
| "step": 2416 | |
| }, | |
| { | |
| "epoch": 2.959608323133415, | |
| "grad_norm": 0.2521771490573883, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3436, | |
| "num_tokens": 459922424.0, | |
| "step": 2418 | |
| }, | |
| { | |
| "epoch": 2.9620563035495717, | |
| "grad_norm": 0.25192415714263916, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3258, | |
| "num_tokens": 460506985.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.964504283965728, | |
| "grad_norm": 0.24735009670257568, | |
| "learning_rate": 1e-05, | |
| "loss": 0.32, | |
| "num_tokens": 461089976.0, | |
| "step": 2422 | |
| }, | |
| { | |
| "epoch": 2.966952264381885, | |
| "grad_norm": 0.25478076934814453, | |
| "learning_rate": 1e-05, | |
| "loss": 0.322, | |
| "num_tokens": 461681056.0, | |
| "step": 2424 | |
| }, | |
| { | |
| "epoch": 2.9694002447980417, | |
| "grad_norm": 0.24166902899742126, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3199, | |
| "num_tokens": 462278680.0, | |
| "step": 2426 | |
| }, | |
| { | |
| "epoch": 2.9718482252141984, | |
| "grad_norm": 0.23424232006072998, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3292, | |
| "num_tokens": 462918784.0, | |
| "step": 2428 | |
| }, | |
| { | |
| "epoch": 2.974296205630355, | |
| "grad_norm": 0.24685415625572205, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3349, | |
| "num_tokens": 463532263.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.9767441860465116, | |
| "grad_norm": 0.24677598476409912, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3235, | |
| "num_tokens": 464170104.0, | |
| "step": 2432 | |
| }, | |
| { | |
| "epoch": 2.9791921664626684, | |
| "grad_norm": 0.257145494222641, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3254, | |
| "num_tokens": 464754665.0, | |
| "step": 2434 | |
| }, | |
| { | |
| "epoch": 2.981640146878825, | |
| "grad_norm": 0.2553233802318573, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3316, | |
| "num_tokens": 465349241.0, | |
| "step": 2436 | |
| }, | |
| { | |
| "epoch": 2.9840881272949815, | |
| "grad_norm": 0.2508125305175781, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3222, | |
| "num_tokens": 465923272.0, | |
| "step": 2438 | |
| }, | |
| { | |
| "epoch": 2.9865361077111383, | |
| "grad_norm": 0.2562168538570404, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3223, | |
| "num_tokens": 466496143.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.988984088127295, | |
| "grad_norm": 0.24228590726852417, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3196, | |
| "num_tokens": 467109273.0, | |
| "step": 2442 | |
| }, | |
| { | |
| "epoch": 2.9914320685434515, | |
| "grad_norm": 0.24678544700145721, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3139, | |
| "num_tokens": 467701007.0, | |
| "step": 2444 | |
| }, | |
| { | |
| "epoch": 2.9938800489596082, | |
| "grad_norm": 0.2628517746925354, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3247, | |
| "num_tokens": 468293330.0, | |
| "step": 2446 | |
| }, | |
| { | |
| "epoch": 2.996328029375765, | |
| "grad_norm": 0.2700228691101074, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3248, | |
| "num_tokens": 468896606.0, | |
| "step": 2448 | |
| }, | |
| { | |
| "epoch": 2.998776009791922, | |
| "grad_norm": 0.24113397300243378, | |
| "learning_rate": 1e-05, | |
| "loss": 0.3309, | |
| "num_tokens": 469499267.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "num_tokens": 469802937.0, | |
| "step": 2451, | |
| "total_flos": 2.993584104037312e+19, | |
| "train_loss": 0.2255855570249001, | |
| "train_runtime": 11813.0141, | |
| "train_samples_per_second": 23.224, | |
| "train_steps_per_second": 0.207 | |
| } | |
| ], | |
| "logging_steps": 2, | |
| "max_steps": 2451, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 123, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.993584104037312e+19, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |