diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9844 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 2451, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0024479804161566705, + "grad_norm": 5.113162517547607, + "learning_rate": 2.0325203252032523e-07, + "loss": 1.1539, + "num_tokens": 579147.0, + "step": 2 + }, + { + "epoch": 0.004895960832313341, + "grad_norm": 5.000253677368164, + "learning_rate": 6.097560975609757e-07, + "loss": 1.1348, + "num_tokens": 1171666.0, + "step": 4 + }, + { + "epoch": 0.0073439412484700125, + "grad_norm": 4.801650524139404, + "learning_rate": 1.0162601626016261e-06, + "loss": 1.1433, + "num_tokens": 1782835.0, + "step": 6 + }, + { + "epoch": 0.009791921664626682, + "grad_norm": 4.334924221038818, + "learning_rate": 1.4227642276422764e-06, + "loss": 1.1132, + "num_tokens": 2373836.0, + "step": 8 + }, + { + "epoch": 0.012239902080783354, + "grad_norm": 4.126493453979492, + "learning_rate": 1.8292682926829268e-06, + "loss": 1.0822, + "num_tokens": 2949918.0, + "step": 10 + }, + { + "epoch": 0.014687882496940025, + "grad_norm": 2.961500883102417, + "learning_rate": 2.2357723577235773e-06, + "loss": 0.9786, + "num_tokens": 3550879.0, + "step": 12 + }, + { + "epoch": 0.017135862913096694, + "grad_norm": 1.8258001804351807, + "learning_rate": 2.6422764227642278e-06, + "loss": 0.9174, + "num_tokens": 4150487.0, + "step": 14 + }, + { + "epoch": 0.019583843329253364, + "grad_norm": 1.5126988887786865, + "learning_rate": 3.0487804878048782e-06, + "loss": 0.8089, + "num_tokens": 4732419.0, + "step": 16 + }, + { + "epoch": 0.022031823745410038, + "grad_norm": 1.187708854675293, + "learning_rate": 3.4552845528455287e-06, + "loss": 0.7966, + "num_tokens": 5323835.0, + "step": 18 + }, + { + "epoch": 0.02447980416156671, + "grad_norm": 1.0947265625, + "learning_rate": 3.861788617886179e-06, + "loss": 0.7432, + "num_tokens": 5907508.0, + "step": 20 + }, + { + "epoch": 0.02692778457772338, + "grad_norm": 0.8632480502128601, + "learning_rate": 4.26829268292683e-06, + "loss": 0.7412, + "num_tokens": 6502779.0, + "step": 22 + }, + { + "epoch": 0.02937576499388005, + "grad_norm": 0.5999342799186707, + "learning_rate": 4.67479674796748e-06, + "loss": 0.6835, + "num_tokens": 7084297.0, + "step": 24 + }, + { + "epoch": 0.03182374541003672, + "grad_norm": 0.4732573628425598, + "learning_rate": 5.08130081300813e-06, + "loss": 0.6666, + "num_tokens": 7677074.0, + "step": 26 + }, + { + "epoch": 0.03427172582619339, + "grad_norm": 0.44881582260131836, + "learning_rate": 5.487804878048781e-06, + "loss": 0.6475, + "num_tokens": 8262795.0, + "step": 28 + }, + { + "epoch": 0.03671970624235006, + "grad_norm": 0.3851088881492615, + "learning_rate": 5.894308943089431e-06, + "loss": 0.6393, + "num_tokens": 8856597.0, + "step": 30 + }, + { + "epoch": 0.03916768665850673, + "grad_norm": 0.37443217635154724, + "learning_rate": 6.300813008130081e-06, + "loss": 0.6266, + "num_tokens": 9443865.0, + "step": 32 + }, + { + "epoch": 0.0416156670746634, + "grad_norm": 0.3474370837211609, + "learning_rate": 6.707317073170733e-06, + "loss": 0.6097, + "num_tokens": 10028572.0, + "step": 34 + }, + { + "epoch": 0.044063647490820076, + "grad_norm": 0.31487369537353516, + "learning_rate": 7.113821138211382e-06, + "loss": 0.6122, + "num_tokens": 10622144.0, + "step": 36 + }, + { + "epoch": 0.046511627906976744, + "grad_norm": 0.30519258975982666, + "learning_rate": 7.520325203252034e-06, + "loss": 0.5771, + "num_tokens": 11195500.0, + "step": 38 + }, + { + "epoch": 0.04895960832313342, + "grad_norm": 0.29173702001571655, + "learning_rate": 7.926829268292683e-06, + "loss": 0.5693, + "num_tokens": 11804642.0, + "step": 40 + }, + { + "epoch": 0.051407588739290085, + "grad_norm": 0.2927861511707306, + "learning_rate": 8.333333333333334e-06, + "loss": 0.5603, + "num_tokens": 12412613.0, + "step": 42 + }, + { + "epoch": 0.05385556915544676, + "grad_norm": 0.27720212936401367, + "learning_rate": 8.739837398373985e-06, + "loss": 0.5528, + "num_tokens": 12980451.0, + "step": 44 + }, + { + "epoch": 0.056303549571603426, + "grad_norm": 0.2664688229560852, + "learning_rate": 9.146341463414634e-06, + "loss": 0.5518, + "num_tokens": 13573410.0, + "step": 46 + }, + { + "epoch": 0.0587515299877601, + "grad_norm": 0.260998398065567, + "learning_rate": 9.552845528455286e-06, + "loss": 0.5618, + "num_tokens": 14151346.0, + "step": 48 + }, + { + "epoch": 0.06119951040391677, + "grad_norm": 0.2568208873271942, + "learning_rate": 9.959349593495936e-06, + "loss": 0.5638, + "num_tokens": 14772803.0, + "step": 50 + }, + { + "epoch": 0.06364749082007344, + "grad_norm": 0.265893816947937, + "learning_rate": 1.0365853658536585e-05, + "loss": 0.5446, + "num_tokens": 15354869.0, + "step": 52 + }, + { + "epoch": 0.06609547123623011, + "grad_norm": 0.26780450344085693, + "learning_rate": 1.0772357723577237e-05, + "loss": 0.5385, + "num_tokens": 15944249.0, + "step": 54 + }, + { + "epoch": 0.06854345165238677, + "grad_norm": 0.2555972933769226, + "learning_rate": 1.1178861788617887e-05, + "loss": 0.5153, + "num_tokens": 16523611.0, + "step": 56 + }, + { + "epoch": 0.07099143206854346, + "grad_norm": 0.28074198961257935, + "learning_rate": 1.1585365853658537e-05, + "loss": 0.5315, + "num_tokens": 17110353.0, + "step": 58 + }, + { + "epoch": 0.07343941248470012, + "grad_norm": 0.2655738294124603, + "learning_rate": 1.1991869918699188e-05, + "loss": 0.5317, + "num_tokens": 17682600.0, + "step": 60 + }, + { + "epoch": 0.07588739290085679, + "grad_norm": 0.2633155882358551, + "learning_rate": 1.2398373983739837e-05, + "loss": 0.5191, + "num_tokens": 18274385.0, + "step": 62 + }, + { + "epoch": 0.07833537331701346, + "grad_norm": 0.26505130529403687, + "learning_rate": 1.2804878048780488e-05, + "loss": 0.5439, + "num_tokens": 18864027.0, + "step": 64 + }, + { + "epoch": 0.08078335373317014, + "grad_norm": 0.2558540403842926, + "learning_rate": 1.321138211382114e-05, + "loss": 0.5095, + "num_tokens": 19450167.0, + "step": 66 + }, + { + "epoch": 0.0832313341493268, + "grad_norm": 0.26796481013298035, + "learning_rate": 1.3617886178861788e-05, + "loss": 0.5101, + "num_tokens": 20027616.0, + "step": 68 + }, + { + "epoch": 0.08567931456548347, + "grad_norm": 0.2605902850627899, + "learning_rate": 1.4024390243902441e-05, + "loss": 0.5202, + "num_tokens": 20641209.0, + "step": 70 + }, + { + "epoch": 0.08812729498164015, + "grad_norm": 0.2739558219909668, + "learning_rate": 1.443089430894309e-05, + "loss": 0.5019, + "num_tokens": 21231993.0, + "step": 72 + }, + { + "epoch": 0.09057527539779682, + "grad_norm": 0.2672581076622009, + "learning_rate": 1.4837398373983741e-05, + "loss": 0.4958, + "num_tokens": 21822237.0, + "step": 74 + }, + { + "epoch": 0.09302325581395349, + "grad_norm": 0.2613845467567444, + "learning_rate": 1.524390243902439e-05, + "loss": 0.5182, + "num_tokens": 22406364.0, + "step": 76 + }, + { + "epoch": 0.09547123623011015, + "grad_norm": 0.2513177692890167, + "learning_rate": 1.565040650406504e-05, + "loss": 0.5106, + "num_tokens": 23012261.0, + "step": 78 + }, + { + "epoch": 0.09791921664626684, + "grad_norm": 0.2592834532260895, + "learning_rate": 1.6056910569105692e-05, + "loss": 0.5066, + "num_tokens": 23609152.0, + "step": 80 + }, + { + "epoch": 0.1003671970624235, + "grad_norm": 0.24914328753948212, + "learning_rate": 1.6463414634146345e-05, + "loss": 0.5024, + "num_tokens": 24195467.0, + "step": 82 + }, + { + "epoch": 0.10281517747858017, + "grad_norm": 0.269598126411438, + "learning_rate": 1.6869918699186994e-05, + "loss": 0.4902, + "num_tokens": 24776287.0, + "step": 84 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 0.2777648866176605, + "learning_rate": 1.7276422764227643e-05, + "loss": 0.5007, + "num_tokens": 25353355.0, + "step": 86 + }, + { + "epoch": 0.10771113831089352, + "grad_norm": 0.2843937277793884, + "learning_rate": 1.7682926829268292e-05, + "loss": 0.4967, + "num_tokens": 25910479.0, + "step": 88 + }, + { + "epoch": 0.11015911872705018, + "grad_norm": 0.2720652222633362, + "learning_rate": 1.8089430894308945e-05, + "loss": 0.5036, + "num_tokens": 26499675.0, + "step": 90 + }, + { + "epoch": 0.11260709914320685, + "grad_norm": 0.25513291358947754, + "learning_rate": 1.8495934959349594e-05, + "loss": 0.4842, + "num_tokens": 27077382.0, + "step": 92 + }, + { + "epoch": 0.11505507955936352, + "grad_norm": 0.29018568992614746, + "learning_rate": 1.8902439024390246e-05, + "loss": 0.4968, + "num_tokens": 27670497.0, + "step": 94 + }, + { + "epoch": 0.1175030599755202, + "grad_norm": 0.2705482244491577, + "learning_rate": 1.9308943089430896e-05, + "loss": 0.4894, + "num_tokens": 28263901.0, + "step": 96 + }, + { + "epoch": 0.11995104039167687, + "grad_norm": 0.25819292664527893, + "learning_rate": 1.9715447154471545e-05, + "loss": 0.4706, + "num_tokens": 28845275.0, + "step": 98 + }, + { + "epoch": 0.12239902080783353, + "grad_norm": 0.2706543505191803, + "learning_rate": 2.0121951219512197e-05, + "loss": 0.4878, + "num_tokens": 29436166.0, + "step": 100 + }, + { + "epoch": 0.12484700122399021, + "grad_norm": 0.28708574175834656, + "learning_rate": 2.0528455284552847e-05, + "loss": 0.4837, + "num_tokens": 30044583.0, + "step": 102 + }, + { + "epoch": 0.12729498164014688, + "grad_norm": 0.2903023362159729, + "learning_rate": 2.0934959349593496e-05, + "loss": 0.485, + "num_tokens": 30645726.0, + "step": 104 + }, + { + "epoch": 0.12974296205630356, + "grad_norm": 0.3191259503364563, + "learning_rate": 2.134146341463415e-05, + "loss": 0.4979, + "num_tokens": 31251475.0, + "step": 106 + }, + { + "epoch": 0.13219094247246022, + "grad_norm": 0.2853090763092041, + "learning_rate": 2.1747967479674798e-05, + "loss": 0.4726, + "num_tokens": 31830636.0, + "step": 108 + }, + { + "epoch": 0.1346389228886169, + "grad_norm": 0.2693997919559479, + "learning_rate": 2.215447154471545e-05, + "loss": 0.4926, + "num_tokens": 32435998.0, + "step": 110 + }, + { + "epoch": 0.13708690330477355, + "grad_norm": 0.2778482735157013, + "learning_rate": 2.25609756097561e-05, + "loss": 0.4729, + "num_tokens": 33041535.0, + "step": 112 + }, + { + "epoch": 0.13953488372093023, + "grad_norm": 0.28160908818244934, + "learning_rate": 2.296747967479675e-05, + "loss": 0.4877, + "num_tokens": 33617226.0, + "step": 114 + }, + { + "epoch": 0.1419828641370869, + "grad_norm": 0.28517454862594604, + "learning_rate": 2.3373983739837398e-05, + "loss": 0.481, + "num_tokens": 34184157.0, + "step": 116 + }, + { + "epoch": 0.14443084455324356, + "grad_norm": 0.29330670833587646, + "learning_rate": 2.378048780487805e-05, + "loss": 0.4694, + "num_tokens": 34779714.0, + "step": 118 + }, + { + "epoch": 0.14687882496940025, + "grad_norm": 0.2896597683429718, + "learning_rate": 2.4186991869918703e-05, + "loss": 0.4701, + "num_tokens": 35361759.0, + "step": 120 + }, + { + "epoch": 0.14932680538555693, + "grad_norm": 0.2746509611606598, + "learning_rate": 2.4593495934959352e-05, + "loss": 0.4724, + "num_tokens": 35960992.0, + "step": 122 + }, + { + "epoch": 0.15177478580171358, + "grad_norm": 0.2720145285129547, + "learning_rate": 2.5e-05, + "loss": 0.4851, + "num_tokens": 36577873.0, + "step": 124 + }, + { + "epoch": 0.15422276621787026, + "grad_norm": 0.29591360688209534, + "learning_rate": 2.5406504065040654e-05, + "loss": 0.4862, + "num_tokens": 37173083.0, + "step": 126 + }, + { + "epoch": 0.15667074663402691, + "grad_norm": 0.2629620134830475, + "learning_rate": 2.58130081300813e-05, + "loss": 0.4833, + "num_tokens": 37772321.0, + "step": 128 + }, + { + "epoch": 0.1591187270501836, + "grad_norm": 0.27666398882865906, + "learning_rate": 2.6219512195121952e-05, + "loss": 0.4879, + "num_tokens": 38366280.0, + "step": 130 + }, + { + "epoch": 0.16156670746634028, + "grad_norm": 0.27502506971359253, + "learning_rate": 2.66260162601626e-05, + "loss": 0.4984, + "num_tokens": 38963886.0, + "step": 132 + }, + { + "epoch": 0.16401468788249693, + "grad_norm": 0.2742544412612915, + "learning_rate": 2.7032520325203254e-05, + "loss": 0.4685, + "num_tokens": 39590863.0, + "step": 134 + }, + { + "epoch": 0.1664626682986536, + "grad_norm": 0.2967033386230469, + "learning_rate": 2.7439024390243906e-05, + "loss": 0.4712, + "num_tokens": 40182071.0, + "step": 136 + }, + { + "epoch": 0.1689106487148103, + "grad_norm": 0.2629011273384094, + "learning_rate": 2.7845528455284552e-05, + "loss": 0.4678, + "num_tokens": 40758834.0, + "step": 138 + }, + { + "epoch": 0.17135862913096694, + "grad_norm": 0.2847088873386383, + "learning_rate": 2.8252032520325205e-05, + "loss": 0.4971, + "num_tokens": 41366991.0, + "step": 140 + }, + { + "epoch": 0.17380660954712362, + "grad_norm": 0.30970853567123413, + "learning_rate": 2.8658536585365854e-05, + "loss": 0.4875, + "num_tokens": 41970776.0, + "step": 142 + }, + { + "epoch": 0.1762545899632803, + "grad_norm": 0.28295695781707764, + "learning_rate": 2.9065040650406507e-05, + "loss": 0.4694, + "num_tokens": 42545064.0, + "step": 144 + }, + { + "epoch": 0.17870257037943696, + "grad_norm": 0.2902471721172333, + "learning_rate": 2.947154471544716e-05, + "loss": 0.4615, + "num_tokens": 43097646.0, + "step": 146 + }, + { + "epoch": 0.18115055079559364, + "grad_norm": 0.29337817430496216, + "learning_rate": 2.9878048780487805e-05, + "loss": 0.4897, + "num_tokens": 43704720.0, + "step": 148 + }, + { + "epoch": 0.1835985312117503, + "grad_norm": 0.3198813796043396, + "learning_rate": 3.0284552845528458e-05, + "loss": 0.4497, + "num_tokens": 44291468.0, + "step": 150 + }, + { + "epoch": 0.18604651162790697, + "grad_norm": 0.31788963079452515, + "learning_rate": 3.069105691056911e-05, + "loss": 0.4773, + "num_tokens": 44889809.0, + "step": 152 + }, + { + "epoch": 0.18849449204406366, + "grad_norm": 0.2984258234500885, + "learning_rate": 3.109756097560976e-05, + "loss": 0.4651, + "num_tokens": 45471363.0, + "step": 154 + }, + { + "epoch": 0.1909424724602203, + "grad_norm": 0.3456822335720062, + "learning_rate": 3.150406504065041e-05, + "loss": 0.4619, + "num_tokens": 46044686.0, + "step": 156 + }, + { + "epoch": 0.193390452876377, + "grad_norm": 0.2635233700275421, + "learning_rate": 3.191056910569106e-05, + "loss": 0.4501, + "num_tokens": 46638282.0, + "step": 158 + }, + { + "epoch": 0.19583843329253367, + "grad_norm": 0.29789999127388, + "learning_rate": 3.231707317073171e-05, + "loss": 0.477, + "num_tokens": 47254725.0, + "step": 160 + }, + { + "epoch": 0.19828641370869032, + "grad_norm": 0.32337552309036255, + "learning_rate": 3.2723577235772356e-05, + "loss": 0.4622, + "num_tokens": 47825874.0, + "step": 162 + }, + { + "epoch": 0.200734394124847, + "grad_norm": 0.32470235228538513, + "learning_rate": 3.313008130081301e-05, + "loss": 0.4678, + "num_tokens": 48404000.0, + "step": 164 + }, + { + "epoch": 0.20318237454100369, + "grad_norm": 0.3373109698295593, + "learning_rate": 3.353658536585366e-05, + "loss": 0.4796, + "num_tokens": 49020620.0, + "step": 166 + }, + { + "epoch": 0.20563035495716034, + "grad_norm": 0.29925215244293213, + "learning_rate": 3.394308943089431e-05, + "loss": 0.4649, + "num_tokens": 49610138.0, + "step": 168 + }, + { + "epoch": 0.20807833537331702, + "grad_norm": 0.2854861319065094, + "learning_rate": 3.434959349593496e-05, + "loss": 0.4698, + "num_tokens": 50239651.0, + "step": 170 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.30415239930152893, + "learning_rate": 3.475609756097561e-05, + "loss": 0.4561, + "num_tokens": 50821539.0, + "step": 172 + }, + { + "epoch": 0.21297429620563035, + "grad_norm": 0.32493048906326294, + "learning_rate": 3.5162601626016265e-05, + "loss": 0.4654, + "num_tokens": 51392202.0, + "step": 174 + }, + { + "epoch": 0.21542227662178703, + "grad_norm": 0.3101598024368286, + "learning_rate": 3.556910569105692e-05, + "loss": 0.4725, + "num_tokens": 51973577.0, + "step": 176 + }, + { + "epoch": 0.2178702570379437, + "grad_norm": 0.2753882110118866, + "learning_rate": 3.597560975609756e-05, + "loss": 0.4585, + "num_tokens": 52557644.0, + "step": 178 + }, + { + "epoch": 0.22031823745410037, + "grad_norm": 0.28565946221351624, + "learning_rate": 3.6382113821138216e-05, + "loss": 0.4575, + "num_tokens": 53136591.0, + "step": 180 + }, + { + "epoch": 0.22276621787025705, + "grad_norm": 0.31612062454223633, + "learning_rate": 3.678861788617886e-05, + "loss": 0.4737, + "num_tokens": 53723609.0, + "step": 182 + }, + { + "epoch": 0.2252141982864137, + "grad_norm": 0.30531564354896545, + "learning_rate": 3.7195121951219514e-05, + "loss": 0.4758, + "num_tokens": 54317296.0, + "step": 184 + }, + { + "epoch": 0.22766217870257038, + "grad_norm": 0.31228843331336975, + "learning_rate": 3.760162601626017e-05, + "loss": 0.4597, + "num_tokens": 54898930.0, + "step": 186 + }, + { + "epoch": 0.23011015911872704, + "grad_norm": 0.30503806471824646, + "learning_rate": 3.800813008130081e-05, + "loss": 0.4669, + "num_tokens": 55520199.0, + "step": 188 + }, + { + "epoch": 0.23255813953488372, + "grad_norm": 0.26810380816459656, + "learning_rate": 3.8414634146341465e-05, + "loss": 0.4569, + "num_tokens": 56148692.0, + "step": 190 + }, + { + "epoch": 0.2350061199510404, + "grad_norm": 0.28248918056488037, + "learning_rate": 3.882113821138211e-05, + "loss": 0.4648, + "num_tokens": 56744145.0, + "step": 192 + }, + { + "epoch": 0.23745410036719705, + "grad_norm": 0.2889978289604187, + "learning_rate": 3.922764227642276e-05, + "loss": 0.4617, + "num_tokens": 57372074.0, + "step": 194 + }, + { + "epoch": 0.23990208078335373, + "grad_norm": 0.280788779258728, + "learning_rate": 3.9634146341463416e-05, + "loss": 0.4805, + "num_tokens": 57970018.0, + "step": 196 + }, + { + "epoch": 0.2423500611995104, + "grad_norm": 0.2759288251399994, + "learning_rate": 4.004065040650407e-05, + "loss": 0.4758, + "num_tokens": 58616468.0, + "step": 198 + }, + { + "epoch": 0.24479804161566707, + "grad_norm": 0.32008427381515503, + "learning_rate": 4.044715447154472e-05, + "loss": 0.4631, + "num_tokens": 59212867.0, + "step": 200 + }, + { + "epoch": 0.24724602203182375, + "grad_norm": 0.33380889892578125, + "learning_rate": 4.085365853658537e-05, + "loss": 0.4664, + "num_tokens": 59831819.0, + "step": 202 + }, + { + "epoch": 0.24969400244798043, + "grad_norm": 0.3152978718280792, + "learning_rate": 4.126016260162602e-05, + "loss": 0.4509, + "num_tokens": 60406569.0, + "step": 204 + }, + { + "epoch": 0.2521419828641371, + "grad_norm": 0.3157564103603363, + "learning_rate": 4.166666666666667e-05, + "loss": 0.4674, + "num_tokens": 60992960.0, + "step": 206 + }, + { + "epoch": 0.25458996328029376, + "grad_norm": 0.3165920078754425, + "learning_rate": 4.207317073170732e-05, + "loss": 0.4593, + "num_tokens": 61582502.0, + "step": 208 + }, + { + "epoch": 0.25703794369645044, + "grad_norm": 0.27943071722984314, + "learning_rate": 4.247967479674797e-05, + "loss": 0.4576, + "num_tokens": 62170270.0, + "step": 210 + }, + { + "epoch": 0.2594859241126071, + "grad_norm": 0.3024695813655853, + "learning_rate": 4.2886178861788616e-05, + "loss": 0.4529, + "num_tokens": 62762518.0, + "step": 212 + }, + { + "epoch": 0.26193390452876375, + "grad_norm": 0.29120710492134094, + "learning_rate": 4.329268292682927e-05, + "loss": 0.4612, + "num_tokens": 63338317.0, + "step": 214 + }, + { + "epoch": 0.26438188494492043, + "grad_norm": 0.2895090878009796, + "learning_rate": 4.369918699186992e-05, + "loss": 0.4468, + "num_tokens": 63946534.0, + "step": 216 + }, + { + "epoch": 0.2668298653610771, + "grad_norm": 0.27670982480049133, + "learning_rate": 4.410569105691057e-05, + "loss": 0.4477, + "num_tokens": 64529245.0, + "step": 218 + }, + { + "epoch": 0.2692778457772338, + "grad_norm": 0.2945539057254791, + "learning_rate": 4.451219512195122e-05, + "loss": 0.4713, + "num_tokens": 65101588.0, + "step": 220 + }, + { + "epoch": 0.2717258261933905, + "grad_norm": 0.26842209696769714, + "learning_rate": 4.491869918699187e-05, + "loss": 0.4592, + "num_tokens": 65693459.0, + "step": 222 + }, + { + "epoch": 0.2741738066095471, + "grad_norm": 0.28789806365966797, + "learning_rate": 4.5325203252032525e-05, + "loss": 0.464, + "num_tokens": 66264575.0, + "step": 224 + }, + { + "epoch": 0.2766217870257038, + "grad_norm": 0.2838912904262543, + "learning_rate": 4.573170731707318e-05, + "loss": 0.4424, + "num_tokens": 66836693.0, + "step": 226 + }, + { + "epoch": 0.27906976744186046, + "grad_norm": 0.3043358027935028, + "learning_rate": 4.613821138211382e-05, + "loss": 0.4658, + "num_tokens": 67435169.0, + "step": 228 + }, + { + "epoch": 0.28151774785801714, + "grad_norm": 0.28893885016441345, + "learning_rate": 4.6544715447154476e-05, + "loss": 0.4706, + "num_tokens": 68047327.0, + "step": 230 + }, + { + "epoch": 0.2839657282741738, + "grad_norm": 0.3065444827079773, + "learning_rate": 4.695121951219512e-05, + "loss": 0.463, + "num_tokens": 68652895.0, + "step": 232 + }, + { + "epoch": 0.2864137086903305, + "grad_norm": 0.28071919083595276, + "learning_rate": 4.7357723577235774e-05, + "loss": 0.4645, + "num_tokens": 69234680.0, + "step": 234 + }, + { + "epoch": 0.28886168910648713, + "grad_norm": 0.29526597261428833, + "learning_rate": 4.776422764227643e-05, + "loss": 0.4652, + "num_tokens": 69826413.0, + "step": 236 + }, + { + "epoch": 0.2913096695226438, + "grad_norm": 0.3039819300174713, + "learning_rate": 4.817073170731707e-05, + "loss": 0.4585, + "num_tokens": 70441126.0, + "step": 238 + }, + { + "epoch": 0.2937576499388005, + "grad_norm": 0.26642394065856934, + "learning_rate": 4.8577235772357725e-05, + "loss": 0.4546, + "num_tokens": 71026297.0, + "step": 240 + }, + { + "epoch": 0.2962056303549572, + "grad_norm": 0.2610216736793518, + "learning_rate": 4.898373983739837e-05, + "loss": 0.4457, + "num_tokens": 71615415.0, + "step": 242 + }, + { + "epoch": 0.29865361077111385, + "grad_norm": 0.2910749018192291, + "learning_rate": 4.9390243902439024e-05, + "loss": 0.4591, + "num_tokens": 72187205.0, + "step": 244 + }, + { + "epoch": 0.3011015911872705, + "grad_norm": 0.2814527153968811, + "learning_rate": 4.9796747967479676e-05, + "loss": 0.4521, + "num_tokens": 72766789.0, + "step": 246 + }, + { + "epoch": 0.30354957160342716, + "grad_norm": 0.273590624332428, + "learning_rate": 4.997732426303855e-05, + "loss": 0.4623, + "num_tokens": 73367195.0, + "step": 248 + }, + { + "epoch": 0.30599755201958384, + "grad_norm": 0.3425842225551605, + "learning_rate": 4.993197278911565e-05, + "loss": 0.4687, + "num_tokens": 73931883.0, + "step": 250 + }, + { + "epoch": 0.3084455324357405, + "grad_norm": 0.3046858608722687, + "learning_rate": 4.9886621315192745e-05, + "loss": 0.4656, + "num_tokens": 74512785.0, + "step": 252 + }, + { + "epoch": 0.3108935128518972, + "grad_norm": 0.2800957262516022, + "learning_rate": 4.9841269841269845e-05, + "loss": 0.4554, + "num_tokens": 75103604.0, + "step": 254 + }, + { + "epoch": 0.31334149326805383, + "grad_norm": 0.32037150859832764, + "learning_rate": 4.979591836734694e-05, + "loss": 0.4707, + "num_tokens": 75691456.0, + "step": 256 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 0.3167163133621216, + "learning_rate": 4.975056689342404e-05, + "loss": 0.458, + "num_tokens": 76296514.0, + "step": 258 + }, + { + "epoch": 0.3182374541003672, + "grad_norm": 0.2827634811401367, + "learning_rate": 4.970521541950114e-05, + "loss": 0.4609, + "num_tokens": 76901295.0, + "step": 260 + }, + { + "epoch": 0.32068543451652387, + "grad_norm": 0.3510238528251648, + "learning_rate": 4.965986394557823e-05, + "loss": 0.4515, + "num_tokens": 77473398.0, + "step": 262 + }, + { + "epoch": 0.32313341493268055, + "grad_norm": 0.3035884201526642, + "learning_rate": 4.961451247165533e-05, + "loss": 0.4688, + "num_tokens": 78068399.0, + "step": 264 + }, + { + "epoch": 0.32558139534883723, + "grad_norm": 0.2979573905467987, + "learning_rate": 4.9569160997732425e-05, + "loss": 0.4741, + "num_tokens": 78683018.0, + "step": 266 + }, + { + "epoch": 0.32802937576499386, + "grad_norm": 0.262450248003006, + "learning_rate": 4.9523809523809525e-05, + "loss": 0.4511, + "num_tokens": 79280661.0, + "step": 268 + }, + { + "epoch": 0.33047735618115054, + "grad_norm": 0.31849756836891174, + "learning_rate": 4.9478458049886625e-05, + "loss": 0.4524, + "num_tokens": 79894114.0, + "step": 270 + }, + { + "epoch": 0.3329253365973072, + "grad_norm": 0.31471672654151917, + "learning_rate": 4.9433106575963725e-05, + "loss": 0.4563, + "num_tokens": 80498761.0, + "step": 272 + }, + { + "epoch": 0.3353733170134639, + "grad_norm": 0.3063153922557831, + "learning_rate": 4.938775510204082e-05, + "loss": 0.4454, + "num_tokens": 81090664.0, + "step": 274 + }, + { + "epoch": 0.3378212974296206, + "grad_norm": 0.2842086851596832, + "learning_rate": 4.934240362811792e-05, + "loss": 0.4534, + "num_tokens": 81682310.0, + "step": 276 + }, + { + "epoch": 0.3402692778457772, + "grad_norm": 0.2741132378578186, + "learning_rate": 4.929705215419501e-05, + "loss": 0.4375, + "num_tokens": 82257211.0, + "step": 278 + }, + { + "epoch": 0.3427172582619339, + "grad_norm": 0.27241361141204834, + "learning_rate": 4.925170068027211e-05, + "loss": 0.4391, + "num_tokens": 82873665.0, + "step": 280 + }, + { + "epoch": 0.34516523867809057, + "grad_norm": 0.28653502464294434, + "learning_rate": 4.9206349206349204e-05, + "loss": 0.451, + "num_tokens": 83459915.0, + "step": 282 + }, + { + "epoch": 0.34761321909424725, + "grad_norm": 0.2593699097633362, + "learning_rate": 4.9160997732426304e-05, + "loss": 0.46, + "num_tokens": 84064495.0, + "step": 284 + }, + { + "epoch": 0.35006119951040393, + "grad_norm": 0.2680375874042511, + "learning_rate": 4.9115646258503404e-05, + "loss": 0.4382, + "num_tokens": 84650583.0, + "step": 286 + }, + { + "epoch": 0.3525091799265606, + "grad_norm": 0.2482227087020874, + "learning_rate": 4.9070294784580504e-05, + "loss": 0.4488, + "num_tokens": 85253276.0, + "step": 288 + }, + { + "epoch": 0.35495716034271724, + "grad_norm": 0.27130281925201416, + "learning_rate": 4.9024943310657604e-05, + "loss": 0.4481, + "num_tokens": 85833675.0, + "step": 290 + }, + { + "epoch": 0.3574051407588739, + "grad_norm": 0.27251341938972473, + "learning_rate": 4.89795918367347e-05, + "loss": 0.4415, + "num_tokens": 86421330.0, + "step": 292 + }, + { + "epoch": 0.3598531211750306, + "grad_norm": 0.29418668150901794, + "learning_rate": 4.89342403628118e-05, + "loss": 0.4537, + "num_tokens": 87030151.0, + "step": 294 + }, + { + "epoch": 0.3623011015911873, + "grad_norm": 0.2746061384677887, + "learning_rate": 4.888888888888889e-05, + "loss": 0.4319, + "num_tokens": 87567827.0, + "step": 296 + }, + { + "epoch": 0.36474908200734396, + "grad_norm": 0.27210670709609985, + "learning_rate": 4.884353741496599e-05, + "loss": 0.44, + "num_tokens": 88157730.0, + "step": 298 + }, + { + "epoch": 0.3671970624235006, + "grad_norm": 0.2963593304157257, + "learning_rate": 4.879818594104308e-05, + "loss": 0.4527, + "num_tokens": 88759382.0, + "step": 300 + }, + { + "epoch": 0.36964504283965727, + "grad_norm": 0.31659379601478577, + "learning_rate": 4.875283446712018e-05, + "loss": 0.4473, + "num_tokens": 89342429.0, + "step": 302 + }, + { + "epoch": 0.37209302325581395, + "grad_norm": 0.27105456590652466, + "learning_rate": 4.8707482993197276e-05, + "loss": 0.4466, + "num_tokens": 89934089.0, + "step": 304 + }, + { + "epoch": 0.37454100367197063, + "grad_norm": 0.2646068036556244, + "learning_rate": 4.8662131519274376e-05, + "loss": 0.4426, + "num_tokens": 90525609.0, + "step": 306 + }, + { + "epoch": 0.3769889840881273, + "grad_norm": 0.25810450315475464, + "learning_rate": 4.8616780045351476e-05, + "loss": 0.4221, + "num_tokens": 91116420.0, + "step": 308 + }, + { + "epoch": 0.379436964504284, + "grad_norm": 0.25624772906303406, + "learning_rate": 4.8571428571428576e-05, + "loss": 0.4466, + "num_tokens": 91706872.0, + "step": 310 + }, + { + "epoch": 0.3818849449204406, + "grad_norm": 0.261426717042923, + "learning_rate": 4.8526077097505676e-05, + "loss": 0.4426, + "num_tokens": 92273116.0, + "step": 312 + }, + { + "epoch": 0.3843329253365973, + "grad_norm": 0.28192079067230225, + "learning_rate": 4.848072562358277e-05, + "loss": 0.4513, + "num_tokens": 92877700.0, + "step": 314 + }, + { + "epoch": 0.386780905752754, + "grad_norm": 0.27346062660217285, + "learning_rate": 4.843537414965987e-05, + "loss": 0.4545, + "num_tokens": 93448212.0, + "step": 316 + }, + { + "epoch": 0.38922888616891066, + "grad_norm": 0.3034634292125702, + "learning_rate": 4.839002267573696e-05, + "loss": 0.4443, + "num_tokens": 94015732.0, + "step": 318 + }, + { + "epoch": 0.39167686658506734, + "grad_norm": 0.26929572224617004, + "learning_rate": 4.834467120181406e-05, + "loss": 0.4387, + "num_tokens": 94583212.0, + "step": 320 + }, + { + "epoch": 0.39412484700122397, + "grad_norm": 0.26306092739105225, + "learning_rate": 4.8299319727891155e-05, + "loss": 0.4304, + "num_tokens": 95154747.0, + "step": 322 + }, + { + "epoch": 0.39657282741738065, + "grad_norm": 0.2616621255874634, + "learning_rate": 4.8253968253968255e-05, + "loss": 0.4558, + "num_tokens": 95736389.0, + "step": 324 + }, + { + "epoch": 0.3990208078335373, + "grad_norm": 0.26457124948501587, + "learning_rate": 4.820861678004535e-05, + "loss": 0.4513, + "num_tokens": 96331517.0, + "step": 326 + }, + { + "epoch": 0.401468788249694, + "grad_norm": 0.2591368854045868, + "learning_rate": 4.816326530612245e-05, + "loss": 0.4428, + "num_tokens": 96919369.0, + "step": 328 + }, + { + "epoch": 0.4039167686658507, + "grad_norm": 0.2573201656341553, + "learning_rate": 4.811791383219955e-05, + "loss": 0.4367, + "num_tokens": 97509288.0, + "step": 330 + }, + { + "epoch": 0.40636474908200737, + "grad_norm": 0.243800088763237, + "learning_rate": 4.807256235827665e-05, + "loss": 0.4497, + "num_tokens": 98114408.0, + "step": 332 + }, + { + "epoch": 0.408812729498164, + "grad_norm": 0.25118839740753174, + "learning_rate": 4.802721088435375e-05, + "loss": 0.432, + "num_tokens": 98713441.0, + "step": 334 + }, + { + "epoch": 0.4112607099143207, + "grad_norm": 0.2801310122013092, + "learning_rate": 4.798185941043084e-05, + "loss": 0.4379, + "num_tokens": 99327738.0, + "step": 336 + }, + { + "epoch": 0.41370869033047736, + "grad_norm": 0.2499040812253952, + "learning_rate": 4.793650793650794e-05, + "loss": 0.4425, + "num_tokens": 99919049.0, + "step": 338 + }, + { + "epoch": 0.41615667074663404, + "grad_norm": 0.25859567523002625, + "learning_rate": 4.7891156462585034e-05, + "loss": 0.4318, + "num_tokens": 100516376.0, + "step": 340 + }, + { + "epoch": 0.4186046511627907, + "grad_norm": 0.26886188983917236, + "learning_rate": 4.7845804988662134e-05, + "loss": 0.4457, + "num_tokens": 101133012.0, + "step": 342 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.26822352409362793, + "learning_rate": 4.780045351473923e-05, + "loss": 0.4337, + "num_tokens": 101700842.0, + "step": 344 + }, + { + "epoch": 0.423500611995104, + "grad_norm": 0.2541097104549408, + "learning_rate": 4.775510204081633e-05, + "loss": 0.4505, + "num_tokens": 102301821.0, + "step": 346 + }, + { + "epoch": 0.4259485924112607, + "grad_norm": 0.2610437572002411, + "learning_rate": 4.770975056689343e-05, + "loss": 0.4421, + "num_tokens": 102911147.0, + "step": 348 + }, + { + "epoch": 0.4283965728274174, + "grad_norm": 0.2511634826660156, + "learning_rate": 4.766439909297053e-05, + "loss": 0.4204, + "num_tokens": 103494243.0, + "step": 350 + }, + { + "epoch": 0.43084455324357407, + "grad_norm": 0.25283822417259216, + "learning_rate": 4.761904761904762e-05, + "loss": 0.4404, + "num_tokens": 104100107.0, + "step": 352 + }, + { + "epoch": 0.43329253365973075, + "grad_norm": 0.2520866394042969, + "learning_rate": 4.757369614512472e-05, + "loss": 0.4377, + "num_tokens": 104696583.0, + "step": 354 + }, + { + "epoch": 0.4357405140758874, + "grad_norm": 0.262451171875, + "learning_rate": 4.752834467120182e-05, + "loss": 0.4402, + "num_tokens": 105266973.0, + "step": 356 + }, + { + "epoch": 0.43818849449204406, + "grad_norm": 0.25137123465538025, + "learning_rate": 4.7482993197278913e-05, + "loss": 0.4301, + "num_tokens": 105855583.0, + "step": 358 + }, + { + "epoch": 0.44063647490820074, + "grad_norm": 0.2414858043193817, + "learning_rate": 4.743764172335601e-05, + "loss": 0.4373, + "num_tokens": 106413444.0, + "step": 360 + }, + { + "epoch": 0.4430844553243574, + "grad_norm": 0.26945173740386963, + "learning_rate": 4.7392290249433106e-05, + "loss": 0.4475, + "num_tokens": 107001653.0, + "step": 362 + }, + { + "epoch": 0.4455324357405141, + "grad_norm": 0.2666376233100891, + "learning_rate": 4.7346938775510206e-05, + "loss": 0.4396, + "num_tokens": 107595819.0, + "step": 364 + }, + { + "epoch": 0.4479804161566707, + "grad_norm": 0.24366697669029236, + "learning_rate": 4.73015873015873e-05, + "loss": 0.4137, + "num_tokens": 108198205.0, + "step": 366 + }, + { + "epoch": 0.4504283965728274, + "grad_norm": 0.25142210721969604, + "learning_rate": 4.72562358276644e-05, + "loss": 0.4205, + "num_tokens": 108768930.0, + "step": 368 + }, + { + "epoch": 0.4528763769889841, + "grad_norm": 0.2570977210998535, + "learning_rate": 4.72108843537415e-05, + "loss": 0.4486, + "num_tokens": 109393449.0, + "step": 370 + }, + { + "epoch": 0.45532435740514077, + "grad_norm": 0.23821303248405457, + "learning_rate": 4.71655328798186e-05, + "loss": 0.4291, + "num_tokens": 109998048.0, + "step": 372 + }, + { + "epoch": 0.45777233782129745, + "grad_norm": 0.25796109437942505, + "learning_rate": 4.712018140589569e-05, + "loss": 0.4394, + "num_tokens": 110602780.0, + "step": 374 + }, + { + "epoch": 0.4602203182374541, + "grad_norm": 0.24663180112838745, + "learning_rate": 4.707482993197279e-05, + "loss": 0.4314, + "num_tokens": 111196271.0, + "step": 376 + }, + { + "epoch": 0.46266829865361075, + "grad_norm": 0.24093066155910492, + "learning_rate": 4.702947845804989e-05, + "loss": 0.4263, + "num_tokens": 111762398.0, + "step": 378 + }, + { + "epoch": 0.46511627906976744, + "grad_norm": 0.23159019649028778, + "learning_rate": 4.6984126984126986e-05, + "loss": 0.4344, + "num_tokens": 112344276.0, + "step": 380 + }, + { + "epoch": 0.4675642594859241, + "grad_norm": 0.2863178253173828, + "learning_rate": 4.6938775510204086e-05, + "loss": 0.4284, + "num_tokens": 112929852.0, + "step": 382 + }, + { + "epoch": 0.4700122399020808, + "grad_norm": 0.26733890175819397, + "learning_rate": 4.689342403628118e-05, + "loss": 0.4302, + "num_tokens": 113524769.0, + "step": 384 + }, + { + "epoch": 0.4724602203182375, + "grad_norm": 0.2527005076408386, + "learning_rate": 4.684807256235828e-05, + "loss": 0.4174, + "num_tokens": 114097175.0, + "step": 386 + }, + { + "epoch": 0.4749082007343941, + "grad_norm": 0.2460888773202896, + "learning_rate": 4.680272108843537e-05, + "loss": 0.44, + "num_tokens": 114681195.0, + "step": 388 + }, + { + "epoch": 0.4773561811505508, + "grad_norm": 0.23544274270534515, + "learning_rate": 4.675736961451247e-05, + "loss": 0.4439, + "num_tokens": 115275959.0, + "step": 390 + }, + { + "epoch": 0.47980416156670747, + "grad_norm": 0.3866446316242218, + "learning_rate": 4.671201814058957e-05, + "loss": 0.4202, + "num_tokens": 115836837.0, + "step": 392 + }, + { + "epoch": 0.48225214198286415, + "grad_norm": 0.24436108767986298, + "learning_rate": 4.666666666666667e-05, + "loss": 0.441, + "num_tokens": 116449305.0, + "step": 394 + }, + { + "epoch": 0.4847001223990208, + "grad_norm": 0.2648446261882782, + "learning_rate": 4.6621315192743765e-05, + "loss": 0.4235, + "num_tokens": 117045707.0, + "step": 396 + }, + { + "epoch": 0.48714810281517745, + "grad_norm": 0.23078219592571259, + "learning_rate": 4.6575963718820865e-05, + "loss": 0.4321, + "num_tokens": 117635477.0, + "step": 398 + }, + { + "epoch": 0.48959608323133413, + "grad_norm": 0.2425663322210312, + "learning_rate": 4.653061224489796e-05, + "loss": 0.4361, + "num_tokens": 118231246.0, + "step": 400 + }, + { + "epoch": 0.4920440636474908, + "grad_norm": 0.24755631387233734, + "learning_rate": 4.648526077097506e-05, + "loss": 0.4289, + "num_tokens": 118844143.0, + "step": 402 + }, + { + "epoch": 0.4944920440636475, + "grad_norm": 0.25055649876594543, + "learning_rate": 4.643990929705216e-05, + "loss": 0.4248, + "num_tokens": 119416215.0, + "step": 404 + }, + { + "epoch": 0.4969400244798042, + "grad_norm": 0.23205067217350006, + "learning_rate": 4.639455782312925e-05, + "loss": 0.4319, + "num_tokens": 120009624.0, + "step": 406 + }, + { + "epoch": 0.49938800489596086, + "grad_norm": 0.24580615758895874, + "learning_rate": 4.634920634920635e-05, + "loss": 0.4178, + "num_tokens": 120601206.0, + "step": 408 + }, + { + "epoch": 0.5018359853121175, + "grad_norm": 0.26284557580947876, + "learning_rate": 4.630385487528345e-05, + "loss": 0.4406, + "num_tokens": 121202335.0, + "step": 410 + }, + { + "epoch": 0.5042839657282742, + "grad_norm": 0.2503519058227539, + "learning_rate": 4.625850340136055e-05, + "loss": 0.4303, + "num_tokens": 121787505.0, + "step": 412 + }, + { + "epoch": 0.5067319461444308, + "grad_norm": 0.23268626630306244, + "learning_rate": 4.6213151927437644e-05, + "loss": 0.4302, + "num_tokens": 122383008.0, + "step": 414 + }, + { + "epoch": 0.5091799265605875, + "grad_norm": 0.24775122106075287, + "learning_rate": 4.6167800453514744e-05, + "loss": 0.4396, + "num_tokens": 122980153.0, + "step": 416 + }, + { + "epoch": 0.5116279069767442, + "grad_norm": 0.23879091441631317, + "learning_rate": 4.612244897959184e-05, + "loss": 0.4173, + "num_tokens": 123564055.0, + "step": 418 + }, + { + "epoch": 0.5140758873929009, + "grad_norm": 0.2565329968929291, + "learning_rate": 4.607709750566894e-05, + "loss": 0.4277, + "num_tokens": 124154714.0, + "step": 420 + }, + { + "epoch": 0.5165238678090576, + "grad_norm": 0.24127976596355438, + "learning_rate": 4.603174603174603e-05, + "loss": 0.4368, + "num_tokens": 124725604.0, + "step": 422 + }, + { + "epoch": 0.5189718482252142, + "grad_norm": 0.23468266427516937, + "learning_rate": 4.598639455782313e-05, + "loss": 0.4297, + "num_tokens": 125329669.0, + "step": 424 + }, + { + "epoch": 0.5214198286413708, + "grad_norm": 0.23767265677452087, + "learning_rate": 4.594104308390023e-05, + "loss": 0.4207, + "num_tokens": 125871390.0, + "step": 426 + }, + { + "epoch": 0.5238678090575275, + "grad_norm": 0.22062784433364868, + "learning_rate": 4.589569160997732e-05, + "loss": 0.4353, + "num_tokens": 126485091.0, + "step": 428 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.24753126502037048, + "learning_rate": 4.585034013605442e-05, + "loss": 0.4275, + "num_tokens": 127070743.0, + "step": 430 + }, + { + "epoch": 0.5287637698898409, + "grad_norm": 0.2816902697086334, + "learning_rate": 4.580498866213152e-05, + "loss": 0.4392, + "num_tokens": 127643183.0, + "step": 432 + }, + { + "epoch": 0.5312117503059975, + "grad_norm": 0.2535652220249176, + "learning_rate": 4.575963718820862e-05, + "loss": 0.435, + "num_tokens": 128238869.0, + "step": 434 + }, + { + "epoch": 0.5336597307221542, + "grad_norm": 0.2562016248703003, + "learning_rate": 4.5714285714285716e-05, + "loss": 0.4197, + "num_tokens": 128833166.0, + "step": 436 + }, + { + "epoch": 0.5361077111383109, + "grad_norm": 0.2548801004886627, + "learning_rate": 4.5668934240362816e-05, + "loss": 0.438, + "num_tokens": 129432358.0, + "step": 438 + }, + { + "epoch": 0.5385556915544676, + "grad_norm": 0.2447548508644104, + "learning_rate": 4.562358276643991e-05, + "loss": 0.4118, + "num_tokens": 130005589.0, + "step": 440 + }, + { + "epoch": 0.5410036719706243, + "grad_norm": 0.2459680140018463, + "learning_rate": 4.557823129251701e-05, + "loss": 0.4457, + "num_tokens": 130617968.0, + "step": 442 + }, + { + "epoch": 0.543451652386781, + "grad_norm": 0.24559234082698822, + "learning_rate": 4.55328798185941e-05, + "loss": 0.4155, + "num_tokens": 131211641.0, + "step": 444 + }, + { + "epoch": 0.5458996328029376, + "grad_norm": 0.2618561387062073, + "learning_rate": 4.54875283446712e-05, + "loss": 0.4244, + "num_tokens": 131815751.0, + "step": 446 + }, + { + "epoch": 0.5483476132190942, + "grad_norm": 0.253743052482605, + "learning_rate": 4.54421768707483e-05, + "loss": 0.4303, + "num_tokens": 132435428.0, + "step": 448 + }, + { + "epoch": 0.5507955936352509, + "grad_norm": 0.23675714433193207, + "learning_rate": 4.5396825396825395e-05, + "loss": 0.4385, + "num_tokens": 133026844.0, + "step": 450 + }, + { + "epoch": 0.5532435740514076, + "grad_norm": 0.25438225269317627, + "learning_rate": 4.53514739229025e-05, + "loss": 0.4361, + "num_tokens": 133610279.0, + "step": 452 + }, + { + "epoch": 0.5556915544675642, + "grad_norm": 0.24375559389591217, + "learning_rate": 4.5306122448979595e-05, + "loss": 0.4223, + "num_tokens": 134186783.0, + "step": 454 + }, + { + "epoch": 0.5581395348837209, + "grad_norm": 0.2337706834077835, + "learning_rate": 4.5260770975056695e-05, + "loss": 0.408, + "num_tokens": 134766199.0, + "step": 456 + }, + { + "epoch": 0.5605875152998776, + "grad_norm": 0.2438063621520996, + "learning_rate": 4.521541950113379e-05, + "loss": 0.421, + "num_tokens": 135355631.0, + "step": 458 + }, + { + "epoch": 0.5630354957160343, + "grad_norm": 0.2823568880558014, + "learning_rate": 4.517006802721089e-05, + "loss": 0.4414, + "num_tokens": 135973241.0, + "step": 460 + }, + { + "epoch": 0.565483476132191, + "grad_norm": 0.22876766324043274, + "learning_rate": 4.512471655328798e-05, + "loss": 0.4155, + "num_tokens": 136579751.0, + "step": 462 + }, + { + "epoch": 0.5679314565483476, + "grad_norm": 0.23762549459934235, + "learning_rate": 4.507936507936508e-05, + "loss": 0.4264, + "num_tokens": 137187474.0, + "step": 464 + }, + { + "epoch": 0.5703794369645043, + "grad_norm": 0.24829839169979095, + "learning_rate": 4.5034013605442174e-05, + "loss": 0.443, + "num_tokens": 137796342.0, + "step": 466 + }, + { + "epoch": 0.572827417380661, + "grad_norm": 0.24192968010902405, + "learning_rate": 4.4988662131519274e-05, + "loss": 0.4208, + "num_tokens": 138415346.0, + "step": 468 + }, + { + "epoch": 0.5752753977968176, + "grad_norm": 0.24213774502277374, + "learning_rate": 4.4943310657596374e-05, + "loss": 0.4372, + "num_tokens": 139036676.0, + "step": 470 + }, + { + "epoch": 0.5777233782129743, + "grad_norm": 0.233334481716156, + "learning_rate": 4.4897959183673474e-05, + "loss": 0.435, + "num_tokens": 139647226.0, + "step": 472 + }, + { + "epoch": 0.5801713586291309, + "grad_norm": 0.2323165237903595, + "learning_rate": 4.4852607709750574e-05, + "loss": 0.4161, + "num_tokens": 140222429.0, + "step": 474 + }, + { + "epoch": 0.5826193390452876, + "grad_norm": 0.2356025129556656, + "learning_rate": 4.480725623582767e-05, + "loss": 0.4351, + "num_tokens": 140846203.0, + "step": 476 + }, + { + "epoch": 0.5850673194614443, + "grad_norm": 0.2731492817401886, + "learning_rate": 4.476190476190477e-05, + "loss": 0.4163, + "num_tokens": 141448874.0, + "step": 478 + }, + { + "epoch": 0.587515299877601, + "grad_norm": 0.2720438241958618, + "learning_rate": 4.471655328798186e-05, + "loss": 0.4131, + "num_tokens": 142029277.0, + "step": 480 + }, + { + "epoch": 0.5899632802937577, + "grad_norm": 0.2699112892150879, + "learning_rate": 4.467120181405896e-05, + "loss": 0.4126, + "num_tokens": 142624816.0, + "step": 482 + }, + { + "epoch": 0.5924112607099143, + "grad_norm": 0.25922295451164246, + "learning_rate": 4.4625850340136054e-05, + "loss": 0.4159, + "num_tokens": 143212355.0, + "step": 484 + }, + { + "epoch": 0.594859241126071, + "grad_norm": 0.23546801507472992, + "learning_rate": 4.4580498866213154e-05, + "loss": 0.4341, + "num_tokens": 143820960.0, + "step": 486 + }, + { + "epoch": 0.5973072215422277, + "grad_norm": 0.22402498126029968, + "learning_rate": 4.453514739229025e-05, + "loss": 0.4309, + "num_tokens": 144394056.0, + "step": 488 + }, + { + "epoch": 0.5997552019583844, + "grad_norm": 0.23887427151203156, + "learning_rate": 4.448979591836735e-05, + "loss": 0.4188, + "num_tokens": 144976355.0, + "step": 490 + }, + { + "epoch": 0.602203182374541, + "grad_norm": 0.25990355014801025, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.4217, + "num_tokens": 145575990.0, + "step": 492 + }, + { + "epoch": 0.6046511627906976, + "grad_norm": 0.2554283142089844, + "learning_rate": 4.4399092970521547e-05, + "loss": 0.4205, + "num_tokens": 146180118.0, + "step": 494 + }, + { + "epoch": 0.6070991432068543, + "grad_norm": 0.26520001888275146, + "learning_rate": 4.4353741496598646e-05, + "loss": 0.4305, + "num_tokens": 146763569.0, + "step": 496 + }, + { + "epoch": 0.609547123623011, + "grad_norm": 0.2609451413154602, + "learning_rate": 4.430839002267574e-05, + "loss": 0.4182, + "num_tokens": 147348892.0, + "step": 498 + }, + { + "epoch": 0.6119951040391677, + "grad_norm": 0.23169048130512238, + "learning_rate": 4.426303854875284e-05, + "loss": 0.421, + "num_tokens": 147934106.0, + "step": 500 + }, + { + "epoch": 0.6144430844553244, + "grad_norm": 0.2318364381790161, + "learning_rate": 4.421768707482993e-05, + "loss": 0.4394, + "num_tokens": 148526284.0, + "step": 502 + }, + { + "epoch": 0.616891064871481, + "grad_norm": 0.23065966367721558, + "learning_rate": 4.417233560090703e-05, + "loss": 0.4377, + "num_tokens": 149116759.0, + "step": 504 + }, + { + "epoch": 0.6193390452876377, + "grad_norm": 0.23499241471290588, + "learning_rate": 4.4126984126984126e-05, + "loss": 0.4332, + "num_tokens": 149733718.0, + "step": 506 + }, + { + "epoch": 0.6217870257037944, + "grad_norm": 0.23843276500701904, + "learning_rate": 4.4081632653061226e-05, + "loss": 0.4252, + "num_tokens": 150325552.0, + "step": 508 + }, + { + "epoch": 0.6242350061199511, + "grad_norm": 0.24199701845645905, + "learning_rate": 4.4036281179138326e-05, + "loss": 0.4236, + "num_tokens": 150919640.0, + "step": 510 + }, + { + "epoch": 0.6266829865361077, + "grad_norm": 0.20851343870162964, + "learning_rate": 4.3990929705215426e-05, + "loss": 0.3977, + "num_tokens": 151479810.0, + "step": 512 + }, + { + "epoch": 0.6291309669522643, + "grad_norm": 0.23365788161754608, + "learning_rate": 4.394557823129252e-05, + "loss": 0.4258, + "num_tokens": 152069098.0, + "step": 514 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.2224223017692566, + "learning_rate": 4.390022675736962e-05, + "loss": 0.4146, + "num_tokens": 152650689.0, + "step": 516 + }, + { + "epoch": 0.6340269277845777, + "grad_norm": 0.21994057297706604, + "learning_rate": 4.385487528344671e-05, + "loss": 0.4269, + "num_tokens": 153238050.0, + "step": 518 + }, + { + "epoch": 0.6364749082007344, + "grad_norm": 0.22317960858345032, + "learning_rate": 4.380952380952381e-05, + "loss": 0.4227, + "num_tokens": 153848706.0, + "step": 520 + }, + { + "epoch": 0.6389228886168911, + "grad_norm": 0.2505166530609131, + "learning_rate": 4.376417233560091e-05, + "loss": 0.44, + "num_tokens": 154435999.0, + "step": 522 + }, + { + "epoch": 0.6413708690330477, + "grad_norm": 0.24617008864879608, + "learning_rate": 4.3718820861678005e-05, + "loss": 0.4071, + "num_tokens": 155012325.0, + "step": 524 + }, + { + "epoch": 0.6438188494492044, + "grad_norm": 0.2711974084377289, + "learning_rate": 4.3673469387755105e-05, + "loss": 0.3975, + "num_tokens": 155577911.0, + "step": 526 + }, + { + "epoch": 0.6462668298653611, + "grad_norm": 0.3099214434623718, + "learning_rate": 4.36281179138322e-05, + "loss": 0.4271, + "num_tokens": 156159354.0, + "step": 528 + }, + { + "epoch": 0.6487148102815178, + "grad_norm": 0.2503284513950348, + "learning_rate": 4.35827664399093e-05, + "loss": 0.4209, + "num_tokens": 156734371.0, + "step": 530 + }, + { + "epoch": 0.6511627906976745, + "grad_norm": 5.179694652557373, + "learning_rate": 4.35374149659864e-05, + "loss": 0.4422, + "num_tokens": 157342481.0, + "step": 532 + }, + { + "epoch": 0.653610771113831, + "grad_norm": 0.7401638627052307, + "learning_rate": 4.34920634920635e-05, + "loss": 0.4241, + "num_tokens": 157940378.0, + "step": 534 + }, + { + "epoch": 0.6560587515299877, + "grad_norm": 0.29225748777389526, + "learning_rate": 4.344671201814059e-05, + "loss": 0.4432, + "num_tokens": 158529483.0, + "step": 536 + }, + { + "epoch": 0.6585067319461444, + "grad_norm": 0.2533261179924011, + "learning_rate": 4.340136054421769e-05, + "loss": 0.4292, + "num_tokens": 159116553.0, + "step": 538 + }, + { + "epoch": 0.6609547123623011, + "grad_norm": 0.24542203545570374, + "learning_rate": 4.3356009070294784e-05, + "loss": 0.4201, + "num_tokens": 159704424.0, + "step": 540 + }, + { + "epoch": 0.6634026927784578, + "grad_norm": 0.22720332443714142, + "learning_rate": 4.3310657596371884e-05, + "loss": 0.4283, + "num_tokens": 160315337.0, + "step": 542 + }, + { + "epoch": 0.6658506731946144, + "grad_norm": 0.23304791748523712, + "learning_rate": 4.3265306122448984e-05, + "loss": 0.4255, + "num_tokens": 160906136.0, + "step": 544 + }, + { + "epoch": 0.6682986536107711, + "grad_norm": 0.3766651451587677, + "learning_rate": 4.321995464852608e-05, + "loss": 0.4007, + "num_tokens": 161482176.0, + "step": 546 + }, + { + "epoch": 0.6707466340269278, + "grad_norm": 0.21966175734996796, + "learning_rate": 4.317460317460318e-05, + "loss": 0.4079, + "num_tokens": 162066339.0, + "step": 548 + }, + { + "epoch": 0.6731946144430845, + "grad_norm": 0.2210390567779541, + "learning_rate": 4.312925170068027e-05, + "loss": 0.4106, + "num_tokens": 162686859.0, + "step": 550 + }, + { + "epoch": 0.6756425948592412, + "grad_norm": 0.24606673419475555, + "learning_rate": 4.308390022675737e-05, + "loss": 0.407, + "num_tokens": 163261743.0, + "step": 552 + }, + { + "epoch": 0.6780905752753978, + "grad_norm": 0.2407507747411728, + "learning_rate": 4.303854875283447e-05, + "loss": 0.4119, + "num_tokens": 163864744.0, + "step": 554 + }, + { + "epoch": 0.6805385556915544, + "grad_norm": 0.2362566441297531, + "learning_rate": 4.299319727891157e-05, + "loss": 0.4014, + "num_tokens": 164424223.0, + "step": 556 + }, + { + "epoch": 0.6829865361077111, + "grad_norm": 0.27575215697288513, + "learning_rate": 4.294784580498866e-05, + "loss": 0.4227, + "num_tokens": 165031113.0, + "step": 558 + }, + { + "epoch": 0.6854345165238678, + "grad_norm": 0.24118904769420624, + "learning_rate": 4.290249433106576e-05, + "loss": 0.4172, + "num_tokens": 165645826.0, + "step": 560 + }, + { + "epoch": 0.6878824969400245, + "grad_norm": 0.2281220406293869, + "learning_rate": 4.2857142857142856e-05, + "loss": 0.4167, + "num_tokens": 166270973.0, + "step": 562 + }, + { + "epoch": 0.6903304773561811, + "grad_norm": 0.24280649423599243, + "learning_rate": 4.2811791383219956e-05, + "loss": 0.4328, + "num_tokens": 166855785.0, + "step": 564 + }, + { + "epoch": 0.6927784577723378, + "grad_norm": 0.2196795791387558, + "learning_rate": 4.2766439909297056e-05, + "loss": 0.4179, + "num_tokens": 167481909.0, + "step": 566 + }, + { + "epoch": 0.6952264381884945, + "grad_norm": 0.2251538783311844, + "learning_rate": 4.272108843537415e-05, + "loss": 0.4085, + "num_tokens": 168078039.0, + "step": 568 + }, + { + "epoch": 0.6976744186046512, + "grad_norm": 0.22692939639091492, + "learning_rate": 4.267573696145125e-05, + "loss": 0.4148, + "num_tokens": 168668586.0, + "step": 570 + }, + { + "epoch": 0.7001223990208079, + "grad_norm": 0.23282669484615326, + "learning_rate": 4.263038548752835e-05, + "loss": 0.4094, + "num_tokens": 169280779.0, + "step": 572 + }, + { + "epoch": 0.7025703794369645, + "grad_norm": 0.22933503985404968, + "learning_rate": 4.258503401360545e-05, + "loss": 0.3957, + "num_tokens": 169869109.0, + "step": 574 + }, + { + "epoch": 0.7050183598531212, + "grad_norm": 0.22762653231620789, + "learning_rate": 4.253968253968254e-05, + "loss": 0.4226, + "num_tokens": 170459382.0, + "step": 576 + }, + { + "epoch": 0.7074663402692778, + "grad_norm": 0.2274516373872757, + "learning_rate": 4.249433106575964e-05, + "loss": 0.4282, + "num_tokens": 171059799.0, + "step": 578 + }, + { + "epoch": 0.7099143206854345, + "grad_norm": 0.24952532351016998, + "learning_rate": 4.2448979591836735e-05, + "loss": 0.419, + "num_tokens": 171616638.0, + "step": 580 + }, + { + "epoch": 0.7123623011015912, + "grad_norm": 0.239512637257576, + "learning_rate": 4.2403628117913835e-05, + "loss": 0.4227, + "num_tokens": 172215523.0, + "step": 582 + }, + { + "epoch": 0.7148102815177478, + "grad_norm": 0.21042466163635254, + "learning_rate": 4.235827664399093e-05, + "loss": 0.4104, + "num_tokens": 172816261.0, + "step": 584 + }, + { + "epoch": 0.7172582619339045, + "grad_norm": 0.2306113988161087, + "learning_rate": 4.231292517006803e-05, + "loss": 0.4047, + "num_tokens": 173418219.0, + "step": 586 + }, + { + "epoch": 0.7197062423500612, + "grad_norm": 0.2314852774143219, + "learning_rate": 4.226757369614512e-05, + "loss": 0.4159, + "num_tokens": 174001845.0, + "step": 588 + }, + { + "epoch": 0.7221542227662179, + "grad_norm": 0.22891540825366974, + "learning_rate": 4.222222222222222e-05, + "loss": 0.3911, + "num_tokens": 174573841.0, + "step": 590 + }, + { + "epoch": 0.7246022031823746, + "grad_norm": 0.2313038408756256, + "learning_rate": 4.217687074829932e-05, + "loss": 0.4124, + "num_tokens": 175157025.0, + "step": 592 + }, + { + "epoch": 0.7270501835985312, + "grad_norm": 0.2241615653038025, + "learning_rate": 4.213151927437642e-05, + "loss": 0.3986, + "num_tokens": 175763839.0, + "step": 594 + }, + { + "epoch": 0.7294981640146879, + "grad_norm": 0.2242891639471054, + "learning_rate": 4.208616780045352e-05, + "loss": 0.4076, + "num_tokens": 176340303.0, + "step": 596 + }, + { + "epoch": 0.7319461444308446, + "grad_norm": 0.23930864036083221, + "learning_rate": 4.2040816326530615e-05, + "loss": 0.4216, + "num_tokens": 176933693.0, + "step": 598 + }, + { + "epoch": 0.7343941248470012, + "grad_norm": 0.23385366797447205, + "learning_rate": 4.1995464852607714e-05, + "loss": 0.4122, + "num_tokens": 177551417.0, + "step": 600 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.23149244487285614, + "learning_rate": 4.195011337868481e-05, + "loss": 0.4061, + "num_tokens": 178108215.0, + "step": 602 + }, + { + "epoch": 0.7392900856793145, + "grad_norm": 0.23809857666492462, + "learning_rate": 4.190476190476191e-05, + "loss": 0.412, + "num_tokens": 178678758.0, + "step": 604 + }, + { + "epoch": 0.7417380660954712, + "grad_norm": 0.23036766052246094, + "learning_rate": 4.1859410430839e-05, + "loss": 0.4039, + "num_tokens": 179263018.0, + "step": 606 + }, + { + "epoch": 0.7441860465116279, + "grad_norm": 0.22353483736515045, + "learning_rate": 4.18140589569161e-05, + "loss": 0.4125, + "num_tokens": 179877385.0, + "step": 608 + }, + { + "epoch": 0.7466340269277846, + "grad_norm": 0.23328787088394165, + "learning_rate": 4.1768707482993194e-05, + "loss": 0.4119, + "num_tokens": 180464341.0, + "step": 610 + }, + { + "epoch": 0.7490820073439413, + "grad_norm": 0.2343887835741043, + "learning_rate": 4.1723356009070294e-05, + "loss": 0.4183, + "num_tokens": 181070260.0, + "step": 612 + }, + { + "epoch": 0.7515299877600979, + "grad_norm": 0.2277335226535797, + "learning_rate": 4.1678004535147394e-05, + "loss": 0.4005, + "num_tokens": 181650173.0, + "step": 614 + }, + { + "epoch": 0.7539779681762546, + "grad_norm": 0.30731311440467834, + "learning_rate": 4.1632653061224494e-05, + "loss": 0.405, + "num_tokens": 182238618.0, + "step": 616 + }, + { + "epoch": 0.7564259485924113, + "grad_norm": 0.24196062982082367, + "learning_rate": 4.1587301587301594e-05, + "loss": 0.3987, + "num_tokens": 182798204.0, + "step": 618 + }, + { + "epoch": 0.758873929008568, + "grad_norm": 0.21955657005310059, + "learning_rate": 4.154195011337869e-05, + "loss": 0.4057, + "num_tokens": 183380029.0, + "step": 620 + }, + { + "epoch": 0.7613219094247246, + "grad_norm": 0.2298729419708252, + "learning_rate": 4.149659863945579e-05, + "loss": 0.4134, + "num_tokens": 183966255.0, + "step": 622 + }, + { + "epoch": 0.7637698898408812, + "grad_norm": 0.22229833900928497, + "learning_rate": 4.145124716553288e-05, + "loss": 0.4209, + "num_tokens": 184583415.0, + "step": 624 + }, + { + "epoch": 0.7662178702570379, + "grad_norm": 0.23135779798030853, + "learning_rate": 4.140589569160998e-05, + "loss": 0.4178, + "num_tokens": 185197955.0, + "step": 626 + }, + { + "epoch": 0.7686658506731946, + "grad_norm": 0.2185201495885849, + "learning_rate": 4.136054421768707e-05, + "loss": 0.3971, + "num_tokens": 185785557.0, + "step": 628 + }, + { + "epoch": 0.7711138310893513, + "grad_norm": 0.21955755352973938, + "learning_rate": 4.131519274376417e-05, + "loss": 0.4074, + "num_tokens": 186356034.0, + "step": 630 + }, + { + "epoch": 0.773561811505508, + "grad_norm": 0.22553059458732605, + "learning_rate": 4.126984126984127e-05, + "loss": 0.4163, + "num_tokens": 186944825.0, + "step": 632 + }, + { + "epoch": 0.7760097919216646, + "grad_norm": 0.22468779981136322, + "learning_rate": 4.122448979591837e-05, + "loss": 0.4158, + "num_tokens": 187555120.0, + "step": 634 + }, + { + "epoch": 0.7784577723378213, + "grad_norm": 0.23950043320655823, + "learning_rate": 4.1179138321995466e-05, + "loss": 0.4173, + "num_tokens": 188158111.0, + "step": 636 + }, + { + "epoch": 0.780905752753978, + "grad_norm": 0.21780972182750702, + "learning_rate": 4.1133786848072566e-05, + "loss": 0.4198, + "num_tokens": 188752896.0, + "step": 638 + }, + { + "epoch": 0.7833537331701347, + "grad_norm": 0.20845623314380646, + "learning_rate": 4.1088435374149666e-05, + "loss": 0.4171, + "num_tokens": 189348219.0, + "step": 640 + }, + { + "epoch": 0.7858017135862914, + "grad_norm": 0.22173522412776947, + "learning_rate": 4.104308390022676e-05, + "loss": 0.4007, + "num_tokens": 189894564.0, + "step": 642 + }, + { + "epoch": 0.7882496940024479, + "grad_norm": 0.19926130771636963, + "learning_rate": 4.099773242630386e-05, + "loss": 0.4188, + "num_tokens": 190529171.0, + "step": 644 + }, + { + "epoch": 0.7906976744186046, + "grad_norm": 0.22314372658729553, + "learning_rate": 4.095238095238095e-05, + "loss": 0.4075, + "num_tokens": 191131915.0, + "step": 646 + }, + { + "epoch": 0.7931456548347613, + "grad_norm": 0.21730034053325653, + "learning_rate": 4.090702947845805e-05, + "loss": 0.4214, + "num_tokens": 191724725.0, + "step": 648 + }, + { + "epoch": 0.795593635250918, + "grad_norm": 0.22011788189411163, + "learning_rate": 4.0861678004535145e-05, + "loss": 0.4153, + "num_tokens": 192336245.0, + "step": 650 + }, + { + "epoch": 0.7980416156670747, + "grad_norm": 0.21682187914848328, + "learning_rate": 4.0816326530612245e-05, + "loss": 0.4073, + "num_tokens": 192923779.0, + "step": 652 + }, + { + "epoch": 0.8004895960832313, + "grad_norm": 0.22444431483745575, + "learning_rate": 4.0770975056689345e-05, + "loss": 0.3916, + "num_tokens": 193499166.0, + "step": 654 + }, + { + "epoch": 0.802937576499388, + "grad_norm": 0.2279336303472519, + "learning_rate": 4.0725623582766445e-05, + "loss": 0.4113, + "num_tokens": 194104195.0, + "step": 656 + }, + { + "epoch": 0.8053855569155447, + "grad_norm": 0.23833976686000824, + "learning_rate": 4.068027210884354e-05, + "loss": 0.4197, + "num_tokens": 194700389.0, + "step": 658 + }, + { + "epoch": 0.8078335373317014, + "grad_norm": 0.2620058059692383, + "learning_rate": 4.063492063492064e-05, + "loss": 0.419, + "num_tokens": 195293054.0, + "step": 660 + }, + { + "epoch": 0.8102815177478581, + "grad_norm": 0.2198343724012375, + "learning_rate": 4.058956916099774e-05, + "loss": 0.3991, + "num_tokens": 195898402.0, + "step": 662 + }, + { + "epoch": 0.8127294981640147, + "grad_norm": 0.25709268450737, + "learning_rate": 4.054421768707483e-05, + "loss": 0.4047, + "num_tokens": 196479623.0, + "step": 664 + }, + { + "epoch": 0.8151774785801713, + "grad_norm": 0.23747345805168152, + "learning_rate": 4.049886621315193e-05, + "loss": 0.4037, + "num_tokens": 197025340.0, + "step": 666 + }, + { + "epoch": 0.817625458996328, + "grad_norm": 0.20556145906448364, + "learning_rate": 4.0453514739229024e-05, + "loss": 0.4013, + "num_tokens": 197594800.0, + "step": 668 + }, + { + "epoch": 0.8200734394124847, + "grad_norm": 0.24500206112861633, + "learning_rate": 4.0408163265306124e-05, + "loss": 0.4083, + "num_tokens": 198193576.0, + "step": 670 + }, + { + "epoch": 0.8225214198286414, + "grad_norm": 0.22687214612960815, + "learning_rate": 4.036281179138322e-05, + "loss": 0.4033, + "num_tokens": 198803995.0, + "step": 672 + }, + { + "epoch": 0.824969400244798, + "grad_norm": 0.22586381435394287, + "learning_rate": 4.031746031746032e-05, + "loss": 0.4067, + "num_tokens": 199416611.0, + "step": 674 + }, + { + "epoch": 0.8274173806609547, + "grad_norm": 0.22488325834274292, + "learning_rate": 4.027210884353742e-05, + "loss": 0.3983, + "num_tokens": 200007315.0, + "step": 676 + }, + { + "epoch": 0.8298653610771114, + "grad_norm": 0.21779657900333405, + "learning_rate": 4.022675736961452e-05, + "loss": 0.4038, + "num_tokens": 200608406.0, + "step": 678 + }, + { + "epoch": 0.8323133414932681, + "grad_norm": 0.21070869266986847, + "learning_rate": 4.018140589569161e-05, + "loss": 0.4176, + "num_tokens": 201208706.0, + "step": 680 + }, + { + "epoch": 0.8347613219094248, + "grad_norm": 0.21609735488891602, + "learning_rate": 4.013605442176871e-05, + "loss": 0.3824, + "num_tokens": 201783927.0, + "step": 682 + }, + { + "epoch": 0.8372093023255814, + "grad_norm": 0.20457634329795837, + "learning_rate": 4.009070294784581e-05, + "loss": 0.413, + "num_tokens": 202380174.0, + "step": 684 + }, + { + "epoch": 0.8396572827417381, + "grad_norm": 0.22148260474205017, + "learning_rate": 4.00453514739229e-05, + "loss": 0.4123, + "num_tokens": 202985735.0, + "step": 686 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.21002599596977234, + "learning_rate": 4e-05, + "loss": 0.4188, + "num_tokens": 203582111.0, + "step": 688 + }, + { + "epoch": 0.8445532435740514, + "grad_norm": 0.21967588365077972, + "learning_rate": 3.9954648526077096e-05, + "loss": 0.3983, + "num_tokens": 204153573.0, + "step": 690 + }, + { + "epoch": 0.847001223990208, + "grad_norm": 0.21827496588230133, + "learning_rate": 3.9909297052154196e-05, + "loss": 0.4019, + "num_tokens": 204763271.0, + "step": 692 + }, + { + "epoch": 0.8494492044063647, + "grad_norm": 0.21960563957691193, + "learning_rate": 3.9863945578231296e-05, + "loss": 0.4042, + "num_tokens": 205341916.0, + "step": 694 + }, + { + "epoch": 0.8518971848225214, + "grad_norm": 0.22129298746585846, + "learning_rate": 3.9818594104308396e-05, + "loss": 0.4208, + "num_tokens": 205924333.0, + "step": 696 + }, + { + "epoch": 0.8543451652386781, + "grad_norm": 0.21248981356620789, + "learning_rate": 3.977324263038549e-05, + "loss": 0.4098, + "num_tokens": 206489074.0, + "step": 698 + }, + { + "epoch": 0.8567931456548348, + "grad_norm": 0.2336939424276352, + "learning_rate": 3.972789115646259e-05, + "loss": 0.4044, + "num_tokens": 207076635.0, + "step": 700 + }, + { + "epoch": 0.8592411260709915, + "grad_norm": 0.22042514383792877, + "learning_rate": 3.968253968253968e-05, + "loss": 0.4148, + "num_tokens": 207679550.0, + "step": 702 + }, + { + "epoch": 0.8616891064871481, + "grad_norm": 0.2362031191587448, + "learning_rate": 3.963718820861678e-05, + "loss": 0.4114, + "num_tokens": 208285797.0, + "step": 704 + }, + { + "epoch": 0.8641370869033048, + "grad_norm": 0.23842987418174744, + "learning_rate": 3.9591836734693876e-05, + "loss": 0.4133, + "num_tokens": 208900402.0, + "step": 706 + }, + { + "epoch": 0.8665850673194615, + "grad_norm": 0.23245958983898163, + "learning_rate": 3.9546485260770976e-05, + "loss": 0.3988, + "num_tokens": 209478996.0, + "step": 708 + }, + { + "epoch": 0.8690330477356181, + "grad_norm": 0.21248674392700195, + "learning_rate": 3.9501133786848075e-05, + "loss": 0.3997, + "num_tokens": 210061792.0, + "step": 710 + }, + { + "epoch": 0.8714810281517748, + "grad_norm": 0.22569961845874786, + "learning_rate": 3.945578231292517e-05, + "loss": 0.4054, + "num_tokens": 210619894.0, + "step": 712 + }, + { + "epoch": 0.8739290085679314, + "grad_norm": 0.2181859165430069, + "learning_rate": 3.941043083900227e-05, + "loss": 0.4046, + "num_tokens": 211195236.0, + "step": 714 + }, + { + "epoch": 0.8763769889840881, + "grad_norm": 0.2249060571193695, + "learning_rate": 3.936507936507937e-05, + "loss": 0.4005, + "num_tokens": 211757983.0, + "step": 716 + }, + { + "epoch": 0.8788249694002448, + "grad_norm": 0.22957013547420502, + "learning_rate": 3.931972789115647e-05, + "loss": 0.4062, + "num_tokens": 212332438.0, + "step": 718 + }, + { + "epoch": 0.8812729498164015, + "grad_norm": 0.21647459268569946, + "learning_rate": 3.927437641723356e-05, + "loss": 0.3956, + "num_tokens": 212936652.0, + "step": 720 + }, + { + "epoch": 0.8837209302325582, + "grad_norm": 0.21820461750030518, + "learning_rate": 3.922902494331066e-05, + "loss": 0.3897, + "num_tokens": 213505407.0, + "step": 722 + }, + { + "epoch": 0.8861689106487148, + "grad_norm": 0.21371085941791534, + "learning_rate": 3.9183673469387755e-05, + "loss": 0.4046, + "num_tokens": 214094524.0, + "step": 724 + }, + { + "epoch": 0.8886168910648715, + "grad_norm": 0.22490878403186798, + "learning_rate": 3.9138321995464855e-05, + "loss": 0.3844, + "num_tokens": 214670619.0, + "step": 726 + }, + { + "epoch": 0.8910648714810282, + "grad_norm": 0.21619531512260437, + "learning_rate": 3.909297052154195e-05, + "loss": 0.3972, + "num_tokens": 215236751.0, + "step": 728 + }, + { + "epoch": 0.8935128518971848, + "grad_norm": 0.2364090383052826, + "learning_rate": 3.904761904761905e-05, + "loss": 0.3987, + "num_tokens": 215828751.0, + "step": 730 + }, + { + "epoch": 0.8959608323133414, + "grad_norm": 0.21980957686901093, + "learning_rate": 3.900226757369615e-05, + "loss": 0.4, + "num_tokens": 216449250.0, + "step": 732 + }, + { + "epoch": 0.8984088127294981, + "grad_norm": 0.21486304700374603, + "learning_rate": 3.895691609977324e-05, + "loss": 0.3965, + "num_tokens": 217044781.0, + "step": 734 + }, + { + "epoch": 0.9008567931456548, + "grad_norm": 0.2082967758178711, + "learning_rate": 3.891156462585034e-05, + "loss": 0.398, + "num_tokens": 217611996.0, + "step": 736 + }, + { + "epoch": 0.9033047735618115, + "grad_norm": 0.19968794286251068, + "learning_rate": 3.886621315192744e-05, + "loss": 0.3898, + "num_tokens": 218232059.0, + "step": 738 + }, + { + "epoch": 0.9057527539779682, + "grad_norm": 0.2238074243068695, + "learning_rate": 3.882086167800454e-05, + "loss": 0.4282, + "num_tokens": 218810653.0, + "step": 740 + }, + { + "epoch": 0.9082007343941249, + "grad_norm": 0.22314809262752533, + "learning_rate": 3.8775510204081634e-05, + "loss": 0.3933, + "num_tokens": 219409815.0, + "step": 742 + }, + { + "epoch": 0.9106487148102815, + "grad_norm": 0.2527824342250824, + "learning_rate": 3.8730158730158734e-05, + "loss": 0.4087, + "num_tokens": 220016419.0, + "step": 744 + }, + { + "epoch": 0.9130966952264382, + "grad_norm": 0.21935120224952698, + "learning_rate": 3.868480725623583e-05, + "loss": 0.4024, + "num_tokens": 220599687.0, + "step": 746 + }, + { + "epoch": 0.9155446756425949, + "grad_norm": 0.2203340232372284, + "learning_rate": 3.863945578231293e-05, + "loss": 0.4032, + "num_tokens": 221170683.0, + "step": 748 + }, + { + "epoch": 0.9179926560587516, + "grad_norm": 0.21916158497333527, + "learning_rate": 3.859410430839002e-05, + "loss": 0.4011, + "num_tokens": 221764282.0, + "step": 750 + }, + { + "epoch": 0.9204406364749081, + "grad_norm": 0.21900133788585663, + "learning_rate": 3.854875283446712e-05, + "loss": 0.4069, + "num_tokens": 222383973.0, + "step": 752 + }, + { + "epoch": 0.9228886168910648, + "grad_norm": 0.21528606116771698, + "learning_rate": 3.850340136054422e-05, + "loss": 0.3981, + "num_tokens": 222981657.0, + "step": 754 + }, + { + "epoch": 0.9253365973072215, + "grad_norm": 0.219988152384758, + "learning_rate": 3.845804988662132e-05, + "loss": 0.41, + "num_tokens": 223580164.0, + "step": 756 + }, + { + "epoch": 0.9277845777233782, + "grad_norm": 0.22437667846679688, + "learning_rate": 3.841269841269842e-05, + "loss": 0.3986, + "num_tokens": 224181708.0, + "step": 758 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 0.22234344482421875, + "learning_rate": 3.836734693877551e-05, + "loss": 0.409, + "num_tokens": 224750929.0, + "step": 760 + }, + { + "epoch": 0.9326805385556916, + "grad_norm": 0.21830721199512482, + "learning_rate": 3.832199546485261e-05, + "loss": 0.4042, + "num_tokens": 225353399.0, + "step": 762 + }, + { + "epoch": 0.9351285189718482, + "grad_norm": 0.22780901193618774, + "learning_rate": 3.8276643990929706e-05, + "loss": 0.4122, + "num_tokens": 225946912.0, + "step": 764 + }, + { + "epoch": 0.9375764993880049, + "grad_norm": 0.21044515073299408, + "learning_rate": 3.8231292517006806e-05, + "loss": 0.3941, + "num_tokens": 226513122.0, + "step": 766 + }, + { + "epoch": 0.9400244798041616, + "grad_norm": 0.20034442842006683, + "learning_rate": 3.81859410430839e-05, + "loss": 0.399, + "num_tokens": 227097275.0, + "step": 768 + }, + { + "epoch": 0.9424724602203183, + "grad_norm": 0.2140418291091919, + "learning_rate": 3.8140589569161e-05, + "loss": 0.4009, + "num_tokens": 227685155.0, + "step": 770 + }, + { + "epoch": 0.944920440636475, + "grad_norm": 0.20679739117622375, + "learning_rate": 3.809523809523809e-05, + "loss": 0.3939, + "num_tokens": 228268933.0, + "step": 772 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.2113947570323944, + "learning_rate": 3.804988662131519e-05, + "loss": 0.3936, + "num_tokens": 228844822.0, + "step": 774 + }, + { + "epoch": 0.9498164014687882, + "grad_norm": 0.23549847304821014, + "learning_rate": 3.800453514739229e-05, + "loss": 0.3986, + "num_tokens": 229413402.0, + "step": 776 + }, + { + "epoch": 0.9522643818849449, + "grad_norm": 0.21228280663490295, + "learning_rate": 3.795918367346939e-05, + "loss": 0.3948, + "num_tokens": 229957992.0, + "step": 778 + }, + { + "epoch": 0.9547123623011016, + "grad_norm": 0.22272227704524994, + "learning_rate": 3.791383219954649e-05, + "loss": 0.4108, + "num_tokens": 230534148.0, + "step": 780 + }, + { + "epoch": 0.9571603427172583, + "grad_norm": 0.22994649410247803, + "learning_rate": 3.7868480725623585e-05, + "loss": 0.3913, + "num_tokens": 231127999.0, + "step": 782 + }, + { + "epoch": 0.9596083231334149, + "grad_norm": 0.2112966775894165, + "learning_rate": 3.7823129251700685e-05, + "loss": 0.4054, + "num_tokens": 231731253.0, + "step": 784 + }, + { + "epoch": 0.9620563035495716, + "grad_norm": 0.20147815346717834, + "learning_rate": 3.777777777777778e-05, + "loss": 0.3999, + "num_tokens": 232342748.0, + "step": 786 + }, + { + "epoch": 0.9645042839657283, + "grad_norm": 0.2231309562921524, + "learning_rate": 3.773242630385488e-05, + "loss": 0.3927, + "num_tokens": 232934775.0, + "step": 788 + }, + { + "epoch": 0.966952264381885, + "grad_norm": 0.21832215785980225, + "learning_rate": 3.768707482993197e-05, + "loss": 0.4043, + "num_tokens": 233502064.0, + "step": 790 + }, + { + "epoch": 0.9694002447980417, + "grad_norm": 0.22625139355659485, + "learning_rate": 3.764172335600907e-05, + "loss": 0.3959, + "num_tokens": 234073172.0, + "step": 792 + }, + { + "epoch": 0.9718482252141983, + "grad_norm": 0.2636200487613678, + "learning_rate": 3.7596371882086164e-05, + "loss": 0.4125, + "num_tokens": 234661532.0, + "step": 794 + }, + { + "epoch": 0.9742962056303549, + "grad_norm": 0.24178043007850647, + "learning_rate": 3.7551020408163264e-05, + "loss": 0.4142, + "num_tokens": 235274908.0, + "step": 796 + }, + { + "epoch": 0.9767441860465116, + "grad_norm": 0.2327648252248764, + "learning_rate": 3.7505668934240364e-05, + "loss": 0.4118, + "num_tokens": 235855663.0, + "step": 798 + }, + { + "epoch": 0.9791921664626683, + "grad_norm": 0.21242301166057587, + "learning_rate": 3.7460317460317464e-05, + "loss": 0.3836, + "num_tokens": 236428558.0, + "step": 800 + }, + { + "epoch": 0.981640146878825, + "grad_norm": 0.20703096687793732, + "learning_rate": 3.7414965986394564e-05, + "loss": 0.4023, + "num_tokens": 237023018.0, + "step": 802 + }, + { + "epoch": 0.9840881272949816, + "grad_norm": 0.22681297361850739, + "learning_rate": 3.736961451247166e-05, + "loss": 0.3997, + "num_tokens": 237611598.0, + "step": 804 + }, + { + "epoch": 0.9865361077111383, + "grad_norm": 0.2221280038356781, + "learning_rate": 3.732426303854876e-05, + "loss": 0.3913, + "num_tokens": 238214543.0, + "step": 806 + }, + { + "epoch": 0.988984088127295, + "grad_norm": 0.20904044806957245, + "learning_rate": 3.727891156462585e-05, + "loss": 0.3983, + "num_tokens": 238786408.0, + "step": 808 + }, + { + "epoch": 0.9914320685434517, + "grad_norm": 0.21625512838363647, + "learning_rate": 3.723356009070295e-05, + "loss": 0.3946, + "num_tokens": 239363151.0, + "step": 810 + }, + { + "epoch": 0.9938800489596084, + "grad_norm": 0.21922200918197632, + "learning_rate": 3.7188208616780044e-05, + "loss": 0.4008, + "num_tokens": 239967829.0, + "step": 812 + }, + { + "epoch": 0.996328029375765, + "grad_norm": 0.20125487446784973, + "learning_rate": 3.7142857142857143e-05, + "loss": 0.3901, + "num_tokens": 240551191.0, + "step": 814 + }, + { + "epoch": 0.9987760097919217, + "grad_norm": 0.20867830514907837, + "learning_rate": 3.7097505668934243e-05, + "loss": 0.404, + "num_tokens": 241153107.0, + "step": 816 + }, + { + "epoch": 1.0012239902080784, + "grad_norm": 0.22716069221496582, + "learning_rate": 3.705215419501134e-05, + "loss": 0.3647, + "num_tokens": 241769492.0, + "step": 818 + }, + { + "epoch": 1.003671970624235, + "grad_norm": 0.22431360185146332, + "learning_rate": 3.7006802721088437e-05, + "loss": 0.3259, + "num_tokens": 242359313.0, + "step": 820 + }, + { + "epoch": 1.0061199510403918, + "grad_norm": 0.26084810495376587, + "learning_rate": 3.6961451247165536e-05, + "loss": 0.3227, + "num_tokens": 242911445.0, + "step": 822 + }, + { + "epoch": 1.0085679314565483, + "grad_norm": 0.22556562721729279, + "learning_rate": 3.691609977324263e-05, + "loss": 0.335, + "num_tokens": 243515905.0, + "step": 824 + }, + { + "epoch": 1.0110159118727051, + "grad_norm": 0.22972659766674042, + "learning_rate": 3.687074829931973e-05, + "loss": 0.3297, + "num_tokens": 244085094.0, + "step": 826 + }, + { + "epoch": 1.0134638922888617, + "grad_norm": 0.23143792152404785, + "learning_rate": 3.682539682539683e-05, + "loss": 0.342, + "num_tokens": 244710189.0, + "step": 828 + }, + { + "epoch": 1.0159118727050183, + "grad_norm": 0.22278034687042236, + "learning_rate": 3.678004535147392e-05, + "loss": 0.3227, + "num_tokens": 245302719.0, + "step": 830 + }, + { + "epoch": 1.018359853121175, + "grad_norm": 0.23144645988941193, + "learning_rate": 3.673469387755102e-05, + "loss": 0.3377, + "num_tokens": 245899907.0, + "step": 832 + }, + { + "epoch": 1.0208078335373316, + "grad_norm": 0.21694451570510864, + "learning_rate": 3.6689342403628116e-05, + "loss": 0.3355, + "num_tokens": 246514507.0, + "step": 834 + }, + { + "epoch": 1.0232558139534884, + "grad_norm": 0.21809172630310059, + "learning_rate": 3.6643990929705216e-05, + "loss": 0.3251, + "num_tokens": 247121076.0, + "step": 836 + }, + { + "epoch": 1.025703794369645, + "grad_norm": 0.2095286250114441, + "learning_rate": 3.6598639455782316e-05, + "loss": 0.3192, + "num_tokens": 247731957.0, + "step": 838 + }, + { + "epoch": 1.0281517747858018, + "grad_norm": 0.21449701488018036, + "learning_rate": 3.6553287981859416e-05, + "loss": 0.3258, + "num_tokens": 248333325.0, + "step": 840 + }, + { + "epoch": 1.0305997552019583, + "grad_norm": 0.20430485904216766, + "learning_rate": 3.650793650793651e-05, + "loss": 0.3231, + "num_tokens": 248929993.0, + "step": 842 + }, + { + "epoch": 1.0330477356181151, + "grad_norm": 0.22047553956508636, + "learning_rate": 3.646258503401361e-05, + "loss": 0.3138, + "num_tokens": 249530707.0, + "step": 844 + }, + { + "epoch": 1.0354957160342717, + "grad_norm": 0.21630778908729553, + "learning_rate": 3.64172335600907e-05, + "loss": 0.3274, + "num_tokens": 250144988.0, + "step": 846 + }, + { + "epoch": 1.0379436964504285, + "grad_norm": 0.22004160284996033, + "learning_rate": 3.63718820861678e-05, + "loss": 0.3327, + "num_tokens": 250726834.0, + "step": 848 + }, + { + "epoch": 1.040391676866585, + "grad_norm": 0.21484030783176422, + "learning_rate": 3.63265306122449e-05, + "loss": 0.3337, + "num_tokens": 251338286.0, + "step": 850 + }, + { + "epoch": 1.0428396572827416, + "grad_norm": 0.2330131083726883, + "learning_rate": 3.6281179138321995e-05, + "loss": 0.3295, + "num_tokens": 251938256.0, + "step": 852 + }, + { + "epoch": 1.0452876376988984, + "grad_norm": 0.21885158121585846, + "learning_rate": 3.6235827664399095e-05, + "loss": 0.318, + "num_tokens": 252528052.0, + "step": 854 + }, + { + "epoch": 1.047735618115055, + "grad_norm": 0.21099720895290375, + "learning_rate": 3.619047619047619e-05, + "loss": 0.3222, + "num_tokens": 253131838.0, + "step": 856 + }, + { + "epoch": 1.0501835985312118, + "grad_norm": 0.2184596210718155, + "learning_rate": 3.6145124716553295e-05, + "loss": 0.3177, + "num_tokens": 253683217.0, + "step": 858 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.20888565480709076, + "learning_rate": 3.609977324263039e-05, + "loss": 0.3299, + "num_tokens": 254305603.0, + "step": 860 + }, + { + "epoch": 1.0550795593635252, + "grad_norm": 0.21422079205513, + "learning_rate": 3.605442176870749e-05, + "loss": 0.3179, + "num_tokens": 254866953.0, + "step": 862 + }, + { + "epoch": 1.0575275397796817, + "grad_norm": 0.24968288838863373, + "learning_rate": 3.600907029478458e-05, + "loss": 0.3317, + "num_tokens": 255438135.0, + "step": 864 + }, + { + "epoch": 1.0599755201958385, + "grad_norm": 0.22170448303222656, + "learning_rate": 3.596371882086168e-05, + "loss": 0.3184, + "num_tokens": 256042020.0, + "step": 866 + }, + { + "epoch": 1.062423500611995, + "grad_norm": 0.21579118072986603, + "learning_rate": 3.5918367346938774e-05, + "loss": 0.3246, + "num_tokens": 256618543.0, + "step": 868 + }, + { + "epoch": 1.0648714810281519, + "grad_norm": 0.2116389125585556, + "learning_rate": 3.5873015873015874e-05, + "loss": 0.3289, + "num_tokens": 257226439.0, + "step": 870 + }, + { + "epoch": 1.0673194614443084, + "grad_norm": 0.21718326210975647, + "learning_rate": 3.5827664399092974e-05, + "loss": 0.3255, + "num_tokens": 257822132.0, + "step": 872 + }, + { + "epoch": 1.069767441860465, + "grad_norm": 0.22444726526737213, + "learning_rate": 3.578231292517007e-05, + "loss": 0.3187, + "num_tokens": 258411819.0, + "step": 874 + }, + { + "epoch": 1.0722154222766218, + "grad_norm": 0.23174837231636047, + "learning_rate": 3.573696145124717e-05, + "loss": 0.3299, + "num_tokens": 258980885.0, + "step": 876 + }, + { + "epoch": 1.0746634026927784, + "grad_norm": 0.2177923321723938, + "learning_rate": 3.569160997732427e-05, + "loss": 0.3333, + "num_tokens": 259568916.0, + "step": 878 + }, + { + "epoch": 1.0771113831089352, + "grad_norm": 0.21427904069423676, + "learning_rate": 3.564625850340137e-05, + "loss": 0.3306, + "num_tokens": 260173762.0, + "step": 880 + }, + { + "epoch": 1.0795593635250917, + "grad_norm": 0.2163168489933014, + "learning_rate": 3.560090702947846e-05, + "loss": 0.3383, + "num_tokens": 260780328.0, + "step": 882 + }, + { + "epoch": 1.0820073439412485, + "grad_norm": 0.2089541107416153, + "learning_rate": 3.555555555555556e-05, + "loss": 0.3222, + "num_tokens": 261378955.0, + "step": 884 + }, + { + "epoch": 1.084455324357405, + "grad_norm": 0.21609053015708923, + "learning_rate": 3.551020408163265e-05, + "loss": 0.3231, + "num_tokens": 261963127.0, + "step": 886 + }, + { + "epoch": 1.086903304773562, + "grad_norm": 0.2107062190771103, + "learning_rate": 3.546485260770975e-05, + "loss": 0.3284, + "num_tokens": 262561647.0, + "step": 888 + }, + { + "epoch": 1.0893512851897185, + "grad_norm": 0.2224922925233841, + "learning_rate": 3.5419501133786846e-05, + "loss": 0.3286, + "num_tokens": 263136540.0, + "step": 890 + }, + { + "epoch": 1.091799265605875, + "grad_norm": 0.24692034721374512, + "learning_rate": 3.5374149659863946e-05, + "loss": 0.3393, + "num_tokens": 263709944.0, + "step": 892 + }, + { + "epoch": 1.0942472460220318, + "grad_norm": 0.2155454009771347, + "learning_rate": 3.532879818594104e-05, + "loss": 0.332, + "num_tokens": 264272683.0, + "step": 894 + }, + { + "epoch": 1.0966952264381884, + "grad_norm": 0.20599579811096191, + "learning_rate": 3.528344671201814e-05, + "loss": 0.3268, + "num_tokens": 264868896.0, + "step": 896 + }, + { + "epoch": 1.0991432068543452, + "grad_norm": 0.21908614039421082, + "learning_rate": 3.523809523809524e-05, + "loss": 0.3276, + "num_tokens": 265469534.0, + "step": 898 + }, + { + "epoch": 1.1015911872705018, + "grad_norm": 0.21601365506649017, + "learning_rate": 3.519274376417234e-05, + "loss": 0.3361, + "num_tokens": 266079945.0, + "step": 900 + }, + { + "epoch": 1.1040391676866586, + "grad_norm": 0.2164461314678192, + "learning_rate": 3.514739229024944e-05, + "loss": 0.3498, + "num_tokens": 266677132.0, + "step": 902 + }, + { + "epoch": 1.1064871481028151, + "grad_norm": 0.2142316699028015, + "learning_rate": 3.510204081632653e-05, + "loss": 0.3313, + "num_tokens": 267251319.0, + "step": 904 + }, + { + "epoch": 1.108935128518972, + "grad_norm": 0.23115405440330505, + "learning_rate": 3.505668934240363e-05, + "loss": 0.3349, + "num_tokens": 267851096.0, + "step": 906 + }, + { + "epoch": 1.1113831089351285, + "grad_norm": 0.20882946252822876, + "learning_rate": 3.5011337868480725e-05, + "loss": 0.3371, + "num_tokens": 268461913.0, + "step": 908 + }, + { + "epoch": 1.1138310893512853, + "grad_norm": 0.24057303369045258, + "learning_rate": 3.4965986394557825e-05, + "loss": 0.3308, + "num_tokens": 269034487.0, + "step": 910 + }, + { + "epoch": 1.1162790697674418, + "grad_norm": 0.20046664774417877, + "learning_rate": 3.492063492063492e-05, + "loss": 0.3199, + "num_tokens": 269627033.0, + "step": 912 + }, + { + "epoch": 1.1187270501835984, + "grad_norm": 0.2168475240468979, + "learning_rate": 3.487528344671202e-05, + "loss": 0.3149, + "num_tokens": 270221053.0, + "step": 914 + }, + { + "epoch": 1.1211750305997552, + "grad_norm": 0.2270416021347046, + "learning_rate": 3.482993197278911e-05, + "loss": 0.3258, + "num_tokens": 270796741.0, + "step": 916 + }, + { + "epoch": 1.1236230110159118, + "grad_norm": 0.2177935093641281, + "learning_rate": 3.478458049886622e-05, + "loss": 0.3089, + "num_tokens": 271365659.0, + "step": 918 + }, + { + "epoch": 1.1260709914320686, + "grad_norm": 0.23041482269763947, + "learning_rate": 3.473922902494332e-05, + "loss": 0.3268, + "num_tokens": 271959464.0, + "step": 920 + }, + { + "epoch": 1.1285189718482251, + "grad_norm": 0.20442229509353638, + "learning_rate": 3.469387755102041e-05, + "loss": 0.3239, + "num_tokens": 272536606.0, + "step": 922 + }, + { + "epoch": 1.130966952264382, + "grad_norm": 0.21590664982795715, + "learning_rate": 3.464852607709751e-05, + "loss": 0.3156, + "num_tokens": 273133730.0, + "step": 924 + }, + { + "epoch": 1.1334149326805385, + "grad_norm": 0.2138853222131729, + "learning_rate": 3.4603174603174604e-05, + "loss": 0.316, + "num_tokens": 273707414.0, + "step": 926 + }, + { + "epoch": 1.1358629130966953, + "grad_norm": 0.21543395519256592, + "learning_rate": 3.4557823129251704e-05, + "loss": 0.3189, + "num_tokens": 274288846.0, + "step": 928 + }, + { + "epoch": 1.1383108935128519, + "grad_norm": 0.19650688767433167, + "learning_rate": 3.45124716553288e-05, + "loss": 0.3214, + "num_tokens": 274886178.0, + "step": 930 + }, + { + "epoch": 1.1407588739290087, + "grad_norm": 0.22805742919445038, + "learning_rate": 3.44671201814059e-05, + "loss": 0.3257, + "num_tokens": 275470345.0, + "step": 932 + }, + { + "epoch": 1.1432068543451652, + "grad_norm": 0.21502400934696198, + "learning_rate": 3.442176870748299e-05, + "loss": 0.3307, + "num_tokens": 276068187.0, + "step": 934 + }, + { + "epoch": 1.1456548347613218, + "grad_norm": 0.26400136947631836, + "learning_rate": 3.437641723356009e-05, + "loss": 0.3215, + "num_tokens": 276667953.0, + "step": 936 + }, + { + "epoch": 1.1481028151774786, + "grad_norm": 0.2162887305021286, + "learning_rate": 3.433106575963719e-05, + "loss": 0.3324, + "num_tokens": 277258175.0, + "step": 938 + }, + { + "epoch": 1.1505507955936352, + "grad_norm": 0.21460330486297607, + "learning_rate": 3.428571428571429e-05, + "loss": 0.322, + "num_tokens": 277836527.0, + "step": 940 + }, + { + "epoch": 1.152998776009792, + "grad_norm": 0.20602042973041534, + "learning_rate": 3.4240362811791384e-05, + "loss": 0.3315, + "num_tokens": 278405534.0, + "step": 942 + }, + { + "epoch": 1.1554467564259485, + "grad_norm": 0.21357451379299164, + "learning_rate": 3.4195011337868484e-05, + "loss": 0.3287, + "num_tokens": 278993945.0, + "step": 944 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 0.21809221804141998, + "learning_rate": 3.4149659863945583e-05, + "loss": 0.3346, + "num_tokens": 279568621.0, + "step": 946 + }, + { + "epoch": 1.1603427172582619, + "grad_norm": 0.21002452075481415, + "learning_rate": 3.410430839002268e-05, + "loss": 0.3329, + "num_tokens": 280153080.0, + "step": 948 + }, + { + "epoch": 1.1627906976744187, + "grad_norm": 0.20573993027210236, + "learning_rate": 3.4058956916099777e-05, + "loss": 0.3421, + "num_tokens": 280750498.0, + "step": 950 + }, + { + "epoch": 1.1652386780905752, + "grad_norm": 0.21614262461662292, + "learning_rate": 3.401360544217687e-05, + "loss": 0.3345, + "num_tokens": 281356910.0, + "step": 952 + }, + { + "epoch": 1.167686658506732, + "grad_norm": 0.22179672122001648, + "learning_rate": 3.396825396825397e-05, + "loss": 0.3338, + "num_tokens": 281936839.0, + "step": 954 + }, + { + "epoch": 1.1701346389228886, + "grad_norm": 0.21308833360671997, + "learning_rate": 3.392290249433106e-05, + "loss": 0.3247, + "num_tokens": 282545026.0, + "step": 956 + }, + { + "epoch": 1.1725826193390452, + "grad_norm": 0.21291963756084442, + "learning_rate": 3.387755102040816e-05, + "loss": 0.3268, + "num_tokens": 283148652.0, + "step": 958 + }, + { + "epoch": 1.175030599755202, + "grad_norm": 0.23181042075157166, + "learning_rate": 3.383219954648526e-05, + "loss": 0.3272, + "num_tokens": 283741655.0, + "step": 960 + }, + { + "epoch": 1.1774785801713585, + "grad_norm": 0.23421810567378998, + "learning_rate": 3.378684807256236e-05, + "loss": 0.3322, + "num_tokens": 284338996.0, + "step": 962 + }, + { + "epoch": 1.1799265605875153, + "grad_norm": 0.21957792341709137, + "learning_rate": 3.3741496598639456e-05, + "loss": 0.3364, + "num_tokens": 284937493.0, + "step": 964 + }, + { + "epoch": 1.182374541003672, + "grad_norm": 0.2224060446023941, + "learning_rate": 3.3696145124716556e-05, + "loss": 0.3376, + "num_tokens": 285523397.0, + "step": 966 + }, + { + "epoch": 1.1848225214198287, + "grad_norm": 0.20715020596981049, + "learning_rate": 3.3650793650793656e-05, + "loss": 0.3367, + "num_tokens": 286120001.0, + "step": 968 + }, + { + "epoch": 1.1872705018359853, + "grad_norm": 0.1912972629070282, + "learning_rate": 3.360544217687075e-05, + "loss": 0.3258, + "num_tokens": 286695816.0, + "step": 970 + }, + { + "epoch": 1.189718482252142, + "grad_norm": 0.20228362083435059, + "learning_rate": 3.356009070294785e-05, + "loss": 0.3261, + "num_tokens": 287308514.0, + "step": 972 + }, + { + "epoch": 1.1921664626682986, + "grad_norm": 0.21035577356815338, + "learning_rate": 3.351473922902494e-05, + "loss": 0.3111, + "num_tokens": 287864124.0, + "step": 974 + }, + { + "epoch": 1.1946144430844554, + "grad_norm": 0.21060359477996826, + "learning_rate": 3.346938775510204e-05, + "loss": 0.3225, + "num_tokens": 288440548.0, + "step": 976 + }, + { + "epoch": 1.197062423500612, + "grad_norm": 0.21807031333446503, + "learning_rate": 3.342403628117914e-05, + "loss": 0.3266, + "num_tokens": 289023705.0, + "step": 978 + }, + { + "epoch": 1.1995104039167686, + "grad_norm": 0.19630470871925354, + "learning_rate": 3.337868480725624e-05, + "loss": 0.3292, + "num_tokens": 289592388.0, + "step": 980 + }, + { + "epoch": 1.2019583843329253, + "grad_norm": 0.21963970363140106, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.3243, + "num_tokens": 290139669.0, + "step": 982 + }, + { + "epoch": 1.204406364749082, + "grad_norm": 0.20611660182476044, + "learning_rate": 3.3287981859410435e-05, + "loss": 0.3115, + "num_tokens": 290703981.0, + "step": 984 + }, + { + "epoch": 1.2068543451652387, + "grad_norm": 0.21273091435432434, + "learning_rate": 3.324263038548753e-05, + "loss": 0.338, + "num_tokens": 291284094.0, + "step": 986 + }, + { + "epoch": 1.2093023255813953, + "grad_norm": 0.21075144410133362, + "learning_rate": 3.319727891156463e-05, + "loss": 0.3392, + "num_tokens": 291859678.0, + "step": 988 + }, + { + "epoch": 1.211750305997552, + "grad_norm": 0.20937347412109375, + "learning_rate": 3.315192743764173e-05, + "loss": 0.3282, + "num_tokens": 292461042.0, + "step": 990 + }, + { + "epoch": 1.2141982864137086, + "grad_norm": 0.21959975361824036, + "learning_rate": 3.310657596371882e-05, + "loss": 0.3273, + "num_tokens": 293014987.0, + "step": 992 + }, + { + "epoch": 1.2166462668298654, + "grad_norm": 0.19682170450687408, + "learning_rate": 3.306122448979592e-05, + "loss": 0.3153, + "num_tokens": 293602670.0, + "step": 994 + }, + { + "epoch": 1.219094247246022, + "grad_norm": 0.2379043698310852, + "learning_rate": 3.3015873015873014e-05, + "loss": 0.33, + "num_tokens": 294198701.0, + "step": 996 + }, + { + "epoch": 1.2215422276621788, + "grad_norm": 0.21255435049533844, + "learning_rate": 3.2970521541950114e-05, + "loss": 0.3245, + "num_tokens": 294765389.0, + "step": 998 + }, + { + "epoch": 1.2239902080783354, + "grad_norm": 0.2116040587425232, + "learning_rate": 3.2925170068027214e-05, + "loss": 0.3254, + "num_tokens": 295355827.0, + "step": 1000 + }, + { + "epoch": 1.226438188494492, + "grad_norm": 0.21479982137680054, + "learning_rate": 3.2879818594104314e-05, + "loss": 0.3253, + "num_tokens": 295939047.0, + "step": 1002 + }, + { + "epoch": 1.2288861689106487, + "grad_norm": 0.22433559596538544, + "learning_rate": 3.283446712018141e-05, + "loss": 0.332, + "num_tokens": 296511108.0, + "step": 1004 + }, + { + "epoch": 1.2313341493268053, + "grad_norm": 0.2185741513967514, + "learning_rate": 3.278911564625851e-05, + "loss": 0.3355, + "num_tokens": 297091745.0, + "step": 1006 + }, + { + "epoch": 1.233782129742962, + "grad_norm": 0.20891353487968445, + "learning_rate": 3.27437641723356e-05, + "loss": 0.3345, + "num_tokens": 297693752.0, + "step": 1008 + }, + { + "epoch": 1.2362301101591187, + "grad_norm": 0.20652121305465698, + "learning_rate": 3.26984126984127e-05, + "loss": 0.3305, + "num_tokens": 298274684.0, + "step": 1010 + }, + { + "epoch": 1.2386780905752754, + "grad_norm": 0.20441685616970062, + "learning_rate": 3.265306122448979e-05, + "loss": 0.3251, + "num_tokens": 298871738.0, + "step": 1012 + }, + { + "epoch": 1.241126070991432, + "grad_norm": 0.2070181965827942, + "learning_rate": 3.260770975056689e-05, + "loss": 0.3256, + "num_tokens": 299465770.0, + "step": 1014 + }, + { + "epoch": 1.2435740514075888, + "grad_norm": 0.21503402292728424, + "learning_rate": 3.256235827664399e-05, + "loss": 0.3274, + "num_tokens": 300053388.0, + "step": 1016 + }, + { + "epoch": 1.2460220318237454, + "grad_norm": 0.2063676118850708, + "learning_rate": 3.2517006802721086e-05, + "loss": 0.3316, + "num_tokens": 300639945.0, + "step": 1018 + }, + { + "epoch": 1.2484700122399022, + "grad_norm": 0.21437892317771912, + "learning_rate": 3.2471655328798186e-05, + "loss": 0.3482, + "num_tokens": 301230177.0, + "step": 1020 + }, + { + "epoch": 1.2509179926560587, + "grad_norm": 0.2130657136440277, + "learning_rate": 3.2426303854875286e-05, + "loss": 0.3389, + "num_tokens": 301830469.0, + "step": 1022 + }, + { + "epoch": 1.2533659730722153, + "grad_norm": 0.20428848266601562, + "learning_rate": 3.2380952380952386e-05, + "loss": 0.3217, + "num_tokens": 302402485.0, + "step": 1024 + }, + { + "epoch": 1.255813953488372, + "grad_norm": 0.21149156987667084, + "learning_rate": 3.233560090702948e-05, + "loss": 0.3282, + "num_tokens": 303004905.0, + "step": 1026 + }, + { + "epoch": 1.258261933904529, + "grad_norm": 0.2086249142885208, + "learning_rate": 3.229024943310658e-05, + "loss": 0.3253, + "num_tokens": 303599350.0, + "step": 1028 + }, + { + "epoch": 1.2607099143206855, + "grad_norm": 0.1994834840297699, + "learning_rate": 3.224489795918367e-05, + "loss": 0.3306, + "num_tokens": 304216931.0, + "step": 1030 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.21735838055610657, + "learning_rate": 3.219954648526077e-05, + "loss": 0.3282, + "num_tokens": 304825541.0, + "step": 1032 + }, + { + "epoch": 1.2656058751529988, + "grad_norm": 0.1988687366247177, + "learning_rate": 3.2154195011337866e-05, + "loss": 0.3335, + "num_tokens": 305415939.0, + "step": 1034 + }, + { + "epoch": 1.2680538555691554, + "grad_norm": 0.20836502313613892, + "learning_rate": 3.2108843537414965e-05, + "loss": 0.3306, + "num_tokens": 306017718.0, + "step": 1036 + }, + { + "epoch": 1.2705018359853122, + "grad_norm": 0.21479956805706024, + "learning_rate": 3.2063492063492065e-05, + "loss": 0.3347, + "num_tokens": 306632370.0, + "step": 1038 + }, + { + "epoch": 1.2729498164014688, + "grad_norm": 0.1927805244922638, + "learning_rate": 3.2018140589569165e-05, + "loss": 0.3281, + "num_tokens": 307220724.0, + "step": 1040 + }, + { + "epoch": 1.2753977968176256, + "grad_norm": 0.20731164515018463, + "learning_rate": 3.1972789115646265e-05, + "loss": 0.3315, + "num_tokens": 307807114.0, + "step": 1042 + }, + { + "epoch": 1.2778457772337821, + "grad_norm": 0.2059209644794464, + "learning_rate": 3.192743764172336e-05, + "loss": 0.3113, + "num_tokens": 308391074.0, + "step": 1044 + }, + { + "epoch": 1.2802937576499387, + "grad_norm": 0.2285885214805603, + "learning_rate": 3.188208616780046e-05, + "loss": 0.3315, + "num_tokens": 308994281.0, + "step": 1046 + }, + { + "epoch": 1.2827417380660955, + "grad_norm": 0.21077679097652435, + "learning_rate": 3.183673469387755e-05, + "loss": 0.3322, + "num_tokens": 309584100.0, + "step": 1048 + }, + { + "epoch": 1.2851897184822523, + "grad_norm": 0.2088124305009842, + "learning_rate": 3.179138321995465e-05, + "loss": 0.3237, + "num_tokens": 310173078.0, + "step": 1050 + }, + { + "epoch": 1.2876376988984088, + "grad_norm": 0.21228265762329102, + "learning_rate": 3.1746031746031745e-05, + "loss": 0.3283, + "num_tokens": 310756371.0, + "step": 1052 + }, + { + "epoch": 1.2900856793145654, + "grad_norm": 0.2185690701007843, + "learning_rate": 3.1700680272108845e-05, + "loss": 0.3248, + "num_tokens": 311329068.0, + "step": 1054 + }, + { + "epoch": 1.2925336597307222, + "grad_norm": 0.19668897986412048, + "learning_rate": 3.165532879818594e-05, + "loss": 0.3398, + "num_tokens": 311943569.0, + "step": 1056 + }, + { + "epoch": 1.2949816401468788, + "grad_norm": 0.18875733017921448, + "learning_rate": 3.160997732426304e-05, + "loss": 0.3087, + "num_tokens": 312529715.0, + "step": 1058 + }, + { + "epoch": 1.2974296205630356, + "grad_norm": 0.1982879638671875, + "learning_rate": 3.156462585034014e-05, + "loss": 0.3172, + "num_tokens": 313111336.0, + "step": 1060 + }, + { + "epoch": 1.2998776009791921, + "grad_norm": 0.20667016506195068, + "learning_rate": 3.151927437641724e-05, + "loss": 0.3348, + "num_tokens": 313717861.0, + "step": 1062 + }, + { + "epoch": 1.302325581395349, + "grad_norm": 0.20200657844543457, + "learning_rate": 3.147392290249434e-05, + "loss": 0.3245, + "num_tokens": 314306581.0, + "step": 1064 + }, + { + "epoch": 1.3047735618115055, + "grad_norm": 0.2066364288330078, + "learning_rate": 3.142857142857143e-05, + "loss": 0.3094, + "num_tokens": 314870264.0, + "step": 1066 + }, + { + "epoch": 1.307221542227662, + "grad_norm": 0.21934255957603455, + "learning_rate": 3.138321995464853e-05, + "loss": 0.3262, + "num_tokens": 315449261.0, + "step": 1068 + }, + { + "epoch": 1.3096695226438189, + "grad_norm": 0.20628714561462402, + "learning_rate": 3.1337868480725624e-05, + "loss": 0.324, + "num_tokens": 316027882.0, + "step": 1070 + }, + { + "epoch": 1.3121175030599757, + "grad_norm": 0.20073823630809784, + "learning_rate": 3.1292517006802724e-05, + "loss": 0.3214, + "num_tokens": 316628343.0, + "step": 1072 + }, + { + "epoch": 1.3145654834761322, + "grad_norm": 0.20874254405498505, + "learning_rate": 3.124716553287982e-05, + "loss": 0.3402, + "num_tokens": 317198024.0, + "step": 1074 + }, + { + "epoch": 1.3170134638922888, + "grad_norm": 0.20343975722789764, + "learning_rate": 3.120181405895692e-05, + "loss": 0.3269, + "num_tokens": 317772116.0, + "step": 1076 + }, + { + "epoch": 1.3194614443084456, + "grad_norm": 0.20150567591190338, + "learning_rate": 3.115646258503401e-05, + "loss": 0.3309, + "num_tokens": 318373768.0, + "step": 1078 + }, + { + "epoch": 1.3219094247246022, + "grad_norm": 0.20412245392799377, + "learning_rate": 3.111111111111111e-05, + "loss": 0.3277, + "num_tokens": 318961371.0, + "step": 1080 + }, + { + "epoch": 1.324357405140759, + "grad_norm": 0.192244753241539, + "learning_rate": 3.106575963718821e-05, + "loss": 0.3259, + "num_tokens": 319555666.0, + "step": 1082 + }, + { + "epoch": 1.3268053855569155, + "grad_norm": 0.20275932550430298, + "learning_rate": 3.102040816326531e-05, + "loss": 0.3182, + "num_tokens": 320139721.0, + "step": 1084 + }, + { + "epoch": 1.3292533659730723, + "grad_norm": 0.2061118483543396, + "learning_rate": 3.097505668934241e-05, + "loss": 0.3345, + "num_tokens": 320715131.0, + "step": 1086 + }, + { + "epoch": 1.3317013463892289, + "grad_norm": 0.20496678352355957, + "learning_rate": 3.09297052154195e-05, + "loss": 0.3459, + "num_tokens": 321315666.0, + "step": 1088 + }, + { + "epoch": 1.3341493268053854, + "grad_norm": 0.23101527988910675, + "learning_rate": 3.08843537414966e-05, + "loss": 0.328, + "num_tokens": 321875487.0, + "step": 1090 + }, + { + "epoch": 1.3365973072215422, + "grad_norm": 0.2093493640422821, + "learning_rate": 3.0839002267573696e-05, + "loss": 0.3175, + "num_tokens": 322452818.0, + "step": 1092 + }, + { + "epoch": 1.339045287637699, + "grad_norm": 0.22374677658081055, + "learning_rate": 3.0793650793650796e-05, + "loss": 0.3226, + "num_tokens": 323038736.0, + "step": 1094 + }, + { + "epoch": 1.3414932680538556, + "grad_norm": 0.2096761018037796, + "learning_rate": 3.074829931972789e-05, + "loss": 0.3271, + "num_tokens": 323658114.0, + "step": 1096 + }, + { + "epoch": 1.3439412484700122, + "grad_norm": 0.21691083908081055, + "learning_rate": 3.070294784580499e-05, + "loss": 0.3373, + "num_tokens": 324245794.0, + "step": 1098 + }, + { + "epoch": 1.346389228886169, + "grad_norm": 0.2019423097372055, + "learning_rate": 3.065759637188209e-05, + "loss": 0.3267, + "num_tokens": 324874276.0, + "step": 1100 + }, + { + "epoch": 1.3488372093023255, + "grad_norm": 0.23190709948539734, + "learning_rate": 3.061224489795919e-05, + "loss": 0.3252, + "num_tokens": 325448734.0, + "step": 1102 + }, + { + "epoch": 1.3512851897184823, + "grad_norm": 0.2017262876033783, + "learning_rate": 3.056689342403628e-05, + "loss": 0.336, + "num_tokens": 326035181.0, + "step": 1104 + }, + { + "epoch": 1.353733170134639, + "grad_norm": 0.19880731403827667, + "learning_rate": 3.052154195011338e-05, + "loss": 0.3326, + "num_tokens": 326621091.0, + "step": 1106 + }, + { + "epoch": 1.3561811505507957, + "grad_norm": 0.19091013073921204, + "learning_rate": 3.0476190476190482e-05, + "loss": 0.3223, + "num_tokens": 327207895.0, + "step": 1108 + }, + { + "epoch": 1.3586291309669523, + "grad_norm": 0.20472408831119537, + "learning_rate": 3.0430839002267575e-05, + "loss": 0.3386, + "num_tokens": 327792682.0, + "step": 1110 + }, + { + "epoch": 1.3610771113831088, + "grad_norm": 0.20697343349456787, + "learning_rate": 3.0385487528344675e-05, + "loss": 0.3422, + "num_tokens": 328402200.0, + "step": 1112 + }, + { + "epoch": 1.3635250917992656, + "grad_norm": 0.19943897426128387, + "learning_rate": 3.0340136054421768e-05, + "loss": 0.3134, + "num_tokens": 329018344.0, + "step": 1114 + }, + { + "epoch": 1.3659730722154224, + "grad_norm": 0.2011730670928955, + "learning_rate": 3.0294784580498868e-05, + "loss": 0.3129, + "num_tokens": 329597401.0, + "step": 1116 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 0.19293339550495148, + "learning_rate": 3.0249433106575965e-05, + "loss": 0.3338, + "num_tokens": 330218091.0, + "step": 1118 + }, + { + "epoch": 1.3708690330477356, + "grad_norm": 0.2098134309053421, + "learning_rate": 3.0204081632653065e-05, + "loss": 0.3374, + "num_tokens": 330784836.0, + "step": 1120 + }, + { + "epoch": 1.3733170134638923, + "grad_norm": 0.21670743823051453, + "learning_rate": 3.0158730158730158e-05, + "loss": 0.336, + "num_tokens": 331351354.0, + "step": 1122 + }, + { + "epoch": 1.375764993880049, + "grad_norm": 0.2102426141500473, + "learning_rate": 3.0113378684807258e-05, + "loss": 0.3216, + "num_tokens": 331933476.0, + "step": 1124 + }, + { + "epoch": 1.3782129742962057, + "grad_norm": 0.21992817521095276, + "learning_rate": 3.0068027210884354e-05, + "loss": 0.3228, + "num_tokens": 332523353.0, + "step": 1126 + }, + { + "epoch": 1.3806609547123623, + "grad_norm": 0.2108524590730667, + "learning_rate": 3.0022675736961454e-05, + "loss": 0.3224, + "num_tokens": 333096257.0, + "step": 1128 + }, + { + "epoch": 1.383108935128519, + "grad_norm": 0.20453369617462158, + "learning_rate": 2.9977324263038547e-05, + "loss": 0.34, + "num_tokens": 333704506.0, + "step": 1130 + }, + { + "epoch": 1.3855569155446756, + "grad_norm": 0.20225991308689117, + "learning_rate": 2.9931972789115647e-05, + "loss": 0.3245, + "num_tokens": 334279055.0, + "step": 1132 + }, + { + "epoch": 1.3880048959608322, + "grad_norm": 0.21464355289936066, + "learning_rate": 2.9886621315192747e-05, + "loss": 0.3341, + "num_tokens": 334885694.0, + "step": 1134 + }, + { + "epoch": 1.390452876376989, + "grad_norm": 0.20687837898731232, + "learning_rate": 2.9841269841269844e-05, + "loss": 0.3311, + "num_tokens": 335482541.0, + "step": 1136 + }, + { + "epoch": 1.3929008567931458, + "grad_norm": 0.20861484110355377, + "learning_rate": 2.9795918367346944e-05, + "loss": 0.3327, + "num_tokens": 336090417.0, + "step": 1138 + }, + { + "epoch": 1.3953488372093024, + "grad_norm": 0.20611432194709778, + "learning_rate": 2.9750566893424037e-05, + "loss": 0.322, + "num_tokens": 336652660.0, + "step": 1140 + }, + { + "epoch": 1.397796817625459, + "grad_norm": 0.215005561709404, + "learning_rate": 2.9705215419501137e-05, + "loss": 0.3167, + "num_tokens": 337236563.0, + "step": 1142 + }, + { + "epoch": 1.4002447980416157, + "grad_norm": 0.20051907002925873, + "learning_rate": 2.965986394557823e-05, + "loss": 0.3236, + "num_tokens": 337843323.0, + "step": 1144 + }, + { + "epoch": 1.4026927784577723, + "grad_norm": 0.20609988272190094, + "learning_rate": 2.961451247165533e-05, + "loss": 0.323, + "num_tokens": 338432787.0, + "step": 1146 + }, + { + "epoch": 1.405140758873929, + "grad_norm": 0.21007870137691498, + "learning_rate": 2.9569160997732426e-05, + "loss": 0.3237, + "num_tokens": 339035087.0, + "step": 1148 + }, + { + "epoch": 1.4075887392900857, + "grad_norm": 0.2034088373184204, + "learning_rate": 2.9523809523809526e-05, + "loss": 0.3362, + "num_tokens": 339651239.0, + "step": 1150 + }, + { + "epoch": 1.4100367197062424, + "grad_norm": 0.20170451700687408, + "learning_rate": 2.947845804988662e-05, + "loss": 0.3203, + "num_tokens": 340245882.0, + "step": 1152 + }, + { + "epoch": 1.412484700122399, + "grad_norm": 0.20183579623699188, + "learning_rate": 2.943310657596372e-05, + "loss": 0.3253, + "num_tokens": 340840812.0, + "step": 1154 + }, + { + "epoch": 1.4149326805385556, + "grad_norm": 0.20300714671611786, + "learning_rate": 2.938775510204082e-05, + "loss": 0.3234, + "num_tokens": 341421452.0, + "step": 1156 + }, + { + "epoch": 1.4173806609547124, + "grad_norm": 0.2021717131137848, + "learning_rate": 2.9342403628117916e-05, + "loss": 0.3229, + "num_tokens": 342020118.0, + "step": 1158 + }, + { + "epoch": 1.4198286413708692, + "grad_norm": 0.19948482513427734, + "learning_rate": 2.9297052154195016e-05, + "loss": 0.3295, + "num_tokens": 342641497.0, + "step": 1160 + }, + { + "epoch": 1.4222766217870257, + "grad_norm": 0.20790904760360718, + "learning_rate": 2.925170068027211e-05, + "loss": 0.3248, + "num_tokens": 343240062.0, + "step": 1162 + }, + { + "epoch": 1.4247246022031823, + "grad_norm": 0.18957248330116272, + "learning_rate": 2.920634920634921e-05, + "loss": 0.3258, + "num_tokens": 343834833.0, + "step": 1164 + }, + { + "epoch": 1.427172582619339, + "grad_norm": 0.199366956949234, + "learning_rate": 2.9160997732426306e-05, + "loss": 0.3181, + "num_tokens": 344424327.0, + "step": 1166 + }, + { + "epoch": 1.4296205630354957, + "grad_norm": 0.2107955366373062, + "learning_rate": 2.9115646258503405e-05, + "loss": 0.3386, + "num_tokens": 345023321.0, + "step": 1168 + }, + { + "epoch": 1.4320685434516525, + "grad_norm": 0.19503648579120636, + "learning_rate": 2.90702947845805e-05, + "loss": 0.3311, + "num_tokens": 345650727.0, + "step": 1170 + }, + { + "epoch": 1.434516523867809, + "grad_norm": 0.21081791818141937, + "learning_rate": 2.90249433106576e-05, + "loss": 0.3137, + "num_tokens": 346241537.0, + "step": 1172 + }, + { + "epoch": 1.4369645042839658, + "grad_norm": 0.21696153283119202, + "learning_rate": 2.8979591836734692e-05, + "loss": 0.3388, + "num_tokens": 346834512.0, + "step": 1174 + }, + { + "epoch": 1.4394124847001224, + "grad_norm": 0.21718978881835938, + "learning_rate": 2.893424036281179e-05, + "loss": 0.3284, + "num_tokens": 347427754.0, + "step": 1176 + }, + { + "epoch": 1.441860465116279, + "grad_norm": 0.20073866844177246, + "learning_rate": 2.8888888888888888e-05, + "loss": 0.3302, + "num_tokens": 348010163.0, + "step": 1178 + }, + { + "epoch": 1.4443084455324358, + "grad_norm": 0.20857316255569458, + "learning_rate": 2.8843537414965988e-05, + "loss": 0.3279, + "num_tokens": 348586523.0, + "step": 1180 + }, + { + "epoch": 1.4467564259485923, + "grad_norm": 0.2052152156829834, + "learning_rate": 2.8798185941043088e-05, + "loss": 0.3147, + "num_tokens": 349176270.0, + "step": 1182 + }, + { + "epoch": 1.4492044063647491, + "grad_norm": 0.19987402856349945, + "learning_rate": 2.875283446712018e-05, + "loss": 0.3302, + "num_tokens": 349747952.0, + "step": 1184 + }, + { + "epoch": 1.4516523867809057, + "grad_norm": 0.20046475529670715, + "learning_rate": 2.870748299319728e-05, + "loss": 0.3209, + "num_tokens": 350345000.0, + "step": 1186 + }, + { + "epoch": 1.4541003671970625, + "grad_norm": 0.20955757796764374, + "learning_rate": 2.8662131519274378e-05, + "loss": 0.3226, + "num_tokens": 350916326.0, + "step": 1188 + }, + { + "epoch": 1.456548347613219, + "grad_norm": 0.19657090306282043, + "learning_rate": 2.8616780045351478e-05, + "loss": 0.3269, + "num_tokens": 351523950.0, + "step": 1190 + }, + { + "epoch": 1.4589963280293758, + "grad_norm": 0.20323355495929718, + "learning_rate": 2.857142857142857e-05, + "loss": 0.323, + "num_tokens": 352134013.0, + "step": 1192 + }, + { + "epoch": 1.4614443084455324, + "grad_norm": 0.20946143567562103, + "learning_rate": 2.852607709750567e-05, + "loss": 0.3237, + "num_tokens": 352718396.0, + "step": 1194 + }, + { + "epoch": 1.4638922888616892, + "grad_norm": 0.20157726109027863, + "learning_rate": 2.8480725623582767e-05, + "loss": 0.3332, + "num_tokens": 353339745.0, + "step": 1196 + }, + { + "epoch": 1.4663402692778458, + "grad_norm": 0.20638763904571533, + "learning_rate": 2.8435374149659867e-05, + "loss": 0.3148, + "num_tokens": 353931150.0, + "step": 1198 + }, + { + "epoch": 1.4687882496940023, + "grad_norm": 0.21108146011829376, + "learning_rate": 2.839002267573696e-05, + "loss": 0.3316, + "num_tokens": 354522064.0, + "step": 1200 + }, + { + "epoch": 1.4712362301101591, + "grad_norm": 0.20062784850597382, + "learning_rate": 2.834467120181406e-05, + "loss": 0.3165, + "num_tokens": 355117113.0, + "step": 1202 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.20963874459266663, + "learning_rate": 2.829931972789116e-05, + "loss": 0.3248, + "num_tokens": 355730097.0, + "step": 1204 + }, + { + "epoch": 1.4761321909424725, + "grad_norm": 0.20895878970623016, + "learning_rate": 2.8253968253968253e-05, + "loss": 0.3229, + "num_tokens": 356338488.0, + "step": 1206 + }, + { + "epoch": 1.478580171358629, + "grad_norm": 0.19683930277824402, + "learning_rate": 2.8208616780045353e-05, + "loss": 0.3356, + "num_tokens": 356939071.0, + "step": 1208 + }, + { + "epoch": 1.4810281517747859, + "grad_norm": 0.196613609790802, + "learning_rate": 2.816326530612245e-05, + "loss": 0.3294, + "num_tokens": 357525179.0, + "step": 1210 + }, + { + "epoch": 1.4834761321909424, + "grad_norm": 0.19904600083827972, + "learning_rate": 2.811791383219955e-05, + "loss": 0.3296, + "num_tokens": 358154615.0, + "step": 1212 + }, + { + "epoch": 1.4859241126070992, + "grad_norm": 0.20643939077854156, + "learning_rate": 2.8072562358276643e-05, + "loss": 0.3272, + "num_tokens": 358716821.0, + "step": 1214 + }, + { + "epoch": 1.4883720930232558, + "grad_norm": 0.19810499250888824, + "learning_rate": 2.8027210884353743e-05, + "loss": 0.3236, + "num_tokens": 359309088.0, + "step": 1216 + }, + { + "epoch": 1.4908200734394126, + "grad_norm": 0.19915635883808136, + "learning_rate": 2.798185941043084e-05, + "loss": 0.3221, + "num_tokens": 359882160.0, + "step": 1218 + }, + { + "epoch": 1.4932680538555692, + "grad_norm": 0.19255715608596802, + "learning_rate": 2.793650793650794e-05, + "loss": 0.3289, + "num_tokens": 360488038.0, + "step": 1220 + }, + { + "epoch": 1.4957160342717257, + "grad_norm": 0.19077791273593903, + "learning_rate": 2.7891156462585033e-05, + "loss": 0.3222, + "num_tokens": 361096239.0, + "step": 1222 + }, + { + "epoch": 1.4981640146878825, + "grad_norm": 0.19892817735671997, + "learning_rate": 2.7845804988662133e-05, + "loss": 0.3199, + "num_tokens": 361655687.0, + "step": 1224 + }, + { + "epoch": 1.5006119951040393, + "grad_norm": 0.2043662965297699, + "learning_rate": 2.7800453514739233e-05, + "loss": 0.319, + "num_tokens": 362235721.0, + "step": 1226 + }, + { + "epoch": 1.5030599755201959, + "grad_norm": 0.19817014038562775, + "learning_rate": 2.775510204081633e-05, + "loss": 0.3369, + "num_tokens": 362845579.0, + "step": 1228 + }, + { + "epoch": 1.5055079559363524, + "grad_norm": 0.2069779485464096, + "learning_rate": 2.770975056689343e-05, + "loss": 0.3198, + "num_tokens": 363422588.0, + "step": 1230 + }, + { + "epoch": 1.5079559363525092, + "grad_norm": 0.2065594643354416, + "learning_rate": 2.7664399092970522e-05, + "loss": 0.3295, + "num_tokens": 364012902.0, + "step": 1232 + }, + { + "epoch": 1.5104039167686658, + "grad_norm": 0.19589748978614807, + "learning_rate": 2.7619047619047622e-05, + "loss": 0.3132, + "num_tokens": 364590381.0, + "step": 1234 + }, + { + "epoch": 1.5128518971848224, + "grad_norm": 0.20354430377483368, + "learning_rate": 2.7573696145124715e-05, + "loss": 0.32, + "num_tokens": 365171373.0, + "step": 1236 + }, + { + "epoch": 1.5152998776009792, + "grad_norm": 0.206694558262825, + "learning_rate": 2.7528344671201815e-05, + "loss": 0.3239, + "num_tokens": 365763431.0, + "step": 1238 + }, + { + "epoch": 1.517747858017136, + "grad_norm": 0.20106418430805206, + "learning_rate": 2.7482993197278912e-05, + "loss": 0.3286, + "num_tokens": 366380291.0, + "step": 1240 + }, + { + "epoch": 1.5201958384332925, + "grad_norm": 0.21432510018348694, + "learning_rate": 2.7437641723356012e-05, + "loss": 0.3258, + "num_tokens": 366967811.0, + "step": 1242 + }, + { + "epoch": 1.522643818849449, + "grad_norm": 0.22852499783039093, + "learning_rate": 2.7392290249433105e-05, + "loss": 0.3276, + "num_tokens": 367561596.0, + "step": 1244 + }, + { + "epoch": 1.525091799265606, + "grad_norm": 0.19916276633739471, + "learning_rate": 2.7346938775510205e-05, + "loss": 0.3134, + "num_tokens": 368150273.0, + "step": 1246 + }, + { + "epoch": 1.5275397796817627, + "grad_norm": 0.19989925622940063, + "learning_rate": 2.73015873015873e-05, + "loss": 0.3182, + "num_tokens": 368720930.0, + "step": 1248 + }, + { + "epoch": 1.5299877600979193, + "grad_norm": 0.19930680096149445, + "learning_rate": 2.72562358276644e-05, + "loss": 0.3334, + "num_tokens": 369314143.0, + "step": 1250 + }, + { + "epoch": 1.5324357405140758, + "grad_norm": 0.2082466185092926, + "learning_rate": 2.72108843537415e-05, + "loss": 0.3179, + "num_tokens": 369888591.0, + "step": 1252 + }, + { + "epoch": 1.5348837209302326, + "grad_norm": 0.19917330145835876, + "learning_rate": 2.7165532879818594e-05, + "loss": 0.3291, + "num_tokens": 370474915.0, + "step": 1254 + }, + { + "epoch": 1.5373317013463892, + "grad_norm": 0.20406271517276764, + "learning_rate": 2.7120181405895694e-05, + "loss": 0.3117, + "num_tokens": 371038197.0, + "step": 1256 + }, + { + "epoch": 1.5397796817625458, + "grad_norm": 0.1972333788871765, + "learning_rate": 2.707482993197279e-05, + "loss": 0.3259, + "num_tokens": 371644781.0, + "step": 1258 + }, + { + "epoch": 1.5422276621787026, + "grad_norm": 0.19415195286273956, + "learning_rate": 2.702947845804989e-05, + "loss": 0.3137, + "num_tokens": 372236254.0, + "step": 1260 + }, + { + "epoch": 1.5446756425948593, + "grad_norm": 0.21226201951503754, + "learning_rate": 2.6984126984126984e-05, + "loss": 0.3247, + "num_tokens": 372798274.0, + "step": 1262 + }, + { + "epoch": 1.547123623011016, + "grad_norm": 0.19908924400806427, + "learning_rate": 2.6938775510204084e-05, + "loss": 0.3251, + "num_tokens": 373380435.0, + "step": 1264 + }, + { + "epoch": 1.5495716034271725, + "grad_norm": 0.20480336248874664, + "learning_rate": 2.6893424036281177e-05, + "loss": 0.3184, + "num_tokens": 373955134.0, + "step": 1266 + }, + { + "epoch": 1.5520195838433293, + "grad_norm": 0.20377330482006073, + "learning_rate": 2.6848072562358277e-05, + "loss": 0.3084, + "num_tokens": 374540265.0, + "step": 1268 + }, + { + "epoch": 1.554467564259486, + "grad_norm": 0.19568265974521637, + "learning_rate": 2.6802721088435374e-05, + "loss": 0.3254, + "num_tokens": 375154456.0, + "step": 1270 + }, + { + "epoch": 1.5569155446756426, + "grad_norm": 0.20017024874687195, + "learning_rate": 2.6757369614512473e-05, + "loss": 0.327, + "num_tokens": 375773065.0, + "step": 1272 + }, + { + "epoch": 1.5593635250917992, + "grad_norm": 0.19712147116661072, + "learning_rate": 2.6712018140589573e-05, + "loss": 0.3182, + "num_tokens": 376389443.0, + "step": 1274 + }, + { + "epoch": 1.561811505507956, + "grad_norm": 0.20283843576908112, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.3221, + "num_tokens": 376982296.0, + "step": 1276 + }, + { + "epoch": 1.5642594859241126, + "grad_norm": 0.19853192567825317, + "learning_rate": 2.6621315192743767e-05, + "loss": 0.3228, + "num_tokens": 377584261.0, + "step": 1278 + }, + { + "epoch": 1.5667074663402691, + "grad_norm": 0.1841582953929901, + "learning_rate": 2.6575963718820863e-05, + "loss": 0.3275, + "num_tokens": 378197370.0, + "step": 1280 + }, + { + "epoch": 1.569155446756426, + "grad_norm": 0.20520064234733582, + "learning_rate": 2.6530612244897963e-05, + "loss": 0.3297, + "num_tokens": 378763480.0, + "step": 1282 + }, + { + "epoch": 1.5716034271725827, + "grad_norm": 0.1980048269033432, + "learning_rate": 2.6485260770975056e-05, + "loss": 0.3232, + "num_tokens": 379348302.0, + "step": 1284 + }, + { + "epoch": 1.5740514075887393, + "grad_norm": 0.2057088166475296, + "learning_rate": 2.6439909297052156e-05, + "loss": 0.3201, + "num_tokens": 379944029.0, + "step": 1286 + }, + { + "epoch": 1.5764993880048959, + "grad_norm": 0.19687974452972412, + "learning_rate": 2.6394557823129253e-05, + "loss": 0.3308, + "num_tokens": 380545276.0, + "step": 1288 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.19232483208179474, + "learning_rate": 2.6349206349206353e-05, + "loss": 0.3177, + "num_tokens": 381165952.0, + "step": 1290 + }, + { + "epoch": 1.5813953488372094, + "grad_norm": 0.19555847346782684, + "learning_rate": 2.6303854875283446e-05, + "loss": 0.3235, + "num_tokens": 381731938.0, + "step": 1292 + }, + { + "epoch": 1.583843329253366, + "grad_norm": 0.200959712266922, + "learning_rate": 2.6258503401360546e-05, + "loss": 0.3318, + "num_tokens": 382347508.0, + "step": 1294 + }, + { + "epoch": 1.5862913096695226, + "grad_norm": 0.20044825971126556, + "learning_rate": 2.621315192743764e-05, + "loss": 0.3107, + "num_tokens": 382959697.0, + "step": 1296 + }, + { + "epoch": 1.5887392900856794, + "grad_norm": 0.20267418026924133, + "learning_rate": 2.616780045351474e-05, + "loss": 0.3175, + "num_tokens": 383562023.0, + "step": 1298 + }, + { + "epoch": 1.591187270501836, + "grad_norm": 0.2004210501909256, + "learning_rate": 2.612244897959184e-05, + "loss": 0.3244, + "num_tokens": 384162298.0, + "step": 1300 + }, + { + "epoch": 1.5936352509179925, + "grad_norm": 0.2034674882888794, + "learning_rate": 2.6077097505668935e-05, + "loss": 0.3214, + "num_tokens": 384785150.0, + "step": 1302 + }, + { + "epoch": 1.5960832313341493, + "grad_norm": 0.19652917981147766, + "learning_rate": 2.6031746031746035e-05, + "loss": 0.3256, + "num_tokens": 385378921.0, + "step": 1304 + }, + { + "epoch": 1.598531211750306, + "grad_norm": 0.19352909922599792, + "learning_rate": 2.598639455782313e-05, + "loss": 0.3222, + "num_tokens": 385993130.0, + "step": 1306 + }, + { + "epoch": 1.6009791921664627, + "grad_norm": 0.19561731815338135, + "learning_rate": 2.594104308390023e-05, + "loss": 0.3193, + "num_tokens": 386558774.0, + "step": 1308 + }, + { + "epoch": 1.6034271725826192, + "grad_norm": 0.19160446524620056, + "learning_rate": 2.5895691609977325e-05, + "loss": 0.325, + "num_tokens": 387160513.0, + "step": 1310 + }, + { + "epoch": 1.605875152998776, + "grad_norm": 0.195190891623497, + "learning_rate": 2.5850340136054425e-05, + "loss": 0.3231, + "num_tokens": 387749539.0, + "step": 1312 + }, + { + "epoch": 1.6083231334149328, + "grad_norm": 0.1952180415391922, + "learning_rate": 2.5804988662131518e-05, + "loss": 0.319, + "num_tokens": 388325169.0, + "step": 1314 + }, + { + "epoch": 1.6107711138310894, + "grad_norm": 0.20217673480510712, + "learning_rate": 2.5759637188208618e-05, + "loss": 0.3296, + "num_tokens": 388917268.0, + "step": 1316 + }, + { + "epoch": 1.613219094247246, + "grad_norm": 0.21074482798576355, + "learning_rate": 2.5714285714285714e-05, + "loss": 0.3294, + "num_tokens": 389511580.0, + "step": 1318 + }, + { + "epoch": 1.6156670746634028, + "grad_norm": 0.1909942328929901, + "learning_rate": 2.5668934240362814e-05, + "loss": 0.3237, + "num_tokens": 390087089.0, + "step": 1320 + }, + { + "epoch": 1.6181150550795593, + "grad_norm": 0.2082855999469757, + "learning_rate": 2.5623582766439914e-05, + "loss": 0.3199, + "num_tokens": 390679866.0, + "step": 1322 + }, + { + "epoch": 1.620563035495716, + "grad_norm": 0.19247978925704956, + "learning_rate": 2.5578231292517007e-05, + "loss": 0.319, + "num_tokens": 391280514.0, + "step": 1324 + }, + { + "epoch": 1.6230110159118727, + "grad_norm": 0.19276100397109985, + "learning_rate": 2.5532879818594107e-05, + "loss": 0.3202, + "num_tokens": 391889240.0, + "step": 1326 + }, + { + "epoch": 1.6254589963280295, + "grad_norm": 0.20893600583076477, + "learning_rate": 2.54875283446712e-05, + "loss": 0.3134, + "num_tokens": 392471411.0, + "step": 1328 + }, + { + "epoch": 1.627906976744186, + "grad_norm": 0.20791074633598328, + "learning_rate": 2.54421768707483e-05, + "loss": 0.3204, + "num_tokens": 393038825.0, + "step": 1330 + }, + { + "epoch": 1.6303549571603426, + "grad_norm": 0.19956758618354797, + "learning_rate": 2.5396825396825397e-05, + "loss": 0.3174, + "num_tokens": 393629546.0, + "step": 1332 + }, + { + "epoch": 1.6328029375764994, + "grad_norm": 0.22147151827812195, + "learning_rate": 2.5351473922902497e-05, + "loss": 0.3329, + "num_tokens": 394214582.0, + "step": 1334 + }, + { + "epoch": 1.6352509179926562, + "grad_norm": 0.20094825327396393, + "learning_rate": 2.530612244897959e-05, + "loss": 0.3148, + "num_tokens": 394799830.0, + "step": 1336 + }, + { + "epoch": 1.6376988984088128, + "grad_norm": 0.20825228095054626, + "learning_rate": 2.526077097505669e-05, + "loss": 0.3285, + "num_tokens": 395383444.0, + "step": 1338 + }, + { + "epoch": 1.6401468788249693, + "grad_norm": 0.19322358071804047, + "learning_rate": 2.5215419501133787e-05, + "loss": 0.3203, + "num_tokens": 395962925.0, + "step": 1340 + }, + { + "epoch": 1.6425948592411261, + "grad_norm": 0.19959576427936554, + "learning_rate": 2.5170068027210887e-05, + "loss": 0.3165, + "num_tokens": 396542126.0, + "step": 1342 + }, + { + "epoch": 1.6450428396572827, + "grad_norm": 0.1989603191614151, + "learning_rate": 2.5124716553287987e-05, + "loss": 0.3216, + "num_tokens": 397126157.0, + "step": 1344 + }, + { + "epoch": 1.6474908200734393, + "grad_norm": 0.19598689675331116, + "learning_rate": 2.507936507936508e-05, + "loss": 0.3111, + "num_tokens": 397699343.0, + "step": 1346 + }, + { + "epoch": 1.649938800489596, + "grad_norm": 0.18742188811302185, + "learning_rate": 2.503401360544218e-05, + "loss": 0.3206, + "num_tokens": 398272984.0, + "step": 1348 + }, + { + "epoch": 1.6523867809057529, + "grad_norm": 0.19236139953136444, + "learning_rate": 2.4988662131519276e-05, + "loss": 0.3167, + "num_tokens": 398864706.0, + "step": 1350 + }, + { + "epoch": 1.6548347613219094, + "grad_norm": 0.20463594794273376, + "learning_rate": 2.4943310657596373e-05, + "loss": 0.3102, + "num_tokens": 399430725.0, + "step": 1352 + }, + { + "epoch": 1.657282741738066, + "grad_norm": 0.20807313919067383, + "learning_rate": 2.489795918367347e-05, + "loss": 0.3084, + "num_tokens": 400017577.0, + "step": 1354 + }, + { + "epoch": 1.6597307221542228, + "grad_norm": 0.1957245022058487, + "learning_rate": 2.485260770975057e-05, + "loss": 0.3359, + "num_tokens": 400605339.0, + "step": 1356 + }, + { + "epoch": 1.6621787025703796, + "grad_norm": 0.2026834934949875, + "learning_rate": 2.4807256235827666e-05, + "loss": 0.3065, + "num_tokens": 401187208.0, + "step": 1358 + }, + { + "epoch": 1.6646266829865362, + "grad_norm": 0.20324279367923737, + "learning_rate": 2.4761904761904762e-05, + "loss": 0.3119, + "num_tokens": 401780772.0, + "step": 1360 + }, + { + "epoch": 1.6670746634026927, + "grad_norm": 0.19772303104400635, + "learning_rate": 2.4716553287981862e-05, + "loss": 0.3375, + "num_tokens": 402372693.0, + "step": 1362 + }, + { + "epoch": 1.6695226438188495, + "grad_norm": 0.18982139229774475, + "learning_rate": 2.467120181405896e-05, + "loss": 0.3225, + "num_tokens": 402964749.0, + "step": 1364 + }, + { + "epoch": 1.671970624235006, + "grad_norm": 0.2030271738767624, + "learning_rate": 2.4625850340136055e-05, + "loss": 0.3222, + "num_tokens": 403556832.0, + "step": 1366 + }, + { + "epoch": 1.6744186046511627, + "grad_norm": 0.2075023055076599, + "learning_rate": 2.4580498866213152e-05, + "loss": 0.3256, + "num_tokens": 404148708.0, + "step": 1368 + }, + { + "epoch": 1.6768665850673194, + "grad_norm": 0.1922995150089264, + "learning_rate": 2.4535147392290252e-05, + "loss": 0.3216, + "num_tokens": 404747610.0, + "step": 1370 + }, + { + "epoch": 1.6793145654834762, + "grad_norm": 0.2042100578546524, + "learning_rate": 2.448979591836735e-05, + "loss": 0.3214, + "num_tokens": 405325414.0, + "step": 1372 + }, + { + "epoch": 1.6817625458996328, + "grad_norm": 0.20673443377017975, + "learning_rate": 2.4444444444444445e-05, + "loss": 0.3281, + "num_tokens": 405909420.0, + "step": 1374 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.20394203066825867, + "learning_rate": 2.439909297052154e-05, + "loss": 0.3273, + "num_tokens": 406467182.0, + "step": 1376 + }, + { + "epoch": 1.6866585067319462, + "grad_norm": 0.18502503633499146, + "learning_rate": 2.4353741496598638e-05, + "loss": 0.3245, + "num_tokens": 407084206.0, + "step": 1378 + }, + { + "epoch": 1.689106487148103, + "grad_norm": 0.19569917023181915, + "learning_rate": 2.4308390022675738e-05, + "loss": 0.3221, + "num_tokens": 407692897.0, + "step": 1380 + }, + { + "epoch": 1.6915544675642595, + "grad_norm": 0.23221711814403534, + "learning_rate": 2.4263038548752838e-05, + "loss": 0.3224, + "num_tokens": 408288198.0, + "step": 1382 + }, + { + "epoch": 1.694002447980416, + "grad_norm": 0.28485190868377686, + "learning_rate": 2.4217687074829934e-05, + "loss": 0.318, + "num_tokens": 408862070.0, + "step": 1384 + }, + { + "epoch": 1.696450428396573, + "grad_norm": 0.19002261757850647, + "learning_rate": 2.417233560090703e-05, + "loss": 0.3053, + "num_tokens": 409401455.0, + "step": 1386 + }, + { + "epoch": 1.6988984088127295, + "grad_norm": 0.21468402445316315, + "learning_rate": 2.4126984126984128e-05, + "loss": 0.316, + "num_tokens": 409981536.0, + "step": 1388 + }, + { + "epoch": 1.701346389228886, + "grad_norm": 0.19267933070659637, + "learning_rate": 2.4081632653061224e-05, + "loss": 0.3222, + "num_tokens": 410624142.0, + "step": 1390 + }, + { + "epoch": 1.7037943696450428, + "grad_norm": 0.19758130609989166, + "learning_rate": 2.4036281179138324e-05, + "loss": 0.314, + "num_tokens": 411217859.0, + "step": 1392 + }, + { + "epoch": 1.7062423500611996, + "grad_norm": 0.20600932836532593, + "learning_rate": 2.399092970521542e-05, + "loss": 0.3161, + "num_tokens": 411793639.0, + "step": 1394 + }, + { + "epoch": 1.7086903304773562, + "grad_norm": 0.2014395147562027, + "learning_rate": 2.3945578231292517e-05, + "loss": 0.3248, + "num_tokens": 412382835.0, + "step": 1396 + }, + { + "epoch": 1.7111383108935128, + "grad_norm": 0.19331228733062744, + "learning_rate": 2.3900226757369614e-05, + "loss": 0.3121, + "num_tokens": 412978513.0, + "step": 1398 + }, + { + "epoch": 1.7135862913096696, + "grad_norm": 0.2003536969423294, + "learning_rate": 2.3854875283446714e-05, + "loss": 0.3227, + "num_tokens": 413548350.0, + "step": 1400 + }, + { + "epoch": 1.7160342717258263, + "grad_norm": 0.19575215876102448, + "learning_rate": 2.380952380952381e-05, + "loss": 0.3138, + "num_tokens": 414157385.0, + "step": 1402 + }, + { + "epoch": 1.718482252141983, + "grad_norm": 0.20070458948612213, + "learning_rate": 2.376417233560091e-05, + "loss": 0.3203, + "num_tokens": 414745862.0, + "step": 1404 + }, + { + "epoch": 1.7209302325581395, + "grad_norm": 0.19983048737049103, + "learning_rate": 2.3718820861678007e-05, + "loss": 0.3236, + "num_tokens": 415343977.0, + "step": 1406 + }, + { + "epoch": 1.7233782129742963, + "grad_norm": 0.1968611776828766, + "learning_rate": 2.3673469387755103e-05, + "loss": 0.3037, + "num_tokens": 415928711.0, + "step": 1408 + }, + { + "epoch": 1.7258261933904528, + "grad_norm": 0.18583181500434875, + "learning_rate": 2.36281179138322e-05, + "loss": 0.3056, + "num_tokens": 416548135.0, + "step": 1410 + }, + { + "epoch": 1.7282741738066094, + "grad_norm": 0.2043532431125641, + "learning_rate": 2.35827664399093e-05, + "loss": 0.3198, + "num_tokens": 417136899.0, + "step": 1412 + }, + { + "epoch": 1.7307221542227662, + "grad_norm": 0.19686143100261688, + "learning_rate": 2.3537414965986396e-05, + "loss": 0.3239, + "num_tokens": 417734774.0, + "step": 1414 + }, + { + "epoch": 1.733170134638923, + "grad_norm": 0.20929764211177826, + "learning_rate": 2.3492063492063493e-05, + "loss": 0.3055, + "num_tokens": 418335313.0, + "step": 1416 + }, + { + "epoch": 1.7356181150550796, + "grad_norm": 0.18701575696468353, + "learning_rate": 2.344671201814059e-05, + "loss": 0.3268, + "num_tokens": 418929394.0, + "step": 1418 + }, + { + "epoch": 1.7380660954712361, + "grad_norm": 0.21724991500377655, + "learning_rate": 2.3401360544217686e-05, + "loss": 0.3127, + "num_tokens": 419530495.0, + "step": 1420 + }, + { + "epoch": 1.740514075887393, + "grad_norm": 0.21731635928153992, + "learning_rate": 2.3356009070294786e-05, + "loss": 0.33, + "num_tokens": 420103794.0, + "step": 1422 + }, + { + "epoch": 1.7429620563035497, + "grad_norm": 0.19159282743930817, + "learning_rate": 2.3310657596371882e-05, + "loss": 0.3353, + "num_tokens": 420709158.0, + "step": 1424 + }, + { + "epoch": 1.7454100367197063, + "grad_norm": 0.19120649993419647, + "learning_rate": 2.326530612244898e-05, + "loss": 0.3311, + "num_tokens": 421336727.0, + "step": 1426 + }, + { + "epoch": 1.7478580171358629, + "grad_norm": 0.19759586453437805, + "learning_rate": 2.321995464852608e-05, + "loss": 0.3176, + "num_tokens": 421928607.0, + "step": 1428 + }, + { + "epoch": 1.7503059975520197, + "grad_norm": 0.195876806974411, + "learning_rate": 2.3174603174603175e-05, + "loss": 0.3141, + "num_tokens": 422497348.0, + "step": 1430 + }, + { + "epoch": 1.7527539779681762, + "grad_norm": 0.18995866179466248, + "learning_rate": 2.3129251700680275e-05, + "loss": 0.3161, + "num_tokens": 423079979.0, + "step": 1432 + }, + { + "epoch": 1.7552019583843328, + "grad_norm": 0.18858186900615692, + "learning_rate": 2.3083900226757372e-05, + "loss": 0.3202, + "num_tokens": 423687525.0, + "step": 1434 + }, + { + "epoch": 1.7576499388004896, + "grad_norm": 0.18937230110168457, + "learning_rate": 2.303854875283447e-05, + "loss": 0.3108, + "num_tokens": 424262443.0, + "step": 1436 + }, + { + "epoch": 1.7600979192166464, + "grad_norm": 0.205905944108963, + "learning_rate": 2.2993197278911565e-05, + "loss": 0.3221, + "num_tokens": 424847534.0, + "step": 1438 + }, + { + "epoch": 1.762545899632803, + "grad_norm": 0.19281575083732605, + "learning_rate": 2.294784580498866e-05, + "loss": 0.3184, + "num_tokens": 425444472.0, + "step": 1440 + }, + { + "epoch": 1.7649938800489595, + "grad_norm": 0.20127756893634796, + "learning_rate": 2.290249433106576e-05, + "loss": 0.3305, + "num_tokens": 426049494.0, + "step": 1442 + }, + { + "epoch": 1.7674418604651163, + "grad_norm": 0.19298484921455383, + "learning_rate": 2.2857142857142858e-05, + "loss": 0.3174, + "num_tokens": 426653447.0, + "step": 1444 + }, + { + "epoch": 1.769889840881273, + "grad_norm": 0.19422824680805206, + "learning_rate": 2.2811791383219955e-05, + "loss": 0.3269, + "num_tokens": 427239537.0, + "step": 1446 + }, + { + "epoch": 1.7723378212974297, + "grad_norm": 0.2057882845401764, + "learning_rate": 2.276643990929705e-05, + "loss": 0.3262, + "num_tokens": 427830403.0, + "step": 1448 + }, + { + "epoch": 1.7747858017135862, + "grad_norm": 0.19421933591365814, + "learning_rate": 2.272108843537415e-05, + "loss": 0.327, + "num_tokens": 428446453.0, + "step": 1450 + }, + { + "epoch": 1.777233782129743, + "grad_norm": 0.19620618224143982, + "learning_rate": 2.267573696145125e-05, + "loss": 0.3226, + "num_tokens": 429059159.0, + "step": 1452 + }, + { + "epoch": 1.7796817625458996, + "grad_norm": 0.18654832243919373, + "learning_rate": 2.2630385487528348e-05, + "loss": 0.3256, + "num_tokens": 429673711.0, + "step": 1454 + }, + { + "epoch": 1.7821297429620562, + "grad_norm": 0.1963110715150833, + "learning_rate": 2.2585034013605444e-05, + "loss": 0.3102, + "num_tokens": 430239528.0, + "step": 1456 + }, + { + "epoch": 1.784577723378213, + "grad_norm": 0.191807359457016, + "learning_rate": 2.253968253968254e-05, + "loss": 0.3173, + "num_tokens": 430837471.0, + "step": 1458 + }, + { + "epoch": 1.7870257037943698, + "grad_norm": 0.1968986988067627, + "learning_rate": 2.2494331065759637e-05, + "loss": 0.3136, + "num_tokens": 431441012.0, + "step": 1460 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 0.21570736169815063, + "learning_rate": 2.2448979591836737e-05, + "loss": 0.3364, + "num_tokens": 432042964.0, + "step": 1462 + }, + { + "epoch": 1.791921664626683, + "grad_norm": 0.19731880724430084, + "learning_rate": 2.2403628117913834e-05, + "loss": 0.3276, + "num_tokens": 432654843.0, + "step": 1464 + }, + { + "epoch": 1.7943696450428397, + "grad_norm": 0.2002767026424408, + "learning_rate": 2.235827664399093e-05, + "loss": 0.3148, + "num_tokens": 433233846.0, + "step": 1466 + }, + { + "epoch": 1.7968176254589965, + "grad_norm": 0.21068984270095825, + "learning_rate": 2.2312925170068027e-05, + "loss": 0.3192, + "num_tokens": 433809551.0, + "step": 1468 + }, + { + "epoch": 1.799265605875153, + "grad_norm": 0.18816709518432617, + "learning_rate": 2.2267573696145123e-05, + "loss": 0.3117, + "num_tokens": 434414308.0, + "step": 1470 + }, + { + "epoch": 1.8017135862913096, + "grad_norm": 0.19244688749313354, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.3167, + "num_tokens": 435020164.0, + "step": 1472 + }, + { + "epoch": 1.8041615667074664, + "grad_norm": 0.19031742215156555, + "learning_rate": 2.2176870748299323e-05, + "loss": 0.3206, + "num_tokens": 435605245.0, + "step": 1474 + }, + { + "epoch": 1.806609547123623, + "grad_norm": 0.20432260632514954, + "learning_rate": 2.213151927437642e-05, + "loss": 0.3155, + "num_tokens": 436186660.0, + "step": 1476 + }, + { + "epoch": 1.8090575275397796, + "grad_norm": 0.1968659907579422, + "learning_rate": 2.2086167800453516e-05, + "loss": 0.315, + "num_tokens": 436790859.0, + "step": 1478 + }, + { + "epoch": 1.8115055079559363, + "grad_norm": 0.20364122092723846, + "learning_rate": 2.2040816326530613e-05, + "loss": 0.3252, + "num_tokens": 437359974.0, + "step": 1480 + }, + { + "epoch": 1.8139534883720931, + "grad_norm": 0.21255619823932648, + "learning_rate": 2.1995464852607713e-05, + "loss": 0.313, + "num_tokens": 437955768.0, + "step": 1482 + }, + { + "epoch": 1.8164014687882497, + "grad_norm": 0.23566021025180817, + "learning_rate": 2.195011337868481e-05, + "loss": 0.3224, + "num_tokens": 438555223.0, + "step": 1484 + }, + { + "epoch": 1.8188494492044063, + "grad_norm": 0.18237993121147156, + "learning_rate": 2.1904761904761906e-05, + "loss": 0.3201, + "num_tokens": 439148244.0, + "step": 1486 + }, + { + "epoch": 1.821297429620563, + "grad_norm": 0.20443126559257507, + "learning_rate": 2.1859410430839002e-05, + "loss": 0.3123, + "num_tokens": 439759887.0, + "step": 1488 + }, + { + "epoch": 1.8237454100367199, + "grad_norm": 0.18899664282798767, + "learning_rate": 2.18140589569161e-05, + "loss": 0.3036, + "num_tokens": 440362617.0, + "step": 1490 + }, + { + "epoch": 1.8261933904528764, + "grad_norm": 0.1955849975347519, + "learning_rate": 2.17687074829932e-05, + "loss": 0.3156, + "num_tokens": 440952427.0, + "step": 1492 + }, + { + "epoch": 1.828641370869033, + "grad_norm": 0.19111017882823944, + "learning_rate": 2.1723356009070295e-05, + "loss": 0.3217, + "num_tokens": 441559643.0, + "step": 1494 + }, + { + "epoch": 1.8310893512851898, + "grad_norm": 0.19410696625709534, + "learning_rate": 2.1678004535147392e-05, + "loss": 0.3224, + "num_tokens": 442151992.0, + "step": 1496 + }, + { + "epoch": 1.8335373317013464, + "grad_norm": 0.1928848773241043, + "learning_rate": 2.1632653061224492e-05, + "loss": 0.314, + "num_tokens": 442736825.0, + "step": 1498 + }, + { + "epoch": 1.835985312117503, + "grad_norm": 0.2009039670228958, + "learning_rate": 2.158730158730159e-05, + "loss": 0.318, + "num_tokens": 443350122.0, + "step": 1500 + }, + { + "epoch": 1.8384332925336597, + "grad_norm": 0.1868344396352768, + "learning_rate": 2.1541950113378685e-05, + "loss": 0.3163, + "num_tokens": 443921129.0, + "step": 1502 + }, + { + "epoch": 1.8408812729498165, + "grad_norm": 0.1903611421585083, + "learning_rate": 2.1496598639455785e-05, + "loss": 0.3052, + "num_tokens": 444501391.0, + "step": 1504 + }, + { + "epoch": 1.843329253365973, + "grad_norm": 0.19270755350589752, + "learning_rate": 2.145124716553288e-05, + "loss": 0.3176, + "num_tokens": 445096945.0, + "step": 1506 + }, + { + "epoch": 1.8457772337821297, + "grad_norm": 0.19637158513069153, + "learning_rate": 2.1405895691609978e-05, + "loss": 0.3123, + "num_tokens": 445709301.0, + "step": 1508 + }, + { + "epoch": 1.8482252141982864, + "grad_norm": 0.19586153328418732, + "learning_rate": 2.1360544217687075e-05, + "loss": 0.3274, + "num_tokens": 446286266.0, + "step": 1510 + }, + { + "epoch": 1.8506731946144432, + "grad_norm": 0.18815799057483673, + "learning_rate": 2.1315192743764175e-05, + "loss": 0.3003, + "num_tokens": 446840279.0, + "step": 1512 + }, + { + "epoch": 1.8531211750305998, + "grad_norm": 0.1773037314414978, + "learning_rate": 2.126984126984127e-05, + "loss": 0.3195, + "num_tokens": 447457483.0, + "step": 1514 + }, + { + "epoch": 1.8555691554467564, + "grad_norm": 0.19612428545951843, + "learning_rate": 2.1224489795918368e-05, + "loss": 0.2979, + "num_tokens": 448021563.0, + "step": 1516 + }, + { + "epoch": 1.8580171358629132, + "grad_norm": 0.1957685798406601, + "learning_rate": 2.1179138321995464e-05, + "loss": 0.3061, + "num_tokens": 448586869.0, + "step": 1518 + }, + { + "epoch": 1.8604651162790697, + "grad_norm": 0.19157424569129944, + "learning_rate": 2.113378684807256e-05, + "loss": 0.3187, + "num_tokens": 449141403.0, + "step": 1520 + }, + { + "epoch": 1.8629130966952263, + "grad_norm": 0.19342470169067383, + "learning_rate": 2.108843537414966e-05, + "loss": 0.3125, + "num_tokens": 449731655.0, + "step": 1522 + }, + { + "epoch": 1.865361077111383, + "grad_norm": 0.19065633416175842, + "learning_rate": 2.104308390022676e-05, + "loss": 0.3265, + "num_tokens": 450331437.0, + "step": 1524 + }, + { + "epoch": 1.86780905752754, + "grad_norm": 0.1888921856880188, + "learning_rate": 2.0997732426303857e-05, + "loss": 0.3306, + "num_tokens": 450949263.0, + "step": 1526 + }, + { + "epoch": 1.8702570379436965, + "grad_norm": 0.1860380321741104, + "learning_rate": 2.0952380952380954e-05, + "loss": 0.3195, + "num_tokens": 451562816.0, + "step": 1528 + }, + { + "epoch": 1.872705018359853, + "grad_norm": 0.1864960640668869, + "learning_rate": 2.090702947845805e-05, + "loss": 0.3104, + "num_tokens": 452147375.0, + "step": 1530 + }, + { + "epoch": 1.8751529987760098, + "grad_norm": 0.22677704691886902, + "learning_rate": 2.0861678004535147e-05, + "loss": 0.3318, + "num_tokens": 452751522.0, + "step": 1532 + }, + { + "epoch": 1.8776009791921666, + "grad_norm": 0.18683534860610962, + "learning_rate": 2.0816326530612247e-05, + "loss": 0.3074, + "num_tokens": 453353463.0, + "step": 1534 + }, + { + "epoch": 1.880048959608323, + "grad_norm": 0.1986975520849228, + "learning_rate": 2.0770975056689343e-05, + "loss": 0.3164, + "num_tokens": 453920518.0, + "step": 1536 + }, + { + "epoch": 1.8824969400244798, + "grad_norm": 0.2015327662229538, + "learning_rate": 2.072562358276644e-05, + "loss": 0.3276, + "num_tokens": 454526302.0, + "step": 1538 + }, + { + "epoch": 1.8849449204406366, + "grad_norm": 0.19301684200763702, + "learning_rate": 2.0680272108843536e-05, + "loss": 0.3155, + "num_tokens": 455106980.0, + "step": 1540 + }, + { + "epoch": 1.8873929008567931, + "grad_norm": 0.18719364702701569, + "learning_rate": 2.0634920634920636e-05, + "loss": 0.3141, + "num_tokens": 455714265.0, + "step": 1542 + }, + { + "epoch": 1.8898408812729497, + "grad_norm": 0.18764334917068481, + "learning_rate": 2.0589569160997733e-05, + "loss": 0.3129, + "num_tokens": 456305393.0, + "step": 1544 + }, + { + "epoch": 1.8922888616891065, + "grad_norm": 0.19276875257492065, + "learning_rate": 2.0544217687074833e-05, + "loss": 0.3203, + "num_tokens": 456904664.0, + "step": 1546 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.202528178691864, + "learning_rate": 2.049886621315193e-05, + "loss": 0.3246, + "num_tokens": 457465974.0, + "step": 1548 + }, + { + "epoch": 1.8971848225214198, + "grad_norm": 0.18794238567352295, + "learning_rate": 2.0453514739229026e-05, + "loss": 0.3174, + "num_tokens": 458056695.0, + "step": 1550 + }, + { + "epoch": 1.8996328029375764, + "grad_norm": 0.18893173336982727, + "learning_rate": 2.0408163265306123e-05, + "loss": 0.3192, + "num_tokens": 458648053.0, + "step": 1552 + }, + { + "epoch": 1.9020807833537332, + "grad_norm": 0.1958337128162384, + "learning_rate": 2.0362811791383222e-05, + "loss": 0.321, + "num_tokens": 459216811.0, + "step": 1554 + }, + { + "epoch": 1.90452876376989, + "grad_norm": 0.18343977630138397, + "learning_rate": 2.031746031746032e-05, + "loss": 0.3256, + "num_tokens": 459842266.0, + "step": 1556 + }, + { + "epoch": 1.9069767441860463, + "grad_norm": 0.1893829107284546, + "learning_rate": 2.0272108843537416e-05, + "loss": 0.3139, + "num_tokens": 460423041.0, + "step": 1558 + }, + { + "epoch": 1.9094247246022031, + "grad_norm": 0.19329939782619476, + "learning_rate": 2.0226757369614512e-05, + "loss": 0.3169, + "num_tokens": 461028921.0, + "step": 1560 + }, + { + "epoch": 1.91187270501836, + "grad_norm": 0.19080796837806702, + "learning_rate": 2.018140589569161e-05, + "loss": 0.3268, + "num_tokens": 461614263.0, + "step": 1562 + }, + { + "epoch": 1.9143206854345165, + "grad_norm": 0.1843099594116211, + "learning_rate": 2.013605442176871e-05, + "loss": 0.3168, + "num_tokens": 462219672.0, + "step": 1564 + }, + { + "epoch": 1.916768665850673, + "grad_norm": 0.19338639080524445, + "learning_rate": 2.0090702947845805e-05, + "loss": 0.3188, + "num_tokens": 462786923.0, + "step": 1566 + }, + { + "epoch": 1.9192166462668299, + "grad_norm": 0.20353542268276215, + "learning_rate": 2.0045351473922905e-05, + "loss": 0.3222, + "num_tokens": 463378442.0, + "step": 1568 + }, + { + "epoch": 1.9216646266829867, + "grad_norm": 0.19092589616775513, + "learning_rate": 2e-05, + "loss": 0.3173, + "num_tokens": 463959340.0, + "step": 1570 + }, + { + "epoch": 1.9241126070991432, + "grad_norm": 0.18475502729415894, + "learning_rate": 1.9954648526077098e-05, + "loss": 0.3201, + "num_tokens": 464550595.0, + "step": 1572 + }, + { + "epoch": 1.9265605875152998, + "grad_norm": 0.197591170668602, + "learning_rate": 1.9909297052154198e-05, + "loss": 0.3057, + "num_tokens": 465127457.0, + "step": 1574 + }, + { + "epoch": 1.9290085679314566, + "grad_norm": 0.19674883782863617, + "learning_rate": 1.9863945578231295e-05, + "loss": 0.3199, + "num_tokens": 465688708.0, + "step": 1576 + }, + { + "epoch": 1.9314565483476134, + "grad_norm": 0.18538399040699005, + "learning_rate": 1.981859410430839e-05, + "loss": 0.3199, + "num_tokens": 466310661.0, + "step": 1578 + }, + { + "epoch": 1.9339045287637697, + "grad_norm": 0.18499000370502472, + "learning_rate": 1.9773242630385488e-05, + "loss": 0.3184, + "num_tokens": 466914190.0, + "step": 1580 + }, + { + "epoch": 1.9363525091799265, + "grad_norm": 0.20361009240150452, + "learning_rate": 1.9727891156462584e-05, + "loss": 0.3266, + "num_tokens": 467542417.0, + "step": 1582 + }, + { + "epoch": 1.9388004895960833, + "grad_norm": 0.1878398358821869, + "learning_rate": 1.9682539682539684e-05, + "loss": 0.3235, + "num_tokens": 468147376.0, + "step": 1584 + }, + { + "epoch": 1.9412484700122399, + "grad_norm": 0.19775955379009247, + "learning_rate": 1.963718820861678e-05, + "loss": 0.3225, + "num_tokens": 468759897.0, + "step": 1586 + }, + { + "epoch": 1.9436964504283964, + "grad_norm": 0.18105393648147583, + "learning_rate": 1.9591836734693877e-05, + "loss": 0.3246, + "num_tokens": 469367956.0, + "step": 1588 + }, + { + "epoch": 1.9461444308445532, + "grad_norm": 0.1867077350616455, + "learning_rate": 1.9546485260770974e-05, + "loss": 0.3245, + "num_tokens": 469959880.0, + "step": 1590 + }, + { + "epoch": 1.94859241126071, + "grad_norm": 0.18449635803699493, + "learning_rate": 1.9501133786848074e-05, + "loss": 0.3112, + "num_tokens": 470552183.0, + "step": 1592 + }, + { + "epoch": 1.9510403916768666, + "grad_norm": 0.19630222022533417, + "learning_rate": 1.945578231292517e-05, + "loss": 0.325, + "num_tokens": 471124861.0, + "step": 1594 + }, + { + "epoch": 1.9534883720930232, + "grad_norm": 0.1801752746105194, + "learning_rate": 1.941043083900227e-05, + "loss": 0.2998, + "num_tokens": 471746645.0, + "step": 1596 + }, + { + "epoch": 1.95593635250918, + "grad_norm": 0.21052420139312744, + "learning_rate": 1.9365079365079367e-05, + "loss": 0.3218, + "num_tokens": 472348999.0, + "step": 1598 + }, + { + "epoch": 1.9583843329253368, + "grad_norm": 0.18748612701892853, + "learning_rate": 1.9319727891156463e-05, + "loss": 0.3219, + "num_tokens": 472943557.0, + "step": 1600 + }, + { + "epoch": 1.960832313341493, + "grad_norm": 0.197321817278862, + "learning_rate": 1.927437641723356e-05, + "loss": 0.3105, + "num_tokens": 473517613.0, + "step": 1602 + }, + { + "epoch": 1.96328029375765, + "grad_norm": 0.19740061461925507, + "learning_rate": 1.922902494331066e-05, + "loss": 0.3206, + "num_tokens": 474128167.0, + "step": 1604 + }, + { + "epoch": 1.9657282741738067, + "grad_norm": 0.1990179419517517, + "learning_rate": 1.9183673469387756e-05, + "loss": 0.3159, + "num_tokens": 474698870.0, + "step": 1606 + }, + { + "epoch": 1.9681762545899633, + "grad_norm": 0.19557151198387146, + "learning_rate": 1.9138321995464853e-05, + "loss": 0.314, + "num_tokens": 475292232.0, + "step": 1608 + }, + { + "epoch": 1.9706242350061198, + "grad_norm": 0.1849379688501358, + "learning_rate": 1.909297052154195e-05, + "loss": 0.3188, + "num_tokens": 475896729.0, + "step": 1610 + }, + { + "epoch": 1.9730722154222766, + "grad_norm": 0.18985800445079803, + "learning_rate": 1.9047619047619046e-05, + "loss": 0.3193, + "num_tokens": 476487140.0, + "step": 1612 + }, + { + "epoch": 1.9755201958384334, + "grad_norm": 0.18395473062992096, + "learning_rate": 1.9002267573696146e-05, + "loss": 0.3199, + "num_tokens": 477092050.0, + "step": 1614 + }, + { + "epoch": 1.97796817625459, + "grad_norm": 0.21562960743904114, + "learning_rate": 1.8956916099773246e-05, + "loss": 0.3177, + "num_tokens": 477694927.0, + "step": 1616 + }, + { + "epoch": 1.9804161566707466, + "grad_norm": 0.1928740292787552, + "learning_rate": 1.8911564625850343e-05, + "loss": 0.3128, + "num_tokens": 478253698.0, + "step": 1618 + }, + { + "epoch": 1.9828641370869033, + "grad_norm": 0.19650574028491974, + "learning_rate": 1.886621315192744e-05, + "loss": 0.3185, + "num_tokens": 478839057.0, + "step": 1620 + }, + { + "epoch": 1.9853121175030601, + "grad_norm": 0.18753434717655182, + "learning_rate": 1.8820861678004536e-05, + "loss": 0.3146, + "num_tokens": 479421346.0, + "step": 1622 + }, + { + "epoch": 1.9877600979192165, + "grad_norm": 0.19200855493545532, + "learning_rate": 1.8775510204081632e-05, + "loss": 0.3024, + "num_tokens": 479966942.0, + "step": 1624 + }, + { + "epoch": 1.9902080783353733, + "grad_norm": 0.18329738080501556, + "learning_rate": 1.8730158730158732e-05, + "loss": 0.3099, + "num_tokens": 480573283.0, + "step": 1626 + }, + { + "epoch": 1.99265605875153, + "grad_norm": 0.18802450597286224, + "learning_rate": 1.868480725623583e-05, + "loss": 0.3072, + "num_tokens": 481160679.0, + "step": 1628 + }, + { + "epoch": 1.9951040391676866, + "grad_norm": 0.17617082595825195, + "learning_rate": 1.8639455782312925e-05, + "loss": 0.3067, + "num_tokens": 481747619.0, + "step": 1630 + }, + { + "epoch": 1.9975520195838432, + "grad_norm": 0.18958795070648193, + "learning_rate": 1.8594104308390022e-05, + "loss": 0.3078, + "num_tokens": 482345468.0, + "step": 1632 + }, + { + "epoch": 2.0, + "grad_norm": 0.19223837554454803, + "learning_rate": 1.8548752834467122e-05, + "loss": 0.3041, + "num_tokens": 482921493.0, + "step": 1634 + }, + { + "epoch": 2.002447980416157, + "grad_norm": 0.228647843003273, + "learning_rate": 1.8503401360544218e-05, + "loss": 0.2359, + "num_tokens": 483515476.0, + "step": 1636 + }, + { + "epoch": 2.004895960832313, + "grad_norm": 0.2760069668292999, + "learning_rate": 1.8458049886621315e-05, + "loss": 0.2293, + "num_tokens": 484097617.0, + "step": 1638 + }, + { + "epoch": 2.00734394124847, + "grad_norm": 0.24974100291728973, + "learning_rate": 1.8412698412698415e-05, + "loss": 0.2281, + "num_tokens": 484698438.0, + "step": 1640 + }, + { + "epoch": 2.0097919216646267, + "grad_norm": 0.21910329163074493, + "learning_rate": 1.836734693877551e-05, + "loss": 0.2384, + "num_tokens": 485322172.0, + "step": 1642 + }, + { + "epoch": 2.0122399020807835, + "grad_norm": 0.21504969894886017, + "learning_rate": 1.8321995464852608e-05, + "loss": 0.2275, + "num_tokens": 485913634.0, + "step": 1644 + }, + { + "epoch": 2.01468788249694, + "grad_norm": 0.21915705502033234, + "learning_rate": 1.8276643990929708e-05, + "loss": 0.2199, + "num_tokens": 486494338.0, + "step": 1646 + }, + { + "epoch": 2.0171358629130967, + "grad_norm": 0.23618166148662567, + "learning_rate": 1.8231292517006804e-05, + "loss": 0.2199, + "num_tokens": 487063240.0, + "step": 1648 + }, + { + "epoch": 2.0195838433292534, + "grad_norm": 0.21529266238212585, + "learning_rate": 1.81859410430839e-05, + "loss": 0.2243, + "num_tokens": 487665600.0, + "step": 1650 + }, + { + "epoch": 2.0220318237454102, + "grad_norm": 0.21565918624401093, + "learning_rate": 1.8140589569160997e-05, + "loss": 0.2254, + "num_tokens": 488283259.0, + "step": 1652 + }, + { + "epoch": 2.0244798041615666, + "grad_norm": 0.20759893953800201, + "learning_rate": 1.8095238095238094e-05, + "loss": 0.2197, + "num_tokens": 488889864.0, + "step": 1654 + }, + { + "epoch": 2.0269277845777234, + "grad_norm": 0.2181428223848343, + "learning_rate": 1.8049886621315194e-05, + "loss": 0.2197, + "num_tokens": 489460036.0, + "step": 1656 + }, + { + "epoch": 2.02937576499388, + "grad_norm": 0.21928352117538452, + "learning_rate": 1.800453514739229e-05, + "loss": 0.2252, + "num_tokens": 490051389.0, + "step": 1658 + }, + { + "epoch": 2.0318237454100365, + "grad_norm": 0.21130070090293884, + "learning_rate": 1.7959183673469387e-05, + "loss": 0.2221, + "num_tokens": 490655180.0, + "step": 1660 + }, + { + "epoch": 2.0342717258261933, + "grad_norm": 0.20319539308547974, + "learning_rate": 1.7913832199546487e-05, + "loss": 0.2132, + "num_tokens": 491227428.0, + "step": 1662 + }, + { + "epoch": 2.03671970624235, + "grad_norm": 0.2168501764535904, + "learning_rate": 1.7868480725623583e-05, + "loss": 0.2145, + "num_tokens": 491798533.0, + "step": 1664 + }, + { + "epoch": 2.039167686658507, + "grad_norm": 0.21746525168418884, + "learning_rate": 1.7823129251700683e-05, + "loss": 0.2267, + "num_tokens": 492371682.0, + "step": 1666 + }, + { + "epoch": 2.0416156670746632, + "grad_norm": 0.22001875936985016, + "learning_rate": 1.777777777777778e-05, + "loss": 0.2315, + "num_tokens": 492960200.0, + "step": 1668 + }, + { + "epoch": 2.04406364749082, + "grad_norm": 0.2007884979248047, + "learning_rate": 1.7732426303854877e-05, + "loss": 0.224, + "num_tokens": 493564120.0, + "step": 1670 + }, + { + "epoch": 2.046511627906977, + "grad_norm": 0.20926231145858765, + "learning_rate": 1.7687074829931973e-05, + "loss": 0.2176, + "num_tokens": 494172197.0, + "step": 1672 + }, + { + "epoch": 2.0489596083231336, + "grad_norm": 0.20303648710250854, + "learning_rate": 1.764172335600907e-05, + "loss": 0.2172, + "num_tokens": 494742516.0, + "step": 1674 + }, + { + "epoch": 2.05140758873929, + "grad_norm": 0.20408904552459717, + "learning_rate": 1.759637188208617e-05, + "loss": 0.2196, + "num_tokens": 495332436.0, + "step": 1676 + }, + { + "epoch": 2.0538555691554468, + "grad_norm": 0.211123988032341, + "learning_rate": 1.7551020408163266e-05, + "loss": 0.2199, + "num_tokens": 495921640.0, + "step": 1678 + }, + { + "epoch": 2.0563035495716036, + "grad_norm": 0.20674318075180054, + "learning_rate": 1.7505668934240363e-05, + "loss": 0.2209, + "num_tokens": 496515340.0, + "step": 1680 + }, + { + "epoch": 2.05875152998776, + "grad_norm": 0.19681598246097565, + "learning_rate": 1.746031746031746e-05, + "loss": 0.2214, + "num_tokens": 497148845.0, + "step": 1682 + }, + { + "epoch": 2.0611995104039167, + "grad_norm": 0.19517882168293, + "learning_rate": 1.7414965986394556e-05, + "loss": 0.2157, + "num_tokens": 497737297.0, + "step": 1684 + }, + { + "epoch": 2.0636474908200735, + "grad_norm": 0.20753493905067444, + "learning_rate": 1.736961451247166e-05, + "loss": 0.2211, + "num_tokens": 498326487.0, + "step": 1686 + }, + { + "epoch": 2.0660954712362303, + "grad_norm": 0.21051937341690063, + "learning_rate": 1.7324263038548756e-05, + "loss": 0.2267, + "num_tokens": 498969162.0, + "step": 1688 + }, + { + "epoch": 2.0685434516523866, + "grad_norm": 0.21348614990711212, + "learning_rate": 1.7278911564625852e-05, + "loss": 0.2267, + "num_tokens": 499568032.0, + "step": 1690 + }, + { + "epoch": 2.0709914320685434, + "grad_norm": 0.20301321148872375, + "learning_rate": 1.723356009070295e-05, + "loss": 0.2168, + "num_tokens": 500144933.0, + "step": 1692 + }, + { + "epoch": 2.0734394124847, + "grad_norm": 0.21038226783275604, + "learning_rate": 1.7188208616780045e-05, + "loss": 0.2207, + "num_tokens": 500725291.0, + "step": 1694 + }, + { + "epoch": 2.075887392900857, + "grad_norm": 0.21632690727710724, + "learning_rate": 1.7142857142857145e-05, + "loss": 0.2259, + "num_tokens": 501303892.0, + "step": 1696 + }, + { + "epoch": 2.0783353733170133, + "grad_norm": 0.21102431416511536, + "learning_rate": 1.7097505668934242e-05, + "loss": 0.217, + "num_tokens": 501902181.0, + "step": 1698 + }, + { + "epoch": 2.08078335373317, + "grad_norm": 0.20315778255462646, + "learning_rate": 1.705215419501134e-05, + "loss": 0.2143, + "num_tokens": 502489081.0, + "step": 1700 + }, + { + "epoch": 2.083231334149327, + "grad_norm": 0.20665831863880157, + "learning_rate": 1.7006802721088435e-05, + "loss": 0.224, + "num_tokens": 503067488.0, + "step": 1702 + }, + { + "epoch": 2.0856793145654833, + "grad_norm": 0.20425787568092346, + "learning_rate": 1.696145124716553e-05, + "loss": 0.2181, + "num_tokens": 503682463.0, + "step": 1704 + }, + { + "epoch": 2.08812729498164, + "grad_norm": 0.2112618386745453, + "learning_rate": 1.691609977324263e-05, + "loss": 0.2147, + "num_tokens": 504260385.0, + "step": 1706 + }, + { + "epoch": 2.090575275397797, + "grad_norm": 0.21806055307388306, + "learning_rate": 1.6870748299319728e-05, + "loss": 0.2121, + "num_tokens": 504818016.0, + "step": 1708 + }, + { + "epoch": 2.0930232558139537, + "grad_norm": 0.20046468079090118, + "learning_rate": 1.6825396825396828e-05, + "loss": 0.2255, + "num_tokens": 505420156.0, + "step": 1710 + }, + { + "epoch": 2.09547123623011, + "grad_norm": 0.21384507417678833, + "learning_rate": 1.6780045351473924e-05, + "loss": 0.2231, + "num_tokens": 506003562.0, + "step": 1712 + }, + { + "epoch": 2.097919216646267, + "grad_norm": 0.20575720071792603, + "learning_rate": 1.673469387755102e-05, + "loss": 0.2197, + "num_tokens": 506569857.0, + "step": 1714 + }, + { + "epoch": 2.1003671970624236, + "grad_norm": 0.21101944148540497, + "learning_rate": 1.668934240362812e-05, + "loss": 0.2221, + "num_tokens": 507152804.0, + "step": 1716 + }, + { + "epoch": 2.1028151774785804, + "grad_norm": 0.19983737170696259, + "learning_rate": 1.6643990929705217e-05, + "loss": 0.2179, + "num_tokens": 507745676.0, + "step": 1718 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.21477234363555908, + "learning_rate": 1.6598639455782314e-05, + "loss": 0.2244, + "num_tokens": 508337927.0, + "step": 1720 + }, + { + "epoch": 2.1077111383108935, + "grad_norm": 0.2133484184741974, + "learning_rate": 1.655328798185941e-05, + "loss": 0.2268, + "num_tokens": 508944839.0, + "step": 1722 + }, + { + "epoch": 2.1101591187270503, + "grad_norm": 0.2040819674730301, + "learning_rate": 1.6507936507936507e-05, + "loss": 0.2221, + "num_tokens": 509571003.0, + "step": 1724 + }, + { + "epoch": 2.1126070991432067, + "grad_norm": 0.2098955512046814, + "learning_rate": 1.6462585034013607e-05, + "loss": 0.2232, + "num_tokens": 510170203.0, + "step": 1726 + }, + { + "epoch": 2.1150550795593634, + "grad_norm": 0.20623375475406647, + "learning_rate": 1.6417233560090704e-05, + "loss": 0.2232, + "num_tokens": 510765039.0, + "step": 1728 + }, + { + "epoch": 2.1175030599755202, + "grad_norm": 0.20884713530540466, + "learning_rate": 1.63718820861678e-05, + "loss": 0.2144, + "num_tokens": 511358514.0, + "step": 1730 + }, + { + "epoch": 2.119951040391677, + "grad_norm": 0.21065747737884521, + "learning_rate": 1.6326530612244897e-05, + "loss": 0.2249, + "num_tokens": 511957548.0, + "step": 1732 + }, + { + "epoch": 2.1223990208078334, + "grad_norm": 0.2145635038614273, + "learning_rate": 1.6281179138321997e-05, + "loss": 0.2232, + "num_tokens": 512569250.0, + "step": 1734 + }, + { + "epoch": 2.12484700122399, + "grad_norm": 0.20901666581630707, + "learning_rate": 1.6235827664399093e-05, + "loss": 0.2171, + "num_tokens": 513164306.0, + "step": 1736 + }, + { + "epoch": 2.127294981640147, + "grad_norm": 0.2056090235710144, + "learning_rate": 1.6190476190476193e-05, + "loss": 0.2204, + "num_tokens": 513778079.0, + "step": 1738 + }, + { + "epoch": 2.1297429620563038, + "grad_norm": 0.2049223780632019, + "learning_rate": 1.614512471655329e-05, + "loss": 0.2194, + "num_tokens": 514377473.0, + "step": 1740 + }, + { + "epoch": 2.13219094247246, + "grad_norm": 0.2132883071899414, + "learning_rate": 1.6099773242630386e-05, + "loss": 0.2248, + "num_tokens": 514945324.0, + "step": 1742 + }, + { + "epoch": 2.134638922888617, + "grad_norm": 0.20702043175697327, + "learning_rate": 1.6054421768707483e-05, + "loss": 0.2241, + "num_tokens": 515541771.0, + "step": 1744 + }, + { + "epoch": 2.1370869033047737, + "grad_norm": 0.19732582569122314, + "learning_rate": 1.6009070294784583e-05, + "loss": 0.2168, + "num_tokens": 516132145.0, + "step": 1746 + }, + { + "epoch": 2.13953488372093, + "grad_norm": 0.20628586411476135, + "learning_rate": 1.596371882086168e-05, + "loss": 0.2175, + "num_tokens": 516711107.0, + "step": 1748 + }, + { + "epoch": 2.141982864137087, + "grad_norm": 0.21025317907333374, + "learning_rate": 1.5918367346938776e-05, + "loss": 0.2269, + "num_tokens": 517319149.0, + "step": 1750 + }, + { + "epoch": 2.1444308445532436, + "grad_norm": 0.21749946475028992, + "learning_rate": 1.5873015873015872e-05, + "loss": 0.2288, + "num_tokens": 517933087.0, + "step": 1752 + }, + { + "epoch": 2.1468788249694004, + "grad_norm": 0.20515209436416626, + "learning_rate": 1.582766439909297e-05, + "loss": 0.217, + "num_tokens": 518519588.0, + "step": 1754 + }, + { + "epoch": 2.1493268053855568, + "grad_norm": 0.20249415934085846, + "learning_rate": 1.578231292517007e-05, + "loss": 0.2227, + "num_tokens": 519107244.0, + "step": 1756 + }, + { + "epoch": 2.1517747858017136, + "grad_norm": 0.20989085733890533, + "learning_rate": 1.573696145124717e-05, + "loss": 0.2283, + "num_tokens": 519725034.0, + "step": 1758 + }, + { + "epoch": 2.1542227662178703, + "grad_norm": 0.2062552571296692, + "learning_rate": 1.5691609977324265e-05, + "loss": 0.2186, + "num_tokens": 520314131.0, + "step": 1760 + }, + { + "epoch": 2.1566707466340267, + "grad_norm": 0.20996250212192535, + "learning_rate": 1.5646258503401362e-05, + "loss": 0.2221, + "num_tokens": 520914252.0, + "step": 1762 + }, + { + "epoch": 2.1591187270501835, + "grad_norm": 0.20457345247268677, + "learning_rate": 1.560090702947846e-05, + "loss": 0.219, + "num_tokens": 521508450.0, + "step": 1764 + }, + { + "epoch": 2.1615667074663403, + "grad_norm": 0.2074558138847351, + "learning_rate": 1.5555555555555555e-05, + "loss": 0.2187, + "num_tokens": 522084987.0, + "step": 1766 + }, + { + "epoch": 2.164014687882497, + "grad_norm": 0.20682254433631897, + "learning_rate": 1.5510204081632655e-05, + "loss": 0.2229, + "num_tokens": 522670927.0, + "step": 1768 + }, + { + "epoch": 2.1664626682986534, + "grad_norm": 0.20974300801753998, + "learning_rate": 1.546485260770975e-05, + "loss": 0.2245, + "num_tokens": 523281129.0, + "step": 1770 + }, + { + "epoch": 2.16891064871481, + "grad_norm": 0.2181571125984192, + "learning_rate": 1.5419501133786848e-05, + "loss": 0.2308, + "num_tokens": 523856430.0, + "step": 1772 + }, + { + "epoch": 2.171358629130967, + "grad_norm": 0.22432172298431396, + "learning_rate": 1.5374149659863945e-05, + "loss": 0.2212, + "num_tokens": 524441523.0, + "step": 1774 + }, + { + "epoch": 2.173806609547124, + "grad_norm": 0.20023378729820251, + "learning_rate": 1.5328798185941044e-05, + "loss": 0.2145, + "num_tokens": 525037294.0, + "step": 1776 + }, + { + "epoch": 2.17625458996328, + "grad_norm": 0.207432821393013, + "learning_rate": 1.528344671201814e-05, + "loss": 0.2234, + "num_tokens": 525629137.0, + "step": 1778 + }, + { + "epoch": 2.178702570379437, + "grad_norm": 0.2099551111459732, + "learning_rate": 1.5238095238095241e-05, + "loss": 0.2222, + "num_tokens": 526213621.0, + "step": 1780 + }, + { + "epoch": 2.1811505507955937, + "grad_norm": 0.2081080675125122, + "learning_rate": 1.5192743764172338e-05, + "loss": 0.2268, + "num_tokens": 526818522.0, + "step": 1782 + }, + { + "epoch": 2.18359853121175, + "grad_norm": 0.20909056067466736, + "learning_rate": 1.5147392290249434e-05, + "loss": 0.2201, + "num_tokens": 527394298.0, + "step": 1784 + }, + { + "epoch": 2.186046511627907, + "grad_norm": 0.20970280468463898, + "learning_rate": 1.5102040816326532e-05, + "loss": 0.2194, + "num_tokens": 527969454.0, + "step": 1786 + }, + { + "epoch": 2.1884944920440637, + "grad_norm": 0.2134752869606018, + "learning_rate": 1.5056689342403629e-05, + "loss": 0.2204, + "num_tokens": 528545888.0, + "step": 1788 + }, + { + "epoch": 2.1909424724602204, + "grad_norm": 0.21386772394180298, + "learning_rate": 1.5011337868480727e-05, + "loss": 0.2181, + "num_tokens": 529131315.0, + "step": 1790 + }, + { + "epoch": 2.193390452876377, + "grad_norm": 0.1974865198135376, + "learning_rate": 1.4965986394557824e-05, + "loss": 0.2101, + "num_tokens": 529723367.0, + "step": 1792 + }, + { + "epoch": 2.1958384332925336, + "grad_norm": 0.21619471907615662, + "learning_rate": 1.4920634920634922e-05, + "loss": 0.2305, + "num_tokens": 530327587.0, + "step": 1794 + }, + { + "epoch": 2.1982864137086904, + "grad_norm": 0.20477572083473206, + "learning_rate": 1.4875283446712018e-05, + "loss": 0.2124, + "num_tokens": 530889090.0, + "step": 1796 + }, + { + "epoch": 2.200734394124847, + "grad_norm": 0.20885427296161652, + "learning_rate": 1.4829931972789115e-05, + "loss": 0.2193, + "num_tokens": 531493483.0, + "step": 1798 + }, + { + "epoch": 2.2031823745410035, + "grad_norm": 0.22199825942516327, + "learning_rate": 1.4784580498866213e-05, + "loss": 0.2195, + "num_tokens": 532080460.0, + "step": 1800 + }, + { + "epoch": 2.2056303549571603, + "grad_norm": 0.2270926684141159, + "learning_rate": 1.473922902494331e-05, + "loss": 0.2188, + "num_tokens": 532657667.0, + "step": 1802 + }, + { + "epoch": 2.208078335373317, + "grad_norm": 0.2040078490972519, + "learning_rate": 1.469387755102041e-05, + "loss": 0.2193, + "num_tokens": 533263711.0, + "step": 1804 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 0.20712848007678986, + "learning_rate": 1.4648526077097508e-05, + "loss": 0.2168, + "num_tokens": 533838662.0, + "step": 1806 + }, + { + "epoch": 2.2129742962056302, + "grad_norm": 0.2041410207748413, + "learning_rate": 1.4603174603174605e-05, + "loss": 0.2133, + "num_tokens": 534415077.0, + "step": 1808 + }, + { + "epoch": 2.215422276621787, + "grad_norm": 0.20708684623241425, + "learning_rate": 1.4557823129251703e-05, + "loss": 0.2239, + "num_tokens": 535000584.0, + "step": 1810 + }, + { + "epoch": 2.217870257037944, + "grad_norm": 0.207489475607872, + "learning_rate": 1.45124716553288e-05, + "loss": 0.2198, + "num_tokens": 535613712.0, + "step": 1812 + }, + { + "epoch": 2.2203182374541, + "grad_norm": 0.200792133808136, + "learning_rate": 1.4467120181405896e-05, + "loss": 0.2229, + "num_tokens": 536192857.0, + "step": 1814 + }, + { + "epoch": 2.222766217870257, + "grad_norm": 0.20014187693595886, + "learning_rate": 1.4421768707482994e-05, + "loss": 0.2107, + "num_tokens": 536773075.0, + "step": 1816 + }, + { + "epoch": 2.2252141982864138, + "grad_norm": 0.20676304399967194, + "learning_rate": 1.437641723356009e-05, + "loss": 0.2228, + "num_tokens": 537353506.0, + "step": 1818 + }, + { + "epoch": 2.2276621787025706, + "grad_norm": 0.20207972824573517, + "learning_rate": 1.4331065759637189e-05, + "loss": 0.2232, + "num_tokens": 537936418.0, + "step": 1820 + }, + { + "epoch": 2.230110159118727, + "grad_norm": 0.20188836753368378, + "learning_rate": 1.4285714285714285e-05, + "loss": 0.2175, + "num_tokens": 538520632.0, + "step": 1822 + }, + { + "epoch": 2.2325581395348837, + "grad_norm": 0.20550166070461273, + "learning_rate": 1.4240362811791384e-05, + "loss": 0.2137, + "num_tokens": 539092641.0, + "step": 1824 + }, + { + "epoch": 2.2350061199510405, + "grad_norm": 0.21152400970458984, + "learning_rate": 1.419501133786848e-05, + "loss": 0.2249, + "num_tokens": 539694485.0, + "step": 1826 + }, + { + "epoch": 2.237454100367197, + "grad_norm": 0.21759860217571259, + "learning_rate": 1.414965986394558e-05, + "loss": 0.2317, + "num_tokens": 540263920.0, + "step": 1828 + }, + { + "epoch": 2.2399020807833536, + "grad_norm": 0.21021634340286255, + "learning_rate": 1.4104308390022677e-05, + "loss": 0.2194, + "num_tokens": 540856830.0, + "step": 1830 + }, + { + "epoch": 2.2423500611995104, + "grad_norm": 0.20810818672180176, + "learning_rate": 1.4058956916099775e-05, + "loss": 0.221, + "num_tokens": 541436762.0, + "step": 1832 + }, + { + "epoch": 2.244798041615667, + "grad_norm": 0.2109844833612442, + "learning_rate": 1.4013605442176872e-05, + "loss": 0.2243, + "num_tokens": 541995364.0, + "step": 1834 + }, + { + "epoch": 2.2472460220318236, + "grad_norm": 0.21267253160476685, + "learning_rate": 1.396825396825397e-05, + "loss": 0.2255, + "num_tokens": 542600855.0, + "step": 1836 + }, + { + "epoch": 2.2496940024479803, + "grad_norm": 0.20551930367946625, + "learning_rate": 1.3922902494331066e-05, + "loss": 0.2176, + "num_tokens": 543188935.0, + "step": 1838 + }, + { + "epoch": 2.252141982864137, + "grad_norm": 0.20995569229125977, + "learning_rate": 1.3877551020408165e-05, + "loss": 0.2248, + "num_tokens": 543784382.0, + "step": 1840 + }, + { + "epoch": 2.254589963280294, + "grad_norm": 0.19542281329631805, + "learning_rate": 1.3832199546485261e-05, + "loss": 0.2152, + "num_tokens": 544384073.0, + "step": 1842 + }, + { + "epoch": 2.2570379436964503, + "grad_norm": 0.20019961893558502, + "learning_rate": 1.3786848072562358e-05, + "loss": 0.2238, + "num_tokens": 544990218.0, + "step": 1844 + }, + { + "epoch": 2.259485924112607, + "grad_norm": 0.19489425420761108, + "learning_rate": 1.3741496598639456e-05, + "loss": 0.23, + "num_tokens": 545628639.0, + "step": 1846 + }, + { + "epoch": 2.261933904528764, + "grad_norm": 0.20234620571136475, + "learning_rate": 1.3696145124716552e-05, + "loss": 0.2136, + "num_tokens": 546211376.0, + "step": 1848 + }, + { + "epoch": 2.26438188494492, + "grad_norm": 0.20311376452445984, + "learning_rate": 1.365079365079365e-05, + "loss": 0.2227, + "num_tokens": 546803405.0, + "step": 1850 + }, + { + "epoch": 2.266829865361077, + "grad_norm": 0.20581483840942383, + "learning_rate": 1.360544217687075e-05, + "loss": 0.2245, + "num_tokens": 547440699.0, + "step": 1852 + }, + { + "epoch": 2.269277845777234, + "grad_norm": 0.19781197607517242, + "learning_rate": 1.3560090702947847e-05, + "loss": 0.2168, + "num_tokens": 548043840.0, + "step": 1854 + }, + { + "epoch": 2.2717258261933906, + "grad_norm": 0.2047962099313736, + "learning_rate": 1.3514739229024945e-05, + "loss": 0.2157, + "num_tokens": 548627311.0, + "step": 1856 + }, + { + "epoch": 2.274173806609547, + "grad_norm": 0.203412726521492, + "learning_rate": 1.3469387755102042e-05, + "loss": 0.2167, + "num_tokens": 549226322.0, + "step": 1858 + }, + { + "epoch": 2.2766217870257037, + "grad_norm": 0.20610330998897552, + "learning_rate": 1.3424036281179139e-05, + "loss": 0.2197, + "num_tokens": 549818583.0, + "step": 1860 + }, + { + "epoch": 2.2790697674418605, + "grad_norm": 0.20106534659862518, + "learning_rate": 1.3378684807256237e-05, + "loss": 0.2169, + "num_tokens": 550418803.0, + "step": 1862 + }, + { + "epoch": 2.2815177478580173, + "grad_norm": 0.2146470844745636, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.2199, + "num_tokens": 551009193.0, + "step": 1864 + }, + { + "epoch": 2.2839657282741737, + "grad_norm": 0.2034270465373993, + "learning_rate": 1.3287981859410432e-05, + "loss": 0.226, + "num_tokens": 551620628.0, + "step": 1866 + }, + { + "epoch": 2.2864137086903304, + "grad_norm": 0.1988794505596161, + "learning_rate": 1.3242630385487528e-05, + "loss": 0.2236, + "num_tokens": 552216326.0, + "step": 1868 + }, + { + "epoch": 2.2888616891064872, + "grad_norm": 0.21192826330661774, + "learning_rate": 1.3197278911564626e-05, + "loss": 0.2173, + "num_tokens": 552767919.0, + "step": 1870 + }, + { + "epoch": 2.2913096695226436, + "grad_norm": 0.20824791491031647, + "learning_rate": 1.3151927437641723e-05, + "loss": 0.2242, + "num_tokens": 553359740.0, + "step": 1872 + }, + { + "epoch": 2.2937576499388004, + "grad_norm": 0.20938055217266083, + "learning_rate": 1.310657596371882e-05, + "loss": 0.2193, + "num_tokens": 553929560.0, + "step": 1874 + }, + { + "epoch": 2.296205630354957, + "grad_norm": 0.2002384066581726, + "learning_rate": 1.306122448979592e-05, + "loss": 0.2116, + "num_tokens": 554509164.0, + "step": 1876 + }, + { + "epoch": 2.298653610771114, + "grad_norm": 0.21084435284137726, + "learning_rate": 1.3015873015873018e-05, + "loss": 0.2135, + "num_tokens": 555085608.0, + "step": 1878 + }, + { + "epoch": 2.3011015911872703, + "grad_norm": 0.20441339910030365, + "learning_rate": 1.2970521541950114e-05, + "loss": 0.2245, + "num_tokens": 555692582.0, + "step": 1880 + }, + { + "epoch": 2.303549571603427, + "grad_norm": 0.21268080174922943, + "learning_rate": 1.2925170068027212e-05, + "loss": 0.2186, + "num_tokens": 556270724.0, + "step": 1882 + }, + { + "epoch": 2.305997552019584, + "grad_norm": 0.20324358344078064, + "learning_rate": 1.2879818594104309e-05, + "loss": 0.2257, + "num_tokens": 556896471.0, + "step": 1884 + }, + { + "epoch": 2.3084455324357407, + "grad_norm": 0.207892507314682, + "learning_rate": 1.2834467120181407e-05, + "loss": 0.2254, + "num_tokens": 557526501.0, + "step": 1886 + }, + { + "epoch": 2.310893512851897, + "grad_norm": 0.2031894475221634, + "learning_rate": 1.2789115646258504e-05, + "loss": 0.217, + "num_tokens": 558106373.0, + "step": 1888 + }, + { + "epoch": 2.313341493268054, + "grad_norm": 0.20541709661483765, + "learning_rate": 1.27437641723356e-05, + "loss": 0.2259, + "num_tokens": 558721567.0, + "step": 1890 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.19755509495735168, + "learning_rate": 1.2698412698412699e-05, + "loss": 0.2094, + "num_tokens": 559294674.0, + "step": 1892 + }, + { + "epoch": 2.318237454100367, + "grad_norm": 0.213950052857399, + "learning_rate": 1.2653061224489795e-05, + "loss": 0.2273, + "num_tokens": 559915277.0, + "step": 1894 + }, + { + "epoch": 2.3206854345165238, + "grad_norm": 0.1976771205663681, + "learning_rate": 1.2607709750566893e-05, + "loss": 0.2188, + "num_tokens": 560511121.0, + "step": 1896 + }, + { + "epoch": 2.3231334149326806, + "grad_norm": 0.20825719833374023, + "learning_rate": 1.2562358276643993e-05, + "loss": 0.2214, + "num_tokens": 561115783.0, + "step": 1898 + }, + { + "epoch": 2.3255813953488373, + "grad_norm": 0.2104465216398239, + "learning_rate": 1.251700680272109e-05, + "loss": 0.2214, + "num_tokens": 561711040.0, + "step": 1900 + }, + { + "epoch": 2.3280293757649937, + "grad_norm": 0.20058591663837433, + "learning_rate": 1.2471655328798186e-05, + "loss": 0.2192, + "num_tokens": 562281256.0, + "step": 1902 + }, + { + "epoch": 2.3304773561811505, + "grad_norm": 0.21316313743591309, + "learning_rate": 1.2426303854875285e-05, + "loss": 0.2178, + "num_tokens": 562884623.0, + "step": 1904 + }, + { + "epoch": 2.3329253365973073, + "grad_norm": 0.20399659872055054, + "learning_rate": 1.2380952380952381e-05, + "loss": 0.2205, + "num_tokens": 563490600.0, + "step": 1906 + }, + { + "epoch": 2.335373317013464, + "grad_norm": 0.2007429599761963, + "learning_rate": 1.233560090702948e-05, + "loss": 0.2202, + "num_tokens": 564115260.0, + "step": 1908 + }, + { + "epoch": 2.3378212974296204, + "grad_norm": 0.20211383700370789, + "learning_rate": 1.2290249433106576e-05, + "loss": 0.2227, + "num_tokens": 564701265.0, + "step": 1910 + }, + { + "epoch": 2.340269277845777, + "grad_norm": 0.20754925906658173, + "learning_rate": 1.2244897959183674e-05, + "loss": 0.2173, + "num_tokens": 565293180.0, + "step": 1912 + }, + { + "epoch": 2.342717258261934, + "grad_norm": 0.21266242861747742, + "learning_rate": 1.219954648526077e-05, + "loss": 0.2235, + "num_tokens": 565881398.0, + "step": 1914 + }, + { + "epoch": 2.3451652386780903, + "grad_norm": 0.20951025187969208, + "learning_rate": 1.2154195011337869e-05, + "loss": 0.2194, + "num_tokens": 566461651.0, + "step": 1916 + }, + { + "epoch": 2.347613219094247, + "grad_norm": 0.21266640722751617, + "learning_rate": 1.2108843537414967e-05, + "loss": 0.2271, + "num_tokens": 567032472.0, + "step": 1918 + }, + { + "epoch": 2.350061199510404, + "grad_norm": 0.20573103427886963, + "learning_rate": 1.2063492063492064e-05, + "loss": 0.2124, + "num_tokens": 567603503.0, + "step": 1920 + }, + { + "epoch": 2.3525091799265607, + "grad_norm": 0.21274706721305847, + "learning_rate": 1.2018140589569162e-05, + "loss": 0.2134, + "num_tokens": 568184976.0, + "step": 1922 + }, + { + "epoch": 2.354957160342717, + "grad_norm": 0.20786137878894806, + "learning_rate": 1.1972789115646259e-05, + "loss": 0.219, + "num_tokens": 568789357.0, + "step": 1924 + }, + { + "epoch": 2.357405140758874, + "grad_norm": 0.20455925166606903, + "learning_rate": 1.1927437641723357e-05, + "loss": 0.2157, + "num_tokens": 569385024.0, + "step": 1926 + }, + { + "epoch": 2.3598531211750307, + "grad_norm": 0.2080461084842682, + "learning_rate": 1.1882086167800455e-05, + "loss": 0.2215, + "num_tokens": 569987501.0, + "step": 1928 + }, + { + "epoch": 2.3623011015911874, + "grad_norm": 0.21297095715999603, + "learning_rate": 1.1836734693877552e-05, + "loss": 0.2201, + "num_tokens": 570556664.0, + "step": 1930 + }, + { + "epoch": 2.364749082007344, + "grad_norm": 0.20728906989097595, + "learning_rate": 1.179138321995465e-05, + "loss": 0.2226, + "num_tokens": 571140803.0, + "step": 1932 + }, + { + "epoch": 2.3671970624235006, + "grad_norm": 0.20225830376148224, + "learning_rate": 1.1746031746031746e-05, + "loss": 0.2208, + "num_tokens": 571750584.0, + "step": 1934 + }, + { + "epoch": 2.3696450428396574, + "grad_norm": 0.19995316863059998, + "learning_rate": 1.1700680272108843e-05, + "loss": 0.2255, + "num_tokens": 572361673.0, + "step": 1936 + }, + { + "epoch": 2.3720930232558137, + "grad_norm": 0.21096406877040863, + "learning_rate": 1.1655328798185941e-05, + "loss": 0.2139, + "num_tokens": 572913448.0, + "step": 1938 + }, + { + "epoch": 2.3745410036719705, + "grad_norm": 0.20674946904182434, + "learning_rate": 1.160997732426304e-05, + "loss": 0.2243, + "num_tokens": 573516005.0, + "step": 1940 + }, + { + "epoch": 2.3769889840881273, + "grad_norm": 0.20043015480041504, + "learning_rate": 1.1564625850340138e-05, + "loss": 0.2166, + "num_tokens": 574105094.0, + "step": 1942 + }, + { + "epoch": 2.379436964504284, + "grad_norm": 0.2008933275938034, + "learning_rate": 1.1519274376417234e-05, + "loss": 0.2209, + "num_tokens": 574702697.0, + "step": 1944 + }, + { + "epoch": 2.3818849449204405, + "grad_norm": 0.20273694396018982, + "learning_rate": 1.147392290249433e-05, + "loss": 0.2205, + "num_tokens": 575300004.0, + "step": 1946 + }, + { + "epoch": 2.3843329253365972, + "grad_norm": 0.21550051867961884, + "learning_rate": 1.1428571428571429e-05, + "loss": 0.221, + "num_tokens": 575880877.0, + "step": 1948 + }, + { + "epoch": 2.386780905752754, + "grad_norm": 0.20158825814723969, + "learning_rate": 1.1383219954648526e-05, + "loss": 0.2221, + "num_tokens": 576466412.0, + "step": 1950 + }, + { + "epoch": 2.389228886168911, + "grad_norm": 0.1992563009262085, + "learning_rate": 1.1337868480725626e-05, + "loss": 0.2134, + "num_tokens": 577070553.0, + "step": 1952 + }, + { + "epoch": 2.391676866585067, + "grad_norm": 0.2096358835697174, + "learning_rate": 1.1292517006802722e-05, + "loss": 0.2237, + "num_tokens": 577665008.0, + "step": 1954 + }, + { + "epoch": 2.394124847001224, + "grad_norm": 0.2027975469827652, + "learning_rate": 1.1247165532879819e-05, + "loss": 0.2281, + "num_tokens": 578279786.0, + "step": 1956 + }, + { + "epoch": 2.3965728274173808, + "grad_norm": 0.20276425778865814, + "learning_rate": 1.1201814058956917e-05, + "loss": 0.2108, + "num_tokens": 578855198.0, + "step": 1958 + }, + { + "epoch": 2.399020807833537, + "grad_norm": 0.20276838541030884, + "learning_rate": 1.1156462585034013e-05, + "loss": 0.2227, + "num_tokens": 579461982.0, + "step": 1960 + }, + { + "epoch": 2.401468788249694, + "grad_norm": 0.20120473206043243, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.2178, + "num_tokens": 580083880.0, + "step": 1962 + }, + { + "epoch": 2.4039167686658507, + "grad_norm": 0.20670291781425476, + "learning_rate": 1.106575963718821e-05, + "loss": 0.218, + "num_tokens": 580674677.0, + "step": 1964 + }, + { + "epoch": 2.4063647490820075, + "grad_norm": 0.20671899616718292, + "learning_rate": 1.1020408163265306e-05, + "loss": 0.2144, + "num_tokens": 581252431.0, + "step": 1966 + }, + { + "epoch": 2.408812729498164, + "grad_norm": 0.21666701138019562, + "learning_rate": 1.0975056689342405e-05, + "loss": 0.2166, + "num_tokens": 581826830.0, + "step": 1968 + }, + { + "epoch": 2.4112607099143206, + "grad_norm": 0.2020387202501297, + "learning_rate": 1.0929705215419501e-05, + "loss": 0.2132, + "num_tokens": 582419945.0, + "step": 1970 + }, + { + "epoch": 2.4137086903304774, + "grad_norm": 0.20495234429836273, + "learning_rate": 1.08843537414966e-05, + "loss": 0.2227, + "num_tokens": 583019823.0, + "step": 1972 + }, + { + "epoch": 2.416156670746634, + "grad_norm": 0.20530062913894653, + "learning_rate": 1.0839002267573696e-05, + "loss": 0.22, + "num_tokens": 583615207.0, + "step": 1974 + }, + { + "epoch": 2.4186046511627906, + "grad_norm": 0.20127837359905243, + "learning_rate": 1.0793650793650794e-05, + "loss": 0.2166, + "num_tokens": 584205801.0, + "step": 1976 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 0.2071184515953064, + "learning_rate": 1.0748299319727893e-05, + "loss": 0.2217, + "num_tokens": 584813233.0, + "step": 1978 + }, + { + "epoch": 2.423500611995104, + "grad_norm": 0.203725203871727, + "learning_rate": 1.0702947845804989e-05, + "loss": 0.2214, + "num_tokens": 585413200.0, + "step": 1980 + }, + { + "epoch": 2.4259485924112605, + "grad_norm": 0.21335862576961517, + "learning_rate": 1.0657596371882087e-05, + "loss": 0.2228, + "num_tokens": 586004101.0, + "step": 1982 + }, + { + "epoch": 2.4283965728274173, + "grad_norm": 0.20036020874977112, + "learning_rate": 1.0612244897959184e-05, + "loss": 0.2178, + "num_tokens": 586611890.0, + "step": 1984 + }, + { + "epoch": 2.430844553243574, + "grad_norm": 0.20689314603805542, + "learning_rate": 1.056689342403628e-05, + "loss": 0.2216, + "num_tokens": 587233067.0, + "step": 1986 + }, + { + "epoch": 2.433292533659731, + "grad_norm": 0.20356044173240662, + "learning_rate": 1.052154195011338e-05, + "loss": 0.2147, + "num_tokens": 587825439.0, + "step": 1988 + }, + { + "epoch": 2.435740514075887, + "grad_norm": 0.21283699572086334, + "learning_rate": 1.0476190476190477e-05, + "loss": 0.2224, + "num_tokens": 588370273.0, + "step": 1990 + }, + { + "epoch": 2.438188494492044, + "grad_norm": 0.21021555364131927, + "learning_rate": 1.0430839002267573e-05, + "loss": 0.2186, + "num_tokens": 588956992.0, + "step": 1992 + }, + { + "epoch": 2.440636474908201, + "grad_norm": 0.2062564641237259, + "learning_rate": 1.0385487528344672e-05, + "loss": 0.2185, + "num_tokens": 589538749.0, + "step": 1994 + }, + { + "epoch": 2.4430844553243576, + "grad_norm": 0.21829988062381744, + "learning_rate": 1.0340136054421768e-05, + "loss": 0.218, + "num_tokens": 590104133.0, + "step": 1996 + }, + { + "epoch": 2.445532435740514, + "grad_norm": 0.2025780826807022, + "learning_rate": 1.0294784580498866e-05, + "loss": 0.2124, + "num_tokens": 590669861.0, + "step": 1998 + }, + { + "epoch": 2.4479804161566707, + "grad_norm": 0.21584515273571014, + "learning_rate": 1.0249433106575965e-05, + "loss": 0.2197, + "num_tokens": 591221963.0, + "step": 2000 + }, + { + "epoch": 2.4504283965728275, + "grad_norm": 0.21827919781208038, + "learning_rate": 1.0204081632653061e-05, + "loss": 0.2348, + "num_tokens": 591811044.0, + "step": 2002 + }, + { + "epoch": 2.452876376988984, + "grad_norm": 0.20520959794521332, + "learning_rate": 1.015873015873016e-05, + "loss": 0.2252, + "num_tokens": 592387471.0, + "step": 2004 + }, + { + "epoch": 2.4553243574051407, + "grad_norm": 0.2005368322134018, + "learning_rate": 1.0113378684807256e-05, + "loss": 0.218, + "num_tokens": 592989967.0, + "step": 2006 + }, + { + "epoch": 2.4577723378212974, + "grad_norm": 0.1999003291130066, + "learning_rate": 1.0068027210884354e-05, + "loss": 0.2273, + "num_tokens": 593587458.0, + "step": 2008 + }, + { + "epoch": 2.4602203182374542, + "grad_norm": 0.20240548253059387, + "learning_rate": 1.0022675736961453e-05, + "loss": 0.2161, + "num_tokens": 594183957.0, + "step": 2010 + }, + { + "epoch": 2.4626682986536106, + "grad_norm": 0.20234538614749908, + "learning_rate": 9.977324263038549e-06, + "loss": 0.2202, + "num_tokens": 594746164.0, + "step": 2012 + }, + { + "epoch": 2.4651162790697674, + "grad_norm": 0.20611034333705902, + "learning_rate": 9.931972789115647e-06, + "loss": 0.2206, + "num_tokens": 595347460.0, + "step": 2014 + }, + { + "epoch": 2.467564259485924, + "grad_norm": 0.20783595740795135, + "learning_rate": 9.886621315192744e-06, + "loss": 0.2271, + "num_tokens": 595973539.0, + "step": 2016 + }, + { + "epoch": 2.470012239902081, + "grad_norm": 0.20083051919937134, + "learning_rate": 9.841269841269842e-06, + "loss": 0.2167, + "num_tokens": 596562768.0, + "step": 2018 + }, + { + "epoch": 2.4724602203182373, + "grad_norm": 0.1989588439464569, + "learning_rate": 9.795918367346939e-06, + "loss": 0.2142, + "num_tokens": 597168866.0, + "step": 2020 + }, + { + "epoch": 2.474908200734394, + "grad_norm": 0.20066891610622406, + "learning_rate": 9.750566893424037e-06, + "loss": 0.2202, + "num_tokens": 597761188.0, + "step": 2022 + }, + { + "epoch": 2.477356181150551, + "grad_norm": 0.20809365808963776, + "learning_rate": 9.705215419501135e-06, + "loss": 0.2169, + "num_tokens": 598389562.0, + "step": 2024 + }, + { + "epoch": 2.4798041615667072, + "grad_norm": 0.21078921854496002, + "learning_rate": 9.659863945578232e-06, + "loss": 0.2143, + "num_tokens": 598946756.0, + "step": 2026 + }, + { + "epoch": 2.482252141982864, + "grad_norm": 0.21085739135742188, + "learning_rate": 9.61451247165533e-06, + "loss": 0.2288, + "num_tokens": 599531313.0, + "step": 2028 + }, + { + "epoch": 2.484700122399021, + "grad_norm": 0.20648927986621857, + "learning_rate": 9.569160997732427e-06, + "loss": 0.2166, + "num_tokens": 600099284.0, + "step": 2030 + }, + { + "epoch": 2.4871481028151776, + "grad_norm": 0.20511531829833984, + "learning_rate": 9.523809523809523e-06, + "loss": 0.2218, + "num_tokens": 600691716.0, + "step": 2032 + }, + { + "epoch": 2.489596083231334, + "grad_norm": 0.19825713336467743, + "learning_rate": 9.478458049886623e-06, + "loss": 0.2113, + "num_tokens": 601274074.0, + "step": 2034 + }, + { + "epoch": 2.4920440636474908, + "grad_norm": 0.20427091419696808, + "learning_rate": 9.43310657596372e-06, + "loss": 0.2229, + "num_tokens": 601887278.0, + "step": 2036 + }, + { + "epoch": 2.4944920440636476, + "grad_norm": 0.1981639564037323, + "learning_rate": 9.387755102040816e-06, + "loss": 0.2102, + "num_tokens": 602487180.0, + "step": 2038 + }, + { + "epoch": 2.4969400244798043, + "grad_norm": 0.24981583654880524, + "learning_rate": 9.342403628117914e-06, + "loss": 0.2181, + "num_tokens": 603104061.0, + "step": 2040 + }, + { + "epoch": 2.4993880048959607, + "grad_norm": 0.207317054271698, + "learning_rate": 9.297052154195011e-06, + "loss": 0.2206, + "num_tokens": 603685828.0, + "step": 2042 + }, + { + "epoch": 2.5018359853121175, + "grad_norm": 0.2043956220149994, + "learning_rate": 9.251700680272109e-06, + "loss": 0.2179, + "num_tokens": 604295415.0, + "step": 2044 + }, + { + "epoch": 2.5042839657282743, + "grad_norm": 0.20288239419460297, + "learning_rate": 9.206349206349207e-06, + "loss": 0.2186, + "num_tokens": 604880695.0, + "step": 2046 + }, + { + "epoch": 2.5067319461444306, + "grad_norm": 0.19663789868354797, + "learning_rate": 9.160997732426304e-06, + "loss": 0.2082, + "num_tokens": 605487013.0, + "step": 2048 + }, + { + "epoch": 2.5091799265605874, + "grad_norm": 0.20082242786884308, + "learning_rate": 9.115646258503402e-06, + "loss": 0.217, + "num_tokens": 606077613.0, + "step": 2050 + }, + { + "epoch": 2.511627906976744, + "grad_norm": 0.20340029895305634, + "learning_rate": 9.070294784580499e-06, + "loss": 0.2219, + "num_tokens": 606678252.0, + "step": 2052 + }, + { + "epoch": 2.514075887392901, + "grad_norm": 0.2021356076002121, + "learning_rate": 9.024943310657597e-06, + "loss": 0.2124, + "num_tokens": 607274565.0, + "step": 2054 + }, + { + "epoch": 2.516523867809058, + "grad_norm": 0.2082587480545044, + "learning_rate": 8.979591836734694e-06, + "loss": 0.2171, + "num_tokens": 607866871.0, + "step": 2056 + }, + { + "epoch": 2.518971848225214, + "grad_norm": 0.20120975375175476, + "learning_rate": 8.934240362811792e-06, + "loss": 0.2116, + "num_tokens": 608450364.0, + "step": 2058 + }, + { + "epoch": 2.521419828641371, + "grad_norm": 0.20468412339687347, + "learning_rate": 8.88888888888889e-06, + "loss": 0.2198, + "num_tokens": 609057444.0, + "step": 2060 + }, + { + "epoch": 2.5238678090575277, + "grad_norm": 0.20275817811489105, + "learning_rate": 8.843537414965987e-06, + "loss": 0.2169, + "num_tokens": 609665155.0, + "step": 2062 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.20833300054073334, + "learning_rate": 8.798185941043085e-06, + "loss": 0.2166, + "num_tokens": 610216154.0, + "step": 2064 + }, + { + "epoch": 2.528763769889841, + "grad_norm": 0.20524562895298004, + "learning_rate": 8.752834467120181e-06, + "loss": 0.2077, + "num_tokens": 610744235.0, + "step": 2066 + }, + { + "epoch": 2.5312117503059977, + "grad_norm": 0.21746841073036194, + "learning_rate": 8.707482993197278e-06, + "loss": 0.2303, + "num_tokens": 611354880.0, + "step": 2068 + }, + { + "epoch": 2.533659730722154, + "grad_norm": 0.20403002202510834, + "learning_rate": 8.662131519274378e-06, + "loss": 0.2223, + "num_tokens": 611932914.0, + "step": 2070 + }, + { + "epoch": 2.536107711138311, + "grad_norm": 0.2110048532485962, + "learning_rate": 8.616780045351474e-06, + "loss": 0.2208, + "num_tokens": 612527495.0, + "step": 2072 + }, + { + "epoch": 2.5385556915544676, + "grad_norm": 0.20549672842025757, + "learning_rate": 8.571428571428573e-06, + "loss": 0.2121, + "num_tokens": 613095194.0, + "step": 2074 + }, + { + "epoch": 2.5410036719706244, + "grad_norm": 0.2024473398923874, + "learning_rate": 8.52607709750567e-06, + "loss": 0.228, + "num_tokens": 613682960.0, + "step": 2076 + }, + { + "epoch": 2.543451652386781, + "grad_norm": 0.21335433423519135, + "learning_rate": 8.480725623582766e-06, + "loss": 0.2258, + "num_tokens": 614274193.0, + "step": 2078 + }, + { + "epoch": 2.5458996328029375, + "grad_norm": 0.20197437703609467, + "learning_rate": 8.435374149659864e-06, + "loss": 0.2172, + "num_tokens": 614860583.0, + "step": 2080 + }, + { + "epoch": 2.5483476132190943, + "grad_norm": 0.19978688657283783, + "learning_rate": 8.390022675736962e-06, + "loss": 0.2225, + "num_tokens": 615472445.0, + "step": 2082 + }, + { + "epoch": 2.550795593635251, + "grad_norm": 0.20164211094379425, + "learning_rate": 8.34467120181406e-06, + "loss": 0.2207, + "num_tokens": 616064181.0, + "step": 2084 + }, + { + "epoch": 2.5532435740514074, + "grad_norm": 0.20899419486522675, + "learning_rate": 8.299319727891157e-06, + "loss": 0.2123, + "num_tokens": 616638870.0, + "step": 2086 + }, + { + "epoch": 2.5556915544675642, + "grad_norm": 0.19722531735897064, + "learning_rate": 8.253968253968254e-06, + "loss": 0.2096, + "num_tokens": 617223154.0, + "step": 2088 + }, + { + "epoch": 2.558139534883721, + "grad_norm": 0.20290645956993103, + "learning_rate": 8.208616780045352e-06, + "loss": 0.2175, + "num_tokens": 617833820.0, + "step": 2090 + }, + { + "epoch": 2.5605875152998774, + "grad_norm": 0.20936761796474457, + "learning_rate": 8.163265306122448e-06, + "loss": 0.2184, + "num_tokens": 618403979.0, + "step": 2092 + }, + { + "epoch": 2.563035495716034, + "grad_norm": 0.1907576471567154, + "learning_rate": 8.117913832199547e-06, + "loss": 0.2109, + "num_tokens": 619001521.0, + "step": 2094 + }, + { + "epoch": 2.565483476132191, + "grad_norm": 0.20716920495033264, + "learning_rate": 8.072562358276645e-06, + "loss": 0.2216, + "num_tokens": 619589133.0, + "step": 2096 + }, + { + "epoch": 2.5679314565483478, + "grad_norm": 0.20768308639526367, + "learning_rate": 8.027210884353741e-06, + "loss": 0.2159, + "num_tokens": 620154713.0, + "step": 2098 + }, + { + "epoch": 2.5703794369645045, + "grad_norm": 0.19920216500759125, + "learning_rate": 7.98185941043084e-06, + "loss": 0.2234, + "num_tokens": 620755444.0, + "step": 2100 + }, + { + "epoch": 2.572827417380661, + "grad_norm": 0.20001494884490967, + "learning_rate": 7.936507936507936e-06, + "loss": 0.2103, + "num_tokens": 621356721.0, + "step": 2102 + }, + { + "epoch": 2.5752753977968177, + "grad_norm": 0.2038969099521637, + "learning_rate": 7.891156462585034e-06, + "loss": 0.2173, + "num_tokens": 621925451.0, + "step": 2104 + }, + { + "epoch": 2.5777233782129745, + "grad_norm": 0.20438076555728912, + "learning_rate": 7.845804988662133e-06, + "loss": 0.2134, + "num_tokens": 622519805.0, + "step": 2106 + }, + { + "epoch": 2.580171358629131, + "grad_norm": 0.20135213434696198, + "learning_rate": 7.80045351473923e-06, + "loss": 0.2217, + "num_tokens": 623105671.0, + "step": 2108 + }, + { + "epoch": 2.5826193390452876, + "grad_norm": 0.20885169506072998, + "learning_rate": 7.755102040816327e-06, + "loss": 0.2075, + "num_tokens": 623690747.0, + "step": 2110 + }, + { + "epoch": 2.5850673194614444, + "grad_norm": 0.21076223254203796, + "learning_rate": 7.709750566893424e-06, + "loss": 0.2267, + "num_tokens": 624294951.0, + "step": 2112 + }, + { + "epoch": 2.5875152998776008, + "grad_norm": 0.21530042588710785, + "learning_rate": 7.664399092970522e-06, + "loss": 0.2146, + "num_tokens": 624909690.0, + "step": 2114 + }, + { + "epoch": 2.5899632802937576, + "grad_norm": 0.20319397747516632, + "learning_rate": 7.6190476190476205e-06, + "loss": 0.2179, + "num_tokens": 625499900.0, + "step": 2116 + }, + { + "epoch": 2.5924112607099143, + "grad_norm": 0.21407631039619446, + "learning_rate": 7.573696145124717e-06, + "loss": 0.2205, + "num_tokens": 626093247.0, + "step": 2118 + }, + { + "epoch": 2.594859241126071, + "grad_norm": 0.20182758569717407, + "learning_rate": 7.528344671201814e-06, + "loss": 0.2136, + "num_tokens": 626698070.0, + "step": 2120 + }, + { + "epoch": 2.597307221542228, + "grad_norm": 0.2087131142616272, + "learning_rate": 7.482993197278912e-06, + "loss": 0.2198, + "num_tokens": 627297991.0, + "step": 2122 + }, + { + "epoch": 2.5997552019583843, + "grad_norm": 0.2005620002746582, + "learning_rate": 7.437641723356009e-06, + "loss": 0.2157, + "num_tokens": 627905748.0, + "step": 2124 + }, + { + "epoch": 2.602203182374541, + "grad_norm": 0.21059371531009674, + "learning_rate": 7.392290249433107e-06, + "loss": 0.2185, + "num_tokens": 628484476.0, + "step": 2126 + }, + { + "epoch": 2.604651162790698, + "grad_norm": 0.20286332070827484, + "learning_rate": 7.346938775510205e-06, + "loss": 0.2194, + "num_tokens": 629057508.0, + "step": 2128 + }, + { + "epoch": 2.607099143206854, + "grad_norm": 0.20924660563468933, + "learning_rate": 7.301587301587302e-06, + "loss": 0.2158, + "num_tokens": 629679902.0, + "step": 2130 + }, + { + "epoch": 2.609547123623011, + "grad_norm": 0.20604848861694336, + "learning_rate": 7.2562358276644e-06, + "loss": 0.2121, + "num_tokens": 630272084.0, + "step": 2132 + }, + { + "epoch": 2.611995104039168, + "grad_norm": 0.20435495674610138, + "learning_rate": 7.210884353741497e-06, + "loss": 0.2137, + "num_tokens": 630856877.0, + "step": 2134 + }, + { + "epoch": 2.614443084455324, + "grad_norm": 0.21040421724319458, + "learning_rate": 7.1655328798185944e-06, + "loss": 0.2178, + "num_tokens": 631454871.0, + "step": 2136 + }, + { + "epoch": 2.616891064871481, + "grad_norm": 0.20193393528461456, + "learning_rate": 7.120181405895692e-06, + "loss": 0.2131, + "num_tokens": 632017566.0, + "step": 2138 + }, + { + "epoch": 2.6193390452876377, + "grad_norm": 0.20342284440994263, + "learning_rate": 7.07482993197279e-06, + "loss": 0.22, + "num_tokens": 632620343.0, + "step": 2140 + }, + { + "epoch": 2.6217870257037945, + "grad_norm": 0.20233403146266937, + "learning_rate": 7.0294784580498875e-06, + "loss": 0.2152, + "num_tokens": 633214118.0, + "step": 2142 + }, + { + "epoch": 2.6242350061199513, + "grad_norm": 0.20694856345653534, + "learning_rate": 6.984126984126985e-06, + "loss": 0.2145, + "num_tokens": 633792202.0, + "step": 2144 + }, + { + "epoch": 2.6266829865361077, + "grad_norm": 0.2045925408601761, + "learning_rate": 6.938775510204082e-06, + "loss": 0.2189, + "num_tokens": 634396172.0, + "step": 2146 + }, + { + "epoch": 2.6291309669522644, + "grad_norm": 0.1949472427368164, + "learning_rate": 6.893424036281179e-06, + "loss": 0.2101, + "num_tokens": 635000724.0, + "step": 2148 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.20409946143627167, + "learning_rate": 6.848072562358276e-06, + "loss": 0.2173, + "num_tokens": 635606099.0, + "step": 2150 + }, + { + "epoch": 2.6340269277845776, + "grad_norm": 0.1999250054359436, + "learning_rate": 6.802721088435375e-06, + "loss": 0.2119, + "num_tokens": 636190686.0, + "step": 2152 + }, + { + "epoch": 2.6364749082007344, + "grad_norm": 0.1983923763036728, + "learning_rate": 6.757369614512473e-06, + "loss": 0.22, + "num_tokens": 636806626.0, + "step": 2154 + }, + { + "epoch": 2.638922888616891, + "grad_norm": 0.1971396952867508, + "learning_rate": 6.712018140589569e-06, + "loss": 0.2145, + "num_tokens": 637443282.0, + "step": 2156 + }, + { + "epoch": 2.6413708690330475, + "grad_norm": 0.19660604000091553, + "learning_rate": 6.666666666666667e-06, + "loss": 0.2103, + "num_tokens": 638029353.0, + "step": 2158 + }, + { + "epoch": 2.6438188494492043, + "grad_norm": 0.21491332352161407, + "learning_rate": 6.621315192743764e-06, + "loss": 0.2229, + "num_tokens": 638613492.0, + "step": 2160 + }, + { + "epoch": 2.646266829865361, + "grad_norm": 0.19658035039901733, + "learning_rate": 6.5759637188208614e-06, + "loss": 0.2109, + "num_tokens": 639198436.0, + "step": 2162 + }, + { + "epoch": 2.648714810281518, + "grad_norm": 0.20467783510684967, + "learning_rate": 6.53061224489796e-06, + "loss": 0.211, + "num_tokens": 639768254.0, + "step": 2164 + }, + { + "epoch": 2.6511627906976747, + "grad_norm": 0.21264995634555817, + "learning_rate": 6.485260770975057e-06, + "loss": 0.2225, + "num_tokens": 640344145.0, + "step": 2166 + }, + { + "epoch": 2.653610771113831, + "grad_norm": 0.20328077673912048, + "learning_rate": 6.4399092970521545e-06, + "loss": 0.2117, + "num_tokens": 640931530.0, + "step": 2168 + }, + { + "epoch": 2.656058751529988, + "grad_norm": 0.20107775926589966, + "learning_rate": 6.394557823129252e-06, + "loss": 0.2175, + "num_tokens": 641523514.0, + "step": 2170 + }, + { + "epoch": 2.6585067319461446, + "grad_norm": 0.20048564672470093, + "learning_rate": 6.349206349206349e-06, + "loss": 0.2205, + "num_tokens": 642126466.0, + "step": 2172 + }, + { + "epoch": 2.660954712362301, + "grad_norm": 0.20081067085266113, + "learning_rate": 6.303854875283447e-06, + "loss": 0.2129, + "num_tokens": 642723021.0, + "step": 2174 + }, + { + "epoch": 2.6634026927784578, + "grad_norm": 0.2027258276939392, + "learning_rate": 6.258503401360545e-06, + "loss": 0.2237, + "num_tokens": 643333735.0, + "step": 2176 + }, + { + "epoch": 2.6658506731946146, + "grad_norm": 0.20192566514015198, + "learning_rate": 6.213151927437642e-06, + "loss": 0.212, + "num_tokens": 643916079.0, + "step": 2178 + }, + { + "epoch": 2.668298653610771, + "grad_norm": 0.23704135417938232, + "learning_rate": 6.16780045351474e-06, + "loss": 0.2221, + "num_tokens": 644501388.0, + "step": 2180 + }, + { + "epoch": 2.6707466340269277, + "grad_norm": 0.21341145038604736, + "learning_rate": 6.122448979591837e-06, + "loss": 0.2164, + "num_tokens": 645060541.0, + "step": 2182 + }, + { + "epoch": 2.6731946144430845, + "grad_norm": 0.2012021392583847, + "learning_rate": 6.0770975056689345e-06, + "loss": 0.2092, + "num_tokens": 645641292.0, + "step": 2184 + }, + { + "epoch": 2.6756425948592413, + "grad_norm": 0.20674800872802734, + "learning_rate": 6.031746031746032e-06, + "loss": 0.2148, + "num_tokens": 646229567.0, + "step": 2186 + }, + { + "epoch": 2.678090575275398, + "grad_norm": 0.19545283913612366, + "learning_rate": 5.986394557823129e-06, + "loss": 0.2136, + "num_tokens": 646841368.0, + "step": 2188 + }, + { + "epoch": 2.6805385556915544, + "grad_norm": 0.21263349056243896, + "learning_rate": 5.9410430839002275e-06, + "loss": 0.2127, + "num_tokens": 647403157.0, + "step": 2190 + }, + { + "epoch": 2.682986536107711, + "grad_norm": 0.2056170105934143, + "learning_rate": 5.895691609977325e-06, + "loss": 0.2122, + "num_tokens": 648008580.0, + "step": 2192 + }, + { + "epoch": 2.685434516523868, + "grad_norm": 0.2027404010295868, + "learning_rate": 5.8503401360544215e-06, + "loss": 0.2161, + "num_tokens": 648593429.0, + "step": 2194 + }, + { + "epoch": 2.6878824969400243, + "grad_norm": 0.2061365693807602, + "learning_rate": 5.80498866213152e-06, + "loss": 0.2156, + "num_tokens": 649184439.0, + "step": 2196 + }, + { + "epoch": 2.690330477356181, + "grad_norm": 0.20601129531860352, + "learning_rate": 5.759637188208617e-06, + "loss": 0.2163, + "num_tokens": 649752602.0, + "step": 2198 + }, + { + "epoch": 2.692778457772338, + "grad_norm": 0.21004599332809448, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.2173, + "num_tokens": 650348288.0, + "step": 2200 + }, + { + "epoch": 2.6952264381884943, + "grad_norm": 0.19335652887821198, + "learning_rate": 5.668934240362813e-06, + "loss": 0.2019, + "num_tokens": 650922019.0, + "step": 2202 + }, + { + "epoch": 2.697674418604651, + "grad_norm": 0.20110733807086945, + "learning_rate": 5.623582766439909e-06, + "loss": 0.2167, + "num_tokens": 651517921.0, + "step": 2204 + }, + { + "epoch": 2.700122399020808, + "grad_norm": 0.1986030787229538, + "learning_rate": 5.578231292517007e-06, + "loss": 0.2133, + "num_tokens": 652089785.0, + "step": 2206 + }, + { + "epoch": 2.7025703794369647, + "grad_norm": 0.2035319060087204, + "learning_rate": 5.532879818594105e-06, + "loss": 0.2146, + "num_tokens": 652681970.0, + "step": 2208 + }, + { + "epoch": 2.7050183598531214, + "grad_norm": 0.21302953362464905, + "learning_rate": 5.487528344671202e-06, + "loss": 0.2091, + "num_tokens": 653257877.0, + "step": 2210 + }, + { + "epoch": 2.707466340269278, + "grad_norm": 0.2049965262413025, + "learning_rate": 5.4421768707483e-06, + "loss": 0.2211, + "num_tokens": 653840579.0, + "step": 2212 + }, + { + "epoch": 2.7099143206854346, + "grad_norm": 0.20661477744579315, + "learning_rate": 5.396825396825397e-06, + "loss": 0.2208, + "num_tokens": 654427105.0, + "step": 2214 + }, + { + "epoch": 2.7123623011015914, + "grad_norm": 0.2041454017162323, + "learning_rate": 5.3514739229024945e-06, + "loss": 0.2172, + "num_tokens": 655032207.0, + "step": 2216 + }, + { + "epoch": 2.7148102815177477, + "grad_norm": 0.20284762978553772, + "learning_rate": 5.306122448979592e-06, + "loss": 0.2125, + "num_tokens": 655601921.0, + "step": 2218 + }, + { + "epoch": 2.7172582619339045, + "grad_norm": 0.210398331284523, + "learning_rate": 5.26077097505669e-06, + "loss": 0.2187, + "num_tokens": 656186469.0, + "step": 2220 + }, + { + "epoch": 2.7197062423500613, + "grad_norm": 0.1941971480846405, + "learning_rate": 5.215419501133787e-06, + "loss": 0.2049, + "num_tokens": 656769428.0, + "step": 2222 + }, + { + "epoch": 2.7221542227662177, + "grad_norm": 0.20290575921535492, + "learning_rate": 5.170068027210884e-06, + "loss": 0.2122, + "num_tokens": 657368388.0, + "step": 2224 + }, + { + "epoch": 2.7246022031823744, + "grad_norm": 0.2026207000017166, + "learning_rate": 5.124716553287982e-06, + "loss": 0.2133, + "num_tokens": 657997584.0, + "step": 2226 + }, + { + "epoch": 2.7270501835985312, + "grad_norm": 0.20051315426826477, + "learning_rate": 5.07936507936508e-06, + "loss": 0.2152, + "num_tokens": 658581509.0, + "step": 2228 + }, + { + "epoch": 2.729498164014688, + "grad_norm": 0.21063509583473206, + "learning_rate": 5.034013605442177e-06, + "loss": 0.2196, + "num_tokens": 659149765.0, + "step": 2230 + }, + { + "epoch": 2.731946144430845, + "grad_norm": 0.20423410832881927, + "learning_rate": 4.9886621315192745e-06, + "loss": 0.216, + "num_tokens": 659749573.0, + "step": 2232 + }, + { + "epoch": 2.734394124847001, + "grad_norm": 0.2028772234916687, + "learning_rate": 4.943310657596372e-06, + "loss": 0.2183, + "num_tokens": 660337622.0, + "step": 2234 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 0.2014693319797516, + "learning_rate": 4.897959183673469e-06, + "loss": 0.2196, + "num_tokens": 660931521.0, + "step": 2236 + }, + { + "epoch": 2.7392900856793148, + "grad_norm": 0.20792703330516815, + "learning_rate": 4.852607709750568e-06, + "loss": 0.2163, + "num_tokens": 661503491.0, + "step": 2238 + }, + { + "epoch": 2.741738066095471, + "grad_norm": 0.2017057090997696, + "learning_rate": 4.807256235827665e-06, + "loss": 0.216, + "num_tokens": 662106323.0, + "step": 2240 + }, + { + "epoch": 2.744186046511628, + "grad_norm": 0.20376071333885193, + "learning_rate": 4.7619047619047615e-06, + "loss": 0.214, + "num_tokens": 662674829.0, + "step": 2242 + }, + { + "epoch": 2.7466340269277847, + "grad_norm": 0.19702783226966858, + "learning_rate": 4.71655328798186e-06, + "loss": 0.2088, + "num_tokens": 663250157.0, + "step": 2244 + }, + { + "epoch": 2.749082007343941, + "grad_norm": 0.20456190407276154, + "learning_rate": 4.671201814058957e-06, + "loss": 0.2152, + "num_tokens": 663824906.0, + "step": 2246 + }, + { + "epoch": 2.751529987760098, + "grad_norm": 0.22313641011714935, + "learning_rate": 4.6258503401360546e-06, + "loss": 0.2095, + "num_tokens": 664364384.0, + "step": 2248 + }, + { + "epoch": 2.7539779681762546, + "grad_norm": 0.1967461109161377, + "learning_rate": 4.580498866213152e-06, + "loss": 0.2081, + "num_tokens": 664956949.0, + "step": 2250 + }, + { + "epoch": 2.7564259485924114, + "grad_norm": 0.20178091526031494, + "learning_rate": 4.535147392290249e-06, + "loss": 0.2153, + "num_tokens": 665558407.0, + "step": 2252 + }, + { + "epoch": 2.758873929008568, + "grad_norm": 0.2028370052576065, + "learning_rate": 4.489795918367347e-06, + "loss": 0.2165, + "num_tokens": 666138104.0, + "step": 2254 + }, + { + "epoch": 2.7613219094247246, + "grad_norm": 0.2022654414176941, + "learning_rate": 4.444444444444445e-06, + "loss": 0.2131, + "num_tokens": 666720341.0, + "step": 2256 + }, + { + "epoch": 2.7637698898408813, + "grad_norm": 0.20616286993026733, + "learning_rate": 4.399092970521542e-06, + "loss": 0.2226, + "num_tokens": 667325825.0, + "step": 2258 + }, + { + "epoch": 2.766217870257038, + "grad_norm": 0.20297648012638092, + "learning_rate": 4.353741496598639e-06, + "loss": 0.2084, + "num_tokens": 667907451.0, + "step": 2260 + }, + { + "epoch": 2.7686658506731945, + "grad_norm": 0.19758343696594238, + "learning_rate": 4.308390022675737e-06, + "loss": 0.2114, + "num_tokens": 668507302.0, + "step": 2262 + }, + { + "epoch": 2.7711138310893513, + "grad_norm": 0.1957991123199463, + "learning_rate": 4.263038548752835e-06, + "loss": 0.2121, + "num_tokens": 669109387.0, + "step": 2264 + }, + { + "epoch": 2.773561811505508, + "grad_norm": 0.22704368829727173, + "learning_rate": 4.217687074829932e-06, + "loss": 0.2225, + "num_tokens": 669697277.0, + "step": 2266 + }, + { + "epoch": 2.7760097919216644, + "grad_norm": 0.20988866686820984, + "learning_rate": 4.17233560090703e-06, + "loss": 0.2165, + "num_tokens": 670256774.0, + "step": 2268 + }, + { + "epoch": 2.778457772337821, + "grad_norm": 0.20561383664608002, + "learning_rate": 4.126984126984127e-06, + "loss": 0.2231, + "num_tokens": 670844515.0, + "step": 2270 + }, + { + "epoch": 2.780905752753978, + "grad_norm": 0.20028753578662872, + "learning_rate": 4.081632653061224e-06, + "loss": 0.2187, + "num_tokens": 671428725.0, + "step": 2272 + }, + { + "epoch": 2.783353733170135, + "grad_norm": 0.19528907537460327, + "learning_rate": 4.036281179138322e-06, + "loss": 0.2113, + "num_tokens": 672032503.0, + "step": 2274 + }, + { + "epoch": 2.7858017135862916, + "grad_norm": 0.21101114153862, + "learning_rate": 3.99092970521542e-06, + "loss": 0.2189, + "num_tokens": 672632112.0, + "step": 2276 + }, + { + "epoch": 2.788249694002448, + "grad_norm": 0.2036006897687912, + "learning_rate": 3.945578231292517e-06, + "loss": 0.2126, + "num_tokens": 673239115.0, + "step": 2278 + }, + { + "epoch": 2.7906976744186047, + "grad_norm": 0.1993958055973053, + "learning_rate": 3.900226757369615e-06, + "loss": 0.2141, + "num_tokens": 673823122.0, + "step": 2280 + }, + { + "epoch": 2.7931456548347615, + "grad_norm": 0.2041313350200653, + "learning_rate": 3.854875283446712e-06, + "loss": 0.2167, + "num_tokens": 674400715.0, + "step": 2282 + }, + { + "epoch": 2.795593635250918, + "grad_norm": 0.20430049300193787, + "learning_rate": 3.8095238095238102e-06, + "loss": 0.2209, + "num_tokens": 674999813.0, + "step": 2284 + }, + { + "epoch": 2.7980416156670747, + "grad_norm": 0.20763136446475983, + "learning_rate": 3.764172335600907e-06, + "loss": 0.219, + "num_tokens": 675581370.0, + "step": 2286 + }, + { + "epoch": 2.8004895960832314, + "grad_norm": 0.19610559940338135, + "learning_rate": 3.7188208616780046e-06, + "loss": 0.2118, + "num_tokens": 676190424.0, + "step": 2288 + }, + { + "epoch": 2.802937576499388, + "grad_norm": 0.1994568258523941, + "learning_rate": 3.6734693877551024e-06, + "loss": 0.2206, + "num_tokens": 676777546.0, + "step": 2290 + }, + { + "epoch": 2.8053855569155446, + "grad_norm": 0.18920975923538208, + "learning_rate": 3.6281179138322e-06, + "loss": 0.2128, + "num_tokens": 677393020.0, + "step": 2292 + }, + { + "epoch": 2.8078335373317014, + "grad_norm": 0.19751234352588654, + "learning_rate": 3.5827664399092972e-06, + "loss": 0.2166, + "num_tokens": 678003000.0, + "step": 2294 + }, + { + "epoch": 2.810281517747858, + "grad_norm": 0.1962030678987503, + "learning_rate": 3.537414965986395e-06, + "loss": 0.2157, + "num_tokens": 678590022.0, + "step": 2296 + }, + { + "epoch": 2.812729498164015, + "grad_norm": 0.20084716379642487, + "learning_rate": 3.4920634920634924e-06, + "loss": 0.2111, + "num_tokens": 679142958.0, + "step": 2298 + }, + { + "epoch": 2.8151774785801713, + "grad_norm": 0.20106807351112366, + "learning_rate": 3.4467120181405894e-06, + "loss": 0.2114, + "num_tokens": 679731448.0, + "step": 2300 + }, + { + "epoch": 2.817625458996328, + "grad_norm": 0.18955543637275696, + "learning_rate": 3.4013605442176877e-06, + "loss": 0.2101, + "num_tokens": 680350001.0, + "step": 2302 + }, + { + "epoch": 2.820073439412485, + "grad_norm": 0.19428835809230804, + "learning_rate": 3.3560090702947846e-06, + "loss": 0.2081, + "num_tokens": 680937088.0, + "step": 2304 + }, + { + "epoch": 2.8225214198286412, + "grad_norm": 0.20524539053440094, + "learning_rate": 3.310657596371882e-06, + "loss": 0.2058, + "num_tokens": 681530459.0, + "step": 2306 + }, + { + "epoch": 2.824969400244798, + "grad_norm": 0.19657205045223236, + "learning_rate": 3.26530612244898e-06, + "loss": 0.2152, + "num_tokens": 682127765.0, + "step": 2308 + }, + { + "epoch": 2.827417380660955, + "grad_norm": 0.20542286336421967, + "learning_rate": 3.2199546485260772e-06, + "loss": 0.2138, + "num_tokens": 682735151.0, + "step": 2310 + }, + { + "epoch": 2.829865361077111, + "grad_norm": 0.20899607241153717, + "learning_rate": 3.1746031746031746e-06, + "loss": 0.211, + "num_tokens": 683302404.0, + "step": 2312 + }, + { + "epoch": 2.832313341493268, + "grad_norm": 0.19887445867061615, + "learning_rate": 3.1292517006802725e-06, + "loss": 0.2089, + "num_tokens": 683903142.0, + "step": 2314 + }, + { + "epoch": 2.8347613219094248, + "grad_norm": 0.207116961479187, + "learning_rate": 3.08390022675737e-06, + "loss": 0.2052, + "num_tokens": 684470940.0, + "step": 2316 + }, + { + "epoch": 2.8372093023255816, + "grad_norm": 0.20242339372634888, + "learning_rate": 3.0385487528344672e-06, + "loss": 0.2142, + "num_tokens": 685065504.0, + "step": 2318 + }, + { + "epoch": 2.8396572827417383, + "grad_norm": 0.19418197870254517, + "learning_rate": 2.9931972789115646e-06, + "loss": 0.2177, + "num_tokens": 685674495.0, + "step": 2320 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.2051716297864914, + "learning_rate": 2.9478458049886625e-06, + "loss": 0.21, + "num_tokens": 686235927.0, + "step": 2322 + }, + { + "epoch": 2.8445532435740515, + "grad_norm": 0.20248575508594513, + "learning_rate": 2.90249433106576e-06, + "loss": 0.2091, + "num_tokens": 686827996.0, + "step": 2324 + }, + { + "epoch": 2.8470012239902083, + "grad_norm": 0.20682653784751892, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.2159, + "num_tokens": 687395121.0, + "step": 2326 + }, + { + "epoch": 2.8494492044063646, + "grad_norm": 0.1974363923072815, + "learning_rate": 2.8117913832199547e-06, + "loss": 0.2122, + "num_tokens": 688006136.0, + "step": 2328 + }, + { + "epoch": 2.8518971848225214, + "grad_norm": 0.20209476351737976, + "learning_rate": 2.7664399092970525e-06, + "loss": 0.2102, + "num_tokens": 688607077.0, + "step": 2330 + }, + { + "epoch": 2.854345165238678, + "grad_norm": 0.19610457122325897, + "learning_rate": 2.72108843537415e-06, + "loss": 0.2106, + "num_tokens": 689217950.0, + "step": 2332 + }, + { + "epoch": 2.8567931456548346, + "grad_norm": 0.1952834129333496, + "learning_rate": 2.6757369614512473e-06, + "loss": 0.205, + "num_tokens": 689776846.0, + "step": 2334 + }, + { + "epoch": 2.8592411260709913, + "grad_norm": 0.2004018872976303, + "learning_rate": 2.630385487528345e-06, + "loss": 0.2184, + "num_tokens": 690380814.0, + "step": 2336 + }, + { + "epoch": 2.861689106487148, + "grad_norm": 0.21395283937454224, + "learning_rate": 2.585034013605442e-06, + "loss": 0.2122, + "num_tokens": 690958742.0, + "step": 2338 + }, + { + "epoch": 2.864137086903305, + "grad_norm": 0.19755862653255463, + "learning_rate": 2.53968253968254e-06, + "loss": 0.2221, + "num_tokens": 691578000.0, + "step": 2340 + }, + { + "epoch": 2.8665850673194617, + "grad_norm": 0.2016908973455429, + "learning_rate": 2.4943310657596373e-06, + "loss": 0.21, + "num_tokens": 692176460.0, + "step": 2342 + }, + { + "epoch": 2.869033047735618, + "grad_norm": 0.19597068428993225, + "learning_rate": 2.4489795918367347e-06, + "loss": 0.2083, + "num_tokens": 692736315.0, + "step": 2344 + }, + { + "epoch": 2.871481028151775, + "grad_norm": 0.2030777633190155, + "learning_rate": 2.4036281179138325e-06, + "loss": 0.2135, + "num_tokens": 693321933.0, + "step": 2346 + }, + { + "epoch": 2.8739290085679317, + "grad_norm": 0.20476028323173523, + "learning_rate": 2.35827664399093e-06, + "loss": 0.2163, + "num_tokens": 693917503.0, + "step": 2348 + }, + { + "epoch": 2.876376988984088, + "grad_norm": 0.207066610455513, + "learning_rate": 2.3129251700680273e-06, + "loss": 0.2141, + "num_tokens": 694509270.0, + "step": 2350 + }, + { + "epoch": 2.878824969400245, + "grad_norm": 0.2097940593957901, + "learning_rate": 2.2675736961451247e-06, + "loss": 0.2109, + "num_tokens": 695084971.0, + "step": 2352 + }, + { + "epoch": 2.8812729498164016, + "grad_norm": 0.20786161720752716, + "learning_rate": 2.2222222222222225e-06, + "loss": 0.2037, + "num_tokens": 695663690.0, + "step": 2354 + }, + { + "epoch": 2.883720930232558, + "grad_norm": 0.20921112596988678, + "learning_rate": 2.1768707482993195e-06, + "loss": 0.2001, + "num_tokens": 696223952.0, + "step": 2356 + }, + { + "epoch": 2.8861689106487147, + "grad_norm": 0.20549894869327545, + "learning_rate": 2.1315192743764173e-06, + "loss": 0.2104, + "num_tokens": 696802413.0, + "step": 2358 + }, + { + "epoch": 2.8886168910648715, + "grad_norm": 0.2109750360250473, + "learning_rate": 2.086167800453515e-06, + "loss": 0.2188, + "num_tokens": 697393199.0, + "step": 2360 + }, + { + "epoch": 2.8910648714810283, + "grad_norm": 0.1953602135181427, + "learning_rate": 2.040816326530612e-06, + "loss": 0.2136, + "num_tokens": 697986888.0, + "step": 2362 + }, + { + "epoch": 2.8935128518971847, + "grad_norm": 0.19946685433387756, + "learning_rate": 1.99546485260771e-06, + "loss": 0.2199, + "num_tokens": 698605323.0, + "step": 2364 + }, + { + "epoch": 2.8959608323133414, + "grad_norm": 0.20311054587364197, + "learning_rate": 1.9501133786848073e-06, + "loss": 0.216, + "num_tokens": 699194635.0, + "step": 2366 + }, + { + "epoch": 2.8984088127294982, + "grad_norm": 0.19384844601154327, + "learning_rate": 1.9047619047619051e-06, + "loss": 0.2118, + "num_tokens": 699786297.0, + "step": 2368 + }, + { + "epoch": 2.900856793145655, + "grad_norm": 0.19617673754692078, + "learning_rate": 1.8594104308390023e-06, + "loss": 0.2094, + "num_tokens": 700391881.0, + "step": 2370 + }, + { + "epoch": 2.9033047735618114, + "grad_norm": 0.2009812593460083, + "learning_rate": 1.8140589569161e-06, + "loss": 0.217, + "num_tokens": 700991307.0, + "step": 2372 + }, + { + "epoch": 2.905752753977968, + "grad_norm": 0.19046810269355774, + "learning_rate": 1.7687074829931975e-06, + "loss": 0.2067, + "num_tokens": 701591252.0, + "step": 2374 + }, + { + "epoch": 2.908200734394125, + "grad_norm": 0.2012416422367096, + "learning_rate": 1.7233560090702947e-06, + "loss": 0.2143, + "num_tokens": 702140881.0, + "step": 2376 + }, + { + "epoch": 2.9106487148102813, + "grad_norm": 0.19575682282447815, + "learning_rate": 1.6780045351473923e-06, + "loss": 0.207, + "num_tokens": 702722928.0, + "step": 2378 + }, + { + "epoch": 2.913096695226438, + "grad_norm": 0.1994391679763794, + "learning_rate": 1.63265306122449e-06, + "loss": 0.2096, + "num_tokens": 703321076.0, + "step": 2380 + }, + { + "epoch": 2.915544675642595, + "grad_norm": 0.2001073956489563, + "learning_rate": 1.5873015873015873e-06, + "loss": 0.2086, + "num_tokens": 703916558.0, + "step": 2382 + }, + { + "epoch": 2.9179926560587517, + "grad_norm": 0.21320343017578125, + "learning_rate": 1.541950113378685e-06, + "loss": 0.2131, + "num_tokens": 704510059.0, + "step": 2384 + }, + { + "epoch": 2.920440636474908, + "grad_norm": 0.19521069526672363, + "learning_rate": 1.4965986394557823e-06, + "loss": 0.2055, + "num_tokens": 705096763.0, + "step": 2386 + }, + { + "epoch": 2.922888616891065, + "grad_norm": 0.21753311157226562, + "learning_rate": 1.45124716553288e-06, + "loss": 0.2067, + "num_tokens": 705671117.0, + "step": 2388 + }, + { + "epoch": 2.9253365973072216, + "grad_norm": 0.19942820072174072, + "learning_rate": 1.4058956916099773e-06, + "loss": 0.2101, + "num_tokens": 706250003.0, + "step": 2390 + }, + { + "epoch": 2.9277845777233784, + "grad_norm": 0.20289285480976105, + "learning_rate": 1.360544217687075e-06, + "loss": 0.2115, + "num_tokens": 706818958.0, + "step": 2392 + }, + { + "epoch": 2.9302325581395348, + "grad_norm": 0.19491668045520782, + "learning_rate": 1.3151927437641725e-06, + "loss": 0.2128, + "num_tokens": 707430938.0, + "step": 2394 + }, + { + "epoch": 2.9326805385556916, + "grad_norm": 0.20326243340969086, + "learning_rate": 1.26984126984127e-06, + "loss": 0.2133, + "num_tokens": 708019038.0, + "step": 2396 + }, + { + "epoch": 2.9351285189718483, + "grad_norm": 0.1958457976579666, + "learning_rate": 1.2244897959183673e-06, + "loss": 0.2122, + "num_tokens": 708616258.0, + "step": 2398 + }, + { + "epoch": 2.9375764993880047, + "grad_norm": 0.19866766035556793, + "learning_rate": 1.179138321995465e-06, + "loss": 0.2154, + "num_tokens": 709209493.0, + "step": 2400 + }, + { + "epoch": 2.9400244798041615, + "grad_norm": 0.21173644065856934, + "learning_rate": 1.1337868480725623e-06, + "loss": 0.2125, + "num_tokens": 709825912.0, + "step": 2402 + }, + { + "epoch": 2.9424724602203183, + "grad_norm": 0.1950209140777588, + "learning_rate": 1.0884353741496597e-06, + "loss": 0.2179, + "num_tokens": 710452243.0, + "step": 2404 + }, + { + "epoch": 2.944920440636475, + "grad_norm": 0.2072354406118393, + "learning_rate": 1.0430839002267576e-06, + "loss": 0.2155, + "num_tokens": 711020995.0, + "step": 2406 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 0.19998763501644135, + "learning_rate": 9.97732426303855e-07, + "loss": 0.2072, + "num_tokens": 711603881.0, + "step": 2408 + }, + { + "epoch": 2.949816401468788, + "grad_norm": 0.1954042762517929, + "learning_rate": 9.523809523809526e-07, + "loss": 0.2174, + "num_tokens": 712214187.0, + "step": 2410 + }, + { + "epoch": 2.952264381884945, + "grad_norm": 0.20569762587547302, + "learning_rate": 9.0702947845805e-07, + "loss": 0.2022, + "num_tokens": 712774292.0, + "step": 2412 + }, + { + "epoch": 2.954712362301102, + "grad_norm": 0.2011638879776001, + "learning_rate": 8.616780045351474e-07, + "loss": 0.202, + "num_tokens": 713345613.0, + "step": 2414 + }, + { + "epoch": 2.957160342717258, + "grad_norm": 0.2028873711824417, + "learning_rate": 8.16326530612245e-07, + "loss": 0.2104, + "num_tokens": 713915408.0, + "step": 2416 + }, + { + "epoch": 2.959608323133415, + "grad_norm": 0.2281429022550583, + "learning_rate": 7.709750566893425e-07, + "loss": 0.2241, + "num_tokens": 714511809.0, + "step": 2418 + }, + { + "epoch": 2.9620563035495717, + "grad_norm": 0.217108353972435, + "learning_rate": 7.2562358276644e-07, + "loss": 0.2096, + "num_tokens": 715096370.0, + "step": 2420 + }, + { + "epoch": 2.964504283965728, + "grad_norm": 0.20696958899497986, + "learning_rate": 6.802721088435375e-07, + "loss": 0.2095, + "num_tokens": 715679361.0, + "step": 2422 + }, + { + "epoch": 2.966952264381885, + "grad_norm": 0.2019839584827423, + "learning_rate": 6.34920634920635e-07, + "loss": 0.2076, + "num_tokens": 716270441.0, + "step": 2424 + }, + { + "epoch": 2.9694002447980417, + "grad_norm": 0.19319204986095428, + "learning_rate": 5.895691609977325e-07, + "loss": 0.2135, + "num_tokens": 716868065.0, + "step": 2426 + }, + { + "epoch": 2.9718482252141984, + "grad_norm": 0.1938907951116562, + "learning_rate": 5.442176870748299e-07, + "loss": 0.2145, + "num_tokens": 717508169.0, + "step": 2428 + }, + { + "epoch": 2.974296205630355, + "grad_norm": 0.19258098304271698, + "learning_rate": 4.988662131519275e-07, + "loss": 0.2199, + "num_tokens": 718121648.0, + "step": 2430 + }, + { + "epoch": 2.9767441860465116, + "grad_norm": 0.19628973305225372, + "learning_rate": 4.53514739229025e-07, + "loss": 0.2119, + "num_tokens": 718759489.0, + "step": 2432 + }, + { + "epoch": 2.9791921664626684, + "grad_norm": 0.20656625926494598, + "learning_rate": 4.081632653061225e-07, + "loss": 0.2138, + "num_tokens": 719344050.0, + "step": 2434 + }, + { + "epoch": 2.981640146878825, + "grad_norm": 0.19949139654636383, + "learning_rate": 3.6281179138322e-07, + "loss": 0.2148, + "num_tokens": 719938626.0, + "step": 2436 + }, + { + "epoch": 2.9840881272949815, + "grad_norm": 0.20050382614135742, + "learning_rate": 3.174603174603175e-07, + "loss": 0.2114, + "num_tokens": 720512657.0, + "step": 2438 + }, + { + "epoch": 2.9865361077111383, + "grad_norm": 0.19941125810146332, + "learning_rate": 2.7210884353741493e-07, + "loss": 0.2099, + "num_tokens": 721085528.0, + "step": 2440 + }, + { + "epoch": 2.988984088127295, + "grad_norm": 0.1979449987411499, + "learning_rate": 2.267573696145125e-07, + "loss": 0.2111, + "num_tokens": 721698658.0, + "step": 2442 + }, + { + "epoch": 2.9914320685434515, + "grad_norm": 0.19296221435070038, + "learning_rate": 1.8140589569161e-07, + "loss": 0.205, + "num_tokens": 722290392.0, + "step": 2444 + }, + { + "epoch": 2.9938800489596082, + "grad_norm": 0.20509815216064453, + "learning_rate": 1.3605442176870747e-07, + "loss": 0.2113, + "num_tokens": 722882715.0, + "step": 2446 + }, + { + "epoch": 2.996328029375765, + "grad_norm": 0.19921015202999115, + "learning_rate": 9.0702947845805e-08, + "loss": 0.2103, + "num_tokens": 723485991.0, + "step": 2448 + }, + { + "epoch": 2.998776009791922, + "grad_norm": 0.19426876306533813, + "learning_rate": 4.53514739229025e-08, + "loss": 0.2182, + "num_tokens": 724088652.0, + "step": 2450 + }, + { + "epoch": 3.0, + "num_tokens": 724392322.0, + "step": 2451, + "total_flos": 2.993584104037312e+19, + "train_loss": 0.3322167656510181, + "train_runtime": 19094.4225, + "train_samples_per_second": 14.368, + "train_steps_per_second": 0.128 + } + ], + "logging_steps": 2, + "max_steps": 2451, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 123, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.993584104037312e+19, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}