{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9836065573770494, "eval_steps": 500, "global_step": 426, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00702576112412178, "grad_norm": 0.18053510785102844, "learning_rate": 2.3255813953488374e-07, "loss": 0.7704, "step": 1 }, { "epoch": 0.01405152224824356, "grad_norm": 0.18344072997570038, "learning_rate": 4.651162790697675e-07, "loss": 0.7172, "step": 2 }, { "epoch": 0.02107728337236534, "grad_norm": 0.17655028402805328, "learning_rate": 6.976744186046513e-07, "loss": 0.7562, "step": 3 }, { "epoch": 0.02810304449648712, "grad_norm": 0.1985616534948349, "learning_rate": 9.30232558139535e-07, "loss": 0.8166, "step": 4 }, { "epoch": 0.0351288056206089, "grad_norm": 0.14133895933628082, "learning_rate": 1.1627906976744188e-06, "loss": 0.668, "step": 5 }, { "epoch": 0.04215456674473068, "grad_norm": 0.17750921845436096, "learning_rate": 1.3953488372093025e-06, "loss": 0.7862, "step": 6 }, { "epoch": 0.04918032786885246, "grad_norm": 0.1687806397676468, "learning_rate": 1.6279069767441862e-06, "loss": 0.7697, "step": 7 }, { "epoch": 0.05620608899297424, "grad_norm": 0.14865685999393463, "learning_rate": 1.86046511627907e-06, "loss": 0.6865, "step": 8 }, { "epoch": 0.06323185011709602, "grad_norm": 0.15419331192970276, "learning_rate": 2.0930232558139536e-06, "loss": 0.7145, "step": 9 }, { "epoch": 0.0702576112412178, "grad_norm": 0.1144702211022377, "learning_rate": 2.3255813953488376e-06, "loss": 0.6761, "step": 10 }, { "epoch": 0.07728337236533958, "grad_norm": 0.12051534652709961, "learning_rate": 2.558139534883721e-06, "loss": 0.7301, "step": 11 }, { "epoch": 0.08430913348946135, "grad_norm": 0.11249984055757523, "learning_rate": 2.790697674418605e-06, "loss": 0.7126, "step": 12 }, { "epoch": 0.09133489461358314, "grad_norm": 0.11579480022192001, "learning_rate": 3.0232558139534885e-06, "loss": 0.6869, "step": 13 }, { "epoch": 0.09836065573770492, "grad_norm": 0.08192656189203262, "learning_rate": 3.2558139534883724e-06, "loss": 0.6698, "step": 14 }, { "epoch": 0.1053864168618267, "grad_norm": 0.08397164940834045, "learning_rate": 3.4883720930232564e-06, "loss": 0.6257, "step": 15 }, { "epoch": 0.11241217798594848, "grad_norm": 0.07774341851472855, "learning_rate": 3.72093023255814e-06, "loss": 0.5955, "step": 16 }, { "epoch": 0.11943793911007025, "grad_norm": 0.08383660763502121, "learning_rate": 3.953488372093024e-06, "loss": 0.5698, "step": 17 }, { "epoch": 0.12646370023419204, "grad_norm": 0.08330123126506805, "learning_rate": 4.186046511627907e-06, "loss": 0.6513, "step": 18 }, { "epoch": 0.13348946135831383, "grad_norm": 0.07081824541091919, "learning_rate": 4.418604651162791e-06, "loss": 0.5878, "step": 19 }, { "epoch": 0.1405152224824356, "grad_norm": 0.09581199288368225, "learning_rate": 4.651162790697675e-06, "loss": 0.6923, "step": 20 }, { "epoch": 0.14754098360655737, "grad_norm": 0.06888371706008911, "learning_rate": 4.883720930232559e-06, "loss": 0.5683, "step": 21 }, { "epoch": 0.15456674473067916, "grad_norm": 0.07578740268945694, "learning_rate": 5.116279069767442e-06, "loss": 0.5844, "step": 22 }, { "epoch": 0.16159250585480095, "grad_norm": 0.0799824595451355, "learning_rate": 5.348837209302326e-06, "loss": 0.6688, "step": 23 }, { "epoch": 0.1686182669789227, "grad_norm": 0.06833522021770477, "learning_rate": 5.58139534883721e-06, "loss": 0.5623, "step": 24 }, { "epoch": 0.1756440281030445, "grad_norm": 0.07280978560447693, "learning_rate": 5.8139534883720935e-06, "loss": 0.6319, "step": 25 }, { "epoch": 0.18266978922716628, "grad_norm": 0.06204582750797272, "learning_rate": 6.046511627906977e-06, "loss": 0.563, "step": 26 }, { "epoch": 0.18969555035128804, "grad_norm": 0.06134812906384468, "learning_rate": 6.279069767441861e-06, "loss": 0.5847, "step": 27 }, { "epoch": 0.19672131147540983, "grad_norm": 0.05748824402689934, "learning_rate": 6.511627906976745e-06, "loss": 0.5311, "step": 28 }, { "epoch": 0.20374707259953162, "grad_norm": 0.06215423345565796, "learning_rate": 6.744186046511628e-06, "loss": 0.5514, "step": 29 }, { "epoch": 0.2107728337236534, "grad_norm": 0.06185278296470642, "learning_rate": 6.976744186046513e-06, "loss": 0.5563, "step": 30 }, { "epoch": 0.21779859484777517, "grad_norm": 0.05966101959347725, "learning_rate": 7.209302325581395e-06, "loss": 0.5758, "step": 31 }, { "epoch": 0.22482435597189696, "grad_norm": 0.06163804978132248, "learning_rate": 7.44186046511628e-06, "loss": 0.5063, "step": 32 }, { "epoch": 0.23185011709601874, "grad_norm": 0.06333377212285995, "learning_rate": 7.674418604651164e-06, "loss": 0.5583, "step": 33 }, { "epoch": 0.2388758782201405, "grad_norm": 0.07251594960689545, "learning_rate": 7.906976744186048e-06, "loss": 0.5886, "step": 34 }, { "epoch": 0.2459016393442623, "grad_norm": 0.0646405890583992, "learning_rate": 8.139534883720931e-06, "loss": 0.5421, "step": 35 }, { "epoch": 0.2529274004683841, "grad_norm": 0.08668368309736252, "learning_rate": 8.372093023255815e-06, "loss": 0.5354, "step": 36 }, { "epoch": 0.25995316159250587, "grad_norm": 0.06333053857088089, "learning_rate": 8.604651162790698e-06, "loss": 0.5425, "step": 37 }, { "epoch": 0.26697892271662765, "grad_norm": 0.053554244339466095, "learning_rate": 8.837209302325582e-06, "loss": 0.4706, "step": 38 }, { "epoch": 0.27400468384074944, "grad_norm": 0.05727384239435196, "learning_rate": 9.069767441860465e-06, "loss": 0.4781, "step": 39 }, { "epoch": 0.2810304449648712, "grad_norm": 0.05613735318183899, "learning_rate": 9.30232558139535e-06, "loss": 0.4938, "step": 40 }, { "epoch": 0.28805620608899296, "grad_norm": 0.05665242299437523, "learning_rate": 9.534883720930234e-06, "loss": 0.516, "step": 41 }, { "epoch": 0.29508196721311475, "grad_norm": 0.06407799571752548, "learning_rate": 9.767441860465117e-06, "loss": 0.5219, "step": 42 }, { "epoch": 0.30210772833723654, "grad_norm": 0.059850167483091354, "learning_rate": 1e-05, "loss": 0.5281, "step": 43 }, { "epoch": 0.3091334894613583, "grad_norm": 0.058536890894174576, "learning_rate": 9.99983179466314e-06, "loss": 0.4646, "step": 44 }, { "epoch": 0.3161592505854801, "grad_norm": 0.053681179881095886, "learning_rate": 9.999327189969768e-06, "loss": 0.4662, "step": 45 }, { "epoch": 0.3231850117096019, "grad_norm": 0.05520308017730713, "learning_rate": 9.998486219870769e-06, "loss": 0.4972, "step": 46 }, { "epoch": 0.33021077283372363, "grad_norm": 0.06100783869624138, "learning_rate": 9.997308940948405e-06, "loss": 0.5449, "step": 47 }, { "epoch": 0.3372365339578454, "grad_norm": 0.05527447536587715, "learning_rate": 9.995795432412513e-06, "loss": 0.517, "step": 48 }, { "epoch": 0.3442622950819672, "grad_norm": 0.06016694754362106, "learning_rate": 9.993945796095183e-06, "loss": 0.4996, "step": 49 }, { "epoch": 0.351288056206089, "grad_norm": 0.061447110027074814, "learning_rate": 9.991760156443892e-06, "loss": 0.4678, "step": 50 }, { "epoch": 0.3583138173302108, "grad_norm": 0.05987882614135742, "learning_rate": 9.989238660513141e-06, "loss": 0.5699, "step": 51 }, { "epoch": 0.36533957845433257, "grad_norm": 0.05668003484606743, "learning_rate": 9.98638147795456e-06, "loss": 0.505, "step": 52 }, { "epoch": 0.37236533957845436, "grad_norm": 0.05434773117303848, "learning_rate": 9.983188801005492e-06, "loss": 0.4742, "step": 53 }, { "epoch": 0.3793911007025761, "grad_norm": 0.05131086707115173, "learning_rate": 9.979660844476056e-06, "loss": 0.5096, "step": 54 }, { "epoch": 0.3864168618266979, "grad_norm": 0.058555662631988525, "learning_rate": 9.975797845734699e-06, "loss": 0.5277, "step": 55 }, { "epoch": 0.39344262295081966, "grad_norm": 0.0570974200963974, "learning_rate": 9.971600064692222e-06, "loss": 0.4801, "step": 56 }, { "epoch": 0.40046838407494145, "grad_norm": 0.056893255561590195, "learning_rate": 9.967067783784297e-06, "loss": 0.4857, "step": 57 }, { "epoch": 0.40749414519906324, "grad_norm": 0.05847252905368805, "learning_rate": 9.962201307952455e-06, "loss": 0.5099, "step": 58 }, { "epoch": 0.41451990632318503, "grad_norm": 0.06154610961675644, "learning_rate": 9.957000964623585e-06, "loss": 0.5099, "step": 59 }, { "epoch": 0.4215456674473068, "grad_norm": 0.05544036626815796, "learning_rate": 9.951467103687879e-06, "loss": 0.4713, "step": 60 }, { "epoch": 0.42857142857142855, "grad_norm": 0.04792679101228714, "learning_rate": 9.945600097475322e-06, "loss": 0.4576, "step": 61 }, { "epoch": 0.43559718969555034, "grad_norm": 0.05695069581270218, "learning_rate": 9.939400340730611e-06, "loss": 0.5169, "step": 62 }, { "epoch": 0.4426229508196721, "grad_norm": 0.049334120005369186, "learning_rate": 9.932868250586619e-06, "loss": 0.4828, "step": 63 }, { "epoch": 0.4496487119437939, "grad_norm": 0.045351915061473846, "learning_rate": 9.926004266536314e-06, "loss": 0.4305, "step": 64 }, { "epoch": 0.4566744730679157, "grad_norm": 0.049619413912296295, "learning_rate": 9.918808850403192e-06, "loss": 0.4892, "step": 65 }, { "epoch": 0.4637002341920375, "grad_norm": 0.05420510098338127, "learning_rate": 9.911282486310214e-06, "loss": 0.4722, "step": 66 }, { "epoch": 0.4707259953161593, "grad_norm": 0.06276430934667587, "learning_rate": 9.903425680647225e-06, "loss": 0.5461, "step": 67 }, { "epoch": 0.477751756440281, "grad_norm": 0.06031210348010063, "learning_rate": 9.895238962036878e-06, "loss": 0.426, "step": 68 }, { "epoch": 0.4847775175644028, "grad_norm": 0.05641568452119827, "learning_rate": 9.88672288129908e-06, "loss": 0.4954, "step": 69 }, { "epoch": 0.4918032786885246, "grad_norm": 0.05691906809806824, "learning_rate": 9.877878011413924e-06, "loss": 0.4511, "step": 70 }, { "epoch": 0.49882903981264637, "grad_norm": 0.06182294711470604, "learning_rate": 9.868704947483134e-06, "loss": 0.5105, "step": 71 }, { "epoch": 0.5058548009367682, "grad_norm": 0.053402017802000046, "learning_rate": 9.859204306690038e-06, "loss": 0.4792, "step": 72 }, { "epoch": 0.5128805620608899, "grad_norm": 0.08146236836910248, "learning_rate": 9.849376728258024e-06, "loss": 0.496, "step": 73 }, { "epoch": 0.5199063231850117, "grad_norm": 0.051524728536605835, "learning_rate": 9.839222873407553e-06, "loss": 0.4674, "step": 74 }, { "epoch": 0.5269320843091335, "grad_norm": 0.059568360447883606, "learning_rate": 9.828743425311654e-06, "loss": 0.4675, "step": 75 }, { "epoch": 0.5339578454332553, "grad_norm": 0.05749664828181267, "learning_rate": 9.817939089049964e-06, "loss": 0.449, "step": 76 }, { "epoch": 0.5409836065573771, "grad_norm": 0.051995862275362015, "learning_rate": 9.806810591561295e-06, "loss": 0.4323, "step": 77 }, { "epoch": 0.5480093676814989, "grad_norm": 0.051814232021570206, "learning_rate": 9.795358681594712e-06, "loss": 0.4724, "step": 78 }, { "epoch": 0.5550351288056206, "grad_norm": 0.056114014238119125, "learning_rate": 9.783584129659162e-06, "loss": 0.4983, "step": 79 }, { "epoch": 0.5620608899297423, "grad_norm": 0.06239385902881622, "learning_rate": 9.771487727971642e-06, "loss": 0.4808, "step": 80 }, { "epoch": 0.5690866510538641, "grad_norm": 0.054439231753349304, "learning_rate": 9.759070290403873e-06, "loss": 0.4642, "step": 81 }, { "epoch": 0.5761124121779859, "grad_norm": 0.05044344440102577, "learning_rate": 9.746332652427566e-06, "loss": 0.4654, "step": 82 }, { "epoch": 0.5831381733021077, "grad_norm": 0.06456462293863297, "learning_rate": 9.733275671058195e-06, "loss": 0.5002, "step": 83 }, { "epoch": 0.5901639344262295, "grad_norm": 0.05796905234456062, "learning_rate": 9.71990022479734e-06, "loss": 0.4735, "step": 84 }, { "epoch": 0.5971896955503513, "grad_norm": 0.055456362664699554, "learning_rate": 9.70620721357358e-06, "loss": 0.4612, "step": 85 }, { "epoch": 0.6042154566744731, "grad_norm": 0.07052718102931976, "learning_rate": 9.69219755868194e-06, "loss": 0.4645, "step": 86 }, { "epoch": 0.6112412177985949, "grad_norm": 0.057100776582956314, "learning_rate": 9.677872202721906e-06, "loss": 0.4949, "step": 87 }, { "epoch": 0.6182669789227166, "grad_norm": 0.056776516139507294, "learning_rate": 9.663232109534011e-06, "loss": 0.5361, "step": 88 }, { "epoch": 0.6252927400468384, "grad_norm": 0.05490226671099663, "learning_rate": 9.648278264134977e-06, "loss": 0.441, "step": 89 }, { "epoch": 0.6323185011709602, "grad_norm": 0.0555921345949173, "learning_rate": 9.633011672651443e-06, "loss": 0.4116, "step": 90 }, { "epoch": 0.639344262295082, "grad_norm": 0.053403131663799286, "learning_rate": 9.617433362252277e-06, "loss": 0.482, "step": 91 }, { "epoch": 0.6463700234192038, "grad_norm": 0.04735114425420761, "learning_rate": 9.601544381079457e-06, "loss": 0.4476, "step": 92 }, { "epoch": 0.6533957845433255, "grad_norm": 0.051030732691287994, "learning_rate": 9.585345798177557e-06, "loss": 0.4997, "step": 93 }, { "epoch": 0.6604215456674473, "grad_norm": 0.054940689355134964, "learning_rate": 9.56883870342181e-06, "loss": 0.4406, "step": 94 }, { "epoch": 0.667447306791569, "grad_norm": 0.06611790508031845, "learning_rate": 9.552024207444794e-06, "loss": 0.4291, "step": 95 }, { "epoch": 0.6744730679156908, "grad_norm": 0.06092395260930061, "learning_rate": 9.534903441561693e-06, "loss": 0.5012, "step": 96 }, { "epoch": 0.6814988290398126, "grad_norm": 0.0560709647834301, "learning_rate": 9.517477557694182e-06, "loss": 0.5025, "step": 97 }, { "epoch": 0.6885245901639344, "grad_norm": 0.06219569593667984, "learning_rate": 9.499747728292928e-06, "loss": 0.5078, "step": 98 }, { "epoch": 0.6955503512880562, "grad_norm": 0.05389304459095001, "learning_rate": 9.481715146258699e-06, "loss": 0.4275, "step": 99 }, { "epoch": 0.702576112412178, "grad_norm": 0.055709365755319595, "learning_rate": 9.463381024862116e-06, "loss": 0.5152, "step": 100 }, { "epoch": 0.7096018735362998, "grad_norm": 0.07741156220436096, "learning_rate": 9.444746597662e-06, "loss": 0.504, "step": 101 }, { "epoch": 0.7166276346604216, "grad_norm": 0.049752719700336456, "learning_rate": 9.425813118422393e-06, "loss": 0.4613, "step": 102 }, { "epoch": 0.7236533957845434, "grad_norm": 0.056723710149526596, "learning_rate": 9.406581861028199e-06, "loss": 0.5293, "step": 103 }, { "epoch": 0.7306791569086651, "grad_norm": 0.05891204625368118, "learning_rate": 9.387054119399466e-06, "loss": 0.4229, "step": 104 }, { "epoch": 0.7377049180327869, "grad_norm": 0.053549159318208694, "learning_rate": 9.36723120740434e-06, "loss": 0.4525, "step": 105 }, { "epoch": 0.7447306791569087, "grad_norm": 0.05781884491443634, "learning_rate": 9.347114458770656e-06, "loss": 0.4756, "step": 106 }, { "epoch": 0.7517564402810304, "grad_norm": 0.052228234708309174, "learning_rate": 9.326705226996207e-06, "loss": 0.4577, "step": 107 }, { "epoch": 0.7587822014051522, "grad_norm": 0.05756888911128044, "learning_rate": 9.306004885257675e-06, "loss": 0.4604, "step": 108 }, { "epoch": 0.765807962529274, "grad_norm": 0.07348482310771942, "learning_rate": 9.28501482631824e-06, "loss": 0.4768, "step": 109 }, { "epoch": 0.7728337236533958, "grad_norm": 0.060214102268218994, "learning_rate": 9.26373646243388e-06, "loss": 0.4891, "step": 110 }, { "epoch": 0.7798594847775175, "grad_norm": 0.05229140818119049, "learning_rate": 9.242171225258336e-06, "loss": 0.4719, "step": 111 }, { "epoch": 0.7868852459016393, "grad_norm": 0.053298093378543854, "learning_rate": 9.220320565746806e-06, "loss": 0.4449, "step": 112 }, { "epoch": 0.7939110070257611, "grad_norm": 0.04904168099164963, "learning_rate": 9.198185954058305e-06, "loss": 0.4682, "step": 113 }, { "epoch": 0.8009367681498829, "grad_norm": 0.055706869810819626, "learning_rate": 9.175768879456759e-06, "loss": 0.4522, "step": 114 }, { "epoch": 0.8079625292740047, "grad_norm": 0.057728879153728485, "learning_rate": 9.153070850210803e-06, "loss": 0.4465, "step": 115 }, { "epoch": 0.8149882903981265, "grad_norm": 0.04915739968419075, "learning_rate": 9.130093393492302e-06, "loss": 0.4254, "step": 116 }, { "epoch": 0.8220140515222483, "grad_norm": 0.04908665642142296, "learning_rate": 9.106838055273589e-06, "loss": 0.4492, "step": 117 }, { "epoch": 0.8290398126463701, "grad_norm": 0.05924678593873978, "learning_rate": 9.083306400223465e-06, "loss": 0.4821, "step": 118 }, { "epoch": 0.8360655737704918, "grad_norm": 0.05097891017794609, "learning_rate": 9.059500011601919e-06, "loss": 0.4454, "step": 119 }, { "epoch": 0.8430913348946136, "grad_norm": 0.05687716603279114, "learning_rate": 9.035420491153596e-06, "loss": 0.4329, "step": 120 }, { "epoch": 0.8501170960187353, "grad_norm": 0.0736871287226677, "learning_rate": 9.011069459000035e-06, "loss": 0.4685, "step": 121 }, { "epoch": 0.8571428571428571, "grad_norm": 0.05752350762486458, "learning_rate": 8.986448553530665e-06, "loss": 0.4282, "step": 122 }, { "epoch": 0.8641686182669789, "grad_norm": 0.05238118767738342, "learning_rate": 8.961559431292562e-06, "loss": 0.4381, "step": 123 }, { "epoch": 0.8711943793911007, "grad_norm": 0.055724114179611206, "learning_rate": 8.936403766879003e-06, "loss": 0.4239, "step": 124 }, { "epoch": 0.8782201405152225, "grad_norm": 0.05202113091945648, "learning_rate": 8.910983252816794e-06, "loss": 0.4127, "step": 125 }, { "epoch": 0.8852459016393442, "grad_norm": 0.05094996467232704, "learning_rate": 8.885299599452381e-06, "loss": 0.447, "step": 126 }, { "epoch": 0.892271662763466, "grad_norm": 0.060567859560251236, "learning_rate": 8.859354534836797e-06, "loss": 0.4433, "step": 127 }, { "epoch": 0.8992974238875878, "grad_norm": 0.05051707103848457, "learning_rate": 8.833149804609372e-06, "loss": 0.409, "step": 128 }, { "epoch": 0.9063231850117096, "grad_norm": 0.05201994627714157, "learning_rate": 8.806687171880298e-06, "loss": 0.4511, "step": 129 }, { "epoch": 0.9133489461358314, "grad_norm": 0.049249548465013504, "learning_rate": 8.779968417111991e-06, "loss": 0.4365, "step": 130 }, { "epoch": 0.9203747072599532, "grad_norm": 0.05699831247329712, "learning_rate": 8.752995337999316e-06, "loss": 0.5187, "step": 131 }, { "epoch": 0.927400468384075, "grad_norm": 0.05579648166894913, "learning_rate": 8.725769749348612e-06, "loss": 0.4587, "step": 132 }, { "epoch": 0.9344262295081968, "grad_norm": 0.05366150662302971, "learning_rate": 8.698293482955605e-06, "loss": 0.4599, "step": 133 }, { "epoch": 0.9414519906323185, "grad_norm": 0.05521006882190704, "learning_rate": 8.670568387482153e-06, "loss": 0.4634, "step": 134 }, { "epoch": 0.9484777517564403, "grad_norm": 0.058564692735672, "learning_rate": 8.642596328331864e-06, "loss": 0.4546, "step": 135 }, { "epoch": 0.955503512880562, "grad_norm": 0.22925680875778198, "learning_rate": 8.614379187524593e-06, "loss": 0.4392, "step": 136 }, { "epoch": 0.9625292740046838, "grad_norm": 0.05385170876979828, "learning_rate": 8.585918863569806e-06, "loss": 0.4316, "step": 137 }, { "epoch": 0.9695550351288056, "grad_norm": 0.05049740523099899, "learning_rate": 8.55721727133886e-06, "loss": 0.4467, "step": 138 }, { "epoch": 0.9765807962529274, "grad_norm": 0.0547189861536026, "learning_rate": 8.528276341936146e-06, "loss": 0.4707, "step": 139 }, { "epoch": 0.9836065573770492, "grad_norm": 0.057287830859422684, "learning_rate": 8.499098022569177e-06, "loss": 0.4413, "step": 140 }, { "epoch": 0.990632318501171, "grad_norm": 0.05672721937298775, "learning_rate": 8.469684276417568e-06, "loss": 0.485, "step": 141 }, { "epoch": 0.9976580796252927, "grad_norm": 0.05014313757419586, "learning_rate": 8.440037082500953e-06, "loss": 0.441, "step": 142 }, { "epoch": 1.0, "grad_norm": 0.05014313757419586, "learning_rate": 8.410158435545825e-06, "loss": 0.4919, "step": 143 }, { "epoch": 1.0070257611241218, "grad_norm": 0.09903538227081299, "learning_rate": 8.380050345851338e-06, "loss": 0.3345, "step": 144 }, { "epoch": 1.0140515222482436, "grad_norm": 0.04598068818449974, "learning_rate": 8.349714839154035e-06, "loss": 0.3456, "step": 145 }, { "epoch": 1.0210772833723654, "grad_norm": 0.05103905871510506, "learning_rate": 8.319153956491567e-06, "loss": 0.3466, "step": 146 }, { "epoch": 1.0281030444964872, "grad_norm": 0.050713758915662766, "learning_rate": 8.288369754065362e-06, "loss": 0.3337, "step": 147 }, { "epoch": 1.035128805620609, "grad_norm": 0.04552711173892021, "learning_rate": 8.257364303102275e-06, "loss": 0.3678, "step": 148 }, { "epoch": 1.0421545667447307, "grad_norm": 0.05452848598361015, "learning_rate": 8.226139689715233e-06, "loss": 0.3589, "step": 149 }, { "epoch": 1.0491803278688525, "grad_norm": 0.048317231237888336, "learning_rate": 8.19469801476288e-06, "loss": 0.3848, "step": 150 }, { "epoch": 1.0562060889929743, "grad_norm": 0.04934811219573021, "learning_rate": 8.16304139370823e-06, "loss": 0.3017, "step": 151 }, { "epoch": 1.063231850117096, "grad_norm": 0.04967602342367172, "learning_rate": 8.131171956476328e-06, "loss": 0.3647, "step": 152 }, { "epoch": 1.0702576112412179, "grad_norm": 0.048318326473236084, "learning_rate": 8.09909184731094e-06, "loss": 0.3386, "step": 153 }, { "epoch": 1.0772833723653397, "grad_norm": 0.06288264691829681, "learning_rate": 8.066803224630295e-06, "loss": 0.334, "step": 154 }, { "epoch": 1.0843091334894615, "grad_norm": 0.05197209119796753, "learning_rate": 8.034308260881854e-06, "loss": 0.3538, "step": 155 }, { "epoch": 1.0913348946135832, "grad_norm": 0.0542137436568737, "learning_rate": 8.00160914239615e-06, "loss": 0.3296, "step": 156 }, { "epoch": 1.098360655737705, "grad_norm": 0.05365780368447304, "learning_rate": 7.968708069239672e-06, "loss": 0.3932, "step": 157 }, { "epoch": 1.1053864168618266, "grad_norm": 0.060223035514354706, "learning_rate": 7.935607255066867e-06, "loss": 0.3679, "step": 158 }, { "epoch": 1.1124121779859484, "grad_norm": 0.05058826506137848, "learning_rate": 7.902308926971166e-06, "loss": 0.3321, "step": 159 }, { "epoch": 1.1194379391100702, "grad_norm": 0.054012175649404526, "learning_rate": 7.868815325335168e-06, "loss": 0.3501, "step": 160 }, { "epoch": 1.126463700234192, "grad_norm": 0.061976704746484756, "learning_rate": 7.835128703679896e-06, "loss": 0.35, "step": 161 }, { "epoch": 1.1334894613583137, "grad_norm": 0.05166299641132355, "learning_rate": 7.801251328513164e-06, "loss": 0.3521, "step": 162 }, { "epoch": 1.1405152224824355, "grad_norm": 0.05794499069452286, "learning_rate": 7.767185479177092e-06, "loss": 0.3318, "step": 163 }, { "epoch": 1.1475409836065573, "grad_norm": 0.05970500782132149, "learning_rate": 7.732933447694748e-06, "loss": 0.3601, "step": 164 }, { "epoch": 1.154566744730679, "grad_norm": 0.04941580072045326, "learning_rate": 7.698497538615928e-06, "loss": 0.321, "step": 165 }, { "epoch": 1.161592505854801, "grad_norm": 0.061307549476623535, "learning_rate": 7.663880068862106e-06, "loss": 0.3103, "step": 166 }, { "epoch": 1.1686182669789227, "grad_norm": 0.055390097200870514, "learning_rate": 7.629083367570547e-06, "loss": 0.3561, "step": 167 }, { "epoch": 1.1756440281030445, "grad_norm": 0.06362055987119675, "learning_rate": 7.594109775937595e-06, "loss": 0.3477, "step": 168 }, { "epoch": 1.1826697892271663, "grad_norm": 0.05179616063833237, "learning_rate": 7.558961647061156e-06, "loss": 0.3534, "step": 169 }, { "epoch": 1.189695550351288, "grad_norm": 0.0547102652490139, "learning_rate": 7.5236413457823745e-06, "loss": 0.3023, "step": 170 }, { "epoch": 1.1967213114754098, "grad_norm": 0.0540938526391983, "learning_rate": 7.488151248526518e-06, "loss": 0.3144, "step": 171 }, { "epoch": 1.2037470725995316, "grad_norm": 0.04804334416985512, "learning_rate": 7.452493743143092e-06, "loss": 0.317, "step": 172 }, { "epoch": 1.2107728337236534, "grad_norm": 0.05935240909457207, "learning_rate": 7.416671228745181e-06, "loss": 0.3801, "step": 173 }, { "epoch": 1.2177985948477752, "grad_norm": 0.05715787410736084, "learning_rate": 7.380686115548024e-06, "loss": 0.3286, "step": 174 }, { "epoch": 1.224824355971897, "grad_norm": 0.05142643675208092, "learning_rate": 7.344540824706855e-06, "loss": 0.3351, "step": 175 }, { "epoch": 1.2318501170960188, "grad_norm": 0.0654173269867897, "learning_rate": 7.3082377881540025e-06, "loss": 0.344, "step": 176 }, { "epoch": 1.2388758782201406, "grad_norm": 0.05274007469415665, "learning_rate": 7.271779448435265e-06, "loss": 0.3427, "step": 177 }, { "epoch": 1.2459016393442623, "grad_norm": 0.05858859047293663, "learning_rate": 7.235168258545569e-06, "loss": 0.3477, "step": 178 }, { "epoch": 1.2529274004683841, "grad_norm": 0.05919253081083298, "learning_rate": 7.198406681763925e-06, "loss": 0.348, "step": 179 }, { "epoch": 1.259953161592506, "grad_norm": 0.05598839744925499, "learning_rate": 7.161497191487693e-06, "loss": 0.3529, "step": 180 }, { "epoch": 1.2669789227166277, "grad_norm": 0.04682192578911781, "learning_rate": 7.124442271066174e-06, "loss": 0.3346, "step": 181 }, { "epoch": 1.2740046838407495, "grad_norm": 0.05666663125157356, "learning_rate": 7.087244413633516e-06, "loss": 0.3372, "step": 182 }, { "epoch": 1.281030444964871, "grad_norm": 0.05192001909017563, "learning_rate": 7.049906121940974e-06, "loss": 0.3495, "step": 183 }, { "epoch": 1.288056206088993, "grad_norm": 0.056901462376117706, "learning_rate": 7.012429908188523e-06, "loss": 0.3189, "step": 184 }, { "epoch": 1.2950819672131146, "grad_norm": 0.05573394522070885, "learning_rate": 6.9748182938558225e-06, "loss": 0.3074, "step": 185 }, { "epoch": 1.3021077283372366, "grad_norm": 0.051766857504844666, "learning_rate": 6.937073809532581e-06, "loss": 0.3801, "step": 186 }, { "epoch": 1.3091334894613582, "grad_norm": 0.0547168143093586, "learning_rate": 6.899198994748274e-06, "loss": 0.3197, "step": 187 }, { "epoch": 1.3161592505854802, "grad_norm": 0.055122897028923035, "learning_rate": 6.861196397801297e-06, "loss": 0.2924, "step": 188 }, { "epoch": 1.3231850117096018, "grad_norm": 0.04852576553821564, "learning_rate": 6.823068575587496e-06, "loss": 0.3215, "step": 189 }, { "epoch": 1.3302107728337236, "grad_norm": 0.04942547157406807, "learning_rate": 6.784818093428144e-06, "loss": 0.31, "step": 190 }, { "epoch": 1.3372365339578454, "grad_norm": 0.05302649736404419, "learning_rate": 6.746447524897335e-06, "loss": 0.3386, "step": 191 }, { "epoch": 1.3442622950819672, "grad_norm": 0.04981255903840065, "learning_rate": 6.70795945164883e-06, "loss": 0.3429, "step": 192 }, { "epoch": 1.351288056206089, "grad_norm": 0.061598002910614014, "learning_rate": 6.6693564632423626e-06, "loss": 0.3444, "step": 193 }, { "epoch": 1.3583138173302107, "grad_norm": 0.06926045566797256, "learning_rate": 6.630641156969397e-06, "loss": 0.3119, "step": 194 }, { "epoch": 1.3653395784543325, "grad_norm": 0.05165793001651764, "learning_rate": 6.591816137678388e-06, "loss": 0.3309, "step": 195 }, { "epoch": 1.3723653395784543, "grad_norm": 0.05770070105791092, "learning_rate": 6.552884017599517e-06, "loss": 0.3527, "step": 196 }, { "epoch": 1.379391100702576, "grad_norm": 0.06011437624692917, "learning_rate": 6.513847416168929e-06, "loss": 0.3574, "step": 197 }, { "epoch": 1.3864168618266979, "grad_norm": 0.05262259393930435, "learning_rate": 6.474708959852504e-06, "loss": 0.2922, "step": 198 }, { "epoch": 1.3934426229508197, "grad_norm": 0.05124557390809059, "learning_rate": 6.435471281969133e-06, "loss": 0.3053, "step": 199 }, { "epoch": 1.4004683840749415, "grad_norm": 0.054529063403606415, "learning_rate": 6.396137022513545e-06, "loss": 0.3311, "step": 200 }, { "epoch": 1.4074941451990632, "grad_norm": 0.05387987568974495, "learning_rate": 6.3567088279786885e-06, "loss": 0.3508, "step": 201 }, { "epoch": 1.414519906323185, "grad_norm": 0.052831970155239105, "learning_rate": 6.317189351177657e-06, "loss": 0.3425, "step": 202 }, { "epoch": 1.4215456674473068, "grad_norm": 0.0602416917681694, "learning_rate": 6.277581251065217e-06, "loss": 0.3448, "step": 203 }, { "epoch": 1.4285714285714286, "grad_norm": 0.04873733967542648, "learning_rate": 6.237887192558894e-06, "loss": 0.361, "step": 204 }, { "epoch": 1.4355971896955504, "grad_norm": 0.05855415388941765, "learning_rate": 6.198109846359682e-06, "loss": 0.3324, "step": 205 }, { "epoch": 1.4426229508196722, "grad_norm": 0.05669476091861725, "learning_rate": 6.15825188877235e-06, "loss": 0.3892, "step": 206 }, { "epoch": 1.449648711943794, "grad_norm": 0.0551174134016037, "learning_rate": 6.118316001525368e-06, "loss": 0.3579, "step": 207 }, { "epoch": 1.4566744730679158, "grad_norm": 0.05648890882730484, "learning_rate": 6.078304871590485e-06, "loss": 0.3183, "step": 208 }, { "epoch": 1.4637002341920375, "grad_norm": 0.05304768308997154, "learning_rate": 6.038221191001935e-06, "loss": 0.3346, "step": 209 }, { "epoch": 1.4707259953161593, "grad_norm": 0.04858344793319702, "learning_rate": 5.998067656675318e-06, "loss": 0.3545, "step": 210 }, { "epoch": 1.4777517564402811, "grad_norm": 0.060655366629362106, "learning_rate": 5.95784697022614e-06, "loss": 0.3321, "step": 211 }, { "epoch": 1.4847775175644027, "grad_norm": 0.053603459149599075, "learning_rate": 5.917561837788046e-06, "loss": 0.3308, "step": 212 }, { "epoch": 1.4918032786885247, "grad_norm": 0.04807604104280472, "learning_rate": 5.877214969830746e-06, "loss": 0.2965, "step": 213 }, { "epoch": 1.4988290398126463, "grad_norm": 0.05581279471516609, "learning_rate": 5.836809080977644e-06, "loss": 0.3358, "step": 214 }, { "epoch": 1.5058548009367683, "grad_norm": 0.054010357707738876, "learning_rate": 5.7963468898232026e-06, "loss": 0.3361, "step": 215 }, { "epoch": 1.5128805620608898, "grad_norm": 0.06058075278997421, "learning_rate": 5.755831118750016e-06, "loss": 0.3657, "step": 216 }, { "epoch": 1.5199063231850118, "grad_norm": 0.05514070391654968, "learning_rate": 5.715264493745652e-06, "loss": 0.3355, "step": 217 }, { "epoch": 1.5269320843091334, "grad_norm": 0.056756455451250076, "learning_rate": 5.6746497442192425e-06, "loss": 0.3322, "step": 218 }, { "epoch": 1.5339578454332554, "grad_norm": 0.051219794899225235, "learning_rate": 5.633989602817837e-06, "loss": 0.314, "step": 219 }, { "epoch": 1.540983606557377, "grad_norm": 0.05676623433828354, "learning_rate": 5.593286805242549e-06, "loss": 0.3442, "step": 220 }, { "epoch": 1.548009367681499, "grad_norm": 0.059845082461833954, "learning_rate": 5.552544090064487e-06, "loss": 0.3441, "step": 221 }, { "epoch": 1.5550351288056206, "grad_norm": 0.05877413973212242, "learning_rate": 5.5117641985405055e-06, "loss": 0.3165, "step": 222 }, { "epoch": 1.5620608899297423, "grad_norm": 0.04743430018424988, "learning_rate": 5.47094987442876e-06, "loss": 0.3253, "step": 223 }, { "epoch": 1.5690866510538641, "grad_norm": 0.05630961060523987, "learning_rate": 5.430103863804107e-06, "loss": 0.3384, "step": 224 }, { "epoch": 1.576112412177986, "grad_norm": 0.05597255006432533, "learning_rate": 5.389228914873334e-06, "loss": 0.3488, "step": 225 }, { "epoch": 1.5831381733021077, "grad_norm": 0.045843351632356644, "learning_rate": 5.348327777790262e-06, "loss": 0.3169, "step": 226 }, { "epoch": 1.5901639344262295, "grad_norm": 0.049752555787563324, "learning_rate": 5.307403204470711e-06, "loss": 0.3393, "step": 227 }, { "epoch": 1.5971896955503513, "grad_norm": 0.059289898723363876, "learning_rate": 5.266457948407336e-06, "loss": 0.3531, "step": 228 }, { "epoch": 1.604215456674473, "grad_norm": 0.05609236657619476, "learning_rate": 5.2254947644843735e-06, "loss": 0.3384, "step": 229 }, { "epoch": 1.6112412177985949, "grad_norm": 0.053165484219789505, "learning_rate": 5.18451640879228e-06, "loss": 0.32, "step": 230 }, { "epoch": 1.6182669789227166, "grad_norm": 0.05502665415406227, "learning_rate": 5.14352563844231e-06, "loss": 0.2832, "step": 231 }, { "epoch": 1.6252927400468384, "grad_norm": 0.04645336791872978, "learning_rate": 5.1025252113809945e-06, "loss": 0.3047, "step": 232 }, { "epoch": 1.6323185011709602, "grad_norm": 0.05163678526878357, "learning_rate": 5.061517886204592e-06, "loss": 0.3143, "step": 233 }, { "epoch": 1.639344262295082, "grad_norm": 0.05668428912758827, "learning_rate": 5.02050642197348e-06, "loss": 0.334, "step": 234 }, { "epoch": 1.6463700234192038, "grad_norm": 0.062112439423799515, "learning_rate": 4.979493578026523e-06, "loss": 0.3259, "step": 235 }, { "epoch": 1.6533957845433256, "grad_norm": 0.050369080156087875, "learning_rate": 4.9384821137954106e-06, "loss": 0.3506, "step": 236 }, { "epoch": 1.6604215456674472, "grad_norm": 0.0596189871430397, "learning_rate": 4.897474788619007e-06, "loss": 0.3211, "step": 237 }, { "epoch": 1.6674473067915692, "grad_norm": 0.05669752135872841, "learning_rate": 4.856474361557692e-06, "loss": 0.3037, "step": 238 }, { "epoch": 1.6744730679156907, "grad_norm": 0.048729464411735535, "learning_rate": 4.815483591207721e-06, "loss": 0.3546, "step": 239 }, { "epoch": 1.6814988290398127, "grad_norm": 0.05216846615076065, "learning_rate": 4.774505235515628e-06, "loss": 0.3619, "step": 240 }, { "epoch": 1.6885245901639343, "grad_norm": 0.05519293621182442, "learning_rate": 4.733542051592665e-06, "loss": 0.3097, "step": 241 }, { "epoch": 1.6955503512880563, "grad_norm": 0.05697103589773178, "learning_rate": 4.69259679552929e-06, "loss": 0.343, "step": 242 }, { "epoch": 1.7025761124121779, "grad_norm": 0.057407498359680176, "learning_rate": 4.651672222209738e-06, "loss": 0.3594, "step": 243 }, { "epoch": 1.7096018735362999, "grad_norm": 0.0562685951590538, "learning_rate": 4.6107710851266695e-06, "loss": 0.3212, "step": 244 }, { "epoch": 1.7166276346604215, "grad_norm": 0.0513756088912487, "learning_rate": 4.5698961361958955e-06, "loss": 0.3164, "step": 245 }, { "epoch": 1.7236533957845435, "grad_norm": 0.05124720558524132, "learning_rate": 4.529050125571241e-06, "loss": 0.3212, "step": 246 }, { "epoch": 1.730679156908665, "grad_norm": 0.052108533680438995, "learning_rate": 4.488235801459495e-06, "loss": 0.3056, "step": 247 }, { "epoch": 1.737704918032787, "grad_norm": 0.058752771466970444, "learning_rate": 4.447455909935513e-06, "loss": 0.3539, "step": 248 }, { "epoch": 1.7447306791569086, "grad_norm": 0.053762953728437424, "learning_rate": 4.4067131947574515e-06, "loss": 0.3157, "step": 249 }, { "epoch": 1.7517564402810304, "grad_norm": 0.04892832413315773, "learning_rate": 4.3660103971821635e-06, "loss": 0.3279, "step": 250 }, { "epoch": 1.7587822014051522, "grad_norm": 0.0599125474691391, "learning_rate": 4.3253502557807575e-06, "loss": 0.3255, "step": 251 }, { "epoch": 1.765807962529274, "grad_norm": 0.05481419712305069, "learning_rate": 4.28473550625435e-06, "loss": 0.3502, "step": 252 }, { "epoch": 1.7728337236533958, "grad_norm": 0.05331256613135338, "learning_rate": 4.244168881249986e-06, "loss": 0.3565, "step": 253 }, { "epoch": 1.7798594847775175, "grad_norm": 0.053473543375730515, "learning_rate": 4.203653110176798e-06, "loss": 0.2904, "step": 254 }, { "epoch": 1.7868852459016393, "grad_norm": 0.053232353180646896, "learning_rate": 4.163190919022357e-06, "loss": 0.321, "step": 255 }, { "epoch": 1.7939110070257611, "grad_norm": 0.05323438718914986, "learning_rate": 4.122785030169256e-06, "loss": 0.3562, "step": 256 }, { "epoch": 1.800936768149883, "grad_norm": 0.049881111830472946, "learning_rate": 4.082438162211955e-06, "loss": 0.3218, "step": 257 }, { "epoch": 1.8079625292740047, "grad_norm": 0.2167116105556488, "learning_rate": 4.042153029773861e-06, "loss": 0.3218, "step": 258 }, { "epoch": 1.8149882903981265, "grad_norm": 0.051224831491708755, "learning_rate": 4.001932343324683e-06, "loss": 0.3317, "step": 259 }, { "epoch": 1.8220140515222483, "grad_norm": 0.10497119277715683, "learning_rate": 3.961778808998066e-06, "loss": 0.3676, "step": 260 }, { "epoch": 1.82903981264637, "grad_norm": 0.05344713106751442, "learning_rate": 3.921695128409517e-06, "loss": 0.3619, "step": 261 }, { "epoch": 1.8360655737704918, "grad_norm": 0.04895958676934242, "learning_rate": 3.8816839984746334e-06, "loss": 0.2942, "step": 262 }, { "epoch": 1.8430913348946136, "grad_norm": 0.05228881910443306, "learning_rate": 3.841748111227652e-06, "loss": 0.3259, "step": 263 }, { "epoch": 1.8501170960187352, "grad_norm": 0.04963938146829605, "learning_rate": 3.8018901536403198e-06, "loss": 0.3582, "step": 264 }, { "epoch": 1.8571428571428572, "grad_norm": 0.056527718901634216, "learning_rate": 3.762112807441108e-06, "loss": 0.3681, "step": 265 }, { "epoch": 1.8641686182669788, "grad_norm": 0.05749901384115219, "learning_rate": 3.7224187489347847e-06, "loss": 0.3329, "step": 266 }, { "epoch": 1.8711943793911008, "grad_norm": 0.05997895449399948, "learning_rate": 3.682810648822343e-06, "loss": 0.3636, "step": 267 }, { "epoch": 1.8782201405152223, "grad_norm": 0.05262213572859764, "learning_rate": 3.6432911720213127e-06, "loss": 0.3473, "step": 268 }, { "epoch": 1.8852459016393444, "grad_norm": 0.05787108838558197, "learning_rate": 3.6038629774864563e-06, "loss": 0.3231, "step": 269 }, { "epoch": 1.892271662763466, "grad_norm": 0.06611717492341995, "learning_rate": 3.56452871803087e-06, "loss": 0.3687, "step": 270 }, { "epoch": 1.899297423887588, "grad_norm": 0.056811776012182236, "learning_rate": 3.525291040147498e-06, "loss": 0.298, "step": 271 }, { "epoch": 1.9063231850117095, "grad_norm": 0.0592680424451828, "learning_rate": 3.486152583831072e-06, "loss": 0.3253, "step": 272 }, { "epoch": 1.9133489461358315, "grad_norm": 0.05636563152074814, "learning_rate": 3.447115982400485e-06, "loss": 0.3421, "step": 273 }, { "epoch": 1.920374707259953, "grad_norm": 0.05371502414345741, "learning_rate": 3.4081838623216124e-06, "loss": 0.3643, "step": 274 }, { "epoch": 1.927400468384075, "grad_norm": 0.05231010541319847, "learning_rate": 3.3693588430306035e-06, "loss": 0.2898, "step": 275 }, { "epoch": 1.9344262295081966, "grad_norm": 0.055993061512708664, "learning_rate": 3.330643536757638e-06, "loss": 0.3231, "step": 276 }, { "epoch": 1.9414519906323187, "grad_norm": 0.05135795846581459, "learning_rate": 3.2920405483511702e-06, "loss": 0.3241, "step": 277 }, { "epoch": 1.9484777517564402, "grad_norm": 0.05628650262951851, "learning_rate": 3.253552475102668e-06, "loss": 0.3325, "step": 278 }, { "epoch": 1.955503512880562, "grad_norm": 0.05219852551817894, "learning_rate": 3.215181906571858e-06, "loss": 0.3499, "step": 279 }, { "epoch": 1.9625292740046838, "grad_norm": 0.05198737978935242, "learning_rate": 3.1769314244125056e-06, "loss": 0.3252, "step": 280 }, { "epoch": 1.9695550351288056, "grad_norm": 0.0620955154299736, "learning_rate": 3.1388036021987047e-06, "loss": 0.38, "step": 281 }, { "epoch": 1.9765807962529274, "grad_norm": 0.05170467123389244, "learning_rate": 3.100801005251727e-06, "loss": 0.3206, "step": 282 }, { "epoch": 1.9836065573770492, "grad_norm": 0.05948978289961815, "learning_rate": 3.0629261904674206e-06, "loss": 0.3355, "step": 283 }, { "epoch": 1.990632318501171, "grad_norm": 0.055122461169958115, "learning_rate": 3.025181706144178e-06, "loss": 0.2983, "step": 284 }, { "epoch": 1.9976580796252927, "grad_norm": 0.05636753514409065, "learning_rate": 2.987570091811479e-06, "loss": 0.3636, "step": 285 }, { "epoch": 2.0, "grad_norm": 0.05636753514409065, "learning_rate": 2.9500938780590276e-06, "loss": 0.3186, "step": 286 }, { "epoch": 2.0070257611241216, "grad_norm": 0.10458986461162567, "learning_rate": 2.9127555863664857e-06, "loss": 0.254, "step": 287 }, { "epoch": 2.0140515222482436, "grad_norm": 0.059524934738874435, "learning_rate": 2.8755577289338267e-06, "loss": 0.2405, "step": 288 }, { "epoch": 2.021077283372365, "grad_norm": 0.0579247772693634, "learning_rate": 2.838502808512309e-06, "loss": 0.2159, "step": 289 }, { "epoch": 2.028103044496487, "grad_norm": 0.06685949116945267, "learning_rate": 2.801593318236078e-06, "loss": 0.2268, "step": 290 }, { "epoch": 2.0351288056206087, "grad_norm": 0.05411679297685623, "learning_rate": 2.764831741454432e-06, "loss": 0.2579, "step": 291 }, { "epoch": 2.0421545667447307, "grad_norm": 0.058176521211862564, "learning_rate": 2.7282205515647348e-06, "loss": 0.195, "step": 292 }, { "epoch": 2.0491803278688523, "grad_norm": 0.051991887390613556, "learning_rate": 2.6917622118459975e-06, "loss": 0.2002, "step": 293 }, { "epoch": 2.0562060889929743, "grad_norm": 0.06430847942829132, "learning_rate": 2.655459175293146e-06, "loss": 0.1863, "step": 294 }, { "epoch": 2.063231850117096, "grad_norm": 0.06031821295619011, "learning_rate": 2.6193138844519785e-06, "loss": 0.2521, "step": 295 }, { "epoch": 2.070257611241218, "grad_norm": 0.07283060252666473, "learning_rate": 2.58332877125482e-06, "loss": 0.2138, "step": 296 }, { "epoch": 2.0772833723653394, "grad_norm": 0.05646536499261856, "learning_rate": 2.5475062568569077e-06, "loss": 0.2463, "step": 297 }, { "epoch": 2.0843091334894615, "grad_norm": 0.0762052983045578, "learning_rate": 2.511848751473485e-06, "loss": 0.2135, "step": 298 }, { "epoch": 2.091334894613583, "grad_norm": 0.04977189749479294, "learning_rate": 2.476358654217627e-06, "loss": 0.1957, "step": 299 }, { "epoch": 2.098360655737705, "grad_norm": 0.05424179881811142, "learning_rate": 2.4410383529388448e-06, "loss": 0.2154, "step": 300 }, { "epoch": 2.1053864168618266, "grad_norm": 0.05405616760253906, "learning_rate": 2.405890224062406e-06, "loss": 0.2171, "step": 301 }, { "epoch": 2.1124121779859486, "grad_norm": 0.050341855734586716, "learning_rate": 2.370916632429455e-06, "loss": 0.1926, "step": 302 }, { "epoch": 2.11943793911007, "grad_norm": 0.05347389727830887, "learning_rate": 2.336119931137897e-06, "loss": 0.2238, "step": 303 }, { "epoch": 2.126463700234192, "grad_norm": 0.061148934066295624, "learning_rate": 2.3015024613840742e-06, "loss": 0.2218, "step": 304 }, { "epoch": 2.1334894613583137, "grad_norm": 0.056503091007471085, "learning_rate": 2.2670665523052534e-06, "loss": 0.2147, "step": 305 }, { "epoch": 2.1405152224824358, "grad_norm": 0.05360401049256325, "learning_rate": 2.2328145208229096e-06, "loss": 0.1889, "step": 306 }, { "epoch": 2.1475409836065573, "grad_norm": 0.05166396126151085, "learning_rate": 2.1987486714868384e-06, "loss": 0.1911, "step": 307 }, { "epoch": 2.1545667447306793, "grad_norm": 0.05574238300323486, "learning_rate": 2.164871296320106e-06, "loss": 0.2184, "step": 308 }, { "epoch": 2.161592505854801, "grad_norm": 0.05423097312450409, "learning_rate": 2.1311846746648325e-06, "loss": 0.1904, "step": 309 }, { "epoch": 2.168618266978923, "grad_norm": 0.04529114067554474, "learning_rate": 2.097691073028836e-06, "loss": 0.2049, "step": 310 }, { "epoch": 2.1756440281030445, "grad_norm": 0.050600163638591766, "learning_rate": 2.064392744933135e-06, "loss": 0.2136, "step": 311 }, { "epoch": 2.1826697892271665, "grad_norm": 0.04993721842765808, "learning_rate": 2.0312919307603286e-06, "loss": 0.1835, "step": 312 }, { "epoch": 2.189695550351288, "grad_norm": 0.050447553396224976, "learning_rate": 1.998390857603853e-06, "loss": 0.202, "step": 313 }, { "epoch": 2.19672131147541, "grad_norm": 0.05513535067439079, "learning_rate": 1.965691739118146e-06, "loss": 0.1964, "step": 314 }, { "epoch": 2.2037470725995316, "grad_norm": 0.05925602465867996, "learning_rate": 1.9331967753697077e-06, "loss": 0.1935, "step": 315 }, { "epoch": 2.210772833723653, "grad_norm": 0.05441617593169212, "learning_rate": 1.9009081526890622e-06, "loss": 0.1894, "step": 316 }, { "epoch": 2.217798594847775, "grad_norm": 0.06225774437189102, "learning_rate": 1.8688280435236732e-06, "loss": 0.1849, "step": 317 }, { "epoch": 2.2248243559718968, "grad_norm": 0.05718431621789932, "learning_rate": 1.8369586062917693e-06, "loss": 0.2327, "step": 318 }, { "epoch": 2.2318501170960188, "grad_norm": 0.05463183671236038, "learning_rate": 1.8053019852371195e-06, "loss": 0.2159, "step": 319 }, { "epoch": 2.2388758782201403, "grad_norm": 0.05712981894612312, "learning_rate": 1.7738603102847696e-06, "loss": 0.2057, "step": 320 }, { "epoch": 2.2459016393442623, "grad_norm": 0.05346499755978584, "learning_rate": 1.7426356968977265e-06, "loss": 0.2406, "step": 321 }, { "epoch": 2.252927400468384, "grad_norm": 0.06216704845428467, "learning_rate": 1.711630245934638e-06, "loss": 0.2307, "step": 322 }, { "epoch": 2.259953161592506, "grad_norm": 0.04996800422668457, "learning_rate": 1.6808460435084316e-06, "loss": 0.2145, "step": 323 }, { "epoch": 2.2669789227166275, "grad_norm": 0.05448152869939804, "learning_rate": 1.6502851608459668e-06, "loss": 0.2114, "step": 324 }, { "epoch": 2.2740046838407495, "grad_norm": 0.05008429288864136, "learning_rate": 1.6199496541486647e-06, "loss": 0.1881, "step": 325 }, { "epoch": 2.281030444964871, "grad_norm": 0.04923682287335396, "learning_rate": 1.589841564454176e-06, "loss": 0.2114, "step": 326 }, { "epoch": 2.288056206088993, "grad_norm": 0.04919268935918808, "learning_rate": 1.5599629174990482e-06, "loss": 0.2149, "step": 327 }, { "epoch": 2.2950819672131146, "grad_norm": 0.05411992594599724, "learning_rate": 1.5303157235824323e-06, "loss": 0.2034, "step": 328 }, { "epoch": 2.3021077283372366, "grad_norm": 0.047855135053396225, "learning_rate": 1.5009019774308249e-06, "loss": 0.1977, "step": 329 }, { "epoch": 2.309133489461358, "grad_norm": 0.05060090869665146, "learning_rate": 1.471723658063856e-06, "loss": 0.226, "step": 330 }, { "epoch": 2.3161592505854802, "grad_norm": 0.05514250695705414, "learning_rate": 1.4427827286611412e-06, "loss": 0.2213, "step": 331 }, { "epoch": 2.323185011709602, "grad_norm": 0.05435176566243172, "learning_rate": 1.4140811364301931e-06, "loss": 0.1782, "step": 332 }, { "epoch": 2.330210772833724, "grad_norm": 0.05435670167207718, "learning_rate": 1.385620812475409e-06, "loss": 0.2196, "step": 333 }, { "epoch": 2.3372365339578454, "grad_norm": 0.060686711221933365, "learning_rate": 1.3574036716681366e-06, "loss": 0.2386, "step": 334 }, { "epoch": 2.3442622950819674, "grad_norm": 0.045962344855070114, "learning_rate": 1.3294316125178474e-06, "loss": 0.2073, "step": 335 }, { "epoch": 2.351288056206089, "grad_norm": 0.05140933021903038, "learning_rate": 1.301706517044395e-06, "loss": 0.2094, "step": 336 }, { "epoch": 2.358313817330211, "grad_norm": 0.052996646612882614, "learning_rate": 1.2742302506513894e-06, "loss": 0.2183, "step": 337 }, { "epoch": 2.3653395784543325, "grad_norm": 0.060029152780771255, "learning_rate": 1.247004662000686e-06, "loss": 0.2298, "step": 338 }, { "epoch": 2.3723653395784545, "grad_norm": 0.0602019801735878, "learning_rate": 1.2200315828880094e-06, "loss": 0.2162, "step": 339 }, { "epoch": 2.379391100702576, "grad_norm": 0.0632694885134697, "learning_rate": 1.1933128281197042e-06, "loss": 0.2032, "step": 340 }, { "epoch": 2.3864168618266977, "grad_norm": 0.05728806555271149, "learning_rate": 1.166850195390628e-06, "loss": 0.238, "step": 341 }, { "epoch": 2.3934426229508197, "grad_norm": 0.05771248787641525, "learning_rate": 1.1406454651632042e-06, "loss": 0.2385, "step": 342 }, { "epoch": 2.4004683840749417, "grad_norm": 0.09708061069250107, "learning_rate": 1.1147004005476192e-06, "loss": 0.2367, "step": 343 }, { "epoch": 2.4074941451990632, "grad_norm": 0.05926092714071274, "learning_rate": 1.089016747183208e-06, "loss": 0.1787, "step": 344 }, { "epoch": 2.414519906323185, "grad_norm": 0.05011197552084923, "learning_rate": 1.063596233120997e-06, "loss": 0.2119, "step": 345 }, { "epoch": 2.421545667447307, "grad_norm": 0.05037648603320122, "learning_rate": 1.03844056870744e-06, "loss": 0.2177, "step": 346 }, { "epoch": 2.4285714285714284, "grad_norm": 0.05578481778502464, "learning_rate": 1.013551446469337e-06, "loss": 0.1988, "step": 347 }, { "epoch": 2.4355971896955504, "grad_norm": 0.05308612063527107, "learning_rate": 9.889305409999656e-07, "loss": 0.2006, "step": 348 }, { "epoch": 2.442622950819672, "grad_norm": 0.05964551866054535, "learning_rate": 9.64579508846405e-07, "loss": 0.2598, "step": 349 }, { "epoch": 2.449648711943794, "grad_norm": 0.057544246315956116, "learning_rate": 9.40499988398082e-07, "loss": 0.2095, "step": 350 }, { "epoch": 2.4566744730679155, "grad_norm": 0.05650878697633743, "learning_rate": 9.166935997765364e-07, "loss": 0.2347, "step": 351 }, { "epoch": 2.4637002341920375, "grad_norm": 0.07671873271465302, "learning_rate": 8.93161944726414e-07, "loss": 0.2145, "step": 352 }, { "epoch": 2.470725995316159, "grad_norm": 0.0586552731692791, "learning_rate": 8.699066065077005e-07, "loss": 0.2078, "step": 353 }, { "epoch": 2.477751756440281, "grad_norm": 0.045289915055036545, "learning_rate": 8.469291497891979e-07, "loss": 0.2129, "step": 354 }, { "epoch": 2.4847775175644027, "grad_norm": 0.05548376590013504, "learning_rate": 8.242311205432418e-07, "loss": 0.1944, "step": 355 }, { "epoch": 2.4918032786885247, "grad_norm": 0.04866700619459152, "learning_rate": 8.018140459416962e-07, "loss": 0.2228, "step": 356 }, { "epoch": 2.4988290398126463, "grad_norm": 0.05934524163603783, "learning_rate": 7.796794342531949e-07, "loss": 0.2155, "step": 357 }, { "epoch": 2.5058548009367683, "grad_norm": 0.056303609162569046, "learning_rate": 7.57828774741664e-07, "loss": 0.2048, "step": 358 }, { "epoch": 2.51288056206089, "grad_norm": 0.05041569098830223, "learning_rate": 7.362635375661225e-07, "loss": 0.224, "step": 359 }, { "epoch": 2.519906323185012, "grad_norm": 0.05846166983246803, "learning_rate": 7.149851736817609e-07, "loss": 0.2502, "step": 360 }, { "epoch": 2.5269320843091334, "grad_norm": 0.0575077049434185, "learning_rate": 6.939951147423269e-07, "loss": 0.1915, "step": 361 }, { "epoch": 2.5339578454332554, "grad_norm": 0.05687614157795906, "learning_rate": 6.732947730037936e-07, "loss": 0.2019, "step": 362 }, { "epoch": 2.540983606557377, "grad_norm": 0.05574915185570717, "learning_rate": 6.52885541229345e-07, "loss": 0.2085, "step": 363 }, { "epoch": 2.548009367681499, "grad_norm": 0.05865642428398132, "learning_rate": 6.327687925956616e-07, "loss": 0.1975, "step": 364 }, { "epoch": 2.5550351288056206, "grad_norm": 0.04809865728020668, "learning_rate": 6.12945880600535e-07, "loss": 0.1823, "step": 365 }, { "epoch": 2.562060889929742, "grad_norm": 0.05387086421251297, "learning_rate": 5.93418138971803e-07, "loss": 0.2394, "step": 366 }, { "epoch": 2.569086651053864, "grad_norm": 0.05231115594506264, "learning_rate": 5.741868815776081e-07, "loss": 0.2235, "step": 367 }, { "epoch": 2.576112412177986, "grad_norm": 0.07717155665159225, "learning_rate": 5.552534023380024e-07, "loss": 0.2279, "step": 368 }, { "epoch": 2.5831381733021077, "grad_norm": 0.05653085187077522, "learning_rate": 5.366189751378858e-07, "loss": 0.1869, "step": 369 }, { "epoch": 2.5901639344262293, "grad_norm": 0.05067911744117737, "learning_rate": 5.18284853741301e-07, "loss": 0.2376, "step": 370 }, { "epoch": 2.5971896955503513, "grad_norm": 0.05797469988465309, "learning_rate": 5.002522717070751e-07, "loss": 0.1973, "step": 371 }, { "epoch": 2.6042154566744733, "grad_norm": 0.05374491214752197, "learning_rate": 4.8252244230582e-07, "loss": 0.2014, "step": 372 }, { "epoch": 2.611241217798595, "grad_norm": 0.05649881809949875, "learning_rate": 4.6509655843830827e-07, "loss": 0.2027, "step": 373 }, { "epoch": 2.6182669789227164, "grad_norm": 0.054554861038923264, "learning_rate": 4.4797579255520585e-07, "loss": 0.2092, "step": 374 }, { "epoch": 2.6252927400468384, "grad_norm": 0.054530441761016846, "learning_rate": 4.311612965781903e-07, "loss": 0.2196, "step": 375 }, { "epoch": 2.6323185011709604, "grad_norm": 0.0523928701877594, "learning_rate": 4.1465420182244476e-07, "loss": 0.2104, "step": 376 }, { "epoch": 2.639344262295082, "grad_norm": 0.057155121117830276, "learning_rate": 3.984556189205441e-07, "loss": 0.1966, "step": 377 }, { "epoch": 2.6463700234192036, "grad_norm": 0.05527928099036217, "learning_rate": 3.8256663774772383e-07, "loss": 0.235, "step": 378 }, { "epoch": 2.6533957845433256, "grad_norm": 0.05507628992199898, "learning_rate": 3.669883273485575e-07, "loss": 0.2284, "step": 379 }, { "epoch": 2.660421545667447, "grad_norm": 0.05377492681145668, "learning_rate": 3.5172173586502543e-07, "loss": 0.1792, "step": 380 }, { "epoch": 2.667447306791569, "grad_norm": 0.05631445720791817, "learning_rate": 3.3676789046599045e-07, "loss": 0.1963, "step": 381 }, { "epoch": 2.6744730679156907, "grad_norm": 0.05693857744336128, "learning_rate": 3.2212779727809504e-07, "loss": 0.2265, "step": 382 }, { "epoch": 2.6814988290398127, "grad_norm": 0.052210524678230286, "learning_rate": 3.0780244131806193e-07, "loss": 0.2292, "step": 383 }, { "epoch": 2.6885245901639343, "grad_norm": 0.06421205401420593, "learning_rate": 2.937927864264206e-07, "loss": 0.2247, "step": 384 }, { "epoch": 2.6955503512880563, "grad_norm": 0.054567910730838776, "learning_rate": 2.800997752026596e-07, "loss": 0.2115, "step": 385 }, { "epoch": 2.702576112412178, "grad_norm": 0.05900174379348755, "learning_rate": 2.667243289418059e-07, "loss": 0.2014, "step": 386 }, { "epoch": 2.7096018735363, "grad_norm": 0.05543851852416992, "learning_rate": 2.5366734757243496e-07, "loss": 0.1795, "step": 387 }, { "epoch": 2.7166276346604215, "grad_norm": 0.05802013352513313, "learning_rate": 2.4092970959612885e-07, "loss": 0.195, "step": 388 }, { "epoch": 2.7236533957845435, "grad_norm": 0.04725894704461098, "learning_rate": 2.2851227202836002e-07, "loss": 0.2301, "step": 389 }, { "epoch": 2.730679156908665, "grad_norm": 0.05542716756463051, "learning_rate": 2.1641587034083756e-07, "loss": 0.2115, "step": 390 }, { "epoch": 2.737704918032787, "grad_norm": 0.048591192811727524, "learning_rate": 2.0464131840528978e-07, "loss": 0.1922, "step": 391 }, { "epoch": 2.7447306791569086, "grad_norm": 0.04870427027344704, "learning_rate": 1.9318940843870594e-07, "loss": 0.1852, "step": 392 }, { "epoch": 2.7517564402810306, "grad_norm": 0.05185515433549881, "learning_rate": 1.8206091095003543e-07, "loss": 0.2278, "step": 393 }, { "epoch": 2.758782201405152, "grad_norm": 0.056988395750522614, "learning_rate": 1.7125657468834656e-07, "loss": 0.1981, "step": 394 }, { "epoch": 2.7658079625292737, "grad_norm": 0.050774361938238144, "learning_rate": 1.6077712659244792e-07, "loss": 0.2147, "step": 395 }, { "epoch": 2.7728337236533958, "grad_norm": 0.053753916174173355, "learning_rate": 1.5062327174197645e-07, "loss": 0.1922, "step": 396 }, { "epoch": 2.7798594847775178, "grad_norm": 0.06190885975956917, "learning_rate": 1.4079569330996412e-07, "loss": 0.2409, "step": 397 }, { "epoch": 2.7868852459016393, "grad_norm": 0.05707972124218941, "learning_rate": 1.3129505251686603e-07, "loss": 0.1889, "step": 398 }, { "epoch": 2.793911007025761, "grad_norm": 0.05355476588010788, "learning_rate": 1.2212198858607694e-07, "loss": 0.2026, "step": 399 }, { "epoch": 2.800936768149883, "grad_norm": 0.055988870561122894, "learning_rate": 1.1327711870091963e-07, "loss": 0.2114, "step": 400 }, { "epoch": 2.807962529274005, "grad_norm": 0.056202538311481476, "learning_rate": 1.0476103796312254e-07, "loss": 0.2217, "step": 401 }, { "epoch": 2.8149882903981265, "grad_norm": 0.06274823844432831, "learning_rate": 9.657431935277629e-08, "loss": 0.2113, "step": 402 }, { "epoch": 2.822014051522248, "grad_norm": 0.049836281687021255, "learning_rate": 8.871751368978554e-08, "loss": 0.2032, "step": 403 }, { "epoch": 2.82903981264637, "grad_norm": 0.05938475951552391, "learning_rate": 8.119114959680929e-08, "loss": 0.19, "step": 404 }, { "epoch": 2.836065573770492, "grad_norm": 0.06170409545302391, "learning_rate": 7.399573346368871e-08, "loss": 0.2377, "step": 405 }, { "epoch": 2.8430913348946136, "grad_norm": 0.05599289387464523, "learning_rate": 6.713174941338163e-08, "loss": 0.2307, "step": 406 }, { "epoch": 2.850117096018735, "grad_norm": 0.05716150254011154, "learning_rate": 6.05996592693886e-08, "loss": 0.1509, "step": 407 }, { "epoch": 2.857142857142857, "grad_norm": 0.04617927595973015, "learning_rate": 5.439990252467886e-08, "loss": 0.2429, "step": 408 }, { "epoch": 2.8641686182669788, "grad_norm": 0.05094977468252182, "learning_rate": 4.853289631212066e-08, "loss": 0.2302, "step": 409 }, { "epoch": 2.871194379391101, "grad_norm": 0.05809462442994118, "learning_rate": 4.299903537641703e-08, "loss": 0.1985, "step": 410 }, { "epoch": 2.8782201405152223, "grad_norm": 0.0512741394340992, "learning_rate": 3.779869204754427e-08, "loss": 0.2277, "step": 411 }, { "epoch": 2.8852459016393444, "grad_norm": 0.06057005748152733, "learning_rate": 3.2932216215704195e-08, "loss": 0.1957, "step": 412 }, { "epoch": 2.892271662763466, "grad_norm": 0.05182475969195366, "learning_rate": 2.8399935307778516e-08, "loss": 0.1988, "step": 413 }, { "epoch": 2.899297423887588, "grad_norm": 0.05228479206562042, "learning_rate": 2.420215426530259e-08, "loss": 0.1873, "step": 414 }, { "epoch": 2.9063231850117095, "grad_norm": 0.05834292992949486, "learning_rate": 2.0339155523945164e-08, "loss": 0.1845, "step": 415 }, { "epoch": 2.9133489461358315, "grad_norm": 0.053505126386880875, "learning_rate": 1.681119899450856e-08, "loss": 0.208, "step": 416 }, { "epoch": 2.920374707259953, "grad_norm": 0.05727040395140648, "learning_rate": 1.3618522045439897e-08, "loss": 0.2026, "step": 417 }, { "epoch": 2.927400468384075, "grad_norm": 0.15337811410427094, "learning_rate": 1.0761339486859424e-08, "loss": 0.2024, "step": 418 }, { "epoch": 2.9344262295081966, "grad_norm": 0.05246254801750183, "learning_rate": 8.239843556108739e-09, "loss": 0.1698, "step": 419 }, { "epoch": 2.9414519906323187, "grad_norm": 0.06088126823306084, "learning_rate": 6.054203904817812e-09, "loss": 0.2337, "step": 420 }, { "epoch": 2.9484777517564402, "grad_norm": 0.05842256173491478, "learning_rate": 4.204567587486885e-09, "loss": 0.2318, "step": 421 }, { "epoch": 2.9555035128805622, "grad_norm": 0.05337844043970108, "learning_rate": 2.6910590515966117e-09, "loss": 0.1856, "step": 422 }, { "epoch": 2.962529274004684, "grad_norm": 0.05508222058415413, "learning_rate": 1.5137801292325338e-09, "loss": 0.227, "step": 423 }, { "epoch": 2.9695550351288054, "grad_norm": 0.05756290256977081, "learning_rate": 6.728100302327844e-10, "loss": 0.1856, "step": 424 }, { "epoch": 2.9765807962529274, "grad_norm": 0.05223892256617546, "learning_rate": 1.6820533686179308e-10, "loss": 0.2299, "step": 425 }, { "epoch": 2.9836065573770494, "grad_norm": 0.0553586483001709, "learning_rate": 0.0, "loss": 0.1971, "step": 426 }, { "epoch": 2.9836065573770494, "step": 426, "total_flos": 1.0638855097443942e+17, "train_loss": 0.35510572714145194, "train_runtime": 58523.3937, "train_samples_per_second": 0.175, "train_steps_per_second": 0.007 } ], "logging_steps": 1, "max_steps": 426, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0638855097443942e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }