{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.972522897585345,
  "eval_steps": 500,
  "global_step": 750,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013322231473771857,
      "grad_norm": 5.780612066318371,
      "learning_rate": 1.3333333333333336e-07,
      "loss": 0.9209,
      "step": 2
    },
    {
      "epoch": 0.026644462947543714,
      "grad_norm": 5.803797516781435,
      "learning_rate": 2.666666666666667e-07,
      "loss": 0.9065,
      "step": 4
    },
    {
      "epoch": 0.03996669442131557,
      "grad_norm": 5.67259689729213,
      "learning_rate": 4.0000000000000003e-07,
      "loss": 0.9182,
      "step": 6
    },
    {
      "epoch": 0.05328892589508743,
      "grad_norm": 5.354376269147714,
      "learning_rate": 5.333333333333335e-07,
      "loss": 0.9233,
      "step": 8
    },
    {
      "epoch": 0.06661115736885928,
      "grad_norm": 4.851036441351623,
      "learning_rate": 6.666666666666667e-07,
      "loss": 0.8906,
      "step": 10
    },
    {
      "epoch": 0.07993338884263114,
      "grad_norm": 4.566190641845786,
      "learning_rate": 8.000000000000001e-07,
      "loss": 0.8719,
      "step": 12
    },
    {
      "epoch": 0.093255620316403,
      "grad_norm": 4.013324466824848,
      "learning_rate": 9.333333333333334e-07,
      "loss": 0.8861,
      "step": 14
    },
    {
      "epoch": 0.10657785179017486,
      "grad_norm": 3.540718772081854,
      "learning_rate": 1.066666666666667e-06,
      "loss": 0.8791,
      "step": 16
    },
    {
      "epoch": 0.11990008326394672,
      "grad_norm": 2.88258712818169,
      "learning_rate": 1.2000000000000002e-06,
      "loss": 0.8532,
      "step": 18
    },
    {
      "epoch": 0.13322231473771856,
      "grad_norm": 2.6937271254013613,
      "learning_rate": 1.3333333333333334e-06,
      "loss": 0.8455,
      "step": 20
    },
    {
      "epoch": 0.14654454621149043,
      "grad_norm": 2.3220586555688816,
      "learning_rate": 1.4666666666666669e-06,
      "loss": 0.8159,
      "step": 22
    },
    {
      "epoch": 0.15986677768526228,
      "grad_norm": 3.400225929248788,
      "learning_rate": 1.6000000000000001e-06,
      "loss": 0.8004,
      "step": 24
    },
    {
      "epoch": 0.17318900915903415,
      "grad_norm": 3.5504999091076845,
      "learning_rate": 1.7333333333333336e-06,
      "loss": 0.8084,
      "step": 26
    },
    {
      "epoch": 0.186511240632806,
      "grad_norm": 3.2668071978806057,
      "learning_rate": 1.8666666666666669e-06,
      "loss": 0.8086,
      "step": 28
    },
    {
      "epoch": 0.19983347210657784,
      "grad_norm": 2.4897231787044154,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.7821,
      "step": 30
    },
    {
      "epoch": 0.21315570358034971,
      "grad_norm": 2.006655051652645,
      "learning_rate": 2.133333333333334e-06,
      "loss": 0.7767,
      "step": 32
    },
    {
      "epoch": 0.22647793505412156,
      "grad_norm": 1.8054118845570075,
      "learning_rate": 2.266666666666667e-06,
      "loss": 0.764,
      "step": 34
    },
    {
      "epoch": 0.23980016652789343,
      "grad_norm": 2.060437415012226,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 0.7577,
      "step": 36
    },
    {
      "epoch": 0.2531223980016653,
      "grad_norm": 1.822445369621608,
      "learning_rate": 2.5333333333333338e-06,
      "loss": 0.7441,
      "step": 38
    },
    {
      "epoch": 0.2664446294754371,
      "grad_norm": 1.702085750109042,
      "learning_rate": 2.666666666666667e-06,
      "loss": 0.7364,
      "step": 40
    },
    {
      "epoch": 0.279766860949209,
      "grad_norm": 1.4905140822613037,
      "learning_rate": 2.8000000000000003e-06,
      "loss": 0.7252,
      "step": 42
    },
    {
      "epoch": 0.29308909242298087,
      "grad_norm": 1.602012507677594,
      "learning_rate": 2.9333333333333338e-06,
      "loss": 0.727,
      "step": 44
    },
    {
      "epoch": 0.3064113238967527,
      "grad_norm": 1.5462649172094083,
      "learning_rate": 3.066666666666667e-06,
      "loss": 0.7221,
      "step": 46
    },
    {
      "epoch": 0.31973355537052456,
      "grad_norm": 1.4593506403426082,
      "learning_rate": 3.2000000000000003e-06,
      "loss": 0.6922,
      "step": 48
    },
    {
      "epoch": 0.33305578684429643,
      "grad_norm": 1.42990322544597,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.6873,
      "step": 50
    },
    {
      "epoch": 0.3463780183180683,
      "grad_norm": 1.4803196580352989,
      "learning_rate": 3.4666666666666672e-06,
      "loss": 0.6975,
      "step": 52
    },
    {
      "epoch": 0.3597002497918401,
      "grad_norm": 1.5152423594984497,
      "learning_rate": 3.6000000000000003e-06,
      "loss": 0.662,
      "step": 54
    },
    {
      "epoch": 0.373022481265612,
      "grad_norm": 1.4183767379176757,
      "learning_rate": 3.7333333333333337e-06,
      "loss": 0.6578,
      "step": 56
    },
    {
      "epoch": 0.38634471273938387,
      "grad_norm": 1.5361891229630817,
      "learning_rate": 3.866666666666667e-06,
      "loss": 0.6655,
      "step": 58
    },
    {
      "epoch": 0.3996669442131557,
      "grad_norm": 1.430106096852744,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.648,
      "step": 60
    },
    {
      "epoch": 0.41298917568692756,
      "grad_norm": 1.4201912298932542,
      "learning_rate": 4.133333333333333e-06,
      "loss": 0.6449,
      "step": 62
    },
    {
      "epoch": 0.42631140716069943,
      "grad_norm": 1.45821665687646,
      "learning_rate": 4.266666666666668e-06,
      "loss": 0.636,
      "step": 64
    },
    {
      "epoch": 0.43963363863447125,
      "grad_norm": 1.351061303304786,
      "learning_rate": 4.4e-06,
      "loss": 0.6287,
      "step": 66
    },
    {
      "epoch": 0.4529558701082431,
      "grad_norm": 1.4213406831821087,
      "learning_rate": 4.533333333333334e-06,
      "loss": 0.6162,
      "step": 68
    },
    {
      "epoch": 0.466278101582015,
      "grad_norm": 1.5920606382997864,
      "learning_rate": 4.666666666666667e-06,
      "loss": 0.6269,
      "step": 70
    },
    {
      "epoch": 0.47960033305578686,
      "grad_norm": 1.4259278183625448,
      "learning_rate": 4.800000000000001e-06,
      "loss": 0.628,
      "step": 72
    },
    {
      "epoch": 0.4929225645295587,
      "grad_norm": 1.4277383814389801,
      "learning_rate": 4.933333333333334e-06,
      "loss": 0.6047,
      "step": 74
    },
    {
      "epoch": 0.5062447960033306,
      "grad_norm": 1.4056841775145905,
      "learning_rate": 4.999972922944898e-06,
      "loss": 0.5984,
      "step": 76
    },
    {
      "epoch": 0.5195670274771024,
      "grad_norm": 1.3201694715565466,
      "learning_rate": 4.999756310023261e-06,
      "loss": 0.5954,
      "step": 78
    },
    {
      "epoch": 0.5328892589508742,
      "grad_norm": 1.3231909628376382,
      "learning_rate": 4.999323102948655e-06,
      "loss": 0.5954,
      "step": 80
    },
    {
      "epoch": 0.5462114904246461,
      "grad_norm": 1.3735284931415068,
      "learning_rate": 4.998673339256785e-06,
      "loss": 0.5744,
      "step": 82
    },
    {
      "epoch": 0.559533721898418,
      "grad_norm": 1.4504463513541146,
      "learning_rate": 4.997807075247147e-06,
      "loss": 0.593,
      "step": 84
    },
    {
      "epoch": 0.5728559533721899,
      "grad_norm": 1.2740174759395542,
      "learning_rate": 4.996724385978142e-06,
      "loss": 0.5903,
      "step": 86
    },
    {
      "epoch": 0.5861781848459617,
      "grad_norm": 1.3594170816449038,
      "learning_rate": 4.995425365260585e-06,
      "loss": 0.5748,
      "step": 88
    },
    {
      "epoch": 0.5995004163197336,
      "grad_norm": 1.3782163690261147,
      "learning_rate": 4.993910125649561e-06,
      "loss": 0.5814,
      "step": 90
    },
    {
      "epoch": 0.6128226477935054,
      "grad_norm": 1.2584130530987572,
      "learning_rate": 4.992178798434684e-06,
      "loss": 0.5752,
      "step": 92
    },
    {
      "epoch": 0.6261448792672772,
      "grad_norm": 1.4216349386698004,
      "learning_rate": 4.990231533628719e-06,
      "loss": 0.5757,
      "step": 94
    },
    {
      "epoch": 0.6394671107410491,
      "grad_norm": 1.3768977749050733,
      "learning_rate": 4.988068499954578e-06,
      "loss": 0.5555,
      "step": 96
    },
    {
      "epoch": 0.652789342214821,
      "grad_norm": 1.463407945745149,
      "learning_rate": 4.985689884830711e-06,
      "loss": 0.5591,
      "step": 98
    },
    {
      "epoch": 0.6661115736885929,
      "grad_norm": 1.3808427236512926,
      "learning_rate": 4.983095894354858e-06,
      "loss": 0.5588,
      "step": 100
    },
    {
      "epoch": 0.6794338051623647,
      "grad_norm": 1.482231013162315,
      "learning_rate": 4.980286753286196e-06,
      "loss": 0.5418,
      "step": 102
    },
    {
      "epoch": 0.6927560366361366,
      "grad_norm": 1.3778109634949367,
      "learning_rate": 4.97726270502586e-06,
      "loss": 0.5399,
      "step": 104
    },
    {
      "epoch": 0.7060782681099084,
      "grad_norm": 1.4002755485164502,
      "learning_rate": 4.974024011595864e-06,
      "loss": 0.5533,
      "step": 106
    },
    {
      "epoch": 0.7194004995836802,
      "grad_norm": 1.3296620938997752,
      "learning_rate": 4.970570953616383e-06,
      "loss": 0.5438,
      "step": 108
    },
    {
      "epoch": 0.7327227310574521,
      "grad_norm": 1.4458203791375825,
      "learning_rate": 4.966903830281449e-06,
      "loss": 0.5378,
      "step": 110
    },
    {
      "epoch": 0.746044962531224,
      "grad_norm": 1.5136526829998074,
      "learning_rate": 4.9630229593330226e-06,
      "loss": 0.5348,
      "step": 112
    },
    {
      "epoch": 0.7593671940049959,
      "grad_norm": 1.4362377777815807,
      "learning_rate": 4.958928677033465e-06,
      "loss": 0.5267,
      "step": 114
    },
    {
      "epoch": 0.7726894254787677,
      "grad_norm": 1.2730640176398647,
      "learning_rate": 4.954621338136399e-06,
      "loss": 0.5393,
      "step": 116
    },
    {
      "epoch": 0.7860116569525396,
      "grad_norm": 1.3685353603260022,
      "learning_rate": 4.95010131585597e-06,
      "loss": 0.534,
      "step": 118
    },
    {
      "epoch": 0.7993338884263114,
      "grad_norm": 1.2683696145515575,
      "learning_rate": 4.9453690018345144e-06,
      "loss": 0.527,
      "step": 120
    },
    {
      "epoch": 0.8126561199000832,
      "grad_norm": 1.323958192575613,
      "learning_rate": 4.940424806108619e-06,
      "loss": 0.5267,
      "step": 122
    },
    {
      "epoch": 0.8259783513738551,
      "grad_norm": 1.2428318596261736,
      "learning_rate": 4.935269157073597e-06,
      "loss": 0.5149,
      "step": 124
    },
    {
      "epoch": 0.839300582847627,
      "grad_norm": 1.27851729445364,
      "learning_rate": 4.9299025014463665e-06,
      "loss": 0.5228,
      "step": 126
    },
    {
      "epoch": 0.8526228143213989,
      "grad_norm": 1.2913119874277892,
      "learning_rate": 4.924325304226745e-06,
      "loss": 0.5028,
      "step": 128
    },
    {
      "epoch": 0.8659450457951707,
      "grad_norm": 1.3471089811240304,
      "learning_rate": 4.91853804865716e-06,
      "loss": 0.5402,
      "step": 130
    },
    {
      "epoch": 0.8792672772689425,
      "grad_norm": 1.3919989303105873,
      "learning_rate": 4.912541236180779e-06,
      "loss": 0.5208,
      "step": 132
    },
    {
      "epoch": 0.8925895087427144,
      "grad_norm": 1.336135856095439,
      "learning_rate": 4.9063353863980565e-06,
      "loss": 0.5232,
      "step": 134
    },
    {
      "epoch": 0.9059117402164862,
      "grad_norm": 1.535058182009125,
      "learning_rate": 4.899921037021719e-06,
      "loss": 0.5183,
      "step": 136
    },
    {
      "epoch": 0.9192339716902581,
      "grad_norm": 1.4366704774523757,
      "learning_rate": 4.893298743830168e-06,
      "loss": 0.5152,
      "step": 138
    },
    {
      "epoch": 0.93255620316403,
      "grad_norm": 1.4306647802429082,
      "learning_rate": 4.88646908061933e-06,
      "loss": 0.5241,
      "step": 140
    },
    {
      "epoch": 0.9458784346378019,
      "grad_norm": 1.3151003083587773,
      "learning_rate": 4.879432639152935e-06,
      "loss": 0.518,
      "step": 142
    },
    {
      "epoch": 0.9592006661115737,
      "grad_norm": 1.3682779135005043,
      "learning_rate": 4.8721900291112415e-06,
      "loss": 0.51,
      "step": 144
    },
    {
      "epoch": 0.9725228975853455,
      "grad_norm": 1.3896990341168534,
      "learning_rate": 4.864741878038218e-06,
      "loss": 0.5207,
      "step": 146
    },
    {
      "epoch": 0.9858451290591174,
      "grad_norm": 1.2929489978661655,
      "learning_rate": 4.857088831287158e-06,
      "loss": 0.5121,
      "step": 148
    },
    {
      "epoch": 0.9991673605328892,
      "grad_norm": 1.3614193317791738,
      "learning_rate": 4.849231551964771e-06,
      "loss": 0.5016,
      "step": 150
    },
    {
      "epoch": 1.0066611157368859,
      "grad_norm": 1.3229981405906006,
      "learning_rate": 4.841170720873723e-06,
      "loss": 0.2569,
      "step": 152
    },
    {
      "epoch": 1.0199833472106579,
      "grad_norm": 1.2274098346213043,
      "learning_rate": 4.832907036453647e-06,
      "loss": 0.4662,
      "step": 154
    },
    {
      "epoch": 1.0333055786844296,
      "grad_norm": 1.3810724651132364,
      "learning_rate": 4.824441214720629e-06,
      "loss": 0.4503,
      "step": 156
    },
    {
      "epoch": 1.0466278101582014,
      "grad_norm": 1.5094355408493076,
      "learning_rate": 4.815773989205165e-06,
      "loss": 0.4525,
      "step": 158
    },
    {
      "epoch": 1.0599500416319734,
      "grad_norm": 1.191750486186588,
      "learning_rate": 4.806906110888606e-06,
      "loss": 0.4548,
      "step": 160
    },
    {
      "epoch": 1.0732722731057451,
      "grad_norm": 1.2840884507072778,
      "learning_rate": 4.7978383481380865e-06,
      "loss": 0.4552,
      "step": 162
    },
    {
      "epoch": 1.0865945045795171,
      "grad_norm": 1.3818002604555029,
      "learning_rate": 4.788571486639948e-06,
      "loss": 0.452,
      "step": 164
    },
    {
      "epoch": 1.0999167360532889,
      "grad_norm": 1.3200111006279347,
      "learning_rate": 4.779106329331665e-06,
      "loss": 0.45,
      "step": 166
    },
    {
      "epoch": 1.1132389675270609,
      "grad_norm": 1.2755161939993753,
      "learning_rate": 4.769443696332272e-06,
      "loss": 0.4454,
      "step": 168
    },
    {
      "epoch": 1.1265611990008326,
      "grad_norm": 1.3421067926153882,
      "learning_rate": 4.759584424871302e-06,
      "loss": 0.4429,
      "step": 170
    },
    {
      "epoch": 1.1398834304746046,
      "grad_norm": 1.2219457405458125,
      "learning_rate": 4.749529369216246e-06,
      "loss": 0.4481,
      "step": 172
    },
    {
      "epoch": 1.1532056619483764,
      "grad_norm": 1.330574738869651,
      "learning_rate": 4.7392794005985324e-06,
      "loss": 0.4459,
      "step": 174
    },
    {
      "epoch": 1.1665278934221481,
      "grad_norm": 1.2042952174150132,
      "learning_rate": 4.7288354071380415e-06,
      "loss": 0.4339,
      "step": 176
    },
    {
      "epoch": 1.1798501248959201,
      "grad_norm": 1.2535265876319093,
      "learning_rate": 4.7181982937661485e-06,
      "loss": 0.4364,
      "step": 178
    },
    {
      "epoch": 1.1931723563696919,
      "grad_norm": 1.1967067502698956,
      "learning_rate": 4.707368982147318e-06,
      "loss": 0.4484,
      "step": 180
    },
    {
      "epoch": 1.2064945878434639,
      "grad_norm": 1.3022379327320546,
      "learning_rate": 4.696348410599244e-06,
      "loss": 0.4468,
      "step": 182
    },
    {
      "epoch": 1.2198168193172356,
      "grad_norm": 1.3137228151962215,
      "learning_rate": 4.685137534011549e-06,
      "loss": 0.4492,
      "step": 184
    },
    {
      "epoch": 1.2331390507910074,
      "grad_norm": 1.3650226627212705,
      "learning_rate": 4.673737323763048e-06,
      "loss": 0.4389,
      "step": 186
    },
    {
      "epoch": 1.2464612822647794,
      "grad_norm": 1.3122923570081069,
      "learning_rate": 4.662148767637578e-06,
      "loss": 0.4426,
      "step": 188
    },
    {
      "epoch": 1.2597835137385511,
      "grad_norm": 1.3191199275346543,
      "learning_rate": 4.650372869738415e-06,
      "loss": 0.434,
      "step": 190
    },
    {
      "epoch": 1.2731057452123231,
      "grad_norm": 1.4425884017899313,
      "learning_rate": 4.638410650401267e-06,
      "loss": 0.4382,
      "step": 192
    },
    {
      "epoch": 1.2864279766860949,
      "grad_norm": 1.4066578837011166,
      "learning_rate": 4.626263146105875e-06,
      "loss": 0.4473,
      "step": 194
    },
    {
      "epoch": 1.2997502081598669,
      "grad_norm": 1.4430824831613096,
      "learning_rate": 4.613931409386196e-06,
      "loss": 0.4488,
      "step": 196
    },
    {
      "epoch": 1.3130724396336386,
      "grad_norm": 1.2217740909502797,
      "learning_rate": 4.601416508739211e-06,
      "loss": 0.4395,
      "step": 198
    },
    {
      "epoch": 1.3263946711074106,
      "grad_norm": 1.474039226711776,
      "learning_rate": 4.588719528532342e-06,
      "loss": 0.4381,
      "step": 200
    },
    {
      "epoch": 1.3397169025811824,
      "grad_norm": 1.2503538717797444,
      "learning_rate": 4.575841568909494e-06,
      "loss": 0.4317,
      "step": 202
    },
    {
      "epoch": 1.3530391340549541,
      "grad_norm": 1.3172152085291207,
      "learning_rate": 4.562783745695738e-06,
      "loss": 0.4284,
      "step": 204
    },
    {
      "epoch": 1.3663613655287261,
      "grad_norm": 1.2950216606489513,
      "learning_rate": 4.549547190300622e-06,
      "loss": 0.4372,
      "step": 206
    },
    {
      "epoch": 1.3796835970024979,
      "grad_norm": 1.2065789326345406,
      "learning_rate": 4.536133049620143e-06,
      "loss": 0.4376,
      "step": 208
    },
    {
      "epoch": 1.3930058284762699,
      "grad_norm": 1.450309483143858,
      "learning_rate": 4.522542485937369e-06,
      "loss": 0.4368,
      "step": 210
    },
    {
      "epoch": 1.4063280599500416,
      "grad_norm": 1.2856432394840618,
      "learning_rate": 4.508776676821739e-06,
      "loss": 0.4359,
      "step": 212
    },
    {
      "epoch": 1.4196502914238134,
      "grad_norm": 1.303392410991855,
      "learning_rate": 4.494836815027022e-06,
      "loss": 0.437,
      "step": 214
    },
    {
      "epoch": 1.4329725228975854,
      "grad_norm": 1.2374383776516957,
      "learning_rate": 4.4807241083879774e-06,
      "loss": 0.4277,
      "step": 216
    },
    {
      "epoch": 1.4462947543713571,
      "grad_norm": 1.1895403487373037,
      "learning_rate": 4.466439779715696e-06,
      "loss": 0.4219,
      "step": 218
    },
    {
      "epoch": 1.4596169858451291,
      "grad_norm": 1.3959427610193165,
      "learning_rate": 4.451985066691649e-06,
      "loss": 0.4341,
      "step": 220
    },
    {
      "epoch": 1.4729392173189009,
      "grad_norm": 1.2421484766590198,
      "learning_rate": 4.437361221760449e-06,
      "loss": 0.4162,
      "step": 222
    },
    {
      "epoch": 1.4862614487926726,
      "grad_norm": 1.287463815955178,
      "learning_rate": 4.422569512021332e-06,
      "loss": 0.4282,
      "step": 224
    },
    {
      "epoch": 1.4995836802664446,
      "grad_norm": 1.4250139528752677,
      "learning_rate": 4.407611219118363e-06,
      "loss": 0.421,
      "step": 226
    },
    {
      "epoch": 1.5129059117402166,
      "grad_norm": 1.239295099017855,
      "learning_rate": 4.3924876391293915e-06,
      "loss": 0.427,
      "step": 228
    },
    {
      "epoch": 1.5262281432139884,
      "grad_norm": 1.3453909852418124,
      "learning_rate": 4.377200082453748e-06,
      "loss": 0.4357,
      "step": 230
    },
    {
      "epoch": 1.5395503746877601,
      "grad_norm": 1.2197270804139342,
      "learning_rate": 4.361749873698707e-06,
      "loss": 0.4101,
      "step": 232
    },
    {
      "epoch": 1.552872606161532,
      "grad_norm": 1.2833857194816787,
      "learning_rate": 4.346138351564711e-06,
      "loss": 0.424,
      "step": 234
    },
    {
      "epoch": 1.5661948376353039,
      "grad_norm": 1.2293200008377447,
      "learning_rate": 4.330366868729376e-06,
      "loss": 0.421,
      "step": 236
    },
    {
      "epoch": 1.5795170691090759,
      "grad_norm": 1.1926560926173428,
      "learning_rate": 4.3144367917302964e-06,
      "loss": 0.4142,
      "step": 238
    },
    {
      "epoch": 1.5928393005828476,
      "grad_norm": 1.1594494067766803,
      "learning_rate": 4.2983495008466285e-06,
      "loss": 0.4191,
      "step": 240
    },
    {
      "epoch": 1.6061615320566194,
      "grad_norm": 1.224729384745418,
      "learning_rate": 4.2821063899795015e-06,
      "loss": 0.4128,
      "step": 242
    },
    {
      "epoch": 1.6194837635303914,
      "grad_norm": 1.1481228567549495,
      "learning_rate": 4.265708866531238e-06,
      "loss": 0.4279,
      "step": 244
    },
    {
      "epoch": 1.6328059950041633,
      "grad_norm": 1.3467092580505746,
      "learning_rate": 4.249158351283414e-06,
      "loss": 0.4262,
      "step": 246
    },
    {
      "epoch": 1.646128226477935,
      "grad_norm": 1.2776898545321251,
      "learning_rate": 4.232456278273743e-06,
      "loss": 0.4314,
      "step": 248
    },
    {
      "epoch": 1.6594504579517069,
      "grad_norm": 1.2719662910087424,
      "learning_rate": 4.215604094671835e-06,
      "loss": 0.4108,
      "step": 250
    },
    {
      "epoch": 1.6727726894254786,
      "grad_norm": 1.1745562871590098,
      "learning_rate": 4.198603260653792e-06,
      "loss": 0.4165,
      "step": 252
    },
    {
      "epoch": 1.6860949208992506,
      "grad_norm": 1.2455715420366917,
      "learning_rate": 4.181455249275701e-06,
      "loss": 0.4079,
      "step": 254
    },
    {
      "epoch": 1.6994171523730226,
      "grad_norm": 1.3896213959063652,
      "learning_rate": 4.1641615463459926e-06,
      "loss": 0.417,
      "step": 256
    },
    {
      "epoch": 1.7127393838467944,
      "grad_norm": 1.2131393445621887,
      "learning_rate": 4.146723650296701e-06,
      "loss": 0.4116,
      "step": 258
    },
    {
      "epoch": 1.7260616153205661,
      "grad_norm": 1.2101597375627524,
      "learning_rate": 4.129143072053639e-06,
      "loss": 0.4169,
      "step": 260
    },
    {
      "epoch": 1.739383846794338,
      "grad_norm": 1.2983597203629458,
      "learning_rate": 4.111421334905468e-06,
      "loss": 0.4101,
      "step": 262
    },
    {
      "epoch": 1.7527060782681099,
      "grad_norm": 1.1756761204986788,
      "learning_rate": 4.093559974371725e-06,
      "loss": 0.4023,
      "step": 264
    },
    {
      "epoch": 1.7660283097418819,
      "grad_norm": 1.296750722093234,
      "learning_rate": 4.075560538069767e-06,
      "loss": 0.4037,
      "step": 266
    },
    {
      "epoch": 1.7793505412156536,
      "grad_norm": 1.2664686153860956,
      "learning_rate": 4.05742458558068e-06,
      "loss": 0.4005,
      "step": 268
    },
    {
      "epoch": 1.7926727726894254,
      "grad_norm": 1.3144115093925024,
      "learning_rate": 4.039153688314146e-06,
      "loss": 0.4123,
      "step": 270
    },
    {
      "epoch": 1.8059950041631974,
      "grad_norm": 1.177870994913812,
      "learning_rate": 4.020749429372286e-06,
      "loss": 0.4061,
      "step": 272
    },
    {
      "epoch": 1.8193172356369693,
      "grad_norm": 1.1211392036639862,
      "learning_rate": 4.002213403412492e-06,
      "loss": 0.4207,
      "step": 274
    },
    {
      "epoch": 1.832639467110741,
      "grad_norm": 1.1967335338983747,
      "learning_rate": 3.983547216509254e-06,
      "loss": 0.4037,
      "step": 276
    },
    {
      "epoch": 1.8459616985845129,
      "grad_norm": 1.163438902600854,
      "learning_rate": 3.964752486015001e-06,
      "loss": 0.3983,
      "step": 278
    },
    {
      "epoch": 1.8592839300582846,
      "grad_norm": 1.3897690758852341,
      "learning_rate": 3.945830840419966e-06,
      "loss": 0.406,
      "step": 280
    },
    {
      "epoch": 1.8726061615320566,
      "grad_norm": 1.2302319797016965,
      "learning_rate": 3.92678391921108e-06,
      "loss": 0.4102,
      "step": 282
    },
    {
      "epoch": 1.8859283930058286,
      "grad_norm": 1.2515743950418428,
      "learning_rate": 3.907613372729916e-06,
      "loss": 0.4121,
      "step": 284
    },
    {
      "epoch": 1.8992506244796004,
      "grad_norm": 1.2250514633864378,
      "learning_rate": 3.888320862029699e-06,
      "loss": 0.4135,
      "step": 286
    },
    {
      "epoch": 1.9125728559533721,
      "grad_norm": 1.1786595929578796,
      "learning_rate": 3.868908058731376e-06,
      "loss": 0.3961,
      "step": 288
    },
    {
      "epoch": 1.9258950874271439,
      "grad_norm": 1.2316483388259516,
      "learning_rate": 3.849376644878783e-06,
      "loss": 0.3991,
      "step": 290
    },
    {
      "epoch": 1.9392173189009159,
      "grad_norm": 1.2218522002215788,
      "learning_rate": 3.829728312792895e-06,
      "loss": 0.4068,
      "step": 292
    },
    {
      "epoch": 1.9525395503746878,
      "grad_norm": 1.218981908305007,
      "learning_rate": 3.8099647649251984e-06,
      "loss": 0.4116,
      "step": 294
    },
    {
      "epoch": 1.9658617818484596,
      "grad_norm": 1.1473329397682062,
      "learning_rate": 3.790087713710179e-06,
      "loss": 0.3961,
      "step": 296
    },
    {
      "epoch": 1.9791840133222314,
      "grad_norm": 1.15330486401059,
      "learning_rate": 3.770098881416945e-06,
      "loss": 0.397,
      "step": 298
    },
    {
      "epoch": 1.9925062447960034,
      "grad_norm": 1.1147439818886564,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 0.391,
      "step": 300
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.1888727583821848,
      "learning_rate": 3.7297928109491765e-06,
      "loss": 0.2238,
      "step": 302
    },
    {
      "epoch": 2.0133222314737718,
      "grad_norm": 1.1682600742115117,
      "learning_rate": 3.7094790651387414e-06,
      "loss": 0.3464,
      "step": 304
    },
    {
      "epoch": 2.0266444629475435,
      "grad_norm": 1.2543709475465634,
      "learning_rate": 3.689060522675689e-06,
      "loss": 0.3299,
      "step": 306
    },
    {
      "epoch": 2.0399666944213157,
      "grad_norm": 1.209782299511866,
      "learning_rate": 3.668538952747236e-06,
      "loss": 0.3335,
      "step": 308
    },
    {
      "epoch": 2.0532889258950875,
      "grad_norm": 1.2314074580378418,
      "learning_rate": 3.6479161334675294e-06,
      "loss": 0.3402,
      "step": 310
    },
    {
      "epoch": 2.0666111573688593,
      "grad_norm": 1.089978871118908,
      "learning_rate": 3.627193851723577e-06,
      "loss": 0.3282,
      "step": 312
    },
    {
      "epoch": 2.079933388842631,
      "grad_norm": 1.1440650029159125,
      "learning_rate": 3.6063739030204226e-06,
      "loss": 0.3353,
      "step": 314
    },
    {
      "epoch": 2.0932556203164028,
      "grad_norm": 1.1412527172991913,
      "learning_rate": 3.5854580913255706e-06,
      "loss": 0.3377,
      "step": 316
    },
    {
      "epoch": 2.106577851790175,
      "grad_norm": 1.1374336855151732,
      "learning_rate": 3.564448228912682e-06,
      "loss": 0.3303,
      "step": 318
    },
    {
      "epoch": 2.1199000832639467,
      "grad_norm": 1.1689768541112975,
      "learning_rate": 3.543346136204545e-06,
      "loss": 0.3269,
      "step": 320
    },
    {
      "epoch": 2.1332223147377185,
      "grad_norm": 1.1613635619803697,
      "learning_rate": 3.522153641615345e-06,
      "loss": 0.3447,
      "step": 322
    },
    {
      "epoch": 2.1465445462114903,
      "grad_norm": 1.0764748235217316,
      "learning_rate": 3.5008725813922383e-06,
      "loss": 0.3347,
      "step": 324
    },
    {
      "epoch": 2.1598667776852625,
      "grad_norm": 1.242351223908071,
      "learning_rate": 3.4795047994562463e-06,
      "loss": 0.3337,
      "step": 326
    },
    {
      "epoch": 2.1731890091590342,
      "grad_norm": 1.1068446291466676,
      "learning_rate": 3.458052147242494e-06,
      "loss": 0.3411,
      "step": 328
    },
    {
      "epoch": 2.186511240632806,
      "grad_norm": 1.16808964966109,
      "learning_rate": 3.436516483539781e-06,
      "loss": 0.3376,
      "step": 330
    },
    {
      "epoch": 2.1998334721065778,
      "grad_norm": 1.1025319948129593,
      "learning_rate": 3.4148996743295305e-06,
      "loss": 0.3316,
      "step": 332
    },
    {
      "epoch": 2.2131557035803495,
      "grad_norm": 1.1758686501416102,
      "learning_rate": 3.3932035926241103e-06,
      "loss": 0.3355,
      "step": 334
    },
    {
      "epoch": 2.2264779350541217,
      "grad_norm": 1.1003768444337116,
      "learning_rate": 3.3714301183045382e-06,
      "loss": 0.3357,
      "step": 336
    },
    {
      "epoch": 2.2398001665278935,
      "grad_norm": 1.0881028666604091,
      "learning_rate": 3.349581137957604e-06,
      "loss": 0.3364,
      "step": 338
    },
    {
      "epoch": 2.2531223980016652,
      "grad_norm": 1.211964671213877,
      "learning_rate": 3.3276585447123957e-06,
      "loss": 0.3353,
      "step": 340
    },
    {
      "epoch": 2.266444629475437,
      "grad_norm": 1.163639286937533,
      "learning_rate": 3.3056642380762783e-06,
      "loss": 0.329,
      "step": 342
    },
    {
      "epoch": 2.279766860949209,
      "grad_norm": 1.1618660863336634,
      "learning_rate": 3.2836001237702993e-06,
      "loss": 0.3299,
      "step": 344
    },
    {
      "epoch": 2.293089092422981,
      "grad_norm": 1.1575282219975258,
      "learning_rate": 3.2614681135640696e-06,
      "loss": 0.3297,
      "step": 346
    },
    {
      "epoch": 2.3064113238967527,
      "grad_norm": 1.1756458412662194,
      "learning_rate": 3.2392701251101172e-06,
      "loss": 0.3367,
      "step": 348
    },
    {
      "epoch": 2.3197335553705245,
      "grad_norm": 1.1830174958146948,
      "learning_rate": 3.217008081777726e-06,
      "loss": 0.3319,
      "step": 350
    },
    {
      "epoch": 2.3330557868442963,
      "grad_norm": 1.1667340496607632,
      "learning_rate": 3.1946839124862873e-06,
      "loss": 0.3361,
      "step": 352
    },
    {
      "epoch": 2.3463780183180685,
      "grad_norm": 1.1105411198444493,
      "learning_rate": 3.1722995515381644e-06,
      "loss": 0.3425,
      "step": 354
    },
    {
      "epoch": 2.3597002497918402,
      "grad_norm": 1.1234133483520614,
      "learning_rate": 3.149856938451094e-06,
      "loss": 0.3314,
      "step": 356
    },
    {
      "epoch": 2.373022481265612,
      "grad_norm": 1.1838235154662082,
      "learning_rate": 3.127358017790132e-06,
      "loss": 0.3392,
      "step": 358
    },
    {
      "epoch": 2.3863447127393838,
      "grad_norm": 1.080453742242657,
      "learning_rate": 3.1048047389991693e-06,
      "loss": 0.3336,
      "step": 360
    },
    {
      "epoch": 2.3996669442131555,
      "grad_norm": 1.1140835000073062,
      "learning_rate": 3.082199056232015e-06,
      "loss": 0.3414,
      "step": 362
    },
    {
      "epoch": 2.4129891756869277,
      "grad_norm": 1.138752925836035,
      "learning_rate": 3.059542928183079e-06,
      "loss": 0.3329,
      "step": 364
    },
    {
      "epoch": 2.4263114071606995,
      "grad_norm": 1.0610831482375092,
      "learning_rate": 3.0368383179176584e-06,
      "loss": 0.342,
      "step": 366
    },
    {
      "epoch": 2.4396336386344712,
      "grad_norm": 1.1718171313930514,
      "learning_rate": 3.0140871927018466e-06,
      "loss": 0.3266,
      "step": 368
    },
    {
      "epoch": 2.452955870108243,
      "grad_norm": 1.2039181830598997,
      "learning_rate": 2.9912915238320755e-06,
      "loss": 0.338,
      "step": 370
    },
    {
      "epoch": 2.4662781015820148,
      "grad_norm": 1.0760682240024106,
      "learning_rate": 2.9684532864643123e-06,
      "loss": 0.3277,
      "step": 372
    },
    {
      "epoch": 2.479600333055787,
      "grad_norm": 1.2378751102400485,
      "learning_rate": 2.945574459442917e-06,
      "loss": 0.3398,
      "step": 374
    },
    {
      "epoch": 2.4929225645295587,
      "grad_norm": 1.171184691228538,
      "learning_rate": 2.922657025129185e-06,
      "loss": 0.3313,
      "step": 376
    },
    {
      "epoch": 2.5062447960033305,
      "grad_norm": 1.179077198361453,
      "learning_rate": 2.8997029692295875e-06,
      "loss": 0.3364,
      "step": 378
    },
    {
      "epoch": 2.5195670274771023,
      "grad_norm": 1.1745776843262559,
      "learning_rate": 2.876714280623708e-06,
      "loss": 0.3261,
      "step": 380
    },
    {
      "epoch": 2.532889258950874,
      "grad_norm": 1.1445296979936388,
      "learning_rate": 2.8536929511919227e-06,
      "loss": 0.3352,
      "step": 382
    },
    {
      "epoch": 2.5462114904246462,
      "grad_norm": 1.2025025630072426,
      "learning_rate": 2.8306409756428067e-06,
      "loss": 0.3375,
      "step": 384
    },
    {
      "epoch": 2.559533721898418,
      "grad_norm": 1.0971352592709565,
      "learning_rate": 2.807560351340302e-06,
      "loss": 0.3313,
      "step": 386
    },
    {
      "epoch": 2.5728559533721898,
      "grad_norm": 1.1249045530287038,
      "learning_rate": 2.7844530781306544e-06,
      "loss": 0.3359,
      "step": 388
    },
    {
      "epoch": 2.586178184845962,
      "grad_norm": 1.1665793984798016,
      "learning_rate": 2.761321158169134e-06,
      "loss": 0.3251,
      "step": 390
    },
    {
      "epoch": 2.5995004163197337,
      "grad_norm": 1.1275088907272068,
      "learning_rate": 2.738166595746554e-06,
      "loss": 0.3189,
      "step": 392
    },
    {
      "epoch": 2.6128226477935055,
      "grad_norm": 1.1697820606518197,
      "learning_rate": 2.7149913971156105e-06,
      "loss": 0.3305,
      "step": 394
    },
    {
      "epoch": 2.6261448792672772,
      "grad_norm": 1.0995774846811734,
      "learning_rate": 2.6917975703170466e-06,
      "loss": 0.3323,
      "step": 396
    },
    {
      "epoch": 2.639467110741049,
      "grad_norm": 1.1471735378793595,
      "learning_rate": 2.668587125005663e-06,
      "loss": 0.3348,
      "step": 398
    },
    {
      "epoch": 2.652789342214821,
      "grad_norm": 1.1043284251546557,
      "learning_rate": 2.6453620722761897e-06,
      "loss": 0.3244,
      "step": 400
    },
    {
      "epoch": 2.666111573688593,
      "grad_norm": 1.2133025722214072,
      "learning_rate": 2.6221244244890336e-06,
      "loss": 0.3297,
      "step": 402
    },
    {
      "epoch": 2.6794338051623647,
      "grad_norm": 1.0759704642431338,
      "learning_rate": 2.5988761950959133e-06,
      "loss": 0.3294,
      "step": 404
    },
    {
      "epoch": 2.6927560366361365,
      "grad_norm": 1.1303123852236616,
      "learning_rate": 2.575619398465402e-06,
      "loss": 0.327,
      "step": 406
    },
    {
      "epoch": 2.7060782681099083,
      "grad_norm": 1.1874408347855483,
      "learning_rate": 2.5523560497083927e-06,
      "loss": 0.3297,
      "step": 408
    },
    {
      "epoch": 2.7194004995836805,
      "grad_norm": 1.0814676034937838,
      "learning_rate": 2.5290881645034932e-06,
      "loss": 0.3308,
      "step": 410
    },
    {
      "epoch": 2.7327227310574522,
      "grad_norm": 1.0821014638265758,
      "learning_rate": 2.5058177589223766e-06,
      "loss": 0.3286,
      "step": 412
    },
    {
      "epoch": 2.746044962531224,
      "grad_norm": 1.1078782647950531,
      "learning_rate": 2.482546849255096e-06,
      "loss": 0.3289,
      "step": 414
    },
    {
      "epoch": 2.7593671940049957,
      "grad_norm": 1.0709928206025467,
      "learning_rate": 2.4592774518353858e-06,
      "loss": 0.3349,
      "step": 416
    },
    {
      "epoch": 2.7726894254787675,
      "grad_norm": 0.9986348268877544,
      "learning_rate": 2.436011582865945e-06,
      "loss": 0.3284,
      "step": 418
    },
    {
      "epoch": 2.7860116569525397,
      "grad_norm": 1.0407659756205825,
      "learning_rate": 2.4127512582437486e-06,
      "loss": 0.3255,
      "step": 420
    },
    {
      "epoch": 2.7993338884263115,
      "grad_norm": 1.1250160278057286,
      "learning_rate": 2.3894984933853734e-06,
      "loss": 0.3189,
      "step": 422
    },
    {
      "epoch": 2.8126561199000832,
      "grad_norm": 1.1080847331634105,
      "learning_rate": 2.366255303052377e-06,
      "loss": 0.3286,
      "step": 424
    },
    {
      "epoch": 2.825978351373855,
      "grad_norm": 1.1096965833381898,
      "learning_rate": 2.3430237011767166e-06,
      "loss": 0.3393,
      "step": 426
    },
    {
      "epoch": 2.8393005828476268,
      "grad_norm": 1.177552126401324,
      "learning_rate": 2.319805700686257e-06,
      "loss": 0.323,
      "step": 428
    },
    {
      "epoch": 2.852622814321399,
      "grad_norm": 1.1531088333635726,
      "learning_rate": 2.296603313330355e-06,
      "loss": 0.3275,
      "step": 430
    },
    {
      "epoch": 2.8659450457951707,
      "grad_norm": 1.1189431785006225,
      "learning_rate": 2.2734185495055503e-06,
      "loss": 0.3234,
      "step": 432
    },
    {
      "epoch": 2.8792672772689425,
      "grad_norm": 1.0872804861007128,
      "learning_rate": 2.250253418081373e-06,
      "loss": 0.3304,
      "step": 434
    },
    {
      "epoch": 2.8925895087427143,
      "grad_norm": 1.100386612123568,
      "learning_rate": 2.22710992622628e-06,
      "loss": 0.326,
      "step": 436
    },
    {
      "epoch": 2.905911740216486,
      "grad_norm": 1.0750519008987303,
      "learning_rate": 2.2039900792337477e-06,
      "loss": 0.3161,
      "step": 438
    },
    {
      "epoch": 2.9192339716902582,
      "grad_norm": 1.0912428298625954,
      "learning_rate": 2.1808958803485134e-06,
      "loss": 0.3209,
      "step": 440
    },
    {
      "epoch": 2.93255620316403,
      "grad_norm": 1.107507049641638,
      "learning_rate": 2.157829330593008e-06,
      "loss": 0.3363,
      "step": 442
    },
    {
      "epoch": 2.9458784346378017,
      "grad_norm": 1.169768928903536,
      "learning_rate": 2.134792428593971e-06,
      "loss": 0.3327,
      "step": 444
    },
    {
      "epoch": 2.959200666111574,
      "grad_norm": 1.1405241904375514,
      "learning_rate": 2.1117871704092818e-06,
      "loss": 0.3264,
      "step": 446
    },
    {
      "epoch": 2.9725228975853453,
      "grad_norm": 1.1001277407179797,
      "learning_rate": 2.0888155493550027e-06,
      "loss": 0.3135,
      "step": 448
    },
    {
      "epoch": 2.9858451290591175,
      "grad_norm": 1.1129546974887563,
      "learning_rate": 2.0658795558326745e-06,
      "loss": 0.3234,
      "step": 450
    },
    {
      "epoch": 2.9991673605328892,
      "grad_norm": 1.1080818714362697,
      "learning_rate": 2.0429811771568468e-06,
      "loss": 0.322,
      "step": 452
    },
    {
      "epoch": 3.006661115736886,
      "grad_norm": 1.1307642011297194,
      "learning_rate": 2.0201223973828917e-06,
      "loss": 0.1617,
      "step": 454
    },
    {
      "epoch": 3.019983347210658,
      "grad_norm": 1.0035953126863488,
      "learning_rate": 1.997305197135089e-06,
      "loss": 0.2598,
      "step": 456
    },
    {
      "epoch": 3.0333055786844296,
      "grad_norm": 1.0534955052337636,
      "learning_rate": 1.9745315534350157e-06,
      "loss": 0.2715,
      "step": 458
    },
    {
      "epoch": 3.0466278101582014,
      "grad_norm": 1.189844390221147,
      "learning_rate": 1.9518034395302413e-06,
      "loss": 0.2646,
      "step": 460
    },
    {
      "epoch": 3.059950041631973,
      "grad_norm": 1.1253150795326456,
      "learning_rate": 1.9291228247233607e-06,
      "loss": 0.2701,
      "step": 462
    },
    {
      "epoch": 3.0732722731057454,
      "grad_norm": 1.1193701526310147,
      "learning_rate": 1.9064916742013515e-06,
      "loss": 0.2673,
      "step": 464
    },
    {
      "epoch": 3.086594504579517,
      "grad_norm": 1.0959015217977324,
      "learning_rate": 1.883911948865306e-06,
      "loss": 0.2649,
      "step": 466
    },
    {
      "epoch": 3.099916736053289,
      "grad_norm": 1.1965240464412776,
      "learning_rate": 1.8613856051605242e-06,
      "loss": 0.2629,
      "step": 468
    },
    {
      "epoch": 3.1132389675270606,
      "grad_norm": 1.0473419859838504,
      "learning_rate": 1.8389145949069953e-06,
      "loss": 0.2613,
      "step": 470
    },
    {
      "epoch": 3.126561199000833,
      "grad_norm": 1.2108832644207754,
      "learning_rate": 1.816500865130279e-06,
      "loss": 0.2571,
      "step": 472
    },
    {
      "epoch": 3.1398834304746046,
      "grad_norm": 1.0441673255917416,
      "learning_rate": 1.7941463578928088e-06,
      "loss": 0.2766,
      "step": 474
    },
    {
      "epoch": 3.1532056619483764,
      "grad_norm": 1.1708679609837331,
      "learning_rate": 1.7718530101256115e-06,
      "loss": 0.2718,
      "step": 476
    },
    {
      "epoch": 3.166527893422148,
      "grad_norm": 1.1284481739249688,
      "learning_rate": 1.7496227534604859e-06,
      "loss": 0.2575,
      "step": 478
    },
    {
      "epoch": 3.17985012489592,
      "grad_norm": 1.0770429901542908,
      "learning_rate": 1.7274575140626318e-06,
      "loss": 0.2629,
      "step": 480
    },
    {
      "epoch": 3.193172356369692,
      "grad_norm": 1.0692152501631402,
      "learning_rate": 1.7053592124637557e-06,
      "loss": 0.2694,
      "step": 482
    },
    {
      "epoch": 3.206494587843464,
      "grad_norm": 1.039094308900864,
      "learning_rate": 1.6833297633956647e-06,
      "loss": 0.2687,
      "step": 484
    },
    {
      "epoch": 3.2198168193172356,
      "grad_norm": 1.1432726083538918,
      "learning_rate": 1.661371075624363e-06,
      "loss": 0.2722,
      "step": 486
    },
    {
      "epoch": 3.2331390507910074,
      "grad_norm": 1.047486598216707,
      "learning_rate": 1.6394850517846621e-06,
      "loss": 0.26,
      "step": 488
    },
    {
      "epoch": 3.246461282264779,
      "grad_norm": 1.1299207627919639,
      "learning_rate": 1.6176735882153284e-06,
      "loss": 0.2646,
      "step": 490
    },
    {
      "epoch": 3.2597835137385514,
      "grad_norm": 1.0456944660867535,
      "learning_rate": 1.5959385747947697e-06,
      "loss": 0.2628,
      "step": 492
    },
    {
      "epoch": 3.273105745212323,
      "grad_norm": 1.0617694211022177,
      "learning_rate": 1.5742818947772875e-06,
      "loss": 0.2576,
      "step": 494
    },
    {
      "epoch": 3.286427976686095,
      "grad_norm": 1.0978333522782833,
      "learning_rate": 1.552705424629898e-06,
      "loss": 0.2703,
      "step": 496
    },
    {
      "epoch": 3.2997502081598666,
      "grad_norm": 1.0865484727411876,
      "learning_rate": 1.5312110338697427e-06,
      "loss": 0.2692,
      "step": 498
    },
    {
      "epoch": 3.313072439633639,
      "grad_norm": 1.0418725249305938,
      "learning_rate": 1.509800584902108e-06,
      "loss": 0.2642,
      "step": 500
    },
    {
      "epoch": 3.3263946711074106,
      "grad_norm": 1.0660556477168224,
      "learning_rate": 1.4884759328590476e-06,
      "loss": 0.2633,
      "step": 502
    },
    {
      "epoch": 3.3397169025811824,
      "grad_norm": 1.0851955569492033,
      "learning_rate": 1.467238925438646e-06,
      "loss": 0.2677,
      "step": 504
    },
    {
      "epoch": 3.353039134054954,
      "grad_norm": 1.0460266601554127,
      "learning_rate": 1.446091402744923e-06,
      "loss": 0.2682,
      "step": 506
    },
    {
      "epoch": 3.366361365528726,
      "grad_norm": 1.0320032665713081,
      "learning_rate": 1.4250351971283937e-06,
      "loss": 0.2673,
      "step": 508
    },
    {
      "epoch": 3.379683597002498,
      "grad_norm": 1.0694000065346523,
      "learning_rate": 1.4040721330273063e-06,
      "loss": 0.273,
      "step": 510
    },
    {
      "epoch": 3.39300582847627,
      "grad_norm": 1.0986183217647922,
      "learning_rate": 1.3832040268095589e-06,
      "loss": 0.2615,
      "step": 512
    },
    {
      "epoch": 3.4063280599500416,
      "grad_norm": 1.063489274495733,
      "learning_rate": 1.362432686615316e-06,
      "loss": 0.2763,
      "step": 514
    },
    {
      "epoch": 3.4196502914238134,
      "grad_norm": 1.0408747367635172,
      "learning_rate": 1.3417599122003464e-06,
      "loss": 0.2677,
      "step": 516
    },
    {
      "epoch": 3.432972522897585,
      "grad_norm": 1.1352124059324844,
      "learning_rate": 1.3211874947800747e-06,
      "loss": 0.2614,
      "step": 518
    },
    {
      "epoch": 3.4462947543713573,
      "grad_norm": 1.0881790246993637,
      "learning_rate": 1.3007172168743854e-06,
      "loss": 0.2659,
      "step": 520
    },
    {
      "epoch": 3.459616985845129,
      "grad_norm": 1.080506653895442,
      "learning_rate": 1.280350852153168e-06,
      "loss": 0.2666,
      "step": 522
    },
    {
      "epoch": 3.472939217318901,
      "grad_norm": 1.0583310544029485,
      "learning_rate": 1.260090165282645e-06,
      "loss": 0.2648,
      "step": 524
    },
    {
      "epoch": 3.4862614487926726,
      "grad_norm": 1.1152914883872809,
      "learning_rate": 1.2399369117724582e-06,
      "loss": 0.2704,
      "step": 526
    },
    {
      "epoch": 3.4995836802664444,
      "grad_norm": 1.0455279885524973,
      "learning_rate": 1.2198928378235717e-06,
      "loss": 0.2672,
      "step": 528
    },
    {
      "epoch": 3.5129059117402166,
      "grad_norm": 1.0576879823812282,
      "learning_rate": 1.1999596801769617e-06,
      "loss": 0.264,
      "step": 530
    },
    {
      "epoch": 3.5262281432139884,
      "grad_norm": 1.1014402939329688,
      "learning_rate": 1.1801391659631423e-06,
      "loss": 0.2654,
      "step": 532
    },
    {
      "epoch": 3.53955037468776,
      "grad_norm": 1.028865585013293,
      "learning_rate": 1.160433012552508e-06,
      "loss": 0.2637,
      "step": 534
    },
    {
      "epoch": 3.5528726061615323,
      "grad_norm": 1.0546829340917359,
      "learning_rate": 1.1408429274065418e-06,
      "loss": 0.27,
      "step": 536
    },
    {
      "epoch": 3.5661948376353036,
      "grad_norm": 1.0417474358737957,
      "learning_rate": 1.1213706079298566e-06,
      "loss": 0.2589,
      "step": 538
    },
    {
      "epoch": 3.579517069109076,
      "grad_norm": 1.0937717215676659,
      "learning_rate": 1.1020177413231334e-06,
      "loss": 0.2697,
      "step": 540
    },
    {
      "epoch": 3.5928393005828476,
      "grad_norm": 1.0736209133266341,
      "learning_rate": 1.0827860044369226e-06,
      "loss": 0.2645,
      "step": 542
    },
    {
      "epoch": 3.6061615320566194,
      "grad_norm": 1.0474112636925237,
      "learning_rate": 1.06367706362636e-06,
      "loss": 0.2681,
      "step": 544
    },
    {
      "epoch": 3.6194837635303916,
      "grad_norm": 1.103432827044926,
      "learning_rate": 1.0446925746067768e-06,
      "loss": 0.2695,
      "step": 546
    },
    {
      "epoch": 3.6328059950041633,
      "grad_norm": 1.0836832730433759,
      "learning_rate": 1.0258341823102418e-06,
      "loss": 0.2632,
      "step": 548
    },
    {
      "epoch": 3.646128226477935,
      "grad_norm": 1.0859645184669795,
      "learning_rate": 1.0071035207430352e-06,
      "loss": 0.2669,
      "step": 550
    },
    {
      "epoch": 3.659450457951707,
      "grad_norm": 1.1090309698734075,
      "learning_rate": 9.88502212844063e-07,
      "loss": 0.2598,
      "step": 552
    },
    {
      "epoch": 3.6727726894254786,
      "grad_norm": 1.040309343372892,
      "learning_rate": 9.700318703442437e-07,
      "loss": 0.259,
      "step": 554
    },
    {
      "epoch": 3.686094920899251,
      "grad_norm": 1.0821866491462884,
      "learning_rate": 9.516940936268504e-07,
      "loss": 0.261,
      "step": 556
    },
    {
      "epoch": 3.6994171523730226,
      "grad_norm": 1.032839245512739,
      "learning_rate": 9.334904715888496e-07,
      "loss": 0.2726,
      "step": 558
    },
    {
      "epoch": 3.7127393838467944,
      "grad_norm": 1.2154127605600453,
      "learning_rate": 9.154225815032242e-07,
      "loss": 0.257,
      "step": 560
    },
    {
      "epoch": 3.726061615320566,
      "grad_norm": 1.0461952582538157,
      "learning_rate": 8.974919888823164e-07,
      "loss": 0.255,
      "step": 562
    },
    {
      "epoch": 3.739383846794338,
      "grad_norm": 1.0817761044485013,
      "learning_rate": 8.797002473421729e-07,
      "loss": 0.2672,
      "step": 564
    },
    {
      "epoch": 3.75270607826811,
      "grad_norm": 1.1088064613565192,
      "learning_rate": 8.620488984679378e-07,
      "loss": 0.2701,
      "step": 566
    },
    {
      "epoch": 3.766028309741882,
      "grad_norm": 1.0905619434743687,
      "learning_rate": 8.445394716802754e-07,
      "loss": 0.2699,
      "step": 568
    },
    {
      "epoch": 3.7793505412156536,
      "grad_norm": 1.1348105280382488,
      "learning_rate": 8.271734841028553e-07,
      "loss": 0.2625,
      "step": 570
    },
    {
      "epoch": 3.7926727726894254,
      "grad_norm": 1.0895135923548163,
      "learning_rate": 8.099524404308948e-07,
      "loss": 0.2652,
      "step": 572
    },
    {
      "epoch": 3.805995004163197,
      "grad_norm": 1.081980856394784,
      "learning_rate": 7.928778328007918e-07,
      "loss": 0.2725,
      "step": 574
    },
    {
      "epoch": 3.8193172356369693,
      "grad_norm": 1.072896110364212,
      "learning_rate": 7.759511406608255e-07,
      "loss": 0.2534,
      "step": 576
    },
    {
      "epoch": 3.832639467110741,
      "grad_norm": 1.0739615029579452,
      "learning_rate": 7.591738306429769e-07,
      "loss": 0.2664,
      "step": 578
    },
    {
      "epoch": 3.845961698584513,
      "grad_norm": 1.050183747712219,
      "learning_rate": 7.425473564358457e-07,
      "loss": 0.2644,
      "step": 580
    },
    {
      "epoch": 3.8592839300582846,
      "grad_norm": 0.9907767398098887,
      "learning_rate": 7.260731586586983e-07,
      "loss": 0.2654,
      "step": 582
    },
    {
      "epoch": 3.8726061615320564,
      "grad_norm": 1.0920216934407247,
      "learning_rate": 7.097526647366379e-07,
      "loss": 0.2652,
      "step": 584
    },
    {
      "epoch": 3.8859283930058286,
      "grad_norm": 1.0675877474822888,
      "learning_rate": 6.935872887769299e-07,
      "loss": 0.265,
      "step": 586
    },
    {
      "epoch": 3.8992506244796004,
      "grad_norm": 1.047365669548006,
      "learning_rate": 6.775784314464717e-07,
      "loss": 0.2635,
      "step": 588
    },
    {
      "epoch": 3.912572855953372,
      "grad_norm": 1.030022751644315,
      "learning_rate": 6.617274798504286e-07,
      "loss": 0.2628,
      "step": 590
    },
    {
      "epoch": 3.925895087427144,
      "grad_norm": 1.044545063593376,
      "learning_rate": 6.460358074120518e-07,
      "loss": 0.2647,
      "step": 592
    },
    {
      "epoch": 3.9392173189009156,
      "grad_norm": 1.0225003684521647,
      "learning_rate": 6.305047737536707e-07,
      "loss": 0.2625,
      "step": 594
    },
    {
      "epoch": 3.952539550374688,
      "grad_norm": 1.0533487294826005,
      "learning_rate": 6.151357245788917e-07,
      "loss": 0.2731,
      "step": 596
    },
    {
      "epoch": 3.9658617818484596,
      "grad_norm": 1.034466643395701,
      "learning_rate": 5.999299915559956e-07,
      "loss": 0.2558,
      "step": 598
    },
    {
      "epoch": 3.9791840133222314,
      "grad_norm": 1.037903752833203,
      "learning_rate": 5.848888922025553e-07,
      "loss": 0.2618,
      "step": 600
    },
    {
      "epoch": 3.9925062447960036,
      "grad_norm": 1.027043640385128,
      "learning_rate": 5.700137297712749e-07,
      "loss": 0.2669,
      "step": 602
    },
    {
      "epoch": 4.0,
      "grad_norm": 1.098155802724837,
      "learning_rate": 5.553057931370729e-07,
      "loss": 0.1505,
      "step": 604
    },
    {
      "epoch": 4.013322231473772,
      "grad_norm": 1.0453208918296613,
      "learning_rate": 5.407663566854008e-07,
      "loss": 0.2321,
      "step": 606
    },
    {
      "epoch": 4.0266444629475435,
      "grad_norm": 1.0336345571966956,
      "learning_rate": 5.263966802018275e-07,
      "loss": 0.2359,
      "step": 608
    },
    {
      "epoch": 4.039966694421316,
      "grad_norm": 1.0051543661966322,
      "learning_rate": 5.121980087628802e-07,
      "loss": 0.2286,
      "step": 610
    },
    {
      "epoch": 4.053288925895087,
      "grad_norm": 1.0291906973127083,
      "learning_rate": 4.981715726281666e-07,
      "loss": 0.2322,
      "step": 612
    },
    {
      "epoch": 4.066611157368859,
      "grad_norm": 1.0535023572236821,
      "learning_rate": 4.843185871337722e-07,
      "loss": 0.2402,
      "step": 614
    },
    {
      "epoch": 4.0799333888426315,
      "grad_norm": 1.1023709895301759,
      "learning_rate": 4.706402525869633e-07,
      "loss": 0.2322,
      "step": 616
    },
    {
      "epoch": 4.093255620316403,
      "grad_norm": 1.0283405302482598,
      "learning_rate": 4.5713775416217884e-07,
      "loss": 0.2238,
      "step": 618
    },
    {
      "epoch": 4.106577851790175,
      "grad_norm": 1.0393264816137988,
      "learning_rate": 4.438122617983442e-07,
      "loss": 0.2292,
      "step": 620
    },
    {
      "epoch": 4.119900083263946,
      "grad_norm": 1.0587543311732102,
      "learning_rate": 4.3066493009749853e-07,
      "loss": 0.2293,
      "step": 622
    },
    {
      "epoch": 4.1332223147377185,
      "grad_norm": 1.0914720689441537,
      "learning_rate": 4.1769689822475147e-07,
      "loss": 0.2317,
      "step": 624
    },
    {
      "epoch": 4.146544546211491,
      "grad_norm": 1.0417273334496886,
      "learning_rate": 4.049092898095816e-07,
      "loss": 0.2358,
      "step": 626
    },
    {
      "epoch": 4.159866777685262,
      "grad_norm": 1.0248130840986234,
      "learning_rate": 3.9230321284847856e-07,
      "loss": 0.2355,
      "step": 628
    },
    {
      "epoch": 4.173189009159034,
      "grad_norm": 1.0589305077753277,
      "learning_rate": 3.798797596089351e-07,
      "loss": 0.2331,
      "step": 630
    },
    {
      "epoch": 4.1865112406328056,
      "grad_norm": 1.0399059506346249,
      "learning_rate": 3.6764000653481263e-07,
      "loss": 0.2352,
      "step": 632
    },
    {
      "epoch": 4.199833472106578,
      "grad_norm": 1.0352919697923912,
      "learning_rate": 3.555850141530659e-07,
      "loss": 0.2327,
      "step": 634
    },
    {
      "epoch": 4.21315570358035,
      "grad_norm": 0.989140712662966,
      "learning_rate": 3.4371582698185636e-07,
      "loss": 0.228,
      "step": 636
    },
    {
      "epoch": 4.226477935054121,
      "grad_norm": 1.0090105290858724,
      "learning_rate": 3.3203347344004737e-07,
      "loss": 0.2258,
      "step": 638
    },
    {
      "epoch": 4.2398001665278935,
      "grad_norm": 0.9991149517617007,
      "learning_rate": 3.2053896575809426e-07,
      "loss": 0.2199,
      "step": 640
    },
    {
      "epoch": 4.253122398001666,
      "grad_norm": 1.014374420272404,
      "learning_rate": 3.092332998903416e-07,
      "loss": 0.2261,
      "step": 642
    },
    {
      "epoch": 4.266444629475437,
      "grad_norm": 1.0250317424256117,
      "learning_rate": 2.981174554287239e-07,
      "loss": 0.2381,
      "step": 644
    },
    {
      "epoch": 4.279766860949209,
      "grad_norm": 1.0442581559998447,
      "learning_rate": 2.871923955178918e-07,
      "loss": 0.2315,
      "step": 646
    },
    {
      "epoch": 4.2930890924229805,
      "grad_norm": 1.0098371613642636,
      "learning_rate": 2.764590667717562e-07,
      "loss": 0.2272,
      "step": 648
    },
    {
      "epoch": 4.306411323896753,
      "grad_norm": 1.0807767731419033,
      "learning_rate": 2.6591839919146963e-07,
      "loss": 0.2394,
      "step": 650
    },
    {
      "epoch": 4.319733555370525,
      "grad_norm": 1.054167521910636,
      "learning_rate": 2.555713060848433e-07,
      "loss": 0.2324,
      "step": 652
    },
    {
      "epoch": 4.333055786844296,
      "grad_norm": 1.107035611645368,
      "learning_rate": 2.454186839872158e-07,
      "loss": 0.2357,
      "step": 654
    },
    {
      "epoch": 4.3463780183180685,
      "grad_norm": 1.0552376707704954,
      "learning_rate": 2.3546141258376786e-07,
      "loss": 0.2289,
      "step": 656
    },
    {
      "epoch": 4.35970024979184,
      "grad_norm": 1.0047757316250936,
      "learning_rate": 2.257003546333042e-07,
      "loss": 0.2281,
      "step": 658
    },
    {
      "epoch": 4.373022481265612,
      "grad_norm": 1.0426529499317703,
      "learning_rate": 2.1613635589349756e-07,
      "loss": 0.2351,
      "step": 660
    },
    {
      "epoch": 4.386344712739384,
      "grad_norm": 1.0168386832947722,
      "learning_rate": 2.0677024504760752e-07,
      "loss": 0.2329,
      "step": 662
    },
    {
      "epoch": 4.3996669442131555,
      "grad_norm": 1.004364247984247,
      "learning_rate": 1.9760283363267684e-07,
      "loss": 0.2309,
      "step": 664
    },
    {
      "epoch": 4.412989175686928,
      "grad_norm": 1.0575692314944383,
      "learning_rate": 1.8863491596921745e-07,
      "loss": 0.2338,
      "step": 666
    },
    {
      "epoch": 4.426311407160699,
      "grad_norm": 1.0256602646785253,
      "learning_rate": 1.798672690923828e-07,
      "loss": 0.2286,
      "step": 668
    },
    {
      "epoch": 4.439633638634471,
      "grad_norm": 0.9903962555666792,
      "learning_rate": 1.713006526846439e-07,
      "loss": 0.2299,
      "step": 670
    },
    {
      "epoch": 4.4529558701082435,
      "grad_norm": 1.006720208531802,
      "learning_rate": 1.629358090099639e-07,
      "loss": 0.2308,
      "step": 672
    },
    {
      "epoch": 4.466278101582015,
      "grad_norm": 1.0131829979414444,
      "learning_rate": 1.5477346284948292e-07,
      "loss": 0.2291,
      "step": 674
    },
    {
      "epoch": 4.479600333055787,
      "grad_norm": 1.0035493986435864,
      "learning_rate": 1.4681432143872133e-07,
      "loss": 0.2345,
      "step": 676
    },
    {
      "epoch": 4.492922564529558,
      "grad_norm": 1.0043750746548528,
      "learning_rate": 1.3905907440629752e-07,
      "loss": 0.2293,
      "step": 678
    },
    {
      "epoch": 4.5062447960033305,
      "grad_norm": 1.041883268646126,
      "learning_rate": 1.31508393714177e-07,
      "loss": 0.2228,
      "step": 680
    },
    {
      "epoch": 4.519567027477103,
      "grad_norm": 1.0405556943028,
      "learning_rate": 1.241629335994471e-07,
      "loss": 0.2281,
      "step": 682
| }, | |
| { | |
| "epoch": 4.532889258950874, | |
| "grad_norm": 1.0206604077473356, | |
| "learning_rate": 1.1702333051763271e-07, | |
| "loss": 0.2223, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 4.546211490424646, | |
| "grad_norm": 1.1168719067709043, | |
| "learning_rate": 1.1009020308754587e-07, | |
| "loss": 0.2296, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 4.559533721898418, | |
| "grad_norm": 1.061012715283086, | |
| "learning_rate": 1.0336415203768962e-07, | |
| "loss": 0.2338, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 4.57285595337219, | |
| "grad_norm": 1.0309834474331188, | |
| "learning_rate": 9.684576015420277e-08, | |
| "loss": 0.2328, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 4.586178184845962, | |
| "grad_norm": 1.0027277768812777, | |
| "learning_rate": 9.053559223036746e-08, | |
| "loss": 0.2195, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 4.599500416319733, | |
| "grad_norm": 0.993857076400692, | |
| "learning_rate": 8.44341950176683e-08, | |
| "loss": 0.2256, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 4.6128226477935055, | |
| "grad_norm": 1.007429158342742, | |
| "learning_rate": 7.854209717842231e-08, | |
| "loss": 0.2319, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 4.626144879267278, | |
| "grad_norm": 1.033726145073033, | |
| "learning_rate": 7.285980923996989e-08, | |
| "loss": 0.2342, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 4.639467110741049, | |
| "grad_norm": 1.0168527853674483, | |
| "learning_rate": 6.738782355044048e-08, | |
| "loss": 0.234, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 4.652789342214821, | |
| "grad_norm": 1.0477507076137358, | |
| "learning_rate": 6.212661423609184e-08, | |
| "loss": 0.2342, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 4.6661115736885925, | |
| "grad_norm": 1.0066144201644125, | |
| "learning_rate": 5.707663716023021e-08, | |
| "loss": 0.2181, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 4.679433805162365, | |
| "grad_norm": 1.0584682384889934, | |
| "learning_rate": 5.22383298837098e-08, | |
| "loss": 0.2316, | |
| "step": 706 | |
| }, | |
| { | |
| "epoch": 4.692756036636137, | |
| "grad_norm": 0.9785329546521053, | |
| "learning_rate": 4.761211162702117e-08, | |
| "loss": 0.23, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 4.706078268109908, | |
| "grad_norm": 1.0383108228822218, | |
| "learning_rate": 4.319838323396691e-08, | |
| "loss": 0.2331, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 4.7194004995836805, | |
| "grad_norm": 1.0036079922992014, | |
| "learning_rate": 3.8997527136930004e-08, | |
| "loss": 0.2255, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 4.732722731057452, | |
| "grad_norm": 1.0350261932080171, | |
| "learning_rate": 3.5009907323737826e-08, | |
| "loss": 0.241, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 4.746044962531224, | |
| "grad_norm": 1.0546445582741784, | |
| "learning_rate": 3.1235869306123766e-08, | |
| "loss": 0.2278, | |
| "step": 716 | |
| }, | |
| { | |
| "epoch": 4.759367194004996, | |
| "grad_norm": 1.002658916833135, | |
| "learning_rate": 2.767574008979007e-08, | |
| "loss": 0.2263, | |
| "step": 718 | |
| }, | |
| { | |
| "epoch": 4.7726894254787675, | |
| "grad_norm": 1.0293431900981997, | |
| "learning_rate": 2.4329828146074096e-08, | |
| "loss": 0.234, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 4.78601165695254, | |
| "grad_norm": 1.048978506558003, | |
| "learning_rate": 2.1198423385220822e-08, | |
| "loss": 0.2272, | |
| "step": 722 | |
| }, | |
| { | |
| "epoch": 4.799333888426311, | |
| "grad_norm": 1.0188945259859212, | |
| "learning_rate": 1.82817971312621e-08, | |
| "loss": 0.2254, | |
| "step": 724 | |
| }, | |
| { | |
| "epoch": 4.812656119900083, | |
| "grad_norm": 1.0799756942526888, | |
| "learning_rate": 1.5580202098509078e-08, | |
| "loss": 0.2324, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 4.8259783513738554, | |
| "grad_norm": 1.0033120373906426, | |
| "learning_rate": 1.3093872369654148e-08, | |
| "loss": 0.2261, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 4.839300582847627, | |
| "grad_norm": 1.0341825743001596, | |
| "learning_rate": 1.0823023375489128e-08, | |
| "loss": 0.2301, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 4.852622814321399, | |
| "grad_norm": 0.9930386687248629, | |
| "learning_rate": 8.767851876239075e-09, | |
| "loss": 0.2289, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 4.86594504579517, | |
| "grad_norm": 1.050611973007718, | |
| "learning_rate": 6.9285359445145366e-09, | |
| "loss": 0.2322, | |
| "step": 734 | |
| }, | |
| { | |
| "epoch": 4.8792672772689425, | |
| "grad_norm": 1.0238154022229573, | |
| "learning_rate": 5.305234949880001e-09, | |
| "loss": 0.2314, | |
| "step": 736 | |
| }, | |
| { | |
| "epoch": 4.892589508742715, | |
| "grad_norm": 1.0359224074295086, | |
| "learning_rate": 3.8980895450474455e-09, | |
| "loss": 0.2445, | |
| "step": 738 | |
| }, | |
| { | |
| "epoch": 4.905911740216486, | |
| "grad_norm": 1.0513486639225555, | |
| "learning_rate": 2.7072216536885855e-09, | |
| "loss": 0.2366, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 4.919233971690258, | |
| "grad_norm": 1.0043522118975665, | |
| "learning_rate": 1.7327344598702667e-09, | |
| "loss": 0.2373, | |
| "step": 742 | |
| }, | |
| { | |
| "epoch": 4.9325562031640295, | |
| "grad_norm": 1.0244814544018699, | |
| "learning_rate": 9.747123991141193e-10, | |
| "loss": 0.2333, | |
| "step": 744 | |
| }, | |
| { | |
| "epoch": 4.945878434637802, | |
| "grad_norm": 1.0043902802504958, | |
| "learning_rate": 4.332211510807427e-10, | |
| "loss": 0.2322, | |
| "step": 746 | |
| }, | |
| { | |
| "epoch": 4.959200666111574, | |
| "grad_norm": 0.9910871784096957, | |
| "learning_rate": 1.0830763387897902e-10, | |
| "loss": 0.2172, | |
| "step": 748 | |
| }, | |
| { | |
| "epoch": 4.972522897585345, | |
| "grad_norm": 1.0059895919675135, | |
| "learning_rate": 0.0, | |
| "loss": 0.2232, | |
| "step": 750 | |
| } | |
| ], | |
| "logging_steps": 2, | |
| "max_steps": 750, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1025260732350464.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |