{
  "best_global_step": 441,
  "best_metric": 0.32748550176620483,
  "best_model_checkpoint": "saves/prompt-tuning/llama-3-8b-instruct/train_wsc_456_1760356427/checkpoint-441",
  "epoch": 10.0,
  "eval_steps": 63,
  "global_step": 1250,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04,
      "grad_norm": 4.75,
      "learning_rate": 0.00096,
      "loss": 0.6432,
      "num_input_tokens_seen": 2048,
      "step": 5
    },
    {
      "epoch": 0.08,
      "grad_norm": 11.4375,
      "learning_rate": 0.0021599999999999996,
      "loss": 0.4748,
      "num_input_tokens_seen": 4000,
      "step": 10
    },
    {
      "epoch": 0.12,
      "grad_norm": 87.0,
      "learning_rate": 0.00336,
      "loss": 4.1821,
      "num_input_tokens_seen": 5920,
      "step": 15
    },
    {
      "epoch": 0.16,
      "grad_norm": 93.0,
      "learning_rate": 0.00456,
      "loss": 2.1496,
      "num_input_tokens_seen": 8000,
      "step": 20
    },
    {
      "epoch": 0.2,
      "grad_norm": 264.0,
      "learning_rate": 0.0057599999999999995,
      "loss": 1.1387,
      "num_input_tokens_seen": 10176,
      "step": 25
    },
    {
      "epoch": 0.24,
      "grad_norm": 16.125,
      "learning_rate": 0.00696,
      "loss": 1.4367,
      "num_input_tokens_seen": 12256,
      "step": 30
    },
    {
      "epoch": 0.28,
      "grad_norm": 24.375,
      "learning_rate": 0.00816,
      "loss": 1.488,
      "num_input_tokens_seen": 14112,
      "step": 35
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.25,
      "learning_rate": 0.00936,
      "loss": 0.5887,
      "num_input_tokens_seen": 15808,
      "step": 40
    },
    {
      "epoch": 0.36,
      "grad_norm": 4.75,
      "learning_rate": 0.010559999999999998,
      "loss": 1.0068,
      "num_input_tokens_seen": 17600,
      "step": 45
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.0625,
      "learning_rate": 0.01176,
      "loss": 0.4659,
      "num_input_tokens_seen": 19296,
      "step": 50
    },
    {
      "epoch": 0.44,
      "grad_norm": 2.65625,
      "learning_rate": 0.01296,
      "loss": 0.4367,
      "num_input_tokens_seen": 21376,
      "step": 55
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.25,
      "learning_rate": 0.014159999999999999,
      "loss": 0.9258,
      "num_input_tokens_seen": 23680,
      "step": 60
    },
    {
      "epoch": 0.504,
      "eval_loss": 0.3463467061519623,
      "eval_runtime": 0.9482,
      "eval_samples_per_second": 59.062,
      "eval_steps_per_second": 14.766,
      "num_input_tokens_seen": 24704,
      "step": 63
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.9296875,
      "learning_rate": 0.01536,
      "loss": 0.5618,
      "num_input_tokens_seen": 25440,
      "step": 65
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.70703125,
      "learning_rate": 0.016560000000000002,
      "loss": 0.4041,
      "num_input_tokens_seen": 27232,
      "step": 70
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.1083984375,
      "learning_rate": 0.017759999999999998,
      "loss": 0.4381,
      "num_input_tokens_seen": 29216,
      "step": 75
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.0927734375,
      "learning_rate": 0.01896,
      "loss": 0.393,
      "num_input_tokens_seen": 30912,
      "step": 80
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.271484375,
      "learning_rate": 0.02016,
      "loss": 0.3895,
      "num_input_tokens_seen": 32736,
      "step": 85
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.0791015625,
      "learning_rate": 0.021359999999999997,
      "loss": 0.3465,
      "num_input_tokens_seen": 34592,
      "step": 90
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.0673828125,
      "learning_rate": 0.02256,
      "loss": 0.3712,
      "num_input_tokens_seen": 36480,
      "step": 95
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.05078125,
      "learning_rate": 0.02376,
      "loss": 0.3497,
      "num_input_tokens_seen": 38624,
      "step": 100
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.072265625,
      "learning_rate": 0.02496,
      "loss": 0.3676,
      "num_input_tokens_seen": 40672,
      "step": 105
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.038330078125,
      "learning_rate": 0.02616,
      "loss": 0.3566,
      "num_input_tokens_seen": 42784,
      "step": 110
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.050537109375,
      "learning_rate": 0.02736,
      "loss": 0.3705,
      "num_input_tokens_seen": 44320,
      "step": 115
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.02490234375,
      "learning_rate": 0.02856,
      "loss": 0.3427,
      "num_input_tokens_seen": 46528,
      "step": 120
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.1494140625,
      "learning_rate": 0.029759999999999998,
      "loss": 0.4236,
      "num_input_tokens_seen": 48240,
      "step": 125
    },
    {
      "epoch": 1.008,
      "eval_loss": 0.3752361834049225,
      "eval_runtime": 0.9851,
      "eval_samples_per_second": 56.847,
      "eval_steps_per_second": 14.212,
      "num_input_tokens_seen": 48688,
      "step": 126
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.0255126953125,
      "learning_rate": 0.029999064225016296,
      "loss": 0.3789,
      "num_input_tokens_seen": 50192,
      "step": 130
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.04931640625,
      "learning_rate": 0.029995262839249498,
      "loss": 0.3428,
      "num_input_tokens_seen": 52080,
      "step": 135
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.021484375,
      "learning_rate": 0.0299885380972807,
      "loss": 0.3735,
      "num_input_tokens_seen": 53936,
      "step": 140
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.034912109375,
      "learning_rate": 0.02997889131011168,
      "loss": 0.3521,
      "num_input_tokens_seen": 56176,
      "step": 145
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.09130859375,
      "learning_rate": 0.0299663243584027,
      "loss": 0.3711,
      "num_input_tokens_seen": 58096,
      "step": 150
    },
    {
      "epoch": 1.24,
      "grad_norm": 10.9375,
      "learning_rate": 0.029950839692105897,
      "loss": 0.4052,
      "num_input_tokens_seen": 60208,
      "step": 155
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.416015625,
      "learning_rate": 0.029932440329987653,
      "loss": 0.5242,
      "num_input_tokens_seen": 62064,
      "step": 160
    },
    {
      "epoch": 1.32,
      "grad_norm": 0.10791015625,
      "learning_rate": 0.02991112985904007,
      "loss": 1.0726,
      "num_input_tokens_seen": 64304,
      "step": 165
    },
    {
      "epoch": 1.3599999999999999,
      "grad_norm": 0.11181640625,
      "learning_rate": 0.029886912433781675,
      "loss": 0.3788,
      "num_input_tokens_seen": 66480,
      "step": 170
    },
    {
      "epoch": 1.4,
      "grad_norm": 0.09130859375,
      "learning_rate": 0.02985979277544751,
      "loss": 0.3532,
      "num_input_tokens_seen": 68080,
      "step": 175
    },
    {
      "epoch": 1.44,
      "grad_norm": 0.019287109375,
      "learning_rate": 0.029829776171068707,
      "loss": 0.3652,
      "num_input_tokens_seen": 70288,
      "step": 180
    },
    {
      "epoch": 1.48,
      "grad_norm": 0.052490234375,
      "learning_rate": 0.029796868472441763,
      "loss": 0.3432,
      "num_input_tokens_seen": 72048,
      "step": 185
    },
    {
      "epoch": 1.512,
      "eval_loss": 0.3334037959575653,
      "eval_runtime": 0.9347,
      "eval_samples_per_second": 59.91,
      "eval_steps_per_second": 14.977,
      "num_input_tokens_seen": 73456,
      "step": 189
    },
    {
      "epoch": 1.52,
      "grad_norm": 0.023681640625,
      "learning_rate": 0.029761076094987723,
      "loss": 0.3447,
      "num_input_tokens_seen": 74096,
      "step": 190
    },
    {
      "epoch": 1.56,
      "grad_norm": 0.05712890625,
      "learning_rate": 0.02972240601650149,
      "loss": 0.3917,
      "num_input_tokens_seen": 76080,
      "step": 195
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.00750732421875,
      "learning_rate": 0.029680865775791494,
      "loss": 0.3633,
      "num_input_tokens_seen": 78192,
      "step": 200
    },
    {
      "epoch": 1.6400000000000001,
      "grad_norm": 0.08447265625,
      "learning_rate": 0.02963646347120996,
      "loss": 0.3483,
      "num_input_tokens_seen": 80048,
      "step": 205
    },
    {
      "epoch": 1.6800000000000002,
      "grad_norm": 0.02099609375,
      "learning_rate": 0.029589207759074154,
      "loss": 0.3647,
      "num_input_tokens_seen": 81872,
      "step": 210
    },
    {
      "epoch": 1.72,
      "grad_norm": 0.099609375,
      "learning_rate": 0.029539107851978778,
      "loss": 0.3687,
      "num_input_tokens_seen": 83568,
      "step": 215
    },
    {
      "epoch": 1.76,
      "grad_norm": 0.044189453125,
      "learning_rate": 0.02948617351699999,
      "loss": 0.3515,
      "num_input_tokens_seen": 85808,
      "step": 220
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.01220703125,
      "learning_rate": 0.029430415073791287,
      "loss": 0.3457,
      "num_input_tokens_seen": 87568,
      "step": 225
    },
    {
      "epoch": 1.8399999999999999,
      "grad_norm": 0.059814453125,
      "learning_rate": 0.029371843392571644,
      "loss": 0.3506,
      "num_input_tokens_seen": 89424,
      "step": 230
    },
    {
      "epoch": 1.88,
      "grad_norm": 0.006988525390625,
      "learning_rate": 0.029310469892006367,
      "loss": 0.3775,
      "num_input_tokens_seen": 91120,
      "step": 235
    },
    {
      "epoch": 1.92,
      "grad_norm": 0.01434326171875,
      "learning_rate": 0.029246306536981,
      "loss": 0.3575,
      "num_input_tokens_seen": 93072,
      "step": 240
    },
    {
      "epoch": 1.96,
      "grad_norm": 0.0267333984375,
      "learning_rate": 0.02917936583626874,
      "loss": 0.3234,
      "num_input_tokens_seen": 95216,
      "step": 245
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.029052734375,
      "learning_rate": 0.029109660840091818,
      "loss": 0.3609,
      "num_input_tokens_seen": 96896,
      "step": 250
    },
    {
      "epoch": 2.016,
      "eval_loss": 0.32756733894348145,
      "eval_runtime": 0.974,
      "eval_samples_per_second": 57.497,
      "eval_steps_per_second": 14.374,
      "num_input_tokens_seen": 97568,
      "step": 252
    },
    {
      "epoch": 2.04,
      "grad_norm": 0.059814453125,
      "learning_rate": 0.029037205137577363,
      "loss": 0.3726,
      "num_input_tokens_seen": 98816,
      "step": 255
    },
    {
      "epoch": 2.08,
      "grad_norm": 0.03271484375,
      "learning_rate": 0.02896201285410813,
      "loss": 0.3451,
      "num_input_tokens_seen": 100736,
      "step": 260
    },
    {
      "epoch": 2.12,
      "grad_norm": 0.00927734375,
      "learning_rate": 0.028884098648568782,
      "loss": 0.3571,
      "num_input_tokens_seen": 102592,
      "step": 265
    },
    {
      "epoch": 2.16,
      "grad_norm": 0.006927490234375,
      "learning_rate": 0.028803477710488055,
      "loss": 0.3437,
      "num_input_tokens_seen": 104224,
      "step": 270
    },
    {
      "epoch": 2.2,
      "grad_norm": 0.0625,
      "learning_rate": 0.028720165757077573,
      "loss": 0.3383,
      "num_input_tokens_seen": 105984,
      "step": 275
    },
    {
      "epoch": 2.24,
      "grad_norm": 0.0301513671875,
      "learning_rate": 0.02863417903016773,
      "loss": 0.3627,
      "num_input_tokens_seen": 107840,
      "step": 280
    },
    {
      "epoch": 2.2800000000000002,
      "grad_norm": 0.046630859375,
      "learning_rate": 0.02854553429304131,
      "loss": 0.3626,
      "num_input_tokens_seen": 110048,
      "step": 285
    },
    {
      "epoch": 2.32,
      "grad_norm": 0.006256103515625,
      "learning_rate": 0.02845424882716545,
      "loss": 0.3415,
      "num_input_tokens_seen": 112320,
      "step": 290
    },
    {
      "epoch": 2.36,
      "grad_norm": 0.051513671875,
      "learning_rate": 0.028360340428822597,
      "loss": 0.3509,
      "num_input_tokens_seen": 114048,
      "step": 295
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.06591796875,
      "learning_rate": 0.028263827405641085,
      "loss": 0.3427,
      "num_input_tokens_seen": 115936,
      "step": 300
    },
    {
      "epoch": 2.44,
      "grad_norm": 0.0279541015625,
      "learning_rate": 0.028164728573026005,
      "loss": 0.3288,
      "num_input_tokens_seen": 117792,
      "step": 305
    },
    {
      "epoch": 2.48,
      "grad_norm": 0.022216796875,
      "learning_rate": 0.02806306325049113,
      "loss": 0.3418,
      "num_input_tokens_seen": 119872,
      "step": 310
    },
    {
      "epoch": 2.52,
      "grad_norm": 0.05126953125,
      "learning_rate": 0.027958851257892527,
      "loss": 0.3639,
      "num_input_tokens_seen": 121888,
      "step": 315
    },
    {
      "epoch": 2.52,
      "eval_loss": 0.37361711263656616,
      "eval_runtime": 0.9479,
      "eval_samples_per_second": 59.081,
      "eval_steps_per_second": 14.77,
      "num_input_tokens_seen": 121888,
      "step": 315
    },
    {
      "epoch": 2.56,
      "grad_norm": 0.06494140625,
      "learning_rate": 0.02785211291156464,
      "loss": 0.3619,
      "num_input_tokens_seen": 123936,
      "step": 320
    },
    {
      "epoch": 2.6,
      "grad_norm": 0.053466796875,
      "learning_rate": 0.027742869020359582,
      "loss": 0.3424,
      "num_input_tokens_seen": 125952,
      "step": 325
    },
    {
      "epoch": 2.64,
      "grad_norm": 0.01361083984375,
      "learning_rate": 0.027631140881590383,
      "loss": 0.3878,
      "num_input_tokens_seen": 128352,
      "step": 330
    },
    {
      "epoch": 2.68,
      "grad_norm": 0.01129150390625,
      "learning_rate": 0.027516950276879084,
      "loss": 0.3405,
      "num_input_tokens_seen": 130496,
      "step": 335
    },
    {
      "epoch": 2.7199999999999998,
      "grad_norm": 0.01043701171875,
      "learning_rate": 0.02740031946791033,
      "loss": 0.3449,
      "num_input_tokens_seen": 132864,
      "step": 340
    },
    {
      "epoch": 2.76,
      "grad_norm": 0.026611328125,
      "learning_rate": 0.027281271192091415,
      "loss": 0.3469,
      "num_input_tokens_seen": 134592,
      "step": 345
    },
    {
      "epoch": 2.8,
      "grad_norm": 0.027099609375,
      "learning_rate": 0.027159828658119597,
      "loss": 0.3497,
      "num_input_tokens_seen": 136224,
      "step": 350
    },
    {
      "epoch": 2.84,
      "grad_norm": 0.031494140625,
      "learning_rate": 0.0270360155414575,
      "loss": 0.3476,
      "num_input_tokens_seen": 138048,
      "step": 355
    },
    {
      "epoch": 2.88,
      "grad_norm": 0.059814453125,
      "learning_rate": 0.02690985597971753,
      "loss": 0.351,
      "num_input_tokens_seen": 139840,
      "step": 360
    },
    {
      "epoch": 2.92,
      "grad_norm": 0.00653076171875,
      "learning_rate": 0.026781374567956224,
      "loss": 0.348,
      "num_input_tokens_seen": 141728,
      "step": 365
    },
    {
      "epoch": 2.96,
      "grad_norm": 0.03125,
      "learning_rate": 0.026650596353879386,
      "loss": 0.3574,
      "num_input_tokens_seen": 143584,
      "step": 370
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.051513671875,
      "learning_rate": 0.026517546832958965,
      "loss": 0.3418,
      "num_input_tokens_seen": 145184,
      "step": 375
    },
    {
      "epoch": 3.024,
      "eval_loss": 0.32964640855789185,
      "eval_runtime": 0.9858,
      "eval_samples_per_second": 56.807,
      "eval_steps_per_second": 14.202,
      "num_input_tokens_seen": 146336,
      "step": 378
    },
    {
      "epoch": 3.04,
      "grad_norm": 0.019775390625,
      "learning_rate": 0.026382251943462682,
      "loss": 0.3617,
      "num_input_tokens_seen": 147328,
      "step": 380
    },
    {
      "epoch": 3.08,
      "grad_norm": 0.02001953125,
      "learning_rate": 0.026244738061397325,
      "loss": 0.3622,
      "num_input_tokens_seen": 149376,
      "step": 385
    },
    {
      "epoch": 3.12,
      "grad_norm": 0.031494140625,
      "learning_rate": 0.026105031995366672,
      "loss": 0.3533,
      "num_input_tokens_seen": 151744,
      "step": 390
    },
    {
      "epoch": 3.16,
      "grad_norm": 0.05126953125,
      "learning_rate": 0.025963160981345105,
      "loss": 0.347,
      "num_input_tokens_seen": 153920,
      "step": 395
    },
    {
      "epoch": 3.2,
      "grad_norm": 0.0234375,
      "learning_rate": 0.02581915267736791,
      "loss": 0.3437,
      "num_input_tokens_seen": 155776,
      "step": 400
    },
    {
      "epoch": 3.24,
      "grad_norm": 0.00848388671875,
      "learning_rate": 0.025673035158139283,
      "loss": 0.3403,
      "num_input_tokens_seen": 157952,
      "step": 405
    },
    {
      "epoch": 3.2800000000000002,
      "grad_norm": 0.0079345703125,
      "learning_rate": 0.02552483690955911,
      "loss": 0.3583,
      "num_input_tokens_seen": 160032,
      "step": 410
    },
    {
      "epoch": 3.32,
      "grad_norm": 0.006866455078125,
      "learning_rate": 0.0253745868231696,
      "loss": 0.3624,
      "num_input_tokens_seen": 162048,
      "step": 415
    },
    {
      "epoch": 3.36,
      "grad_norm": 0.0263671875,
      "learning_rate": 0.025222314190522798,
      "loss": 0.3483,
      "num_input_tokens_seen": 164352,
      "step": 420
    },
    {
      "epoch": 3.4,
      "grad_norm": 0.049560546875,
      "learning_rate": 0.02506804869747014,
      "loss": 0.3387,
      "num_input_tokens_seen": 166144,
      "step": 425
    },
    {
      "epoch": 3.44,
      "grad_norm": 0.03759765625,
      "learning_rate": 0.024911820418375166,
      "loss": 0.3578,
      "num_input_tokens_seen": 168288,
      "step": 430
    },
    {
      "epoch": 3.48,
      "grad_norm": 0.040283203125,
      "learning_rate": 0.02475365981025043,
      "loss": 0.331,
      "num_input_tokens_seen": 170080,
      "step": 435
    },
    {
      "epoch": 3.52,
      "grad_norm": 0.012939453125,
      "learning_rate": 0.02459359770681987,
      "loss": 0.3506,
      "num_input_tokens_seen": 172000,
      "step": 440
    },
    {
      "epoch": 3.528,
      "eval_loss": 0.32748550176620483,
      "eval_runtime": 0.9494,
      "eval_samples_per_second": 58.984,
      "eval_steps_per_second": 14.746,
      "num_input_tokens_seen": 172480,
      "step": 441
    },
    {
      "epoch": 3.56,
      "grad_norm": 0.0101318359375,
      "learning_rate": 0.02443166531250769,
      "loss": 0.3917,
      "num_input_tokens_seen": 174336,
      "step": 445
    },
    {
      "epoch": 3.6,
      "grad_norm": 0.022216796875,
      "learning_rate": 0.024267894196355017,
      "loss": 0.3457,
      "num_input_tokens_seen": 176512,
      "step": 450
    },
    {
      "epoch": 3.64,
      "grad_norm": 0.0052490234375,
      "learning_rate": 0.024102316285865434,
      "loss": 0.345,
      "num_input_tokens_seen": 178368,
      "step": 455
    },
    {
      "epoch": 3.68,
      "grad_norm": 0.0262451171875,
      "learning_rate": 0.02393496386078067,
      "loss": 0.3422,
      "num_input_tokens_seen": 180224,
      "step": 460
    },
    {
      "epoch": 3.7199999999999998,
      "grad_norm": 0.0224609375,
      "learning_rate": 0.02376586954678758,
      "loss": 0.3466,
      "num_input_tokens_seen": 181984,
      "step": 465
    },
    {
      "epoch": 3.76,
      "grad_norm": 0.005645751953125,
      "learning_rate": 0.02359506630915773,
      "loss": 0.3405,
      "num_input_tokens_seen": 184064,
      "step": 470
    },
    {
      "epoch": 3.8,
      "grad_norm": 0.0296630859375,
      "learning_rate": 0.023422587446320715,
      "loss": 0.3697,
      "num_input_tokens_seen": 185856,
      "step": 475
    },
    {
      "epoch": 3.84,
      "grad_norm": 0.0439453125,
      "learning_rate": 0.0232484665833726,
      "loss": 0.3426,
      "num_input_tokens_seen": 187840,
      "step": 480
    },
    {
      "epoch": 3.88,
      "grad_norm": 0.025390625,
      "learning_rate": 0.023072737665520607,
      "loss": 0.3741,
      "num_input_tokens_seen": 189536,
      "step": 485
    },
    {
      "epoch": 3.92,
      "grad_norm": 0.00628662109375,
      "learning_rate": 0.022895434951465468,
      "loss": 0.3444,
      "num_input_tokens_seen": 191328,
      "step": 490
    },
    {
      "epoch": 3.96,
      "grad_norm": 0.017333984375,
      "learning_rate": 0.022716593006722595,
      "loss": 0.3556,
      "num_input_tokens_seen": 192960,
      "step": 495
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.056640625,
      "learning_rate": 0.02253624669688347,
      "loss": 0.377,
      "num_input_tokens_seen": 194384,
      "step": 500
    },
    {
      "epoch": 4.032,
      "eval_loss": 0.3651432991027832,
      "eval_runtime": 0.9723,
      "eval_samples_per_second": 57.597,
      "eval_steps_per_second": 14.399,
      "num_input_tokens_seen": 196240,
      "step": 504
    },
    {
      "epoch": 4.04,
      "grad_norm": 0.050537109375,
      "learning_rate": 0.022354431180818528,
      "loss": 0.365,
      "num_input_tokens_seen": 196528,
      "step": 505
    },
    {
      "epoch": 4.08,
      "grad_norm": 0.0250244140625,
      "learning_rate": 0.022171181903822883,
      "loss": 0.3468,
      "num_input_tokens_seen": 198512,
      "step": 510
    },
    {
      "epoch": 4.12,
      "grad_norm": 0.020263671875,
      "learning_rate": 0.021986534590706163,
      "loss": 0.3571,
      "num_input_tokens_seen": 200208,
      "step": 515
    },
    {
      "epoch": 4.16,
      "grad_norm": 0.02001953125,
      "learning_rate": 0.021800525238827927,
      "loss": 0.3332,
      "num_input_tokens_seen": 202480,
      "step": 520
    },
    {
      "epoch": 4.2,
      "grad_norm": 0.0106201171875,
      "learning_rate": 0.02161319011107988,
      "loss": 0.3391,
      "num_input_tokens_seen": 204336,
      "step": 525
    },
    {
      "epoch": 4.24,
      "grad_norm": 0.0294189453125,
      "learning_rate": 0.021424565728816354,
      "loss": 0.3634,
      "num_input_tokens_seen": 206448,
      "step": 530
    },
    {
      "epoch": 4.28,
      "grad_norm": 0.0203857421875,
      "learning_rate": 0.021234688864734418,
      "loss": 0.3445,
      "num_input_tokens_seen": 208144,
      "step": 535
    },
    {
      "epoch": 4.32,
      "grad_norm": 0.021240234375,
      "learning_rate": 0.02104359653570494,
      "loss": 0.3365,
      "num_input_tokens_seen": 210288,
      "step": 540
    },
    {
      "epoch": 4.36,
      "grad_norm": 0.0286865234375,
      "learning_rate": 0.020851325995556093,
      "loss": 0.3553,
      "num_input_tokens_seen": 212016,
      "step": 545
    },
    {
      "epoch": 4.4,
      "grad_norm": 0.026611328125,
      "learning_rate": 0.020657914727810648,
      "loss": 0.3492,
      "num_input_tokens_seen": 214128,
      "step": 550
    },
    {
      "epoch": 4.44,
      "grad_norm": 0.020751953125,
      "learning_rate": 0.020463400438378472,
      "loss": 0.343,
      "num_input_tokens_seen": 216240,
      "step": 555
    },
    {
      "epoch": 4.48,
      "grad_norm": 0.004425048828125,
      "learning_rate": 0.020267821048205698,
      "loss": 0.3577,
      "num_input_tokens_seen": 218288,
      "step": 560
    },
    {
      "epoch": 4.52,
      "grad_norm": 0.007171630859375,
      "learning_rate": 0.02007121468588196,
      "loss": 0.3479,
      "num_input_tokens_seen": 220240,
      "step": 565
    },
    {
      "epoch": 4.536,
      "eval_loss": 0.356781005859375,
      "eval_runtime": 0.97,
      "eval_samples_per_second": 57.734,
      "eval_steps_per_second": 14.433,
      "num_input_tokens_seen": 221136,
      "step": 567
    },
    {
      "epoch": 4.5600000000000005,
      "grad_norm": 0.049560546875,
      "learning_rate": 0.019873619680207146,
      "loss": 0.3374,
      "num_input_tokens_seen": 222256,
      "step": 570
    },
    {
      "epoch": 4.6,
      "grad_norm": 0.0098876953125,
      "learning_rate": 0.019675074552719125,
      "loss": 0.3454,
      "num_input_tokens_seen": 224272,
      "step": 575
    },
    {
      "epoch": 4.64,
      "grad_norm": 0.037353515625,
      "learning_rate": 0.019475618010183906,
      "loss": 0.3695,
      "num_input_tokens_seen": 226320,
      "step": 580
    },
    {
      "epoch": 4.68,
      "grad_norm": 0.0269775390625,
      "learning_rate": 0.01927528893704964,
      "loss": 0.3548,
      "num_input_tokens_seen": 228528,
      "step": 585
    },
    {
      "epoch": 4.72,
      "grad_norm": 0.00457763671875,
      "learning_rate": 0.01907412638786608,
      "loss": 0.3525,
      "num_input_tokens_seen": 230224,
      "step": 590
    },
    {
      "epoch": 4.76,
      "grad_norm": 0.025634765625,
      "learning_rate": 0.018872169579670764,
      "loss": 0.3496,
      "num_input_tokens_seen": 232048,
      "step": 595
    },
    {
      "epoch": 4.8,
      "grad_norm": 0.00634765625,
      "learning_rate": 0.01866945788434361,
      "loss": 0.3528,
      "num_input_tokens_seen": 234096,
      "step": 600
    },
    {
      "epoch": 4.84,
      "grad_norm": 0.0206298828125,
      "learning_rate": 0.018466030820931272,
      "loss": 0.3389,
      "num_input_tokens_seen": 235888,
      "step": 605
    },
    {
      "epoch": 4.88,
      "grad_norm": 0.00335693359375,
      "learning_rate": 0.01826192804794282,
      "loss": 0.3418,
      "num_input_tokens_seen": 237648,
      "step": 610
    },
    {
      "epoch": 4.92,
      "grad_norm": 0.0037384033203125,
      "learning_rate": 0.018057189355618276,
      "loss": 0.3529,
      "num_input_tokens_seen": 239408,
      "step": 615
    },
    {
      "epoch": 4.96,
      "grad_norm": 0.004486083984375,
      "learning_rate": 0.01785185465817135,
      "loss": 0.3451,
      "num_input_tokens_seen": 241296,
      "step": 620
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.0076904296875,
      "learning_rate": 0.017645963986008185,
      "loss": 0.3406,
      "num_input_tokens_seen": 242624,
      "step": 625
    },
    {
      "epoch": 5.04,
      "grad_norm": 0.043701171875,
      "learning_rate": 0.017439557477923254,
      "loss": 0.3271,
      "num_input_tokens_seen": 244736,
      "step": 630
    },
    {
      "epoch": 5.04,
      "eval_loss": 0.33552998304367065,
      "eval_runtime": 0.9773,
      "eval_samples_per_second": 57.301,
      "eval_steps_per_second": 14.325,
      "num_input_tokens_seen": 244736,
      "step": 630
    },
    {
      "epoch": 5.08,
      "grad_norm": 0.00775146484375,
      "learning_rate": 0.017232675373274282,
      "loss": 0.3665,
      "num_input_tokens_seen": 246624,
      "step": 635
    },
    {
      "epoch": 5.12,
      "grad_norm": 0.0279541015625,
      "learning_rate": 0.017025358004137486,
      "loss": 0.3566,
      "num_input_tokens_seen": 248256,
      "step": 640
    },
    {
      "epoch": 5.16,
      "grad_norm": 0.01953125,
      "learning_rate": 0.016817645787444758,
      "loss": 0.3418,
      "num_input_tokens_seen": 249888,
      "step": 645
    },
    {
      "epoch": 5.2,
      "grad_norm": 0.0233154296875,
      "learning_rate": 0.0166095792171043,
      "loss": 0.3624,
      "num_input_tokens_seen": 251808,
      "step": 650
    },
    {
      "epoch": 5.24,
      "grad_norm": 0.0198974609375,
      "learning_rate": 0.01640119885610626,
      "loss": 0.3462,
      "num_input_tokens_seen": 253504,
      "step": 655
    },
    {
      "epoch": 5.28,
      "grad_norm": 0.003448486328125,
      "learning_rate": 0.016192545328614895,
      "loss": 0.3466,
      "num_input_tokens_seen": 255552,
      "step": 660
    },
    {
      "epoch": 5.32,
      "grad_norm": 0.0478515625,
      "learning_rate": 0.015983659312048825,
      "loss": 0.364,
      "num_input_tokens_seen": 257760,
      "step": 665
    },
    {
      "epoch": 5.36,
      "grad_norm": 0.0030517578125,
      "learning_rate": 0.015774581529150847,
      "loss": 0.3449,
      "num_input_tokens_seen": 259488,
      "step": 670
    },
    {
      "epoch": 5.4,
      "grad_norm": 0.021484375,
      "learning_rate": 0.01556535274004902,
      "loss": 0.3508,
      "num_input_tokens_seen": 261344,
      "step": 675
    },
    {
      "epoch": 5.44,
      "grad_norm": 0.00537109375,
      "learning_rate": 0.01535601373431033,
      "loss": 0.3418,
      "num_input_tokens_seen": 263488,
      "step": 680
    },
    {
      "epoch": 5.48,
      "grad_norm": 0.0194091796875,
      "learning_rate": 0.015146605322988737,
      "loss": 0.3408,
      "num_input_tokens_seen": 265600,
      "step": 685
    },
    {
      "epoch": 5.52,
      "grad_norm": 0.04052734375,
      "learning_rate": 0.014937168330668944,
      "loss": 0.3529,
      "num_input_tokens_seen": 267360,
      "step": 690
    },
    {
      "epoch": 5.5440000000000005,
      "eval_loss": 0.33484506607055664,
      "eval_runtime": 0.9589,
      "eval_samples_per_second": 58.4,
      "eval_steps_per_second": 14.6,
      "num_input_tokens_seen": 268480,
      "step": 693
    },
    {
      "epoch": 5.5600000000000005,
      "grad_norm": 0.005340576171875,
      "learning_rate": 0.014727743587507579,
      "loss": 0.3438,
      "num_input_tokens_seen": 269120,
      "step": 695
    },
    {
      "epoch": 5.6,
      "grad_norm": 0.006561279296875,
      "learning_rate": 0.014518371921273277,
      "loss": 0.3506,
      "num_input_tokens_seen": 271104,
      "step": 700
    },
    {
      "epoch": 5.64,
      "grad_norm": 0.005767822265625,
      "learning_rate": 0.014309094149387214,
      "loss": 0.3413,
      "num_input_tokens_seen": 272832,
      "step": 705
    },
    {
      "epoch": 5.68,
      "grad_norm": 0.0263671875,
      "learning_rate": 0.014099951070965693,
      "loss": 0.3523,
      "num_input_tokens_seen": 274784,
      "step": 710
    },
    {
      "epoch": 5.72,
      "grad_norm": 0.02685546875,
      "learning_rate": 0.013890983458866225,
      "loss": 0.3412,
      "num_input_tokens_seen": 277024,
      "step": 715
    },
    {
      "epoch": 5.76,
      "grad_norm": 0.0189208984375,
      "learning_rate": 0.013682232051738852,
      "loss": 0.3568,
      "num_input_tokens_seen": 279008,
      "step": 720
    },
    {
      "epoch": 5.8,
      "grad_norm": 0.049560546875,
      "learning_rate": 0.013473737546084006,
      "loss": 0.3503,
      "num_input_tokens_seen": 281280,
      "step": 725
    },
    {
      "epoch": 5.84,
      "grad_norm": 0.0206298828125,
      "learning_rate": 0.013265540588318678,
      "loss": 0.3467,
      "num_input_tokens_seen": 283392,
      "step": 730
    },
    {
      "epoch": 5.88,
      "grad_norm": 0.0230712890625,
      "learning_rate": 0.013057681766852297,
      "loss": 0.3497,
      "num_input_tokens_seen": 285184,
      "step": 735
    },
    {
      "epoch": 5.92,
      "grad_norm": 0.00689697265625,
      "learning_rate": 0.012850201604173958,
      "loss": 0.3403,
      "num_input_tokens_seen": 287424,
      "step": 740
    },
    {
      "epoch": 5.96,
      "grad_norm": 0.0234375,
      "learning_rate": 0.012643140548952488,
      "loss": 0.3495,
      "num_input_tokens_seen": 289600,
      "step": 745
    },
    {
      "epoch": 6.0,
      "grad_norm": 0.008544921875,
      "learning_rate": 0.012436538968150852,
      "loss": 0.3465,
      "num_input_tokens_seen": 291216,
      "step": 750
    },
    {
      "epoch": 6.04,
      "grad_norm": 0.02099609375,
      "learning_rate": 0.012230437139156598,
      "loss": 0.3433,
      "num_input_tokens_seen": 292944,
      "step": 755
    },
    {
      "epoch": 6.048,
      "eval_loss": 0.3435487449169159,
      "eval_runtime": 1.0271,
      "eval_samples_per_second": 54.523,
      "eval_steps_per_second": 13.631,
      "num_input_tokens_seen": 293424,
      "step": 756
    },
    {
      "epoch": 6.08,
      "grad_norm": 0.0242919921875,
      "learning_rate": 0.012024875241929653,
      "loss": 0.3495,
      "num_input_tokens_seen": 294768,
      "step": 760
    },
    {
      "epoch": 6.12,
      "grad_norm": 0.0235595703125,
      "learning_rate": 0.011819893351169184,
      "loss": 0.3465,
      "num_input_tokens_seen": 296816,
      "step": 765
    },
    {
      "epoch": 6.16,
      "grad_norm": 0.0027008056640625,
      "learning_rate": 0.011615531428500938,
      "loss": 0.3479,
      "num_input_tokens_seen": 298480,
      "step": 770
    },
    {
      "epoch": 6.2,
      "grad_norm": 0.005035400390625,
      "learning_rate": 0.01141182931468666,
      "loss": 0.3524,
      "num_input_tokens_seen": 300528,
      "step": 775
    },
    {
      "epoch": 6.24,
      "grad_norm": 0.021728515625,
      "learning_rate": 0.01120882672185706,
      "loss": 0.3477,
      "num_input_tokens_seen": 302448,
      "step": 780
    },
    {
      "epoch": 6.28,
      "grad_norm": 0.0224609375,
      "learning_rate": 0.011006563225769832,
      "loss": 0.3492,
      "num_input_tokens_seen": 304496,
      "step": 785
    },
    {
      "epoch": 6.32,
      "grad_norm": 0.0225830078125,
      "learning_rate": 0.010805078258094304,
      "loss": 0.3524,
      "num_input_tokens_seen": 306256,
      "step": 790
    },
    {
      "epoch": 6.36,
      "grad_norm": 0.045654296875,
      "learning_rate": 0.01060441109872414,
      "loss": 0.3492,
      "num_input_tokens_seen": 308592,
      "step": 795
    },
    {
      "epoch": 6.4,
      "grad_norm": 0.005645751953125,
      "learning_rate": 0.01040460086811966,
      "loss": 0.3525,
      "num_input_tokens_seen": 310768,
      "step": 800
    },
    {
      "epoch": 6.44,
      "grad_norm": 0.022216796875,
      "learning_rate": 0.010205686519681232,
      "loss": 0.3416,
      "num_input_tokens_seen": 312656,
      "step": 805
    },
    {
      "epoch": 6.48,
      "grad_norm": 0.04248046875,
      "learning_rate": 0.0100077068321552,
      "loss": 0.3512,
      "num_input_tokens_seen": 314320,
      "step": 810
    },
    {
      "epoch": 6.52,
      "grad_norm": 0.004180908203125,
      "learning_rate": 0.009810700402073928,
      "loss": 0.354,
      "num_input_tokens_seen": 316176,
      "step": 815
    },
    {
      "epoch": 6.552,
      "eval_loss": 0.34503957629203796,
      "eval_runtime": 1.0109,
      "eval_samples_per_second": 55.397,
      "eval_steps_per_second": 13.849,
      "num_input_tokens_seen": 317840,
      "step": 819
    },
    {
      "epoch": 6.5600000000000005,
      "grad_norm": 0.022216796875,
      "learning_rate": 0.009614705636231307,
      "loss": 0.3446,
      "num_input_tokens_seen": 318128,
      "step": 820
    },
    {
      "epoch": 6.6,
      "grad_norm": 0.006072998046875,
      "learning_rate": 0.009419760744195283,
      "loss": 0.3436,
      "num_input_tokens_seen": 319984,
      "step": 825
    },
    {
      "epoch": 6.64,
      "grad_norm": 0.0269775390625,
      "learning_rate": 0.00922590373085881,
      "loss": 0.341,
      "num_input_tokens_seen": 321680,
      "step": 830
    },
    {
      "epoch": 6.68,
      "grad_norm": 0.02978515625,
      "learning_rate": 0.009033172389030755,
      "loss": 0.3496,
      "num_input_tokens_seen": 323440,
      "step": 835
    },
    {
      "epoch": 6.72,
      "grad_norm": 0.0167236328125,
      "learning_rate": 0.00884160429206808,
      "loss": 0.3507,
      "num_input_tokens_seen": 325232,
      "step": 840
    },
    {
      "epoch": 6.76,
      "grad_norm": 0.029052734375,
      "learning_rate": 0.008651236786550862,
      "loss": 0.3628,
      "num_input_tokens_seen": 327024,
      "step": 845
    },
    {
      "epoch": 6.8,
      "grad_norm": 0.0286865234375,
      "learning_rate": 0.00846210698500149,
      "loss": 0.3682,
      "num_input_tokens_seen": 329488,
      "step": 850
    },
    {
      "epoch": 6.84,
      "grad_norm": 0.0052490234375,
      "learning_rate": 0.008274251758649518,
      "loss": 0.3491,
      "num_input_tokens_seen": 331568,
      "step": 855
    },
    {
      "epoch": 6.88,
      "grad_norm": 0.02587890625,
      "learning_rate": 0.008087707730243539,
      "loss": 0.3498,
      "num_input_tokens_seen": 333904,
      "step": 860
    },
    {
      "epoch": 6.92,
      "grad_norm": 0.004364013671875,
      "learning_rate": 0.007902511266911504,
      "loss": 0.3495,
      "num_input_tokens_seen": 336048,
      "step": 865
    },
    {
      "epoch": 6.96,
      "grad_norm": 0.003936767578125,
      "learning_rate": 0.00771869847307089,
      "loss": 0.3461,
      "num_input_tokens_seen": 338064,
      "step": 870
    },
    {
      "epoch": 7.0,
      "grad_norm": 0.00653076171875,
      "learning_rate": 0.007536305183390062,
      "loss": 0.3461,
      "num_input_tokens_seen": 339568,
      "step": 875
    },
    {
      "epoch": 7.04,
      "grad_norm": 0.0228271484375,
      "learning_rate": 0.007355366955802234,
      "loss": 0.348,
      "num_input_tokens_seen": 341584,
      "step": 880
    },
    {
      "epoch": 7.056,
      "eval_loss": 0.3410987854003906,
      "eval_runtime": 1.0012,
      "eval_samples_per_second": 55.934,
      "eval_steps_per_second": 13.984,
      "num_input_tokens_seen": 342384,
      "step": 882
    },
    {
      "epoch": 7.08,
      "grad_norm": 0.02490234375,
      "learning_rate": 0.007175919064573383,
      "loss": 0.3578,
      "num_input_tokens_seen": 343440,
      "step": 885
    },
    {
      "epoch": 7.12,
      "grad_norm": 0.0419921875,
      "learning_rate": 0.006997996493425461,
      "loss": 0.345,
      "num_input_tokens_seen": 345232,
      "step": 890
    },
    {
      "epoch": 7.16,
      "grad_norm": 0.02001953125,
      "learning_rate": 0.0068216339287162486,
      "loss": 0.3435,
      "num_input_tokens_seen": 347056,
      "step": 895
    },
    {
      "epoch": 7.2,
      "grad_norm": 0.004669189453125,
      "learning_rate": 0.006646865752677185,
      "loss": 0.3421,
      "num_input_tokens_seen": 348816,
      "step": 900
    },
    {
      "epoch": 7.24,
      "grad_norm": 0.01953125,
      "learning_rate": 0.00647372603671046,
      "loss": 0.3405,
      "num_input_tokens_seen": 351120,
      "step": 905
    },
    {
      "epoch": 7.28,
      "grad_norm": 0.020751953125,
      "learning_rate": 0.0063022485347467615,
      "loss": 0.3468,
      "num_input_tokens_seen": 352912,
      "step": 910
    },
    {
      "epoch": 7.32,
      "grad_norm": 0.00592041015625,
      "learning_rate": 0.00613246667666487,
      "loss": 0.344,
      "num_input_tokens_seen": 354768,
      "step": 915
    },
    {
      "epoch": 7.36,
      "grad_norm": 0.04150390625,
      "learning_rate": 0.005964413561774424,
      "loss": 0.3517,
      "num_input_tokens_seen": 356944,
      "step": 920
    },
    {
      "epoch": 7.4,
      "grad_norm": 0.041748046875,
      "learning_rate": 0.0057981219523631404,
      "loss": 0.3457,
      "num_input_tokens_seen": 358896,
      "step": 925
    },
    {
      "epoch": 7.44,
      "grad_norm": 0.004241943359375,
      "learning_rate": 0.005633624267309767,
      "loss": 0.3486,
      "num_input_tokens_seen": 360784,
      "step": 930
    },
    {
      "epoch": 7.48,
      "grad_norm": 0.0262451171875,
      "learning_rate": 0.005470952575763933,
      "loss": 0.3551,
      "num_input_tokens_seen": 362512,
      "step": 935
    },
    {
      "epoch": 7.52,
      "grad_norm": 0.048095703125,
      "learning_rate": 0.0053101385908942405,
      "loss": 0.3495,
      "num_input_tokens_seen": 364400,
      "step": 940
    },
    {
      "epoch": 7.5600000000000005,
      "grad_norm": 0.02490234375,
      "learning_rate": 0.0051512136637056555,
      "loss": 0.3469,
      "num_input_tokens_seen": 366288,
      "step": 945
    },
    {
      "epoch": 7.5600000000000005,
      "eval_loss": 0.34017425775527954,
      "eval_runtime": 0.9703,
      "eval_samples_per_second": 57.716,
      "eval_steps_per_second": 14.429,
      "num_input_tokens_seen": 366288,
      "step": 945
    },
    {
      "epoch": 7.6,
      "grad_norm": 0.0198974609375,
      "learning_rate": 0.004994208776927635,
      "loss": 0.3549,
      "num_input_tokens_seen": 368656,
      "step": 950
    },
    {
      "epoch": 7.64,
      "grad_norm": 0.0234375,
      "learning_rate": 0.004839154538973943,
      "loss": 0.35,
      "num_input_tokens_seen": 370608,
      "step": 955
    },
    {
      "epoch": 7.68,
      "grad_norm": 0.0419921875,
      "learning_rate": 0.00468608117797549,
      "loss": 0.3435,
      "num_input_tokens_seen": 372336,
      "step": 960
    },
    {
      "epoch": 7.72,
      "grad_norm": 0.04296875,
      "learning_rate": 0.004535018535887305,
      "loss": 0.3387,
      "num_input_tokens_seen": 374288,
      "step": 965
    },
    {
      "epoch": 7.76,
      "grad_norm": 0.004119873046875,
      "learning_rate": 0.004385996062670774,
      "loss": 0.3529,
      "num_input_tokens_seen": 376144,
      "step": 970
    },
    {
      "epoch": 7.8,
      "grad_norm": 0.021240234375,
      "learning_rate": 0.0042390428105523225,
      "loss": 0.3513,
      "num_input_tokens_seen": 378160,
      "step": 975
    },
    {
      "epoch": 7.84,
      "grad_norm": 0.0208740234375,
      "learning_rate": 0.004094187428359625,
      "loss": 0.3481,
      "num_input_tokens_seen": 380208,
      "step": 980
    },
    {
      "epoch": 7.88,
      "grad_norm": 0.0047607421875,
      "learning_rate": 0.003951458155936452,
      "loss": 0.3401,
      "num_input_tokens_seen": 382384,
      "step": 985
    },
    {
      "epoch": 7.92,
      "grad_norm": 0.005218505859375,
      "learning_rate": 0.0038108828186372685,
      "loss": 0.3496,
      "num_input_tokens_seen": 384688,
      "step": 990
    },
    {
      "epoch": 7.96,
      "grad_norm": 0.006439208984375,
      "learning_rate": 0.003672488821902614,
      "loss": 0.3465,
      "num_input_tokens_seen": 386736,
      "step": 995
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.005889892578125,
      "learning_rate": 0.0035363031459163647,
      "loss": 0.3498,
      "num_input_tokens_seen": 388576,
      "step": 1000
    },
    {
      "epoch": 8.04,
      "grad_norm": 0.003936767578125,
      "learning_rate": 0.0034023523403458908,
      "loss": 0.342,
      "num_input_tokens_seen": 390848,
      "step": 1005
    },
    {
      "epoch": 8.064,
      "eval_loss": 0.3409908711910248,
      "eval_runtime": 0.9724,
      "eval_samples_per_second": 57.587,
      "eval_steps_per_second": 14.397,
      "num_input_tokens_seen": 391840,
      "step": 1008
    },
    {
      "epoch": 8.08,
      "grad_norm": 0.0211181640625,
      "learning_rate": 0.003270662519166149,
      "loss": 0.3528,
      "num_input_tokens_seen": 392480,
      "step": 1010
    },
    {
      "epoch": 8.12,
      "grad_norm": 0.043212890625,
      "learning_rate": 0.003141259355568705,
      "loss": 0.3435,
      "num_input_tokens_seen": 394016,
      "step": 1015
    },
    {
      "epoch": 8.16,
      "grad_norm": 0.00714111328125,
      "learning_rate": 0.003014168076956707,
      "loss": 0.3498,
      "num_input_tokens_seen": 395904,
      "step": 1020
    },
    {
      "epoch": 8.2,
      "grad_norm": 0.0203857421875,
      "learning_rate": 0.002889413460026724,
      "loss": 0.3326,
      "num_input_tokens_seen": 398272,
      "step": 1025
    },
    {
      "epoch": 8.24,
      "grad_norm": 0.004486083984375,
      "learning_rate": 0.0027670198259385275,
      "loss": 0.3563,
      "num_input_tokens_seen": 400384,
      "step": 1030
    },
    {
      "epoch": 8.28,
      "grad_norm": 0.0045166015625,
      "learning_rate": 0.0026470110355735882,
      "loss": 0.3468,
      "num_input_tokens_seen": 402432,
      "step": 1035
    },
    {
      "epoch": 8.32,
      "grad_norm": 0.0201416015625,
      "learning_rate": 0.0025294104848833754,
      "loss": 0.3502,
      "num_input_tokens_seen": 404448,
      "step": 1040
    },
    {
      "epoch": 8.36,
      "grad_norm": 0.00701904296875,
      "learning_rate": 0.002414241100328251,
      "loss": 0.3609,
      "num_input_tokens_seen": 406432,
      "step": 1045
    },
    {
      "epoch": 8.4,
      "grad_norm": 0.0439453125,
      "learning_rate": 0.002301525334407931,
      "loss": 0.3469,
      "num_input_tokens_seen": 408640,
      "step": 1050
    },
    {
      "epoch": 8.44,
      "grad_norm": 0.02099609375,
      "learning_rate": 0.0021912851612843243,
      "loss": 0.3373,
      "num_input_tokens_seen": 410304,
      "step": 1055
    },
    {
      "epoch": 8.48,
      "grad_norm": 0.00421142578125,
      "learning_rate": 0.002083542072497606,
      "loss": 0.3576,
      "num_input_tokens_seen": 412064,
      "step": 1060
    },
    {
      "epoch": 8.52,
      "grad_norm": 0.0240478515625,
      "learning_rate": 0.001978317072776413,
      "loss": 0.3405,
      "num_input_tokens_seen": 414144,
      "step": 1065
    },
    {
      "epoch": 8.56,
      "grad_norm": 0.003936767578125,
      "learning_rate": 0.0018756306759429363,
      "loss": 0.3469,
      "num_input_tokens_seen": 416032,
      "step": 1070
    },
    {
      "epoch": 8.568,
      "eval_loss": 0.3438374996185303,
      "eval_runtime": 0.9714,
      "eval_samples_per_second": 57.648,
      "eval_steps_per_second": 14.412,
      "num_input_tokens_seen": 416320,
      "step": 1071
    },
    {
      "epoch": 8.6,
      "grad_norm": 0.0250244140625,
      "learning_rate": 0.001775502900913697,
      "loss": 0.3547,
      "num_input_tokens_seen": 417824,
      "step": 1075
    },
    {
      "epoch": 8.64,
      "grad_norm": 0.0228271484375,
      "learning_rate": 0.0016779532677968327,
      "loss": 0.3391,
      "num_input_tokens_seen": 420096,
      "step": 1080
    },
    {
      "epoch": 8.68,
      "grad_norm": 0.00726318359375,
      "learning_rate": 0.0015830007940866035,
      "loss": 0.3454,
      "num_input_tokens_seen": 421824,
      "step": 1085
    },
    {
      "epoch": 8.72,
      "grad_norm": 0.004730224609375,
      "learning_rate": 0.0014906639909558954,
      "loss": 0.3451,
      "num_input_tokens_seen": 423552,
      "step": 1090
    },
    {
      "epoch": 8.76,
      "grad_norm": 0.02490234375,
      "learning_rate": 0.0014009608596474348,
      "loss": 0.3515,
      "num_input_tokens_seen": 425376,
      "step": 1095
    },
    {
      "epoch": 8.8,
      "grad_norm": 0.0238037109375,
      "learning_rate": 0.001313908887964409,
      "loss": 0.3561,
      "num_input_tokens_seen": 427008,
      "step": 1100
    },
    {
      "epoch": 8.84,
      "grad_norm": 0.005340576171875,
      "learning_rate": 0.0012295250468611779,
      "loss": 0.3437,
      "num_input_tokens_seen": 428960,
      "step": 1105
    },
    {
      "epoch": 8.88,
      "grad_norm": 0.0242919921875,
      "learning_rate": 0.0011478257871347663,
      "loss": 0.3498,
      "num_input_tokens_seen": 431072,
      "step": 1110
    },
    {
      "epoch": 8.92,
      "grad_norm": 0.00994873046875,
      "learning_rate": 0.0010688270362177355,
      "loss": 0.3498,
      "num_input_tokens_seen": 433280,
      "step": 1115
    },
    {
      "epoch": 8.96,
      "grad_norm": 0.004150390625,
      "learning_rate": 0.0009925441950730985,
      "loss": 0.3357,
      "num_input_tokens_seen": 434976,
      "step": 1120
    },
    {
      "epoch": 9.0,
      "grad_norm": 0.00457763671875,
      "learning_rate": 0.0009189921351918889,
      "loss": 0.3452,
      "num_input_tokens_seen": 436656,
      "step": 1125
    },
    {
      "epoch": 9.04,
      "grad_norm": 0.0419921875,
      "learning_rate": 0.0008481851956939134,
      "loss": 0.3467,
      "num_input_tokens_seen": 438544,
      "step": 1130
    },
    {
      "epoch": 9.072,
      "eval_loss": 0.34365683794021606,
      "eval_runtime": 0.9736,
      "eval_samples_per_second": 57.517,
      "eval_steps_per_second": 14.379,
      "num_input_tokens_seen": 440048,
      "step": 1134
    },
    {
      "epoch": 9.08,
      "grad_norm": 0.042724609375,
      "learning_rate": 0.0007801371805323276,
      "loss": 0.3404,
      "num_input_tokens_seen": 440464,
      "step": 1135
    },
    {
      "epoch": 9.12,
      "grad_norm": 0.00390625,
      "learning_rate": 0.0007148613558025102,
      "loss": 0.3482,
      "num_input_tokens_seen": 442064,
      "step": 1140
    },
    {
      "epoch": 9.16,
      "grad_norm": 0.007049560546875,
      "learning_rate": 0.0006523704471558306,
      "loss": 0.3482,
      "num_input_tokens_seen": 444016,
      "step": 1145
    },
    {
      "epoch": 9.2,
      "grad_norm": 0.021484375,
      "learning_rate": 0.0005926766373187531,
      "loss": 0.3421,
      "num_input_tokens_seen": 445904,
      "step": 1150
    },
    {
      "epoch": 9.24,
      "grad_norm": 0.0247802734375,
      "learning_rate": 0.0005357915637177817,
      "loss": 0.339,
      "num_input_tokens_seen": 448080,
      "step": 1155
    },
    {
      "epoch": 9.28,
      "grad_norm": 0.0201416015625,
      "learning_rate": 0.00048172631621072045,
      "loss": 0.3436,
      "num_input_tokens_seen": 450352,
      "step": 1160
    },
    {
      "epoch": 9.32,
      "grad_norm": 0.007720947265625,
      "learning_rate": 0.00043049143492470017,
      "loss": 0.3389,
      "num_input_tokens_seen": 452208,
      "step": 1165
    },
    {
      "epoch": 9.36,
      "grad_norm": 0.0216064453125,
      "learning_rate": 0.00038209690820134145,
      "loss": 0.3388,
      "num_input_tokens_seen": 454256,
      "step": 1170
    },
    {
      "epoch": 9.4,
      "grad_norm": 0.0047607421875,
      "learning_rate": 0.0003365521706495234,
      "loss": 0.3482,
      "num_input_tokens_seen": 456048,
      "step": 1175
    },
    {
      "epoch": 9.44,
      "grad_norm": 0.0205078125,
      "learning_rate": 0.00029386610130606504,
      "loss": 0.3466,
      "num_input_tokens_seen": 457840,
      "step": 1180
    },
    {
      "epoch": 9.48,
      "grad_norm": 0.005950927734375,
      "learning_rate": 0.00025404702190476856,
      "loss": 0.3498,
      "num_input_tokens_seen": 460080,
      "step": 1185
    },
    {
      "epoch": 9.52,
      "grad_norm": 0.0211181640625,
      "learning_rate": 0.00021710269525405834,
      "loss": 0.3497,
      "num_input_tokens_seen": 461840,
      "step": 1190
    },
    {
      "epoch": 9.56,
      "grad_norm": 0.004638671875,
      "learning_rate": 0.00018304032372361666,
      "loss": 0.3497,
      "num_input_tokens_seen": 463952,
      "step": 1195
    },
    {
      "epoch": 9.576,
      "eval_loss": 0.34312018752098083,
      "eval_runtime": 0.9538,
      "eval_samples_per_second": 58.714,
      "eval_steps_per_second": 14.679,
      "num_input_tokens_seen": 464688,
      "step": 1197
    },
    {
      "epoch": 9.6,
      "grad_norm": 0.0203857421875,
      "learning_rate": 0.00015186654784026365,
      "loss": 0.3451,
      "num_input_tokens_seen": 465904,
      "step": 1200
    },
    {
      "epoch": 9.64,
      "grad_norm": 0.004669189453125,
      "learning_rate": 0.00012358744499337603,
      "loss": 0.3531,
      "num_input_tokens_seen": 467728,
      "step": 1205
    },
    {
      "epoch": 9.68,
      "grad_norm": 0.0247802734375,
      "learning_rate": 9.820852825008664e-05,
      "loss": 0.3466,
      "num_input_tokens_seen": 469840,
      "step": 1210
    },
    {
      "epoch": 9.72,
      "grad_norm": 0.021240234375,
      "learning_rate": 7.57347452804974e-05,
      "loss": 0.3496,
      "num_input_tokens_seen": 472144,
      "step": 1215
    },
    {
      "epoch": 9.76,
      "grad_norm": 0.0235595703125,
      "learning_rate": 5.6170477393130966e-05,
      "loss": 0.3387,
      "num_input_tokens_seen": 473808,
      "step": 1220
    },
    {
      "epoch": 9.8,
      "grad_norm": 0.043212890625,
      "learning_rate": 3.951953868077229e-05,
      "loss": 0.3561,
      "num_input_tokens_seen": 475344,
      "step": 1225
    },
    {
      "epoch": 9.84,
      "grad_norm": 0.020263671875,
      "learning_rate": 2.5785175276920034e-05,
      "loss": 0.3405,
      "num_input_tokens_seen": 477488,
      "step": 1230
    },
    {
      "epoch": 9.88,
      "grad_norm": 0.0230712890625,
      "learning_rate": 1.4970064722929499e-05,
      "loss": 0.3498,
      "num_input_tokens_seen": 479280,
      "step": 1235
    },
    {
      "epoch": 9.92,
      "grad_norm": 0.00579833984375,
      "learning_rate": 7.076315446033487e-06,
      "loss": 0.3451,
      "num_input_tokens_seen": 480944,
      "step": 1240
    },
    {
      "epoch": 9.96,
      "grad_norm": 0.005950927734375,
      "learning_rate": 2.105466348294449e-06,
      "loss": 0.3468,
      "num_input_tokens_seen": 482992,
      "step": 1245
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.00872802734375,
      "learning_rate": 5.848650659112664e-08,
      "loss": 0.353,
      "num_input_tokens_seen": 485152,
      "step": 1250
    },
    {
      "epoch": 10.0,
      "num_input_tokens_seen": 485152,
      "step": 1250,
      "total_flos": 2.1846175286820864e+16,
      "train_loss": 0.39903659229278565,
      "train_runtime": 185.1613,
      "train_samples_per_second": 26.895,
      "train_steps_per_second": 6.751
    }
  ],
  "logging_steps": 5,
  "max_steps": 1250,
  "num_input_tokens_seen": 485152,
  "num_train_epochs": 10,
  "save_steps": 63,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.1846175286820864e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}