train_wsc_456_1760356427 / trainer_state.json
rbelanec's picture
End of training
aa21961 verified
{
"best_global_step": 441,
"best_metric": 0.32748550176620483,
"best_model_checkpoint": "saves/prompt-tuning/llama-3-8b-instruct/train_wsc_456_1760356427/checkpoint-441",
"epoch": 10.0,
"eval_steps": 63,
"global_step": 1250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04,
"grad_norm": 4.75,
"learning_rate": 0.00096,
"loss": 0.6432,
"num_input_tokens_seen": 2048,
"step": 5
},
{
"epoch": 0.08,
"grad_norm": 11.4375,
"learning_rate": 0.0021599999999999996,
"loss": 0.4748,
"num_input_tokens_seen": 4000,
"step": 10
},
{
"epoch": 0.12,
"grad_norm": 87.0,
"learning_rate": 0.00336,
"loss": 4.1821,
"num_input_tokens_seen": 5920,
"step": 15
},
{
"epoch": 0.16,
"grad_norm": 93.0,
"learning_rate": 0.00456,
"loss": 2.1496,
"num_input_tokens_seen": 8000,
"step": 20
},
{
"epoch": 0.2,
"grad_norm": 264.0,
"learning_rate": 0.0057599999999999995,
"loss": 1.1387,
"num_input_tokens_seen": 10176,
"step": 25
},
{
"epoch": 0.24,
"grad_norm": 16.125,
"learning_rate": 0.00696,
"loss": 1.4367,
"num_input_tokens_seen": 12256,
"step": 30
},
{
"epoch": 0.28,
"grad_norm": 24.375,
"learning_rate": 0.00816,
"loss": 1.488,
"num_input_tokens_seen": 14112,
"step": 35
},
{
"epoch": 0.32,
"grad_norm": 2.25,
"learning_rate": 0.00936,
"loss": 0.5887,
"num_input_tokens_seen": 15808,
"step": 40
},
{
"epoch": 0.36,
"grad_norm": 4.75,
"learning_rate": 0.010559999999999998,
"loss": 1.0068,
"num_input_tokens_seen": 17600,
"step": 45
},
{
"epoch": 0.4,
"grad_norm": 1.0625,
"learning_rate": 0.01176,
"loss": 0.4659,
"num_input_tokens_seen": 19296,
"step": 50
},
{
"epoch": 0.44,
"grad_norm": 2.65625,
"learning_rate": 0.01296,
"loss": 0.4367,
"num_input_tokens_seen": 21376,
"step": 55
},
{
"epoch": 0.48,
"grad_norm": 2.25,
"learning_rate": 0.014159999999999999,
"loss": 0.9258,
"num_input_tokens_seen": 23680,
"step": 60
},
{
"epoch": 0.504,
"eval_loss": 0.3463467061519623,
"eval_runtime": 0.9482,
"eval_samples_per_second": 59.062,
"eval_steps_per_second": 14.766,
"num_input_tokens_seen": 24704,
"step": 63
},
{
"epoch": 0.52,
"grad_norm": 0.9296875,
"learning_rate": 0.01536,
"loss": 0.5618,
"num_input_tokens_seen": 25440,
"step": 65
},
{
"epoch": 0.56,
"grad_norm": 0.70703125,
"learning_rate": 0.016560000000000002,
"loss": 0.4041,
"num_input_tokens_seen": 27232,
"step": 70
},
{
"epoch": 0.6,
"grad_norm": 0.1083984375,
"learning_rate": 0.017759999999999998,
"loss": 0.4381,
"num_input_tokens_seen": 29216,
"step": 75
},
{
"epoch": 0.64,
"grad_norm": 0.0927734375,
"learning_rate": 0.01896,
"loss": 0.393,
"num_input_tokens_seen": 30912,
"step": 80
},
{
"epoch": 0.68,
"grad_norm": 0.271484375,
"learning_rate": 0.02016,
"loss": 0.3895,
"num_input_tokens_seen": 32736,
"step": 85
},
{
"epoch": 0.72,
"grad_norm": 0.0791015625,
"learning_rate": 0.021359999999999997,
"loss": 0.3465,
"num_input_tokens_seen": 34592,
"step": 90
},
{
"epoch": 0.76,
"grad_norm": 0.0673828125,
"learning_rate": 0.02256,
"loss": 0.3712,
"num_input_tokens_seen": 36480,
"step": 95
},
{
"epoch": 0.8,
"grad_norm": 0.05078125,
"learning_rate": 0.02376,
"loss": 0.3497,
"num_input_tokens_seen": 38624,
"step": 100
},
{
"epoch": 0.84,
"grad_norm": 0.072265625,
"learning_rate": 0.02496,
"loss": 0.3676,
"num_input_tokens_seen": 40672,
"step": 105
},
{
"epoch": 0.88,
"grad_norm": 0.038330078125,
"learning_rate": 0.02616,
"loss": 0.3566,
"num_input_tokens_seen": 42784,
"step": 110
},
{
"epoch": 0.92,
"grad_norm": 0.050537109375,
"learning_rate": 0.02736,
"loss": 0.3705,
"num_input_tokens_seen": 44320,
"step": 115
},
{
"epoch": 0.96,
"grad_norm": 0.02490234375,
"learning_rate": 0.02856,
"loss": 0.3427,
"num_input_tokens_seen": 46528,
"step": 120
},
{
"epoch": 1.0,
"grad_norm": 0.1494140625,
"learning_rate": 0.029759999999999998,
"loss": 0.4236,
"num_input_tokens_seen": 48240,
"step": 125
},
{
"epoch": 1.008,
"eval_loss": 0.3752361834049225,
"eval_runtime": 0.9851,
"eval_samples_per_second": 56.847,
"eval_steps_per_second": 14.212,
"num_input_tokens_seen": 48688,
"step": 126
},
{
"epoch": 1.04,
"grad_norm": 0.0255126953125,
"learning_rate": 0.029999064225016296,
"loss": 0.3789,
"num_input_tokens_seen": 50192,
"step": 130
},
{
"epoch": 1.08,
"grad_norm": 0.04931640625,
"learning_rate": 0.029995262839249498,
"loss": 0.3428,
"num_input_tokens_seen": 52080,
"step": 135
},
{
"epoch": 1.12,
"grad_norm": 0.021484375,
"learning_rate": 0.0299885380972807,
"loss": 0.3735,
"num_input_tokens_seen": 53936,
"step": 140
},
{
"epoch": 1.16,
"grad_norm": 0.034912109375,
"learning_rate": 0.02997889131011168,
"loss": 0.3521,
"num_input_tokens_seen": 56176,
"step": 145
},
{
"epoch": 1.2,
"grad_norm": 0.09130859375,
"learning_rate": 0.0299663243584027,
"loss": 0.3711,
"num_input_tokens_seen": 58096,
"step": 150
},
{
"epoch": 1.24,
"grad_norm": 10.9375,
"learning_rate": 0.029950839692105897,
"loss": 0.4052,
"num_input_tokens_seen": 60208,
"step": 155
},
{
"epoch": 1.28,
"grad_norm": 0.416015625,
"learning_rate": 0.029932440329987653,
"loss": 0.5242,
"num_input_tokens_seen": 62064,
"step": 160
},
{
"epoch": 1.32,
"grad_norm": 0.10791015625,
"learning_rate": 0.02991112985904007,
"loss": 1.0726,
"num_input_tokens_seen": 64304,
"step": 165
},
{
"epoch": 1.3599999999999999,
"grad_norm": 0.11181640625,
"learning_rate": 0.029886912433781675,
"loss": 0.3788,
"num_input_tokens_seen": 66480,
"step": 170
},
{
"epoch": 1.4,
"grad_norm": 0.09130859375,
"learning_rate": 0.02985979277544751,
"loss": 0.3532,
"num_input_tokens_seen": 68080,
"step": 175
},
{
"epoch": 1.44,
"grad_norm": 0.019287109375,
"learning_rate": 0.029829776171068707,
"loss": 0.3652,
"num_input_tokens_seen": 70288,
"step": 180
},
{
"epoch": 1.48,
"grad_norm": 0.052490234375,
"learning_rate": 0.029796868472441763,
"loss": 0.3432,
"num_input_tokens_seen": 72048,
"step": 185
},
{
"epoch": 1.512,
"eval_loss": 0.3334037959575653,
"eval_runtime": 0.9347,
"eval_samples_per_second": 59.91,
"eval_steps_per_second": 14.977,
"num_input_tokens_seen": 73456,
"step": 189
},
{
"epoch": 1.52,
"grad_norm": 0.023681640625,
"learning_rate": 0.029761076094987723,
"loss": 0.3447,
"num_input_tokens_seen": 74096,
"step": 190
},
{
"epoch": 1.56,
"grad_norm": 0.05712890625,
"learning_rate": 0.02972240601650149,
"loss": 0.3917,
"num_input_tokens_seen": 76080,
"step": 195
},
{
"epoch": 1.6,
"grad_norm": 0.00750732421875,
"learning_rate": 0.029680865775791494,
"loss": 0.3633,
"num_input_tokens_seen": 78192,
"step": 200
},
{
"epoch": 1.6400000000000001,
"grad_norm": 0.08447265625,
"learning_rate": 0.02963646347120996,
"loss": 0.3483,
"num_input_tokens_seen": 80048,
"step": 205
},
{
"epoch": 1.6800000000000002,
"grad_norm": 0.02099609375,
"learning_rate": 0.029589207759074154,
"loss": 0.3647,
"num_input_tokens_seen": 81872,
"step": 210
},
{
"epoch": 1.72,
"grad_norm": 0.099609375,
"learning_rate": 0.029539107851978778,
"loss": 0.3687,
"num_input_tokens_seen": 83568,
"step": 215
},
{
"epoch": 1.76,
"grad_norm": 0.044189453125,
"learning_rate": 0.02948617351699999,
"loss": 0.3515,
"num_input_tokens_seen": 85808,
"step": 220
},
{
"epoch": 1.8,
"grad_norm": 0.01220703125,
"learning_rate": 0.029430415073791287,
"loss": 0.3457,
"num_input_tokens_seen": 87568,
"step": 225
},
{
"epoch": 1.8399999999999999,
"grad_norm": 0.059814453125,
"learning_rate": 0.029371843392571644,
"loss": 0.3506,
"num_input_tokens_seen": 89424,
"step": 230
},
{
"epoch": 1.88,
"grad_norm": 0.006988525390625,
"learning_rate": 0.029310469892006367,
"loss": 0.3775,
"num_input_tokens_seen": 91120,
"step": 235
},
{
"epoch": 1.92,
"grad_norm": 0.01434326171875,
"learning_rate": 0.029246306536981,
"loss": 0.3575,
"num_input_tokens_seen": 93072,
"step": 240
},
{
"epoch": 1.96,
"grad_norm": 0.0267333984375,
"learning_rate": 0.02917936583626874,
"loss": 0.3234,
"num_input_tokens_seen": 95216,
"step": 245
},
{
"epoch": 2.0,
"grad_norm": 0.029052734375,
"learning_rate": 0.029109660840091818,
"loss": 0.3609,
"num_input_tokens_seen": 96896,
"step": 250
},
{
"epoch": 2.016,
"eval_loss": 0.32756733894348145,
"eval_runtime": 0.974,
"eval_samples_per_second": 57.497,
"eval_steps_per_second": 14.374,
"num_input_tokens_seen": 97568,
"step": 252
},
{
"epoch": 2.04,
"grad_norm": 0.059814453125,
"learning_rate": 0.029037205137577363,
"loss": 0.3726,
"num_input_tokens_seen": 98816,
"step": 255
},
{
"epoch": 2.08,
"grad_norm": 0.03271484375,
"learning_rate": 0.02896201285410813,
"loss": 0.3451,
"num_input_tokens_seen": 100736,
"step": 260
},
{
"epoch": 2.12,
"grad_norm": 0.00927734375,
"learning_rate": 0.028884098648568782,
"loss": 0.3571,
"num_input_tokens_seen": 102592,
"step": 265
},
{
"epoch": 2.16,
"grad_norm": 0.006927490234375,
"learning_rate": 0.028803477710488055,
"loss": 0.3437,
"num_input_tokens_seen": 104224,
"step": 270
},
{
"epoch": 2.2,
"grad_norm": 0.0625,
"learning_rate": 0.028720165757077573,
"loss": 0.3383,
"num_input_tokens_seen": 105984,
"step": 275
},
{
"epoch": 2.24,
"grad_norm": 0.0301513671875,
"learning_rate": 0.02863417903016773,
"loss": 0.3627,
"num_input_tokens_seen": 107840,
"step": 280
},
{
"epoch": 2.2800000000000002,
"grad_norm": 0.046630859375,
"learning_rate": 0.02854553429304131,
"loss": 0.3626,
"num_input_tokens_seen": 110048,
"step": 285
},
{
"epoch": 2.32,
"grad_norm": 0.006256103515625,
"learning_rate": 0.02845424882716545,
"loss": 0.3415,
"num_input_tokens_seen": 112320,
"step": 290
},
{
"epoch": 2.36,
"grad_norm": 0.051513671875,
"learning_rate": 0.028360340428822597,
"loss": 0.3509,
"num_input_tokens_seen": 114048,
"step": 295
},
{
"epoch": 2.4,
"grad_norm": 0.06591796875,
"learning_rate": 0.028263827405641085,
"loss": 0.3427,
"num_input_tokens_seen": 115936,
"step": 300
},
{
"epoch": 2.44,
"grad_norm": 0.0279541015625,
"learning_rate": 0.028164728573026005,
"loss": 0.3288,
"num_input_tokens_seen": 117792,
"step": 305
},
{
"epoch": 2.48,
"grad_norm": 0.022216796875,
"learning_rate": 0.02806306325049113,
"loss": 0.3418,
"num_input_tokens_seen": 119872,
"step": 310
},
{
"epoch": 2.52,
"grad_norm": 0.05126953125,
"learning_rate": 0.027958851257892527,
"loss": 0.3639,
"num_input_tokens_seen": 121888,
"step": 315
},
{
"epoch": 2.52,
"eval_loss": 0.37361711263656616,
"eval_runtime": 0.9479,
"eval_samples_per_second": 59.081,
"eval_steps_per_second": 14.77,
"num_input_tokens_seen": 121888,
"step": 315
},
{
"epoch": 2.56,
"grad_norm": 0.06494140625,
"learning_rate": 0.02785211291156464,
"loss": 0.3619,
"num_input_tokens_seen": 123936,
"step": 320
},
{
"epoch": 2.6,
"grad_norm": 0.053466796875,
"learning_rate": 0.027742869020359582,
"loss": 0.3424,
"num_input_tokens_seen": 125952,
"step": 325
},
{
"epoch": 2.64,
"grad_norm": 0.01361083984375,
"learning_rate": 0.027631140881590383,
"loss": 0.3878,
"num_input_tokens_seen": 128352,
"step": 330
},
{
"epoch": 2.68,
"grad_norm": 0.01129150390625,
"learning_rate": 0.027516950276879084,
"loss": 0.3405,
"num_input_tokens_seen": 130496,
"step": 335
},
{
"epoch": 2.7199999999999998,
"grad_norm": 0.01043701171875,
"learning_rate": 0.02740031946791033,
"loss": 0.3449,
"num_input_tokens_seen": 132864,
"step": 340
},
{
"epoch": 2.76,
"grad_norm": 0.026611328125,
"learning_rate": 0.027281271192091415,
"loss": 0.3469,
"num_input_tokens_seen": 134592,
"step": 345
},
{
"epoch": 2.8,
"grad_norm": 0.027099609375,
"learning_rate": 0.027159828658119597,
"loss": 0.3497,
"num_input_tokens_seen": 136224,
"step": 350
},
{
"epoch": 2.84,
"grad_norm": 0.031494140625,
"learning_rate": 0.0270360155414575,
"loss": 0.3476,
"num_input_tokens_seen": 138048,
"step": 355
},
{
"epoch": 2.88,
"grad_norm": 0.059814453125,
"learning_rate": 0.02690985597971753,
"loss": 0.351,
"num_input_tokens_seen": 139840,
"step": 360
},
{
"epoch": 2.92,
"grad_norm": 0.00653076171875,
"learning_rate": 0.026781374567956224,
"loss": 0.348,
"num_input_tokens_seen": 141728,
"step": 365
},
{
"epoch": 2.96,
"grad_norm": 0.03125,
"learning_rate": 0.026650596353879386,
"loss": 0.3574,
"num_input_tokens_seen": 143584,
"step": 370
},
{
"epoch": 3.0,
"grad_norm": 0.051513671875,
"learning_rate": 0.026517546832958965,
"loss": 0.3418,
"num_input_tokens_seen": 145184,
"step": 375
},
{
"epoch": 3.024,
"eval_loss": 0.32964640855789185,
"eval_runtime": 0.9858,
"eval_samples_per_second": 56.807,
"eval_steps_per_second": 14.202,
"num_input_tokens_seen": 146336,
"step": 378
},
{
"epoch": 3.04,
"grad_norm": 0.019775390625,
"learning_rate": 0.026382251943462682,
"loss": 0.3617,
"num_input_tokens_seen": 147328,
"step": 380
},
{
"epoch": 3.08,
"grad_norm": 0.02001953125,
"learning_rate": 0.026244738061397325,
"loss": 0.3622,
"num_input_tokens_seen": 149376,
"step": 385
},
{
"epoch": 3.12,
"grad_norm": 0.031494140625,
"learning_rate": 0.026105031995366672,
"loss": 0.3533,
"num_input_tokens_seen": 151744,
"step": 390
},
{
"epoch": 3.16,
"grad_norm": 0.05126953125,
"learning_rate": 0.025963160981345105,
"loss": 0.347,
"num_input_tokens_seen": 153920,
"step": 395
},
{
"epoch": 3.2,
"grad_norm": 0.0234375,
"learning_rate": 0.02581915267736791,
"loss": 0.3437,
"num_input_tokens_seen": 155776,
"step": 400
},
{
"epoch": 3.24,
"grad_norm": 0.00848388671875,
"learning_rate": 0.025673035158139283,
"loss": 0.3403,
"num_input_tokens_seen": 157952,
"step": 405
},
{
"epoch": 3.2800000000000002,
"grad_norm": 0.0079345703125,
"learning_rate": 0.02552483690955911,
"loss": 0.3583,
"num_input_tokens_seen": 160032,
"step": 410
},
{
"epoch": 3.32,
"grad_norm": 0.006866455078125,
"learning_rate": 0.0253745868231696,
"loss": 0.3624,
"num_input_tokens_seen": 162048,
"step": 415
},
{
"epoch": 3.36,
"grad_norm": 0.0263671875,
"learning_rate": 0.025222314190522798,
"loss": 0.3483,
"num_input_tokens_seen": 164352,
"step": 420
},
{
"epoch": 3.4,
"grad_norm": 0.049560546875,
"learning_rate": 0.02506804869747014,
"loss": 0.3387,
"num_input_tokens_seen": 166144,
"step": 425
},
{
"epoch": 3.44,
"grad_norm": 0.03759765625,
"learning_rate": 0.024911820418375166,
"loss": 0.3578,
"num_input_tokens_seen": 168288,
"step": 430
},
{
"epoch": 3.48,
"grad_norm": 0.040283203125,
"learning_rate": 0.02475365981025043,
"loss": 0.331,
"num_input_tokens_seen": 170080,
"step": 435
},
{
"epoch": 3.52,
"grad_norm": 0.012939453125,
"learning_rate": 0.02459359770681987,
"loss": 0.3506,
"num_input_tokens_seen": 172000,
"step": 440
},
{
"epoch": 3.528,
"eval_loss": 0.32748550176620483,
"eval_runtime": 0.9494,
"eval_samples_per_second": 58.984,
"eval_steps_per_second": 14.746,
"num_input_tokens_seen": 172480,
"step": 441
},
{
"epoch": 3.56,
"grad_norm": 0.0101318359375,
"learning_rate": 0.02443166531250769,
"loss": 0.3917,
"num_input_tokens_seen": 174336,
"step": 445
},
{
"epoch": 3.6,
"grad_norm": 0.022216796875,
"learning_rate": 0.024267894196355017,
"loss": 0.3457,
"num_input_tokens_seen": 176512,
"step": 450
},
{
"epoch": 3.64,
"grad_norm": 0.0052490234375,
"learning_rate": 0.024102316285865434,
"loss": 0.345,
"num_input_tokens_seen": 178368,
"step": 455
},
{
"epoch": 3.68,
"grad_norm": 0.0262451171875,
"learning_rate": 0.02393496386078067,
"loss": 0.3422,
"num_input_tokens_seen": 180224,
"step": 460
},
{
"epoch": 3.7199999999999998,
"grad_norm": 0.0224609375,
"learning_rate": 0.02376586954678758,
"loss": 0.3466,
"num_input_tokens_seen": 181984,
"step": 465
},
{
"epoch": 3.76,
"grad_norm": 0.005645751953125,
"learning_rate": 0.02359506630915773,
"loss": 0.3405,
"num_input_tokens_seen": 184064,
"step": 470
},
{
"epoch": 3.8,
"grad_norm": 0.0296630859375,
"learning_rate": 0.023422587446320715,
"loss": 0.3697,
"num_input_tokens_seen": 185856,
"step": 475
},
{
"epoch": 3.84,
"grad_norm": 0.0439453125,
"learning_rate": 0.0232484665833726,
"loss": 0.3426,
"num_input_tokens_seen": 187840,
"step": 480
},
{
"epoch": 3.88,
"grad_norm": 0.025390625,
"learning_rate": 0.023072737665520607,
"loss": 0.3741,
"num_input_tokens_seen": 189536,
"step": 485
},
{
"epoch": 3.92,
"grad_norm": 0.00628662109375,
"learning_rate": 0.022895434951465468,
"loss": 0.3444,
"num_input_tokens_seen": 191328,
"step": 490
},
{
"epoch": 3.96,
"grad_norm": 0.017333984375,
"learning_rate": 0.022716593006722595,
"loss": 0.3556,
"num_input_tokens_seen": 192960,
"step": 495
},
{
"epoch": 4.0,
"grad_norm": 0.056640625,
"learning_rate": 0.02253624669688347,
"loss": 0.377,
"num_input_tokens_seen": 194384,
"step": 500
},
{
"epoch": 4.032,
"eval_loss": 0.3651432991027832,
"eval_runtime": 0.9723,
"eval_samples_per_second": 57.597,
"eval_steps_per_second": 14.399,
"num_input_tokens_seen": 196240,
"step": 504
},
{
"epoch": 4.04,
"grad_norm": 0.050537109375,
"learning_rate": 0.022354431180818528,
"loss": 0.365,
"num_input_tokens_seen": 196528,
"step": 505
},
{
"epoch": 4.08,
"grad_norm": 0.0250244140625,
"learning_rate": 0.022171181903822883,
"loss": 0.3468,
"num_input_tokens_seen": 198512,
"step": 510
},
{
"epoch": 4.12,
"grad_norm": 0.020263671875,
"learning_rate": 0.021986534590706163,
"loss": 0.3571,
"num_input_tokens_seen": 200208,
"step": 515
},
{
"epoch": 4.16,
"grad_norm": 0.02001953125,
"learning_rate": 0.021800525238827927,
"loss": 0.3332,
"num_input_tokens_seen": 202480,
"step": 520
},
{
"epoch": 4.2,
"grad_norm": 0.0106201171875,
"learning_rate": 0.02161319011107988,
"loss": 0.3391,
"num_input_tokens_seen": 204336,
"step": 525
},
{
"epoch": 4.24,
"grad_norm": 0.0294189453125,
"learning_rate": 0.021424565728816354,
"loss": 0.3634,
"num_input_tokens_seen": 206448,
"step": 530
},
{
"epoch": 4.28,
"grad_norm": 0.0203857421875,
"learning_rate": 0.021234688864734418,
"loss": 0.3445,
"num_input_tokens_seen": 208144,
"step": 535
},
{
"epoch": 4.32,
"grad_norm": 0.021240234375,
"learning_rate": 0.02104359653570494,
"loss": 0.3365,
"num_input_tokens_seen": 210288,
"step": 540
},
{
"epoch": 4.36,
"grad_norm": 0.0286865234375,
"learning_rate": 0.020851325995556093,
"loss": 0.3553,
"num_input_tokens_seen": 212016,
"step": 545
},
{
"epoch": 4.4,
"grad_norm": 0.026611328125,
"learning_rate": 0.020657914727810648,
"loss": 0.3492,
"num_input_tokens_seen": 214128,
"step": 550
},
{
"epoch": 4.44,
"grad_norm": 0.020751953125,
"learning_rate": 0.020463400438378472,
"loss": 0.343,
"num_input_tokens_seen": 216240,
"step": 555
},
{
"epoch": 4.48,
"grad_norm": 0.004425048828125,
"learning_rate": 0.020267821048205698,
"loss": 0.3577,
"num_input_tokens_seen": 218288,
"step": 560
},
{
"epoch": 4.52,
"grad_norm": 0.007171630859375,
"learning_rate": 0.02007121468588196,
"loss": 0.3479,
"num_input_tokens_seen": 220240,
"step": 565
},
{
"epoch": 4.536,
"eval_loss": 0.356781005859375,
"eval_runtime": 0.97,
"eval_samples_per_second": 57.734,
"eval_steps_per_second": 14.433,
"num_input_tokens_seen": 221136,
"step": 567
},
{
"epoch": 4.5600000000000005,
"grad_norm": 0.049560546875,
"learning_rate": 0.019873619680207146,
"loss": 0.3374,
"num_input_tokens_seen": 222256,
"step": 570
},
{
"epoch": 4.6,
"grad_norm": 0.0098876953125,
"learning_rate": 0.019675074552719125,
"loss": 0.3454,
"num_input_tokens_seen": 224272,
"step": 575
},
{
"epoch": 4.64,
"grad_norm": 0.037353515625,
"learning_rate": 0.019475618010183906,
"loss": 0.3695,
"num_input_tokens_seen": 226320,
"step": 580
},
{
"epoch": 4.68,
"grad_norm": 0.0269775390625,
"learning_rate": 0.01927528893704964,
"loss": 0.3548,
"num_input_tokens_seen": 228528,
"step": 585
},
{
"epoch": 4.72,
"grad_norm": 0.00457763671875,
"learning_rate": 0.01907412638786608,
"loss": 0.3525,
"num_input_tokens_seen": 230224,
"step": 590
},
{
"epoch": 4.76,
"grad_norm": 0.025634765625,
"learning_rate": 0.018872169579670764,
"loss": 0.3496,
"num_input_tokens_seen": 232048,
"step": 595
},
{
"epoch": 4.8,
"grad_norm": 0.00634765625,
"learning_rate": 0.01866945788434361,
"loss": 0.3528,
"num_input_tokens_seen": 234096,
"step": 600
},
{
"epoch": 4.84,
"grad_norm": 0.0206298828125,
"learning_rate": 0.018466030820931272,
"loss": 0.3389,
"num_input_tokens_seen": 235888,
"step": 605
},
{
"epoch": 4.88,
"grad_norm": 0.00335693359375,
"learning_rate": 0.01826192804794282,
"loss": 0.3418,
"num_input_tokens_seen": 237648,
"step": 610
},
{
"epoch": 4.92,
"grad_norm": 0.0037384033203125,
"learning_rate": 0.018057189355618276,
"loss": 0.3529,
"num_input_tokens_seen": 239408,
"step": 615
},
{
"epoch": 4.96,
"grad_norm": 0.004486083984375,
"learning_rate": 0.01785185465817135,
"loss": 0.3451,
"num_input_tokens_seen": 241296,
"step": 620
},
{
"epoch": 5.0,
"grad_norm": 0.0076904296875,
"learning_rate": 0.017645963986008185,
"loss": 0.3406,
"num_input_tokens_seen": 242624,
"step": 625
},
{
"epoch": 5.04,
"grad_norm": 0.043701171875,
"learning_rate": 0.017439557477923254,
"loss": 0.3271,
"num_input_tokens_seen": 244736,
"step": 630
},
{
"epoch": 5.04,
"eval_loss": 0.33552998304367065,
"eval_runtime": 0.9773,
"eval_samples_per_second": 57.301,
"eval_steps_per_second": 14.325,
"num_input_tokens_seen": 244736,
"step": 630
},
{
"epoch": 5.08,
"grad_norm": 0.00775146484375,
"learning_rate": 0.017232675373274282,
"loss": 0.3665,
"num_input_tokens_seen": 246624,
"step": 635
},
{
"epoch": 5.12,
"grad_norm": 0.0279541015625,
"learning_rate": 0.017025358004137486,
"loss": 0.3566,
"num_input_tokens_seen": 248256,
"step": 640
},
{
"epoch": 5.16,
"grad_norm": 0.01953125,
"learning_rate": 0.016817645787444758,
"loss": 0.3418,
"num_input_tokens_seen": 249888,
"step": 645
},
{
"epoch": 5.2,
"grad_norm": 0.0233154296875,
"learning_rate": 0.0166095792171043,
"loss": 0.3624,
"num_input_tokens_seen": 251808,
"step": 650
},
{
"epoch": 5.24,
"grad_norm": 0.0198974609375,
"learning_rate": 0.01640119885610626,
"loss": 0.3462,
"num_input_tokens_seen": 253504,
"step": 655
},
{
"epoch": 5.28,
"grad_norm": 0.003448486328125,
"learning_rate": 0.016192545328614895,
"loss": 0.3466,
"num_input_tokens_seen": 255552,
"step": 660
},
{
"epoch": 5.32,
"grad_norm": 0.0478515625,
"learning_rate": 0.015983659312048825,
"loss": 0.364,
"num_input_tokens_seen": 257760,
"step": 665
},
{
"epoch": 5.36,
"grad_norm": 0.0030517578125,
"learning_rate": 0.015774581529150847,
"loss": 0.3449,
"num_input_tokens_seen": 259488,
"step": 670
},
{
"epoch": 5.4,
"grad_norm": 0.021484375,
"learning_rate": 0.01556535274004902,
"loss": 0.3508,
"num_input_tokens_seen": 261344,
"step": 675
},
{
"epoch": 5.44,
"grad_norm": 0.00537109375,
"learning_rate": 0.01535601373431033,
"loss": 0.3418,
"num_input_tokens_seen": 263488,
"step": 680
},
{
"epoch": 5.48,
"grad_norm": 0.0194091796875,
"learning_rate": 0.015146605322988737,
"loss": 0.3408,
"num_input_tokens_seen": 265600,
"step": 685
},
{
"epoch": 5.52,
"grad_norm": 0.04052734375,
"learning_rate": 0.014937168330668944,
"loss": 0.3529,
"num_input_tokens_seen": 267360,
"step": 690
},
{
"epoch": 5.5440000000000005,
"eval_loss": 0.33484506607055664,
"eval_runtime": 0.9589,
"eval_samples_per_second": 58.4,
"eval_steps_per_second": 14.6,
"num_input_tokens_seen": 268480,
"step": 693
},
{
"epoch": 5.5600000000000005,
"grad_norm": 0.005340576171875,
"learning_rate": 0.014727743587507579,
"loss": 0.3438,
"num_input_tokens_seen": 269120,
"step": 695
},
{
"epoch": 5.6,
"grad_norm": 0.006561279296875,
"learning_rate": 0.014518371921273277,
"loss": 0.3506,
"num_input_tokens_seen": 271104,
"step": 700
},
{
"epoch": 5.64,
"grad_norm": 0.005767822265625,
"learning_rate": 0.014309094149387214,
"loss": 0.3413,
"num_input_tokens_seen": 272832,
"step": 705
},
{
"epoch": 5.68,
"grad_norm": 0.0263671875,
"learning_rate": 0.014099951070965693,
"loss": 0.3523,
"num_input_tokens_seen": 274784,
"step": 710
},
{
"epoch": 5.72,
"grad_norm": 0.02685546875,
"learning_rate": 0.013890983458866225,
"loss": 0.3412,
"num_input_tokens_seen": 277024,
"step": 715
},
{
"epoch": 5.76,
"grad_norm": 0.0189208984375,
"learning_rate": 0.013682232051738852,
"loss": 0.3568,
"num_input_tokens_seen": 279008,
"step": 720
},
{
"epoch": 5.8,
"grad_norm": 0.049560546875,
"learning_rate": 0.013473737546084006,
"loss": 0.3503,
"num_input_tokens_seen": 281280,
"step": 725
},
{
"epoch": 5.84,
"grad_norm": 0.0206298828125,
"learning_rate": 0.013265540588318678,
"loss": 0.3467,
"num_input_tokens_seen": 283392,
"step": 730
},
{
"epoch": 5.88,
"grad_norm": 0.0230712890625,
"learning_rate": 0.013057681766852297,
"loss": 0.3497,
"num_input_tokens_seen": 285184,
"step": 735
},
{
"epoch": 5.92,
"grad_norm": 0.00689697265625,
"learning_rate": 0.012850201604173958,
"loss": 0.3403,
"num_input_tokens_seen": 287424,
"step": 740
},
{
"epoch": 5.96,
"grad_norm": 0.0234375,
"learning_rate": 0.012643140548952488,
"loss": 0.3495,
"num_input_tokens_seen": 289600,
"step": 745
},
{
"epoch": 6.0,
"grad_norm": 0.008544921875,
"learning_rate": 0.012436538968150852,
"loss": 0.3465,
"num_input_tokens_seen": 291216,
"step": 750
},
{
"epoch": 6.04,
"grad_norm": 0.02099609375,
"learning_rate": 0.012230437139156598,
"loss": 0.3433,
"num_input_tokens_seen": 292944,
"step": 755
},
{
"epoch": 6.048,
"eval_loss": 0.3435487449169159,
"eval_runtime": 1.0271,
"eval_samples_per_second": 54.523,
"eval_steps_per_second": 13.631,
"num_input_tokens_seen": 293424,
"step": 756
},
{
"epoch": 6.08,
"grad_norm": 0.0242919921875,
"learning_rate": 0.012024875241929653,
"loss": 0.3495,
"num_input_tokens_seen": 294768,
"step": 760
},
{
"epoch": 6.12,
"grad_norm": 0.0235595703125,
"learning_rate": 0.011819893351169184,
"loss": 0.3465,
"num_input_tokens_seen": 296816,
"step": 765
},
{
"epoch": 6.16,
"grad_norm": 0.0027008056640625,
"learning_rate": 0.011615531428500938,
"loss": 0.3479,
"num_input_tokens_seen": 298480,
"step": 770
},
{
"epoch": 6.2,
"grad_norm": 0.005035400390625,
"learning_rate": 0.01141182931468666,
"loss": 0.3524,
"num_input_tokens_seen": 300528,
"step": 775
},
{
"epoch": 6.24,
"grad_norm": 0.021728515625,
"learning_rate": 0.01120882672185706,
"loss": 0.3477,
"num_input_tokens_seen": 302448,
"step": 780
},
{
"epoch": 6.28,
"grad_norm": 0.0224609375,
"learning_rate": 0.011006563225769832,
"loss": 0.3492,
"num_input_tokens_seen": 304496,
"step": 785
},
{
"epoch": 6.32,
"grad_norm": 0.0225830078125,
"learning_rate": 0.010805078258094304,
"loss": 0.3524,
"num_input_tokens_seen": 306256,
"step": 790
},
{
"epoch": 6.36,
"grad_norm": 0.045654296875,
"learning_rate": 0.01060441109872414,
"loss": 0.3492,
"num_input_tokens_seen": 308592,
"step": 795
},
{
"epoch": 6.4,
"grad_norm": 0.005645751953125,
"learning_rate": 0.01040460086811966,
"loss": 0.3525,
"num_input_tokens_seen": 310768,
"step": 800
},
{
"epoch": 6.44,
"grad_norm": 0.022216796875,
"learning_rate": 0.010205686519681232,
"loss": 0.3416,
"num_input_tokens_seen": 312656,
"step": 805
},
{
"epoch": 6.48,
"grad_norm": 0.04248046875,
"learning_rate": 0.0100077068321552,
"loss": 0.3512,
"num_input_tokens_seen": 314320,
"step": 810
},
{
"epoch": 6.52,
"grad_norm": 0.004180908203125,
"learning_rate": 0.009810700402073928,
"loss": 0.354,
"num_input_tokens_seen": 316176,
"step": 815
},
{
"epoch": 6.552,
"eval_loss": 0.34503957629203796,
"eval_runtime": 1.0109,
"eval_samples_per_second": 55.397,
"eval_steps_per_second": 13.849,
"num_input_tokens_seen": 317840,
"step": 819
},
{
"epoch": 6.5600000000000005,
"grad_norm": 0.022216796875,
"learning_rate": 0.009614705636231307,
"loss": 0.3446,
"num_input_tokens_seen": 318128,
"step": 820
},
{
"epoch": 6.6,
"grad_norm": 0.006072998046875,
"learning_rate": 0.009419760744195283,
"loss": 0.3436,
"num_input_tokens_seen": 319984,
"step": 825
},
{
"epoch": 6.64,
"grad_norm": 0.0269775390625,
"learning_rate": 0.00922590373085881,
"loss": 0.341,
"num_input_tokens_seen": 321680,
"step": 830
},
{
"epoch": 6.68,
"grad_norm": 0.02978515625,
"learning_rate": 0.009033172389030755,
"loss": 0.3496,
"num_input_tokens_seen": 323440,
"step": 835
},
{
"epoch": 6.72,
"grad_norm": 0.0167236328125,
"learning_rate": 0.00884160429206808,
"loss": 0.3507,
"num_input_tokens_seen": 325232,
"step": 840
},
{
"epoch": 6.76,
"grad_norm": 0.029052734375,
"learning_rate": 0.008651236786550862,
"loss": 0.3628,
"num_input_tokens_seen": 327024,
"step": 845
},
{
"epoch": 6.8,
"grad_norm": 0.0286865234375,
"learning_rate": 0.00846210698500149,
"loss": 0.3682,
"num_input_tokens_seen": 329488,
"step": 850
},
{
"epoch": 6.84,
"grad_norm": 0.0052490234375,
"learning_rate": 0.008274251758649518,
"loss": 0.3491,
"num_input_tokens_seen": 331568,
"step": 855
},
{
"epoch": 6.88,
"grad_norm": 0.02587890625,
"learning_rate": 0.008087707730243539,
"loss": 0.3498,
"num_input_tokens_seen": 333904,
"step": 860
},
{
"epoch": 6.92,
"grad_norm": 0.004364013671875,
"learning_rate": 0.007902511266911504,
"loss": 0.3495,
"num_input_tokens_seen": 336048,
"step": 865
},
{
"epoch": 6.96,
"grad_norm": 0.003936767578125,
"learning_rate": 0.00771869847307089,
"loss": 0.3461,
"num_input_tokens_seen": 338064,
"step": 870
},
{
"epoch": 7.0,
"grad_norm": 0.00653076171875,
"learning_rate": 0.007536305183390062,
"loss": 0.3461,
"num_input_tokens_seen": 339568,
"step": 875
},
{
"epoch": 7.04,
"grad_norm": 0.0228271484375,
"learning_rate": 0.007355366955802234,
"loss": 0.348,
"num_input_tokens_seen": 341584,
"step": 880
},
{
"epoch": 7.056,
"eval_loss": 0.3410987854003906,
"eval_runtime": 1.0012,
"eval_samples_per_second": 55.934,
"eval_steps_per_second": 13.984,
"num_input_tokens_seen": 342384,
"step": 882
},
{
"epoch": 7.08,
"grad_norm": 0.02490234375,
"learning_rate": 0.007175919064573383,
"loss": 0.3578,
"num_input_tokens_seen": 343440,
"step": 885
},
{
"epoch": 7.12,
"grad_norm": 0.0419921875,
"learning_rate": 0.006997996493425461,
"loss": 0.345,
"num_input_tokens_seen": 345232,
"step": 890
},
{
"epoch": 7.16,
"grad_norm": 0.02001953125,
"learning_rate": 0.0068216339287162486,
"loss": 0.3435,
"num_input_tokens_seen": 347056,
"step": 895
},
{
"epoch": 7.2,
"grad_norm": 0.004669189453125,
"learning_rate": 0.006646865752677185,
"loss": 0.3421,
"num_input_tokens_seen": 348816,
"step": 900
},
{
"epoch": 7.24,
"grad_norm": 0.01953125,
"learning_rate": 0.00647372603671046,
"loss": 0.3405,
"num_input_tokens_seen": 351120,
"step": 905
},
{
"epoch": 7.28,
"grad_norm": 0.020751953125,
"learning_rate": 0.0063022485347467615,
"loss": 0.3468,
"num_input_tokens_seen": 352912,
"step": 910
},
{
"epoch": 7.32,
"grad_norm": 0.00592041015625,
"learning_rate": 0.00613246667666487,
"loss": 0.344,
"num_input_tokens_seen": 354768,
"step": 915
},
{
"epoch": 7.36,
"grad_norm": 0.04150390625,
"learning_rate": 0.005964413561774424,
"loss": 0.3517,
"num_input_tokens_seen": 356944,
"step": 920
},
{
"epoch": 7.4,
"grad_norm": 0.041748046875,
"learning_rate": 0.0057981219523631404,
"loss": 0.3457,
"num_input_tokens_seen": 358896,
"step": 925
},
{
"epoch": 7.44,
"grad_norm": 0.004241943359375,
"learning_rate": 0.005633624267309767,
"loss": 0.3486,
"num_input_tokens_seen": 360784,
"step": 930
},
{
"epoch": 7.48,
"grad_norm": 0.0262451171875,
"learning_rate": 0.005470952575763933,
"loss": 0.3551,
"num_input_tokens_seen": 362512,
"step": 935
},
{
"epoch": 7.52,
"grad_norm": 0.048095703125,
"learning_rate": 0.0053101385908942405,
"loss": 0.3495,
"num_input_tokens_seen": 364400,
"step": 940
},
{
"epoch": 7.5600000000000005,
"grad_norm": 0.02490234375,
"learning_rate": 0.0051512136637056555,
"loss": 0.3469,
"num_input_tokens_seen": 366288,
"step": 945
},
{
"epoch": 7.5600000000000005,
"eval_loss": 0.34017425775527954,
"eval_runtime": 0.9703,
"eval_samples_per_second": 57.716,
"eval_steps_per_second": 14.429,
"num_input_tokens_seen": 366288,
"step": 945
},
{
"epoch": 7.6,
"grad_norm": 0.0198974609375,
"learning_rate": 0.004994208776927635,
"loss": 0.3549,
"num_input_tokens_seen": 368656,
"step": 950
},
{
"epoch": 7.64,
"grad_norm": 0.0234375,
"learning_rate": 0.004839154538973943,
"loss": 0.35,
"num_input_tokens_seen": 370608,
"step": 955
},
{
"epoch": 7.68,
"grad_norm": 0.0419921875,
"learning_rate": 0.00468608117797549,
"loss": 0.3435,
"num_input_tokens_seen": 372336,
"step": 960
},
{
"epoch": 7.72,
"grad_norm": 0.04296875,
"learning_rate": 0.004535018535887305,
"loss": 0.3387,
"num_input_tokens_seen": 374288,
"step": 965
},
{
"epoch": 7.76,
"grad_norm": 0.004119873046875,
"learning_rate": 0.004385996062670774,
"loss": 0.3529,
"num_input_tokens_seen": 376144,
"step": 970
},
{
"epoch": 7.8,
"grad_norm": 0.021240234375,
"learning_rate": 0.0042390428105523225,
"loss": 0.3513,
"num_input_tokens_seen": 378160,
"step": 975
},
{
"epoch": 7.84,
"grad_norm": 0.0208740234375,
"learning_rate": 0.004094187428359625,
"loss": 0.3481,
"num_input_tokens_seen": 380208,
"step": 980
},
{
"epoch": 7.88,
"grad_norm": 0.0047607421875,
"learning_rate": 0.003951458155936452,
"loss": 0.3401,
"num_input_tokens_seen": 382384,
"step": 985
},
{
"epoch": 7.92,
"grad_norm": 0.005218505859375,
"learning_rate": 0.0038108828186372685,
"loss": 0.3496,
"num_input_tokens_seen": 384688,
"step": 990
},
{
"epoch": 7.96,
"grad_norm": 0.006439208984375,
"learning_rate": 0.003672488821902614,
"loss": 0.3465,
"num_input_tokens_seen": 386736,
"step": 995
},
{
"epoch": 8.0,
"grad_norm": 0.005889892578125,
"learning_rate": 0.0035363031459163647,
"loss": 0.3498,
"num_input_tokens_seen": 388576,
"step": 1000
},
{
"epoch": 8.04,
"grad_norm": 0.003936767578125,
"learning_rate": 0.0034023523403458908,
"loss": 0.342,
"num_input_tokens_seen": 390848,
"step": 1005
},
{
"epoch": 8.064,
"eval_loss": 0.3409908711910248,
"eval_runtime": 0.9724,
"eval_samples_per_second": 57.587,
"eval_steps_per_second": 14.397,
"num_input_tokens_seen": 391840,
"step": 1008
},
{
"epoch": 8.08,
"grad_norm": 0.0211181640625,
"learning_rate": 0.003270662519166149,
"loss": 0.3528,
"num_input_tokens_seen": 392480,
"step": 1010
},
{
"epoch": 8.12,
"grad_norm": 0.043212890625,
"learning_rate": 0.003141259355568705,
"loss": 0.3435,
"num_input_tokens_seen": 394016,
"step": 1015
},
{
"epoch": 8.16,
"grad_norm": 0.00714111328125,
"learning_rate": 0.003014168076956707,
"loss": 0.3498,
"num_input_tokens_seen": 395904,
"step": 1020
},
{
"epoch": 8.2,
"grad_norm": 0.0203857421875,
"learning_rate": 0.002889413460026724,
"loss": 0.3326,
"num_input_tokens_seen": 398272,
"step": 1025
},
{
"epoch": 8.24,
"grad_norm": 0.004486083984375,
"learning_rate": 0.0027670198259385275,
"loss": 0.3563,
"num_input_tokens_seen": 400384,
"step": 1030
},
{
"epoch": 8.28,
"grad_norm": 0.0045166015625,
"learning_rate": 0.0026470110355735882,
"loss": 0.3468,
"num_input_tokens_seen": 402432,
"step": 1035
},
{
"epoch": 8.32,
"grad_norm": 0.0201416015625,
"learning_rate": 0.0025294104848833754,
"loss": 0.3502,
"num_input_tokens_seen": 404448,
"step": 1040
},
{
"epoch": 8.36,
"grad_norm": 0.00701904296875,
"learning_rate": 0.002414241100328251,
"loss": 0.3609,
"num_input_tokens_seen": 406432,
"step": 1045
},
{
"epoch": 8.4,
"grad_norm": 0.0439453125,
"learning_rate": 0.002301525334407931,
"loss": 0.3469,
"num_input_tokens_seen": 408640,
"step": 1050
},
{
"epoch": 8.44,
"grad_norm": 0.02099609375,
"learning_rate": 0.0021912851612843243,
"loss": 0.3373,
"num_input_tokens_seen": 410304,
"step": 1055
},
{
"epoch": 8.48,
"grad_norm": 0.00421142578125,
"learning_rate": 0.002083542072497606,
"loss": 0.3576,
"num_input_tokens_seen": 412064,
"step": 1060
},
{
"epoch": 8.52,
"grad_norm": 0.0240478515625,
"learning_rate": 0.001978317072776413,
"loss": 0.3405,
"num_input_tokens_seen": 414144,
"step": 1065
},
{
"epoch": 8.56,
"grad_norm": 0.003936767578125,
"learning_rate": 0.0018756306759429363,
"loss": 0.3469,
"num_input_tokens_seen": 416032,
"step": 1070
},
{
"epoch": 8.568,
"eval_loss": 0.3438374996185303,
"eval_runtime": 0.9714,
"eval_samples_per_second": 57.648,
"eval_steps_per_second": 14.412,
"num_input_tokens_seen": 416320,
"step": 1071
},
{
"epoch": 8.6,
"grad_norm": 0.0250244140625,
"learning_rate": 0.001775502900913697,
"loss": 0.3547,
"num_input_tokens_seen": 417824,
"step": 1075
},
{
"epoch": 8.64,
"grad_norm": 0.0228271484375,
"learning_rate": 0.0016779532677968327,
"loss": 0.3391,
"num_input_tokens_seen": 420096,
"step": 1080
},
{
"epoch": 8.68,
"grad_norm": 0.00726318359375,
"learning_rate": 0.0015830007940866035,
"loss": 0.3454,
"num_input_tokens_seen": 421824,
"step": 1085
},
{
"epoch": 8.72,
"grad_norm": 0.004730224609375,
"learning_rate": 0.0014906639909558954,
"loss": 0.3451,
"num_input_tokens_seen": 423552,
"step": 1090
},
{
"epoch": 8.76,
"grad_norm": 0.02490234375,
"learning_rate": 0.0014009608596474348,
"loss": 0.3515,
"num_input_tokens_seen": 425376,
"step": 1095
},
{
"epoch": 8.8,
"grad_norm": 0.0238037109375,
"learning_rate": 0.001313908887964409,
"loss": 0.3561,
"num_input_tokens_seen": 427008,
"step": 1100
},
{
"epoch": 8.84,
"grad_norm": 0.005340576171875,
"learning_rate": 0.0012295250468611779,
"loss": 0.3437,
"num_input_tokens_seen": 428960,
"step": 1105
},
{
"epoch": 8.88,
"grad_norm": 0.0242919921875,
"learning_rate": 0.0011478257871347663,
"loss": 0.3498,
"num_input_tokens_seen": 431072,
"step": 1110
},
{
"epoch": 8.92,
"grad_norm": 0.00994873046875,
"learning_rate": 0.0010688270362177355,
"loss": 0.3498,
"num_input_tokens_seen": 433280,
"step": 1115
},
{
"epoch": 8.96,
"grad_norm": 0.004150390625,
"learning_rate": 0.0009925441950730985,
"loss": 0.3357,
"num_input_tokens_seen": 434976,
"step": 1120
},
{
"epoch": 9.0,
"grad_norm": 0.00457763671875,
"learning_rate": 0.0009189921351918889,
"loss": 0.3452,
"num_input_tokens_seen": 436656,
"step": 1125
},
{
"epoch": 9.04,
"grad_norm": 0.0419921875,
"learning_rate": 0.0008481851956939134,
"loss": 0.3467,
"num_input_tokens_seen": 438544,
"step": 1130
},
{
"epoch": 9.072,
"eval_loss": 0.34365683794021606,
"eval_runtime": 0.9736,
"eval_samples_per_second": 57.517,
"eval_steps_per_second": 14.379,
"num_input_tokens_seen": 440048,
"step": 1134
},
{
"epoch": 9.08,
"grad_norm": 0.042724609375,
"learning_rate": 0.0007801371805323276,
"loss": 0.3404,
"num_input_tokens_seen": 440464,
"step": 1135
},
{
"epoch": 9.12,
"grad_norm": 0.00390625,
"learning_rate": 0.0007148613558025102,
"loss": 0.3482,
"num_input_tokens_seen": 442064,
"step": 1140
},
{
"epoch": 9.16,
"grad_norm": 0.007049560546875,
"learning_rate": 0.0006523704471558306,
"loss": 0.3482,
"num_input_tokens_seen": 444016,
"step": 1145
},
{
"epoch": 9.2,
"grad_norm": 0.021484375,
"learning_rate": 0.0005926766373187531,
"loss": 0.3421,
"num_input_tokens_seen": 445904,
"step": 1150
},
{
"epoch": 9.24,
"grad_norm": 0.0247802734375,
"learning_rate": 0.0005357915637177817,
"loss": 0.339,
"num_input_tokens_seen": 448080,
"step": 1155
},
{
"epoch": 9.28,
"grad_norm": 0.0201416015625,
"learning_rate": 0.00048172631621072045,
"loss": 0.3436,
"num_input_tokens_seen": 450352,
"step": 1160
},
{
"epoch": 9.32,
"grad_norm": 0.007720947265625,
"learning_rate": 0.00043049143492470017,
"loss": 0.3389,
"num_input_tokens_seen": 452208,
"step": 1165
},
{
"epoch": 9.36,
"grad_norm": 0.0216064453125,
"learning_rate": 0.00038209690820134145,
"loss": 0.3388,
"num_input_tokens_seen": 454256,
"step": 1170
},
{
"epoch": 9.4,
"grad_norm": 0.0047607421875,
"learning_rate": 0.0003365521706495234,
"loss": 0.3482,
"num_input_tokens_seen": 456048,
"step": 1175
},
{
"epoch": 9.44,
"grad_norm": 0.0205078125,
"learning_rate": 0.00029386610130606504,
"loss": 0.3466,
"num_input_tokens_seen": 457840,
"step": 1180
},
{
"epoch": 9.48,
"grad_norm": 0.005950927734375,
"learning_rate": 0.00025404702190476856,
"loss": 0.3498,
"num_input_tokens_seen": 460080,
"step": 1185
},
{
"epoch": 9.52,
"grad_norm": 0.0211181640625,
"learning_rate": 0.00021710269525405834,
"loss": 0.3497,
"num_input_tokens_seen": 461840,
"step": 1190
},
{
"epoch": 9.56,
"grad_norm": 0.004638671875,
"learning_rate": 0.00018304032372361666,
"loss": 0.3497,
"num_input_tokens_seen": 463952,
"step": 1195
},
{
"epoch": 9.576,
"eval_loss": 0.34312018752098083,
"eval_runtime": 0.9538,
"eval_samples_per_second": 58.714,
"eval_steps_per_second": 14.679,
"num_input_tokens_seen": 464688,
"step": 1197
},
{
"epoch": 9.6,
"grad_norm": 0.0203857421875,
"learning_rate": 0.00015186654784026365,
"loss": 0.3451,
"num_input_tokens_seen": 465904,
"step": 1200
},
{
"epoch": 9.64,
"grad_norm": 0.004669189453125,
"learning_rate": 0.00012358744499337603,
"loss": 0.3531,
"num_input_tokens_seen": 467728,
"step": 1205
},
{
"epoch": 9.68,
"grad_norm": 0.0247802734375,
"learning_rate": 9.820852825008664e-05,
"loss": 0.3466,
"num_input_tokens_seen": 469840,
"step": 1210
},
{
"epoch": 9.72,
"grad_norm": 0.021240234375,
"learning_rate": 7.57347452804974e-05,
"loss": 0.3496,
"num_input_tokens_seen": 472144,
"step": 1215
},
{
"epoch": 9.76,
"grad_norm": 0.0235595703125,
"learning_rate": 5.6170477393130966e-05,
"loss": 0.3387,
"num_input_tokens_seen": 473808,
"step": 1220
},
{
"epoch": 9.8,
"grad_norm": 0.043212890625,
"learning_rate": 3.951953868077229e-05,
"loss": 0.3561,
"num_input_tokens_seen": 475344,
"step": 1225
},
{
"epoch": 9.84,
"grad_norm": 0.020263671875,
"learning_rate": 2.5785175276920034e-05,
"loss": 0.3405,
"num_input_tokens_seen": 477488,
"step": 1230
},
{
"epoch": 9.88,
"grad_norm": 0.0230712890625,
"learning_rate": 1.4970064722929499e-05,
"loss": 0.3498,
"num_input_tokens_seen": 479280,
"step": 1235
},
{
"epoch": 9.92,
"grad_norm": 0.00579833984375,
"learning_rate": 7.076315446033487e-06,
"loss": 0.3451,
"num_input_tokens_seen": 480944,
"step": 1240
},
{
"epoch": 9.96,
"grad_norm": 0.005950927734375,
"learning_rate": 2.105466348294449e-06,
"loss": 0.3468,
"num_input_tokens_seen": 482992,
"step": 1245
},
{
"epoch": 10.0,
"grad_norm": 0.00872802734375,
"learning_rate": 5.848650659112664e-08,
"loss": 0.353,
"num_input_tokens_seen": 485152,
"step": 1250
},
{
"epoch": 10.0,
"num_input_tokens_seen": 485152,
"step": 1250,
"total_flos": 2.1846175286820864e+16,
"train_loss": 0.39903659229278565,
"train_runtime": 185.1613,
"train_samples_per_second": 26.895,
"train_steps_per_second": 6.751
}
],
"logging_steps": 5,
"max_steps": 1250,
"num_input_tokens_seen": 485152,
"num_train_epochs": 10,
"save_steps": 63,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.1846175286820864e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}