Lalu-Prakash's picture
Upload folder using huggingface_hub
97f723d verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 556,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.1117496490478516,
"epoch": 0.007233273056057866,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 2.636,
"mean_token_accuracy": 0.5501300543546677,
"num_tokens": 1403.0,
"step": 1
},
{
"entropy": 1.1266475915908813,
"epoch": 0.014466546112115732,
"grad_norm": 0.0,
"learning_rate": 4e-05,
"loss": 2.5193,
"mean_token_accuracy": 0.545694500207901,
"num_tokens": 2824.0,
"step": 2
},
{
"entropy": 1.093862533569336,
"epoch": 0.0216998191681736,
"grad_norm": 0.0,
"learning_rate": 8e-05,
"loss": 2.5567,
"mean_token_accuracy": 0.5492231100797653,
"num_tokens": 4187.0,
"step": 3
},
{
"entropy": 1.1016794741153717,
"epoch": 0.028933092224231464,
"grad_norm": 0.0,
"learning_rate": 0.00012,
"loss": 2.5151,
"mean_token_accuracy": 0.5384537577629089,
"num_tokens": 5595.0,
"step": 4
},
{
"entropy": 1.1031606495380402,
"epoch": 0.03616636528028933,
"grad_norm": 0.0,
"learning_rate": 0.00016,
"loss": 2.5984,
"mean_token_accuracy": 0.5410859286785126,
"num_tokens": 6938.0,
"step": 5
},
{
"entropy": 1.0830009877681732,
"epoch": 0.0433996383363472,
"grad_norm": 0.0,
"learning_rate": 0.0002,
"loss": 2.4901,
"mean_token_accuracy": 0.5498783439397812,
"num_tokens": 8372.0,
"step": 6
},
{
"entropy": 1.0646579265594482,
"epoch": 0.05063291139240506,
"grad_norm": 0.0,
"learning_rate": 0.00019963702359346644,
"loss": 2.4523,
"mean_token_accuracy": 0.551283709704876,
"num_tokens": 9816.0,
"step": 7
},
{
"entropy": 1.1242448091506958,
"epoch": 0.05786618444846293,
"grad_norm": 0.0,
"learning_rate": 0.00019927404718693284,
"loss": 2.5337,
"mean_token_accuracy": 0.5342100262641907,
"num_tokens": 11199.0,
"step": 8
},
{
"entropy": 1.0672060549259186,
"epoch": 0.0650994575045208,
"grad_norm": 0.0,
"learning_rate": 0.00019891107078039928,
"loss": 2.4281,
"mean_token_accuracy": 0.5488722920417786,
"num_tokens": 12583.0,
"step": 9
},
{
"entropy": 1.1072518527507782,
"epoch": 0.07233273056057866,
"grad_norm": 0.0,
"learning_rate": 0.0001985480943738657,
"loss": 2.5747,
"mean_token_accuracy": 0.526348888874054,
"num_tokens": 13959.0,
"step": 10
},
{
"entropy": 1.0887023508548737,
"epoch": 0.07956600361663653,
"grad_norm": 0.0,
"learning_rate": 0.0001981851179673321,
"loss": 2.575,
"mean_token_accuracy": 0.5386092066764832,
"num_tokens": 15344.0,
"step": 11
},
{
"entropy": 1.053345412015915,
"epoch": 0.0867992766726944,
"grad_norm": 0.0,
"learning_rate": 0.00019782214156079857,
"loss": 2.4709,
"mean_token_accuracy": 0.5469983220100403,
"num_tokens": 16791.0,
"step": 12
},
{
"entropy": 1.1248537600040436,
"epoch": 0.09403254972875226,
"grad_norm": 0.0,
"learning_rate": 0.000197459165154265,
"loss": 2.3924,
"mean_token_accuracy": 0.5607019513845444,
"num_tokens": 18262.0,
"step": 13
},
{
"entropy": 1.1298492848873138,
"epoch": 0.10126582278481013,
"grad_norm": 0.0,
"learning_rate": 0.0001970961887477314,
"loss": 2.3621,
"mean_token_accuracy": 0.5567697584629059,
"num_tokens": 19693.0,
"step": 14
},
{
"entropy": 1.1234960854053497,
"epoch": 0.10849909584086799,
"grad_norm": 0.0,
"learning_rate": 0.00019673321234119784,
"loss": 2.4681,
"mean_token_accuracy": 0.5486533343791962,
"num_tokens": 21081.0,
"step": 15
},
{
"entropy": 1.155756562948227,
"epoch": 0.11573236889692586,
"grad_norm": 0.0,
"learning_rate": 0.00019637023593466427,
"loss": 2.5987,
"mean_token_accuracy": 0.5197644233703613,
"num_tokens": 22400.0,
"step": 16
},
{
"entropy": 1.0533479899168015,
"epoch": 0.12296564195298372,
"grad_norm": 0.0,
"learning_rate": 0.00019600725952813067,
"loss": 2.3976,
"mean_token_accuracy": 0.5573096722364426,
"num_tokens": 23833.0,
"step": 17
},
{
"entropy": 1.1212570667266846,
"epoch": 0.1301989150090416,
"grad_norm": 0.0,
"learning_rate": 0.0001956442831215971,
"loss": 2.6562,
"mean_token_accuracy": 0.5246226489543915,
"num_tokens": 25173.0,
"step": 18
},
{
"entropy": 1.0642149150371552,
"epoch": 0.13743218806509946,
"grad_norm": 0.0,
"learning_rate": 0.00019528130671506353,
"loss": 2.5246,
"mean_token_accuracy": 0.5611959397792816,
"num_tokens": 26505.0,
"step": 19
},
{
"entropy": 1.0779947936534882,
"epoch": 0.14466546112115733,
"grad_norm": 0.0,
"learning_rate": 0.00019491833030852994,
"loss": 2.5068,
"mean_token_accuracy": 0.5473824739456177,
"num_tokens": 27946.0,
"step": 20
},
{
"entropy": 1.1348789632320404,
"epoch": 0.1518987341772152,
"grad_norm": 0.0,
"learning_rate": 0.00019455535390199637,
"loss": 2.4636,
"mean_token_accuracy": 0.5414505749940872,
"num_tokens": 29342.0,
"step": 21
},
{
"entropy": 1.094531387090683,
"epoch": 0.15913200723327306,
"grad_norm": 0.0,
"learning_rate": 0.0001941923774954628,
"loss": 2.4485,
"mean_token_accuracy": 0.5563794821500778,
"num_tokens": 30704.0,
"step": 22
},
{
"entropy": 1.106130212545395,
"epoch": 0.16636528028933092,
"grad_norm": 0.0,
"learning_rate": 0.00019382940108892923,
"loss": 2.4934,
"mean_token_accuracy": 0.5429124981164932,
"num_tokens": 32117.0,
"step": 23
},
{
"entropy": 1.1143657565116882,
"epoch": 0.1735985533453888,
"grad_norm": 0.0,
"learning_rate": 0.00019346642468239566,
"loss": 2.4279,
"mean_token_accuracy": 0.5571769028902054,
"num_tokens": 33548.0,
"step": 24
},
{
"entropy": 1.0552422404289246,
"epoch": 0.18083182640144665,
"grad_norm": 0.0,
"learning_rate": 0.0001931034482758621,
"loss": 2.4475,
"mean_token_accuracy": 0.5655805617570877,
"num_tokens": 34947.0,
"step": 25
},
{
"entropy": 1.0628145337104797,
"epoch": 0.18806509945750452,
"grad_norm": 0.0,
"learning_rate": 0.0001927404718693285,
"loss": 2.5125,
"mean_token_accuracy": 0.556572213768959,
"num_tokens": 36377.0,
"step": 26
},
{
"entropy": 1.1124870479106903,
"epoch": 0.19529837251356238,
"grad_norm": 0.0,
"learning_rate": 0.00019237749546279493,
"loss": 2.5352,
"mean_token_accuracy": 0.5449838191270828,
"num_tokens": 37708.0,
"step": 27
},
{
"entropy": 1.1239155530929565,
"epoch": 0.20253164556962025,
"grad_norm": 0.0,
"learning_rate": 0.00019201451905626136,
"loss": 2.6023,
"mean_token_accuracy": 0.5333832204341888,
"num_tokens": 39029.0,
"step": 28
},
{
"entropy": 1.0808220505714417,
"epoch": 0.20976491862567812,
"grad_norm": 0.0,
"learning_rate": 0.0001916515426497278,
"loss": 2.4872,
"mean_token_accuracy": 0.55105359852314,
"num_tokens": 40431.0,
"step": 29
},
{
"entropy": 1.09044648706913,
"epoch": 0.21699819168173598,
"grad_norm": 0.0,
"learning_rate": 0.0001912885662431942,
"loss": 2.5347,
"mean_token_accuracy": 0.5469508171081543,
"num_tokens": 41795.0,
"step": 30
},
{
"entropy": 1.096373587846756,
"epoch": 0.22423146473779385,
"grad_norm": 0.0,
"learning_rate": 0.00019092558983666063,
"loss": 2.5209,
"mean_token_accuracy": 0.5446725934743881,
"num_tokens": 43161.0,
"step": 31
},
{
"entropy": 1.1086195409297943,
"epoch": 0.2314647377938517,
"grad_norm": 0.0,
"learning_rate": 0.00019056261343012706,
"loss": 2.5568,
"mean_token_accuracy": 0.5395984202623367,
"num_tokens": 44548.0,
"step": 32
},
{
"entropy": 1.096370369195938,
"epoch": 0.23869801084990958,
"grad_norm": 0.0,
"learning_rate": 0.00019019963702359346,
"loss": 2.5431,
"mean_token_accuracy": 0.5369362384080887,
"num_tokens": 45907.0,
"step": 33
},
{
"entropy": 1.0829149782657623,
"epoch": 0.24593128390596744,
"grad_norm": 0.0,
"learning_rate": 0.0001898366606170599,
"loss": 2.5536,
"mean_token_accuracy": 0.5458774566650391,
"num_tokens": 47273.0,
"step": 34
},
{
"entropy": 1.1014858782291412,
"epoch": 0.25316455696202533,
"grad_norm": 0.0,
"learning_rate": 0.00018947368421052632,
"loss": 2.5563,
"mean_token_accuracy": 0.5427176207304001,
"num_tokens": 48630.0,
"step": 35
},
{
"entropy": 1.0944049954414368,
"epoch": 0.2603978300180832,
"grad_norm": 0.0,
"learning_rate": 0.00018911070780399275,
"loss": 2.5237,
"mean_token_accuracy": 0.543226882815361,
"num_tokens": 50010.0,
"step": 36
},
{
"entropy": 1.0851400792598724,
"epoch": 0.26763110307414106,
"grad_norm": 0.0,
"learning_rate": 0.00018874773139745919,
"loss": 2.5001,
"mean_token_accuracy": 0.5489733219146729,
"num_tokens": 51422.0,
"step": 37
},
{
"entropy": 1.0902953743934631,
"epoch": 0.27486437613019893,
"grad_norm": 0.0,
"learning_rate": 0.00018838475499092562,
"loss": 2.5394,
"mean_token_accuracy": 0.5441585332155228,
"num_tokens": 52829.0,
"step": 38
},
{
"entropy": 1.0722315609455109,
"epoch": 0.2820976491862568,
"grad_norm": 0.0,
"learning_rate": 0.00018802177858439202,
"loss": 2.5113,
"mean_token_accuracy": 0.544213131070137,
"num_tokens": 54184.0,
"step": 39
},
{
"entropy": 1.1416008174419403,
"epoch": 0.28933092224231466,
"grad_norm": 0.0,
"learning_rate": 0.00018765880217785845,
"loss": 2.5777,
"mean_token_accuracy": 0.5215476900339127,
"num_tokens": 55564.0,
"step": 40
},
{
"entropy": 1.1114321053028107,
"epoch": 0.2965641952983725,
"grad_norm": 0.0,
"learning_rate": 0.00018729582577132488,
"loss": 2.5755,
"mean_token_accuracy": 0.5520491451025009,
"num_tokens": 56976.0,
"step": 41
},
{
"entropy": 1.0591959059238434,
"epoch": 0.3037974683544304,
"grad_norm": 0.0,
"learning_rate": 0.0001869328493647913,
"loss": 2.4821,
"mean_token_accuracy": 0.5502973049879074,
"num_tokens": 58387.0,
"step": 42
},
{
"entropy": 1.116980403661728,
"epoch": 0.31103074141048825,
"grad_norm": 0.0,
"learning_rate": 0.00018656987295825772,
"loss": 2.485,
"mean_token_accuracy": 0.5487600713968277,
"num_tokens": 59720.0,
"step": 43
},
{
"entropy": 1.0845508873462677,
"epoch": 0.3182640144665461,
"grad_norm": 0.0,
"learning_rate": 0.00018620689655172415,
"loss": 2.629,
"mean_token_accuracy": 0.5307356417179108,
"num_tokens": 61066.0,
"step": 44
},
{
"entropy": 1.0685547292232513,
"epoch": 0.325497287522604,
"grad_norm": 0.0,
"learning_rate": 0.00018584392014519055,
"loss": 2.4535,
"mean_token_accuracy": 0.5622027516365051,
"num_tokens": 62411.0,
"step": 45
},
{
"entropy": 1.1736293733119965,
"epoch": 0.33273056057866185,
"grad_norm": 0.0,
"learning_rate": 0.00018548094373865698,
"loss": 2.5437,
"mean_token_accuracy": 0.5360411405563354,
"num_tokens": 63821.0,
"step": 46
},
{
"entropy": 1.1335414946079254,
"epoch": 0.3399638336347197,
"grad_norm": 0.0,
"learning_rate": 0.00018511796733212342,
"loss": 2.4349,
"mean_token_accuracy": 0.5393011569976807,
"num_tokens": 65280.0,
"step": 47
},
{
"entropy": 1.109579473733902,
"epoch": 0.3471971066907776,
"grad_norm": 0.0,
"learning_rate": 0.00018475499092558985,
"loss": 2.3379,
"mean_token_accuracy": 0.5629399716854095,
"num_tokens": 66754.0,
"step": 48
},
{
"entropy": 1.0618711709976196,
"epoch": 0.35443037974683544,
"grad_norm": 0.0,
"learning_rate": 0.00018439201451905628,
"loss": 2.4828,
"mean_token_accuracy": 0.5528846383094788,
"num_tokens": 68154.0,
"step": 49
},
{
"entropy": 1.0883667767047882,
"epoch": 0.3616636528028933,
"grad_norm": 0.0,
"learning_rate": 0.0001840290381125227,
"loss": 2.3981,
"mean_token_accuracy": 0.5410507619380951,
"num_tokens": 69549.0,
"step": 50
},
{
"entropy": 1.1461284458637238,
"epoch": 0.3688969258589512,
"grad_norm": 0.0,
"learning_rate": 0.00018366606170598911,
"loss": 2.4298,
"mean_token_accuracy": 0.5563310235738754,
"num_tokens": 70850.0,
"step": 51
},
{
"entropy": 1.0271961838006973,
"epoch": 0.37613019891500904,
"grad_norm": 0.0,
"learning_rate": 0.00018330308529945554,
"loss": 2.455,
"mean_token_accuracy": 0.5506248325109482,
"num_tokens": 72256.0,
"step": 52
},
{
"entropy": 1.1474315524101257,
"epoch": 0.3833634719710669,
"grad_norm": 0.0,
"learning_rate": 0.00018294010889292198,
"loss": 2.5817,
"mean_token_accuracy": 0.5281436145305634,
"num_tokens": 73633.0,
"step": 53
},
{
"entropy": 1.1158095598220825,
"epoch": 0.39059674502712477,
"grad_norm": 0.0,
"learning_rate": 0.00018257713248638838,
"loss": 2.558,
"mean_token_accuracy": 0.5412166118621826,
"num_tokens": 74998.0,
"step": 54
},
{
"entropy": 1.0468406975269318,
"epoch": 0.39783001808318263,
"grad_norm": 0.0,
"learning_rate": 0.0001822141560798548,
"loss": 2.4055,
"mean_token_accuracy": 0.562886506319046,
"num_tokens": 76370.0,
"step": 55
},
{
"entropy": 1.1383771300315857,
"epoch": 0.4050632911392405,
"grad_norm": 0.0,
"learning_rate": 0.00018185117967332124,
"loss": 2.5243,
"mean_token_accuracy": 0.5516170412302017,
"num_tokens": 77676.0,
"step": 56
},
{
"entropy": 1.1219291388988495,
"epoch": 0.41229656419529837,
"grad_norm": 0.0,
"learning_rate": 0.00018148820326678765,
"loss": 2.5717,
"mean_token_accuracy": 0.5452308058738708,
"num_tokens": 79014.0,
"step": 57
},
{
"entropy": 1.1263241171836853,
"epoch": 0.41952983725135623,
"grad_norm": 0.0,
"learning_rate": 0.00018112522686025408,
"loss": 2.5163,
"mean_token_accuracy": 0.5439134389162064,
"num_tokens": 80362.0,
"step": 58
},
{
"entropy": 1.0441412180662155,
"epoch": 0.4267631103074141,
"grad_norm": 0.0,
"learning_rate": 0.00018076225045372054,
"loss": 2.4519,
"mean_token_accuracy": 0.5616925954818726,
"num_tokens": 81795.0,
"step": 59
},
{
"entropy": 1.078106850385666,
"epoch": 0.43399638336347196,
"grad_norm": 0.0,
"learning_rate": 0.00018039927404718694,
"loss": 2.5087,
"mean_token_accuracy": 0.5514906644821167,
"num_tokens": 83176.0,
"step": 60
},
{
"entropy": 1.0974124670028687,
"epoch": 0.4412296564195298,
"grad_norm": 0.0,
"learning_rate": 0.00018003629764065337,
"loss": 2.6424,
"mean_token_accuracy": 0.5243298336863518,
"num_tokens": 84553.0,
"step": 61
},
{
"entropy": 1.1125823557376862,
"epoch": 0.4484629294755877,
"grad_norm": 0.0,
"learning_rate": 0.0001796733212341198,
"loss": 2.4969,
"mean_token_accuracy": 0.5476825684309006,
"num_tokens": 85861.0,
"step": 62
},
{
"entropy": 1.1592190861701965,
"epoch": 0.45569620253164556,
"grad_norm": 0.0,
"learning_rate": 0.0001793103448275862,
"loss": 2.6226,
"mean_token_accuracy": 0.5309869945049286,
"num_tokens": 87231.0,
"step": 63
},
{
"entropy": 1.0958653390407562,
"epoch": 0.4629294755877034,
"grad_norm": 0.0,
"learning_rate": 0.00017894736842105264,
"loss": 2.5135,
"mean_token_accuracy": 0.5463672578334808,
"num_tokens": 88563.0,
"step": 64
},
{
"entropy": 1.079408586025238,
"epoch": 0.4701627486437613,
"grad_norm": 0.0,
"learning_rate": 0.00017858439201451907,
"loss": 2.4558,
"mean_token_accuracy": 0.5488910973072052,
"num_tokens": 89953.0,
"step": 65
},
{
"entropy": 1.1434482634067535,
"epoch": 0.47739602169981915,
"grad_norm": 0.0,
"learning_rate": 0.0001782214156079855,
"loss": 2.51,
"mean_token_accuracy": 0.5425025671720505,
"num_tokens": 91361.0,
"step": 66
},
{
"entropy": 1.1196760833263397,
"epoch": 0.484629294755877,
"grad_norm": 0.0,
"learning_rate": 0.0001778584392014519,
"loss": 2.5184,
"mean_token_accuracy": 0.5284354239702225,
"num_tokens": 92710.0,
"step": 67
},
{
"entropy": 1.1134625673294067,
"epoch": 0.4918625678119349,
"grad_norm": 0.0,
"learning_rate": 0.00017749546279491833,
"loss": 2.5273,
"mean_token_accuracy": 0.5285885334014893,
"num_tokens": 94147.0,
"step": 68
},
{
"entropy": 1.0853535532951355,
"epoch": 0.49909584086799275,
"grad_norm": 0.0,
"learning_rate": 0.00017713248638838477,
"loss": 2.5684,
"mean_token_accuracy": 0.5399613529443741,
"num_tokens": 95496.0,
"step": 69
},
{
"entropy": 1.1049972176551819,
"epoch": 0.5063291139240507,
"grad_norm": 0.0,
"learning_rate": 0.00017676950998185117,
"loss": 2.5876,
"mean_token_accuracy": 0.5382668077945709,
"num_tokens": 96879.0,
"step": 70
},
{
"entropy": 1.0942886769771576,
"epoch": 0.5135623869801085,
"grad_norm": 0.0,
"learning_rate": 0.00017640653357531763,
"loss": 2.5334,
"mean_token_accuracy": 0.540867269039154,
"num_tokens": 98252.0,
"step": 71
},
{
"entropy": 1.101898044347763,
"epoch": 0.5207956600361664,
"grad_norm": 0.0,
"learning_rate": 0.00017604355716878403,
"loss": 2.4839,
"mean_token_accuracy": 0.5541739910840988,
"num_tokens": 99645.0,
"step": 72
},
{
"entropy": 1.056696429848671,
"epoch": 0.5280289330922242,
"grad_norm": 0.0,
"learning_rate": 0.00017568058076225046,
"loss": 2.5304,
"mean_token_accuracy": 0.5393856465816498,
"num_tokens": 101032.0,
"step": 73
},
{
"entropy": 1.1291647851467133,
"epoch": 0.5352622061482821,
"grad_norm": 0.0,
"learning_rate": 0.0001753176043557169,
"loss": 2.5736,
"mean_token_accuracy": 0.5367253422737122,
"num_tokens": 102394.0,
"step": 74
},
{
"entropy": 1.1129019260406494,
"epoch": 0.5424954792043399,
"grad_norm": 0.0,
"learning_rate": 0.00017495462794918333,
"loss": 2.4803,
"mean_token_accuracy": 0.5489845424890518,
"num_tokens": 103803.0,
"step": 75
},
{
"entropy": 1.0258008986711502,
"epoch": 0.5497287522603979,
"grad_norm": 0.0,
"learning_rate": 0.00017459165154264973,
"loss": 2.4433,
"mean_token_accuracy": 0.562438115477562,
"num_tokens": 105227.0,
"step": 76
},
{
"entropy": 1.0876842439174652,
"epoch": 0.5569620253164557,
"grad_norm": 0.0,
"learning_rate": 0.00017422867513611616,
"loss": 2.4997,
"mean_token_accuracy": 0.5498750060796738,
"num_tokens": 106564.0,
"step": 77
},
{
"entropy": 1.1114664673805237,
"epoch": 0.5641952983725136,
"grad_norm": 0.0,
"learning_rate": 0.0001738656987295826,
"loss": 2.4511,
"mean_token_accuracy": 0.5540181249380112,
"num_tokens": 107971.0,
"step": 78
},
{
"entropy": 1.1240213513374329,
"epoch": 0.5714285714285714,
"grad_norm": 0.0,
"learning_rate": 0.000173502722323049,
"loss": 2.5108,
"mean_token_accuracy": 0.5341273844242096,
"num_tokens": 109325.0,
"step": 79
},
{
"entropy": 1.126657396554947,
"epoch": 0.5786618444846293,
"grad_norm": 0.0,
"learning_rate": 0.00017313974591651543,
"loss": 2.55,
"mean_token_accuracy": 0.534925252199173,
"num_tokens": 110659.0,
"step": 80
},
{
"entropy": 1.0591940432786942,
"epoch": 0.5858951175406871,
"grad_norm": 0.0,
"learning_rate": 0.00017277676950998186,
"loss": 2.5122,
"mean_token_accuracy": 0.5615774989128113,
"num_tokens": 112064.0,
"step": 81
},
{
"entropy": 1.080923855304718,
"epoch": 0.593128390596745,
"grad_norm": 0.0,
"learning_rate": 0.00017241379310344826,
"loss": 2.5076,
"mean_token_accuracy": 0.5642599016427994,
"num_tokens": 113390.0,
"step": 82
},
{
"entropy": 1.0417269170284271,
"epoch": 0.6003616636528029,
"grad_norm": 0.0,
"learning_rate": 0.00017205081669691472,
"loss": 2.5414,
"mean_token_accuracy": 0.5496295690536499,
"num_tokens": 114784.0,
"step": 83
},
{
"entropy": 1.0593893826007843,
"epoch": 0.6075949367088608,
"grad_norm": 0.0,
"learning_rate": 0.00017168784029038115,
"loss": 2.4682,
"mean_token_accuracy": 0.5482211858034134,
"num_tokens": 116148.0,
"step": 84
},
{
"entropy": 1.1121591329574585,
"epoch": 0.6148282097649186,
"grad_norm": 0.0,
"learning_rate": 0.00017132486388384756,
"loss": 2.5569,
"mean_token_accuracy": 0.5326060503721237,
"num_tokens": 117506.0,
"step": 85
},
{
"entropy": 1.0343488901853561,
"epoch": 0.6220614828209765,
"grad_norm": 0.0,
"learning_rate": 0.000170961887477314,
"loss": 2.4415,
"mean_token_accuracy": 0.5560837835073471,
"num_tokens": 118976.0,
"step": 86
},
{
"entropy": 1.1269948780536652,
"epoch": 0.6292947558770343,
"grad_norm": 0.0,
"learning_rate": 0.00017059891107078042,
"loss": 2.4604,
"mean_token_accuracy": 0.5488670170307159,
"num_tokens": 120366.0,
"step": 87
},
{
"entropy": 1.0923850238323212,
"epoch": 0.6365280289330922,
"grad_norm": 0.0,
"learning_rate": 0.00017023593466424682,
"loss": 2.4487,
"mean_token_accuracy": 0.5475012511014938,
"num_tokens": 121782.0,
"step": 88
},
{
"entropy": 1.1426692605018616,
"epoch": 0.64376130198915,
"grad_norm": 0.0,
"learning_rate": 0.00016987295825771325,
"loss": 2.4899,
"mean_token_accuracy": 0.5527313947677612,
"num_tokens": 123192.0,
"step": 89
},
{
"entropy": 1.1227373778820038,
"epoch": 0.650994575045208,
"grad_norm": 0.0,
"learning_rate": 0.00016950998185117968,
"loss": 2.5995,
"mean_token_accuracy": 0.541309654712677,
"num_tokens": 124494.0,
"step": 90
},
{
"entropy": 1.1019038259983063,
"epoch": 0.6582278481012658,
"grad_norm": 0.0,
"learning_rate": 0.0001691470054446461,
"loss": 2.5882,
"mean_token_accuracy": 0.5407281070947647,
"num_tokens": 125814.0,
"step": 91
},
{
"entropy": 1.1230742931365967,
"epoch": 0.6654611211573237,
"grad_norm": 0.0,
"learning_rate": 0.00016878402903811252,
"loss": 2.4567,
"mean_token_accuracy": 0.5464838892221451,
"num_tokens": 127254.0,
"step": 92
},
{
"entropy": 1.108274668455124,
"epoch": 0.6726943942133815,
"grad_norm": 0.0,
"learning_rate": 0.00016842105263157895,
"loss": 2.6057,
"mean_token_accuracy": 0.5359428226947784,
"num_tokens": 128607.0,
"step": 93
},
{
"entropy": 1.0955757647752762,
"epoch": 0.6799276672694394,
"grad_norm": 0.0,
"learning_rate": 0.00016805807622504538,
"loss": 2.5135,
"mean_token_accuracy": 0.5578029453754425,
"num_tokens": 129995.0,
"step": 94
},
{
"entropy": 1.0912747085094452,
"epoch": 0.6871609403254972,
"grad_norm": 0.0,
"learning_rate": 0.0001676950998185118,
"loss": 2.5535,
"mean_token_accuracy": 0.5480784773826599,
"num_tokens": 131342.0,
"step": 95
},
{
"entropy": 1.109260231256485,
"epoch": 0.6943942133815552,
"grad_norm": 0.0,
"learning_rate": 0.00016733212341197824,
"loss": 2.5485,
"mean_token_accuracy": 0.5332305133342743,
"num_tokens": 132706.0,
"step": 96
},
{
"entropy": 1.1296232044696808,
"epoch": 0.701627486437613,
"grad_norm": 0.0,
"learning_rate": 0.00016696914700544465,
"loss": 2.4693,
"mean_token_accuracy": 0.5429563969373703,
"num_tokens": 134068.0,
"step": 97
},
{
"entropy": 1.110869139432907,
"epoch": 0.7088607594936709,
"grad_norm": 0.0,
"learning_rate": 0.00016660617059891108,
"loss": 2.544,
"mean_token_accuracy": 0.5363138318061829,
"num_tokens": 135503.0,
"step": 98
},
{
"entropy": 1.1182821094989777,
"epoch": 0.7160940325497287,
"grad_norm": 0.0,
"learning_rate": 0.0001662431941923775,
"loss": 2.4547,
"mean_token_accuracy": 0.5564229190349579,
"num_tokens": 136916.0,
"step": 99
},
{
"entropy": 1.123103141784668,
"epoch": 0.7233273056057866,
"grad_norm": 0.0,
"learning_rate": 0.00016588021778584392,
"loss": 2.5336,
"mean_token_accuracy": 0.5383445471525192,
"num_tokens": 138301.0,
"step": 100
},
{
"entropy": 1.0634585916996002,
"epoch": 0.7305605786618445,
"grad_norm": 0.0,
"learning_rate": 0.00016551724137931035,
"loss": 2.4965,
"mean_token_accuracy": 0.553234338760376,
"num_tokens": 139688.0,
"step": 101
},
{
"entropy": 1.0800490379333496,
"epoch": 0.7377938517179023,
"grad_norm": 0.0,
"learning_rate": 0.00016515426497277678,
"loss": 2.5361,
"mean_token_accuracy": 0.5373943001031876,
"num_tokens": 141048.0,
"step": 102
},
{
"entropy": 1.1085204184055328,
"epoch": 0.7450271247739603,
"grad_norm": 0.0,
"learning_rate": 0.0001647912885662432,
"loss": 2.5261,
"mean_token_accuracy": 0.5488307178020477,
"num_tokens": 142376.0,
"step": 103
},
{
"entropy": 1.101633608341217,
"epoch": 0.7522603978300181,
"grad_norm": 0.0,
"learning_rate": 0.0001644283121597096,
"loss": 2.465,
"mean_token_accuracy": 0.5500718951225281,
"num_tokens": 143718.0,
"step": 104
},
{
"entropy": 1.0608249604701996,
"epoch": 0.759493670886076,
"grad_norm": 0.0,
"learning_rate": 0.00016406533575317604,
"loss": 2.4165,
"mean_token_accuracy": 0.5679080486297607,
"num_tokens": 145194.0,
"step": 105
},
{
"entropy": 1.0754668712615967,
"epoch": 0.7667269439421338,
"grad_norm": 0.0,
"learning_rate": 0.00016370235934664247,
"loss": 2.5464,
"mean_token_accuracy": 0.5387931615114212,
"num_tokens": 146550.0,
"step": 106
},
{
"entropy": 1.1067388951778412,
"epoch": 0.7739602169981917,
"grad_norm": 0.0,
"learning_rate": 0.0001633393829401089,
"loss": 2.5155,
"mean_token_accuracy": 0.5482369661331177,
"num_tokens": 147913.0,
"step": 107
},
{
"entropy": 1.1152404248714447,
"epoch": 0.7811934900542495,
"grad_norm": 0.0,
"learning_rate": 0.00016297640653357534,
"loss": 2.5465,
"mean_token_accuracy": 0.5414672940969467,
"num_tokens": 149291.0,
"step": 108
},
{
"entropy": 1.125922292470932,
"epoch": 0.7884267631103075,
"grad_norm": 0.0,
"learning_rate": 0.00016261343012704177,
"loss": 2.5742,
"mean_token_accuracy": 0.5460067391395569,
"num_tokens": 150642.0,
"step": 109
},
{
"entropy": 1.1228927671909332,
"epoch": 0.7956600361663653,
"grad_norm": 0.0,
"learning_rate": 0.00016225045372050817,
"loss": 2.4777,
"mean_token_accuracy": 0.5416840761899948,
"num_tokens": 151976.0,
"step": 110
},
{
"entropy": 1.1512284874916077,
"epoch": 0.8028933092224232,
"grad_norm": 0.0,
"learning_rate": 0.0001618874773139746,
"loss": 2.4426,
"mean_token_accuracy": 0.5420899093151093,
"num_tokens": 153365.0,
"step": 111
},
{
"entropy": 1.0402842164039612,
"epoch": 0.810126582278481,
"grad_norm": 0.0,
"learning_rate": 0.00016152450090744103,
"loss": 2.511,
"mean_token_accuracy": 0.5547743141651154,
"num_tokens": 154672.0,
"step": 112
},
{
"entropy": 1.070629358291626,
"epoch": 0.8173598553345389,
"grad_norm": 0.0,
"learning_rate": 0.00016116152450090744,
"loss": 2.5149,
"mean_token_accuracy": 0.5439868271350861,
"num_tokens": 156040.0,
"step": 113
},
{
"entropy": 1.0861331224441528,
"epoch": 0.8245931283905967,
"grad_norm": 0.0,
"learning_rate": 0.00016079854809437387,
"loss": 2.5305,
"mean_token_accuracy": 0.533889502286911,
"num_tokens": 157355.0,
"step": 114
},
{
"entropy": 1.0892849266529083,
"epoch": 0.8318264014466547,
"grad_norm": 0.0,
"learning_rate": 0.0001604355716878403,
"loss": 2.5118,
"mean_token_accuracy": 0.5504042059183121,
"num_tokens": 158758.0,
"step": 115
},
{
"entropy": 1.0792168974876404,
"epoch": 0.8390596745027125,
"grad_norm": 0.0,
"learning_rate": 0.0001600725952813067,
"loss": 2.6109,
"mean_token_accuracy": 0.5412319302558899,
"num_tokens": 160121.0,
"step": 116
},
{
"entropy": 1.1541117131710052,
"epoch": 0.8462929475587704,
"grad_norm": 0.0,
"learning_rate": 0.00015970961887477314,
"loss": 2.4966,
"mean_token_accuracy": 0.5413856208324432,
"num_tokens": 161480.0,
"step": 117
},
{
"entropy": 1.089859127998352,
"epoch": 0.8535262206148282,
"grad_norm": 0.0,
"learning_rate": 0.0001593466424682396,
"loss": 2.5301,
"mean_token_accuracy": 0.5545037686824799,
"num_tokens": 162869.0,
"step": 118
},
{
"entropy": 1.0838179290294647,
"epoch": 0.8607594936708861,
"grad_norm": 0.0,
"learning_rate": 0.000158983666061706,
"loss": 2.5073,
"mean_token_accuracy": 0.5579670369625092,
"num_tokens": 164228.0,
"step": 119
},
{
"entropy": 1.156173586845398,
"epoch": 0.8679927667269439,
"grad_norm": 0.0,
"learning_rate": 0.00015862068965517243,
"loss": 2.5787,
"mean_token_accuracy": 0.5258647873997688,
"num_tokens": 165624.0,
"step": 120
},
{
"entropy": 1.1117530465126038,
"epoch": 0.8752260397830018,
"grad_norm": 0.0,
"learning_rate": 0.00015825771324863886,
"loss": 2.6018,
"mean_token_accuracy": 0.5411983877420425,
"num_tokens": 167013.0,
"step": 121
},
{
"entropy": 1.1654876172542572,
"epoch": 0.8824593128390597,
"grad_norm": 0.0,
"learning_rate": 0.00015789473684210527,
"loss": 2.5169,
"mean_token_accuracy": 0.551274299621582,
"num_tokens": 168361.0,
"step": 122
},
{
"entropy": 1.1018896400928497,
"epoch": 0.8896925858951176,
"grad_norm": 0.0,
"learning_rate": 0.0001575317604355717,
"loss": 2.5214,
"mean_token_accuracy": 0.5413067489862442,
"num_tokens": 169751.0,
"step": 123
},
{
"entropy": 1.0984440445899963,
"epoch": 0.8969258589511754,
"grad_norm": 0.0,
"learning_rate": 0.00015716878402903813,
"loss": 2.5656,
"mean_token_accuracy": 0.5381608605384827,
"num_tokens": 171127.0,
"step": 124
},
{
"entropy": 1.0864254534244537,
"epoch": 0.9041591320072333,
"grad_norm": 0.0,
"learning_rate": 0.00015680580762250453,
"loss": 2.5288,
"mean_token_accuracy": 0.5271068960428238,
"num_tokens": 172564.0,
"step": 125
},
{
"entropy": 1.0896047800779343,
"epoch": 0.9113924050632911,
"grad_norm": 0.0,
"learning_rate": 0.00015644283121597096,
"loss": 2.4613,
"mean_token_accuracy": 0.5481883883476257,
"num_tokens": 173965.0,
"step": 126
},
{
"entropy": 1.0906076729297638,
"epoch": 0.918625678119349,
"grad_norm": 0.0,
"learning_rate": 0.0001560798548094374,
"loss": 2.4223,
"mean_token_accuracy": 0.5527500957250595,
"num_tokens": 175364.0,
"step": 127
},
{
"entropy": 1.064597100019455,
"epoch": 0.9258589511754068,
"grad_norm": 0.0,
"learning_rate": 0.0001557168784029038,
"loss": 2.5124,
"mean_token_accuracy": 0.551444873213768,
"num_tokens": 176694.0,
"step": 128
},
{
"entropy": 1.1372613608837128,
"epoch": 0.9330922242314648,
"grad_norm": 0.0,
"learning_rate": 0.00015535390199637023,
"loss": 2.5197,
"mean_token_accuracy": 0.5470752865076065,
"num_tokens": 178074.0,
"step": 129
},
{
"entropy": 1.1365208625793457,
"epoch": 0.9403254972875226,
"grad_norm": 0.0,
"learning_rate": 0.0001549909255898367,
"loss": 2.5013,
"mean_token_accuracy": 0.5439895391464233,
"num_tokens": 179506.0,
"step": 130
},
{
"entropy": 1.1201069951057434,
"epoch": 0.9475587703435805,
"grad_norm": 0.0,
"learning_rate": 0.0001546279491833031,
"loss": 2.4971,
"mean_token_accuracy": 0.5508686006069183,
"num_tokens": 180890.0,
"step": 131
},
{
"entropy": 1.1062006503343582,
"epoch": 0.9547920433996383,
"grad_norm": 0.0,
"learning_rate": 0.00015426497277676952,
"loss": 2.5202,
"mean_token_accuracy": 0.5457829236984253,
"num_tokens": 182250.0,
"step": 132
},
{
"entropy": 1.08922478556633,
"epoch": 0.9620253164556962,
"grad_norm": 0.0,
"learning_rate": 0.00015390199637023595,
"loss": 2.5269,
"mean_token_accuracy": 0.548294797539711,
"num_tokens": 183644.0,
"step": 133
},
{
"entropy": 1.0965730547904968,
"epoch": 0.969258589511754,
"grad_norm": 0.0,
"learning_rate": 0.00015353901996370236,
"loss": 2.5512,
"mean_token_accuracy": 0.5375129878520966,
"num_tokens": 184988.0,
"step": 134
},
{
"entropy": 1.047539085149765,
"epoch": 0.976491862567812,
"grad_norm": 0.0,
"learning_rate": 0.0001531760435571688,
"loss": 2.4542,
"mean_token_accuracy": 0.5518470257520676,
"num_tokens": 186434.0,
"step": 135
},
{
"entropy": 1.058751568198204,
"epoch": 0.9837251356238698,
"grad_norm": 0.0,
"learning_rate": 0.00015281306715063522,
"loss": 2.4307,
"mean_token_accuracy": 0.5505292564630508,
"num_tokens": 187884.0,
"step": 136
},
{
"entropy": 1.0880784392356873,
"epoch": 0.9909584086799277,
"grad_norm": 0.0,
"learning_rate": 0.00015245009074410162,
"loss": 2.4285,
"mean_token_accuracy": 0.5528350919485092,
"num_tokens": 189290.0,
"step": 137
},
{
"entropy": 1.057303100824356,
"epoch": 0.9981916817359855,
"grad_norm": 0.0,
"learning_rate": 0.00015208711433756806,
"loss": 2.5462,
"mean_token_accuracy": 0.5505049228668213,
"num_tokens": 190628.0,
"step": 138
},
{
"entropy": 1.1205418109893799,
"epoch": 1.0,
"grad_norm": 0.0,
"learning_rate": 0.00015172413793103449,
"loss": 2.7426,
"mean_token_accuracy": 0.518750011920929,
"num_tokens": 190790.0,
"step": 139
},
{
"entropy": 1.0945520102977753,
"epoch": 1.0072332730560578,
"grad_norm": 0.0,
"learning_rate": 0.00015136116152450092,
"loss": 2.4704,
"mean_token_accuracy": 0.5627106875181198,
"num_tokens": 192197.0,
"step": 140
},
{
"entropy": 1.115374892950058,
"epoch": 1.0144665461121158,
"grad_norm": 0.0,
"learning_rate": 0.00015099818511796735,
"loss": 2.5376,
"mean_token_accuracy": 0.5344210714101791,
"num_tokens": 193569.0,
"step": 141
},
{
"entropy": 1.0689078569412231,
"epoch": 1.0216998191681737,
"grad_norm": 0.0,
"learning_rate": 0.00015063520871143378,
"loss": 2.4321,
"mean_token_accuracy": 0.5700321197509766,
"num_tokens": 194987.0,
"step": 142
},
{
"entropy": 1.0969620048999786,
"epoch": 1.0289330922242315,
"grad_norm": 0.0,
"learning_rate": 0.00015027223230490018,
"loss": 2.5325,
"mean_token_accuracy": 0.5415942668914795,
"num_tokens": 196398.0,
"step": 143
},
{
"entropy": 1.0174528360366821,
"epoch": 1.0361663652802893,
"grad_norm": 0.0,
"learning_rate": 0.00014990925589836661,
"loss": 2.4498,
"mean_token_accuracy": 0.5438085496425629,
"num_tokens": 197820.0,
"step": 144
},
{
"entropy": 1.1043200492858887,
"epoch": 1.0433996383363473,
"grad_norm": 0.0,
"learning_rate": 0.00014954627949183305,
"loss": 2.458,
"mean_token_accuracy": 0.5458298474550247,
"num_tokens": 199213.0,
"step": 145
},
{
"entropy": 1.1462195813655853,
"epoch": 1.0506329113924051,
"grad_norm": 0.0,
"learning_rate": 0.00014918330308529948,
"loss": 2.5472,
"mean_token_accuracy": 0.5426424294710159,
"num_tokens": 200561.0,
"step": 146
},
{
"entropy": 1.139638602733612,
"epoch": 1.057866184448463,
"grad_norm": 0.0,
"learning_rate": 0.00014882032667876588,
"loss": 2.4519,
"mean_token_accuracy": 0.5445922464132309,
"num_tokens": 201975.0,
"step": 147
},
{
"entropy": 1.0623964667320251,
"epoch": 1.0650994575045207,
"grad_norm": 0.0,
"learning_rate": 0.0001484573502722323,
"loss": 2.465,
"mean_token_accuracy": 0.5589400827884674,
"num_tokens": 203414.0,
"step": 148
},
{
"entropy": 1.076666384935379,
"epoch": 1.0723327305605788,
"grad_norm": 0.0,
"learning_rate": 0.00014809437386569874,
"loss": 2.5235,
"mean_token_accuracy": 0.5452157557010651,
"num_tokens": 204796.0,
"step": 149
},
{
"entropy": 1.1187179684638977,
"epoch": 1.0795660036166366,
"grad_norm": 0.0,
"learning_rate": 0.00014773139745916515,
"loss": 2.5372,
"mean_token_accuracy": 0.5426393896341324,
"num_tokens": 206181.0,
"step": 150
},
{
"entropy": 1.0679790079593658,
"epoch": 1.0867992766726944,
"grad_norm": 0.0,
"learning_rate": 0.00014736842105263158,
"loss": 2.4031,
"mean_token_accuracy": 0.5628975033760071,
"num_tokens": 207604.0,
"step": 151
},
{
"entropy": 1.1154986917972565,
"epoch": 1.0940325497287522,
"grad_norm": 0.0,
"learning_rate": 0.000147005444646098,
"loss": 2.5325,
"mean_token_accuracy": 0.5442689210176468,
"num_tokens": 208978.0,
"step": 152
},
{
"entropy": 1.0788207352161407,
"epoch": 1.1012658227848102,
"grad_norm": 0.0,
"learning_rate": 0.00014664246823956444,
"loss": 2.4932,
"mean_token_accuracy": 0.5427374988794327,
"num_tokens": 210332.0,
"step": 153
},
{
"entropy": 1.1146452724933624,
"epoch": 1.108499095840868,
"grad_norm": 0.0,
"learning_rate": 0.00014627949183303087,
"loss": 2.5571,
"mean_token_accuracy": 0.5369889438152313,
"num_tokens": 211700.0,
"step": 154
},
{
"entropy": 1.1245636343955994,
"epoch": 1.1157323688969258,
"grad_norm": 0.0,
"learning_rate": 0.0001459165154264973,
"loss": 2.6096,
"mean_token_accuracy": 0.5278495326638222,
"num_tokens": 213050.0,
"step": 155
},
{
"entropy": 1.1087840497493744,
"epoch": 1.1229656419529837,
"grad_norm": 0.0,
"learning_rate": 0.0001455535390199637,
"loss": 2.5262,
"mean_token_accuracy": 0.5457671284675598,
"num_tokens": 214475.0,
"step": 156
},
{
"entropy": 1.0188728719949722,
"epoch": 1.1301989150090417,
"grad_norm": 0.0,
"learning_rate": 0.00014519056261343014,
"loss": 2.4321,
"mean_token_accuracy": 0.5751179605722427,
"num_tokens": 215929.0,
"step": 157
},
{
"entropy": 1.127344325184822,
"epoch": 1.1374321880650995,
"grad_norm": 0.0,
"learning_rate": 0.00014482758620689657,
"loss": 2.6004,
"mean_token_accuracy": 0.5295312106609344,
"num_tokens": 217327.0,
"step": 158
},
{
"entropy": 1.104835420846939,
"epoch": 1.1446654611211573,
"grad_norm": 0.0,
"learning_rate": 0.00014446460980036297,
"loss": 2.4725,
"mean_token_accuracy": 0.5560386776924133,
"num_tokens": 218718.0,
"step": 159
},
{
"entropy": 1.098018765449524,
"epoch": 1.1518987341772151,
"grad_norm": 0.0,
"learning_rate": 0.0001441016333938294,
"loss": 2.5442,
"mean_token_accuracy": 0.5397144109010696,
"num_tokens": 220055.0,
"step": 160
},
{
"entropy": 1.1457242369651794,
"epoch": 1.1591320072332731,
"grad_norm": 0.0,
"learning_rate": 0.00014373865698729584,
"loss": 2.5436,
"mean_token_accuracy": 0.5351956188678741,
"num_tokens": 221397.0,
"step": 161
},
{
"entropy": 1.092559427022934,
"epoch": 1.166365280289331,
"grad_norm": 0.0,
"learning_rate": 0.00014337568058076224,
"loss": 2.4248,
"mean_token_accuracy": 0.5560291260480881,
"num_tokens": 222789.0,
"step": 162
},
{
"entropy": 1.0653438866138458,
"epoch": 1.1735985533453888,
"grad_norm": 0.0,
"learning_rate": 0.00014301270417422867,
"loss": 2.5878,
"mean_token_accuracy": 0.5465674847364426,
"num_tokens": 224159.0,
"step": 163
},
{
"entropy": 1.1752942502498627,
"epoch": 1.1808318264014466,
"grad_norm": 0.0,
"learning_rate": 0.0001426497277676951,
"loss": 2.5277,
"mean_token_accuracy": 0.5325864478945732,
"num_tokens": 225582.0,
"step": 164
},
{
"entropy": 1.0615761578083038,
"epoch": 1.1880650994575046,
"grad_norm": 0.0,
"learning_rate": 0.00014228675136116153,
"loss": 2.4925,
"mean_token_accuracy": 0.5536267906427383,
"num_tokens": 226983.0,
"step": 165
},
{
"entropy": 1.1374837756156921,
"epoch": 1.1952983725135624,
"grad_norm": 0.0,
"learning_rate": 0.00014192377495462796,
"loss": 2.4594,
"mean_token_accuracy": 0.5434800386428833,
"num_tokens": 228399.0,
"step": 166
},
{
"entropy": 1.1201145946979523,
"epoch": 1.2025316455696202,
"grad_norm": 0.0,
"learning_rate": 0.0001415607985480944,
"loss": 2.4985,
"mean_token_accuracy": 0.5561137795448303,
"num_tokens": 229805.0,
"step": 167
},
{
"entropy": 1.1054023802280426,
"epoch": 1.209764918625678,
"grad_norm": 0.0,
"learning_rate": 0.0001411978221415608,
"loss": 2.4873,
"mean_token_accuracy": 0.547326996922493,
"num_tokens": 231177.0,
"step": 168
},
{
"entropy": 1.072383537888527,
"epoch": 1.216998191681736,
"grad_norm": 0.0,
"learning_rate": 0.00014083484573502723,
"loss": 2.4556,
"mean_token_accuracy": 0.5583519786596298,
"num_tokens": 232645.0,
"step": 169
},
{
"entropy": 1.085743099451065,
"epoch": 1.2242314647377939,
"grad_norm": 0.0,
"learning_rate": 0.00014047186932849366,
"loss": 2.5287,
"mean_token_accuracy": 0.5549855530261993,
"num_tokens": 233952.0,
"step": 170
},
{
"entropy": 1.0745936632156372,
"epoch": 1.2314647377938517,
"grad_norm": 0.0,
"learning_rate": 0.00014010889292196007,
"loss": 2.5143,
"mean_token_accuracy": 0.5558921694755554,
"num_tokens": 235287.0,
"step": 171
},
{
"entropy": 1.0693705677986145,
"epoch": 1.2386980108499095,
"grad_norm": 0.0,
"learning_rate": 0.0001397459165154265,
"loss": 2.4938,
"mean_token_accuracy": 0.5452517122030258,
"num_tokens": 236670.0,
"step": 172
},
{
"entropy": 1.135264903306961,
"epoch": 1.2459312839059675,
"grad_norm": 0.0,
"learning_rate": 0.00013938294010889293,
"loss": 2.5618,
"mean_token_accuracy": 0.5449370294809341,
"num_tokens": 238001.0,
"step": 173
},
{
"entropy": 1.0745739191770554,
"epoch": 1.2531645569620253,
"grad_norm": 0.0,
"learning_rate": 0.00013901996370235933,
"loss": 2.4831,
"mean_token_accuracy": 0.5497026294469833,
"num_tokens": 239380.0,
"step": 174
},
{
"entropy": 1.1606217920780182,
"epoch": 1.2603978300180831,
"grad_norm": 0.0,
"learning_rate": 0.00013865698729582576,
"loss": 2.5234,
"mean_token_accuracy": 0.5369236767292023,
"num_tokens": 240671.0,
"step": 175
},
{
"entropy": 1.1321864128112793,
"epoch": 1.267631103074141,
"grad_norm": 0.0,
"learning_rate": 0.00013829401088929222,
"loss": 2.568,
"mean_token_accuracy": 0.5358796268701553,
"num_tokens": 241999.0,
"step": 176
},
{
"entropy": 1.0919426679611206,
"epoch": 1.274864376130199,
"grad_norm": 0.0,
"learning_rate": 0.00013793103448275863,
"loss": 2.4199,
"mean_token_accuracy": 0.5525386333465576,
"num_tokens": 243432.0,
"step": 177
},
{
"entropy": 1.1321356147527695,
"epoch": 1.2820976491862568,
"grad_norm": 0.0,
"learning_rate": 0.00013756805807622506,
"loss": 2.5929,
"mean_token_accuracy": 0.5318445116281509,
"num_tokens": 244799.0,
"step": 178
},
{
"entropy": 1.0925188958644867,
"epoch": 1.2893309222423146,
"grad_norm": 0.0,
"learning_rate": 0.0001372050816696915,
"loss": 2.412,
"mean_token_accuracy": 0.5604078769683838,
"num_tokens": 246201.0,
"step": 179
},
{
"entropy": 1.0641421675682068,
"epoch": 1.2965641952983726,
"grad_norm": 0.0,
"learning_rate": 0.0001368421052631579,
"loss": 2.3832,
"mean_token_accuracy": 0.5665835738182068,
"num_tokens": 247652.0,
"step": 180
},
{
"entropy": 1.1118652522563934,
"epoch": 1.3037974683544304,
"grad_norm": 0.0,
"learning_rate": 0.00013647912885662432,
"loss": 2.5912,
"mean_token_accuracy": 0.5334130972623825,
"num_tokens": 249011.0,
"step": 181
},
{
"entropy": 1.093698412179947,
"epoch": 1.3110307414104883,
"grad_norm": 0.0,
"learning_rate": 0.00013611615245009076,
"loss": 2.4982,
"mean_token_accuracy": 0.5461122542619705,
"num_tokens": 250442.0,
"step": 182
},
{
"entropy": 1.0915465950965881,
"epoch": 1.318264014466546,
"grad_norm": 0.0,
"learning_rate": 0.00013575317604355719,
"loss": 2.5026,
"mean_token_accuracy": 0.5573518574237823,
"num_tokens": 251829.0,
"step": 183
},
{
"entropy": 1.0844445824623108,
"epoch": 1.3254972875226039,
"grad_norm": 0.0,
"learning_rate": 0.0001353901996370236,
"loss": 2.4848,
"mean_token_accuracy": 0.541678175330162,
"num_tokens": 253229.0,
"step": 184
},
{
"entropy": 1.1480746567249298,
"epoch": 1.332730560578662,
"grad_norm": 0.0,
"learning_rate": 0.00013502722323049002,
"loss": 2.5386,
"mean_token_accuracy": 0.5468995273113251,
"num_tokens": 254604.0,
"step": 185
},
{
"entropy": 1.1767261922359467,
"epoch": 1.3399638336347197,
"grad_norm": 0.0,
"learning_rate": 0.00013466424682395645,
"loss": 2.5959,
"mean_token_accuracy": 0.5261930972337723,
"num_tokens": 255900.0,
"step": 186
},
{
"entropy": 1.0286410748958588,
"epoch": 1.3471971066907775,
"grad_norm": 0.0,
"learning_rate": 0.00013430127041742286,
"loss": 2.4155,
"mean_token_accuracy": 0.5538035929203033,
"num_tokens": 257311.0,
"step": 187
},
{
"entropy": 1.1496650278568268,
"epoch": 1.3544303797468356,
"grad_norm": 0.0,
"learning_rate": 0.00013393829401088931,
"loss": 2.5327,
"mean_token_accuracy": 0.5415185838937759,
"num_tokens": 258679.0,
"step": 188
},
{
"entropy": 1.0800335109233856,
"epoch": 1.3616636528028934,
"grad_norm": 0.0,
"learning_rate": 0.00013357531760435572,
"loss": 2.4404,
"mean_token_accuracy": 0.5601710379123688,
"num_tokens": 260035.0,
"step": 189
},
{
"entropy": 1.0977840423583984,
"epoch": 1.3688969258589512,
"grad_norm": 0.0,
"learning_rate": 0.00013321234119782215,
"loss": 2.5612,
"mean_token_accuracy": 0.5392811894416809,
"num_tokens": 261446.0,
"step": 190
},
{
"entropy": 1.0847298502922058,
"epoch": 1.376130198915009,
"grad_norm": 0.0,
"learning_rate": 0.00013284936479128858,
"loss": 2.4941,
"mean_token_accuracy": 0.553534135222435,
"num_tokens": 262823.0,
"step": 191
},
{
"entropy": 1.0621435940265656,
"epoch": 1.3833634719710668,
"grad_norm": 0.0,
"learning_rate": 0.000132486388384755,
"loss": 2.5295,
"mean_token_accuracy": 0.5515837073326111,
"num_tokens": 264213.0,
"step": 192
},
{
"entropy": 1.1017491519451141,
"epoch": 1.3905967450271248,
"grad_norm": 0.0,
"learning_rate": 0.00013212341197822142,
"loss": 2.592,
"mean_token_accuracy": 0.5382756292819977,
"num_tokens": 265526.0,
"step": 193
},
{
"entropy": 1.0822009593248367,
"epoch": 1.3978300180831826,
"grad_norm": 0.0,
"learning_rate": 0.00013176043557168785,
"loss": 2.5066,
"mean_token_accuracy": 0.5384088605642319,
"num_tokens": 266949.0,
"step": 194
},
{
"entropy": 1.1282154023647308,
"epoch": 1.4050632911392404,
"grad_norm": 0.0,
"learning_rate": 0.00013139745916515428,
"loss": 2.5788,
"mean_token_accuracy": 0.5369668304920197,
"num_tokens": 268304.0,
"step": 195
},
{
"entropy": 1.0464816391468048,
"epoch": 1.4122965641952985,
"grad_norm": 0.0,
"learning_rate": 0.00013103448275862068,
"loss": 2.4394,
"mean_token_accuracy": 0.5474424809217453,
"num_tokens": 269668.0,
"step": 196
},
{
"entropy": 1.151181936264038,
"epoch": 1.4195298372513563,
"grad_norm": 0.0,
"learning_rate": 0.00013067150635208711,
"loss": 2.4921,
"mean_token_accuracy": 0.5326793938875198,
"num_tokens": 271048.0,
"step": 197
},
{
"entropy": 1.1585266888141632,
"epoch": 1.426763110307414,
"grad_norm": 0.0,
"learning_rate": 0.00013030852994555355,
"loss": 2.5596,
"mean_token_accuracy": 0.5290036201477051,
"num_tokens": 272423.0,
"step": 198
},
{
"entropy": 1.0876199752092361,
"epoch": 1.433996383363472,
"grad_norm": 0.0,
"learning_rate": 0.00012994555353901995,
"loss": 2.5371,
"mean_token_accuracy": 0.542813628911972,
"num_tokens": 273880.0,
"step": 199
},
{
"entropy": 1.0546641200780869,
"epoch": 1.4412296564195297,
"grad_norm": 0.0,
"learning_rate": 0.0001295825771324864,
"loss": 2.4553,
"mean_token_accuracy": 0.5557200312614441,
"num_tokens": 275289.0,
"step": 200
},
{
"entropy": 1.117970123887062,
"epoch": 1.4484629294755877,
"grad_norm": 0.0,
"learning_rate": 0.00012921960072595284,
"loss": 2.5695,
"mean_token_accuracy": 0.5380512326955795,
"num_tokens": 276656.0,
"step": 201
},
{
"entropy": 1.0969232022762299,
"epoch": 1.4556962025316456,
"grad_norm": 0.0,
"learning_rate": 0.00012885662431941924,
"loss": 2.4635,
"mean_token_accuracy": 0.5425622910261154,
"num_tokens": 278083.0,
"step": 202
},
{
"entropy": 1.0811368227005005,
"epoch": 1.4629294755877034,
"grad_norm": 0.0,
"learning_rate": 0.00012849364791288567,
"loss": 2.5302,
"mean_token_accuracy": 0.5425570607185364,
"num_tokens": 279431.0,
"step": 203
},
{
"entropy": 1.055485188961029,
"epoch": 1.4701627486437614,
"grad_norm": 0.0,
"learning_rate": 0.0001281306715063521,
"loss": 2.4713,
"mean_token_accuracy": 0.5471125394105911,
"num_tokens": 280835.0,
"step": 204
},
{
"entropy": 1.1141844391822815,
"epoch": 1.4773960216998192,
"grad_norm": 0.0,
"learning_rate": 0.0001277676950998185,
"loss": 2.5834,
"mean_token_accuracy": 0.5362391173839569,
"num_tokens": 282200.0,
"step": 205
},
{
"entropy": 1.1039184033870697,
"epoch": 1.484629294755877,
"grad_norm": 0.0,
"learning_rate": 0.00012740471869328494,
"loss": 2.4992,
"mean_token_accuracy": 0.5513607412576675,
"num_tokens": 283567.0,
"step": 206
},
{
"entropy": 1.0966024100780487,
"epoch": 1.4918625678119348,
"grad_norm": 0.0,
"learning_rate": 0.00012704174228675137,
"loss": 2.3996,
"mean_token_accuracy": 0.552715390920639,
"num_tokens": 284967.0,
"step": 207
},
{
"entropy": 1.096381276845932,
"epoch": 1.4990958408679926,
"grad_norm": 0.0,
"learning_rate": 0.00012667876588021778,
"loss": 2.5746,
"mean_token_accuracy": 0.5394180566072464,
"num_tokens": 286329.0,
"step": 208
},
{
"entropy": 1.0701113641262054,
"epoch": 1.5063291139240507,
"grad_norm": 0.0,
"learning_rate": 0.0001263157894736842,
"loss": 2.4981,
"mean_token_accuracy": 0.5468792766332626,
"num_tokens": 287720.0,
"step": 209
},
{
"entropy": 1.1413646340370178,
"epoch": 1.5135623869801085,
"grad_norm": 0.0,
"learning_rate": 0.00012595281306715064,
"loss": 2.5805,
"mean_token_accuracy": 0.533017098903656,
"num_tokens": 289127.0,
"step": 210
},
{
"entropy": 1.0838797986507416,
"epoch": 1.5207956600361663,
"grad_norm": 0.0,
"learning_rate": 0.00012558983666061704,
"loss": 2.4496,
"mean_token_accuracy": 0.565049484372139,
"num_tokens": 290482.0,
"step": 211
},
{
"entropy": 1.0366474837064743,
"epoch": 1.5280289330922243,
"grad_norm": 0.0,
"learning_rate": 0.0001252268602540835,
"loss": 2.5012,
"mean_token_accuracy": 0.5630057752132416,
"num_tokens": 291876.0,
"step": 212
},
{
"entropy": 1.0706925094127655,
"epoch": 1.5352622061482821,
"grad_norm": 0.0,
"learning_rate": 0.00012486388384754993,
"loss": 2.5012,
"mean_token_accuracy": 0.5362317860126495,
"num_tokens": 293306.0,
"step": 213
},
{
"entropy": 1.0606517046689987,
"epoch": 1.54249547920434,
"grad_norm": 0.0,
"learning_rate": 0.00012450090744101634,
"loss": 2.5622,
"mean_token_accuracy": 0.5519603192806244,
"num_tokens": 294662.0,
"step": 214
},
{
"entropy": 1.0365487039089203,
"epoch": 1.549728752260398,
"grad_norm": 0.0,
"learning_rate": 0.00012413793103448277,
"loss": 2.4236,
"mean_token_accuracy": 0.5674052089452744,
"num_tokens": 296087.0,
"step": 215
},
{
"entropy": 1.0755655467510223,
"epoch": 1.5569620253164556,
"grad_norm": 0.0,
"learning_rate": 0.0001237749546279492,
"loss": 2.5169,
"mean_token_accuracy": 0.5508773624897003,
"num_tokens": 297502.0,
"step": 216
},
{
"entropy": 1.1377713978290558,
"epoch": 1.5641952983725136,
"grad_norm": 0.0,
"learning_rate": 0.0001234119782214156,
"loss": 2.5534,
"mean_token_accuracy": 0.5336230993270874,
"num_tokens": 298836.0,
"step": 217
},
{
"entropy": 1.109740287065506,
"epoch": 1.5714285714285714,
"grad_norm": 0.0,
"learning_rate": 0.00012304900181488203,
"loss": 2.7175,
"mean_token_accuracy": 0.5418126434087753,
"num_tokens": 300111.0,
"step": 218
},
{
"entropy": 1.0909195244312286,
"epoch": 1.5786618444846292,
"grad_norm": 0.0,
"learning_rate": 0.00012268602540834846,
"loss": 2.5177,
"mean_token_accuracy": 0.5417619496583939,
"num_tokens": 301501.0,
"step": 219
},
{
"entropy": 1.06816665828228,
"epoch": 1.5858951175406872,
"grad_norm": 0.0,
"learning_rate": 0.0001223230490018149,
"loss": 2.4367,
"mean_token_accuracy": 0.559718519449234,
"num_tokens": 302926.0,
"step": 220
},
{
"entropy": 1.0440122485160828,
"epoch": 1.593128390596745,
"grad_norm": 0.0,
"learning_rate": 0.0001219600725952813,
"loss": 2.4394,
"mean_token_accuracy": 0.5527326017618179,
"num_tokens": 304337.0,
"step": 221
},
{
"entropy": 1.0804894715547562,
"epoch": 1.6003616636528029,
"grad_norm": 0.0,
"learning_rate": 0.00012159709618874773,
"loss": 2.4707,
"mean_token_accuracy": 0.5539764165878296,
"num_tokens": 305707.0,
"step": 222
},
{
"entropy": 1.114003211259842,
"epoch": 1.6075949367088609,
"grad_norm": 0.0,
"learning_rate": 0.00012123411978221418,
"loss": 2.4948,
"mean_token_accuracy": 0.5415279567241669,
"num_tokens": 307138.0,
"step": 223
},
{
"entropy": 1.1211613416671753,
"epoch": 1.6148282097649185,
"grad_norm": 0.0,
"learning_rate": 0.00012087114337568059,
"loss": 2.5239,
"mean_token_accuracy": 0.5385608822107315,
"num_tokens": 308507.0,
"step": 224
},
{
"entropy": 1.1505621373653412,
"epoch": 1.6220614828209765,
"grad_norm": 0.0,
"learning_rate": 0.00012050816696914702,
"loss": 2.527,
"mean_token_accuracy": 0.5417819917201996,
"num_tokens": 309890.0,
"step": 225
},
{
"entropy": 1.1308437585830688,
"epoch": 1.6292947558770343,
"grad_norm": 0.0,
"learning_rate": 0.00012014519056261344,
"loss": 2.5056,
"mean_token_accuracy": 0.5359330922365189,
"num_tokens": 311303.0,
"step": 226
},
{
"entropy": 1.0914883315563202,
"epoch": 1.6365280289330921,
"grad_norm": 0.0,
"learning_rate": 0.00011978221415607986,
"loss": 2.5595,
"mean_token_accuracy": 0.5537950694561005,
"num_tokens": 312697.0,
"step": 227
},
{
"entropy": 1.0523104220628738,
"epoch": 1.6437613019891502,
"grad_norm": 0.0,
"learning_rate": 0.00011941923774954629,
"loss": 2.4534,
"mean_token_accuracy": 0.5497398674488068,
"num_tokens": 314071.0,
"step": 228
},
{
"entropy": 1.1443000137805939,
"epoch": 1.650994575045208,
"grad_norm": 0.0,
"learning_rate": 0.00011905626134301271,
"loss": 2.5725,
"mean_token_accuracy": 0.5432182401418686,
"num_tokens": 315403.0,
"step": 229
},
{
"entropy": 1.0856167376041412,
"epoch": 1.6582278481012658,
"grad_norm": 0.0,
"learning_rate": 0.00011869328493647913,
"loss": 2.5599,
"mean_token_accuracy": 0.5425290018320084,
"num_tokens": 316771.0,
"step": 230
},
{
"entropy": 1.118817538022995,
"epoch": 1.6654611211573238,
"grad_norm": 0.0,
"learning_rate": 0.00011833030852994556,
"loss": 2.5283,
"mean_token_accuracy": 0.544951319694519,
"num_tokens": 318111.0,
"step": 231
},
{
"entropy": 1.0854854881763458,
"epoch": 1.6726943942133814,
"grad_norm": 0.0,
"learning_rate": 0.00011796733212341197,
"loss": 2.577,
"mean_token_accuracy": 0.5448784381151199,
"num_tokens": 319462.0,
"step": 232
},
{
"entropy": 1.1421701908111572,
"epoch": 1.6799276672694394,
"grad_norm": 0.0,
"learning_rate": 0.0001176043557168784,
"loss": 2.5542,
"mean_token_accuracy": 0.5450168401002884,
"num_tokens": 320863.0,
"step": 233
},
{
"entropy": 1.0995429754257202,
"epoch": 1.6871609403254972,
"grad_norm": 0.0,
"learning_rate": 0.00011724137931034482,
"loss": 2.4889,
"mean_token_accuracy": 0.5333838164806366,
"num_tokens": 322273.0,
"step": 234
},
{
"entropy": 1.091221421957016,
"epoch": 1.694394213381555,
"grad_norm": 0.0,
"learning_rate": 0.00011687840290381127,
"loss": 2.5964,
"mean_token_accuracy": 0.5492272824048996,
"num_tokens": 323681.0,
"step": 235
},
{
"entropy": 1.089509442448616,
"epoch": 1.701627486437613,
"grad_norm": 0.0,
"learning_rate": 0.00011651542649727769,
"loss": 2.4769,
"mean_token_accuracy": 0.5460635870695114,
"num_tokens": 325074.0,
"step": 236
},
{
"entropy": 1.1351898610591888,
"epoch": 1.7088607594936709,
"grad_norm": 0.0,
"learning_rate": 0.00011615245009074412,
"loss": 2.4841,
"mean_token_accuracy": 0.5464938133955002,
"num_tokens": 326448.0,
"step": 237
},
{
"entropy": 1.065805822610855,
"epoch": 1.7160940325497287,
"grad_norm": 0.0,
"learning_rate": 0.00011578947368421053,
"loss": 2.5789,
"mean_token_accuracy": 0.5493966788053513,
"num_tokens": 327816.0,
"step": 238
},
{
"entropy": 1.1061568558216095,
"epoch": 1.7233273056057867,
"grad_norm": 0.0,
"learning_rate": 0.00011542649727767697,
"loss": 2.5684,
"mean_token_accuracy": 0.5350202769041061,
"num_tokens": 329183.0,
"step": 239
},
{
"entropy": 1.1101190447807312,
"epoch": 1.7305605786618445,
"grad_norm": 0.0,
"learning_rate": 0.00011506352087114338,
"loss": 2.544,
"mean_token_accuracy": 0.5512830317020416,
"num_tokens": 330564.0,
"step": 240
},
{
"entropy": 1.1330992877483368,
"epoch": 1.7377938517179023,
"grad_norm": 0.0,
"learning_rate": 0.0001147005444646098,
"loss": 2.5997,
"mean_token_accuracy": 0.5283233672380447,
"num_tokens": 331896.0,
"step": 241
},
{
"entropy": 1.088073879480362,
"epoch": 1.7450271247739604,
"grad_norm": 0.0,
"learning_rate": 0.00011433756805807623,
"loss": 2.5063,
"mean_token_accuracy": 0.5544091314077377,
"num_tokens": 333237.0,
"step": 242
},
{
"entropy": 1.1106895804405212,
"epoch": 1.752260397830018,
"grad_norm": 0.0,
"learning_rate": 0.00011397459165154265,
"loss": 2.5657,
"mean_token_accuracy": 0.5488100796937943,
"num_tokens": 334592.0,
"step": 243
},
{
"entropy": 1.0510992854833603,
"epoch": 1.759493670886076,
"grad_norm": 0.0,
"learning_rate": 0.00011361161524500907,
"loss": 2.5307,
"mean_token_accuracy": 0.5456000864505768,
"num_tokens": 336006.0,
"step": 244
},
{
"entropy": 1.0965907573699951,
"epoch": 1.7667269439421338,
"grad_norm": 0.0,
"learning_rate": 0.0001132486388384755,
"loss": 2.4492,
"mean_token_accuracy": 0.5489227473735809,
"num_tokens": 337410.0,
"step": 245
},
{
"entropy": 1.0655700862407684,
"epoch": 1.7739602169981916,
"grad_norm": 0.0,
"learning_rate": 0.00011288566243194192,
"loss": 2.4842,
"mean_token_accuracy": 0.5555550754070282,
"num_tokens": 338810.0,
"step": 246
},
{
"entropy": 1.1214852780103683,
"epoch": 1.7811934900542497,
"grad_norm": 0.0,
"learning_rate": 0.00011252268602540836,
"loss": 2.4906,
"mean_token_accuracy": 0.5389305800199509,
"num_tokens": 340273.0,
"step": 247
},
{
"entropy": 1.1377345025539398,
"epoch": 1.7884267631103075,
"grad_norm": 0.0,
"learning_rate": 0.00011215970961887479,
"loss": 2.5227,
"mean_token_accuracy": 0.530749037861824,
"num_tokens": 341670.0,
"step": 248
},
{
"entropy": 1.1064392030239105,
"epoch": 1.7956600361663653,
"grad_norm": 0.0,
"learning_rate": 0.00011179673321234121,
"loss": 2.5451,
"mean_token_accuracy": 0.5409312695264816,
"num_tokens": 343002.0,
"step": 249
},
{
"entropy": 1.115884155035019,
"epoch": 1.8028933092224233,
"grad_norm": 0.0,
"learning_rate": 0.00011143375680580763,
"loss": 2.5614,
"mean_token_accuracy": 0.5375799685716629,
"num_tokens": 344415.0,
"step": 250
},
{
"entropy": 1.1006877422332764,
"epoch": 1.810126582278481,
"grad_norm": 0.0,
"learning_rate": 0.00011107078039927406,
"loss": 2.4458,
"mean_token_accuracy": 0.5491993427276611,
"num_tokens": 345822.0,
"step": 251
},
{
"entropy": 1.0959600508213043,
"epoch": 1.817359855334539,
"grad_norm": 0.0,
"learning_rate": 0.00011070780399274048,
"loss": 2.4732,
"mean_token_accuracy": 0.5487753301858902,
"num_tokens": 347197.0,
"step": 252
},
{
"entropy": 1.069685012102127,
"epoch": 1.8245931283905967,
"grad_norm": 0.0,
"learning_rate": 0.0001103448275862069,
"loss": 2.4234,
"mean_token_accuracy": 0.5564672946929932,
"num_tokens": 348560.0,
"step": 253
},
{
"entropy": 1.1855289340019226,
"epoch": 1.8318264014466545,
"grad_norm": 0.0,
"learning_rate": 0.00010998185117967332,
"loss": 2.4667,
"mean_token_accuracy": 0.5307918041944504,
"num_tokens": 349915.0,
"step": 254
},
{
"entropy": 1.0919787287712097,
"epoch": 1.8390596745027126,
"grad_norm": 0.0,
"learning_rate": 0.00010961887477313974,
"loss": 2.452,
"mean_token_accuracy": 0.5391028076410294,
"num_tokens": 351361.0,
"step": 255
},
{
"entropy": 1.0317457616329193,
"epoch": 1.8462929475587704,
"grad_norm": 0.0,
"learning_rate": 0.00010925589836660617,
"loss": 2.483,
"mean_token_accuracy": 0.5674040019512177,
"num_tokens": 352716.0,
"step": 256
},
{
"entropy": 1.1169418692588806,
"epoch": 1.8535262206148282,
"grad_norm": 0.0,
"learning_rate": 0.00010889292196007259,
"loss": 2.5078,
"mean_token_accuracy": 0.5412558689713478,
"num_tokens": 354073.0,
"step": 257
},
{
"entropy": 1.0786909759044647,
"epoch": 1.8607594936708862,
"grad_norm": 0.0,
"learning_rate": 0.00010852994555353901,
"loss": 2.447,
"mean_token_accuracy": 0.5568330138921738,
"num_tokens": 355466.0,
"step": 258
},
{
"entropy": 1.1377921402454376,
"epoch": 1.8679927667269438,
"grad_norm": 0.0,
"learning_rate": 0.00010816696914700545,
"loss": 2.5446,
"mean_token_accuracy": 0.5410029292106628,
"num_tokens": 356819.0,
"step": 259
},
{
"entropy": 1.0561549663543701,
"epoch": 1.8752260397830018,
"grad_norm": 0.0,
"learning_rate": 0.00010780399274047188,
"loss": 2.4241,
"mean_token_accuracy": 0.562323585152626,
"num_tokens": 358205.0,
"step": 260
},
{
"entropy": 1.1019063591957092,
"epoch": 1.8824593128390597,
"grad_norm": 0.0,
"learning_rate": 0.0001074410163339383,
"loss": 2.4751,
"mean_token_accuracy": 0.549642950296402,
"num_tokens": 359634.0,
"step": 261
},
{
"entropy": 1.1285791844129562,
"epoch": 1.8896925858951175,
"grad_norm": 0.0,
"learning_rate": 0.00010707803992740473,
"loss": 2.5493,
"mean_token_accuracy": 0.526919350028038,
"num_tokens": 360970.0,
"step": 262
},
{
"entropy": 1.0747187435626984,
"epoch": 1.8969258589511755,
"grad_norm": 0.0,
"learning_rate": 0.00010671506352087115,
"loss": 2.5552,
"mean_token_accuracy": 0.5351353734731674,
"num_tokens": 362305.0,
"step": 263
},
{
"entropy": 1.0817217528820038,
"epoch": 1.9041591320072333,
"grad_norm": 0.0,
"learning_rate": 0.00010635208711433757,
"loss": 2.5681,
"mean_token_accuracy": 0.5475313067436218,
"num_tokens": 363677.0,
"step": 264
},
{
"entropy": 1.132976919412613,
"epoch": 1.9113924050632911,
"grad_norm": 0.0,
"learning_rate": 0.000105989110707804,
"loss": 2.5469,
"mean_token_accuracy": 0.5420292168855667,
"num_tokens": 365019.0,
"step": 265
},
{
"entropy": 1.1142061054706573,
"epoch": 1.9186256781193491,
"grad_norm": 0.0,
"learning_rate": 0.00010562613430127042,
"loss": 2.5654,
"mean_token_accuracy": 0.5359852463006973,
"num_tokens": 366376.0,
"step": 266
},
{
"entropy": 1.1108311414718628,
"epoch": 1.9258589511754067,
"grad_norm": 0.0,
"learning_rate": 0.00010526315789473685,
"loss": 2.5543,
"mean_token_accuracy": 0.5375416427850723,
"num_tokens": 367689.0,
"step": 267
},
{
"entropy": 1.0747735798358917,
"epoch": 1.9330922242314648,
"grad_norm": 0.0,
"learning_rate": 0.00010490018148820327,
"loss": 2.4855,
"mean_token_accuracy": 0.5498427450656891,
"num_tokens": 369062.0,
"step": 268
},
{
"entropy": 1.067014902830124,
"epoch": 1.9403254972875226,
"grad_norm": 0.0,
"learning_rate": 0.00010453720508166968,
"loss": 2.501,
"mean_token_accuracy": 0.5583888292312622,
"num_tokens": 370453.0,
"step": 269
},
{
"entropy": 1.0880665332078934,
"epoch": 1.9475587703435804,
"grad_norm": 0.0,
"learning_rate": 0.00010417422867513613,
"loss": 2.533,
"mean_token_accuracy": 0.5387668460607529,
"num_tokens": 371819.0,
"step": 270
},
{
"entropy": 1.0962998867034912,
"epoch": 1.9547920433996384,
"grad_norm": 0.0,
"learning_rate": 0.00010381125226860256,
"loss": 2.5378,
"mean_token_accuracy": 0.5404876172542572,
"num_tokens": 373163.0,
"step": 271
},
{
"entropy": 1.1243647336959839,
"epoch": 1.9620253164556962,
"grad_norm": 0.0,
"learning_rate": 0.00010344827586206898,
"loss": 2.4849,
"mean_token_accuracy": 0.5434926003217697,
"num_tokens": 374562.0,
"step": 272
},
{
"entropy": 1.1283120214939117,
"epoch": 1.969258589511754,
"grad_norm": 0.0,
"learning_rate": 0.0001030852994555354,
"loss": 2.5624,
"mean_token_accuracy": 0.5324713289737701,
"num_tokens": 375913.0,
"step": 273
},
{
"entropy": 1.1570648550987244,
"epoch": 1.976491862567812,
"grad_norm": 0.0,
"learning_rate": 0.00010272232304900183,
"loss": 2.5273,
"mean_token_accuracy": 0.5250543802976608,
"num_tokens": 377318.0,
"step": 274
},
{
"entropy": 1.1177105605602264,
"epoch": 1.9837251356238697,
"grad_norm": 0.0,
"learning_rate": 0.00010235934664246824,
"loss": 2.5293,
"mean_token_accuracy": 0.5483842194080353,
"num_tokens": 378658.0,
"step": 275
},
{
"entropy": 1.1408209800720215,
"epoch": 1.9909584086799277,
"grad_norm": 0.0,
"learning_rate": 0.00010199637023593467,
"loss": 2.5535,
"mean_token_accuracy": 0.5317675769329071,
"num_tokens": 380010.0,
"step": 276
},
{
"entropy": 1.0788544714450836,
"epoch": 1.9981916817359855,
"grad_norm": 0.0,
"learning_rate": 0.00010163339382940109,
"loss": 2.5116,
"mean_token_accuracy": 0.5480602532625198,
"num_tokens": 381410.0,
"step": 277
},
{
"entropy": 1.132047176361084,
"epoch": 2.0,
"grad_norm": 0.0,
"learning_rate": 0.00010127041742286751,
"loss": 2.3664,
"mean_token_accuracy": 0.5952380895614624,
"num_tokens": 381580.0,
"step": 278
},
{
"entropy": 1.1153302490711212,
"epoch": 2.007233273056058,
"grad_norm": 0.0,
"learning_rate": 0.00010090744101633394,
"loss": 2.5106,
"mean_token_accuracy": 0.5436052531003952,
"num_tokens": 382940.0,
"step": 279
},
{
"entropy": 1.1441259980201721,
"epoch": 2.0144665461121156,
"grad_norm": 0.0,
"learning_rate": 0.00010054446460980036,
"loss": 2.6192,
"mean_token_accuracy": 0.5275150388479233,
"num_tokens": 384262.0,
"step": 280
},
{
"entropy": 1.1141088157892227,
"epoch": 2.0216998191681737,
"grad_norm": 0.0,
"learning_rate": 0.00010018148820326678,
"loss": 2.5423,
"mean_token_accuracy": 0.5334936380386353,
"num_tokens": 385654.0,
"step": 281
},
{
"entropy": 1.1348835080862045,
"epoch": 2.0289330922242317,
"grad_norm": 0.0,
"learning_rate": 9.981851179673322e-05,
"loss": 2.523,
"mean_token_accuracy": 0.5435043275356293,
"num_tokens": 386982.0,
"step": 282
},
{
"entropy": 1.0941515564918518,
"epoch": 2.0361663652802893,
"grad_norm": 0.0,
"learning_rate": 9.945553539019964e-05,
"loss": 2.4501,
"mean_token_accuracy": 0.5536217093467712,
"num_tokens": 388414.0,
"step": 283
},
{
"entropy": 1.0650618076324463,
"epoch": 2.0433996383363473,
"grad_norm": 0.0,
"learning_rate": 9.909255898366606e-05,
"loss": 2.5364,
"mean_token_accuracy": 0.5536990314722061,
"num_tokens": 389792.0,
"step": 284
},
{
"entropy": 1.2052790224552155,
"epoch": 2.050632911392405,
"grad_norm": 0.0,
"learning_rate": 9.87295825771325e-05,
"loss": 2.6587,
"mean_token_accuracy": 0.5142560675740242,
"num_tokens": 391102.0,
"step": 285
},
{
"entropy": 1.0709343552589417,
"epoch": 2.057866184448463,
"grad_norm": 0.0,
"learning_rate": 9.836660617059892e-05,
"loss": 2.4229,
"mean_token_accuracy": 0.5499039888381958,
"num_tokens": 392525.0,
"step": 286
},
{
"entropy": 1.0422286242246628,
"epoch": 2.065099457504521,
"grad_norm": 0.0,
"learning_rate": 9.800362976406534e-05,
"loss": 2.5312,
"mean_token_accuracy": 0.5531226098537445,
"num_tokens": 393832.0,
"step": 287
},
{
"entropy": 1.1782008707523346,
"epoch": 2.0723327305605785,
"grad_norm": 0.0,
"learning_rate": 9.764065335753177e-05,
"loss": 2.4966,
"mean_token_accuracy": 0.5358275324106216,
"num_tokens": 395262.0,
"step": 288
},
{
"entropy": 1.075025349855423,
"epoch": 2.0795660036166366,
"grad_norm": 0.0,
"learning_rate": 9.727767695099818e-05,
"loss": 2.5047,
"mean_token_accuracy": 0.5378954857587814,
"num_tokens": 396636.0,
"step": 289
},
{
"entropy": 1.1651588082313538,
"epoch": 2.0867992766726946,
"grad_norm": 0.0,
"learning_rate": 9.691470054446462e-05,
"loss": 2.5779,
"mean_token_accuracy": 0.5389810055494308,
"num_tokens": 397961.0,
"step": 290
},
{
"entropy": 1.1003607213497162,
"epoch": 2.094032549728752,
"grad_norm": 0.0,
"learning_rate": 9.655172413793105e-05,
"loss": 2.4637,
"mean_token_accuracy": 0.5527941435575485,
"num_tokens": 399350.0,
"step": 291
},
{
"entropy": 1.0847670435905457,
"epoch": 2.1012658227848102,
"grad_norm": 0.0,
"learning_rate": 9.618874773139746e-05,
"loss": 2.5583,
"mean_token_accuracy": 0.5427492707967758,
"num_tokens": 400728.0,
"step": 292
},
{
"entropy": 1.0903814435005188,
"epoch": 2.108499095840868,
"grad_norm": 0.0,
"learning_rate": 9.58257713248639e-05,
"loss": 2.5381,
"mean_token_accuracy": 0.5464666932821274,
"num_tokens": 402105.0,
"step": 293
},
{
"entropy": 1.0938865840435028,
"epoch": 2.115732368896926,
"grad_norm": 0.0,
"learning_rate": 9.546279491833031e-05,
"loss": 2.5563,
"mean_token_accuracy": 0.5378870666027069,
"num_tokens": 403436.0,
"step": 294
},
{
"entropy": 1.1337309926748276,
"epoch": 2.122965641952984,
"grad_norm": 0.0,
"learning_rate": 9.509981851179673e-05,
"loss": 2.5028,
"mean_token_accuracy": 0.537694551050663,
"num_tokens": 404762.0,
"step": 295
},
{
"entropy": 1.0870742499828339,
"epoch": 2.1301989150090415,
"grad_norm": 0.0,
"learning_rate": 9.473684210526316e-05,
"loss": 2.4864,
"mean_token_accuracy": 0.5463483482599258,
"num_tokens": 406079.0,
"step": 296
},
{
"entropy": 1.113557517528534,
"epoch": 2.1374321880650995,
"grad_norm": 0.0,
"learning_rate": 9.437386569872959e-05,
"loss": 2.4162,
"mean_token_accuracy": 0.5570182800292969,
"num_tokens": 407484.0,
"step": 297
},
{
"entropy": 1.0499663054943085,
"epoch": 2.1446654611211575,
"grad_norm": 0.0,
"learning_rate": 9.401088929219601e-05,
"loss": 2.4024,
"mean_token_accuracy": 0.5615633577108383,
"num_tokens": 408928.0,
"step": 298
},
{
"entropy": 1.1128461360931396,
"epoch": 2.151898734177215,
"grad_norm": 0.0,
"learning_rate": 9.364791288566244e-05,
"loss": 2.4792,
"mean_token_accuracy": 0.545862227678299,
"num_tokens": 410349.0,
"step": 299
},
{
"entropy": 1.0802654325962067,
"epoch": 2.159132007233273,
"grad_norm": 0.0,
"learning_rate": 9.328493647912886e-05,
"loss": 2.5234,
"mean_token_accuracy": 0.541241779923439,
"num_tokens": 411719.0,
"step": 300
},
{
"entropy": 1.074128895998001,
"epoch": 2.1663652802893307,
"grad_norm": 0.0,
"learning_rate": 9.292196007259528e-05,
"loss": 2.4736,
"mean_token_accuracy": 0.5454134047031403,
"num_tokens": 413112.0,
"step": 301
},
{
"entropy": 1.0438470542430878,
"epoch": 2.1735985533453888,
"grad_norm": 0.0,
"learning_rate": 9.255898366606171e-05,
"loss": 2.4601,
"mean_token_accuracy": 0.5526005625724792,
"num_tokens": 414468.0,
"step": 302
},
{
"entropy": 1.048665851354599,
"epoch": 2.180831826401447,
"grad_norm": 0.0,
"learning_rate": 9.219600725952814e-05,
"loss": 2.5202,
"mean_token_accuracy": 0.5461462587118149,
"num_tokens": 415831.0,
"step": 303
},
{
"entropy": 1.008974403142929,
"epoch": 2.1880650994575044,
"grad_norm": 0.0,
"learning_rate": 9.183303085299456e-05,
"loss": 2.4394,
"mean_token_accuracy": 0.5644627660512924,
"num_tokens": 417221.0,
"step": 304
},
{
"entropy": 1.1043965220451355,
"epoch": 2.1952983725135624,
"grad_norm": 0.0,
"learning_rate": 9.147005444646099e-05,
"loss": 2.5703,
"mean_token_accuracy": 0.545972928404808,
"num_tokens": 418583.0,
"step": 305
},
{
"entropy": 1.0962344855070114,
"epoch": 2.2025316455696204,
"grad_norm": 0.0,
"learning_rate": 9.11070780399274e-05,
"loss": 2.4894,
"mean_token_accuracy": 0.5580534338951111,
"num_tokens": 419865.0,
"step": 306
},
{
"entropy": 1.1428653001785278,
"epoch": 2.209764918625678,
"grad_norm": 0.0,
"learning_rate": 9.074410163339382e-05,
"loss": 2.5941,
"mean_token_accuracy": 0.5405401438474655,
"num_tokens": 421190.0,
"step": 307
},
{
"entropy": 1.0979954898357391,
"epoch": 2.216998191681736,
"grad_norm": 0.0,
"learning_rate": 9.038112522686027e-05,
"loss": 2.5265,
"mean_token_accuracy": 0.530515693128109,
"num_tokens": 422593.0,
"step": 308
},
{
"entropy": 1.0707228779792786,
"epoch": 2.2242314647377937,
"grad_norm": 0.0,
"learning_rate": 9.001814882032669e-05,
"loss": 2.4784,
"mean_token_accuracy": 0.5580956488847733,
"num_tokens": 424028.0,
"step": 309
},
{
"entropy": 1.139556735754013,
"epoch": 2.2314647377938517,
"grad_norm": 0.0,
"learning_rate": 8.96551724137931e-05,
"loss": 2.5697,
"mean_token_accuracy": 0.532667800784111,
"num_tokens": 425381.0,
"step": 310
},
{
"entropy": 1.108998566865921,
"epoch": 2.2386980108499097,
"grad_norm": 0.0,
"learning_rate": 8.929219600725953e-05,
"loss": 2.5312,
"mean_token_accuracy": 0.5356175154447556,
"num_tokens": 426797.0,
"step": 311
},
{
"entropy": 1.1161520779132843,
"epoch": 2.2459312839059673,
"grad_norm": 0.0,
"learning_rate": 8.892921960072595e-05,
"loss": 2.5586,
"mean_token_accuracy": 0.542470321059227,
"num_tokens": 428147.0,
"step": 312
},
{
"entropy": 1.0842461287975311,
"epoch": 2.2531645569620253,
"grad_norm": 0.0,
"learning_rate": 8.856624319419238e-05,
"loss": 2.5123,
"mean_token_accuracy": 0.5529365092515945,
"num_tokens": 429559.0,
"step": 313
},
{
"entropy": 1.1063741445541382,
"epoch": 2.2603978300180834,
"grad_norm": 0.0,
"learning_rate": 8.820326678765881e-05,
"loss": 2.6709,
"mean_token_accuracy": 0.527004636824131,
"num_tokens": 430920.0,
"step": 314
},
{
"entropy": 1.0897562205791473,
"epoch": 2.267631103074141,
"grad_norm": 0.0,
"learning_rate": 8.784029038112523e-05,
"loss": 2.4544,
"mean_token_accuracy": 0.5409349501132965,
"num_tokens": 432317.0,
"step": 315
},
{
"entropy": 1.0743544101715088,
"epoch": 2.274864376130199,
"grad_norm": 0.0,
"learning_rate": 8.747731397459166e-05,
"loss": 2.5057,
"mean_token_accuracy": 0.5578703433275223,
"num_tokens": 433677.0,
"step": 316
},
{
"entropy": 1.116711288690567,
"epoch": 2.282097649186257,
"grad_norm": 0.0,
"learning_rate": 8.711433756805808e-05,
"loss": 2.5612,
"mean_token_accuracy": 0.5628638714551926,
"num_tokens": 435033.0,
"step": 317
},
{
"entropy": 1.0676509886980057,
"epoch": 2.2893309222423146,
"grad_norm": 0.0,
"learning_rate": 8.67513611615245e-05,
"loss": 2.5598,
"mean_token_accuracy": 0.5456564128398895,
"num_tokens": 436427.0,
"step": 318
},
{
"entropy": 1.0859863460063934,
"epoch": 2.2965641952983726,
"grad_norm": 0.0,
"learning_rate": 8.638838475499093e-05,
"loss": 2.4101,
"mean_token_accuracy": 0.5709582716226578,
"num_tokens": 437883.0,
"step": 319
},
{
"entropy": 1.0581027567386627,
"epoch": 2.3037974683544302,
"grad_norm": 0.0,
"learning_rate": 8.602540834845736e-05,
"loss": 2.5804,
"mean_token_accuracy": 0.5438152998685837,
"num_tokens": 439239.0,
"step": 320
},
{
"entropy": 1.06120565533638,
"epoch": 2.3110307414104883,
"grad_norm": 0.0,
"learning_rate": 8.566243194192378e-05,
"loss": 2.5033,
"mean_token_accuracy": 0.5619644969701767,
"num_tokens": 440618.0,
"step": 321
},
{
"entropy": 1.0986764430999756,
"epoch": 2.3182640144665463,
"grad_norm": 0.0,
"learning_rate": 8.529945553539021e-05,
"loss": 2.5249,
"mean_token_accuracy": 0.5383182018995285,
"num_tokens": 441969.0,
"step": 322
},
{
"entropy": 1.0080211162567139,
"epoch": 2.325497287522604,
"grad_norm": 0.0,
"learning_rate": 8.493647912885663e-05,
"loss": 2.4389,
"mean_token_accuracy": 0.5622601956129074,
"num_tokens": 443374.0,
"step": 323
},
{
"entropy": 1.0605318397283554,
"epoch": 2.332730560578662,
"grad_norm": 0.0,
"learning_rate": 8.457350272232304e-05,
"loss": 2.4748,
"mean_token_accuracy": 0.5619993209838867,
"num_tokens": 444719.0,
"step": 324
},
{
"entropy": 1.0709475874900818,
"epoch": 2.3399638336347195,
"grad_norm": 0.0,
"learning_rate": 8.421052631578948e-05,
"loss": 2.5582,
"mean_token_accuracy": 0.5409077703952789,
"num_tokens": 446087.0,
"step": 325
},
{
"entropy": 1.1008452475070953,
"epoch": 2.3471971066907775,
"grad_norm": 0.0,
"learning_rate": 8.38475499092559e-05,
"loss": 2.4498,
"mean_token_accuracy": 0.5474594086408615,
"num_tokens": 447476.0,
"step": 326
},
{
"entropy": 1.0892058312892914,
"epoch": 2.3544303797468356,
"grad_norm": 0.0,
"learning_rate": 8.348457350272232e-05,
"loss": 2.3334,
"mean_token_accuracy": 0.5688722282648087,
"num_tokens": 448884.0,
"step": 327
},
{
"entropy": 1.1114183962345123,
"epoch": 2.361663652802893,
"grad_norm": 0.0,
"learning_rate": 8.312159709618876e-05,
"loss": 2.4905,
"mean_token_accuracy": 0.5403403639793396,
"num_tokens": 450339.0,
"step": 328
},
{
"entropy": 1.117778092622757,
"epoch": 2.368896925858951,
"grad_norm": 0.0,
"learning_rate": 8.275862068965517e-05,
"loss": 2.4494,
"mean_token_accuracy": 0.5517453998327255,
"num_tokens": 451759.0,
"step": 329
},
{
"entropy": 1.0825735926628113,
"epoch": 2.376130198915009,
"grad_norm": 0.0,
"learning_rate": 8.23956442831216e-05,
"loss": 2.5498,
"mean_token_accuracy": 0.5487709194421768,
"num_tokens": 453123.0,
"step": 330
},
{
"entropy": 1.0718315988779068,
"epoch": 2.383363471971067,
"grad_norm": 0.0,
"learning_rate": 8.203266787658802e-05,
"loss": 2.4747,
"mean_token_accuracy": 0.5481411963701248,
"num_tokens": 454529.0,
"step": 331
},
{
"entropy": 1.0700356364250183,
"epoch": 2.390596745027125,
"grad_norm": 0.0,
"learning_rate": 8.166969147005445e-05,
"loss": 2.4382,
"mean_token_accuracy": 0.5582916736602783,
"num_tokens": 455969.0,
"step": 332
},
{
"entropy": 1.1482441425323486,
"epoch": 2.397830018083183,
"grad_norm": 0.0,
"learning_rate": 8.130671506352088e-05,
"loss": 2.5565,
"mean_token_accuracy": 0.5328105837106705,
"num_tokens": 457364.0,
"step": 333
},
{
"entropy": 1.0800821483135223,
"epoch": 2.4050632911392404,
"grad_norm": 0.0,
"learning_rate": 8.09437386569873e-05,
"loss": 2.5446,
"mean_token_accuracy": 0.5423636585474014,
"num_tokens": 458716.0,
"step": 334
},
{
"entropy": 1.1315284073352814,
"epoch": 2.4122965641952985,
"grad_norm": 0.0,
"learning_rate": 8.058076225045372e-05,
"loss": 2.4926,
"mean_token_accuracy": 0.5568676143884659,
"num_tokens": 460055.0,
"step": 335
},
{
"entropy": 1.1484705805778503,
"epoch": 2.419529837251356,
"grad_norm": 0.0,
"learning_rate": 8.021778584392015e-05,
"loss": 2.5977,
"mean_token_accuracy": 0.524099811911583,
"num_tokens": 461402.0,
"step": 336
},
{
"entropy": 1.1127368807792664,
"epoch": 2.426763110307414,
"grad_norm": 0.0,
"learning_rate": 7.985480943738657e-05,
"loss": 2.6225,
"mean_token_accuracy": 0.5300293117761612,
"num_tokens": 462757.0,
"step": 337
},
{
"entropy": 1.08088980615139,
"epoch": 2.433996383363472,
"grad_norm": 0.0,
"learning_rate": 7.9491833030853e-05,
"loss": 2.4575,
"mean_token_accuracy": 0.5447369813919067,
"num_tokens": 464138.0,
"step": 338
},
{
"entropy": 1.0856578946113586,
"epoch": 2.4412296564195297,
"grad_norm": 0.0,
"learning_rate": 7.912885662431943e-05,
"loss": 2.4591,
"mean_token_accuracy": 0.5506195574998856,
"num_tokens": 465536.0,
"step": 339
},
{
"entropy": 1.090321958065033,
"epoch": 2.4484629294755877,
"grad_norm": 0.0,
"learning_rate": 7.876588021778585e-05,
"loss": 2.5399,
"mean_token_accuracy": 0.5420294851064682,
"num_tokens": 466896.0,
"step": 340
},
{
"entropy": 1.1008956581354141,
"epoch": 2.4556962025316453,
"grad_norm": 0.0,
"learning_rate": 7.840290381125227e-05,
"loss": 2.609,
"mean_token_accuracy": 0.541169598698616,
"num_tokens": 468219.0,
"step": 341
},
{
"entropy": 1.1546584069728851,
"epoch": 2.4629294755877034,
"grad_norm": 0.0,
"learning_rate": 7.80399274047187e-05,
"loss": 2.6097,
"mean_token_accuracy": 0.530337005853653,
"num_tokens": 469569.0,
"step": 342
},
{
"entropy": 1.1146379113197327,
"epoch": 2.4701627486437614,
"grad_norm": 0.0,
"learning_rate": 7.767695099818511e-05,
"loss": 2.4637,
"mean_token_accuracy": 0.5511259138584137,
"num_tokens": 470954.0,
"step": 343
},
{
"entropy": 1.1002293676137924,
"epoch": 2.477396021699819,
"grad_norm": 0.0,
"learning_rate": 7.731397459165155e-05,
"loss": 2.4593,
"mean_token_accuracy": 0.5492667853832245,
"num_tokens": 472325.0,
"step": 344
},
{
"entropy": 1.0669283270835876,
"epoch": 2.484629294755877,
"grad_norm": 0.0,
"learning_rate": 7.695099818511798e-05,
"loss": 2.414,
"mean_token_accuracy": 0.5588005930185318,
"num_tokens": 473725.0,
"step": 345
},
{
"entropy": 1.0927933007478714,
"epoch": 2.491862567811935,
"grad_norm": 0.0,
"learning_rate": 7.65880217785844e-05,
"loss": 2.5172,
"mean_token_accuracy": 0.544892281293869,
"num_tokens": 475106.0,
"step": 346
},
{
"entropy": 1.111391931772232,
"epoch": 2.4990958408679926,
"grad_norm": 0.0,
"learning_rate": 7.622504537205081e-05,
"loss": 2.469,
"mean_token_accuracy": 0.5566596537828445,
"num_tokens": 476487.0,
"step": 347
},
{
"entropy": 1.0637227594852448,
"epoch": 2.5063291139240507,
"grad_norm": 0.0,
"learning_rate": 7.586206896551724e-05,
"loss": 2.4938,
"mean_token_accuracy": 0.5464789718389511,
"num_tokens": 477906.0,
"step": 348
},
{
"entropy": 1.137012243270874,
"epoch": 2.5135623869801087,
"grad_norm": 0.0,
"learning_rate": 7.549909255898367e-05,
"loss": 2.5766,
"mean_token_accuracy": 0.5444240942597389,
"num_tokens": 479276.0,
"step": 349
},
{
"entropy": 1.11709363758564,
"epoch": 2.5207956600361663,
"grad_norm": 0.0,
"learning_rate": 7.513611615245009e-05,
"loss": 2.57,
"mean_token_accuracy": 0.5383694916963577,
"num_tokens": 480600.0,
"step": 350
},
{
"entropy": 1.1331063508987427,
"epoch": 2.5280289330922243,
"grad_norm": 0.0,
"learning_rate": 7.477313974591652e-05,
"loss": 2.4282,
"mean_token_accuracy": 0.5514472872018814,
"num_tokens": 481990.0,
"step": 351
},
{
"entropy": 1.115255892276764,
"epoch": 2.535262206148282,
"grad_norm": 0.0,
"learning_rate": 7.441016333938294e-05,
"loss": 2.5939,
"mean_token_accuracy": 0.527045726776123,
"num_tokens": 483334.0,
"step": 352
},
{
"entropy": 1.123845636844635,
"epoch": 2.54249547920434,
"grad_norm": 0.0,
"learning_rate": 7.404718693284937e-05,
"loss": 2.487,
"mean_token_accuracy": 0.556125819683075,
"num_tokens": 484723.0,
"step": 353
},
{
"entropy": 1.097337931394577,
"epoch": 2.549728752260398,
"grad_norm": 0.0,
"learning_rate": 7.368421052631579e-05,
"loss": 2.4926,
"mean_token_accuracy": 0.5411562025547028,
"num_tokens": 486125.0,
"step": 354
},
{
"entropy": 1.0377470254898071,
"epoch": 2.5569620253164556,
"grad_norm": 0.0,
"learning_rate": 7.332123411978222e-05,
"loss": 2.4463,
"mean_token_accuracy": 0.5503391325473785,
"num_tokens": 487597.0,
"step": 355
},
{
"entropy": 1.0625304579734802,
"epoch": 2.5641952983725136,
"grad_norm": 0.0,
"learning_rate": 7.295825771324865e-05,
"loss": 2.4083,
"mean_token_accuracy": 0.560984194278717,
"num_tokens": 489017.0,
"step": 356
},
{
"entropy": 1.1392813175916672,
"epoch": 2.571428571428571,
"grad_norm": 0.0,
"learning_rate": 7.259528130671507e-05,
"loss": 2.5535,
"mean_token_accuracy": 0.5292238146066666,
"num_tokens": 490389.0,
"step": 357
},
{
"entropy": 1.093557357788086,
"epoch": 2.578661844484629,
"grad_norm": 0.0,
"learning_rate": 7.223230490018149e-05,
"loss": 2.5426,
"mean_token_accuracy": 0.5413297116756439,
"num_tokens": 491781.0,
"step": 358
},
{
"entropy": 1.1400565207004547,
"epoch": 2.5858951175406872,
"grad_norm": 0.0,
"learning_rate": 7.186932849364792e-05,
"loss": 2.5368,
"mean_token_accuracy": 0.53314408659935,
"num_tokens": 493138.0,
"step": 359
},
{
"entropy": 1.1020191758871078,
"epoch": 2.5931283905967453,
"grad_norm": 0.0,
"learning_rate": 7.150635208711434e-05,
"loss": 2.536,
"mean_token_accuracy": 0.5494784340262413,
"num_tokens": 494468.0,
"step": 360
},
{
"entropy": 1.1096655130386353,
"epoch": 2.600361663652803,
"grad_norm": 0.0,
"learning_rate": 7.114337568058077e-05,
"loss": 2.5269,
"mean_token_accuracy": 0.5418857932090759,
"num_tokens": 495852.0,
"step": 361
},
{
"entropy": 1.15224489569664,
"epoch": 2.607594936708861,
"grad_norm": 0.0,
"learning_rate": 7.07803992740472e-05,
"loss": 2.4834,
"mean_token_accuracy": 0.5387386232614517,
"num_tokens": 497275.0,
"step": 362
},
{
"entropy": 1.1237707734107971,
"epoch": 2.6148282097649185,
"grad_norm": 0.0,
"learning_rate": 7.041742286751362e-05,
"loss": 2.5333,
"mean_token_accuracy": 0.534076914191246,
"num_tokens": 498628.0,
"step": 363
},
{
"entropy": 1.089442789554596,
"epoch": 2.6220614828209765,
"grad_norm": 0.0,
"learning_rate": 7.005444646098003e-05,
"loss": 2.4523,
"mean_token_accuracy": 0.5411692559719086,
"num_tokens": 500062.0,
"step": 364
},
{
"entropy": 1.0806556940078735,
"epoch": 2.6292947558770345,
"grad_norm": 0.0,
"learning_rate": 6.969147005444646e-05,
"loss": 2.5709,
"mean_token_accuracy": 0.5465729981660843,
"num_tokens": 501411.0,
"step": 365
},
{
"entropy": 1.060823678970337,
"epoch": 2.636528028933092,
"grad_norm": 0.0,
"learning_rate": 6.932849364791288e-05,
"loss": 2.5076,
"mean_token_accuracy": 0.543865293264389,
"num_tokens": 502800.0,
"step": 366
},
{
"entropy": 1.1018014252185822,
"epoch": 2.64376130198915,
"grad_norm": 0.0,
"learning_rate": 6.896551724137931e-05,
"loss": 2.4844,
"mean_token_accuracy": 0.5601823925971985,
"num_tokens": 504242.0,
"step": 367
},
{
"entropy": 1.121594250202179,
"epoch": 2.6509945750452077,
"grad_norm": 0.0,
"learning_rate": 6.860254083484574e-05,
"loss": 2.4678,
"mean_token_accuracy": 0.5592560023069382,
"num_tokens": 505682.0,
"step": 368
},
{
"entropy": 1.0919914245605469,
"epoch": 2.6582278481012658,
"grad_norm": 0.0,
"learning_rate": 6.823956442831216e-05,
"loss": 2.5458,
"mean_token_accuracy": 0.5379031747579575,
"num_tokens": 506999.0,
"step": 369
},
{
"entropy": 1.0880248546600342,
"epoch": 2.665461121157324,
"grad_norm": 0.0,
"learning_rate": 6.787658802177859e-05,
"loss": 2.5395,
"mean_token_accuracy": 0.5501109808683395,
"num_tokens": 508348.0,
"step": 370
},
{
"entropy": 1.1036258935928345,
"epoch": 2.6726943942133814,
"grad_norm": 0.0,
"learning_rate": 6.751361161524501e-05,
"loss": 2.4245,
"mean_token_accuracy": 0.5422181189060211,
"num_tokens": 509802.0,
"step": 371
},
{
"entropy": 1.0616427958011627,
"epoch": 2.6799276672694394,
"grad_norm": 0.0,
"learning_rate": 6.715063520871143e-05,
"loss": 2.4194,
"mean_token_accuracy": 0.5661514550447464,
"num_tokens": 511280.0,
"step": 372
},
{
"entropy": 1.0789577066898346,
"epoch": 2.687160940325497,
"grad_norm": 0.0,
"learning_rate": 6.678765880217786e-05,
"loss": 2.5258,
"mean_token_accuracy": 0.5572595447301865,
"num_tokens": 512694.0,
"step": 373
},
{
"entropy": 1.0851573646068573,
"epoch": 2.694394213381555,
"grad_norm": 0.0,
"learning_rate": 6.642468239564429e-05,
"loss": 2.4856,
"mean_token_accuracy": 0.5498984158039093,
"num_tokens": 514097.0,
"step": 374
},
{
"entropy": 1.1420559734106064,
"epoch": 2.701627486437613,
"grad_norm": 0.0,
"learning_rate": 6.606170598911071e-05,
"loss": 2.5407,
"mean_token_accuracy": 0.5401259958744049,
"num_tokens": 515548.0,
"step": 375
},
{
"entropy": 1.1659338176250458,
"epoch": 2.708860759493671,
"grad_norm": 0.0,
"learning_rate": 6.569872958257714e-05,
"loss": 2.4876,
"mean_token_accuracy": 0.5470705479383469,
"num_tokens": 516910.0,
"step": 376
},
{
"entropy": 1.0591351091861725,
"epoch": 2.7160940325497287,
"grad_norm": 0.0,
"learning_rate": 6.533575317604356e-05,
"loss": 2.5196,
"mean_token_accuracy": 0.5521393418312073,
"num_tokens": 518281.0,
"step": 377
},
{
"entropy": 1.0627894699573517,
"epoch": 2.7233273056057867,
"grad_norm": 0.0,
"learning_rate": 6.497277676950997e-05,
"loss": 2.4929,
"mean_token_accuracy": 0.5605411231517792,
"num_tokens": 519628.0,
"step": 378
},
{
"entropy": 1.1596409678459167,
"epoch": 2.7305605786618443,
"grad_norm": 0.0,
"learning_rate": 6.460980036297642e-05,
"loss": 2.573,
"mean_token_accuracy": 0.532776340842247,
"num_tokens": 520963.0,
"step": 379
},
{
"entropy": 1.1137427985668182,
"epoch": 2.7377938517179023,
"grad_norm": 0.0,
"learning_rate": 6.424682395644284e-05,
"loss": 2.6272,
"mean_token_accuracy": 0.5297515243291855,
"num_tokens": 522361.0,
"step": 380
},
{
"entropy": 1.1028319001197815,
"epoch": 2.7450271247739604,
"grad_norm": 0.0,
"learning_rate": 6.388384754990925e-05,
"loss": 2.5842,
"mean_token_accuracy": 0.5381578505039215,
"num_tokens": 523711.0,
"step": 381
},
{
"entropy": 1.1044474244117737,
"epoch": 2.752260397830018,
"grad_norm": 0.0,
"learning_rate": 6.352087114337569e-05,
"loss": 2.4927,
"mean_token_accuracy": 0.5612015575170517,
"num_tokens": 525121.0,
"step": 382
},
{
"entropy": 1.0691001415252686,
"epoch": 2.759493670886076,
"grad_norm": 0.0,
"learning_rate": 6.31578947368421e-05,
"loss": 2.4769,
"mean_token_accuracy": 0.5529969185590744,
"num_tokens": 526523.0,
"step": 383
},
{
"entropy": 1.113732784986496,
"epoch": 2.7667269439421336,
"grad_norm": 0.0,
"learning_rate": 6.279491833030852e-05,
"loss": 2.604,
"mean_token_accuracy": 0.5437600612640381,
"num_tokens": 527857.0,
"step": 384
},
{
"entropy": 1.1269434988498688,
"epoch": 2.7739602169981916,
"grad_norm": 0.0,
"learning_rate": 6.243194192377497e-05,
"loss": 2.5608,
"mean_token_accuracy": 0.5377811715006828,
"num_tokens": 529150.0,
"step": 385
},
{
"entropy": 1.0413260757923126,
"epoch": 2.7811934900542497,
"grad_norm": 0.0,
"learning_rate": 6.206896551724138e-05,
"loss": 2.5217,
"mean_token_accuracy": 0.5485084652900696,
"num_tokens": 530577.0,
"step": 386
},
{
"entropy": 1.1135965585708618,
"epoch": 2.7884267631103077,
"grad_norm": 0.0,
"learning_rate": 6.17059891107078e-05,
"loss": 2.5482,
"mean_token_accuracy": 0.5371388792991638,
"num_tokens": 531979.0,
"step": 387
},
{
"entropy": 1.1181357651948929,
"epoch": 2.7956600361663653,
"grad_norm": 0.0,
"learning_rate": 6.134301270417423e-05,
"loss": 2.538,
"mean_token_accuracy": 0.5407692342996597,
"num_tokens": 533308.0,
"step": 388
},
{
"entropy": 1.1135433316230774,
"epoch": 2.8028933092224233,
"grad_norm": 0.0,
"learning_rate": 6.098003629764065e-05,
"loss": 2.6532,
"mean_token_accuracy": 0.5302906185388565,
"num_tokens": 534652.0,
"step": 389
},
{
"entropy": 1.0605097115039825,
"epoch": 2.810126582278481,
"grad_norm": 0.0,
"learning_rate": 6.061705989110709e-05,
"loss": 2.3683,
"mean_token_accuracy": 0.5562764406204224,
"num_tokens": 536133.0,
"step": 390
},
{
"entropy": 1.0552656948566437,
"epoch": 2.817359855334539,
"grad_norm": 0.0,
"learning_rate": 6.025408348457351e-05,
"loss": 2.4262,
"mean_token_accuracy": 0.5565686523914337,
"num_tokens": 537572.0,
"step": 391
},
{
"entropy": 1.033959150314331,
"epoch": 2.824593128390597,
"grad_norm": 0.0,
"learning_rate": 5.989110707803993e-05,
"loss": 2.5963,
"mean_token_accuracy": 0.5316084623336792,
"num_tokens": 539020.0,
"step": 392
},
{
"entropy": 1.0652283132076263,
"epoch": 2.8318264014466545,
"grad_norm": 0.0,
"learning_rate": 5.9528130671506354e-05,
"loss": 2.4578,
"mean_token_accuracy": 0.5588241964578629,
"num_tokens": 540459.0,
"step": 393
},
{
"entropy": 1.2018023431301117,
"epoch": 2.8390596745027126,
"grad_norm": 0.0,
"learning_rate": 5.916515426497278e-05,
"loss": 2.5386,
"mean_token_accuracy": 0.5374249666929245,
"num_tokens": 541835.0,
"step": 394
},
{
"entropy": 1.1306479573249817,
"epoch": 2.84629294755877,
"grad_norm": 0.0,
"learning_rate": 5.88021778584392e-05,
"loss": 2.5295,
"mean_token_accuracy": 0.5378138273954391,
"num_tokens": 543205.0,
"step": 395
},
{
"entropy": 1.123464673757553,
"epoch": 2.853526220614828,
"grad_norm": 0.0,
"learning_rate": 5.8439201451905634e-05,
"loss": 2.4656,
"mean_token_accuracy": 0.5502952486276627,
"num_tokens": 544634.0,
"step": 396
},
{
"entropy": 1.0560733824968338,
"epoch": 2.8607594936708862,
"grad_norm": 0.0,
"learning_rate": 5.807622504537206e-05,
"loss": 2.475,
"mean_token_accuracy": 0.5549702495336533,
"num_tokens": 546043.0,
"step": 397
},
{
"entropy": 1.1620246469974518,
"epoch": 2.867992766726944,
"grad_norm": 0.0,
"learning_rate": 5.771324863883848e-05,
"loss": 2.5459,
"mean_token_accuracy": 0.53336001932621,
"num_tokens": 547385.0,
"step": 398
},
{
"entropy": 1.1056917607784271,
"epoch": 2.875226039783002,
"grad_norm": 0.0,
"learning_rate": 5.73502722323049e-05,
"loss": 2.5579,
"mean_token_accuracy": 0.5398988127708435,
"num_tokens": 548771.0,
"step": 399
},
{
"entropy": 1.1166136860847473,
"epoch": 2.8824593128390594,
"grad_norm": 0.0,
"learning_rate": 5.6987295825771325e-05,
"loss": 2.4985,
"mean_token_accuracy": 0.5441780239343643,
"num_tokens": 550099.0,
"step": 400
},
{
"entropy": 1.1156093180179596,
"epoch": 2.8896925858951175,
"grad_norm": 0.0,
"learning_rate": 5.662431941923775e-05,
"loss": 2.4891,
"mean_token_accuracy": 0.5433008521795273,
"num_tokens": 551444.0,
"step": 401
},
{
"entropy": 1.1306461095809937,
"epoch": 2.8969258589511755,
"grad_norm": 0.0,
"learning_rate": 5.626134301270418e-05,
"loss": 2.5102,
"mean_token_accuracy": 0.5288781076669693,
"num_tokens": 552832.0,
"step": 402
},
{
"entropy": 1.076299399137497,
"epoch": 2.9041591320072335,
"grad_norm": 0.0,
"learning_rate": 5.5898366606170604e-05,
"loss": 2.5052,
"mean_token_accuracy": 0.5499210357666016,
"num_tokens": 554262.0,
"step": 403
},
{
"entropy": 1.1156409084796906,
"epoch": 2.911392405063291,
"grad_norm": 0.0,
"learning_rate": 5.553539019963703e-05,
"loss": 2.4818,
"mean_token_accuracy": 0.5514587759971619,
"num_tokens": 555601.0,
"step": 404
},
{
"entropy": 1.1044765412807465,
"epoch": 2.918625678119349,
"grad_norm": 0.0,
"learning_rate": 5.517241379310345e-05,
"loss": 2.4969,
"mean_token_accuracy": 0.5391402244567871,
"num_tokens": 556959.0,
"step": 405
},
{
"entropy": 1.0975346863269806,
"epoch": 2.9258589511754067,
"grad_norm": 0.0,
"learning_rate": 5.480943738656987e-05,
"loss": 2.4576,
"mean_token_accuracy": 0.5543079674243927,
"num_tokens": 558363.0,
"step": 406
},
{
"entropy": 1.1334719359874725,
"epoch": 2.9330922242314648,
"grad_norm": 0.0,
"learning_rate": 5.4446460980036295e-05,
"loss": 2.5585,
"mean_token_accuracy": 0.5375552624464035,
"num_tokens": 559717.0,
"step": 407
},
{
"entropy": 1.0860235095024109,
"epoch": 2.940325497287523,
"grad_norm": 0.0,
"learning_rate": 5.4083484573502726e-05,
"loss": 2.5664,
"mean_token_accuracy": 0.5349879115819931,
"num_tokens": 561050.0,
"step": 408
},
{
"entropy": 1.1346383392810822,
"epoch": 2.9475587703435804,
"grad_norm": 0.0,
"learning_rate": 5.372050816696915e-05,
"loss": 2.5143,
"mean_token_accuracy": 0.536883682012558,
"num_tokens": 562484.0,
"step": 409
},
{
"entropy": 1.0807745456695557,
"epoch": 2.9547920433996384,
"grad_norm": 0.0,
"learning_rate": 5.3357531760435575e-05,
"loss": 2.4388,
"mean_token_accuracy": 0.5592896640300751,
"num_tokens": 563891.0,
"step": 410
},
{
"entropy": 1.1142305731773376,
"epoch": 2.962025316455696,
"grad_norm": 0.0,
"learning_rate": 5.2994555353902e-05,
"loss": 2.4821,
"mean_token_accuracy": 0.5541020184755325,
"num_tokens": 565284.0,
"step": 411
},
{
"entropy": 1.0620263665914536,
"epoch": 2.969258589511754,
"grad_norm": 0.0,
"learning_rate": 5.2631578947368424e-05,
"loss": 2.5498,
"mean_token_accuracy": 0.5445921868085861,
"num_tokens": 566623.0,
"step": 412
},
{
"entropy": 1.0871918499469757,
"epoch": 2.976491862567812,
"grad_norm": 0.0,
"learning_rate": 5.226860254083484e-05,
"loss": 2.4715,
"mean_token_accuracy": 0.5425689667463303,
"num_tokens": 568009.0,
"step": 413
},
{
"entropy": 1.1104594767093658,
"epoch": 2.9837251356238697,
"grad_norm": 0.0,
"learning_rate": 5.190562613430128e-05,
"loss": 2.4909,
"mean_token_accuracy": 0.5514119565486908,
"num_tokens": 569437.0,
"step": 414
},
{
"entropy": 1.103329062461853,
"epoch": 2.9909584086799277,
"grad_norm": 0.0,
"learning_rate": 5.15426497277677e-05,
"loss": 2.5503,
"mean_token_accuracy": 0.5356593430042267,
"num_tokens": 570799.0,
"step": 415
},
{
"entropy": 1.1086884438991547,
"epoch": 2.9981916817359853,
"grad_norm": 0.0,
"learning_rate": 5.117967332123412e-05,
"loss": 2.5718,
"mean_token_accuracy": 0.535956472158432,
"num_tokens": 572196.0,
"step": 416
},
{
"entropy": 0.9301452040672302,
"epoch": 3.0,
"grad_norm": 0.0,
"learning_rate": 5.0816696914700546e-05,
"loss": 2.4567,
"mean_token_accuracy": 0.5465116500854492,
"num_tokens": 572370.0,
"step": 417
},
{
"entropy": 1.082727700471878,
"epoch": 3.007233273056058,
"grad_norm": 0.0,
"learning_rate": 5.045372050816697e-05,
"loss": 2.4503,
"mean_token_accuracy": 0.5427384972572327,
"num_tokens": 573772.0,
"step": 418
},
{
"entropy": 1.1125991344451904,
"epoch": 3.0144665461121156,
"grad_norm": 0.0,
"learning_rate": 5.009074410163339e-05,
"loss": 2.5188,
"mean_token_accuracy": 0.5483275502920151,
"num_tokens": 575161.0,
"step": 419
},
{
"entropy": 1.1073324084281921,
"epoch": 3.0216998191681737,
"grad_norm": 0.0,
"learning_rate": 4.972776769509982e-05,
"loss": 2.5325,
"mean_token_accuracy": 0.5482524484395981,
"num_tokens": 576544.0,
"step": 420
},
{
"entropy": 1.0981022417545319,
"epoch": 3.0289330922242317,
"grad_norm": 0.0,
"learning_rate": 4.936479128856625e-05,
"loss": 2.4728,
"mean_token_accuracy": 0.5473926514387131,
"num_tokens": 577967.0,
"step": 421
},
{
"entropy": 1.0880376398563385,
"epoch": 3.0361663652802893,
"grad_norm": 0.0,
"learning_rate": 4.900181488203267e-05,
"loss": 2.4843,
"mean_token_accuracy": 0.5521259754896164,
"num_tokens": 579347.0,
"step": 422
},
{
"entropy": 1.1007913947105408,
"epoch": 3.0433996383363473,
"grad_norm": 0.0,
"learning_rate": 4.863883847549909e-05,
"loss": 2.5436,
"mean_token_accuracy": 0.5395879149436951,
"num_tokens": 580701.0,
"step": 423
},
{
"entropy": 1.1133521646261215,
"epoch": 3.050632911392405,
"grad_norm": 0.0,
"learning_rate": 4.827586206896552e-05,
"loss": 2.5756,
"mean_token_accuracy": 0.5257195383310318,
"num_tokens": 582118.0,
"step": 424
},
{
"entropy": 1.1162773072719574,
"epoch": 3.057866184448463,
"grad_norm": 0.0,
"learning_rate": 4.791288566243195e-05,
"loss": 2.4577,
"mean_token_accuracy": 0.5567342340946198,
"num_tokens": 583497.0,
"step": 425
},
{
"entropy": 1.1314301490783691,
"epoch": 3.065099457504521,
"grad_norm": 0.0,
"learning_rate": 4.7549909255898365e-05,
"loss": 2.4166,
"mean_token_accuracy": 0.560940682888031,
"num_tokens": 584880.0,
"step": 426
},
{
"entropy": 1.1418559551239014,
"epoch": 3.0723327305605785,
"grad_norm": 0.0,
"learning_rate": 4.7186932849364796e-05,
"loss": 2.5006,
"mean_token_accuracy": 0.5414097160100937,
"num_tokens": 586303.0,
"step": 427
},
{
"entropy": 1.1465589702129364,
"epoch": 3.0795660036166366,
"grad_norm": 0.0,
"learning_rate": 4.682395644283122e-05,
"loss": 2.5196,
"mean_token_accuracy": 0.5303985774517059,
"num_tokens": 587652.0,
"step": 428
},
{
"entropy": 1.093212753534317,
"epoch": 3.0867992766726946,
"grad_norm": 0.0,
"learning_rate": 4.646098003629764e-05,
"loss": 2.6529,
"mean_token_accuracy": 0.5337386429309845,
"num_tokens": 589047.0,
"step": 429
},
{
"entropy": 1.0639213025569916,
"epoch": 3.094032549728752,
"grad_norm": 0.0,
"learning_rate": 4.609800362976407e-05,
"loss": 2.5235,
"mean_token_accuracy": 0.5478297472000122,
"num_tokens": 590357.0,
"step": 430
},
{
"entropy": 1.1245644390583038,
"epoch": 3.1012658227848102,
"grad_norm": 0.0,
"learning_rate": 4.5735027223230494e-05,
"loss": 2.5644,
"mean_token_accuracy": 0.5405333489179611,
"num_tokens": 591728.0,
"step": 431
},
{
"entropy": 1.1015748530626297,
"epoch": 3.108499095840868,
"grad_norm": 0.0,
"learning_rate": 4.537205081669691e-05,
"loss": 2.4838,
"mean_token_accuracy": 0.5478966683149338,
"num_tokens": 593131.0,
"step": 432
},
{
"entropy": 1.0423375070095062,
"epoch": 3.115732368896926,
"grad_norm": 0.0,
"learning_rate": 4.500907441016334e-05,
"loss": 2.3926,
"mean_token_accuracy": 0.5643803477287292,
"num_tokens": 594522.0,
"step": 433
},
{
"entropy": 1.0879007577896118,
"epoch": 3.122965641952984,
"grad_norm": 0.0,
"learning_rate": 4.464609800362977e-05,
"loss": 2.4367,
"mean_token_accuracy": 0.5585435032844543,
"num_tokens": 595933.0,
"step": 434
},
{
"entropy": 1.0884363949298859,
"epoch": 3.1301989150090415,
"grad_norm": 0.0,
"learning_rate": 4.428312159709619e-05,
"loss": 2.5198,
"mean_token_accuracy": 0.5258640795946121,
"num_tokens": 597380.0,
"step": 435
},
{
"entropy": 1.0987628400325775,
"epoch": 3.1374321880650995,
"grad_norm": 0.0,
"learning_rate": 4.3920145190562616e-05,
"loss": 2.6001,
"mean_token_accuracy": 0.5372245460748672,
"num_tokens": 598694.0,
"step": 436
},
{
"entropy": 1.1444098055362701,
"epoch": 3.1446654611211575,
"grad_norm": 0.0,
"learning_rate": 4.355716878402904e-05,
"loss": 2.5247,
"mean_token_accuracy": 0.5320253819227219,
"num_tokens": 600074.0,
"step": 437
},
{
"entropy": 1.0908999145030975,
"epoch": 3.151898734177215,
"grad_norm": 0.0,
"learning_rate": 4.3194192377495465e-05,
"loss": 2.4631,
"mean_token_accuracy": 0.5523603707551956,
"num_tokens": 601454.0,
"step": 438
},
{
"entropy": 1.1113301813602448,
"epoch": 3.159132007233273,
"grad_norm": 0.0,
"learning_rate": 4.283121597096189e-05,
"loss": 2.4427,
"mean_token_accuracy": 0.5540345758199692,
"num_tokens": 602853.0,
"step": 439
},
{
"entropy": 1.0943928360939026,
"epoch": 3.1663652802893307,
"grad_norm": 0.0,
"learning_rate": 4.2468239564428313e-05,
"loss": 2.4994,
"mean_token_accuracy": 0.5468274131417274,
"num_tokens": 604229.0,
"step": 440
},
{
"entropy": 1.051211178302765,
"epoch": 3.1735985533453888,
"grad_norm": 0.0,
"learning_rate": 4.210526315789474e-05,
"loss": 2.452,
"mean_token_accuracy": 0.5569523572921753,
"num_tokens": 605602.0,
"step": 441
},
{
"entropy": 1.1384278237819672,
"epoch": 3.180831826401447,
"grad_norm": 0.0,
"learning_rate": 4.174228675136116e-05,
"loss": 2.5022,
"mean_token_accuracy": 0.5312958657741547,
"num_tokens": 606997.0,
"step": 442
},
{
"entropy": 1.148714303970337,
"epoch": 3.1880650994575044,
"grad_norm": 0.0,
"learning_rate": 4.1379310344827587e-05,
"loss": 2.5076,
"mean_token_accuracy": 0.5392551869153976,
"num_tokens": 608359.0,
"step": 443
},
{
"entropy": 1.0378380715847015,
"epoch": 3.1952983725135624,
"grad_norm": 0.0,
"learning_rate": 4.101633393829401e-05,
"loss": 2.5422,
"mean_token_accuracy": 0.5553303360939026,
"num_tokens": 609744.0,
"step": 444
},
{
"entropy": 1.0927992165088654,
"epoch": 3.2025316455696204,
"grad_norm": 0.0,
"learning_rate": 4.065335753176044e-05,
"loss": 2.4453,
"mean_token_accuracy": 0.5521446019411087,
"num_tokens": 611134.0,
"step": 445
},
{
"entropy": 1.132373720407486,
"epoch": 3.209764918625678,
"grad_norm": 0.0,
"learning_rate": 4.029038112522686e-05,
"loss": 2.494,
"mean_token_accuracy": 0.551935002207756,
"num_tokens": 612463.0,
"step": 446
},
{
"entropy": 1.077909603714943,
"epoch": 3.216998191681736,
"grad_norm": 0.0,
"learning_rate": 3.9927404718693284e-05,
"loss": 2.4535,
"mean_token_accuracy": 0.5600922256708145,
"num_tokens": 613902.0,
"step": 447
},
{
"entropy": 1.1611990630626678,
"epoch": 3.2242314647377937,
"grad_norm": 0.0,
"learning_rate": 3.9564428312159715e-05,
"loss": 2.5906,
"mean_token_accuracy": 0.5318414568901062,
"num_tokens": 615225.0,
"step": 448
},
{
"entropy": 1.105976551771164,
"epoch": 3.2314647377938517,
"grad_norm": 0.0,
"learning_rate": 3.920145190562613e-05,
"loss": 2.5691,
"mean_token_accuracy": 0.5415270179510117,
"num_tokens": 616599.0,
"step": 449
},
{
"entropy": 1.02475506067276,
"epoch": 3.2386980108499097,
"grad_norm": 0.0,
"learning_rate": 3.883847549909256e-05,
"loss": 2.4645,
"mean_token_accuracy": 0.5659692138433456,
"num_tokens": 618012.0,
"step": 450
},
{
"entropy": 1.138374000787735,
"epoch": 3.2459312839059673,
"grad_norm": 0.0,
"learning_rate": 3.847549909255899e-05,
"loss": 2.4612,
"mean_token_accuracy": 0.562134400010109,
"num_tokens": 619371.0,
"step": 451
},
{
"entropy": 1.1324277371168137,
"epoch": 3.2531645569620253,
"grad_norm": 0.0,
"learning_rate": 3.8112522686025406e-05,
"loss": 2.5048,
"mean_token_accuracy": 0.5571979880332947,
"num_tokens": 620687.0,
"step": 452
},
{
"entropy": 1.0829709619283676,
"epoch": 3.2603978300180834,
"grad_norm": 0.0,
"learning_rate": 3.774954627949184e-05,
"loss": 2.5123,
"mean_token_accuracy": 0.5353300124406815,
"num_tokens": 622073.0,
"step": 453
},
{
"entropy": 1.0782041102647781,
"epoch": 3.267631103074141,
"grad_norm": 0.0,
"learning_rate": 3.738656987295826e-05,
"loss": 2.55,
"mean_token_accuracy": 0.5475586950778961,
"num_tokens": 623432.0,
"step": 454
},
{
"entropy": 1.0757603645324707,
"epoch": 3.274864376130199,
"grad_norm": 0.0,
"learning_rate": 3.7023593466424686e-05,
"loss": 2.3509,
"mean_token_accuracy": 0.5611744374036789,
"num_tokens": 624838.0,
"step": 455
},
{
"entropy": 1.103335440158844,
"epoch": 3.282097649186257,
"grad_norm": 0.0,
"learning_rate": 3.666061705989111e-05,
"loss": 2.4486,
"mean_token_accuracy": 0.5478360801935196,
"num_tokens": 626196.0,
"step": 456
},
{
"entropy": 1.10829758644104,
"epoch": 3.2893309222423146,
"grad_norm": 0.0,
"learning_rate": 3.6297640653357535e-05,
"loss": 2.4464,
"mean_token_accuracy": 0.5474723875522614,
"num_tokens": 627589.0,
"step": 457
},
{
"entropy": 1.1216012835502625,
"epoch": 3.2965641952983726,
"grad_norm": 0.0,
"learning_rate": 3.593466424682396e-05,
"loss": 2.5176,
"mean_token_accuracy": 0.541084423661232,
"num_tokens": 628982.0,
"step": 458
},
{
"entropy": 1.1574302315711975,
"epoch": 3.3037974683544302,
"grad_norm": 0.0,
"learning_rate": 3.5571687840290383e-05,
"loss": 2.6293,
"mean_token_accuracy": 0.5247242599725723,
"num_tokens": 630355.0,
"step": 459
},
{
"entropy": 1.1090004444122314,
"epoch": 3.3110307414104883,
"grad_norm": 0.0,
"learning_rate": 3.520871143375681e-05,
"loss": 2.4459,
"mean_token_accuracy": 0.5529509037733078,
"num_tokens": 631715.0,
"step": 460
},
{
"entropy": 1.1352742612361908,
"epoch": 3.3182640144665463,
"grad_norm": 0.0,
"learning_rate": 3.484573502722323e-05,
"loss": 2.5476,
"mean_token_accuracy": 0.5357427150011063,
"num_tokens": 633097.0,
"step": 461
},
{
"entropy": 1.0867815911769867,
"epoch": 3.325497287522604,
"grad_norm": 0.0,
"learning_rate": 3.4482758620689657e-05,
"loss": 2.5339,
"mean_token_accuracy": 0.5402389466762543,
"num_tokens": 634505.0,
"step": 462
},
{
"entropy": 1.0494126379489899,
"epoch": 3.332730560578662,
"grad_norm": 0.0,
"learning_rate": 3.411978221415608e-05,
"loss": 2.4625,
"mean_token_accuracy": 0.5514612942934036,
"num_tokens": 635950.0,
"step": 463
},
{
"entropy": 1.066350743174553,
"epoch": 3.3399638336347195,
"grad_norm": 0.0,
"learning_rate": 3.3756805807622505e-05,
"loss": 2.4877,
"mean_token_accuracy": 0.5456369668245316,
"num_tokens": 637355.0,
"step": 464
},
{
"entropy": 1.0907469242811203,
"epoch": 3.3471971066907775,
"grad_norm": 0.0,
"learning_rate": 3.339382940108893e-05,
"loss": 2.4705,
"mean_token_accuracy": 0.5456852614879608,
"num_tokens": 638708.0,
"step": 465
},
{
"entropy": 1.0532267093658447,
"epoch": 3.3544303797468356,
"grad_norm": 0.0,
"learning_rate": 3.3030852994555354e-05,
"loss": 2.4572,
"mean_token_accuracy": 0.5576175153255463,
"num_tokens": 640097.0,
"step": 466
},
{
"entropy": 1.081478163599968,
"epoch": 3.361663652802893,
"grad_norm": 0.0,
"learning_rate": 3.266787658802178e-05,
"loss": 2.4297,
"mean_token_accuracy": 0.5416488796472549,
"num_tokens": 641589.0,
"step": 467
},
{
"entropy": 1.1244118511676788,
"epoch": 3.368896925858951,
"grad_norm": 0.0,
"learning_rate": 3.230490018148821e-05,
"loss": 2.6067,
"mean_token_accuracy": 0.5368776768445969,
"num_tokens": 642942.0,
"step": 468
},
{
"entropy": 1.1060850024223328,
"epoch": 3.376130198915009,
"grad_norm": 0.0,
"learning_rate": 3.194192377495463e-05,
"loss": 2.4724,
"mean_token_accuracy": 0.555050402879715,
"num_tokens": 644294.0,
"step": 469
},
{
"entropy": 1.0835371911525726,
"epoch": 3.383363471971067,
"grad_norm": 0.0,
"learning_rate": 3.157894736842105e-05,
"loss": 2.5014,
"mean_token_accuracy": 0.5477449595928192,
"num_tokens": 645696.0,
"step": 470
},
{
"entropy": 1.0998270213603973,
"epoch": 3.390596745027125,
"grad_norm": 0.0,
"learning_rate": 3.121597096188748e-05,
"loss": 2.5321,
"mean_token_accuracy": 0.5408206954598427,
"num_tokens": 647073.0,
"step": 471
},
{
"entropy": 1.0803327411413193,
"epoch": 3.397830018083183,
"grad_norm": 0.0,
"learning_rate": 3.08529945553539e-05,
"loss": 2.4254,
"mean_token_accuracy": 0.5565962195396423,
"num_tokens": 648481.0,
"step": 472
},
{
"entropy": 1.1584204137325287,
"epoch": 3.4050632911392404,
"grad_norm": 0.0,
"learning_rate": 3.0490018148820325e-05,
"loss": 2.4628,
"mean_token_accuracy": 0.5492859929800034,
"num_tokens": 649890.0,
"step": 473
},
{
"entropy": 1.0719866752624512,
"epoch": 3.4122965641952985,
"grad_norm": 0.0,
"learning_rate": 3.0127041742286756e-05,
"loss": 2.547,
"mean_token_accuracy": 0.5507875829935074,
"num_tokens": 651282.0,
"step": 474
},
{
"entropy": 1.0890982151031494,
"epoch": 3.419529837251356,
"grad_norm": 0.0,
"learning_rate": 2.9764065335753177e-05,
"loss": 2.4228,
"mean_token_accuracy": 0.5508367717266083,
"num_tokens": 652705.0,
"step": 475
},
{
"entropy": 1.1372348964214325,
"epoch": 3.426763110307414,
"grad_norm": 0.0,
"learning_rate": 2.94010889292196e-05,
"loss": 2.6279,
"mean_token_accuracy": 0.5274247825145721,
"num_tokens": 654090.0,
"step": 476
},
{
"entropy": 1.1086134016513824,
"epoch": 3.433996383363472,
"grad_norm": 0.0,
"learning_rate": 2.903811252268603e-05,
"loss": 2.5969,
"mean_token_accuracy": 0.5212997198104858,
"num_tokens": 655493.0,
"step": 477
},
{
"entropy": 1.1175757050514221,
"epoch": 3.4412296564195297,
"grad_norm": 0.0,
"learning_rate": 2.867513611615245e-05,
"loss": 2.5913,
"mean_token_accuracy": 0.5459360331296921,
"num_tokens": 656820.0,
"step": 478
},
{
"entropy": 1.0289329886436462,
"epoch": 3.4484629294755877,
"grad_norm": 0.0,
"learning_rate": 2.8312159709618874e-05,
"loss": 2.4419,
"mean_token_accuracy": 0.5702246725559235,
"num_tokens": 658208.0,
"step": 479
},
{
"entropy": 1.0913092195987701,
"epoch": 3.4556962025316453,
"grad_norm": 0.0,
"learning_rate": 2.7949183303085302e-05,
"loss": 2.5732,
"mean_token_accuracy": 0.5397117137908936,
"num_tokens": 659557.0,
"step": 480
},
{
"entropy": 1.0869504362344742,
"epoch": 3.4629294755877034,
"grad_norm": 0.0,
"learning_rate": 2.7586206896551727e-05,
"loss": 2.4652,
"mean_token_accuracy": 0.5512167811393738,
"num_tokens": 660949.0,
"step": 481
},
{
"entropy": 1.0968185365200043,
"epoch": 3.4701627486437614,
"grad_norm": 0.0,
"learning_rate": 2.7223230490018148e-05,
"loss": 2.4802,
"mean_token_accuracy": 0.5515278428792953,
"num_tokens": 662362.0,
"step": 482
},
{
"entropy": 1.0313489437103271,
"epoch": 3.477396021699819,
"grad_norm": 0.0,
"learning_rate": 2.6860254083484575e-05,
"loss": 2.4,
"mean_token_accuracy": 0.558410719037056,
"num_tokens": 663815.0,
"step": 483
},
{
"entropy": 1.0685087740421295,
"epoch": 3.484629294755877,
"grad_norm": 0.0,
"learning_rate": 2.6497277676951e-05,
"loss": 2.4427,
"mean_token_accuracy": 0.5495921522378922,
"num_tokens": 665156.0,
"step": 484
},
{
"entropy": 1.0642586052417755,
"epoch": 3.491862567811935,
"grad_norm": 0.0,
"learning_rate": 2.613430127041742e-05,
"loss": 2.4741,
"mean_token_accuracy": 0.5541531145572662,
"num_tokens": 666532.0,
"step": 485
},
{
"entropy": 1.1081467866897583,
"epoch": 3.4990958408679926,
"grad_norm": 0.0,
"learning_rate": 2.577132486388385e-05,
"loss": 2.4227,
"mean_token_accuracy": 0.5435217171907425,
"num_tokens": 667952.0,
"step": 486
},
{
"entropy": 1.0714454650878906,
"epoch": 3.5063291139240507,
"grad_norm": 0.0,
"learning_rate": 2.5408348457350273e-05,
"loss": 2.5223,
"mean_token_accuracy": 0.5453221052885056,
"num_tokens": 669331.0,
"step": 487
},
{
"entropy": 1.1786887049674988,
"epoch": 3.5135623869801087,
"grad_norm": 0.0,
"learning_rate": 2.5045372050816694e-05,
"loss": 2.4881,
"mean_token_accuracy": 0.5456383675336838,
"num_tokens": 670744.0,
"step": 488
},
{
"entropy": 1.0308251529932022,
"epoch": 3.5207956600361663,
"grad_norm": 0.0,
"learning_rate": 2.4682395644283125e-05,
"loss": 2.4407,
"mean_token_accuracy": 0.5562519431114197,
"num_tokens": 672168.0,
"step": 489
},
{
"entropy": 1.1447840631008148,
"epoch": 3.5280289330922243,
"grad_norm": 0.0,
"learning_rate": 2.4319419237749546e-05,
"loss": 2.4995,
"mean_token_accuracy": 0.548570990562439,
"num_tokens": 673522.0,
"step": 490
},
{
"entropy": 1.1574127972126007,
"epoch": 3.535262206148282,
"grad_norm": 0.0,
"learning_rate": 2.3956442831215974e-05,
"loss": 2.5088,
"mean_token_accuracy": 0.5454981774091721,
"num_tokens": 674902.0,
"step": 491
},
{
"entropy": 1.0497371554374695,
"epoch": 3.54249547920434,
"grad_norm": 0.0,
"learning_rate": 2.3593466424682398e-05,
"loss": 2.4379,
"mean_token_accuracy": 0.5521166771650314,
"num_tokens": 676251.0,
"step": 492
},
{
"entropy": 1.0881330370903015,
"epoch": 3.549728752260398,
"grad_norm": 0.0,
"learning_rate": 2.323049001814882e-05,
"loss": 2.6047,
"mean_token_accuracy": 0.5381551831960678,
"num_tokens": 677576.0,
"step": 493
},
{
"entropy": 1.0649862885475159,
"epoch": 3.5569620253164556,
"grad_norm": 0.0,
"learning_rate": 2.2867513611615247e-05,
"loss": 2.5863,
"mean_token_accuracy": 0.5362200736999512,
"num_tokens": 678938.0,
"step": 494
},
{
"entropy": 1.11095330119133,
"epoch": 3.5641952983725136,
"grad_norm": 0.0,
"learning_rate": 2.250453720508167e-05,
"loss": 2.4817,
"mean_token_accuracy": 0.5521639734506607,
"num_tokens": 680340.0,
"step": 495
},
{
"entropy": 1.1321823298931122,
"epoch": 3.571428571428571,
"grad_norm": 0.0,
"learning_rate": 2.2141560798548096e-05,
"loss": 2.5982,
"mean_token_accuracy": 0.5332741737365723,
"num_tokens": 681726.0,
"step": 496
},
{
"entropy": 1.0432183742523193,
"epoch": 3.578661844484629,
"grad_norm": 0.0,
"learning_rate": 2.177858439201452e-05,
"loss": 2.538,
"mean_token_accuracy": 0.5448485761880875,
"num_tokens": 683093.0,
"step": 497
},
{
"entropy": 1.1311353743076324,
"epoch": 3.5858951175406872,
"grad_norm": 0.0,
"learning_rate": 2.1415607985480945e-05,
"loss": 2.5891,
"mean_token_accuracy": 0.5331176221370697,
"num_tokens": 684418.0,
"step": 498
},
{
"entropy": 1.0668546259403229,
"epoch": 3.5931283905967453,
"grad_norm": 0.0,
"learning_rate": 2.105263157894737e-05,
"loss": 2.4809,
"mean_token_accuracy": 0.5538065284490585,
"num_tokens": 685847.0,
"step": 499
},
{
"entropy": 1.105953961610794,
"epoch": 3.600361663652803,
"grad_norm": 0.0,
"learning_rate": 2.0689655172413793e-05,
"loss": 2.5048,
"mean_token_accuracy": 0.5508566051721573,
"num_tokens": 687244.0,
"step": 500
},
{
"entropy": 1.0005443841218948,
"epoch": 3.607594936708861,
"grad_norm": 0.0,
"learning_rate": 2.032667876588022e-05,
"loss": 2.5028,
"mean_token_accuracy": 0.5614646375179291,
"num_tokens": 688625.0,
"step": 501
},
{
"entropy": 1.1225857138633728,
"epoch": 3.6148282097649185,
"grad_norm": 0.0,
"learning_rate": 1.9963702359346642e-05,
"loss": 2.4824,
"mean_token_accuracy": 0.5448538213968277,
"num_tokens": 690000.0,
"step": 502
},
{
"entropy": 1.1489295065402985,
"epoch": 3.6220614828209765,
"grad_norm": 0.0,
"learning_rate": 1.9600725952813066e-05,
"loss": 2.5399,
"mean_token_accuracy": 0.5357647836208344,
"num_tokens": 691320.0,
"step": 503
},
{
"entropy": 1.078998863697052,
"epoch": 3.6292947558770345,
"grad_norm": 0.0,
"learning_rate": 1.9237749546279494e-05,
"loss": 2.6026,
"mean_token_accuracy": 0.5319690853357315,
"num_tokens": 692652.0,
"step": 504
},
{
"entropy": 1.130728840827942,
"epoch": 3.636528028933092,
"grad_norm": 0.0,
"learning_rate": 1.887477313974592e-05,
"loss": 2.5335,
"mean_token_accuracy": 0.5432541519403458,
"num_tokens": 694091.0,
"step": 505
},
{
"entropy": 1.0619003176689148,
"epoch": 3.64376130198915,
"grad_norm": 0.0,
"learning_rate": 1.8511796733212343e-05,
"loss": 2.4622,
"mean_token_accuracy": 0.5655789524316788,
"num_tokens": 695507.0,
"step": 506
},
{
"entropy": 1.1114209294319153,
"epoch": 3.6509945750452077,
"grad_norm": 0.0,
"learning_rate": 1.8148820326678767e-05,
"loss": 2.5158,
"mean_token_accuracy": 0.5567562431097031,
"num_tokens": 696867.0,
"step": 507
},
{
"entropy": 1.1229240000247955,
"epoch": 3.6582278481012658,
"grad_norm": 0.0,
"learning_rate": 1.7785843920145192e-05,
"loss": 2.5613,
"mean_token_accuracy": 0.5479451417922974,
"num_tokens": 698187.0,
"step": 508
},
{
"entropy": 1.15980663895607,
"epoch": 3.665461121157324,
"grad_norm": 0.0,
"learning_rate": 1.7422867513611616e-05,
"loss": 2.569,
"mean_token_accuracy": 0.5393990874290466,
"num_tokens": 699559.0,
"step": 509
},
{
"entropy": 1.0907841324806213,
"epoch": 3.6726943942133814,
"grad_norm": 0.0,
"learning_rate": 1.705989110707804e-05,
"loss": 2.5011,
"mean_token_accuracy": 0.5401911735534668,
"num_tokens": 700954.0,
"step": 510
},
{
"entropy": 1.1135782897472382,
"epoch": 3.6799276672694394,
"grad_norm": 0.0,
"learning_rate": 1.6696914700544465e-05,
"loss": 2.4691,
"mean_token_accuracy": 0.5452233403921127,
"num_tokens": 702378.0,
"step": 511
},
{
"entropy": 1.0565876811742783,
"epoch": 3.687160940325497,
"grad_norm": 0.0,
"learning_rate": 1.633393829401089e-05,
"loss": 2.4989,
"mean_token_accuracy": 0.5499396473169327,
"num_tokens": 703785.0,
"step": 512
},
{
"entropy": 1.0759983956813812,
"epoch": 3.694394213381555,
"grad_norm": 0.0,
"learning_rate": 1.5970961887477314e-05,
"loss": 2.5112,
"mean_token_accuracy": 0.5497391521930695,
"num_tokens": 705219.0,
"step": 513
},
{
"entropy": 1.1497739553451538,
"epoch": 3.701627486437613,
"grad_norm": 0.0,
"learning_rate": 1.560798548094374e-05,
"loss": 2.5841,
"mean_token_accuracy": 0.5287934392690659,
"num_tokens": 706606.0,
"step": 514
},
{
"entropy": 1.0716162323951721,
"epoch": 3.708860759493671,
"grad_norm": 0.0,
"learning_rate": 1.5245009074410162e-05,
"loss": 2.5038,
"mean_token_accuracy": 0.5536665618419647,
"num_tokens": 707989.0,
"step": 515
},
{
"entropy": 1.0738689005374908,
"epoch": 3.7160940325497287,
"grad_norm": 0.0,
"learning_rate": 1.4882032667876588e-05,
"loss": 2.5212,
"mean_token_accuracy": 0.5448974221944809,
"num_tokens": 709349.0,
"step": 516
},
{
"entropy": 1.1064155101776123,
"epoch": 3.7233273056057867,
"grad_norm": 0.0,
"learning_rate": 1.4519056261343015e-05,
"loss": 2.6506,
"mean_token_accuracy": 0.5230308175086975,
"num_tokens": 710705.0,
"step": 517
},
{
"entropy": 1.058555543422699,
"epoch": 3.7305605786618443,
"grad_norm": 0.0,
"learning_rate": 1.4156079854809437e-05,
"loss": 2.4917,
"mean_token_accuracy": 0.5443996042013168,
"num_tokens": 712069.0,
"step": 518
},
{
"entropy": 1.0357494354248047,
"epoch": 3.7377938517179023,
"grad_norm": 0.0,
"learning_rate": 1.3793103448275863e-05,
"loss": 2.4184,
"mean_token_accuracy": 0.5529536008834839,
"num_tokens": 713492.0,
"step": 519
},
{
"entropy": 1.1316678524017334,
"epoch": 3.7450271247739604,
"grad_norm": 0.0,
"learning_rate": 1.3430127041742288e-05,
"loss": 2.4531,
"mean_token_accuracy": 0.544991984963417,
"num_tokens": 714834.0,
"step": 520
},
{
"entropy": 1.1400097012519836,
"epoch": 3.752260397830018,
"grad_norm": 0.0,
"learning_rate": 1.306715063520871e-05,
"loss": 2.5073,
"mean_token_accuracy": 0.5420869141817093,
"num_tokens": 716274.0,
"step": 521
},
{
"entropy": 1.157213181257248,
"epoch": 3.759493670886076,
"grad_norm": 0.0,
"learning_rate": 1.2704174228675136e-05,
"loss": 2.6404,
"mean_token_accuracy": 0.5418842732906342,
"num_tokens": 717608.0,
"step": 522
},
{
"entropy": 1.0655402839183807,
"epoch": 3.7667269439421336,
"grad_norm": 0.0,
"learning_rate": 1.2341197822141563e-05,
"loss": 2.6179,
"mean_token_accuracy": 0.5416123121976852,
"num_tokens": 718918.0,
"step": 523
},
{
"entropy": 1.1274596750736237,
"epoch": 3.7739602169981916,
"grad_norm": 0.0,
"learning_rate": 1.1978221415607987e-05,
"loss": 2.5871,
"mean_token_accuracy": 0.5287514328956604,
"num_tokens": 720304.0,
"step": 524
},
{
"entropy": 1.1309744715690613,
"epoch": 3.7811934900542497,
"grad_norm": 0.0,
"learning_rate": 1.161524500907441e-05,
"loss": 2.4583,
"mean_token_accuracy": 0.5526018738746643,
"num_tokens": 721778.0,
"step": 525
},
{
"entropy": 1.0946650505065918,
"epoch": 3.7884267631103077,
"grad_norm": 0.0,
"learning_rate": 1.1252268602540836e-05,
"loss": 2.6319,
"mean_token_accuracy": 0.5338682383298874,
"num_tokens": 723119.0,
"step": 526
},
{
"entropy": 1.0975251197814941,
"epoch": 3.7956600361663653,
"grad_norm": 0.0,
"learning_rate": 1.088929219600726e-05,
"loss": 2.4436,
"mean_token_accuracy": 0.5491883158683777,
"num_tokens": 724610.0,
"step": 527
},
{
"entropy": 1.0772320330142975,
"epoch": 3.8028933092224233,
"grad_norm": 0.0,
"learning_rate": 1.0526315789473684e-05,
"loss": 2.5004,
"mean_token_accuracy": 0.5622773170471191,
"num_tokens": 725947.0,
"step": 528
},
{
"entropy": 1.0730805099010468,
"epoch": 3.810126582278481,
"grad_norm": 0.0,
"learning_rate": 1.016333938294011e-05,
"loss": 2.4317,
"mean_token_accuracy": 0.5641112923622131,
"num_tokens": 727305.0,
"step": 529
},
{
"entropy": 1.071317881345749,
"epoch": 3.817359855334539,
"grad_norm": 0.0,
"learning_rate": 9.800362976406533e-06,
"loss": 2.5299,
"mean_token_accuracy": 0.5482836663722992,
"num_tokens": 728688.0,
"step": 530
},
{
"entropy": 1.0742418766021729,
"epoch": 3.824593128390597,
"grad_norm": 0.0,
"learning_rate": 9.43738656987296e-06,
"loss": 2.4834,
"mean_token_accuracy": 0.5449412018060684,
"num_tokens": 730106.0,
"step": 531
},
{
"entropy": 1.1253242194652557,
"epoch": 3.8318264014466545,
"grad_norm": 0.0,
"learning_rate": 9.074410163339384e-06,
"loss": 2.6065,
"mean_token_accuracy": 0.5322617739439011,
"num_tokens": 731510.0,
"step": 532
},
{
"entropy": 1.0974994003772736,
"epoch": 3.8390596745027126,
"grad_norm": 0.0,
"learning_rate": 8.711433756805808e-06,
"loss": 2.5402,
"mean_token_accuracy": 0.5461835712194443,
"num_tokens": 732829.0,
"step": 533
},
{
"entropy": 1.1266226470470428,
"epoch": 3.84629294755877,
"grad_norm": 0.0,
"learning_rate": 8.348457350272232e-06,
"loss": 2.6068,
"mean_token_accuracy": 0.5259987786412239,
"num_tokens": 734191.0,
"step": 534
},
{
"entropy": 1.12846839427948,
"epoch": 3.853526220614828,
"grad_norm": 0.0,
"learning_rate": 7.985480943738657e-06,
"loss": 2.4417,
"mean_token_accuracy": 0.5635862648487091,
"num_tokens": 735573.0,
"step": 535
},
{
"entropy": 1.1160639226436615,
"epoch": 3.8607594936708862,
"grad_norm": 0.0,
"learning_rate": 7.622504537205081e-06,
"loss": 2.4455,
"mean_token_accuracy": 0.5517152100801468,
"num_tokens": 737003.0,
"step": 536
},
{
"entropy": 1.0702637135982513,
"epoch": 3.867992766726944,
"grad_norm": 0.0,
"learning_rate": 7.259528130671507e-06,
"loss": 2.5004,
"mean_token_accuracy": 0.5487655699253082,
"num_tokens": 738375.0,
"step": 537
},
{
"entropy": 1.0945512652397156,
"epoch": 3.875226039783002,
"grad_norm": 0.0,
"learning_rate": 6.896551724137932e-06,
"loss": 2.5679,
"mean_token_accuracy": 0.5293791145086288,
"num_tokens": 739748.0,
"step": 538
},
{
"entropy": 1.0917899906635284,
"epoch": 3.8824593128390594,
"grad_norm": 0.0,
"learning_rate": 6.533575317604355e-06,
"loss": 2.6122,
"mean_token_accuracy": 0.5389818549156189,
"num_tokens": 741088.0,
"step": 539
},
{
"entropy": 1.1529441475868225,
"epoch": 3.8896925858951175,
"grad_norm": 0.0,
"learning_rate": 6.170598911070781e-06,
"loss": 2.5513,
"mean_token_accuracy": 0.5297400206327438,
"num_tokens": 742432.0,
"step": 540
},
{
"entropy": 1.0943627953529358,
"epoch": 3.8969258589511755,
"grad_norm": 0.0,
"learning_rate": 5.807622504537205e-06,
"loss": 2.5533,
"mean_token_accuracy": 0.5300375521183014,
"num_tokens": 743872.0,
"step": 541
},
{
"entropy": 1.1212878823280334,
"epoch": 3.9041591320072335,
"grad_norm": 0.0,
"learning_rate": 5.44464609800363e-06,
"loss": 2.5892,
"mean_token_accuracy": 0.5295825377106667,
"num_tokens": 745229.0,
"step": 542
},
{
"entropy": 1.0862334966659546,
"epoch": 3.911392405063291,
"grad_norm": 0.0,
"learning_rate": 5.081669691470055e-06,
"loss": 2.4351,
"mean_token_accuracy": 0.549940288066864,
"num_tokens": 746589.0,
"step": 543
},
{
"entropy": 1.1090565025806427,
"epoch": 3.918625678119349,
"grad_norm": 0.0,
"learning_rate": 4.71869328493648e-06,
"loss": 2.6019,
"mean_token_accuracy": 0.5316396206617355,
"num_tokens": 747923.0,
"step": 544
},
{
"entropy": 1.0873733460903168,
"epoch": 3.9258589511754067,
"grad_norm": 0.0,
"learning_rate": 4.355716878402904e-06,
"loss": 2.4612,
"mean_token_accuracy": 0.5497990250587463,
"num_tokens": 749358.0,
"step": 545
},
{
"entropy": 1.1330247223377228,
"epoch": 3.9330922242314648,
"grad_norm": 0.0,
"learning_rate": 3.992740471869328e-06,
"loss": 2.3935,
"mean_token_accuracy": 0.5545907616615295,
"num_tokens": 750795.0,
"step": 546
},
{
"entropy": 1.1012303829193115,
"epoch": 3.940325497287523,
"grad_norm": 0.0,
"learning_rate": 3.6297640653357536e-06,
"loss": 2.5501,
"mean_token_accuracy": 0.5469983816146851,
"num_tokens": 752133.0,
"step": 547
},
{
"entropy": 1.052758365869522,
"epoch": 3.9475587703435804,
"grad_norm": 0.0,
"learning_rate": 3.2667876588021776e-06,
"loss": 2.473,
"mean_token_accuracy": 0.5600574761629105,
"num_tokens": 753544.0,
"step": 548
},
{
"entropy": 1.098718285560608,
"epoch": 3.9547920433996384,
"grad_norm": 0.0,
"learning_rate": 2.9038112522686024e-06,
"loss": 2.5281,
"mean_token_accuracy": 0.5494914799928665,
"num_tokens": 754898.0,
"step": 549
},
{
"entropy": 1.1130988895893097,
"epoch": 3.962025316455696,
"grad_norm": 0.0,
"learning_rate": 2.5408348457350276e-06,
"loss": 2.5836,
"mean_token_accuracy": 0.5407898128032684,
"num_tokens": 756240.0,
"step": 550
},
{
"entropy": 1.1131569147109985,
"epoch": 3.969258589511754,
"grad_norm": 0.0,
"learning_rate": 2.177858439201452e-06,
"loss": 2.5982,
"mean_token_accuracy": 0.5369937494397163,
"num_tokens": 757580.0,
"step": 551
},
{
"entropy": 1.1359702944755554,
"epoch": 3.976491862567812,
"grad_norm": 0.0,
"learning_rate": 1.8148820326678768e-06,
"loss": 2.5958,
"mean_token_accuracy": 0.5286305099725723,
"num_tokens": 758936.0,
"step": 552
},
{
"entropy": 1.0897059440612793,
"epoch": 3.9837251356238697,
"grad_norm": 0.0,
"learning_rate": 1.4519056261343012e-06,
"loss": 2.4814,
"mean_token_accuracy": 0.5611370354890823,
"num_tokens": 760290.0,
"step": 553
},
{
"entropy": 1.1387740671634674,
"epoch": 3.9909584086799277,
"grad_norm": 0.0,
"learning_rate": 1.088929219600726e-06,
"loss": 2.615,
"mean_token_accuracy": 0.5323370546102524,
"num_tokens": 761671.0,
"step": 554
},
{
"entropy": 1.0992904007434845,
"epoch": 3.9981916817359853,
"grad_norm": 0.0,
"learning_rate": 7.259528130671506e-07,
"loss": 2.5056,
"mean_token_accuracy": 0.5511928796768188,
"num_tokens": 763000.0,
"step": 555
},
{
"entropy": 1.0427762269973755,
"epoch": 4.0,
"grad_norm": 0.0,
"learning_rate": 3.629764065335753e-07,
"loss": 2.6583,
"mean_token_accuracy": 0.5569620132446289,
"num_tokens": 763160.0,
"step": 556
}
],
"logging_steps": 1,
"max_steps": 556,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.167535347687424e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}