{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.0,
  "eval_steps": 500,
  "global_step": 556,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 1.1117496490478516,
      "epoch": 0.007233273056057866,
      "grad_norm": 0.0,
      "learning_rate": 0.0,
      "loss": 2.636,
      "mean_token_accuracy": 0.5501300543546677,
      "num_tokens": 1403.0,
      "step": 1
    },
    {
      "entropy": 1.1266475915908813,
      "epoch": 0.014466546112115732,
      "grad_norm": 0.0,
      "learning_rate": 4e-05,
      "loss": 2.5193,
      "mean_token_accuracy": 0.545694500207901,
      "num_tokens": 2824.0,
      "step": 2
    },
    {
      "entropy": 1.093862533569336,
      "epoch": 0.0216998191681736,
      "grad_norm": 0.0,
      "learning_rate": 8e-05,
      "loss": 2.5567,
      "mean_token_accuracy": 0.5492231100797653,
      "num_tokens": 4187.0,
      "step": 3
    },
    {
      "entropy": 1.1016794741153717,
      "epoch": 0.028933092224231464,
      "grad_norm": 0.0,
      "learning_rate": 0.00012,
      "loss": 2.5151,
      "mean_token_accuracy": 0.5384537577629089,
      "num_tokens": 5595.0,
      "step": 4
    },
    {
      "entropy": 1.1031606495380402,
      "epoch": 0.03616636528028933,
      "grad_norm": 0.0,
      "learning_rate": 0.00016,
      "loss": 2.5984,
      "mean_token_accuracy": 0.5410859286785126,
      "num_tokens": 6938.0,
      "step": 5
    },
    {
      "entropy": 1.0830009877681732,
      "epoch": 0.0433996383363472,
      "grad_norm": 0.0,
      "learning_rate": 0.0002,
      "loss": 2.4901,
      "mean_token_accuracy": 0.5498783439397812,
      "num_tokens": 8372.0,
      "step": 6
    },
    {
      "entropy": 1.0646579265594482,
      "epoch": 0.05063291139240506,
      "grad_norm": 0.0,
      "learning_rate": 0.00019963702359346644,
      "loss": 2.4523,
      "mean_token_accuracy": 0.551283709704876,
      "num_tokens": 9816.0,
      "step": 7
    },
    {
      "entropy": 1.1242448091506958,
      "epoch": 0.05786618444846293,
      "grad_norm": 0.0,
      "learning_rate": 0.00019927404718693284,
      "loss": 2.5337,
      "mean_token_accuracy": 0.5342100262641907,
      "num_tokens": 11199.0,
      "step": 8
    },
    {
      "entropy": 1.0672060549259186,
      "epoch": 0.0650994575045208,
      "grad_norm": 0.0,
      "learning_rate": 0.00019891107078039928,
      "loss": 2.4281,
      "mean_token_accuracy": 0.5488722920417786,
      "num_tokens": 12583.0,
      "step": 9
    },
    {
      "entropy": 1.1072518527507782,
      "epoch": 0.07233273056057866,
      "grad_norm": 0.0,
      "learning_rate": 0.0001985480943738657,
      "loss": 2.5747,
      "mean_token_accuracy": 0.526348888874054,
      "num_tokens": 13959.0,
      "step": 10
    },
    {
      "entropy": 1.0887023508548737,
      "epoch": 0.07956600361663653,
      "grad_norm": 0.0,
      "learning_rate": 0.0001981851179673321,
      "loss": 2.575,
      "mean_token_accuracy": 0.5386092066764832,
      "num_tokens": 15344.0,
      "step": 11
    },
    {
      "entropy": 1.053345412015915,
      "epoch": 0.0867992766726944,
      "grad_norm": 0.0,
      "learning_rate": 0.00019782214156079857,
      "loss": 2.4709,
      "mean_token_accuracy": 0.5469983220100403,
      "num_tokens": 16791.0,
      "step": 12
    },
    {
      "entropy": 1.1248537600040436,
      "epoch": 0.09403254972875226,
      "grad_norm": 0.0,
      "learning_rate": 0.000197459165154265,
      "loss": 2.3924,
      "mean_token_accuracy": 0.5607019513845444,
      "num_tokens": 18262.0,
      "step": 13
    },
    {
      "entropy": 1.1298492848873138,
      "epoch": 0.10126582278481013,
      "grad_norm": 0.0,
      "learning_rate": 0.0001970961887477314,
      "loss": 2.3621,
      "mean_token_accuracy": 0.5567697584629059,
      "num_tokens": 19693.0,
      "step": 14
    },
    {
      "entropy": 1.1234960854053497,
      "epoch": 0.10849909584086799,
      "grad_norm": 0.0,
      "learning_rate": 0.00019673321234119784,
      "loss": 2.4681,
      "mean_token_accuracy": 0.5486533343791962,
      "num_tokens": 21081.0,
      "step": 15
    },
    {
      "entropy": 1.155756562948227,
      "epoch": 0.11573236889692586,
      "grad_norm": 0.0,
      "learning_rate": 0.00019637023593466427,
      "loss": 2.5987,
      "mean_token_accuracy": 0.5197644233703613,
      "num_tokens": 22400.0,
      "step": 16
    },
    {
      "entropy": 1.0533479899168015,
      "epoch": 0.12296564195298372,
      "grad_norm": 0.0,
      "learning_rate": 0.00019600725952813067,
      "loss": 2.3976,
      "mean_token_accuracy": 0.5573096722364426,
      "num_tokens": 23833.0,
      "step": 17
    },
    {
      "entropy": 1.1212570667266846,
      "epoch": 0.1301989150090416,
      "grad_norm": 0.0,
      "learning_rate": 0.0001956442831215971,
      "loss": 2.6562,
      "mean_token_accuracy": 0.5246226489543915,
      "num_tokens": 25173.0,
      "step": 18
    },
    {
      "entropy": 1.0642149150371552,
      "epoch": 0.13743218806509946,
      "grad_norm": 0.0,
      "learning_rate": 0.00019528130671506353,
      "loss": 2.5246,
      "mean_token_accuracy": 0.5611959397792816,
      "num_tokens": 26505.0,
      "step": 19
    },
    {
      "entropy": 1.0779947936534882,
      "epoch": 0.14466546112115733,
      "grad_norm": 0.0,
      "learning_rate": 0.00019491833030852994,
      "loss": 2.5068,
      "mean_token_accuracy": 0.5473824739456177,
      "num_tokens": 27946.0,
      "step": 20
    },
    {
      "entropy": 1.1348789632320404,
      "epoch": 0.1518987341772152,
      "grad_norm": 0.0,
      "learning_rate": 0.00019455535390199637,
      "loss": 2.4636,
      "mean_token_accuracy": 0.5414505749940872,
      "num_tokens": 29342.0,
      "step": 21
    },
    {
      "entropy": 1.094531387090683,
      "epoch": 0.15913200723327306,
      "grad_norm": 0.0,
      "learning_rate": 0.0001941923774954628,
      "loss": 2.4485,
      "mean_token_accuracy": 0.5563794821500778,
      "num_tokens": 30704.0,
      "step": 22
    },
    {
      "entropy": 1.106130212545395,
      "epoch": 0.16636528028933092,
      "grad_norm": 0.0,
      "learning_rate": 0.00019382940108892923,
      "loss": 2.4934,
      "mean_token_accuracy": 0.5429124981164932,
      "num_tokens": 32117.0,
      "step": 23
    },
    {
      "entropy": 1.1143657565116882,
      "epoch": 0.1735985533453888,
      "grad_norm": 0.0,
      "learning_rate": 0.00019346642468239566,
      "loss": 2.4279,
      "mean_token_accuracy": 0.5571769028902054,
      "num_tokens": 33548.0,
      "step": 24
    },
    {
      "entropy": 1.0552422404289246,
      "epoch": 0.18083182640144665,
      "grad_norm": 0.0,
      "learning_rate": 0.0001931034482758621,
      "loss": 2.4475,
      "mean_token_accuracy": 0.5655805617570877,
      "num_tokens": 34947.0,
      "step": 25
    },
    {
      "entropy": 1.0628145337104797,
      "epoch": 0.18806509945750452,
      "grad_norm": 0.0,
      "learning_rate": 0.0001927404718693285,
      "loss": 2.5125,
      "mean_token_accuracy": 0.556572213768959,
      "num_tokens": 36377.0,
      "step": 26
    },
    {
      "entropy": 1.1124870479106903,
      "epoch": 0.19529837251356238,
      "grad_norm": 0.0,
      "learning_rate": 0.00019237749546279493,
      "loss": 2.5352,
      "mean_token_accuracy": 0.5449838191270828,
      "num_tokens": 37708.0,
      "step": 27
    },
    {
      "entropy": 1.1239155530929565,
      "epoch": 0.20253164556962025,
      "grad_norm": 0.0,
      "learning_rate": 0.00019201451905626136,
      "loss": 2.6023,
      "mean_token_accuracy": 0.5333832204341888,
      "num_tokens": 39029.0,
      "step": 28
    },
    {
      "entropy": 1.0808220505714417,
      "epoch": 0.20976491862567812,
      "grad_norm": 0.0,
      "learning_rate": 0.0001916515426497278,
      "loss": 2.4872,
      "mean_token_accuracy": 0.55105359852314,
      "num_tokens": 40431.0,
      "step": 29
    },
    {
      "entropy": 1.09044648706913,
      "epoch": 0.21699819168173598,
      "grad_norm": 0.0,
      "learning_rate": 0.0001912885662431942,
      "loss": 2.5347,
      "mean_token_accuracy": 0.5469508171081543,
      "num_tokens": 41795.0,
      "step": 30
    },
    {
      "entropy": 1.096373587846756,
      "epoch": 0.22423146473779385,
      "grad_norm": 0.0,
      "learning_rate": 0.00019092558983666063,
      "loss": 2.5209,
      "mean_token_accuracy": 0.5446725934743881,
      "num_tokens": 43161.0,
      "step": 31
    },
    {
      "entropy": 1.1086195409297943,
      "epoch": 0.2314647377938517,
      "grad_norm": 0.0,
      "learning_rate": 0.00019056261343012706,
      "loss": 2.5568,
      "mean_token_accuracy": 0.5395984202623367,
      "num_tokens": 44548.0,
      "step": 32
    },
    {
      "entropy": 1.096370369195938,
      "epoch": 0.23869801084990958,
      "grad_norm": 0.0,
      "learning_rate": 0.00019019963702359346,
      "loss": 2.5431,
      "mean_token_accuracy": 0.5369362384080887,
      "num_tokens": 45907.0,
      "step": 33
    },
    {
      "entropy": 1.0829149782657623,
      "epoch": 0.24593128390596744,
      "grad_norm": 0.0,
      "learning_rate": 0.0001898366606170599,
      "loss": 2.5536,
      "mean_token_accuracy": 0.5458774566650391,
      "num_tokens": 47273.0,
      "step": 34
    },
    {
      "entropy": 1.1014858782291412,
      "epoch": 0.25316455696202533,
      "grad_norm": 0.0,
      "learning_rate": 0.00018947368421052632,
      "loss": 2.5563,
      "mean_token_accuracy": 0.5427176207304001,
      "num_tokens": 48630.0,
      "step": 35
    },
    {
      "entropy": 1.0944049954414368,
      "epoch": 0.2603978300180832,
      "grad_norm": 0.0,
      "learning_rate": 0.00018911070780399275,
      "loss": 2.5237,
      "mean_token_accuracy": 0.543226882815361,
      "num_tokens": 50010.0,
      "step": 36
    },
    {
      "entropy": 1.0851400792598724,
      "epoch": 0.26763110307414106,
      "grad_norm": 0.0,
      "learning_rate": 0.00018874773139745919,
      "loss": 2.5001,
      "mean_token_accuracy": 0.5489733219146729,
      "num_tokens": 51422.0,
      "step": 37
    },
    {
      "entropy": 1.0902953743934631,
      "epoch": 0.27486437613019893,
      "grad_norm": 0.0,
      "learning_rate": 0.00018838475499092562,
      "loss": 2.5394,
      "mean_token_accuracy": 0.5441585332155228,
      "num_tokens": 52829.0,
      "step": 38
    },
    {
      "entropy": 1.0722315609455109,
      "epoch": 0.2820976491862568,
      "grad_norm": 0.0,
      "learning_rate": 0.00018802177858439202,
      "loss": 2.5113,
      "mean_token_accuracy": 0.544213131070137,
      "num_tokens": 54184.0,
      "step": 39
    },
    {
      "entropy": 1.1416008174419403,
      "epoch": 0.28933092224231466,
      "grad_norm": 0.0,
      "learning_rate": 0.00018765880217785845,
      "loss": 2.5777,
      "mean_token_accuracy": 0.5215476900339127,
      "num_tokens": 55564.0,
      "step": 40
    },
    {
      "entropy": 1.1114321053028107,
      "epoch": 0.2965641952983725,
      "grad_norm": 0.0,
      "learning_rate": 0.00018729582577132488,
      "loss": 2.5755,
      "mean_token_accuracy": 0.5520491451025009,
      "num_tokens": 56976.0,
      "step": 41
    },
    {
      "entropy": 1.0591959059238434,
      "epoch": 0.3037974683544304,
      "grad_norm": 0.0,
      "learning_rate": 0.0001869328493647913,
      "loss": 2.4821,
      "mean_token_accuracy": 0.5502973049879074,
      "num_tokens": 58387.0,
      "step": 42
    },
    {
      "entropy": 1.116980403661728,
      "epoch": 0.31103074141048825,
      "grad_norm": 0.0,
      "learning_rate": 0.00018656987295825772,
      "loss": 2.485,
      "mean_token_accuracy": 0.5487600713968277,
      "num_tokens": 59720.0,
      "step": 43
    },
    {
      "entropy": 1.0845508873462677,
      "epoch": 0.3182640144665461,
      "grad_norm": 0.0,
      "learning_rate": 0.00018620689655172415,
      "loss": 2.629,
      "mean_token_accuracy": 0.5307356417179108,
      "num_tokens": 61066.0,
      "step": 44
    },
    {
      "entropy": 1.0685547292232513,
      "epoch": 0.325497287522604,
      "grad_norm": 0.0,
      "learning_rate": 0.00018584392014519055,
      "loss": 2.4535,
      "mean_token_accuracy": 0.5622027516365051,
      "num_tokens": 62411.0,
      "step": 45
    },
    {
      "entropy": 1.1736293733119965,
      "epoch": 0.33273056057866185,
      "grad_norm": 0.0,
      "learning_rate": 0.00018548094373865698,
      "loss": 2.5437,
      "mean_token_accuracy": 0.5360411405563354,
      "num_tokens": 63821.0,
      "step": 46
    },
    {
      "entropy": 1.1335414946079254,
      "epoch": 0.3399638336347197,
      "grad_norm": 0.0,
      "learning_rate": 0.00018511796733212342,
      "loss": 2.4349,
      "mean_token_accuracy": 0.5393011569976807,
      "num_tokens": 65280.0,
      "step": 47
    },
    {
      "entropy": 1.109579473733902,
      "epoch": 0.3471971066907776,
      "grad_norm": 0.0,
      "learning_rate": 0.00018475499092558985,
      "loss": 2.3379,
      "mean_token_accuracy": 0.5629399716854095,
      "num_tokens": 66754.0,
      "step": 48
    },
    {
      "entropy": 1.0618711709976196,
      "epoch": 0.35443037974683544,
      "grad_norm": 0.0,
      "learning_rate": 0.00018439201451905628,
      "loss": 2.4828,
      "mean_token_accuracy": 0.5528846383094788,
      "num_tokens": 68154.0,
      "step": 49
    },
    {
      "entropy": 1.0883667767047882,
      "epoch": 0.3616636528028933,
      "grad_norm": 0.0,
      "learning_rate": 0.0001840290381125227,
      "loss": 2.3981,
      "mean_token_accuracy": 0.5410507619380951,
      "num_tokens": 69549.0,
      "step": 50
    },
    {
      "entropy": 1.1461284458637238,
      "epoch": 0.3688969258589512,
      "grad_norm": 0.0,
      "learning_rate": 0.00018366606170598911,
      "loss": 2.4298,
      "mean_token_accuracy": 0.5563310235738754,
      "num_tokens": 70850.0,
      "step": 51
    },
    {
      "entropy": 1.0271961838006973,
      "epoch": 0.37613019891500904,
      "grad_norm": 0.0,
      "learning_rate": 0.00018330308529945554,
      "loss": 2.455,
      "mean_token_accuracy": 0.5506248325109482,
      "num_tokens": 72256.0,
      "step": 52
    },
    {
      "entropy": 1.1474315524101257,
      "epoch": 0.3833634719710669,
      "grad_norm": 0.0,
      "learning_rate": 0.00018294010889292198,
      "loss": 2.5817,
      "mean_token_accuracy": 0.5281436145305634,
      "num_tokens": 73633.0,
      "step": 53
    },
    {
      "entropy": 1.1158095598220825,
      "epoch": 0.39059674502712477,
      "grad_norm": 0.0,
      "learning_rate": 0.00018257713248638838,
      "loss": 2.558,
      "mean_token_accuracy": 0.5412166118621826,
      "num_tokens": 74998.0,
      "step": 54
    },
    {
      "entropy": 1.0468406975269318,
      "epoch": 0.39783001808318263,
      "grad_norm": 0.0,
      "learning_rate": 0.0001822141560798548,
      "loss": 2.4055,
      "mean_token_accuracy": 0.562886506319046,
      "num_tokens": 76370.0,
      "step": 55
    },
    {
      "entropy": 1.1383771300315857,
      "epoch": 0.4050632911392405,
      "grad_norm": 0.0,
      "learning_rate": 0.00018185117967332124,
      "loss": 2.5243,
      "mean_token_accuracy": 0.5516170412302017,
      "num_tokens": 77676.0,
      "step": 56
    },
    {
      "entropy": 1.1219291388988495,
      "epoch": 0.41229656419529837,
      "grad_norm": 0.0,
      "learning_rate": 0.00018148820326678765,
      "loss": 2.5717,
      "mean_token_accuracy": 0.5452308058738708,
      "num_tokens": 79014.0,
      "step": 57
    },
    {
      "entropy": 1.1263241171836853,
      "epoch": 0.41952983725135623,
      "grad_norm": 0.0,
      "learning_rate": 0.00018112522686025408,
      "loss": 2.5163,
      "mean_token_accuracy": 0.5439134389162064,
      "num_tokens": 80362.0,
      "step": 58
    },
    {
      "entropy": 1.0441412180662155,
      "epoch": 0.4267631103074141,
      "grad_norm": 0.0,
      "learning_rate": 0.00018076225045372054,
      "loss": 2.4519,
      "mean_token_accuracy": 0.5616925954818726,
      "num_tokens": 81795.0,
      "step": 59
    },
    {
      "entropy": 1.078106850385666,
      "epoch": 0.43399638336347196,
      "grad_norm": 0.0,
      "learning_rate": 0.00018039927404718694,
      "loss": 2.5087,
      "mean_token_accuracy": 0.5514906644821167,
      "num_tokens": 83176.0,
      "step": 60
    },
    {
      "entropy": 1.0974124670028687,
      "epoch": 0.4412296564195298,
      "grad_norm": 0.0,
      "learning_rate": 0.00018003629764065337,
      "loss": 2.6424,
      "mean_token_accuracy": 0.5243298336863518,
      "num_tokens": 84553.0,
      "step": 61
    },
    {
      "entropy": 1.1125823557376862,
      "epoch": 0.4484629294755877,
      "grad_norm": 0.0,
      "learning_rate": 0.0001796733212341198,
      "loss": 2.4969,
      "mean_token_accuracy": 0.5476825684309006,
      "num_tokens": 85861.0,
      "step": 62
    },
    {
      "entropy": 1.1592190861701965,
      "epoch": 0.45569620253164556,
      "grad_norm": 0.0,
      "learning_rate": 0.0001793103448275862,
      "loss": 2.6226,
      "mean_token_accuracy": 0.5309869945049286,
      "num_tokens": 87231.0,
      "step": 63
    },
    {
      "entropy": 1.0958653390407562,
      "epoch": 0.4629294755877034,
      "grad_norm": 0.0,
      "learning_rate": 0.00017894736842105264,
      "loss": 2.5135,
      "mean_token_accuracy": 0.5463672578334808,
      "num_tokens": 88563.0,
      "step": 64
    },
    {
      "entropy": 1.079408586025238,
      "epoch": 0.4701627486437613,
      "grad_norm": 0.0,
      "learning_rate": 0.00017858439201451907,
      "loss": 2.4558,
      "mean_token_accuracy": 0.5488910973072052,
      "num_tokens": 89953.0,
      "step": 65
    },
    {
      "entropy": 1.1434482634067535,
      "epoch": 0.47739602169981915,
      "grad_norm": 0.0,
      "learning_rate": 0.0001782214156079855,
      "loss": 2.51,
      "mean_token_accuracy": 0.5425025671720505,
      "num_tokens": 91361.0,
      "step": 66
    },
    {
      "entropy": 1.1196760833263397,
      "epoch": 0.484629294755877,
      "grad_norm": 0.0,
      "learning_rate": 0.0001778584392014519,
      "loss": 2.5184,
      "mean_token_accuracy": 0.5284354239702225,
      "num_tokens": 92710.0,
      "step": 67
    },
    {
      "entropy": 1.1134625673294067,
      "epoch": 0.4918625678119349,
      "grad_norm": 0.0,
      "learning_rate": 0.00017749546279491833,
      "loss": 2.5273,
      "mean_token_accuracy": 0.5285885334014893,
      "num_tokens": 94147.0,
      "step": 68
    },
    {
      "entropy": 1.0853535532951355,
      "epoch": 0.49909584086799275,
      "grad_norm": 0.0,
      "learning_rate": 0.00017713248638838477,
      "loss": 2.5684,
      "mean_token_accuracy": 0.5399613529443741,
      "num_tokens": 95496.0,
      "step": 69
    },
    {
      "entropy": 1.1049972176551819,
      "epoch": 0.5063291139240507,
      "grad_norm": 0.0,
      "learning_rate": 0.00017676950998185117,
      "loss": 2.5876,
      "mean_token_accuracy": 0.5382668077945709,
      "num_tokens": 96879.0,
      "step": 70
    },
    {
      "entropy": 1.0942886769771576,
      "epoch": 0.5135623869801085,
      "grad_norm": 0.0,
      "learning_rate": 0.00017640653357531763,
      "loss": 2.5334,
      "mean_token_accuracy": 0.540867269039154,
      "num_tokens": 98252.0,
      "step": 71
    },
    {
      "entropy": 1.101898044347763,
      "epoch": 0.5207956600361664,
      "grad_norm": 0.0,
      "learning_rate": 0.00017604355716878403,
      "loss": 2.4839,
      "mean_token_accuracy": 0.5541739910840988,
      "num_tokens": 99645.0,
      "step": 72
    },
    {
      "entropy": 1.056696429848671,
      "epoch": 0.5280289330922242,
      "grad_norm": 0.0,
      "learning_rate": 0.00017568058076225046,
      "loss": 2.5304,
      "mean_token_accuracy": 0.5393856465816498,
      "num_tokens": 101032.0,
      "step": 73
    },
    {
      "entropy": 1.1291647851467133,
      "epoch": 0.5352622061482821,
      "grad_norm": 0.0,
      "learning_rate": 0.0001753176043557169,
      "loss": 2.5736,
      "mean_token_accuracy": 0.5367253422737122,
      "num_tokens": 102394.0,
      "step": 74
    },
    {
      "entropy": 1.1129019260406494,
      "epoch": 0.5424954792043399,
      "grad_norm": 0.0,
      "learning_rate": 0.00017495462794918333,
      "loss": 2.4803,
      "mean_token_accuracy": 0.5489845424890518,
      "num_tokens": 103803.0,
      "step": 75
    },
    {
      "entropy": 1.0258008986711502,
      "epoch": 0.5497287522603979,
      "grad_norm": 0.0,
      "learning_rate": 0.00017459165154264973,
      "loss": 2.4433,
      "mean_token_accuracy": 0.562438115477562,
      "num_tokens": 105227.0,
      "step": 76
    },
    {
      "entropy": 1.0876842439174652,
      "epoch": 0.5569620253164557,
      "grad_norm": 0.0,
      "learning_rate": 0.00017422867513611616,
      "loss": 2.4997,
      "mean_token_accuracy": 0.5498750060796738,
      "num_tokens": 106564.0,
      "step": 77
    },
    {
      "entropy": 1.1114664673805237,
      "epoch": 0.5641952983725136,
      "grad_norm": 0.0,
      "learning_rate": 0.0001738656987295826,
      "loss": 2.4511,
      "mean_token_accuracy": 0.5540181249380112,
      "num_tokens": 107971.0,
      "step": 78
    },
    {
      "entropy": 1.1240213513374329,
      "epoch": 0.5714285714285714,
      "grad_norm": 0.0,
      "learning_rate": 0.000173502722323049,
      "loss": 2.5108,
      "mean_token_accuracy": 0.5341273844242096,
      "num_tokens": 109325.0,
      "step": 79
    },
    {
      "entropy": 1.126657396554947,
      "epoch": 0.5786618444846293,
      "grad_norm": 0.0,
      "learning_rate": 0.00017313974591651543,
      "loss": 2.55,
      "mean_token_accuracy": 0.534925252199173,
      "num_tokens": 110659.0,
      "step": 80
    },
    {
      "entropy": 1.0591940432786942,
      "epoch": 0.5858951175406871,
      "grad_norm": 0.0,
      "learning_rate": 0.00017277676950998186,
      "loss": 2.5122,
      "mean_token_accuracy": 0.5615774989128113,
      "num_tokens": 112064.0,
      "step": 81
    },
    {
      "entropy": 1.080923855304718,
      "epoch": 0.593128390596745,
      "grad_norm": 0.0,
      "learning_rate": 0.00017241379310344826,
      "loss": 2.5076,
      "mean_token_accuracy": 0.5642599016427994,
      "num_tokens": 113390.0,
      "step": 82
    },
    {
      "entropy": 1.0417269170284271,
      "epoch": 0.6003616636528029,
      "grad_norm": 0.0,
      "learning_rate": 0.00017205081669691472,
      "loss": 2.5414,
      "mean_token_accuracy": 0.5496295690536499,
      "num_tokens": 114784.0,
      "step": 83
    },
    {
      "entropy": 1.0593893826007843,
      "epoch": 0.6075949367088608,
      "grad_norm": 0.0,
      "learning_rate": 0.00017168784029038115,
      "loss": 2.4682,
      "mean_token_accuracy": 0.5482211858034134,
      "num_tokens": 116148.0,
      "step": 84
    },
    {
      "entropy": 1.1121591329574585,
      "epoch": 0.6148282097649186,
      "grad_norm": 0.0,
      "learning_rate": 0.00017132486388384756,
      "loss": 2.5569,
      "mean_token_accuracy": 0.5326060503721237,
      "num_tokens": 117506.0,
      "step": 85
    },
    {
      "entropy": 1.0343488901853561,
      "epoch": 0.6220614828209765,
      "grad_norm": 0.0,
      "learning_rate": 0.000170961887477314,
      "loss": 2.4415,
      "mean_token_accuracy": 0.5560837835073471,
      "num_tokens": 118976.0,
      "step": 86
    },
    {
      "entropy": 1.1269948780536652,
      "epoch": 0.6292947558770343,
      "grad_norm": 0.0,
      "learning_rate": 0.00017059891107078042,
      "loss": 2.4604,
      "mean_token_accuracy": 0.5488670170307159,
      "num_tokens": 120366.0,
      "step": 87
    },
    {
      "entropy": 1.0923850238323212,
      "epoch": 0.6365280289330922,
      "grad_norm": 0.0,
      "learning_rate": 0.00017023593466424682,
      "loss": 2.4487,
      "mean_token_accuracy": 0.5475012511014938,
      "num_tokens": 121782.0,
      "step": 88
    },
    {
      "entropy": 1.1426692605018616,
      "epoch": 0.64376130198915,
      "grad_norm": 0.0,
      "learning_rate": 0.00016987295825771325,
      "loss": 2.4899,
      "mean_token_accuracy": 0.5527313947677612,
      "num_tokens": 123192.0,
      "step": 89
    },
    {
      "entropy": 1.1227373778820038,
      "epoch": 0.650994575045208,
      "grad_norm": 0.0,
      "learning_rate": 0.00016950998185117968,
      "loss": 2.5995,
      "mean_token_accuracy": 0.541309654712677,
      "num_tokens": 124494.0,
      "step": 90
    },
    {
      "entropy": 1.1019038259983063,
      "epoch": 0.6582278481012658,
      "grad_norm": 0.0,
      "learning_rate": 0.0001691470054446461,
      "loss": 2.5882,
      "mean_token_accuracy": 0.5407281070947647,
      "num_tokens": 125814.0,
      "step": 91
    },
    {
      "entropy": 1.1230742931365967,
      "epoch": 0.6654611211573237,
      "grad_norm": 0.0,
      "learning_rate": 0.00016878402903811252,
      "loss": 2.4567,
      "mean_token_accuracy": 0.5464838892221451,
      "num_tokens": 127254.0,
      "step": 92
    },
    {
      "entropy": 1.108274668455124,
      "epoch": 0.6726943942133815,
      "grad_norm": 0.0,
      "learning_rate": 0.00016842105263157895,
      "loss": 2.6057,
      "mean_token_accuracy": 0.5359428226947784,
      "num_tokens": 128607.0,
      "step": 93
    },
    {
      "entropy": 1.0955757647752762,
      "epoch": 0.6799276672694394,
      "grad_norm": 0.0,
      "learning_rate": 0.00016805807622504538,
      "loss": 2.5135,
      "mean_token_accuracy": 0.5578029453754425,
      "num_tokens": 129995.0,
      "step": 94
    },
    {
      "entropy": 1.0912747085094452,
      "epoch": 0.6871609403254972,
      "grad_norm": 0.0,
      "learning_rate": 0.0001676950998185118,
      "loss": 2.5535,
      "mean_token_accuracy": 0.5480784773826599,
      "num_tokens": 131342.0,
      "step": 95
    },
    {
      "entropy": 1.109260231256485,
      "epoch": 0.6943942133815552,
      "grad_norm": 0.0,
      "learning_rate": 0.00016733212341197824,
      "loss": 2.5485,
      "mean_token_accuracy": 0.5332305133342743,
      "num_tokens": 132706.0,
      "step": 96
    },
    {
      "entropy": 1.1296232044696808,
      "epoch": 0.701627486437613,
      "grad_norm": 0.0,
      "learning_rate": 0.00016696914700544465,
      "loss": 2.4693,
      "mean_token_accuracy": 0.5429563969373703,
      "num_tokens": 134068.0,
      "step": 97
    },
    {
      "entropy": 1.110869139432907,
      "epoch": 0.7088607594936709,
      "grad_norm": 0.0,
      "learning_rate": 0.00016660617059891108,
      "loss": 2.544,
      "mean_token_accuracy": 0.5363138318061829,
      "num_tokens": 135503.0,
      "step": 98
    },
    {
      "entropy": 1.1182821094989777,
      "epoch": 0.7160940325497287,
      "grad_norm": 0.0,
      "learning_rate": 0.0001662431941923775,
      "loss": 2.4547,
      "mean_token_accuracy": 0.5564229190349579,
      "num_tokens": 136916.0,
      "step": 99
    },
    {
      "entropy": 1.123103141784668,
      "epoch": 0.7233273056057866,
      "grad_norm": 0.0,
      "learning_rate": 0.00016588021778584392,
      "loss": 2.5336,
      "mean_token_accuracy": 0.5383445471525192,
      "num_tokens": 138301.0,
      "step": 100
    },
    {
      "entropy": 1.0634585916996002,
      "epoch": 0.7305605786618445,
      "grad_norm": 0.0,
      "learning_rate": 0.00016551724137931035,
      "loss": 2.4965,
      "mean_token_accuracy": 0.553234338760376,
      "num_tokens": 139688.0,
      "step": 101
    },
    {
      "entropy": 1.0800490379333496,
      "epoch": 0.7377938517179023,
      "grad_norm": 0.0,
      "learning_rate": 0.00016515426497277678,
      "loss": 2.5361,
      "mean_token_accuracy": 0.5373943001031876,
      "num_tokens": 141048.0,
      "step": 102
    },
    {
      "entropy": 1.1085204184055328,
      "epoch": 0.7450271247739603,
      "grad_norm": 0.0,
      "learning_rate": 0.0001647912885662432,
      "loss": 2.5261,
      "mean_token_accuracy": 0.5488307178020477,
      "num_tokens": 142376.0,
      "step": 103
    },
    {
      "entropy": 1.101633608341217,
      "epoch": 0.7522603978300181,
      "grad_norm": 0.0,
      "learning_rate": 0.0001644283121597096,
      "loss": 2.465,
      "mean_token_accuracy": 0.5500718951225281,
      "num_tokens": 143718.0,
      "step": 104
    },
    {
      "entropy": 1.0608249604701996,
      "epoch": 0.759493670886076,
      "grad_norm": 0.0,
      "learning_rate": 0.00016406533575317604,
      "loss": 2.4165,
      "mean_token_accuracy": 0.5679080486297607,
      "num_tokens": 145194.0,
      "step": 105
    },
    {
      "entropy": 1.0754668712615967,
      "epoch": 0.7667269439421338,
      "grad_norm": 0.0,
      "learning_rate": 0.00016370235934664247,
      "loss": 2.5464,
      "mean_token_accuracy": 0.5387931615114212,
      "num_tokens": 146550.0,
      "step": 106
    },
    {
      "entropy": 1.1067388951778412,
      "epoch": 0.7739602169981917,
      "grad_norm": 0.0,
      "learning_rate": 0.0001633393829401089,
      "loss": 2.5155,
      "mean_token_accuracy": 0.5482369661331177,
      "num_tokens": 147913.0,
      "step": 107
    },
    {
      "entropy": 1.1152404248714447,
      "epoch": 0.7811934900542495,
      "grad_norm": 0.0,
      "learning_rate": 0.00016297640653357534,
      "loss": 2.5465,
      "mean_token_accuracy": 0.5414672940969467,
      "num_tokens": 149291.0,
      "step": 108
    },
    {
      "entropy": 1.125922292470932,
      "epoch": 0.7884267631103075,
      "grad_norm": 0.0,
      "learning_rate": 0.00016261343012704177,
      "loss": 2.5742,
      "mean_token_accuracy": 0.5460067391395569,
      "num_tokens": 150642.0,
      "step": 109
    },
    {
      "entropy": 1.1228927671909332,
      "epoch": 0.7956600361663653,
      "grad_norm": 0.0,
      "learning_rate": 0.00016225045372050817,
      "loss": 2.4777,
      "mean_token_accuracy": 0.5416840761899948,
      "num_tokens": 151976.0,
      "step": 110
    },
    {
      "entropy": 1.1512284874916077,
      "epoch": 0.8028933092224232,
      "grad_norm": 0.0,
      "learning_rate": 0.0001618874773139746,
      "loss": 2.4426,
      "mean_token_accuracy": 0.5420899093151093,
      "num_tokens": 153365.0,
      "step": 111
    },
    {
      "entropy": 1.0402842164039612,
      "epoch": 0.810126582278481,
      "grad_norm": 0.0,
      "learning_rate": 0.00016152450090744103,
      "loss": 2.511,
      "mean_token_accuracy": 0.5547743141651154,
      "num_tokens": 154672.0,
      "step": 112
    },
    {
      "entropy": 1.070629358291626,
      "epoch": 0.8173598553345389,
      "grad_norm": 0.0,
      "learning_rate": 0.00016116152450090744,
      "loss": 2.5149,
      "mean_token_accuracy": 0.5439868271350861,
      "num_tokens": 156040.0,
      "step": 113
    },
    {
      "entropy": 1.0861331224441528,
      "epoch": 0.8245931283905967,
      "grad_norm": 0.0,
      "learning_rate": 0.00016079854809437387,
      "loss": 2.5305,
      "mean_token_accuracy": 0.533889502286911,
      "num_tokens": 157355.0,
      "step": 114
    },
    {
      "entropy": 1.0892849266529083,
      "epoch": 0.8318264014466547,
      "grad_norm": 0.0,
      "learning_rate": 0.0001604355716878403,
      "loss": 2.5118,
      "mean_token_accuracy": 0.5504042059183121,
      "num_tokens": 158758.0,
      "step": 115
    },
    {
      "entropy": 1.0792168974876404,
      "epoch": 0.8390596745027125,
      "grad_norm": 0.0,
      "learning_rate": 0.0001600725952813067,
      "loss": 2.6109,
      "mean_token_accuracy": 0.5412319302558899,
      "num_tokens": 160121.0,
      "step": 116
    },
    {
      "entropy": 1.1541117131710052,
      "epoch": 0.8462929475587704,
      "grad_norm": 0.0,
      "learning_rate": 0.00015970961887477314,
      "loss": 2.4966,
      "mean_token_accuracy": 0.5413856208324432,
      "num_tokens": 161480.0,
      "step": 117
    },
    {
      "entropy": 1.089859127998352,
      "epoch": 0.8535262206148282,
      "grad_norm": 0.0,
      "learning_rate": 0.0001593466424682396,
      "loss": 2.5301,
      "mean_token_accuracy": 0.5545037686824799,
      "num_tokens": 162869.0,
      "step": 118
    },
    {
      "entropy": 1.0838179290294647,
      "epoch": 0.8607594936708861,
      "grad_norm": 0.0,
      "learning_rate": 0.000158983666061706,
      "loss": 2.5073,
      "mean_token_accuracy": 0.5579670369625092,
      "num_tokens": 164228.0,
      "step": 119
    },
    {
      "entropy": 1.156173586845398,
      "epoch": 0.8679927667269439,
      "grad_norm": 0.0,
      "learning_rate": 0.00015862068965517243,
      "loss": 2.5787,
      "mean_token_accuracy": 0.5258647873997688,
      "num_tokens": 165624.0,
      "step": 120
    },
    {
      "entropy": 1.1117530465126038,
      "epoch": 0.8752260397830018,
      "grad_norm": 0.0,
      "learning_rate": 0.00015825771324863886,
      "loss": 2.6018,
      "mean_token_accuracy": 0.5411983877420425,
      "num_tokens": 167013.0,
      "step": 121
    },
    {
      "entropy": 1.1654876172542572,
      "epoch": 0.8824593128390597,
      "grad_norm": 0.0,
      "learning_rate": 0.00015789473684210527,
      "loss": 2.5169,
      "mean_token_accuracy": 0.551274299621582,
      "num_tokens": 168361.0,
      "step": 122
    },
    {
      "entropy": 1.1018896400928497,
      "epoch": 0.8896925858951176,
      "grad_norm": 0.0,
      "learning_rate": 0.0001575317604355717,
      "loss": 2.5214,
      "mean_token_accuracy": 0.5413067489862442,
      "num_tokens": 169751.0,
      "step": 123
    },
    {
      "entropy": 1.0984440445899963,
      "epoch": 0.8969258589511754,
      "grad_norm": 0.0,
      "learning_rate": 0.00015716878402903813,
      "loss": 2.5656,
      "mean_token_accuracy": 0.5381608605384827,
      "num_tokens": 171127.0,
      "step": 124
    },
    {
      "entropy": 1.0864254534244537,
      "epoch": 0.9041591320072333,
      "grad_norm": 0.0,
      "learning_rate": 0.00015680580762250453,
      "loss": 2.5288,
      "mean_token_accuracy": 0.5271068960428238,
      "num_tokens": 172564.0,
      "step": 125
    },
    {
      "entropy": 1.0896047800779343,
      "epoch": 0.9113924050632911,
      "grad_norm": 0.0,
      "learning_rate": 0.00015644283121597096,
      "loss": 2.4613,
      "mean_token_accuracy": 0.5481883883476257,
      "num_tokens": 173965.0,
      "step": 126
    },
    {
      "entropy": 1.0906076729297638,
      "epoch": 0.918625678119349,
      "grad_norm": 0.0,
      "learning_rate": 0.0001560798548094374,
      "loss": 2.4223,
      "mean_token_accuracy": 0.5527500957250595,
      "num_tokens": 175364.0,
      "step": 127
    },
    {
      "entropy": 1.064597100019455,
      "epoch": 0.9258589511754068,
      "grad_norm": 0.0,
      "learning_rate": 0.0001557168784029038,
      "loss": 2.5124,
      "mean_token_accuracy": 0.551444873213768,
      "num_tokens": 176694.0,
      "step": 128
    },
    {
      "entropy": 1.1372613608837128,
      "epoch": 0.9330922242314648,
      "grad_norm": 0.0,
      "learning_rate": 0.00015535390199637023,
      "loss": 2.5197,
      "mean_token_accuracy": 0.5470752865076065,
      "num_tokens": 178074.0,
      "step": 129
    },
    {
      "entropy": 1.1365208625793457,
      "epoch": 0.9403254972875226,
      "grad_norm": 0.0,
      "learning_rate": 0.0001549909255898367,
      "loss": 2.5013,
      "mean_token_accuracy": 0.5439895391464233,
      "num_tokens": 179506.0,
      "step": 130
    },
    {
      "entropy": 1.1201069951057434,
      "epoch": 0.9475587703435805,
      "grad_norm": 0.0,
      "learning_rate": 0.0001546279491833031,
      "loss": 2.4971,
      "mean_token_accuracy": 0.5508686006069183,
      "num_tokens": 180890.0,
      "step": 131
    },
    {
      "entropy": 1.1062006503343582,
      "epoch": 0.9547920433996383,
      "grad_norm": 0.0,
      "learning_rate": 0.00015426497277676952,
      "loss": 2.5202,
      "mean_token_accuracy": 0.5457829236984253,
      "num_tokens": 182250.0,
      "step": 132
    },
    {
      "entropy": 1.08922478556633,
      "epoch": 0.9620253164556962,
      "grad_norm": 0.0,
      "learning_rate": 0.00015390199637023595,
      "loss": 2.5269,
      "mean_token_accuracy": 0.548294797539711,
      "num_tokens": 183644.0,
      "step": 133
    },
    {
      "entropy": 1.0965730547904968,
      "epoch": 0.969258589511754,
      "grad_norm": 0.0,
      "learning_rate": 0.00015353901996370236,
      "loss": 2.5512,
      "mean_token_accuracy": 0.5375129878520966,
      "num_tokens": 184988.0,
      "step": 134
    },
    {
      "entropy": 1.047539085149765,
      "epoch": 0.976491862567812,
      "grad_norm": 0.0,
      "learning_rate": 0.0001531760435571688,
      "loss": 2.4542,
      "mean_token_accuracy": 0.5518470257520676,
      "num_tokens": 186434.0,
      "step": 135
    },
    {
      "entropy": 1.058751568198204,
      "epoch": 0.9837251356238698,
      "grad_norm": 0.0,
      "learning_rate": 0.00015281306715063522,
      "loss": 2.4307,
      "mean_token_accuracy": 0.5505292564630508,
      "num_tokens": 187884.0,
      "step": 136
    },
    {
      "entropy": 1.0880784392356873,
      "epoch": 0.9909584086799277,
      "grad_norm": 0.0,
      "learning_rate": 0.00015245009074410162,
      "loss": 2.4285,
      "mean_token_accuracy": 0.5528350919485092,
      "num_tokens": 189290.0,
      "step": 137
    },
    {
      "entropy": 1.057303100824356,
      "epoch": 0.9981916817359855,
      "grad_norm": 0.0,
      "learning_rate": 0.00015208711433756806,
      "loss": 2.5462,
      "mean_token_accuracy": 0.5505049228668213,
      "num_tokens": 190628.0,
      "step": 138
    },
    {
      "entropy": 1.1205418109893799,
      "epoch": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 0.00015172413793103449,
      "loss": 2.7426,
      "mean_token_accuracy": 0.518750011920929,
      "num_tokens": 190790.0,
      "step": 139
    },
    {
      "entropy": 1.0945520102977753,
      "epoch": 1.0072332730560578,
      "grad_norm": 0.0,
      "learning_rate": 0.00015136116152450092,
      "loss": 2.4704,
      "mean_token_accuracy": 0.5627106875181198,
      "num_tokens": 192197.0,
      "step": 140
    },
    {
      "entropy": 1.115374892950058,
      "epoch": 1.0144665461121158,
      "grad_norm": 0.0,
      "learning_rate": 0.00015099818511796735,
      "loss": 2.5376,
      "mean_token_accuracy": 0.5344210714101791,
      "num_tokens": 193569.0,
      "step": 141
    },
    {
      "entropy": 1.0689078569412231,
      "epoch": 1.0216998191681737,
      "grad_norm": 0.0,
      "learning_rate": 0.00015063520871143378,
      "loss": 2.4321,
      "mean_token_accuracy": 0.5700321197509766,
      "num_tokens": 194987.0,
      "step": 142
    },
    {
      "entropy": 1.0969620048999786,
      "epoch": 1.0289330922242315,
      "grad_norm": 0.0,
      "learning_rate": 0.00015027223230490018,
      "loss": 2.5325,
      "mean_token_accuracy": 0.5415942668914795,
      "num_tokens": 196398.0,
      "step": 143
    },
    {
      "entropy": 1.0174528360366821,
      "epoch": 1.0361663652802893,
      "grad_norm": 0.0,
      "learning_rate": 0.00014990925589836661,
      "loss": 2.4498,
      "mean_token_accuracy": 0.5438085496425629,
      "num_tokens": 197820.0,
      "step": 144
    },
    {
      "entropy": 1.1043200492858887,
      "epoch": 1.0433996383363473,
      "grad_norm": 0.0,
      "learning_rate": 0.00014954627949183305,
      "loss": 2.458,
      "mean_token_accuracy": 0.5458298474550247,
      "num_tokens": 199213.0,
      "step": 145
    },
    {
      "entropy": 1.1462195813655853,
      "epoch": 1.0506329113924051,
      "grad_norm": 0.0,
      "learning_rate": 0.00014918330308529948,
      "loss": 2.5472,
      "mean_token_accuracy": 0.5426424294710159,
      "num_tokens": 200561.0,
      "step": 146
    },
    {
      "entropy": 1.139638602733612,
      "epoch": 1.057866184448463,
      "grad_norm": 0.0,
      "learning_rate": 0.00014882032667876588,
      "loss": 2.4519,
      "mean_token_accuracy": 0.5445922464132309,
      "num_tokens": 201975.0,
      "step": 147
    },
    {
      "entropy": 1.0623964667320251,
      "epoch": 1.0650994575045207,
      "grad_norm": 0.0,
      "learning_rate": 0.0001484573502722323,
      "loss": 2.465,
      "mean_token_accuracy": 0.5589400827884674,
      "num_tokens": 203414.0,
      "step": 148
    },
    {
      "entropy": 1.076666384935379,
      "epoch": 1.0723327305605788,
      "grad_norm": 0.0,
      "learning_rate": 0.00014809437386569874,
      "loss": 2.5235,
      "mean_token_accuracy": 0.5452157557010651,
      "num_tokens": 204796.0,
      "step": 149
    },
    {
      "entropy": 1.1187179684638977,
      "epoch": 1.0795660036166366,
      "grad_norm": 0.0,
      "learning_rate": 0.00014773139745916515,
      "loss": 2.5372,
      "mean_token_accuracy": 0.5426393896341324,
      "num_tokens": 206181.0,
      "step": 150
    },
    {
      "entropy": 1.0679790079593658,
      "epoch": 1.0867992766726944,
      "grad_norm": 0.0,
      "learning_rate": 0.00014736842105263158,
      "loss": 2.4031,
      "mean_token_accuracy": 0.5628975033760071,
      "num_tokens": 207604.0,
      "step": 151
    },
    {
      "entropy": 1.1154986917972565,
      "epoch": 1.0940325497287522,
      "grad_norm": 0.0,
      "learning_rate": 0.000147005444646098,
      "loss": 2.5325,
      "mean_token_accuracy": 0.5442689210176468,
      "num_tokens": 208978.0,
      "step": 152
    },
    {
      "entropy": 1.0788207352161407,
      "epoch": 1.1012658227848102,
      "grad_norm": 0.0,
      "learning_rate": 0.00014664246823956444,
      "loss": 2.4932,
      "mean_token_accuracy": 0.5427374988794327,
      "num_tokens": 210332.0,
      "step": 153
    },
    {
      "entropy": 1.1146452724933624,
      "epoch": 1.108499095840868,
      "grad_norm": 0.0,
      "learning_rate": 0.00014627949183303087,
      "loss": 2.5571,
      "mean_token_accuracy": 0.5369889438152313,
      "num_tokens": 211700.0,
      "step": 154
    },
    {
      "entropy": 1.1245636343955994,
      "epoch": 1.1157323688969258,
      "grad_norm": 0.0,
      "learning_rate": 0.0001459165154264973,
      "loss": 2.6096,
      "mean_token_accuracy": 0.5278495326638222,
      "num_tokens": 213050.0,
      "step": 155
    },
    {
      "entropy": 1.1087840497493744,
      "epoch": 1.1229656419529837,
      "grad_norm": 0.0,
      "learning_rate": 0.0001455535390199637,
      "loss": 2.5262,
      "mean_token_accuracy": 0.5457671284675598,
      "num_tokens": 214475.0,
      "step": 156
    },
    {
      "entropy": 1.0188728719949722,
      "epoch": 1.1301989150090417,
      "grad_norm": 0.0,
      "learning_rate": 0.00014519056261343014,
      "loss": 2.4321,
      "mean_token_accuracy": 0.5751179605722427,
      "num_tokens": 215929.0,
      "step": 157
    },
    {
      "entropy": 1.127344325184822,
      "epoch": 1.1374321880650995,
      "grad_norm": 0.0,
      "learning_rate": 0.00014482758620689657,
      "loss": 2.6004,
      "mean_token_accuracy": 0.5295312106609344,
      "num_tokens": 217327.0,
      "step": 158
    },
    {
      "entropy": 1.104835420846939,
      "epoch": 1.1446654611211573,
      "grad_norm": 0.0,
      "learning_rate": 0.00014446460980036297,
      "loss": 2.4725,
      "mean_token_accuracy": 0.5560386776924133,
      "num_tokens": 218718.0,
      "step": 159
    },
    {
      "entropy": 1.098018765449524,
      "epoch": 1.1518987341772151,
      "grad_norm": 0.0,
      "learning_rate": 0.0001441016333938294,
      "loss": 2.5442,
      "mean_token_accuracy": 0.5397144109010696,
      "num_tokens": 220055.0,
      "step": 160
    },
    {
      "entropy": 1.1457242369651794,
      "epoch": 1.1591320072332731,
      "grad_norm": 0.0,
      "learning_rate": 0.00014373865698729584,
      "loss": 2.5436,
      "mean_token_accuracy": 0.5351956188678741,
      "num_tokens": 221397.0,
      "step": 161
    },
    {
      "entropy": 1.092559427022934,
      "epoch": 1.166365280289331,
      "grad_norm": 0.0,
      "learning_rate": 0.00014337568058076224,
      "loss": 2.4248,
      "mean_token_accuracy": 0.5560291260480881,
      "num_tokens": 222789.0,
      "step": 162
    },
    {
      "entropy": 1.0653438866138458,
      "epoch": 1.1735985533453888,
      "grad_norm": 0.0,
      "learning_rate": 0.00014301270417422867,
      "loss": 2.5878,
      "mean_token_accuracy": 0.5465674847364426,
      "num_tokens": 224159.0,
      "step": 163
    },
    {
      "entropy": 1.1752942502498627,
      "epoch": 1.1808318264014466,
      "grad_norm": 0.0,
      "learning_rate": 0.0001426497277676951,
      "loss": 2.5277,
      "mean_token_accuracy": 0.5325864478945732,
      "num_tokens": 225582.0,
      "step": 164
    },
    {
      "entropy": 1.0615761578083038,
      "epoch": 1.1880650994575046,
      "grad_norm": 0.0,
      "learning_rate": 0.00014228675136116153,
      "loss": 2.4925,
      "mean_token_accuracy": 0.5536267906427383,
      "num_tokens": 226983.0,
      "step": 165
    },
    {
      "entropy": 1.1374837756156921,
      "epoch": 1.1952983725135624,
      "grad_norm": 0.0,
      "learning_rate": 0.00014192377495462796,
      "loss": 2.4594,
      "mean_token_accuracy": 0.5434800386428833,
      "num_tokens": 228399.0,
      "step": 166
    },
    {
      "entropy": 1.1201145946979523,
      "epoch": 1.2025316455696202,
      "grad_norm": 0.0,
      "learning_rate": 0.0001415607985480944,
      "loss": 2.4985,
      "mean_token_accuracy": 0.5561137795448303,
      "num_tokens": 229805.0,
      "step": 167
    },
    {
      "entropy": 1.1054023802280426,
      "epoch": 1.209764918625678,
      "grad_norm": 0.0,
      "learning_rate": 0.0001411978221415608,
      "loss": 2.4873,
      "mean_token_accuracy": 0.547326996922493,
      "num_tokens": 231177.0,
      "step": 168
    },
    {
      "entropy": 1.072383537888527,
      "epoch": 1.216998191681736,
      "grad_norm": 0.0,
      "learning_rate": 0.00014083484573502723,
      "loss": 2.4556,
      "mean_token_accuracy": 0.5583519786596298,
      "num_tokens": 232645.0,
      "step": 169
    },
    {
      "entropy": 1.085743099451065,
      "epoch": 1.2242314647377939,
      "grad_norm": 0.0,
      "learning_rate": 0.00014047186932849366,
      "loss": 2.5287,
      "mean_token_accuracy": 0.5549855530261993,
      "num_tokens": 233952.0,
      "step": 170
    },
    {
      "entropy": 1.0745936632156372,
      "epoch": 1.2314647377938517,
      "grad_norm": 0.0,
      "learning_rate": 0.00014010889292196007,
      "loss": 2.5143,
      "mean_token_accuracy": 0.5558921694755554,
      "num_tokens": 235287.0,
      "step": 171
    },
    {
      "entropy": 1.0693705677986145,
      "epoch": 1.2386980108499095,
      "grad_norm": 0.0,
      "learning_rate": 0.0001397459165154265,
      "loss": 2.4938,
      "mean_token_accuracy": 0.5452517122030258,
      "num_tokens": 236670.0,
      "step": 172
    },
    {
      "entropy": 1.135264903306961,
      "epoch": 1.2459312839059675,
      "grad_norm": 0.0,
      "learning_rate": 0.00013938294010889293,
      "loss": 2.5618,
      "mean_token_accuracy": 0.5449370294809341,
      "num_tokens": 238001.0,
      "step": 173
    },
    {
      "entropy": 1.0745739191770554,
      "epoch": 1.2531645569620253,
      "grad_norm": 0.0,
      "learning_rate": 0.00013901996370235933,
      "loss": 2.4831,
      "mean_token_accuracy": 0.5497026294469833,
      "num_tokens": 239380.0,
      "step": 174
    },
    {
      "entropy": 1.1606217920780182,
      "epoch": 1.2603978300180831,
      "grad_norm": 0.0,
      "learning_rate": 0.00013865698729582576,
      "loss": 2.5234,
      "mean_token_accuracy": 0.5369236767292023,
      "num_tokens": 240671.0,
      "step": 175
    },
    {
      "entropy": 1.1321864128112793,
      "epoch": 1.267631103074141,
      "grad_norm": 0.0,
      "learning_rate": 0.00013829401088929222,
      "loss": 2.568,
      "mean_token_accuracy": 0.5358796268701553,
      "num_tokens": 241999.0,
      "step": 176
    },
    {
      "entropy": 1.0919426679611206,
      "epoch": 1.274864376130199,
      "grad_norm": 0.0,
      "learning_rate": 0.00013793103448275863,
      "loss": 2.4199,
      "mean_token_accuracy": 0.5525386333465576,
      "num_tokens": 243432.0,
      "step": 177
    },
    {
      "entropy": 1.1321356147527695,
      "epoch": 1.2820976491862568,
      "grad_norm": 0.0,
      "learning_rate": 0.00013756805807622506,
      "loss": 2.5929,
      "mean_token_accuracy": 0.5318445116281509,
      "num_tokens": 244799.0,
      "step": 178
    },
    {
      "entropy": 1.0925188958644867,
      "epoch": 1.2893309222423146,
      "grad_norm": 0.0,
      "learning_rate": 0.0001372050816696915,
      "loss": 2.412,
      "mean_token_accuracy": 0.5604078769683838,
      "num_tokens": 246201.0,
      "step": 179
    },
    {
      "entropy": 1.0641421675682068,
      "epoch": 1.2965641952983726,
      "grad_norm": 0.0,
      "learning_rate": 0.0001368421052631579,
      "loss": 2.3832,
      "mean_token_accuracy": 0.5665835738182068,
      "num_tokens": 247652.0,
      "step": 180
    },
    {
      "entropy": 1.1118652522563934,
      "epoch": 1.3037974683544304,
      "grad_norm": 0.0,
      "learning_rate": 0.00013647912885662432,
      "loss": 2.5912,
      "mean_token_accuracy": 0.5334130972623825,
      "num_tokens": 249011.0,
      "step": 181
    },
    {
      "entropy": 1.093698412179947,
      "epoch": 1.3110307414104883,
      "grad_norm": 0.0,
      "learning_rate": 0.00013611615245009076,
      "loss": 2.4982,
      "mean_token_accuracy": 0.5461122542619705,
      "num_tokens": 250442.0,
      "step": 182
    },
    {
      "entropy": 1.0915465950965881,
      "epoch": 1.318264014466546,
      "grad_norm": 0.0,
      "learning_rate": 0.00013575317604355719,
      "loss": 2.5026,
      "mean_token_accuracy": 0.5573518574237823,
      "num_tokens": 251829.0,
      "step": 183
    },
    {
      "entropy": 1.0844445824623108,
      "epoch": 1.3254972875226039,
      "grad_norm": 0.0,
      "learning_rate": 0.0001353901996370236,
      "loss": 2.4848,
      "mean_token_accuracy": 0.541678175330162,
      "num_tokens": 253229.0,
      "step": 184
    },
    {
      "entropy": 1.1480746567249298,
      "epoch": 1.332730560578662,
      "grad_norm": 0.0,
      "learning_rate": 0.00013502722323049002,
      "loss": 2.5386,
      "mean_token_accuracy": 0.5468995273113251,
      "num_tokens": 254604.0,
      "step": 185
    },
    {
      "entropy": 1.1767261922359467,
      "epoch": 1.3399638336347197,
      "grad_norm": 0.0,
      "learning_rate": 0.00013466424682395645,
      "loss": 2.5959,
      "mean_token_accuracy": 0.5261930972337723,
      "num_tokens": 255900.0,
      "step": 186
    },
    {
      "entropy": 1.0286410748958588,
      "epoch": 1.3471971066907775,
      "grad_norm": 0.0,
      "learning_rate": 0.00013430127041742286,
      "loss": 2.4155,
      "mean_token_accuracy": 0.5538035929203033,
      "num_tokens": 257311.0,
      "step": 187
    },
    {
      "entropy": 1.1496650278568268,
      "epoch": 1.3544303797468356,
      "grad_norm": 0.0,
      "learning_rate": 0.00013393829401088931,
      "loss": 2.5327,
      "mean_token_accuracy": 0.5415185838937759,
      "num_tokens": 258679.0,
      "step": 188
    },
    {
      "entropy": 1.0800335109233856,
      "epoch": 1.3616636528028934,
      "grad_norm": 0.0,
      "learning_rate": 0.00013357531760435572,
      "loss": 2.4404,
      "mean_token_accuracy": 0.5601710379123688,
      "num_tokens": 260035.0,
      "step": 189
    },
    {
      "entropy": 1.0977840423583984,
      "epoch": 1.3688969258589512,
      "grad_norm": 0.0,
      "learning_rate": 0.00013321234119782215,
      "loss": 2.5612,
      "mean_token_accuracy": 0.5392811894416809,
      "num_tokens": 261446.0,
      "step": 190
    },
    {
      "entropy": 1.0847298502922058,
      "epoch": 1.376130198915009,
      "grad_norm": 0.0,
      "learning_rate": 0.00013284936479128858,
      "loss": 2.4941,
      "mean_token_accuracy": 0.553534135222435,
      "num_tokens": 262823.0,
      "step": 191
    },
    {
      "entropy": 1.0621435940265656,
      "epoch": 1.3833634719710668,
      "grad_norm": 0.0,
      "learning_rate": 0.000132486388384755,
      "loss": 2.5295,
      "mean_token_accuracy": 0.5515837073326111,
      "num_tokens": 264213.0,
      "step": 192
    },
    {
      "entropy": 1.1017491519451141,
      "epoch": 1.3905967450271248,
      "grad_norm": 0.0,
      "learning_rate": 0.00013212341197822142,
      "loss": 2.592,
      "mean_token_accuracy": 0.5382756292819977,
      "num_tokens": 265526.0,
      "step": 193
    },
    {
      "entropy": 1.0822009593248367,
      "epoch": 1.3978300180831826,
      "grad_norm": 0.0,
      "learning_rate": 0.00013176043557168785,
      "loss": 2.5066,
      "mean_token_accuracy": 0.5384088605642319,
      "num_tokens": 266949.0,
      "step": 194
    },
    {
      "entropy": 1.1282154023647308,
      "epoch": 1.4050632911392404,
      "grad_norm": 0.0,
      "learning_rate": 0.00013139745916515428,
      "loss": 2.5788,
      "mean_token_accuracy": 0.5369668304920197,
      "num_tokens": 268304.0,
      "step": 195
    },
    {
      "entropy": 1.0464816391468048,
      "epoch": 1.4122965641952985,
      "grad_norm": 0.0,
      "learning_rate": 0.00013103448275862068,
      "loss": 2.4394,
      "mean_token_accuracy": 0.5474424809217453,
      "num_tokens": 269668.0,
      "step": 196
    },
    {
      "entropy": 1.151181936264038,
      "epoch": 1.4195298372513563,
      "grad_norm": 0.0,
      "learning_rate": 0.00013067150635208711,
      "loss": 2.4921,
      "mean_token_accuracy": 0.5326793938875198,
      "num_tokens": 271048.0,
      "step": 197
    },
    {
      "entropy": 1.1585266888141632,
      "epoch": 1.426763110307414,
      "grad_norm": 0.0,
      "learning_rate": 0.00013030852994555355,
      "loss": 2.5596,
      "mean_token_accuracy": 0.5290036201477051,
      "num_tokens": 272423.0,
      "step": 198
    },
    {
      "entropy": 1.0876199752092361,
      "epoch": 1.433996383363472,
      "grad_norm": 0.0,
      "learning_rate": 0.00012994555353901995,
      "loss": 2.5371,
      "mean_token_accuracy": 0.542813628911972,
      "num_tokens": 273880.0,
      "step": 199
    },
    {
      "entropy": 1.0546641200780869,
      "epoch": 1.4412296564195297,
      "grad_norm": 0.0,
      "learning_rate": 0.0001295825771324864,
      "loss": 2.4553,
      "mean_token_accuracy": 0.5557200312614441,
      "num_tokens": 275289.0,
      "step": 200
    },
    {
      "entropy": 1.117970123887062,
      "epoch": 1.4484629294755877,
      "grad_norm": 0.0,
      "learning_rate": 0.00012921960072595284,
      "loss": 2.5695,
      "mean_token_accuracy": 0.5380512326955795,
      "num_tokens": 276656.0,
      "step": 201
    },
    {
      "entropy": 1.0969232022762299,
      "epoch": 1.4556962025316456,
      "grad_norm": 0.0,
      "learning_rate": 0.00012885662431941924,
      "loss": 2.4635,
      "mean_token_accuracy": 0.5425622910261154,
      "num_tokens": 278083.0,
      "step": 202
    },
    {
      "entropy": 1.0811368227005005,
      "epoch": 1.4629294755877034,
      "grad_norm": 0.0,
      "learning_rate": 0.00012849364791288567,
      "loss": 2.5302,
      "mean_token_accuracy": 0.5425570607185364,
      "num_tokens": 279431.0,
      "step": 203
    },
    {
      "entropy": 1.055485188961029,
      "epoch": 1.4701627486437614,
      "grad_norm": 0.0,
      "learning_rate": 0.0001281306715063521,
      "loss": 2.4713,
      "mean_token_accuracy": 0.5471125394105911,
      "num_tokens": 280835.0,
      "step": 204
    },
    {
      "entropy": 1.1141844391822815,
      "epoch": 1.4773960216998192,
      "grad_norm": 0.0,
      "learning_rate": 0.0001277676950998185,
      "loss": 2.5834,
      "mean_token_accuracy": 0.5362391173839569,
      "num_tokens": 282200.0,
      "step": 205
    },
    {
      "entropy": 1.1039184033870697,
      "epoch": 1.484629294755877,
      "grad_norm": 0.0,
      "learning_rate": 0.00012740471869328494,
      "loss": 2.4992,
      "mean_token_accuracy": 0.5513607412576675,
      "num_tokens": 283567.0,
      "step": 206
    },
    {
      "entropy": 1.0966024100780487,
      "epoch": 1.4918625678119348,
      "grad_norm": 0.0,
      "learning_rate": 0.00012704174228675137,
      "loss": 2.3996,
      "mean_token_accuracy": 0.552715390920639,
      "num_tokens": 284967.0,
      "step": 207
    },
    {
      "entropy": 1.096381276845932,
      "epoch": 1.4990958408679926,
      "grad_norm": 0.0,
      "learning_rate": 0.00012667876588021778,
      "loss": 2.5746,
      "mean_token_accuracy": 0.5394180566072464,
      "num_tokens": 286329.0,
      "step": 208
    },
    {
      "entropy": 1.0701113641262054,
      "epoch": 1.5063291139240507,
      "grad_norm": 0.0,
      "learning_rate": 0.0001263157894736842,
      "loss": 2.4981,
      "mean_token_accuracy": 0.5468792766332626,
      "num_tokens": 287720.0,
      "step": 209
    },
    {
      "entropy": 1.1413646340370178,
      "epoch": 1.5135623869801085,
      "grad_norm": 0.0,
      "learning_rate": 0.00012595281306715064,
      "loss": 2.5805,
      "mean_token_accuracy": 0.533017098903656,
      "num_tokens": 289127.0,
      "step": 210
    },
    {
      "entropy": 1.0838797986507416,
      "epoch": 1.5207956600361663,
      "grad_norm": 0.0,
      "learning_rate": 0.00012558983666061704,
      "loss": 2.4496,
      "mean_token_accuracy": 0.565049484372139,
      "num_tokens": 290482.0,
      "step": 211
    },
    {
      "entropy": 1.0366474837064743,
      "epoch": 1.5280289330922243,
      "grad_norm": 0.0,
      "learning_rate": 0.0001252268602540835,
      "loss": 2.5012,
      "mean_token_accuracy": 0.5630057752132416,
      "num_tokens": 291876.0,
      "step": 212
    },
    {
      "entropy": 1.0706925094127655,
      "epoch": 1.5352622061482821,
      "grad_norm": 0.0,
      "learning_rate": 0.00012486388384754993,
      "loss": 2.5012,
      "mean_token_accuracy": 0.5362317860126495,
      "num_tokens": 293306.0,
      "step": 213
    },
    {
      "entropy": 1.0606517046689987,
      "epoch": 1.54249547920434,
      "grad_norm": 0.0,
      "learning_rate": 0.00012450090744101634,
      "loss": 2.5622,
      "mean_token_accuracy": 0.5519603192806244,
      "num_tokens": 294662.0,
      "step": 214
    },
    {
      "entropy": 1.0365487039089203,
      "epoch": 1.549728752260398,
      "grad_norm": 0.0,
      "learning_rate": 0.00012413793103448277,
      "loss": 2.4236,
      "mean_token_accuracy": 0.5674052089452744,
      "num_tokens": 296087.0,
      "step": 215
    },
    {
      "entropy": 1.0755655467510223,
      "epoch": 1.5569620253164556,
      "grad_norm": 0.0,
      "learning_rate": 0.0001237749546279492,
      "loss": 2.5169,
      "mean_token_accuracy": 0.5508773624897003,
      "num_tokens": 297502.0,
      "step": 216
    },
    {
      "entropy": 1.1377713978290558,
      "epoch": 1.5641952983725136,
      "grad_norm": 0.0,
      "learning_rate": 0.0001234119782214156,
      "loss": 2.5534,
      "mean_token_accuracy": 0.5336230993270874,
      "num_tokens": 298836.0,
      "step": 217
    },
    {
      "entropy": 1.109740287065506,
      "epoch": 1.5714285714285714,
| "grad_norm": 0.0, | |
| "learning_rate": 0.00012304900181488203, | |
| "loss": 2.7175, | |
| "mean_token_accuracy": 0.5418126434087753, | |
| "num_tokens": 300111.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 1.0909195244312286, | |
| "epoch": 1.5786618444846292, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00012268602540834846, | |
| "loss": 2.5177, | |
| "mean_token_accuracy": 0.5417619496583939, | |
| "num_tokens": 301501.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 1.06816665828228, | |
| "epoch": 1.5858951175406872, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001223230490018149, | |
| "loss": 2.4367, | |
| "mean_token_accuracy": 0.559718519449234, | |
| "num_tokens": 302926.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 1.0440122485160828, | |
| "epoch": 1.593128390596745, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001219600725952813, | |
| "loss": 2.4394, | |
| "mean_token_accuracy": 0.5527326017618179, | |
| "num_tokens": 304337.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 1.0804894715547562, | |
| "epoch": 1.6003616636528029, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00012159709618874773, | |
| "loss": 2.4707, | |
| "mean_token_accuracy": 0.5539764165878296, | |
| "num_tokens": 305707.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 1.114003211259842, | |
| "epoch": 1.6075949367088609, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00012123411978221418, | |
| "loss": 2.4948, | |
| "mean_token_accuracy": 0.5415279567241669, | |
| "num_tokens": 307138.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 1.1211613416671753, | |
| "epoch": 1.6148282097649185, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00012087114337568059, | |
| "loss": 2.5239, | |
| "mean_token_accuracy": 0.5385608822107315, | |
| "num_tokens": 308507.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 1.1505621373653412, | |
| "epoch": 1.6220614828209765, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00012050816696914702, | |
| "loss": 2.527, | |
| "mean_token_accuracy": 0.5417819917201996, | |
| "num_tokens": 309890.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 1.1308437585830688, | |
| "epoch": 1.6292947558770343, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00012014519056261344, | |
| "loss": 2.5056, | |
| "mean_token_accuracy": 0.5359330922365189, | |
| "num_tokens": 311303.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 1.0914883315563202, | |
| "epoch": 1.6365280289330921, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011978221415607986, | |
| "loss": 2.5595, | |
| "mean_token_accuracy": 0.5537950694561005, | |
| "num_tokens": 312697.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 1.0523104220628738, | |
| "epoch": 1.6437613019891502, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011941923774954629, | |
| "loss": 2.4534, | |
| "mean_token_accuracy": 0.5497398674488068, | |
| "num_tokens": 314071.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 1.1443000137805939, | |
| "epoch": 1.650994575045208, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011905626134301271, | |
| "loss": 2.5725, | |
| "mean_token_accuracy": 0.5432182401418686, | |
| "num_tokens": 315403.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 1.0856167376041412, | |
| "epoch": 1.6582278481012658, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011869328493647913, | |
| "loss": 2.5599, | |
| "mean_token_accuracy": 0.5425290018320084, | |
| "num_tokens": 316771.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 1.118817538022995, | |
| "epoch": 1.6654611211573238, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011833030852994556, | |
| "loss": 2.5283, | |
| "mean_token_accuracy": 0.544951319694519, | |
| "num_tokens": 318111.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 1.0854854881763458, | |
| "epoch": 1.6726943942133814, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011796733212341197, | |
| "loss": 2.577, | |
| "mean_token_accuracy": 0.5448784381151199, | |
| "num_tokens": 319462.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 1.1421701908111572, | |
| "epoch": 1.6799276672694394, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001176043557168784, | |
| "loss": 2.5542, | |
| "mean_token_accuracy": 0.5450168401002884, | |
| "num_tokens": 320863.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 1.0995429754257202, | |
| "epoch": 1.6871609403254972, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011724137931034482, | |
| "loss": 2.4889, | |
| "mean_token_accuracy": 0.5333838164806366, | |
| "num_tokens": 322273.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 1.091221421957016, | |
| "epoch": 1.694394213381555, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011687840290381127, | |
| "loss": 2.5964, | |
| "mean_token_accuracy": 0.5492272824048996, | |
| "num_tokens": 323681.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 1.089509442448616, | |
| "epoch": 1.701627486437613, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011651542649727769, | |
| "loss": 2.4769, | |
| "mean_token_accuracy": 0.5460635870695114, | |
| "num_tokens": 325074.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 1.1351898610591888, | |
| "epoch": 1.7088607594936709, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011615245009074412, | |
| "loss": 2.4841, | |
| "mean_token_accuracy": 0.5464938133955002, | |
| "num_tokens": 326448.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 1.065805822610855, | |
| "epoch": 1.7160940325497287, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011578947368421053, | |
| "loss": 2.5789, | |
| "mean_token_accuracy": 0.5493966788053513, | |
| "num_tokens": 327816.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 1.1061568558216095, | |
| "epoch": 1.7233273056057867, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011542649727767697, | |
| "loss": 2.5684, | |
| "mean_token_accuracy": 0.5350202769041061, | |
| "num_tokens": 329183.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 1.1101190447807312, | |
| "epoch": 1.7305605786618445, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011506352087114338, | |
| "loss": 2.544, | |
| "mean_token_accuracy": 0.5512830317020416, | |
| "num_tokens": 330564.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 1.1330992877483368, | |
| "epoch": 1.7377938517179023, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001147005444646098, | |
| "loss": 2.5997, | |
| "mean_token_accuracy": 0.5283233672380447, | |
| "num_tokens": 331896.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 1.088073879480362, | |
| "epoch": 1.7450271247739604, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011433756805807623, | |
| "loss": 2.5063, | |
| "mean_token_accuracy": 0.5544091314077377, | |
| "num_tokens": 333237.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 1.1106895804405212, | |
| "epoch": 1.752260397830018, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011397459165154265, | |
| "loss": 2.5657, | |
| "mean_token_accuracy": 0.5488100796937943, | |
| "num_tokens": 334592.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 1.0510992854833603, | |
| "epoch": 1.759493670886076, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011361161524500907, | |
| "loss": 2.5307, | |
| "mean_token_accuracy": 0.5456000864505768, | |
| "num_tokens": 336006.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 1.0965907573699951, | |
| "epoch": 1.7667269439421338, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001132486388384755, | |
| "loss": 2.4492, | |
| "mean_token_accuracy": 0.5489227473735809, | |
| "num_tokens": 337410.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 1.0655700862407684, | |
| "epoch": 1.7739602169981916, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011288566243194192, | |
| "loss": 2.4842, | |
| "mean_token_accuracy": 0.5555550754070282, | |
| "num_tokens": 338810.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 1.1214852780103683, | |
| "epoch": 1.7811934900542497, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011252268602540836, | |
| "loss": 2.4906, | |
| "mean_token_accuracy": 0.5389305800199509, | |
| "num_tokens": 340273.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 1.1377345025539398, | |
| "epoch": 1.7884267631103075, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011215970961887479, | |
| "loss": 2.5227, | |
| "mean_token_accuracy": 0.530749037861824, | |
| "num_tokens": 341670.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 1.1064392030239105, | |
| "epoch": 1.7956600361663653, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011179673321234121, | |
| "loss": 2.5451, | |
| "mean_token_accuracy": 0.5409312695264816, | |
| "num_tokens": 343002.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 1.115884155035019, | |
| "epoch": 1.8028933092224233, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011143375680580763, | |
| "loss": 2.5614, | |
| "mean_token_accuracy": 0.5375799685716629, | |
| "num_tokens": 344415.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.1006877422332764, | |
| "epoch": 1.810126582278481, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011107078039927406, | |
| "loss": 2.4458, | |
| "mean_token_accuracy": 0.5491993427276611, | |
| "num_tokens": 345822.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 1.0959600508213043, | |
| "epoch": 1.817359855334539, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011070780399274048, | |
| "loss": 2.4732, | |
| "mean_token_accuracy": 0.5487753301858902, | |
| "num_tokens": 347197.0, | |
| "step": 252 | |
| }, | |
| { | |
| "entropy": 1.069685012102127, | |
| "epoch": 1.8245931283905967, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001103448275862069, | |
| "loss": 2.4234, | |
| "mean_token_accuracy": 0.5564672946929932, | |
| "num_tokens": 348560.0, | |
| "step": 253 | |
| }, | |
| { | |
| "entropy": 1.1855289340019226, | |
| "epoch": 1.8318264014466545, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010998185117967332, | |
| "loss": 2.4667, | |
| "mean_token_accuracy": 0.5307918041944504, | |
| "num_tokens": 349915.0, | |
| "step": 254 | |
| }, | |
| { | |
| "entropy": 1.0919787287712097, | |
| "epoch": 1.8390596745027126, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010961887477313974, | |
| "loss": 2.452, | |
| "mean_token_accuracy": 0.5391028076410294, | |
| "num_tokens": 351361.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 1.0317457616329193, | |
| "epoch": 1.8462929475587704, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010925589836660617, | |
| "loss": 2.483, | |
| "mean_token_accuracy": 0.5674040019512177, | |
| "num_tokens": 352716.0, | |
| "step": 256 | |
| }, | |
| { | |
| "entropy": 1.1169418692588806, | |
| "epoch": 1.8535262206148282, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010889292196007259, | |
| "loss": 2.5078, | |
| "mean_token_accuracy": 0.5412558689713478, | |
| "num_tokens": 354073.0, | |
| "step": 257 | |
| }, | |
| { | |
| "entropy": 1.0786909759044647, | |
| "epoch": 1.8607594936708862, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010852994555353901, | |
| "loss": 2.447, | |
| "mean_token_accuracy": 0.5568330138921738, | |
| "num_tokens": 355466.0, | |
| "step": 258 | |
| }, | |
| { | |
| "entropy": 1.1377921402454376, | |
| "epoch": 1.8679927667269438, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010816696914700545, | |
| "loss": 2.5446, | |
| "mean_token_accuracy": 0.5410029292106628, | |
| "num_tokens": 356819.0, | |
| "step": 259 | |
| }, | |
| { | |
| "entropy": 1.0561549663543701, | |
| "epoch": 1.8752260397830018, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010780399274047188, | |
| "loss": 2.4241, | |
| "mean_token_accuracy": 0.562323585152626, | |
| "num_tokens": 358205.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 1.1019063591957092, | |
| "epoch": 1.8824593128390597, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001074410163339383, | |
| "loss": 2.4751, | |
| "mean_token_accuracy": 0.549642950296402, | |
| "num_tokens": 359634.0, | |
| "step": 261 | |
| }, | |
| { | |
| "entropy": 1.1285791844129562, | |
| "epoch": 1.8896925858951175, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010707803992740473, | |
| "loss": 2.5493, | |
| "mean_token_accuracy": 0.526919350028038, | |
| "num_tokens": 360970.0, | |
| "step": 262 | |
| }, | |
| { | |
| "entropy": 1.0747187435626984, | |
| "epoch": 1.8969258589511755, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010671506352087115, | |
| "loss": 2.5552, | |
| "mean_token_accuracy": 0.5351353734731674, | |
| "num_tokens": 362305.0, | |
| "step": 263 | |
| }, | |
| { | |
| "entropy": 1.0817217528820038, | |
| "epoch": 1.9041591320072333, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010635208711433757, | |
| "loss": 2.5681, | |
| "mean_token_accuracy": 0.5475313067436218, | |
| "num_tokens": 363677.0, | |
| "step": 264 | |
| }, | |
| { | |
| "entropy": 1.132976919412613, | |
| "epoch": 1.9113924050632911, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000105989110707804, | |
| "loss": 2.5469, | |
| "mean_token_accuracy": 0.5420292168855667, | |
| "num_tokens": 365019.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 1.1142061054706573, | |
| "epoch": 1.9186256781193491, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010562613430127042, | |
| "loss": 2.5654, | |
| "mean_token_accuracy": 0.5359852463006973, | |
| "num_tokens": 366376.0, | |
| "step": 266 | |
| }, | |
| { | |
| "entropy": 1.1108311414718628, | |
| "epoch": 1.9258589511754067, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010526315789473685, | |
| "loss": 2.5543, | |
| "mean_token_accuracy": 0.5375416427850723, | |
| "num_tokens": 367689.0, | |
| "step": 267 | |
| }, | |
| { | |
| "entropy": 1.0747735798358917, | |
| "epoch": 1.9330922242314648, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010490018148820327, | |
| "loss": 2.4855, | |
| "mean_token_accuracy": 0.5498427450656891, | |
| "num_tokens": 369062.0, | |
| "step": 268 | |
| }, | |
| { | |
| "entropy": 1.067014902830124, | |
| "epoch": 1.9403254972875226, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010453720508166968, | |
| "loss": 2.501, | |
| "mean_token_accuracy": 0.5583888292312622, | |
| "num_tokens": 370453.0, | |
| "step": 269 | |
| }, | |
| { | |
| "entropy": 1.0880665332078934, | |
| "epoch": 1.9475587703435804, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010417422867513613, | |
| "loss": 2.533, | |
| "mean_token_accuracy": 0.5387668460607529, | |
| "num_tokens": 371819.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 1.0962998867034912, | |
| "epoch": 1.9547920433996384, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010381125226860256, | |
| "loss": 2.5378, | |
| "mean_token_accuracy": 0.5404876172542572, | |
| "num_tokens": 373163.0, | |
| "step": 271 | |
| }, | |
| { | |
| "entropy": 1.1243647336959839, | |
| "epoch": 1.9620253164556962, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010344827586206898, | |
| "loss": 2.4849, | |
| "mean_token_accuracy": 0.5434926003217697, | |
| "num_tokens": 374562.0, | |
| "step": 272 | |
| }, | |
| { | |
| "entropy": 1.1283120214939117, | |
| "epoch": 1.969258589511754, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001030852994555354, | |
| "loss": 2.5624, | |
| "mean_token_accuracy": 0.5324713289737701, | |
| "num_tokens": 375913.0, | |
| "step": 273 | |
| }, | |
| { | |
| "entropy": 1.1570648550987244, | |
| "epoch": 1.976491862567812, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010272232304900183, | |
| "loss": 2.5273, | |
| "mean_token_accuracy": 0.5250543802976608, | |
| "num_tokens": 377318.0, | |
| "step": 274 | |
| }, | |
| { | |
| "entropy": 1.1177105605602264, | |
| "epoch": 1.9837251356238697, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010235934664246824, | |
| "loss": 2.5293, | |
| "mean_token_accuracy": 0.5483842194080353, | |
| "num_tokens": 378658.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 1.1408209800720215, | |
| "epoch": 1.9909584086799277, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010199637023593467, | |
| "loss": 2.5535, | |
| "mean_token_accuracy": 0.5317675769329071, | |
| "num_tokens": 380010.0, | |
| "step": 276 | |
| }, | |
| { | |
| "entropy": 1.0788544714450836, | |
| "epoch": 1.9981916817359855, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010163339382940109, | |
| "loss": 2.5116, | |
| "mean_token_accuracy": 0.5480602532625198, | |
| "num_tokens": 381410.0, | |
| "step": 277 | |
| }, | |
| { | |
| "entropy": 1.132047176361084, | |
| "epoch": 2.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010127041742286751, | |
| "loss": 2.3664, | |
| "mean_token_accuracy": 0.5952380895614624, | |
| "num_tokens": 381580.0, | |
| "step": 278 | |
| }, | |
| { | |
| "entropy": 1.1153302490711212, | |
| "epoch": 2.007233273056058, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010090744101633394, | |
| "loss": 2.5106, | |
| "mean_token_accuracy": 0.5436052531003952, | |
| "num_tokens": 382940.0, | |
| "step": 279 | |
| }, | |
| { | |
| "entropy": 1.1441259980201721, | |
| "epoch": 2.0144665461121156, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010054446460980036, | |
| "loss": 2.6192, | |
| "mean_token_accuracy": 0.5275150388479233, | |
| "num_tokens": 384262.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 1.1141088157892227, | |
| "epoch": 2.0216998191681737, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010018148820326678, | |
| "loss": 2.5423, | |
| "mean_token_accuracy": 0.5334936380386353, | |
| "num_tokens": 385654.0, | |
| "step": 281 | |
| }, | |
| { | |
| "entropy": 1.1348835080862045, | |
| "epoch": 2.0289330922242317, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.981851179673322e-05, | |
| "loss": 2.523, | |
| "mean_token_accuracy": 0.5435043275356293, | |
| "num_tokens": 386982.0, | |
| "step": 282 | |
| }, | |
| { | |
| "entropy": 1.0941515564918518, | |
| "epoch": 2.0361663652802893, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.945553539019964e-05, | |
| "loss": 2.4501, | |
| "mean_token_accuracy": 0.5536217093467712, | |
| "num_tokens": 388414.0, | |
| "step": 283 | |
| }, | |
| { | |
| "entropy": 1.0650618076324463, | |
| "epoch": 2.0433996383363473, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.909255898366606e-05, | |
| "loss": 2.5364, | |
| "mean_token_accuracy": 0.5536990314722061, | |
| "num_tokens": 389792.0, | |
| "step": 284 | |
| }, | |
| { | |
| "entropy": 1.2052790224552155, | |
| "epoch": 2.050632911392405, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.87295825771325e-05, | |
| "loss": 2.6587, | |
| "mean_token_accuracy": 0.5142560675740242, | |
| "num_tokens": 391102.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 1.0709343552589417, | |
| "epoch": 2.057866184448463, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.836660617059892e-05, | |
| "loss": 2.4229, | |
| "mean_token_accuracy": 0.5499039888381958, | |
| "num_tokens": 392525.0, | |
| "step": 286 | |
| }, | |
| { | |
| "entropy": 1.0422286242246628, | |
| "epoch": 2.065099457504521, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.800362976406534e-05, | |
| "loss": 2.5312, | |
| "mean_token_accuracy": 0.5531226098537445, | |
| "num_tokens": 393832.0, | |
| "step": 287 | |
| }, | |
| { | |
| "entropy": 1.1782008707523346, | |
| "epoch": 2.0723327305605785, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.764065335753177e-05, | |
| "loss": 2.4966, | |
| "mean_token_accuracy": 0.5358275324106216, | |
| "num_tokens": 395262.0, | |
| "step": 288 | |
| }, | |
| { | |
| "entropy": 1.075025349855423, | |
| "epoch": 2.0795660036166366, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.727767695099818e-05, | |
| "loss": 2.5047, | |
| "mean_token_accuracy": 0.5378954857587814, | |
| "num_tokens": 396636.0, | |
| "step": 289 | |
| }, | |
| { | |
| "entropy": 1.1651588082313538, | |
| "epoch": 2.0867992766726946, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.691470054446462e-05, | |
| "loss": 2.5779, | |
| "mean_token_accuracy": 0.5389810055494308, | |
| "num_tokens": 397961.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 1.1003607213497162, | |
| "epoch": 2.094032549728752, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.655172413793105e-05, | |
| "loss": 2.4637, | |
| "mean_token_accuracy": 0.5527941435575485, | |
| "num_tokens": 399350.0, | |
| "step": 291 | |
| }, | |
| { | |
| "entropy": 1.0847670435905457, | |
| "epoch": 2.1012658227848102, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.618874773139746e-05, | |
| "loss": 2.5583, | |
| "mean_token_accuracy": 0.5427492707967758, | |
| "num_tokens": 400728.0, | |
| "step": 292 | |
| }, | |
| { | |
| "entropy": 1.0903814435005188, | |
| "epoch": 2.108499095840868, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.58257713248639e-05, | |
| "loss": 2.5381, | |
| "mean_token_accuracy": 0.5464666932821274, | |
| "num_tokens": 402105.0, | |
| "step": 293 | |
| }, | |
| { | |
| "entropy": 1.0938865840435028, | |
| "epoch": 2.115732368896926, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.546279491833031e-05, | |
| "loss": 2.5563, | |
| "mean_token_accuracy": 0.5378870666027069, | |
| "num_tokens": 403436.0, | |
| "step": 294 | |
| }, | |
| { | |
| "entropy": 1.1337309926748276, | |
| "epoch": 2.122965641952984, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.509981851179673e-05, | |
| "loss": 2.5028, | |
| "mean_token_accuracy": 0.537694551050663, | |
| "num_tokens": 404762.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 1.0870742499828339, | |
| "epoch": 2.1301989150090415, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.473684210526316e-05, | |
| "loss": 2.4864, | |
| "mean_token_accuracy": 0.5463483482599258, | |
| "num_tokens": 406079.0, | |
| "step": 296 | |
| }, | |
| { | |
| "entropy": 1.113557517528534, | |
| "epoch": 2.1374321880650995, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.437386569872959e-05, | |
| "loss": 2.4162, | |
| "mean_token_accuracy": 0.5570182800292969, | |
| "num_tokens": 407484.0, | |
| "step": 297 | |
| }, | |
| { | |
| "entropy": 1.0499663054943085, | |
| "epoch": 2.1446654611211575, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.401088929219601e-05, | |
| "loss": 2.4024, | |
| "mean_token_accuracy": 0.5615633577108383, | |
| "num_tokens": 408928.0, | |
| "step": 298 | |
| }, | |
| { | |
| "entropy": 1.1128461360931396, | |
| "epoch": 2.151898734177215, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.364791288566244e-05, | |
| "loss": 2.4792, | |
| "mean_token_accuracy": 0.545862227678299, | |
| "num_tokens": 410349.0, | |
| "step": 299 | |
| }, | |
| { | |
| "entropy": 1.0802654325962067, | |
| "epoch": 2.159132007233273, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.328493647912886e-05, | |
| "loss": 2.5234, | |
| "mean_token_accuracy": 0.541241779923439, | |
| "num_tokens": 411719.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.074128895998001, | |
| "epoch": 2.1663652802893307, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.292196007259528e-05, | |
| "loss": 2.4736, | |
| "mean_token_accuracy": 0.5454134047031403, | |
| "num_tokens": 413112.0, | |
| "step": 301 | |
| }, | |
| { | |
| "entropy": 1.0438470542430878, | |
| "epoch": 2.1735985533453888, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.255898366606171e-05, | |
| "loss": 2.4601, | |
| "mean_token_accuracy": 0.5526005625724792, | |
| "num_tokens": 414468.0, | |
| "step": 302 | |
| }, | |
| { | |
| "entropy": 1.048665851354599, | |
| "epoch": 2.180831826401447, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.219600725952814e-05, | |
| "loss": 2.5202, | |
| "mean_token_accuracy": 0.5461462587118149, | |
| "num_tokens": 415831.0, | |
| "step": 303 | |
| }, | |
| { | |
| "entropy": 1.008974403142929, | |
| "epoch": 2.1880650994575044, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.183303085299456e-05, | |
| "loss": 2.4394, | |
| "mean_token_accuracy": 0.5644627660512924, | |
| "num_tokens": 417221.0, | |
| "step": 304 | |
| }, | |
| { | |
| "entropy": 1.1043965220451355, | |
| "epoch": 2.1952983725135624, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.147005444646099e-05, | |
| "loss": 2.5703, | |
| "mean_token_accuracy": 0.545972928404808, | |
| "num_tokens": 418583.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 1.0962344855070114, | |
| "epoch": 2.2025316455696204, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.11070780399274e-05, | |
| "loss": 2.4894, | |
| "mean_token_accuracy": 0.5580534338951111, | |
| "num_tokens": 419865.0, | |
| "step": 306 | |
| }, | |
| { | |
| "entropy": 1.1428653001785278, | |
| "epoch": 2.209764918625678, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.074410163339382e-05, | |
| "loss": 2.5941, | |
| "mean_token_accuracy": 0.5405401438474655, | |
| "num_tokens": 421190.0, | |
| "step": 307 | |
| }, | |
| { | |
| "entropy": 1.0979954898357391, | |
| "epoch": 2.216998191681736, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.038112522686027e-05, | |
| "loss": 2.5265, | |
| "mean_token_accuracy": 0.530515693128109, | |
| "num_tokens": 422593.0, | |
| "step": 308 | |
| }, | |
| { | |
| "entropy": 1.0707228779792786, | |
| "epoch": 2.2242314647377937, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.001814882032669e-05, | |
| "loss": 2.4784, | |
| "mean_token_accuracy": 0.5580956488847733, | |
| "num_tokens": 424028.0, | |
| "step": 309 | |
| }, | |
| { | |
| "entropy": 1.139556735754013, | |
| "epoch": 2.2314647377938517, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.96551724137931e-05, | |
| "loss": 2.5697, | |
| "mean_token_accuracy": 0.532667800784111, | |
| "num_tokens": 425381.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 1.108998566865921, | |
| "epoch": 2.2386980108499097, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.929219600725953e-05, | |
| "loss": 2.5312, | |
| "mean_token_accuracy": 0.5356175154447556, | |
| "num_tokens": 426797.0, | |
| "step": 311 | |
| }, | |
| { | |
| "entropy": 1.1161520779132843, | |
| "epoch": 2.2459312839059673, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.892921960072595e-05, | |
| "loss": 2.5586, | |
| "mean_token_accuracy": 0.542470321059227, | |
| "num_tokens": 428147.0, | |
| "step": 312 | |
| }, | |
| { | |
| "entropy": 1.0842461287975311, | |
| "epoch": 2.2531645569620253, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.856624319419238e-05, | |
| "loss": 2.5123, | |
| "mean_token_accuracy": 0.5529365092515945, | |
| "num_tokens": 429559.0, | |
| "step": 313 | |
| }, | |
| { | |
| "entropy": 1.1063741445541382, | |
| "epoch": 2.2603978300180834, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.820326678765881e-05, | |
| "loss": 2.6709, | |
| "mean_token_accuracy": 0.527004636824131, | |
| "num_tokens": 430920.0, | |
| "step": 314 | |
| }, | |
| { | |
| "entropy": 1.0897562205791473, | |
| "epoch": 2.267631103074141, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.784029038112523e-05, | |
| "loss": 2.4544, | |
| "mean_token_accuracy": 0.5409349501132965, | |
| "num_tokens": 432317.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 1.0743544101715088, | |
| "epoch": 2.274864376130199, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.747731397459166e-05, | |
| "loss": 2.5057, | |
| "mean_token_accuracy": 0.5578703433275223, | |
| "num_tokens": 433677.0, | |
| "step": 316 | |
| }, | |
| { | |
| "entropy": 1.116711288690567, | |
| "epoch": 2.282097649186257, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.711433756805808e-05, | |
| "loss": 2.5612, | |
| "mean_token_accuracy": 0.5628638714551926, | |
| "num_tokens": 435033.0, | |
| "step": 317 | |
| }, | |
| { | |
| "entropy": 1.0676509886980057, | |
| "epoch": 2.2893309222423146, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.67513611615245e-05, | |
| "loss": 2.5598, | |
| "mean_token_accuracy": 0.5456564128398895, | |
| "num_tokens": 436427.0, | |
| "step": 318 | |
| }, | |
| { | |
| "entropy": 1.0859863460063934, | |
| "epoch": 2.2965641952983726, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.638838475499093e-05, | |
| "loss": 2.4101, | |
| "mean_token_accuracy": 0.5709582716226578, | |
| "num_tokens": 437883.0, | |
| "step": 319 | |
| }, | |
| { | |
| "entropy": 1.0581027567386627, | |
| "epoch": 2.3037974683544302, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.602540834845736e-05, | |
| "loss": 2.5804, | |
| "mean_token_accuracy": 0.5438152998685837, | |
| "num_tokens": 439239.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 1.06120565533638, | |
| "epoch": 2.3110307414104883, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.566243194192378e-05, | |
| "loss": 2.5033, | |
| "mean_token_accuracy": 0.5619644969701767, | |
| "num_tokens": 440618.0, | |
| "step": 321 | |
| }, | |
| { | |
| "entropy": 1.0986764430999756, | |
| "epoch": 2.3182640144665463, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.529945553539021e-05, | |
| "loss": 2.5249, | |
| "mean_token_accuracy": 0.5383182018995285, | |
| "num_tokens": 441969.0, | |
| "step": 322 | |
| }, | |
| { | |
| "entropy": 1.0080211162567139, | |
| "epoch": 2.325497287522604, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.493647912885663e-05, | |
| "loss": 2.4389, | |
| "mean_token_accuracy": 0.5622601956129074, | |
| "num_tokens": 443374.0, | |
| "step": 323 | |
| }, | |
| { | |
| "entropy": 1.0605318397283554, | |
| "epoch": 2.332730560578662, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.457350272232304e-05, | |
| "loss": 2.4748, | |
| "mean_token_accuracy": 0.5619993209838867, | |
| "num_tokens": 444719.0, | |
| "step": 324 | |
| }, | |
| { | |
| "entropy": 1.0709475874900818, | |
| "epoch": 2.3399638336347195, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.421052631578948e-05, | |
| "loss": 2.5582, | |
| "mean_token_accuracy": 0.5409077703952789, | |
| "num_tokens": 446087.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 1.1008452475070953, | |
| "epoch": 2.3471971066907775, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.38475499092559e-05, | |
| "loss": 2.4498, | |
| "mean_token_accuracy": 0.5474594086408615, | |
| "num_tokens": 447476.0, | |
| "step": 326 | |
| }, | |
| { | |
| "entropy": 1.0892058312892914, | |
| "epoch": 2.3544303797468356, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.348457350272232e-05, | |
| "loss": 2.3334, | |
| "mean_token_accuracy": 0.5688722282648087, | |
| "num_tokens": 448884.0, | |
| "step": 327 | |
| }, | |
| { | |
| "entropy": 1.1114183962345123, | |
| "epoch": 2.361663652802893, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.312159709618876e-05, | |
| "loss": 2.4905, | |
| "mean_token_accuracy": 0.5403403639793396, | |
| "num_tokens": 450339.0, | |
| "step": 328 | |
| }, | |
| { | |
| "entropy": 1.117778092622757, | |
| "epoch": 2.368896925858951, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.275862068965517e-05, | |
| "loss": 2.4494, | |
| "mean_token_accuracy": 0.5517453998327255, | |
| "num_tokens": 451759.0, | |
| "step": 329 | |
| }, | |
| { | |
| "entropy": 1.0825735926628113, | |
| "epoch": 2.376130198915009, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.23956442831216e-05, | |
| "loss": 2.5498, | |
| "mean_token_accuracy": 0.5487709194421768, | |
| "num_tokens": 453123.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 1.0718315988779068, | |
| "epoch": 2.383363471971067, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.203266787658802e-05, | |
| "loss": 2.4747, | |
| "mean_token_accuracy": 0.5481411963701248, | |
| "num_tokens": 454529.0, | |
| "step": 331 | |
| }, | |
| { | |
| "entropy": 1.0700356364250183, | |
| "epoch": 2.390596745027125, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.166969147005445e-05, | |
| "loss": 2.4382, | |
| "mean_token_accuracy": 0.5582916736602783, | |
| "num_tokens": 455969.0, | |
| "step": 332 | |
| }, | |
| { | |
| "entropy": 1.1482441425323486, | |
| "epoch": 2.397830018083183, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.130671506352088e-05, | |
| "loss": 2.5565, | |
| "mean_token_accuracy": 0.5328105837106705, | |
| "num_tokens": 457364.0, | |
| "step": 333 | |
| }, | |
| { | |
| "entropy": 1.0800821483135223, | |
| "epoch": 2.4050632911392404, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.09437386569873e-05, | |
| "loss": 2.5446, | |
| "mean_token_accuracy": 0.5423636585474014, | |
| "num_tokens": 458716.0, | |
| "step": 334 | |
| }, | |
| { | |
| "entropy": 1.1315284073352814, | |
| "epoch": 2.4122965641952985, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.058076225045372e-05, | |
| "loss": 2.4926, | |
| "mean_token_accuracy": 0.5568676143884659, | |
| "num_tokens": 460055.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 1.1484705805778503, | |
| "epoch": 2.419529837251356, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.021778584392015e-05, | |
| "loss": 2.5977, | |
| "mean_token_accuracy": 0.524099811911583, | |
| "num_tokens": 461402.0, | |
| "step": 336 | |
| }, | |
| { | |
| "entropy": 1.1127368807792664, | |
| "epoch": 2.426763110307414, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.985480943738657e-05, | |
| "loss": 2.6225, | |
| "mean_token_accuracy": 0.5300293117761612, | |
| "num_tokens": 462757.0, | |
| "step": 337 | |
| }, | |
| { | |
| "entropy": 1.08088980615139, | |
| "epoch": 2.433996383363472, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.9491833030853e-05, | |
| "loss": 2.4575, | |
| "mean_token_accuracy": 0.5447369813919067, | |
| "num_tokens": 464138.0, | |
| "step": 338 | |
| }, | |
| { | |
| "entropy": 1.0856578946113586, | |
| "epoch": 2.4412296564195297, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.912885662431943e-05, | |
| "loss": 2.4591, | |
| "mean_token_accuracy": 0.5506195574998856, | |
| "num_tokens": 465536.0, | |
| "step": 339 | |
| }, | |
| { | |
| "entropy": 1.090321958065033, | |
| "epoch": 2.4484629294755877, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.876588021778585e-05, | |
| "loss": 2.5399, | |
| "mean_token_accuracy": 0.5420294851064682, | |
| "num_tokens": 466896.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 1.1008956581354141, | |
| "epoch": 2.4556962025316453, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.840290381125227e-05, | |
| "loss": 2.609, | |
| "mean_token_accuracy": 0.541169598698616, | |
| "num_tokens": 468219.0, | |
| "step": 341 | |
| }, | |
| { | |
| "entropy": 1.1546584069728851, | |
| "epoch": 2.4629294755877034, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.80399274047187e-05, | |
| "loss": 2.6097, | |
| "mean_token_accuracy": 0.530337005853653, | |
| "num_tokens": 469569.0, | |
| "step": 342 | |
| }, | |
| { | |
| "entropy": 1.1146379113197327, | |
| "epoch": 2.4701627486437614, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.767695099818511e-05, | |
| "loss": 2.4637, | |
| "mean_token_accuracy": 0.5511259138584137, | |
| "num_tokens": 470954.0, | |
| "step": 343 | |
| }, | |
| { | |
| "entropy": 1.1002293676137924, | |
| "epoch": 2.477396021699819, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.731397459165155e-05, | |
| "loss": 2.4593, | |
| "mean_token_accuracy": 0.5492667853832245, | |
| "num_tokens": 472325.0, | |
| "step": 344 | |
| }, | |
| { | |
| "entropy": 1.0669283270835876, | |
| "epoch": 2.484629294755877, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.695099818511798e-05, | |
| "loss": 2.414, | |
| "mean_token_accuracy": 0.5588005930185318, | |
| "num_tokens": 473725.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 1.0927933007478714, | |
| "epoch": 2.491862567811935, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.65880217785844e-05, | |
| "loss": 2.5172, | |
| "mean_token_accuracy": 0.544892281293869, | |
| "num_tokens": 475106.0, | |
| "step": 346 | |
| }, | |
| { | |
| "entropy": 1.111391931772232, | |
| "epoch": 2.4990958408679926, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.622504537205081e-05, | |
| "loss": 2.469, | |
| "mean_token_accuracy": 0.5566596537828445, | |
| "num_tokens": 476487.0, | |
| "step": 347 | |
| }, | |
| { | |
| "entropy": 1.0637227594852448, | |
| "epoch": 2.5063291139240507, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.586206896551724e-05, | |
| "loss": 2.4938, | |
| "mean_token_accuracy": 0.5464789718389511, | |
| "num_tokens": 477906.0, | |
| "step": 348 | |
| }, | |
| { | |
| "entropy": 1.137012243270874, | |
| "epoch": 2.5135623869801087, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.549909255898367e-05, | |
| "loss": 2.5766, | |
| "mean_token_accuracy": 0.5444240942597389, | |
| "num_tokens": 479276.0, | |
| "step": 349 | |
| }, | |
| { | |
| "entropy": 1.11709363758564, | |
| "epoch": 2.5207956600361663, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.513611615245009e-05, | |
| "loss": 2.57, | |
| "mean_token_accuracy": 0.5383694916963577, | |
| "num_tokens": 480600.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.1331063508987427, | |
| "epoch": 2.5280289330922243, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.477313974591652e-05, | |
| "loss": 2.4282, | |
| "mean_token_accuracy": 0.5514472872018814, | |
| "num_tokens": 481990.0, | |
| "step": 351 | |
| }, | |
| { | |
| "entropy": 1.115255892276764, | |
| "epoch": 2.535262206148282, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.441016333938294e-05, | |
| "loss": 2.5939, | |
| "mean_token_accuracy": 0.527045726776123, | |
| "num_tokens": 483334.0, | |
| "step": 352 | |
| }, | |
| { | |
| "entropy": 1.123845636844635, | |
| "epoch": 2.54249547920434, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.404718693284937e-05, | |
| "loss": 2.487, | |
| "mean_token_accuracy": 0.556125819683075, | |
| "num_tokens": 484723.0, | |
| "step": 353 | |
| }, | |
| { | |
| "entropy": 1.097337931394577, | |
| "epoch": 2.549728752260398, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.368421052631579e-05, | |
| "loss": 2.4926, | |
| "mean_token_accuracy": 0.5411562025547028, | |
| "num_tokens": 486125.0, | |
| "step": 354 | |
| }, | |
| { | |
| "entropy": 1.0377470254898071, | |
| "epoch": 2.5569620253164556, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.332123411978222e-05, | |
| "loss": 2.4463, | |
| "mean_token_accuracy": 0.5503391325473785, | |
| "num_tokens": 487597.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 1.0625304579734802, | |
| "epoch": 2.5641952983725136, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.295825771324865e-05, | |
| "loss": 2.4083, | |
| "mean_token_accuracy": 0.560984194278717, | |
| "num_tokens": 489017.0, | |
| "step": 356 | |
| }, | |
| { | |
| "entropy": 1.1392813175916672, | |
| "epoch": 2.571428571428571, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.259528130671507e-05, | |
| "loss": 2.5535, | |
| "mean_token_accuracy": 0.5292238146066666, | |
| "num_tokens": 490389.0, | |
| "step": 357 | |
| }, | |
| { | |
| "entropy": 1.093557357788086, | |
| "epoch": 2.578661844484629, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.223230490018149e-05, | |
| "loss": 2.5426, | |
| "mean_token_accuracy": 0.5413297116756439, | |
| "num_tokens": 491781.0, | |
| "step": 358 | |
| }, | |
| { | |
| "entropy": 1.1400565207004547, | |
| "epoch": 2.5858951175406872, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.186932849364792e-05, | |
| "loss": 2.5368, | |
| "mean_token_accuracy": 0.53314408659935, | |
| "num_tokens": 493138.0, | |
| "step": 359 | |
| }, | |
| { | |
| "entropy": 1.1020191758871078, | |
| "epoch": 2.5931283905967453, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.150635208711434e-05, | |
| "loss": 2.536, | |
| "mean_token_accuracy": 0.5494784340262413, | |
| "num_tokens": 494468.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 1.1096655130386353, | |
| "epoch": 2.600361663652803, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.114337568058077e-05, | |
| "loss": 2.5269, | |
| "mean_token_accuracy": 0.5418857932090759, | |
| "num_tokens": 495852.0, | |
| "step": 361 | |
| }, | |
| { | |
| "entropy": 1.15224489569664, | |
| "epoch": 2.607594936708861, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.07803992740472e-05, | |
| "loss": 2.4834, | |
| "mean_token_accuracy": 0.5387386232614517, | |
| "num_tokens": 497275.0, | |
| "step": 362 | |
| }, | |
| { | |
| "entropy": 1.1237707734107971, | |
| "epoch": 2.6148282097649185, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.041742286751362e-05, | |
| "loss": 2.5333, | |
| "mean_token_accuracy": 0.534076914191246, | |
| "num_tokens": 498628.0, | |
| "step": 363 | |
| }, | |
| { | |
| "entropy": 1.089442789554596, | |
| "epoch": 2.6220614828209765, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.005444646098003e-05, | |
| "loss": 2.4523, | |
| "mean_token_accuracy": 0.5411692559719086, | |
| "num_tokens": 500062.0, | |
| "step": 364 | |
| }, | |
| { | |
| "entropy": 1.0806556940078735, | |
| "epoch": 2.6292947558770345, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.969147005444646e-05, | |
| "loss": 2.5709, | |
| "mean_token_accuracy": 0.5465729981660843, | |
| "num_tokens": 501411.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 1.060823678970337, | |
| "epoch": 2.636528028933092, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.932849364791288e-05, | |
| "loss": 2.5076, | |
| "mean_token_accuracy": 0.543865293264389, | |
| "num_tokens": 502800.0, | |
| "step": 366 | |
| }, | |
| { | |
| "entropy": 1.1018014252185822, | |
| "epoch": 2.64376130198915, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.896551724137931e-05, | |
| "loss": 2.4844, | |
| "mean_token_accuracy": 0.5601823925971985, | |
| "num_tokens": 504242.0, | |
| "step": 367 | |
| }, | |
| { | |
| "entropy": 1.121594250202179, | |
| "epoch": 2.6509945750452077, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.860254083484574e-05, | |
| "loss": 2.4678, | |
| "mean_token_accuracy": 0.5592560023069382, | |
| "num_tokens": 505682.0, | |
| "step": 368 | |
| }, | |
| { | |
| "entropy": 1.0919914245605469, | |
| "epoch": 2.6582278481012658, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.823956442831216e-05, | |
| "loss": 2.5458, | |
| "mean_token_accuracy": 0.5379031747579575, | |
| "num_tokens": 506999.0, | |
| "step": 369 | |
| }, | |
| { | |
| "entropy": 1.0880248546600342, | |
| "epoch": 2.665461121157324, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.787658802177859e-05, | |
| "loss": 2.5395, | |
| "mean_token_accuracy": 0.5501109808683395, | |
| "num_tokens": 508348.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 1.1036258935928345, | |
| "epoch": 2.6726943942133814, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.751361161524501e-05, | |
| "loss": 2.4245, | |
| "mean_token_accuracy": 0.5422181189060211, | |
| "num_tokens": 509802.0, | |
| "step": 371 | |
| }, | |
| { | |
| "entropy": 1.0616427958011627, | |
| "epoch": 2.6799276672694394, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.715063520871143e-05, | |
| "loss": 2.4194, | |
| "mean_token_accuracy": 0.5661514550447464, | |
| "num_tokens": 511280.0, | |
| "step": 372 | |
| }, | |
| { | |
| "entropy": 1.0789577066898346, | |
| "epoch": 2.687160940325497, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.678765880217786e-05, | |
| "loss": 2.5258, | |
| "mean_token_accuracy": 0.5572595447301865, | |
| "num_tokens": 512694.0, | |
| "step": 373 | |
| }, | |
| { | |
| "entropy": 1.0851573646068573, | |
| "epoch": 2.694394213381555, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.642468239564429e-05, | |
| "loss": 2.4856, | |
| "mean_token_accuracy": 0.5498984158039093, | |
| "num_tokens": 514097.0, | |
| "step": 374 | |
| }, | |
| { | |
| "entropy": 1.1420559734106064, | |
| "epoch": 2.701627486437613, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.606170598911071e-05, | |
| "loss": 2.5407, | |
| "mean_token_accuracy": 0.5401259958744049, | |
| "num_tokens": 515548.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 1.1659338176250458, | |
| "epoch": 2.708860759493671, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.569872958257714e-05, | |
| "loss": 2.4876, | |
| "mean_token_accuracy": 0.5470705479383469, | |
| "num_tokens": 516910.0, | |
| "step": 376 | |
| }, | |
| { | |
| "entropy": 1.0591351091861725, | |
| "epoch": 2.7160940325497287, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.533575317604356e-05, | |
| "loss": 2.5196, | |
| "mean_token_accuracy": 0.5521393418312073, | |
| "num_tokens": 518281.0, | |
| "step": 377 | |
| }, | |
| { | |
| "entropy": 1.0627894699573517, | |
| "epoch": 2.7233273056057867, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.497277676950997e-05, | |
| "loss": 2.4929, | |
| "mean_token_accuracy": 0.5605411231517792, | |
| "num_tokens": 519628.0, | |
| "step": 378 | |
| }, | |
| { | |
| "entropy": 1.1596409678459167, | |
| "epoch": 2.7305605786618443, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.460980036297642e-05, | |
| "loss": 2.573, | |
| "mean_token_accuracy": 0.532776340842247, | |
| "num_tokens": 520963.0, | |
| "step": 379 | |
| }, | |
| { | |
| "entropy": 1.1137427985668182, | |
| "epoch": 2.7377938517179023, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.424682395644284e-05, | |
| "loss": 2.6272, | |
| "mean_token_accuracy": 0.5297515243291855, | |
| "num_tokens": 522361.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 1.1028319001197815, | |
| "epoch": 2.7450271247739604, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.388384754990925e-05, | |
| "loss": 2.5842, | |
| "mean_token_accuracy": 0.5381578505039215, | |
| "num_tokens": 523711.0, | |
| "step": 381 | |
| }, | |
| { | |
| "entropy": 1.1044474244117737, | |
| "epoch": 2.752260397830018, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.352087114337569e-05, | |
| "loss": 2.4927, | |
| "mean_token_accuracy": 0.5612015575170517, | |
| "num_tokens": 525121.0, | |
| "step": 382 | |
| }, | |
| { | |
| "entropy": 1.0691001415252686, | |
| "epoch": 2.759493670886076, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.31578947368421e-05, | |
| "loss": 2.4769, | |
| "mean_token_accuracy": 0.5529969185590744, | |
| "num_tokens": 526523.0, | |
| "step": 383 | |
| }, | |
| { | |
| "entropy": 1.113732784986496, | |
| "epoch": 2.7667269439421336, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.279491833030852e-05, | |
| "loss": 2.604, | |
| "mean_token_accuracy": 0.5437600612640381, | |
| "num_tokens": 527857.0, | |
| "step": 384 | |
| }, | |
| { | |
| "entropy": 1.1269434988498688, | |
| "epoch": 2.7739602169981916, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.243194192377497e-05, | |
| "loss": 2.5608, | |
| "mean_token_accuracy": 0.5377811715006828, | |
| "num_tokens": 529150.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 1.0413260757923126, | |
| "epoch": 2.7811934900542497, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.206896551724138e-05, | |
| "loss": 2.5217, | |
| "mean_token_accuracy": 0.5485084652900696, | |
| "num_tokens": 530577.0, | |
| "step": 386 | |
| }, | |
| { | |
| "entropy": 1.1135965585708618, | |
| "epoch": 2.7884267631103077, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.17059891107078e-05, | |
| "loss": 2.5482, | |
| "mean_token_accuracy": 0.5371388792991638, | |
| "num_tokens": 531979.0, | |
| "step": 387 | |
| }, | |
| { | |
| "entropy": 1.1181357651948929, | |
| "epoch": 2.7956600361663653, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.134301270417423e-05, | |
| "loss": 2.538, | |
| "mean_token_accuracy": 0.5407692342996597, | |
| "num_tokens": 533308.0, | |
| "step": 388 | |
| }, | |
| { | |
| "entropy": 1.1135433316230774, | |
| "epoch": 2.8028933092224233, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.098003629764065e-05, | |
| "loss": 2.6532, | |
| "mean_token_accuracy": 0.5302906185388565, | |
| "num_tokens": 534652.0, | |
| "step": 389 | |
| }, | |
| { | |
| "entropy": 1.0605097115039825, | |
| "epoch": 2.810126582278481, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.061705989110709e-05, | |
| "loss": 2.3683, | |
| "mean_token_accuracy": 0.5562764406204224, | |
| "num_tokens": 536133.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 1.0552656948566437, | |
| "epoch": 2.817359855334539, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.025408348457351e-05, | |
| "loss": 2.4262, | |
| "mean_token_accuracy": 0.5565686523914337, | |
| "num_tokens": 537572.0, | |
| "step": 391 | |
| }, | |
| { | |
| "entropy": 1.033959150314331, | |
| "epoch": 2.824593128390597, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.989110707803993e-05, | |
| "loss": 2.5963, | |
| "mean_token_accuracy": 0.5316084623336792, | |
| "num_tokens": 539020.0, | |
| "step": 392 | |
| }, | |
| { | |
| "entropy": 1.0652283132076263, | |
| "epoch": 2.8318264014466545, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.9528130671506354e-05, | |
| "loss": 2.4578, | |
| "mean_token_accuracy": 0.5588241964578629, | |
| "num_tokens": 540459.0, | |
| "step": 393 | |
| }, | |
| { | |
| "entropy": 1.2018023431301117, | |
| "epoch": 2.8390596745027126, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.916515426497278e-05, | |
| "loss": 2.5386, | |
| "mean_token_accuracy": 0.5374249666929245, | |
| "num_tokens": 541835.0, | |
| "step": 394 | |
| }, | |
| { | |
| "entropy": 1.1306479573249817, | |
| "epoch": 2.84629294755877, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.88021778584392e-05, | |
| "loss": 2.5295, | |
| "mean_token_accuracy": 0.5378138273954391, | |
| "num_tokens": 543205.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 1.123464673757553, | |
| "epoch": 2.853526220614828, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.8439201451905634e-05, | |
| "loss": 2.4656, | |
| "mean_token_accuracy": 0.5502952486276627, | |
| "num_tokens": 544634.0, | |
| "step": 396 | |
| }, | |
| { | |
| "entropy": 1.0560733824968338, | |
| "epoch": 2.8607594936708862, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.807622504537206e-05, | |
| "loss": 2.475, | |
| "mean_token_accuracy": 0.5549702495336533, | |
| "num_tokens": 546043.0, | |
| "step": 397 | |
| }, | |
| { | |
| "entropy": 1.1620246469974518, | |
| "epoch": 2.867992766726944, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.771324863883848e-05, | |
| "loss": 2.5459, | |
| "mean_token_accuracy": 0.53336001932621, | |
| "num_tokens": 547385.0, | |
| "step": 398 | |
| }, | |
| { | |
| "entropy": 1.1056917607784271, | |
| "epoch": 2.875226039783002, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.73502722323049e-05, | |
| "loss": 2.5579, | |
| "mean_token_accuracy": 0.5398988127708435, | |
| "num_tokens": 548771.0, | |
| "step": 399 | |
| }, | |
| { | |
| "entropy": 1.1166136860847473, | |
| "epoch": 2.8824593128390594, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.6987295825771325e-05, | |
| "loss": 2.4985, | |
| "mean_token_accuracy": 0.5441780239343643, | |
| "num_tokens": 550099.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.1156093180179596, | |
| "epoch": 2.8896925858951175, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.662431941923775e-05, | |
| "loss": 2.4891, | |
| "mean_token_accuracy": 0.5433008521795273, | |
| "num_tokens": 551444.0, | |
| "step": 401 | |
| }, | |
| { | |
| "entropy": 1.1306461095809937, | |
| "epoch": 2.8969258589511755, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.626134301270418e-05, | |
| "loss": 2.5102, | |
| "mean_token_accuracy": 0.5288781076669693, | |
| "num_tokens": 552832.0, | |
| "step": 402 | |
| }, | |
| { | |
| "entropy": 1.076299399137497, | |
| "epoch": 2.9041591320072335, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.5898366606170604e-05, | |
| "loss": 2.5052, | |
| "mean_token_accuracy": 0.5499210357666016, | |
| "num_tokens": 554262.0, | |
| "step": 403 | |
| }, | |
| { | |
| "entropy": 1.1156409084796906, | |
| "epoch": 2.911392405063291, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.553539019963703e-05, | |
| "loss": 2.4818, | |
| "mean_token_accuracy": 0.5514587759971619, | |
| "num_tokens": 555601.0, | |
| "step": 404 | |
| }, | |
| { | |
| "entropy": 1.1044765412807465, | |
| "epoch": 2.918625678119349, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.517241379310345e-05, | |
| "loss": 2.4969, | |
| "mean_token_accuracy": 0.5391402244567871, | |
| "num_tokens": 556959.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 1.0975346863269806, | |
| "epoch": 2.9258589511754067, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.480943738656987e-05, | |
| "loss": 2.4576, | |
| "mean_token_accuracy": 0.5543079674243927, | |
| "num_tokens": 558363.0, | |
| "step": 406 | |
| }, | |
| { | |
| "entropy": 1.1334719359874725, | |
| "epoch": 2.9330922242314648, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.4446460980036295e-05, | |
| "loss": 2.5585, | |
| "mean_token_accuracy": 0.5375552624464035, | |
| "num_tokens": 559717.0, | |
| "step": 407 | |
| }, | |
| { | |
| "entropy": 1.0860235095024109, | |
| "epoch": 2.940325497287523, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.4083484573502726e-05, | |
| "loss": 2.5664, | |
| "mean_token_accuracy": 0.5349879115819931, | |
| "num_tokens": 561050.0, | |
| "step": 408 | |
| }, | |
| { | |
| "entropy": 1.1346383392810822, | |
| "epoch": 2.9475587703435804, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.372050816696915e-05, | |
| "loss": 2.5143, | |
| "mean_token_accuracy": 0.536883682012558, | |
| "num_tokens": 562484.0, | |
| "step": 409 | |
| }, | |
| { | |
| "entropy": 1.0807745456695557, | |
| "epoch": 2.9547920433996384, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.3357531760435575e-05, | |
| "loss": 2.4388, | |
| "mean_token_accuracy": 0.5592896640300751, | |
| "num_tokens": 563891.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 1.1142305731773376, | |
| "epoch": 2.962025316455696, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.2994555353902e-05, | |
| "loss": 2.4821, | |
| "mean_token_accuracy": 0.5541020184755325, | |
| "num_tokens": 565284.0, | |
| "step": 411 | |
| }, | |
| { | |
| "entropy": 1.0620263665914536, | |
| "epoch": 2.969258589511754, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.2631578947368424e-05, | |
| "loss": 2.5498, | |
| "mean_token_accuracy": 0.5445921868085861, | |
| "num_tokens": 566623.0, | |
| "step": 412 | |
| }, | |
| { | |
| "entropy": 1.0871918499469757, | |
| "epoch": 2.976491862567812, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.226860254083484e-05, | |
| "loss": 2.4715, | |
| "mean_token_accuracy": 0.5425689667463303, | |
| "num_tokens": 568009.0, | |
| "step": 413 | |
| }, | |
| { | |
| "entropy": 1.1104594767093658, | |
| "epoch": 2.9837251356238697, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.190562613430128e-05, | |
| "loss": 2.4909, | |
| "mean_token_accuracy": 0.5514119565486908, | |
| "num_tokens": 569437.0, | |
| "step": 414 | |
| }, | |
| { | |
| "entropy": 1.103329062461853, | |
| "epoch": 2.9909584086799277, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.15426497277677e-05, | |
| "loss": 2.5503, | |
| "mean_token_accuracy": 0.5356593430042267, | |
| "num_tokens": 570799.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 1.1086884438991547, | |
| "epoch": 2.9981916817359853, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.117967332123412e-05, | |
| "loss": 2.5718, | |
| "mean_token_accuracy": 0.535956472158432, | |
| "num_tokens": 572196.0, | |
| "step": 416 | |
| }, | |
| { | |
| "entropy": 0.9301452040672302, | |
| "epoch": 3.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.0816696914700546e-05, | |
| "loss": 2.4567, | |
| "mean_token_accuracy": 0.5465116500854492, | |
| "num_tokens": 572370.0, | |
| "step": 417 | |
| }, | |
| { | |
| "entropy": 1.082727700471878, | |
| "epoch": 3.007233273056058, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.045372050816697e-05, | |
| "loss": 2.4503, | |
| "mean_token_accuracy": 0.5427384972572327, | |
| "num_tokens": 573772.0, | |
| "step": 418 | |
| }, | |
| { | |
| "entropy": 1.1125991344451904, | |
| "epoch": 3.0144665461121156, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.009074410163339e-05, | |
| "loss": 2.5188, | |
| "mean_token_accuracy": 0.5483275502920151, | |
| "num_tokens": 575161.0, | |
| "step": 419 | |
| }, | |
| { | |
| "entropy": 1.1073324084281921, | |
| "epoch": 3.0216998191681737, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.972776769509982e-05, | |
| "loss": 2.5325, | |
| "mean_token_accuracy": 0.5482524484395981, | |
| "num_tokens": 576544.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 1.0981022417545319, | |
| "epoch": 3.0289330922242317, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.936479128856625e-05, | |
| "loss": 2.4728, | |
| "mean_token_accuracy": 0.5473926514387131, | |
| "num_tokens": 577967.0, | |
| "step": 421 | |
| }, | |
| { | |
| "entropy": 1.0880376398563385, | |
| "epoch": 3.0361663652802893, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.900181488203267e-05, | |
| "loss": 2.4843, | |
| "mean_token_accuracy": 0.5521259754896164, | |
| "num_tokens": 579347.0, | |
| "step": 422 | |
| }, | |
| { | |
| "entropy": 1.1007913947105408, | |
| "epoch": 3.0433996383363473, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.863883847549909e-05, | |
| "loss": 2.5436, | |
| "mean_token_accuracy": 0.5395879149436951, | |
| "num_tokens": 580701.0, | |
| "step": 423 | |
| }, | |
| { | |
| "entropy": 1.1133521646261215, | |
| "epoch": 3.050632911392405, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.827586206896552e-05, | |
| "loss": 2.5756, | |
| "mean_token_accuracy": 0.5257195383310318, | |
| "num_tokens": 582118.0, | |
| "step": 424 | |
| }, | |
| { | |
| "entropy": 1.1162773072719574, | |
| "epoch": 3.057866184448463, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.791288566243195e-05, | |
| "loss": 2.4577, | |
| "mean_token_accuracy": 0.5567342340946198, | |
| "num_tokens": 583497.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 1.1314301490783691, | |
| "epoch": 3.065099457504521, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.7549909255898365e-05, | |
| "loss": 2.4166, | |
| "mean_token_accuracy": 0.560940682888031, | |
| "num_tokens": 584880.0, | |
| "step": 426 | |
| }, | |
| { | |
| "entropy": 1.1418559551239014, | |
| "epoch": 3.0723327305605785, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.7186932849364796e-05, | |
| "loss": 2.5006, | |
| "mean_token_accuracy": 0.5414097160100937, | |
| "num_tokens": 586303.0, | |
| "step": 427 | |
| }, | |
| { | |
| "entropy": 1.1465589702129364, | |
| "epoch": 3.0795660036166366, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.682395644283122e-05, | |
| "loss": 2.5196, | |
| "mean_token_accuracy": 0.5303985774517059, | |
| "num_tokens": 587652.0, | |
| "step": 428 | |
| }, | |
| { | |
| "entropy": 1.093212753534317, | |
| "epoch": 3.0867992766726946, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.646098003629764e-05, | |
| "loss": 2.6529, | |
| "mean_token_accuracy": 0.5337386429309845, | |
| "num_tokens": 589047.0, | |
| "step": 429 | |
| }, | |
| { | |
| "entropy": 1.0639213025569916, | |
| "epoch": 3.094032549728752, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.609800362976407e-05, | |
| "loss": 2.5235, | |
| "mean_token_accuracy": 0.5478297472000122, | |
| "num_tokens": 590357.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 1.1245644390583038, | |
| "epoch": 3.1012658227848102, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.5735027223230494e-05, | |
| "loss": 2.5644, | |
| "mean_token_accuracy": 0.5405333489179611, | |
| "num_tokens": 591728.0, | |
| "step": 431 | |
| }, | |
| { | |
| "entropy": 1.1015748530626297, | |
| "epoch": 3.108499095840868, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.537205081669691e-05, | |
| "loss": 2.4838, | |
| "mean_token_accuracy": 0.5478966683149338, | |
| "num_tokens": 593131.0, | |
| "step": 432 | |
| }, | |
| { | |
| "entropy": 1.0423375070095062, | |
| "epoch": 3.115732368896926, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.500907441016334e-05, | |
| "loss": 2.3926, | |
| "mean_token_accuracy": 0.5643803477287292, | |
| "num_tokens": 594522.0, | |
| "step": 433 | |
| }, | |
| { | |
| "entropy": 1.0879007577896118, | |
| "epoch": 3.122965641952984, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.464609800362977e-05, | |
| "loss": 2.4367, | |
| "mean_token_accuracy": 0.5585435032844543, | |
| "num_tokens": 595933.0, | |
| "step": 434 | |
| }, | |
| { | |
| "entropy": 1.0884363949298859, | |
| "epoch": 3.1301989150090415, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.428312159709619e-05, | |
| "loss": 2.5198, | |
| "mean_token_accuracy": 0.5258640795946121, | |
| "num_tokens": 597380.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 1.0987628400325775, | |
| "epoch": 3.1374321880650995, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.3920145190562616e-05, | |
| "loss": 2.6001, | |
| "mean_token_accuracy": 0.5372245460748672, | |
| "num_tokens": 598694.0, | |
| "step": 436 | |
| }, | |
| { | |
| "entropy": 1.1444098055362701, | |
| "epoch": 3.1446654611211575, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.355716878402904e-05, | |
| "loss": 2.5247, | |
| "mean_token_accuracy": 0.5320253819227219, | |
| "num_tokens": 600074.0, | |
| "step": 437 | |
| }, | |
| { | |
| "entropy": 1.0908999145030975, | |
| "epoch": 3.151898734177215, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.3194192377495465e-05, | |
| "loss": 2.4631, | |
| "mean_token_accuracy": 0.5523603707551956, | |
| "num_tokens": 601454.0, | |
| "step": 438 | |
| }, | |
| { | |
| "entropy": 1.1113301813602448, | |
| "epoch": 3.159132007233273, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.283121597096189e-05, | |
| "loss": 2.4427, | |
| "mean_token_accuracy": 0.5540345758199692, | |
| "num_tokens": 602853.0, | |
| "step": 439 | |
| }, | |
| { | |
| "entropy": 1.0943928360939026, | |
| "epoch": 3.1663652802893307, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.2468239564428313e-05, | |
| "loss": 2.4994, | |
| "mean_token_accuracy": 0.5468274131417274, | |
| "num_tokens": 604229.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 1.051211178302765, | |
| "epoch": 3.1735985533453888, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.210526315789474e-05, | |
| "loss": 2.452, | |
| "mean_token_accuracy": 0.5569523572921753, | |
| "num_tokens": 605602.0, | |
| "step": 441 | |
| }, | |
| { | |
| "entropy": 1.1384278237819672, | |
| "epoch": 3.180831826401447, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.174228675136116e-05, | |
| "loss": 2.5022, | |
| "mean_token_accuracy": 0.5312958657741547, | |
| "num_tokens": 606997.0, | |
| "step": 442 | |
| }, | |
| { | |
| "entropy": 1.148714303970337, | |
| "epoch": 3.1880650994575044, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.1379310344827587e-05, | |
| "loss": 2.5076, | |
| "mean_token_accuracy": 0.5392551869153976, | |
| "num_tokens": 608359.0, | |
| "step": 443 | |
| }, | |
| { | |
| "entropy": 1.0378380715847015, | |
| "epoch": 3.1952983725135624, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.101633393829401e-05, | |
| "loss": 2.5422, | |
| "mean_token_accuracy": 0.5553303360939026, | |
| "num_tokens": 609744.0, | |
| "step": 444 | |
| }, | |
| { | |
| "entropy": 1.0927992165088654, | |
| "epoch": 3.2025316455696204, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.065335753176044e-05, | |
| "loss": 2.4453, | |
| "mean_token_accuracy": 0.5521446019411087, | |
| "num_tokens": 611134.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 1.132373720407486, | |
| "epoch": 3.209764918625678, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.029038112522686e-05, | |
| "loss": 2.494, | |
| "mean_token_accuracy": 0.551935002207756, | |
| "num_tokens": 612463.0, | |
| "step": 446 | |
| }, | |
| { | |
| "entropy": 1.077909603714943, | |
| "epoch": 3.216998191681736, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.9927404718693284e-05, | |
| "loss": 2.4535, | |
| "mean_token_accuracy": 0.5600922256708145, | |
| "num_tokens": 613902.0, | |
| "step": 447 | |
| }, | |
| { | |
| "entropy": 1.1611990630626678, | |
| "epoch": 3.2242314647377937, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.9564428312159715e-05, | |
| "loss": 2.5906, | |
| "mean_token_accuracy": 0.5318414568901062, | |
| "num_tokens": 615225.0, | |
| "step": 448 | |
| }, | |
| { | |
| "entropy": 1.105976551771164, | |
| "epoch": 3.2314647377938517, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.920145190562613e-05, | |
| "loss": 2.5691, | |
| "mean_token_accuracy": 0.5415270179510117, | |
| "num_tokens": 616599.0, | |
| "step": 449 | |
| }, | |
| { | |
| "entropy": 1.02475506067276, | |
| "epoch": 3.2386980108499097, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.883847549909256e-05, | |
| "loss": 2.4645, | |
| "mean_token_accuracy": 0.5659692138433456, | |
| "num_tokens": 618012.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.138374000787735, | |
| "epoch": 3.2459312839059673, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.847549909255899e-05, | |
| "loss": 2.4612, | |
| "mean_token_accuracy": 0.562134400010109, | |
| "num_tokens": 619371.0, | |
| "step": 451 | |
| }, | |
| { | |
| "entropy": 1.1324277371168137, | |
| "epoch": 3.2531645569620253, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.8112522686025406e-05, | |
| "loss": 2.5048, | |
| "mean_token_accuracy": 0.5571979880332947, | |
| "num_tokens": 620687.0, | |
| "step": 452 | |
| }, | |
| { | |
| "entropy": 1.0829709619283676, | |
| "epoch": 3.2603978300180834, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.774954627949184e-05, | |
| "loss": 2.5123, | |
| "mean_token_accuracy": 0.5353300124406815, | |
| "num_tokens": 622073.0, | |
| "step": 453 | |
| }, | |
| { | |
| "entropy": 1.0782041102647781, | |
| "epoch": 3.267631103074141, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.738656987295826e-05, | |
| "loss": 2.55, | |
| "mean_token_accuracy": 0.5475586950778961, | |
| "num_tokens": 623432.0, | |
| "step": 454 | |
| }, | |
| { | |
| "entropy": 1.0757603645324707, | |
| "epoch": 3.274864376130199, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.7023593466424686e-05, | |
| "loss": 2.3509, | |
| "mean_token_accuracy": 0.5611744374036789, | |
| "num_tokens": 624838.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 1.103335440158844, | |
| "epoch": 3.282097649186257, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.666061705989111e-05, | |
| "loss": 2.4486, | |
| "mean_token_accuracy": 0.5478360801935196, | |
| "num_tokens": 626196.0, | |
| "step": 456 | |
| }, | |
| { | |
| "entropy": 1.10829758644104, | |
| "epoch": 3.2893309222423146, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.6297640653357535e-05, | |
| "loss": 2.4464, | |
| "mean_token_accuracy": 0.5474723875522614, | |
| "num_tokens": 627589.0, | |
| "step": 457 | |
| }, | |
| { | |
| "entropy": 1.1216012835502625, | |
| "epoch": 3.2965641952983726, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.593466424682396e-05, | |
| "loss": 2.5176, | |
| "mean_token_accuracy": 0.541084423661232, | |
| "num_tokens": 628982.0, | |
| "step": 458 | |
| }, | |
| { | |
| "entropy": 1.1574302315711975, | |
| "epoch": 3.3037974683544302, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.5571687840290383e-05, | |
| "loss": 2.6293, | |
| "mean_token_accuracy": 0.5247242599725723, | |
| "num_tokens": 630355.0, | |
| "step": 459 | |
| }, | |
| { | |
| "entropy": 1.1090004444122314, | |
| "epoch": 3.3110307414104883, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.520871143375681e-05, | |
| "loss": 2.4459, | |
| "mean_token_accuracy": 0.5529509037733078, | |
| "num_tokens": 631715.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 1.1352742612361908, | |
| "epoch": 3.3182640144665463, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.484573502722323e-05, | |
| "loss": 2.5476, | |
| "mean_token_accuracy": 0.5357427150011063, | |
| "num_tokens": 633097.0, | |
| "step": 461 | |
| }, | |
| { | |
| "entropy": 1.0867815911769867, | |
| "epoch": 3.325497287522604, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.4482758620689657e-05, | |
| "loss": 2.5339, | |
| "mean_token_accuracy": 0.5402389466762543, | |
| "num_tokens": 634505.0, | |
| "step": 462 | |
| }, | |
| { | |
| "entropy": 1.0494126379489899, | |
| "epoch": 3.332730560578662, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.411978221415608e-05, | |
| "loss": 2.4625, | |
| "mean_token_accuracy": 0.5514612942934036, | |
| "num_tokens": 635950.0, | |
| "step": 463 | |
| }, | |
| { | |
| "entropy": 1.066350743174553, | |
| "epoch": 3.3399638336347195, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.3756805807622505e-05, | |
| "loss": 2.4877, | |
| "mean_token_accuracy": 0.5456369668245316, | |
| "num_tokens": 637355.0, | |
| "step": 464 | |
| }, | |
| { | |
| "entropy": 1.0907469242811203, | |
| "epoch": 3.3471971066907775, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.339382940108893e-05, | |
| "loss": 2.4705, | |
| "mean_token_accuracy": 0.5456852614879608, | |
| "num_tokens": 638708.0, | |
| "step": 465 | |
| }, | |
| { | |
| "entropy": 1.0532267093658447, | |
| "epoch": 3.3544303797468356, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.3030852994555354e-05, | |
| "loss": 2.4572, | |
| "mean_token_accuracy": 0.5576175153255463, | |
| "num_tokens": 640097.0, | |
| "step": 466 | |
| }, | |
| { | |
| "entropy": 1.081478163599968, | |
| "epoch": 3.361663652802893, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.266787658802178e-05, | |
| "loss": 2.4297, | |
| "mean_token_accuracy": 0.5416488796472549, | |
| "num_tokens": 641589.0, | |
| "step": 467 | |
| }, | |
| { | |
| "entropy": 1.1244118511676788, | |
| "epoch": 3.368896925858951, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.230490018148821e-05, | |
| "loss": 2.6067, | |
| "mean_token_accuracy": 0.5368776768445969, | |
| "num_tokens": 642942.0, | |
| "step": 468 | |
| }, | |
| { | |
| "entropy": 1.1060850024223328, | |
| "epoch": 3.376130198915009, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.194192377495463e-05, | |
| "loss": 2.4724, | |
| "mean_token_accuracy": 0.555050402879715, | |
| "num_tokens": 644294.0, | |
| "step": 469 | |
| }, | |
| { | |
| "entropy": 1.0835371911525726, | |
| "epoch": 3.383363471971067, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.157894736842105e-05, | |
| "loss": 2.5014, | |
| "mean_token_accuracy": 0.5477449595928192, | |
| "num_tokens": 645696.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 1.0998270213603973, | |
| "epoch": 3.390596745027125, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.121597096188748e-05, | |
| "loss": 2.5321, | |
| "mean_token_accuracy": 0.5408206954598427, | |
| "num_tokens": 647073.0, | |
| "step": 471 | |
| }, | |
| { | |
| "entropy": 1.0803327411413193, | |
| "epoch": 3.397830018083183, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.08529945553539e-05, | |
| "loss": 2.4254, | |
| "mean_token_accuracy": 0.5565962195396423, | |
| "num_tokens": 648481.0, | |
| "step": 472 | |
| }, | |
| { | |
| "entropy": 1.1584204137325287, | |
| "epoch": 3.4050632911392404, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.0490018148820325e-05, | |
| "loss": 2.4628, | |
| "mean_token_accuracy": 0.5492859929800034, | |
| "num_tokens": 649890.0, | |
| "step": 473 | |
| }, | |
| { | |
| "entropy": 1.0719866752624512, | |
| "epoch": 3.4122965641952985, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.0127041742286756e-05, | |
| "loss": 2.547, | |
| "mean_token_accuracy": 0.5507875829935074, | |
| "num_tokens": 651282.0, | |
| "step": 474 | |
| }, | |
| { | |
| "entropy": 1.0890982151031494, | |
| "epoch": 3.419529837251356, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.9764065335753177e-05, | |
| "loss": 2.4228, | |
| "mean_token_accuracy": 0.5508367717266083, | |
| "num_tokens": 652705.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 1.1372348964214325, | |
| "epoch": 3.426763110307414, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.94010889292196e-05, | |
| "loss": 2.6279, | |
| "mean_token_accuracy": 0.5274247825145721, | |
| "num_tokens": 654090.0, | |
| "step": 476 | |
| }, | |
| { | |
| "entropy": 1.1086134016513824, | |
| "epoch": 3.433996383363472, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.903811252268603e-05, | |
| "loss": 2.5969, | |
| "mean_token_accuracy": 0.5212997198104858, | |
| "num_tokens": 655493.0, | |
| "step": 477 | |
| }, | |
| { | |
| "entropy": 1.1175757050514221, | |
| "epoch": 3.4412296564195297, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.867513611615245e-05, | |
| "loss": 2.5913, | |
| "mean_token_accuracy": 0.5459360331296921, | |
| "num_tokens": 656820.0, | |
| "step": 478 | |
| }, | |
| { | |
| "entropy": 1.0289329886436462, | |
| "epoch": 3.4484629294755877, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.8312159709618874e-05, | |
| "loss": 2.4419, | |
| "mean_token_accuracy": 0.5702246725559235, | |
| "num_tokens": 658208.0, | |
| "step": 479 | |
| }, | |
| { | |
| "entropy": 1.0913092195987701, | |
| "epoch": 3.4556962025316453, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.7949183303085302e-05, | |
| "loss": 2.5732, | |
| "mean_token_accuracy": 0.5397117137908936, | |
| "num_tokens": 659557.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 1.0869504362344742, | |
| "epoch": 3.4629294755877034, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.7586206896551727e-05, | |
| "loss": 2.4652, | |
| "mean_token_accuracy": 0.5512167811393738, | |
| "num_tokens": 660949.0, | |
| "step": 481 | |
| }, | |
| { | |
| "entropy": 1.0968185365200043, | |
| "epoch": 3.4701627486437614, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.7223230490018148e-05, | |
| "loss": 2.4802, | |
| "mean_token_accuracy": 0.5515278428792953, | |
| "num_tokens": 662362.0, | |
| "step": 482 | |
| }, | |
| { | |
| "entropy": 1.0313489437103271, | |
| "epoch": 3.477396021699819, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.6860254083484575e-05, | |
| "loss": 2.4, | |
| "mean_token_accuracy": 0.558410719037056, | |
| "num_tokens": 663815.0, | |
| "step": 483 | |
| }, | |
| { | |
| "entropy": 1.0685087740421295, | |
| "epoch": 3.484629294755877, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.6497277676951e-05, | |
| "loss": 2.4427, | |
| "mean_token_accuracy": 0.5495921522378922, | |
| "num_tokens": 665156.0, | |
| "step": 484 | |
| }, | |
| { | |
| "entropy": 1.0642586052417755, | |
| "epoch": 3.491862567811935, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.613430127041742e-05, | |
| "loss": 2.4741, | |
| "mean_token_accuracy": 0.5541531145572662, | |
| "num_tokens": 666532.0, | |
| "step": 485 | |
| }, | |
| { | |
| "entropy": 1.1081467866897583, | |
| "epoch": 3.4990958408679926, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.577132486388385e-05, | |
| "loss": 2.4227, | |
| "mean_token_accuracy": 0.5435217171907425, | |
| "num_tokens": 667952.0, | |
| "step": 486 | |
| }, | |
| { | |
| "entropy": 1.0714454650878906, | |
| "epoch": 3.5063291139240507, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.5408348457350273e-05, | |
| "loss": 2.5223, | |
| "mean_token_accuracy": 0.5453221052885056, | |
| "num_tokens": 669331.0, | |
| "step": 487 | |
| }, | |
| { | |
| "entropy": 1.1786887049674988, | |
| "epoch": 3.5135623869801087, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.5045372050816694e-05, | |
| "loss": 2.4881, | |
| "mean_token_accuracy": 0.5456383675336838, | |
| "num_tokens": 670744.0, | |
| "step": 488 | |
| }, | |
| { | |
| "entropy": 1.0308251529932022, | |
| "epoch": 3.5207956600361663, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.4682395644283125e-05, | |
| "loss": 2.4407, | |
| "mean_token_accuracy": 0.5562519431114197, | |
| "num_tokens": 672168.0, | |
| "step": 489 | |
| }, | |
| { | |
| "entropy": 1.1447840631008148, | |
| "epoch": 3.5280289330922243, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.4319419237749546e-05, | |
| "loss": 2.4995, | |
| "mean_token_accuracy": 0.548570990562439, | |
| "num_tokens": 673522.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 1.1574127972126007, | |
| "epoch": 3.535262206148282, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.3956442831215974e-05, | |
| "loss": 2.5088, | |
| "mean_token_accuracy": 0.5454981774091721, | |
| "num_tokens": 674902.0, | |
| "step": 491 | |
| }, | |
| { | |
| "entropy": 1.0497371554374695, | |
| "epoch": 3.54249547920434, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.3593466424682398e-05, | |
| "loss": 2.4379, | |
| "mean_token_accuracy": 0.5521166771650314, | |
| "num_tokens": 676251.0, | |
| "step": 492 | |
| }, | |
| { | |
| "entropy": 1.0881330370903015, | |
| "epoch": 3.549728752260398, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.323049001814882e-05, | |
| "loss": 2.6047, | |
| "mean_token_accuracy": 0.5381551831960678, | |
| "num_tokens": 677576.0, | |
| "step": 493 | |
| }, | |
| { | |
| "entropy": 1.0649862885475159, | |
| "epoch": 3.5569620253164556, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.2867513611615247e-05, | |
| "loss": 2.5863, | |
| "mean_token_accuracy": 0.5362200736999512, | |
| "num_tokens": 678938.0, | |
| "step": 494 | |
| }, | |
| { | |
| "entropy": 1.11095330119133, | |
| "epoch": 3.5641952983725136, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.250453720508167e-05, | |
| "loss": 2.4817, | |
| "mean_token_accuracy": 0.5521639734506607, | |
| "num_tokens": 680340.0, | |
| "step": 495 | |
| }, | |
| { | |
| "entropy": 1.1321823298931122, | |
| "epoch": 3.571428571428571, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.2141560798548096e-05, | |
| "loss": 2.5982, | |
| "mean_token_accuracy": 0.5332741737365723, | |
| "num_tokens": 681726.0, | |
| "step": 496 | |
| }, | |
| { | |
| "entropy": 1.0432183742523193, | |
| "epoch": 3.578661844484629, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.177858439201452e-05, | |
| "loss": 2.538, | |
| "mean_token_accuracy": 0.5448485761880875, | |
| "num_tokens": 683093.0, | |
| "step": 497 | |
| }, | |
| { | |
| "entropy": 1.1311353743076324, | |
| "epoch": 3.5858951175406872, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.1415607985480945e-05, | |
| "loss": 2.5891, | |
| "mean_token_accuracy": 0.5331176221370697, | |
| "num_tokens": 684418.0, | |
| "step": 498 | |
| }, | |
| { | |
| "entropy": 1.0668546259403229, | |
| "epoch": 3.5931283905967453, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.105263157894737e-05, | |
| "loss": 2.4809, | |
| "mean_token_accuracy": 0.5538065284490585, | |
| "num_tokens": 685847.0, | |
| "step": 499 | |
| }, | |
| { | |
| "entropy": 1.105953961610794, | |
| "epoch": 3.600361663652803, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.0689655172413793e-05, | |
| "loss": 2.5048, | |
| "mean_token_accuracy": 0.5508566051721573, | |
| "num_tokens": 687244.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 1.0005443841218948, | |
| "epoch": 3.607594936708861, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.032667876588022e-05, | |
| "loss": 2.5028, | |
| "mean_token_accuracy": 0.5614646375179291, | |
| "num_tokens": 688625.0, | |
| "step": 501 | |
| }, | |
| { | |
| "entropy": 1.1225857138633728, | |
| "epoch": 3.6148282097649185, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.9963702359346642e-05, | |
| "loss": 2.4824, | |
| "mean_token_accuracy": 0.5448538213968277, | |
| "num_tokens": 690000.0, | |
| "step": 502 | |
| }, | |
| { | |
| "entropy": 1.1489295065402985, | |
| "epoch": 3.6220614828209765, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.9600725952813066e-05, | |
| "loss": 2.5399, | |
| "mean_token_accuracy": 0.5357647836208344, | |
| "num_tokens": 691320.0, | |
| "step": 503 | |
| }, | |
| { | |
| "entropy": 1.078998863697052, | |
| "epoch": 3.6292947558770345, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.9237749546279494e-05, | |
| "loss": 2.6026, | |
| "mean_token_accuracy": 0.5319690853357315, | |
| "num_tokens": 692652.0, | |
| "step": 504 | |
| }, | |
| { | |
| "entropy": 1.130728840827942, | |
| "epoch": 3.636528028933092, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.887477313974592e-05, | |
| "loss": 2.5335, | |
| "mean_token_accuracy": 0.5432541519403458, | |
| "num_tokens": 694091.0, | |
| "step": 505 | |
| }, | |
| { | |
| "entropy": 1.0619003176689148, | |
| "epoch": 3.64376130198915, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.8511796733212343e-05, | |
| "loss": 2.4622, | |
| "mean_token_accuracy": 0.5655789524316788, | |
| "num_tokens": 695507.0, | |
| "step": 506 | |
| }, | |
| { | |
| "entropy": 1.1114209294319153, | |
| "epoch": 3.6509945750452077, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.8148820326678767e-05, | |
| "loss": 2.5158, | |
| "mean_token_accuracy": 0.5567562431097031, | |
| "num_tokens": 696867.0, | |
| "step": 507 | |
| }, | |
| { | |
| "entropy": 1.1229240000247955, | |
| "epoch": 3.6582278481012658, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.7785843920145192e-05, | |
| "loss": 2.5613, | |
| "mean_token_accuracy": 0.5479451417922974, | |
| "num_tokens": 698187.0, | |
| "step": 508 | |
| }, | |
| { | |
| "entropy": 1.15980663895607, | |
| "epoch": 3.665461121157324, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.7422867513611616e-05, | |
| "loss": 2.569, | |
| "mean_token_accuracy": 0.5393990874290466, | |
| "num_tokens": 699559.0, | |
| "step": 509 | |
| }, | |
| { | |
| "entropy": 1.0907841324806213, | |
| "epoch": 3.6726943942133814, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.705989110707804e-05, | |
| "loss": 2.5011, | |
| "mean_token_accuracy": 0.5401911735534668, | |
| "num_tokens": 700954.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 1.1135782897472382, | |
| "epoch": 3.6799276672694394, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.6696914700544465e-05, | |
| "loss": 2.4691, | |
| "mean_token_accuracy": 0.5452233403921127, | |
| "num_tokens": 702378.0, | |
| "step": 511 | |
| }, | |
| { | |
| "entropy": 1.0565876811742783, | |
| "epoch": 3.687160940325497, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.633393829401089e-05, | |
| "loss": 2.4989, | |
| "mean_token_accuracy": 0.5499396473169327, | |
| "num_tokens": 703785.0, | |
| "step": 512 | |
| }, | |
| { | |
| "entropy": 1.0759983956813812, | |
| "epoch": 3.694394213381555, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.5970961887477314e-05, | |
| "loss": 2.5112, | |
| "mean_token_accuracy": 0.5497391521930695, | |
| "num_tokens": 705219.0, | |
| "step": 513 | |
| }, | |
| { | |
| "entropy": 1.1497739553451538, | |
| "epoch": 3.701627486437613, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.560798548094374e-05, | |
| "loss": 2.5841, | |
| "mean_token_accuracy": 0.5287934392690659, | |
| "num_tokens": 706606.0, | |
| "step": 514 | |
| }, | |
| { | |
| "entropy": 1.0716162323951721, | |
| "epoch": 3.708860759493671, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.5245009074410162e-05, | |
| "loss": 2.5038, | |
| "mean_token_accuracy": 0.5536665618419647, | |
| "num_tokens": 707989.0, | |
| "step": 515 | |
| }, | |
| { | |
| "entropy": 1.0738689005374908, | |
| "epoch": 3.7160940325497287, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.4882032667876588e-05, | |
| "loss": 2.5212, | |
| "mean_token_accuracy": 0.5448974221944809, | |
| "num_tokens": 709349.0, | |
| "step": 516 | |
| }, | |
| { | |
| "entropy": 1.1064155101776123, | |
| "epoch": 3.7233273056057867, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.4519056261343015e-05, | |
| "loss": 2.6506, | |
| "mean_token_accuracy": 0.5230308175086975, | |
| "num_tokens": 710705.0, | |
| "step": 517 | |
| }, | |
| { | |
| "entropy": 1.058555543422699, | |
| "epoch": 3.7305605786618443, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.4156079854809437e-05, | |
| "loss": 2.4917, | |
| "mean_token_accuracy": 0.5443996042013168, | |
| "num_tokens": 712069.0, | |
| "step": 518 | |
| }, | |
| { | |
| "entropy": 1.0357494354248047, | |
| "epoch": 3.7377938517179023, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.3793103448275863e-05, | |
| "loss": 2.4184, | |
| "mean_token_accuracy": 0.5529536008834839, | |
| "num_tokens": 713492.0, | |
| "step": 519 | |
| }, | |
| { | |
| "entropy": 1.1316678524017334, | |
| "epoch": 3.7450271247739604, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.3430127041742288e-05, | |
| "loss": 2.4531, | |
| "mean_token_accuracy": 0.544991984963417, | |
| "num_tokens": 714834.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 1.1400097012519836, | |
| "epoch": 3.752260397830018, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.306715063520871e-05, | |
| "loss": 2.5073, | |
| "mean_token_accuracy": 0.5420869141817093, | |
| "num_tokens": 716274.0, | |
| "step": 521 | |
| }, | |
| { | |
| "entropy": 1.157213181257248, | |
| "epoch": 3.759493670886076, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.2704174228675136e-05, | |
| "loss": 2.6404, | |
| "mean_token_accuracy": 0.5418842732906342, | |
| "num_tokens": 717608.0, | |
| "step": 522 | |
| }, | |
| { | |
| "entropy": 1.0655402839183807, | |
| "epoch": 3.7667269439421336, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.2341197822141563e-05, | |
| "loss": 2.6179, | |
| "mean_token_accuracy": 0.5416123121976852, | |
| "num_tokens": 718918.0, | |
| "step": 523 | |
| }, | |
| { | |
| "entropy": 1.1274596750736237, | |
| "epoch": 3.7739602169981916, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.1978221415607987e-05, | |
| "loss": 2.5871, | |
| "mean_token_accuracy": 0.5287514328956604, | |
| "num_tokens": 720304.0, | |
| "step": 524 | |
| }, | |
| { | |
| "entropy": 1.1309744715690613, | |
| "epoch": 3.7811934900542497, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.161524500907441e-05, | |
| "loss": 2.4583, | |
| "mean_token_accuracy": 0.5526018738746643, | |
| "num_tokens": 721778.0, | |
| "step": 525 | |
| }, | |
| { | |
| "entropy": 1.0946650505065918, | |
| "epoch": 3.7884267631103077, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.1252268602540836e-05, | |
| "loss": 2.6319, | |
| "mean_token_accuracy": 0.5338682383298874, | |
| "num_tokens": 723119.0, | |
| "step": 526 | |
| }, | |
| { | |
| "entropy": 1.0975251197814941, | |
| "epoch": 3.7956600361663653, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.088929219600726e-05, | |
| "loss": 2.4436, | |
| "mean_token_accuracy": 0.5491883158683777, | |
| "num_tokens": 724610.0, | |
| "step": 527 | |
| }, | |
| { | |
| "entropy": 1.0772320330142975, | |
| "epoch": 3.8028933092224233, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.0526315789473684e-05, | |
| "loss": 2.5004, | |
| "mean_token_accuracy": 0.5622773170471191, | |
| "num_tokens": 725947.0, | |
| "step": 528 | |
| }, | |
| { | |
| "entropy": 1.0730805099010468, | |
| "epoch": 3.810126582278481, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.016333938294011e-05, | |
| "loss": 2.4317, | |
| "mean_token_accuracy": 0.5641112923622131, | |
| "num_tokens": 727305.0, | |
| "step": 529 | |
| }, | |
| { | |
| "entropy": 1.071317881345749, | |
| "epoch": 3.817359855334539, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.800362976406533e-06, | |
| "loss": 2.5299, | |
| "mean_token_accuracy": 0.5482836663722992, | |
| "num_tokens": 728688.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 1.0742418766021729, | |
| "epoch": 3.824593128390597, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.43738656987296e-06, | |
| "loss": 2.4834, | |
| "mean_token_accuracy": 0.5449412018060684, | |
| "num_tokens": 730106.0, | |
| "step": 531 | |
| }, | |
| { | |
| "entropy": 1.1253242194652557, | |
| "epoch": 3.8318264014466545, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.074410163339384e-06, | |
| "loss": 2.6065, | |
| "mean_token_accuracy": 0.5322617739439011, | |
| "num_tokens": 731510.0, | |
| "step": 532 | |
| }, | |
| { | |
| "entropy": 1.0974994003772736, | |
| "epoch": 3.8390596745027126, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.711433756805808e-06, | |
| "loss": 2.5402, | |
| "mean_token_accuracy": 0.5461835712194443, | |
| "num_tokens": 732829.0, | |
| "step": 533 | |
| }, | |
| { | |
| "entropy": 1.1266226470470428, | |
| "epoch": 3.84629294755877, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.348457350272232e-06, | |
| "loss": 2.6068, | |
| "mean_token_accuracy": 0.5259987786412239, | |
| "num_tokens": 734191.0, | |
| "step": 534 | |
| }, | |
| { | |
| "entropy": 1.12846839427948, | |
| "epoch": 3.853526220614828, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.985480943738657e-06, | |
| "loss": 2.4417, | |
| "mean_token_accuracy": 0.5635862648487091, | |
| "num_tokens": 735573.0, | |
| "step": 535 | |
| }, | |
| { | |
| "entropy": 1.1160639226436615, | |
| "epoch": 3.8607594936708862, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.622504537205081e-06, | |
| "loss": 2.4455, | |
| "mean_token_accuracy": 0.5517152100801468, | |
| "num_tokens": 737003.0, | |
| "step": 536 | |
| }, | |
| { | |
| "entropy": 1.0702637135982513, | |
| "epoch": 3.867992766726944, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.259528130671507e-06, | |
| "loss": 2.5004, | |
| "mean_token_accuracy": 0.5487655699253082, | |
| "num_tokens": 738375.0, | |
| "step": 537 | |
| }, | |
| { | |
| "entropy": 1.0945512652397156, | |
| "epoch": 3.875226039783002, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.896551724137932e-06, | |
| "loss": 2.5679, | |
| "mean_token_accuracy": 0.5293791145086288, | |
| "num_tokens": 739748.0, | |
| "step": 538 | |
| }, | |
| { | |
| "entropy": 1.0917899906635284, | |
| "epoch": 3.8824593128390594, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.533575317604355e-06, | |
| "loss": 2.6122, | |
| "mean_token_accuracy": 0.5389818549156189, | |
| "num_tokens": 741088.0, | |
| "step": 539 | |
| }, | |
| { | |
| "entropy": 1.1529441475868225, | |
| "epoch": 3.8896925858951175, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.170598911070781e-06, | |
| "loss": 2.5513, | |
| "mean_token_accuracy": 0.5297400206327438, | |
| "num_tokens": 742432.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 1.0943627953529358, | |
| "epoch": 3.8969258589511755, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.807622504537205e-06, | |
| "loss": 2.5533, | |
| "mean_token_accuracy": 0.5300375521183014, | |
| "num_tokens": 743872.0, | |
| "step": 541 | |
| }, | |
| { | |
| "entropy": 1.1212878823280334, | |
| "epoch": 3.9041591320072335, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.44464609800363e-06, | |
| "loss": 2.5892, | |
| "mean_token_accuracy": 0.5295825377106667, | |
| "num_tokens": 745229.0, | |
| "step": 542 | |
| }, | |
| { | |
| "entropy": 1.0862334966659546, | |
| "epoch": 3.911392405063291, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.081669691470055e-06, | |
| "loss": 2.4351, | |
| "mean_token_accuracy": 0.549940288066864, | |
| "num_tokens": 746589.0, | |
| "step": 543 | |
| }, | |
| { | |
| "entropy": 1.1090565025806427, | |
| "epoch": 3.918625678119349, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.71869328493648e-06, | |
| "loss": 2.6019, | |
| "mean_token_accuracy": 0.5316396206617355, | |
| "num_tokens": 747923.0, | |
| "step": 544 | |
| }, | |
| { | |
| "entropy": 1.0873733460903168, | |
| "epoch": 3.9258589511754067, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.355716878402904e-06, | |
| "loss": 2.4612, | |
| "mean_token_accuracy": 0.5497990250587463, | |
| "num_tokens": 749358.0, | |
| "step": 545 | |
| }, | |
| { | |
| "entropy": 1.1330247223377228, | |
| "epoch": 3.9330922242314648, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.992740471869328e-06, | |
| "loss": 2.3935, | |
| "mean_token_accuracy": 0.5545907616615295, | |
| "num_tokens": 750795.0, | |
| "step": 546 | |
| }, | |
| { | |
| "entropy": 1.1012303829193115, | |
| "epoch": 3.940325497287523, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.6297640653357536e-06, | |
| "loss": 2.5501, | |
| "mean_token_accuracy": 0.5469983816146851, | |
| "num_tokens": 752133.0, | |
| "step": 547 | |
| }, | |
| { | |
| "entropy": 1.052758365869522, | |
| "epoch": 3.9475587703435804, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.2667876588021776e-06, | |
| "loss": 2.473, | |
| "mean_token_accuracy": 0.5600574761629105, | |
| "num_tokens": 753544.0, | |
| "step": 548 | |
| }, | |
| { | |
| "entropy": 1.098718285560608, | |
| "epoch": 3.9547920433996384, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.9038112522686024e-06, | |
| "loss": 2.5281, | |
| "mean_token_accuracy": 0.5494914799928665, | |
| "num_tokens": 754898.0, | |
| "step": 549 | |
| }, | |
| { | |
| "entropy": 1.1130988895893097, | |
| "epoch": 3.962025316455696, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.5408348457350276e-06, | |
| "loss": 2.5836, | |
| "mean_token_accuracy": 0.5407898128032684, | |
| "num_tokens": 756240.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 1.1131569147109985, | |
| "epoch": 3.969258589511754, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.177858439201452e-06, | |
| "loss": 2.5982, | |
| "mean_token_accuracy": 0.5369937494397163, | |
| "num_tokens": 757580.0, | |
| "step": 551 | |
| }, | |
| { | |
| "entropy": 1.1359702944755554, | |
| "epoch": 3.976491862567812, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.8148820326678768e-06, | |
| "loss": 2.5958, | |
| "mean_token_accuracy": 0.5286305099725723, | |
| "num_tokens": 758936.0, | |
| "step": 552 | |
| }, | |
| { | |
| "entropy": 1.0897059440612793, | |
| "epoch": 3.9837251356238697, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.4519056261343012e-06, | |
| "loss": 2.4814, | |
| "mean_token_accuracy": 0.5611370354890823, | |
| "num_tokens": 760290.0, | |
| "step": 553 | |
| }, | |
| { | |
| "entropy": 1.1387740671634674, | |
| "epoch": 3.9909584086799277, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.088929219600726e-06, | |
| "loss": 2.615, | |
| "mean_token_accuracy": 0.5323370546102524, | |
| "num_tokens": 761671.0, | |
| "step": 554 | |
| }, | |
| { | |
| "entropy": 1.0992904007434845, | |
| "epoch": 3.9981916817359853, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.259528130671506e-07, | |
| "loss": 2.5056, | |
| "mean_token_accuracy": 0.5511928796768188, | |
| "num_tokens": 763000.0, | |
| "step": 555 | |
| }, | |
| { | |
| "entropy": 1.0427762269973755, | |
| "epoch": 4.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.629764065335753e-07, | |
| "loss": 2.6583, | |
| "mean_token_accuracy": 0.5569620132446289, | |
| "num_tokens": 763160.0, | |
| "step": 556 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 556, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.167535347687424e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
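
The object above is the state file the Hugging Face Trainer writes alongside a checkpoint: `log_history` holds one entry per logged step, and the trailer fields (`max_steps`, `num_train_epochs`, `train_batch_size`, ...) describe the run. A minimal sketch for consuming it, assuming the file is saved under its usual name `trainer_state.json`; the epoch cutoff and summary printed here are illustrative, not part of the original file:

```python
import json

# Load the trainer state (the file shown above).
with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]

# Run-level trailer fields.
print(f"steps logged: {len(history)} / max_steps: {state['max_steps']}")
print(f"epochs: {state['num_train_epochs']}, batch size: {state['train_batch_size']}")

# Example summary: mean training loss over the final epoch of this run.
final_epoch = [h["loss"] for h in history if h["epoch"] > 3.0]
print(f"mean loss, final epoch: {sum(final_epoch) / len(final_epoch):.4f}")
```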