diff --git "a/checkpoint-9000/trainer_state.json" "b/checkpoint-9000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-9000/trainer_state.json" @@ -0,0 +1,7534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.7778377961262444, + "eval_steps": 500, + "global_step": 9000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 0.8813431424399217, + "epoch": 0.003703989505363068, + "grad_norm": 0.0770227387547493, + "learning_rate": 4.4e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7800739134351412, + "num_tokens": 2121076.0, + "step": 12 + }, + { + "entropy": 0.8696938330928484, + "epoch": 0.007407979010726136, + "grad_norm": 0.07951901108026505, + "learning_rate": 9.2e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7851626686751842, + "num_tokens": 4244776.0, + "step": 24 + }, + { + "entropy": 0.9179414622485638, + "epoch": 0.011111968516089204, + "grad_norm": 0.07233385741710663, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.9521, + "mean_token_accuracy": 0.783088818192482, + "num_tokens": 6391196.0, + "step": 36 + }, + { + "entropy": 0.9946566758056482, + "epoch": 0.014815958021452273, + "grad_norm": 0.061557795852422714, + "learning_rate": 1.88e-05, + "loss": 0.9499, + "mean_token_accuracy": 0.7822788568834463, + "num_tokens": 8503157.0, + "step": 48 + }, + { + "entropy": 0.9443877513209978, + "epoch": 0.01851994752681534, + "grad_norm": 0.056875523179769516, + "learning_rate": 2.36e-05, + "loss": 0.9141, + "mean_token_accuracy": 0.7903395928442478, + "num_tokens": 10635230.0, + "step": 60 + }, + { + "entropy": 0.9328317133088907, + "epoch": 0.02222393703217841, + "grad_norm": 0.06272337585687637, + "learning_rate": 2.84e-05, + "loss": 0.9189, + "mean_token_accuracy": 0.7886765040457249, + "num_tokens": 12765188.0, + "step": 72 + }, + { + "entropy": 0.9268471834560236, + "epoch": 0.025927926537541477, + "grad_norm": 0.06604354083538055, + "learning_rate": 3.32e-05, + "loss": 0.9058, + "mean_token_accuracy": 0.7908310927450657, + "num_tokens": 14906939.0, + "step": 84 + }, + { + "entropy": 0.9278339172403017, + "epoch": 0.029631916042904546, + "grad_norm": 0.07385515421628952, + "learning_rate": 3.8e-05, + "loss": 0.9058, + "mean_token_accuracy": 0.7903257831931114, + "num_tokens": 17063752.0, + "step": 96 + }, + { + "entropy": 0.920132706562678, + "epoch": 0.03333590554826761, + "grad_norm": 0.08544895797967911, + "learning_rate": 4.2800000000000004e-05, + "loss": 0.8985, + "mean_token_accuracy": 0.7918331262965997, + "num_tokens": 19212379.0, + "step": 108 + }, + { + "entropy": 0.9165268304447333, + "epoch": 0.03703989505363068, + "grad_norm": 0.07440265268087387, + "learning_rate": 4.76e-05, + "loss": 0.8934, + "mean_token_accuracy": 0.7929873218139013, + "num_tokens": 21338398.0, + "step": 120 + }, + { + "entropy": 0.9263193979859352, + "epoch": 0.04074388455899375, + "grad_norm": 0.07524750381708145, + "learning_rate": 5.2400000000000007e-05, + "loss": 0.9026, + "mean_token_accuracy": 0.7904796029130617, + "num_tokens": 23447525.0, + "step": 132 + }, + { + "entropy": 0.9026983591417471, + "epoch": 0.04444787406435682, + "grad_norm": 0.0853290930390358, + "learning_rate": 5.72e-05, + "loss": 0.8807, + "mean_token_accuracy": 0.7949702950815359, + "num_tokens": 25576292.0, + "step": 144 + }, + { + "entropy": 0.91398652891318, + "epoch": 0.048151863569719885, + "grad_norm": 0.08753567934036255, + "learning_rate": 6.2e-05, + "loss": 0.893, + "mean_token_accuracy": 0.7922145264844099, + "num_tokens": 27677943.0, + "step": 156 + }, + { + "entropy": 0.9047610275447369, + "epoch": 0.051855853075082954, + "grad_norm": 0.09057004004716873, + "learning_rate": 6.680000000000001e-05, + "loss": 0.8825, + "mean_token_accuracy": 0.7944387656946977, + "num_tokens": 29799957.0, + "step": 168 + }, + { + "entropy": 0.8962277062237263, + "epoch": 0.05555984258044602, + "grad_norm": 0.09516258537769318, + "learning_rate": 7.16e-05, + "loss": 0.8759, + "mean_token_accuracy": 0.7956550841530164, + "num_tokens": 31935987.0, + "step": 180 + }, + { + "entropy": 0.8993425803879896, + "epoch": 0.05926383208580909, + "grad_norm": 0.09167130291461945, + "learning_rate": 7.64e-05, + "loss": 0.8781, + "mean_token_accuracy": 0.7949150291581949, + "num_tokens": 34073688.0, + "step": 192 + }, + { + "entropy": 0.8955355410774549, + "epoch": 0.06296782159117216, + "grad_norm": 0.10528211295604706, + "learning_rate": 8.120000000000001e-05, + "loss": 0.8769, + "mean_token_accuracy": 0.7950745013852915, + "num_tokens": 36211776.0, + "step": 204 + }, + { + "entropy": 0.890474279721578, + "epoch": 0.06667181109653522, + "grad_norm": 0.09239654988050461, + "learning_rate": 8.6e-05, + "loss": 0.8681, + "mean_token_accuracy": 0.7962082512676716, + "num_tokens": 38314176.0, + "step": 216 + }, + { + "entropy": 0.8846843503415585, + "epoch": 0.0703758006018983, + "grad_norm": 0.08734577894210815, + "learning_rate": 9.080000000000001e-05, + "loss": 0.864, + "mean_token_accuracy": 0.7977614911894003, + "num_tokens": 40427803.0, + "step": 228 + }, + { + "entropy": 0.8876174452404181, + "epoch": 0.07407979010726136, + "grad_norm": 0.09199592471122742, + "learning_rate": 9.56e-05, + "loss": 0.8663, + "mean_token_accuracy": 0.7968363290031751, + "num_tokens": 42601795.0, + "step": 240 + }, + { + "entropy": 0.8842576717336973, + "epoch": 0.07778377961262443, + "grad_norm": 0.09066279232501984, + "learning_rate": 9.999999821283761e-05, + "loss": 0.8596, + "mean_token_accuracy": 0.7992542050778866, + "num_tokens": 44720639.0, + "step": 252 + }, + { + "entropy": 0.8819859276215235, + "epoch": 0.0814877691179875, + "grad_norm": 0.09380369633436203, + "learning_rate": 9.999969796985704e-05, + "loss": 0.8606, + "mean_token_accuracy": 0.7987109025319418, + "num_tokens": 46822627.0, + "step": 264 + }, + { + "entropy": 0.8764691551526388, + "epoch": 0.08519175862335057, + "grad_norm": 0.08952156454324722, + "learning_rate": 9.999888302765345e-05, + "loss": 0.8564, + "mean_token_accuracy": 0.7992806943754355, + "num_tokens": 48939905.0, + "step": 276 + }, + { + "entropy": 0.8867798671126366, + "epoch": 0.08889574812871363, + "grad_norm": 0.0936591774225235, + "learning_rate": 9.999755339461591e-05, + "loss": 0.8668, + "mean_token_accuracy": 0.7967317899068197, + "num_tokens": 51041899.0, + "step": 288 + }, + { + "entropy": 0.8787145999570688, + "epoch": 0.09259973763407671, + "grad_norm": 0.0926186814904213, + "learning_rate": 9.999570908443172e-05, + "loss": 0.8574, + "mean_token_accuracy": 0.7985943108797073, + "num_tokens": 53173319.0, + "step": 300 + }, + { + "entropy": 0.8810293078422546, + "epoch": 0.09630372713943977, + "grad_norm": 0.09375683963298798, + "learning_rate": 9.99933501160863e-05, + "loss": 0.8602, + "mean_token_accuracy": 0.7984230530758699, + "num_tokens": 55298285.0, + "step": 312 + }, + { + "entropy": 0.8722913352151712, + "epoch": 0.10000771664480285, + "grad_norm": 0.09732625633478165, + "learning_rate": 9.999047651386295e-05, + "loss": 0.8545, + "mean_token_accuracy": 0.7996316676338514, + "num_tokens": 57422076.0, + "step": 324 + }, + { + "entropy": 0.8687134782473246, + "epoch": 0.10371170615016591, + "grad_norm": 0.08659056574106216, + "learning_rate": 9.99870883073427e-05, + "loss": 0.8453, + "mean_token_accuracy": 0.8012902028858662, + "num_tokens": 59585248.0, + "step": 336 + }, + { + "entropy": 0.8830971730252107, + "epoch": 0.10741569565552897, + "grad_norm": 0.08845651149749756, + "learning_rate": 9.998318553140387e-05, + "loss": 0.8627, + "mean_token_accuracy": 0.7974087223410606, + "num_tokens": 61673684.0, + "step": 348 + }, + { + "entropy": 0.8765653309722742, + "epoch": 0.11111968516089205, + "grad_norm": 0.08888445049524307, + "learning_rate": 9.997876822622186e-05, + "loss": 0.8556, + "mean_token_accuracy": 0.7995972596108913, + "num_tokens": 63792291.0, + "step": 360 + }, + { + "entropy": 0.871281652400891, + "epoch": 0.11482367466625511, + "grad_norm": 0.09176748991012573, + "learning_rate": 9.99738364372686e-05, + "loss": 0.8534, + "mean_token_accuracy": 0.7989814169704914, + "num_tokens": 65901685.0, + "step": 372 + }, + { + "entropy": 0.8688261434435844, + "epoch": 0.11852766417161818, + "grad_norm": 0.09121166169643402, + "learning_rate": 9.996839021531213e-05, + "loss": 0.8467, + "mean_token_accuracy": 0.8007690558830897, + "num_tokens": 68051060.0, + "step": 384 + }, + { + "entropy": 0.8706431252261003, + "epoch": 0.12223165367698124, + "grad_norm": 0.09300525486469269, + "learning_rate": 9.996242961641615e-05, + "loss": 0.8536, + "mean_token_accuracy": 0.7998151158293089, + "num_tokens": 70196769.0, + "step": 396 + }, + { + "entropy": 0.8776354491710663, + "epoch": 0.12593564318234432, + "grad_norm": 0.08426107466220856, + "learning_rate": 9.995595470193933e-05, + "loss": 0.8545, + "mean_token_accuracy": 0.7993121594190598, + "num_tokens": 72307999.0, + "step": 408 + }, + { + "entropy": 0.8773620029290518, + "epoch": 0.1296396326877074, + "grad_norm": 0.09252817928791046, + "learning_rate": 9.994896553853472e-05, + "loss": 0.8581, + "mean_token_accuracy": 0.7982841742535433, + "num_tokens": 74422316.0, + "step": 420 + }, + { + "entropy": 0.8797574390967687, + "epoch": 0.13334362219307044, + "grad_norm": 0.08833932876586914, + "learning_rate": 9.994146219814912e-05, + "loss": 0.8583, + "mean_token_accuracy": 0.7982658172647158, + "num_tokens": 76535665.0, + "step": 432 + }, + { + "entropy": 0.8648973529537519, + "epoch": 0.13704761169843352, + "grad_norm": 0.08894747495651245, + "learning_rate": 9.993344475802226e-05, + "loss": 0.8449, + "mean_token_accuracy": 0.8004408640166124, + "num_tokens": 78656429.0, + "step": 444 + }, + { + "entropy": 0.8582859995464484, + "epoch": 0.1407516012037966, + "grad_norm": 0.09439876675605774, + "learning_rate": 9.992491330068606e-05, + "loss": 0.8369, + "mean_token_accuracy": 0.8031645379960537, + "num_tokens": 80814117.0, + "step": 456 + }, + { + "entropy": 0.8811340468625227, + "epoch": 0.14445559070915967, + "grad_norm": 0.0849665030837059, + "learning_rate": 9.99158679139637e-05, + "loss": 0.858, + "mean_token_accuracy": 0.7984087901810805, + "num_tokens": 82925792.0, + "step": 468 + }, + { + "entropy": 0.8554788815478483, + "epoch": 0.14815958021452272, + "grad_norm": 0.0906689241528511, + "learning_rate": 9.990630869096883e-05, + "loss": 0.8383, + "mean_token_accuracy": 0.8024374966820081, + "num_tokens": 85082604.0, + "step": 480 + }, + { + "entropy": 0.8597272150218487, + "epoch": 0.1518635697198858, + "grad_norm": 0.08773977309465408, + "learning_rate": 9.989623573010455e-05, + "loss": 0.8391, + "mean_token_accuracy": 0.8020844186345736, + "num_tokens": 87207502.0, + "step": 492 + }, + { + "entropy": 0.8616875174144903, + "epoch": 0.15556755922524887, + "grad_norm": 0.08853704482316971, + "learning_rate": 9.988564913506238e-05, + "loss": 0.8414, + "mean_token_accuracy": 0.801638551056385, + "num_tokens": 89348327.0, + "step": 504 + }, + { + "entropy": 0.8478845059871674, + "epoch": 0.15927154873061192, + "grad_norm": 0.09445828199386597, + "learning_rate": 9.987454901482122e-05, + "loss": 0.8285, + "mean_token_accuracy": 0.8042828006048998, + "num_tokens": 91504049.0, + "step": 516 + }, + { + "entropy": 0.8598806957403818, + "epoch": 0.162975538235975, + "grad_norm": 0.09136994183063507, + "learning_rate": 9.986293548364622e-05, + "loss": 0.8411, + "mean_token_accuracy": 0.8020358892778555, + "num_tokens": 93626935.0, + "step": 528 + }, + { + "entropy": 0.8653969628115495, + "epoch": 0.16667952774133807, + "grad_norm": 0.09781411290168762, + "learning_rate": 9.985080866108762e-05, + "loss": 0.8444, + "mean_token_accuracy": 0.8006485402584076, + "num_tokens": 95703749.0, + "step": 540 + }, + { + "entropy": 0.8538096360862255, + "epoch": 0.17038351724670114, + "grad_norm": 0.08897630125284195, + "learning_rate": 9.983816867197953e-05, + "loss": 0.8359, + "mean_token_accuracy": 0.8030762349565824, + "num_tokens": 97836700.0, + "step": 552 + }, + { + "entropy": 0.8514401825765768, + "epoch": 0.1740875067520642, + "grad_norm": 0.09062942117452621, + "learning_rate": 9.982501564643852e-05, + "loss": 0.8295, + "mean_token_accuracy": 0.8046696037054062, + "num_tokens": 99924036.0, + "step": 564 + }, + { + "entropy": 0.8512975362439951, + "epoch": 0.17779149625742727, + "grad_norm": 0.0891498252749443, + "learning_rate": 9.98113497198625e-05, + "loss": 0.8304, + "mean_token_accuracy": 0.8033984725673994, + "num_tokens": 102057627.0, + "step": 576 + }, + { + "entropy": 0.8456763414045175, + "epoch": 0.18149548576279034, + "grad_norm": 0.0918072909116745, + "learning_rate": 9.979717103292912e-05, + "loss": 0.8262, + "mean_token_accuracy": 0.8045558805267016, + "num_tokens": 104162821.0, + "step": 588 + }, + { + "entropy": 0.8628777662913004, + "epoch": 0.18519947526815342, + "grad_norm": 0.09555666148662567, + "learning_rate": 9.978247973159448e-05, + "loss": 0.8427, + "mean_token_accuracy": 0.8011121414601803, + "num_tokens": 106260558.0, + "step": 600 + }, + { + "entropy": 0.8535663560032845, + "epoch": 0.18890346477351647, + "grad_norm": 0.09658853709697723, + "learning_rate": 9.97672759670915e-05, + "loss": 0.8311, + "mean_token_accuracy": 0.8037117148439089, + "num_tokens": 108347746.0, + "step": 612 + }, + { + "entropy": 0.850828155875206, + "epoch": 0.19260745427887954, + "grad_norm": 0.09168772399425507, + "learning_rate": 9.975155989592844e-05, + "loss": 0.8351, + "mean_token_accuracy": 0.8032047462960085, + "num_tokens": 110475744.0, + "step": 624 + }, + { + "entropy": 0.8446941214303175, + "epoch": 0.19631144378424262, + "grad_norm": 0.09660109132528305, + "learning_rate": 9.973533167988728e-05, + "loss": 0.8261, + "mean_token_accuracy": 0.805509191006422, + "num_tokens": 112562500.0, + "step": 636 + }, + { + "entropy": 0.8449327821532885, + "epoch": 0.2000154332896057, + "grad_norm": 0.0988537073135376, + "learning_rate": 9.971859148602202e-05, + "loss": 0.8235, + "mean_token_accuracy": 0.8050300491352876, + "num_tokens": 114663477.0, + "step": 648 + }, + { + "entropy": 0.8418795031805834, + "epoch": 0.20371942279496874, + "grad_norm": 0.10221028327941895, + "learning_rate": 9.970133948665702e-05, + "loss": 0.8259, + "mean_token_accuracy": 0.8041424031058947, + "num_tokens": 116824983.0, + "step": 660 + }, + { + "entropy": 0.8358608248333136, + "epoch": 0.20742341230033182, + "grad_norm": 0.09639787673950195, + "learning_rate": 9.968357585938515e-05, + "loss": 0.815, + "mean_token_accuracy": 0.8072937255104383, + "num_tokens": 118942813.0, + "step": 672 + }, + { + "entropy": 0.8451284418503443, + "epoch": 0.2111274018056949, + "grad_norm": 0.09338120371103287, + "learning_rate": 9.966530078706599e-05, + "loss": 0.8267, + "mean_token_accuracy": 0.8043443597853184, + "num_tokens": 121069569.0, + "step": 684 + }, + { + "entropy": 0.8439769372344017, + "epoch": 0.21483139131105794, + "grad_norm": 0.10154031217098236, + "learning_rate": 9.964651445782405e-05, + "loss": 0.8258, + "mean_token_accuracy": 0.8052287250757217, + "num_tokens": 123193984.0, + "step": 696 + }, + { + "entropy": 0.8514358488221964, + "epoch": 0.21853538081642102, + "grad_norm": 0.09599766135215759, + "learning_rate": 9.962721706504663e-05, + "loss": 0.8331, + "mean_token_accuracy": 0.8030676891406378, + "num_tokens": 125349706.0, + "step": 708 + }, + { + "entropy": 0.8406069638828436, + "epoch": 0.2222393703217841, + "grad_norm": 0.09738138318061829, + "learning_rate": 9.9607408807382e-05, + "loss": 0.8218, + "mean_token_accuracy": 0.8066203904648622, + "num_tokens": 127475845.0, + "step": 720 + }, + { + "entropy": 0.8436915278434753, + "epoch": 0.22594335982714717, + "grad_norm": 0.0990961417555809, + "learning_rate": 9.958708988873729e-05, + "loss": 0.8254, + "mean_token_accuracy": 0.8044916093349457, + "num_tokens": 129575388.0, + "step": 732 + }, + { + "entropy": 0.8330078596870104, + "epoch": 0.22964734933251021, + "grad_norm": 0.09744027256965637, + "learning_rate": 9.956626051827643e-05, + "loss": 0.8138, + "mean_token_accuracy": 0.8071556240320206, + "num_tokens": 131698547.0, + "step": 744 + }, + { + "entropy": 0.8432958399256071, + "epoch": 0.2333513388378733, + "grad_norm": 0.1007818877696991, + "learning_rate": 9.954492091041788e-05, + "loss": 0.8227, + "mean_token_accuracy": 0.804592452943325, + "num_tokens": 133841198.0, + "step": 756 + }, + { + "entropy": 0.836149275302887, + "epoch": 0.23705532834323637, + "grad_norm": 0.09519964456558228, + "learning_rate": 9.952307128483256e-05, + "loss": 0.8178, + "mean_token_accuracy": 0.8064361910025278, + "num_tokens": 135976771.0, + "step": 768 + }, + { + "entropy": 0.8368441437681516, + "epoch": 0.24075931784859944, + "grad_norm": 0.10342419147491455, + "learning_rate": 9.950071186644159e-05, + "loss": 0.8176, + "mean_token_accuracy": 0.8063248756031195, + "num_tokens": 138094553.0, + "step": 780 + }, + { + "entropy": 0.8424511601527532, + "epoch": 0.2444633073539625, + "grad_norm": 0.0969947949051857, + "learning_rate": 9.94778428854138e-05, + "loss": 0.8208, + "mean_token_accuracy": 0.8055132577816645, + "num_tokens": 140199892.0, + "step": 792 + }, + { + "entropy": 0.8211217597126961, + "epoch": 0.24816729685932556, + "grad_norm": 0.09834083914756775, + "learning_rate": 9.945446457716359e-05, + "loss": 0.8014, + "mean_token_accuracy": 0.8097951101760069, + "num_tokens": 142340717.0, + "step": 804 + }, + { + "entropy": 0.8515114995340506, + "epoch": 0.25187128636468864, + "grad_norm": 0.10160677880048752, + "learning_rate": 9.943057718234836e-05, + "loss": 0.8317, + "mean_token_accuracy": 0.802549467732509, + "num_tokens": 144490208.0, + "step": 816 + }, + { + "entropy": 0.8411018749078115, + "epoch": 0.2555752758700517, + "grad_norm": 0.10110847651958466, + "learning_rate": 9.940618094686603e-05, + "loss": 0.8243, + "mean_token_accuracy": 0.8038219797114531, + "num_tokens": 146598967.0, + "step": 828 + }, + { + "entropy": 0.8269097929199537, + "epoch": 0.2592792653754148, + "grad_norm": 0.10856305807828903, + "learning_rate": 9.938127612185261e-05, + "loss": 0.8078, + "mean_token_accuracy": 0.808723546564579, + "num_tokens": 148706602.0, + "step": 840 + }, + { + "entropy": 0.8350271495680014, + "epoch": 0.26298325488077784, + "grad_norm": 0.10275018215179443, + "learning_rate": 9.935586296367953e-05, + "loss": 0.8144, + "mean_token_accuracy": 0.8073840402066708, + "num_tokens": 150831294.0, + "step": 852 + }, + { + "entropy": 0.8402173941334089, + "epoch": 0.2666872443861409, + "grad_norm": 0.10895514488220215, + "learning_rate": 9.932994173395103e-05, + "loss": 0.823, + "mean_token_accuracy": 0.8050992513696352, + "num_tokens": 152914180.0, + "step": 864 + }, + { + "entropy": 0.835850744197766, + "epoch": 0.270391233891504, + "grad_norm": 0.1103854700922966, + "learning_rate": 9.930351269950143e-05, + "loss": 0.8149, + "mean_token_accuracy": 0.8063218059639136, + "num_tokens": 155043841.0, + "step": 876 + }, + { + "entropy": 0.8304483393828074, + "epoch": 0.27409522339686704, + "grad_norm": 0.09872893989086151, + "learning_rate": 9.927657613239247e-05, + "loss": 0.8124, + "mean_token_accuracy": 0.8073564060032368, + "num_tokens": 157194273.0, + "step": 888 + }, + { + "entropy": 0.8408761695027351, + "epoch": 0.2777992129022301, + "grad_norm": 0.10328029841184616, + "learning_rate": 9.924913230991044e-05, + "loss": 0.8201, + "mean_token_accuracy": 0.8051391566793124, + "num_tokens": 159325166.0, + "step": 900 + }, + { + "entropy": 0.8261769798894724, + "epoch": 0.2815032024075932, + "grad_norm": 0.1065843477845192, + "learning_rate": 9.922118151456327e-05, + "loss": 0.8074, + "mean_token_accuracy": 0.8085715671380361, + "num_tokens": 161429666.0, + "step": 912 + }, + { + "entropy": 0.8249383146564165, + "epoch": 0.28520719191295624, + "grad_norm": 0.10588902980089188, + "learning_rate": 9.919272403407782e-05, + "loss": 0.8061, + "mean_token_accuracy": 0.8078173498312632, + "num_tokens": 163588436.0, + "step": 924 + }, + { + "entropy": 0.8257463040451208, + "epoch": 0.28891118141831934, + "grad_norm": 0.10355869680643082, + "learning_rate": 9.91637601613967e-05, + "loss": 0.8065, + "mean_token_accuracy": 0.8088524142901102, + "num_tokens": 165703594.0, + "step": 936 + }, + { + "entropy": 0.8323748372495174, + "epoch": 0.2926151709236824, + "grad_norm": 0.11059477180242538, + "learning_rate": 9.913429019467534e-05, + "loss": 0.8135, + "mean_token_accuracy": 0.8072844197352728, + "num_tokens": 167834747.0, + "step": 948 + }, + { + "entropy": 0.8198406957089901, + "epoch": 0.29631916042904544, + "grad_norm": 0.11167987436056137, + "learning_rate": 9.910431443727897e-05, + "loss": 0.8005, + "mean_token_accuracy": 0.8094635804494222, + "num_tokens": 169954318.0, + "step": 960 + }, + { + "entropy": 0.8273561559617519, + "epoch": 0.30002314993440854, + "grad_norm": 0.11263331025838852, + "learning_rate": 9.907383319777945e-05, + "loss": 0.8066, + "mean_token_accuracy": 0.8084728009998798, + "num_tokens": 172056219.0, + "step": 972 + }, + { + "entropy": 0.8382978811860085, + "epoch": 0.3037271394397716, + "grad_norm": 0.1031041070818901, + "learning_rate": 9.904284678995206e-05, + "loss": 0.8204, + "mean_token_accuracy": 0.8051142630477747, + "num_tokens": 174209607.0, + "step": 984 + }, + { + "entropy": 0.8209519870579243, + "epoch": 0.30743112894513464, + "grad_norm": 0.10300373286008835, + "learning_rate": 9.901135553277232e-05, + "loss": 0.798, + "mean_token_accuracy": 0.8102124035358429, + "num_tokens": 176340332.0, + "step": 996 + }, + { + "entropy": 0.8192687357465426, + "epoch": 0.31113511845049774, + "grad_norm": 0.11473007500171661, + "learning_rate": 9.897935975041273e-05, + "loss": 0.8024, + "mean_token_accuracy": 0.8089515815178553, + "num_tokens": 178465989.0, + "step": 1008 + }, + { + "entropy": 0.8185019778708617, + "epoch": 0.3148391079558608, + "grad_norm": 0.10434567183256149, + "learning_rate": 9.894685977223934e-05, + "loss": 0.7995, + "mean_token_accuracy": 0.8095376479128996, + "num_tokens": 180608399.0, + "step": 1020 + }, + { + "entropy": 0.8181099407374859, + "epoch": 0.31854309746122383, + "grad_norm": 0.10841691493988037, + "learning_rate": 9.891385593280847e-05, + "loss": 0.7983, + "mean_token_accuracy": 0.8103865322967371, + "num_tokens": 182744332.0, + "step": 1032 + }, + { + "entropy": 0.8311556254824003, + "epoch": 0.32224708696658694, + "grad_norm": 0.11030473560094833, + "learning_rate": 9.888034857186315e-05, + "loss": 0.8107, + "mean_token_accuracy": 0.8080282248556614, + "num_tokens": 184878580.0, + "step": 1044 + }, + { + "entropy": 0.8324671102066835, + "epoch": 0.32595107647195, + "grad_norm": 0.11505813896656036, + "learning_rate": 9.884633803432972e-05, + "loss": 0.816, + "mean_token_accuracy": 0.8058654479682446, + "num_tokens": 186987179.0, + "step": 1056 + }, + { + "entropy": 0.8222536593675613, + "epoch": 0.3296550659773131, + "grad_norm": 0.11003649234771729, + "learning_rate": 9.881182467031427e-05, + "loss": 0.8024, + "mean_token_accuracy": 0.8091614345709482, + "num_tokens": 189146940.0, + "step": 1068 + }, + { + "entropy": 0.8215553897122542, + "epoch": 0.33335905548267614, + "grad_norm": 0.11053642630577087, + "learning_rate": 9.877680883509895e-05, + "loss": 0.8045, + "mean_token_accuracy": 0.8081031031906605, + "num_tokens": 191273140.0, + "step": 1080 + }, + { + "entropy": 0.8301792852580547, + "epoch": 0.3370630449880392, + "grad_norm": 0.11618710309267044, + "learning_rate": 9.874129088913842e-05, + "loss": 0.8111, + "mean_token_accuracy": 0.8073916758100191, + "num_tokens": 193419995.0, + "step": 1092 + }, + { + "entropy": 0.8200336868564287, + "epoch": 0.3407670344934023, + "grad_norm": 0.11068252474069595, + "learning_rate": 9.870527119805603e-05, + "loss": 0.7987, + "mean_token_accuracy": 0.8100821475187937, + "num_tokens": 195570082.0, + "step": 1104 + }, + { + "entropy": 0.8287765470643839, + "epoch": 0.34447102399876534, + "grad_norm": 0.10776914656162262, + "learning_rate": 9.866875013264023e-05, + "loss": 0.8118, + "mean_token_accuracy": 0.8070496221383413, + "num_tokens": 197675630.0, + "step": 1116 + }, + { + "entropy": 0.8167796346048514, + "epoch": 0.3481750135041284, + "grad_norm": 0.10457509756088257, + "learning_rate": 9.863172806884051e-05, + "loss": 0.798, + "mean_token_accuracy": 0.8097474686801434, + "num_tokens": 199804447.0, + "step": 1128 + }, + { + "entropy": 0.8168069496750832, + "epoch": 0.3518790030094915, + "grad_norm": 0.10983676463365555, + "learning_rate": 9.859420538776376e-05, + "loss": 0.7981, + "mean_token_accuracy": 0.8102098839978377, + "num_tokens": 201919269.0, + "step": 1140 + }, + { + "entropy": 0.808088593184948, + "epoch": 0.35558299251485453, + "grad_norm": 0.11442640423774719, + "learning_rate": 9.855618247567018e-05, + "loss": 0.7926, + "mean_token_accuracy": 0.8110264576971531, + "num_tokens": 204057926.0, + "step": 1152 + }, + { + "entropy": 0.8169751837849617, + "epoch": 0.3592869820202176, + "grad_norm": 0.10822667181491852, + "learning_rate": 9.851765972396943e-05, + "loss": 0.7967, + "mean_token_accuracy": 0.8103730641305447, + "num_tokens": 206175396.0, + "step": 1164 + }, + { + "entropy": 0.814443551003933, + "epoch": 0.3629909715255807, + "grad_norm": 0.11191259324550629, + "learning_rate": 9.847863752921649e-05, + "loss": 0.7933, + "mean_token_accuracy": 0.8108923596640428, + "num_tokens": 208298979.0, + "step": 1176 + }, + { + "entropy": 0.807010448227326, + "epoch": 0.36669496103094373, + "grad_norm": 0.11479055136442184, + "learning_rate": 9.843911629310764e-05, + "loss": 0.7892, + "mean_token_accuracy": 0.8113560838003954, + "num_tokens": 210441373.0, + "step": 1188 + }, + { + "entropy": 0.8038304224610329, + "epoch": 0.37039895053630684, + "grad_norm": 0.12469020485877991, + "learning_rate": 9.839909642247637e-05, + "loss": 0.7845, + "mean_token_accuracy": 0.8129827156662941, + "num_tokens": 212573549.0, + "step": 1200 + }, + { + "entropy": 0.8132020806272825, + "epoch": 0.3741029400416699, + "grad_norm": 0.11616237461566925, + "learning_rate": 9.835857832928908e-05, + "loss": 0.7946, + "mean_token_accuracy": 0.8103696592152119, + "num_tokens": 214646922.0, + "step": 1212 + }, + { + "entropy": 0.8121326218048731, + "epoch": 0.37780692954703293, + "grad_norm": 0.11775597929954529, + "learning_rate": 9.831756243064088e-05, + "loss": 0.7945, + "mean_token_accuracy": 0.8108564031620821, + "num_tokens": 216767641.0, + "step": 1224 + }, + { + "entropy": 0.8172868477801482, + "epoch": 0.38151091905239604, + "grad_norm": 0.11528316140174866, + "learning_rate": 9.827604914875139e-05, + "loss": 0.7979, + "mean_token_accuracy": 0.8101303689181805, + "num_tokens": 218905400.0, + "step": 1236 + }, + { + "entropy": 0.8001810808976492, + "epoch": 0.3852149085577591, + "grad_norm": 0.1265256702899933, + "learning_rate": 9.823403891096024e-05, + "loss": 0.7804, + "mean_token_accuracy": 0.813588964442412, + "num_tokens": 220993960.0, + "step": 1248 + }, + { + "entropy": 0.8007968428234259, + "epoch": 0.38891889806312213, + "grad_norm": 0.11904493719339371, + "learning_rate": 9.819153214972279e-05, + "loss": 0.7826, + "mean_token_accuracy": 0.8128631959358851, + "num_tokens": 223114091.0, + "step": 1260 + }, + { + "entropy": 0.8208761115868887, + "epoch": 0.39262288756848523, + "grad_norm": 0.12480930238962173, + "learning_rate": 9.814852930260561e-05, + "loss": 0.7983, + "mean_token_accuracy": 0.8095660296579202, + "num_tokens": 225247556.0, + "step": 1272 + }, + { + "entropy": 0.815931453059117, + "epoch": 0.3963268770738483, + "grad_norm": 0.11885613203048706, + "learning_rate": 9.810503081228202e-05, + "loss": 0.7996, + "mean_token_accuracy": 0.8100833334028721, + "num_tokens": 227351759.0, + "step": 1284 + }, + { + "entropy": 0.8205481581389904, + "epoch": 0.4000308665792114, + "grad_norm": 0.11844924092292786, + "learning_rate": 9.80610371265275e-05, + "loss": 0.8019, + "mean_token_accuracy": 0.808446753770113, + "num_tokens": 229441295.0, + "step": 1296 + }, + { + "entropy": 0.8117605733374754, + "epoch": 0.40373485608457443, + "grad_norm": 0.11241784691810608, + "learning_rate": 9.801654869821512e-05, + "loss": 0.7932, + "mean_token_accuracy": 0.81082005550464, + "num_tokens": 231566470.0, + "step": 1308 + }, + { + "entropy": 0.8152283951640129, + "epoch": 0.4074388455899375, + "grad_norm": 0.13755977153778076, + "learning_rate": 9.797156598531085e-05, + "loss": 0.7936, + "mean_token_accuracy": 0.8106731151541074, + "num_tokens": 233658940.0, + "step": 1320 + }, + { + "entropy": 0.8065759042898814, + "epoch": 0.4111428350953006, + "grad_norm": 0.11853731423616409, + "learning_rate": 9.79260894508688e-05, + "loss": 0.7894, + "mean_token_accuracy": 0.8116564750671387, + "num_tokens": 235792409.0, + "step": 1332 + }, + { + "entropy": 0.8189149846633276, + "epoch": 0.41484682460066363, + "grad_norm": 0.11743967235088348, + "learning_rate": 9.788011956302656e-05, + "loss": 0.798, + "mean_token_accuracy": 0.8094673914213976, + "num_tokens": 237882728.0, + "step": 1344 + }, + { + "entropy": 0.8161345782379309, + "epoch": 0.4185508141060267, + "grad_norm": 0.12426267564296722, + "learning_rate": 9.783365679500027e-05, + "loss": 0.7954, + "mean_token_accuracy": 0.8102164330581824, + "num_tokens": 240024867.0, + "step": 1356 + }, + { + "entropy": 0.7942858090003332, + "epoch": 0.4222548036113898, + "grad_norm": 0.11311879754066467, + "learning_rate": 9.778670162507986e-05, + "loss": 0.7759, + "mean_token_accuracy": 0.8148905138174692, + "num_tokens": 242149516.0, + "step": 1368 + }, + { + "entropy": 0.803426214804252, + "epoch": 0.42595879311675283, + "grad_norm": 0.11434963345527649, + "learning_rate": 9.773925453662403e-05, + "loss": 0.785, + "mean_token_accuracy": 0.8126038151482741, + "num_tokens": 244256116.0, + "step": 1380 + }, + { + "entropy": 0.7926236540079117, + "epoch": 0.4296627826221159, + "grad_norm": 0.10715149343013763, + "learning_rate": 9.769131601805534e-05, + "loss": 0.7733, + "mean_token_accuracy": 0.8152074652413527, + "num_tokens": 246406475.0, + "step": 1392 + }, + { + "entropy": 0.8100047335028648, + "epoch": 0.433366772127479, + "grad_norm": 0.11607677489519119, + "learning_rate": 9.76428865628551e-05, + "loss": 0.7907, + "mean_token_accuracy": 0.8116505754490694, + "num_tokens": 248494590.0, + "step": 1404 + }, + { + "entropy": 0.8006227687001228, + "epoch": 0.43707076163284203, + "grad_norm": 0.11636164784431458, + "learning_rate": 9.75939666695584e-05, + "loss": 0.7815, + "mean_token_accuracy": 0.8131669908761978, + "num_tokens": 250624649.0, + "step": 1416 + }, + { + "entropy": 0.8028407072027525, + "epoch": 0.44077475113820513, + "grad_norm": 0.12113169580698013, + "learning_rate": 9.75445568417489e-05, + "loss": 0.7819, + "mean_token_accuracy": 0.812740029146274, + "num_tokens": 252753433.0, + "step": 1428 + }, + { + "entropy": 0.811029980580012, + "epoch": 0.4444787406435682, + "grad_norm": 0.1194261834025383, + "learning_rate": 9.74946575880537e-05, + "loss": 0.7925, + "mean_token_accuracy": 0.8101853169500828, + "num_tokens": 254919606.0, + "step": 1440 + }, + { + "entropy": 0.8054500371217728, + "epoch": 0.44818273014893123, + "grad_norm": 0.12146873027086258, + "learning_rate": 9.744426942213799e-05, + "loss": 0.788, + "mean_token_accuracy": 0.8125789103408655, + "num_tokens": 257037052.0, + "step": 1452 + }, + { + "entropy": 0.7984961271286011, + "epoch": 0.45188671965429433, + "grad_norm": 0.12276361137628555, + "learning_rate": 9.739339286269995e-05, + "loss": 0.7787, + "mean_token_accuracy": 0.8136009263495604, + "num_tokens": 259200523.0, + "step": 1464 + }, + { + "entropy": 0.8060410469770432, + "epoch": 0.4555907091596574, + "grad_norm": 0.12075335532426834, + "learning_rate": 9.734202843346522e-05, + "loss": 0.7882, + "mean_token_accuracy": 0.8113779984414577, + "num_tokens": 261303525.0, + "step": 1476 + }, + { + "entropy": 0.8057870579262575, + "epoch": 0.45929469866502043, + "grad_norm": 0.12227274477481842, + "learning_rate": 9.729017666318165e-05, + "loss": 0.7868, + "mean_token_accuracy": 0.8118944627543291, + "num_tokens": 263437688.0, + "step": 1488 + }, + { + "entropy": 0.8005211018025875, + "epoch": 0.46299868817038353, + "grad_norm": 0.11424333602190018, + "learning_rate": 9.723783808561378e-05, + "loss": 0.7791, + "mean_token_accuracy": 0.8138281057278315, + "num_tokens": 265568368.0, + "step": 1500 + }, + { + "entropy": 0.8023173411687216, + "epoch": 0.4667026776757466, + "grad_norm": 0.13906393945217133, + "learning_rate": 9.718501323953737e-05, + "loss": 0.7853, + "mean_token_accuracy": 0.811311274766922, + "num_tokens": 267706317.0, + "step": 1512 + }, + { + "entropy": 0.8110432215034962, + "epoch": 0.47040666718110963, + "grad_norm": 0.11983723938465118, + "learning_rate": 9.713170266873384e-05, + "loss": 0.7893, + "mean_token_accuracy": 0.8107657519479593, + "num_tokens": 269835542.0, + "step": 1524 + }, + { + "entropy": 0.7883443137009939, + "epoch": 0.47411065668647273, + "grad_norm": 0.13033325970172882, + "learning_rate": 9.70779069219847e-05, + "loss": 0.7734, + "mean_token_accuracy": 0.8156957700848579, + "num_tokens": 271983602.0, + "step": 1536 + }, + { + "entropy": 0.7950549945235252, + "epoch": 0.4778146461918358, + "grad_norm": 0.10897476971149445, + "learning_rate": 9.702362655306587e-05, + "loss": 0.775, + "mean_token_accuracy": 0.8149407207965851, + "num_tokens": 274107343.0, + "step": 1548 + }, + { + "entropy": 0.7990003265440464, + "epoch": 0.4815186356971989, + "grad_norm": 0.11902206391096115, + "learning_rate": 9.696886212074202e-05, + "loss": 0.7808, + "mean_token_accuracy": 0.8132348544895649, + "num_tokens": 276222435.0, + "step": 1560 + }, + { + "entropy": 0.801575344055891, + "epoch": 0.48522262520256193, + "grad_norm": 0.11533886194229126, + "learning_rate": 9.691361418876075e-05, + "loss": 0.7795, + "mean_token_accuracy": 0.8133863943318526, + "num_tokens": 278355706.0, + "step": 1572 + }, + { + "entropy": 0.7944580602149168, + "epoch": 0.488926614707925, + "grad_norm": 0.12622949481010437, + "learning_rate": 9.685788332584685e-05, + "loss": 0.7752, + "mean_token_accuracy": 0.814650778969129, + "num_tokens": 280460375.0, + "step": 1584 + }, + { + "entropy": 0.8005058703323206, + "epoch": 0.4926306042132881, + "grad_norm": 0.12395934015512466, + "learning_rate": 9.68016701056964e-05, + "loss": 0.7828, + "mean_token_accuracy": 0.8131459752718607, + "num_tokens": 282590228.0, + "step": 1596 + }, + { + "entropy": 0.8098582600553831, + "epoch": 0.49633459371865113, + "grad_norm": 0.13313600420951843, + "learning_rate": 9.674497510697097e-05, + "loss": 0.7903, + "mean_token_accuracy": 0.8115353050331274, + "num_tokens": 284669419.0, + "step": 1608 + }, + { + "entropy": 0.805266530563434, + "epoch": 0.5000385832240142, + "grad_norm": 0.11492042243480682, + "learning_rate": 9.668779891329147e-05, + "loss": 0.7851, + "mean_token_accuracy": 0.8125738625725111, + "num_tokens": 286824939.0, + "step": 1620 + }, + { + "entropy": 0.7950548666218916, + "epoch": 0.5037425727293773, + "grad_norm": 0.12213092297315598, + "learning_rate": 9.663014211323233e-05, + "loss": 0.7738, + "mean_token_accuracy": 0.8150150341292223, + "num_tokens": 288958154.0, + "step": 1632 + }, + { + "entropy": 0.7952163182199001, + "epoch": 0.5074465622347404, + "grad_norm": 0.11614470928907394, + "learning_rate": 9.657200530031533e-05, + "loss": 0.7776, + "mean_token_accuracy": 0.8137414579590162, + "num_tokens": 291092060.0, + "step": 1644 + }, + { + "entropy": 0.8064338949819406, + "epoch": 0.5111505517401034, + "grad_norm": 0.1288743019104004, + "learning_rate": 9.651338907300354e-05, + "loss": 0.7836, + "mean_token_accuracy": 0.8120651505887508, + "num_tokens": 293214294.0, + "step": 1656 + }, + { + "entropy": 0.8109964442749819, + "epoch": 0.5148545412454665, + "grad_norm": 0.12403728067874908, + "learning_rate": 9.645429403469512e-05, + "loss": 0.794, + "mean_token_accuracy": 0.8106129181881746, + "num_tokens": 295328150.0, + "step": 1668 + }, + { + "entropy": 0.8038183848063151, + "epoch": 0.5185585307508296, + "grad_norm": 0.12052307277917862, + "learning_rate": 9.639472079371717e-05, + "loss": 0.7841, + "mean_token_accuracy": 0.8124721460044384, + "num_tokens": 297455756.0, + "step": 1680 + }, + { + "entropy": 0.7984983747204145, + "epoch": 0.5222625202561926, + "grad_norm": 0.12845173478126526, + "learning_rate": 9.63346699633194e-05, + "loss": 0.7794, + "mean_token_accuracy": 0.8130777689317862, + "num_tokens": 299602538.0, + "step": 1692 + }, + { + "entropy": 0.7976608164608479, + "epoch": 0.5259665097615557, + "grad_norm": 0.1210513561964035, + "learning_rate": 9.627414216166787e-05, + "loss": 0.7789, + "mean_token_accuracy": 0.8136931844055653, + "num_tokens": 301711090.0, + "step": 1704 + }, + { + "entropy": 0.787033441166083, + "epoch": 0.5296704992669188, + "grad_norm": 0.11822398006916046, + "learning_rate": 9.621313801183858e-05, + "loss": 0.7655, + "mean_token_accuracy": 0.8165569653113683, + "num_tokens": 303823411.0, + "step": 1716 + }, + { + "entropy": 0.7807075269520283, + "epoch": 0.5333744887722818, + "grad_norm": 0.1327759176492691, + "learning_rate": 9.61516581418111e-05, + "loss": 0.7621, + "mean_token_accuracy": 0.8171720070143541, + "num_tokens": 305981408.0, + "step": 1728 + }, + { + "entropy": 0.7996973978976408, + "epoch": 0.5370784782776449, + "grad_norm": 0.12808099389076233, + "learning_rate": 9.608970318446208e-05, + "loss": 0.7815, + "mean_token_accuracy": 0.8129920872549216, + "num_tokens": 308103143.0, + "step": 1740 + }, + { + "entropy": 0.7994385659694672, + "epoch": 0.540782467783008, + "grad_norm": 0.12494061887264252, + "learning_rate": 9.602727377755875e-05, + "loss": 0.7776, + "mean_token_accuracy": 0.8130986665685972, + "num_tokens": 310257572.0, + "step": 1752 + }, + { + "entropy": 0.7863332827885946, + "epoch": 0.544486457288371, + "grad_norm": 0.11875750869512558, + "learning_rate": 9.596437056375231e-05, + "loss": 0.7668, + "mean_token_accuracy": 0.8158110665778319, + "num_tokens": 312373153.0, + "step": 1764 + }, + { + "entropy": 0.7789370119571686, + "epoch": 0.5481904467937341, + "grad_norm": 0.12376753985881805, + "learning_rate": 9.590099419057141e-05, + "loss": 0.7592, + "mean_token_accuracy": 0.8181358501315117, + "num_tokens": 314480636.0, + "step": 1776 + }, + { + "entropy": 0.7844561214248339, + "epoch": 0.5518944362990972, + "grad_norm": 0.13164328038692474, + "learning_rate": 9.583714531041538e-05, + "loss": 0.7677, + "mean_token_accuracy": 0.8156558784345785, + "num_tokens": 316566703.0, + "step": 1788 + }, + { + "entropy": 0.7855853562553724, + "epoch": 0.5555984258044602, + "grad_norm": 0.13665804266929626, + "learning_rate": 9.577282458054755e-05, + "loss": 0.7639, + "mean_token_accuracy": 0.8170899252096812, + "num_tokens": 318671313.0, + "step": 1800 + }, + { + "entropy": 0.7836192175745964, + "epoch": 0.5593024153098233, + "grad_norm": 0.12808938324451447, + "learning_rate": 9.570803266308854e-05, + "loss": 0.7644, + "mean_token_accuracy": 0.8164806043108305, + "num_tokens": 320785587.0, + "step": 1812 + }, + { + "entropy": 0.7717621214687824, + "epoch": 0.5630064048151864, + "grad_norm": 0.11582314968109131, + "learning_rate": 9.564277022500936e-05, + "loss": 0.753, + "mean_token_accuracy": 0.8190883584320545, + "num_tokens": 322901535.0, + "step": 1824 + }, + { + "entropy": 0.7969209514558315, + "epoch": 0.5667103943205494, + "grad_norm": 0.1297702044248581, + "learning_rate": 9.557703793812458e-05, + "loss": 0.7776, + "mean_token_accuracy": 0.8134247288107872, + "num_tokens": 325009488.0, + "step": 1836 + }, + { + "entropy": 0.7894351184368134, + "epoch": 0.5704143838259125, + "grad_norm": 0.13040970265865326, + "learning_rate": 9.551083647908546e-05, + "loss": 0.7707, + "mean_token_accuracy": 0.8147525365153948, + "num_tokens": 327151753.0, + "step": 1848 + }, + { + "entropy": 0.7986459036668142, + "epoch": 0.5741183733312756, + "grad_norm": 0.12837360799312592, + "learning_rate": 9.544416652937287e-05, + "loss": 0.7798, + "mean_token_accuracy": 0.8129371106624603, + "num_tokens": 329288710.0, + "step": 1860 + }, + { + "entropy": 0.782096286614736, + "epoch": 0.5778223628366387, + "grad_norm": 0.1294112354516983, + "learning_rate": 9.53770287752904e-05, + "loss": 0.7621, + "mean_token_accuracy": 0.8168822092314562, + "num_tokens": 331435486.0, + "step": 1872 + }, + { + "entropy": 0.802633331467708, + "epoch": 0.5815263523420017, + "grad_norm": 0.1259497106075287, + "learning_rate": 9.53094239079572e-05, + "loss": 0.7828, + "mean_token_accuracy": 0.8125941964487234, + "num_tokens": 333550517.0, + "step": 1884 + }, + { + "entropy": 0.7907603432734808, + "epoch": 0.5852303418473648, + "grad_norm": 0.1232752725481987, + "learning_rate": 9.524135262330098e-05, + "loss": 0.7692, + "mean_token_accuracy": 0.8152645404140154, + "num_tokens": 335680200.0, + "step": 1896 + }, + { + "entropy": 0.785350481669108, + "epoch": 0.5889343313527279, + "grad_norm": 0.12669149041175842, + "learning_rate": 9.517281562205067e-05, + "loss": 0.769, + "mean_token_accuracy": 0.8154158430794874, + "num_tokens": 337807097.0, + "step": 1908 + }, + { + "entropy": 0.7851421398421129, + "epoch": 0.5926383208580909, + "grad_norm": 0.1276937574148178, + "learning_rate": 9.510381360972938e-05, + "loss": 0.7641, + "mean_token_accuracy": 0.8159417261679968, + "num_tokens": 339954087.0, + "step": 1920 + }, + { + "entropy": 0.7850158450504144, + "epoch": 0.596342310363454, + "grad_norm": 0.132017120718956, + "learning_rate": 9.503434729664705e-05, + "loss": 0.765, + "mean_token_accuracy": 0.8161118167142073, + "num_tokens": 342092293.0, + "step": 1932 + }, + { + "entropy": 0.7878543138504028, + "epoch": 0.6000462998688171, + "grad_norm": 0.12451501935720444, + "learning_rate": 9.49644173978931e-05, + "loss": 0.7704, + "mean_token_accuracy": 0.8147625786562761, + "num_tokens": 344240873.0, + "step": 1944 + }, + { + "entropy": 0.7971215297778448, + "epoch": 0.6037502893741801, + "grad_norm": 0.12397521734237671, + "learning_rate": 9.489402463332923e-05, + "loss": 0.7757, + "mean_token_accuracy": 0.814307109763225, + "num_tokens": 346363936.0, + "step": 1956 + }, + { + "entropy": 0.7801007392505804, + "epoch": 0.6074542788795432, + "grad_norm": 0.12504300475120544, + "learning_rate": 9.482316972758181e-05, + "loss": 0.7628, + "mean_token_accuracy": 0.8172982657949129, + "num_tokens": 348481250.0, + "step": 1968 + }, + { + "entropy": 0.7930209897458553, + "epoch": 0.6111582683849063, + "grad_norm": 0.1284765601158142, + "learning_rate": 9.475185341003455e-05, + "loss": 0.7747, + "mean_token_accuracy": 0.8143445054690043, + "num_tokens": 350608362.0, + "step": 1980 + }, + { + "entropy": 0.7802144425610701, + "epoch": 0.6148622578902693, + "grad_norm": 0.12513010203838348, + "learning_rate": 9.468007641482094e-05, + "loss": 0.7647, + "mean_token_accuracy": 0.8167619270582994, + "num_tokens": 352709516.0, + "step": 1992 + }, + { + "entropy": 0.7944958060979843, + "epoch": 0.6185662473956324, + "grad_norm": 0.13269025087356567, + "learning_rate": 9.460783948081675e-05, + "loss": 0.7739, + "mean_token_accuracy": 0.8146844121317068, + "num_tokens": 354822759.0, + "step": 2004 + }, + { + "entropy": 0.7854583573838075, + "epoch": 0.6222702369009955, + "grad_norm": 0.1287117898464203, + "learning_rate": 9.453514335163231e-05, + "loss": 0.7637, + "mean_token_accuracy": 0.8166800700128078, + "num_tokens": 356911949.0, + "step": 2016 + }, + { + "entropy": 0.7777937439580759, + "epoch": 0.6259742264063585, + "grad_norm": 0.12762552499771118, + "learning_rate": 9.446198877560497e-05, + "loss": 0.7571, + "mean_token_accuracy": 0.8182835541665554, + "num_tokens": 359049122.0, + "step": 2028 + }, + { + "entropy": 0.7750151492655277, + "epoch": 0.6296782159117216, + "grad_norm": 0.1277306228876114, + "learning_rate": 9.438837650579137e-05, + "loss": 0.7553, + "mean_token_accuracy": 0.8175727687776089, + "num_tokens": 361210090.0, + "step": 2040 + }, + { + "entropy": 0.7802457685271899, + "epoch": 0.6333822054170847, + "grad_norm": 0.12481208890676498, + "learning_rate": 9.431430729995963e-05, + "loss": 0.7598, + "mean_token_accuracy": 0.8178850611050924, + "num_tokens": 363337396.0, + "step": 2052 + }, + { + "entropy": 0.7862026058137417, + "epoch": 0.6370861949224477, + "grad_norm": 0.13092230260372162, + "learning_rate": 9.42397819205816e-05, + "loss": 0.7688, + "mean_token_accuracy": 0.8156153832872709, + "num_tokens": 365477630.0, + "step": 2064 + }, + { + "entropy": 0.7690726555883884, + "epoch": 0.6407901844278108, + "grad_norm": 0.11987863481044769, + "learning_rate": 9.416480113482504e-05, + "loss": 0.7521, + "mean_token_accuracy": 0.8195419311523438, + "num_tokens": 367643528.0, + "step": 2076 + }, + { + "entropy": 0.7762465241054693, + "epoch": 0.6444941739331739, + "grad_norm": 0.13100093603134155, + "learning_rate": 9.408936571454566e-05, + "loss": 0.7563, + "mean_token_accuracy": 0.8186973209182421, + "num_tokens": 369782358.0, + "step": 2088 + }, + { + "entropy": 0.7813232329984506, + "epoch": 0.648198163438537, + "grad_norm": 0.13405582308769226, + "learning_rate": 9.401347643627915e-05, + "loss": 0.7632, + "mean_token_accuracy": 0.8167420464257399, + "num_tokens": 371928210.0, + "step": 2100 + }, + { + "entropy": 0.7875241699318091, + "epoch": 0.6519021529439, + "grad_norm": 0.12580129504203796, + "learning_rate": 9.393713408123332e-05, + "loss": 0.7699, + "mean_token_accuracy": 0.8153914275268713, + "num_tokens": 374058726.0, + "step": 2112 + }, + { + "entropy": 0.7834992570181688, + "epoch": 0.6556061424492631, + "grad_norm": 0.1281077116727829, + "learning_rate": 9.38603394352799e-05, + "loss": 0.7638, + "mean_token_accuracy": 0.8171246275305748, + "num_tokens": 376203701.0, + "step": 2124 + }, + { + "entropy": 0.7805231350163618, + "epoch": 0.6593101319546262, + "grad_norm": 0.14219819009304047, + "learning_rate": 9.378309328894662e-05, + "loss": 0.7616, + "mean_token_accuracy": 0.8166681937873363, + "num_tokens": 378335272.0, + "step": 2136 + }, + { + "entropy": 0.7874154051144918, + "epoch": 0.6630141214599892, + "grad_norm": 0.1304338425397873, + "learning_rate": 9.370539643740883e-05, + "loss": 0.7637, + "mean_token_accuracy": 0.8164051709075769, + "num_tokens": 380467266.0, + "step": 2148 + }, + { + "entropy": 0.7733189848562082, + "epoch": 0.6667181109653523, + "grad_norm": 0.13076983392238617, + "learning_rate": 9.36272496804816e-05, + "loss": 0.7571, + "mean_token_accuracy": 0.8175699549416701, + "num_tokens": 382603869.0, + "step": 2160 + }, + { + "entropy": 0.776911374181509, + "epoch": 0.6704221004707154, + "grad_norm": 0.13455283641815186, + "learning_rate": 9.354865382261128e-05, + "loss": 0.7563, + "mean_token_accuracy": 0.8178863686819872, + "num_tokens": 384767350.0, + "step": 2172 + }, + { + "entropy": 0.7755288705229759, + "epoch": 0.6741260899760784, + "grad_norm": 0.1210244670510292, + "learning_rate": 9.346960967286728e-05, + "loss": 0.7556, + "mean_token_accuracy": 0.8183015001316866, + "num_tokens": 386934062.0, + "step": 2184 + }, + { + "entropy": 0.7806575360397497, + "epoch": 0.6778300794814415, + "grad_norm": 0.1282687783241272, + "learning_rate": 9.339011804493378e-05, + "loss": 0.7604, + "mean_token_accuracy": 0.8167587071657181, + "num_tokens": 389047648.0, + "step": 2196 + }, + { + "entropy": 0.7758863245447477, + "epoch": 0.6815340689868046, + "grad_norm": 0.13144470751285553, + "learning_rate": 9.331017975710132e-05, + "loss": 0.7582, + "mean_token_accuracy": 0.8180744908750057, + "num_tokens": 391155544.0, + "step": 2208 + }, + { + "entropy": 0.7667204054693381, + "epoch": 0.6852380584921676, + "grad_norm": 0.12816905975341797, + "learning_rate": 9.322979563225833e-05, + "loss": 0.7464, + "mean_token_accuracy": 0.8206619794170061, + "num_tokens": 393293375.0, + "step": 2220 + }, + { + "entropy": 0.7751789751152197, + "epoch": 0.6889420479975307, + "grad_norm": 0.14497670531272888, + "learning_rate": 9.314896649788277e-05, + "loss": 0.7561, + "mean_token_accuracy": 0.8184558848539988, + "num_tokens": 395411494.0, + "step": 2232 + }, + { + "entropy": 0.7844450324773788, + "epoch": 0.6926460375028938, + "grad_norm": 0.13543701171875, + "learning_rate": 9.306769318603348e-05, + "loss": 0.765, + "mean_token_accuracy": 0.8167361902693907, + "num_tokens": 397518224.0, + "step": 2244 + }, + { + "entropy": 0.7686401257912318, + "epoch": 0.6963500270082568, + "grad_norm": 0.13837109506130219, + "learning_rate": 9.298597653334178e-05, + "loss": 0.7487, + "mean_token_accuracy": 0.8197805831829706, + "num_tokens": 399648867.0, + "step": 2256 + }, + { + "entropy": 0.7626761943101883, + "epoch": 0.7000540165136199, + "grad_norm": 0.1293450891971588, + "learning_rate": 9.290381738100265e-05, + "loss": 0.7441, + "mean_token_accuracy": 0.8214877719680468, + "num_tokens": 401743610.0, + "step": 2268 + }, + { + "entropy": 0.7687733148535093, + "epoch": 0.703758006018983, + "grad_norm": 0.1483583152294159, + "learning_rate": 9.282121657476627e-05, + "loss": 0.7506, + "mean_token_accuracy": 0.8195670247077942, + "num_tokens": 403850723.0, + "step": 2280 + }, + { + "entropy": 0.7698837158580621, + "epoch": 0.707461995524346, + "grad_norm": 0.13035227358341217, + "learning_rate": 9.273817496492917e-05, + "loss": 0.7503, + "mean_token_accuracy": 0.8193908805648485, + "num_tokens": 405973443.0, + "step": 2292 + }, + { + "entropy": 0.770644503335158, + "epoch": 0.7111659850297091, + "grad_norm": 0.13103176653385162, + "learning_rate": 9.265469340632557e-05, + "loss": 0.7538, + "mean_token_accuracy": 0.8188917512694994, + "num_tokens": 408105892.0, + "step": 2304 + }, + { + "entropy": 0.7735109639664491, + "epoch": 0.7148699745350722, + "grad_norm": 0.1398514360189438, + "learning_rate": 9.257077275831853e-05, + "loss": 0.7518, + "mean_token_accuracy": 0.819186095148325, + "num_tokens": 410296037.0, + "step": 2316 + }, + { + "entropy": 0.771765373647213, + "epoch": 0.7185739640404352, + "grad_norm": 0.14083071053028107, + "learning_rate": 9.248641388479111e-05, + "loss": 0.7526, + "mean_token_accuracy": 0.8195289969444275, + "num_tokens": 412380775.0, + "step": 2328 + }, + { + "entropy": 0.7654011559983095, + "epoch": 0.7222779535457983, + "grad_norm": 0.1325056552886963, + "learning_rate": 9.240161765413748e-05, + "loss": 0.7457, + "mean_token_accuracy": 0.8208736081918081, + "num_tokens": 414535472.0, + "step": 2340 + }, + { + "entropy": 0.7590547104676565, + "epoch": 0.7259819430511614, + "grad_norm": 0.1285925805568695, + "learning_rate": 9.231638493925402e-05, + "loss": 0.7407, + "mean_token_accuracy": 0.8210619514187177, + "num_tokens": 416686381.0, + "step": 2352 + }, + { + "entropy": 0.7812760807573795, + "epoch": 0.7296859325565245, + "grad_norm": 0.12959104776382446, + "learning_rate": 9.223071661753024e-05, + "loss": 0.7615, + "mean_token_accuracy": 0.8172919216255347, + "num_tokens": 418808712.0, + "step": 2364 + }, + { + "entropy": 0.7873292689522108, + "epoch": 0.7333899220618875, + "grad_norm": 0.1371845006942749, + "learning_rate": 9.214461357083985e-05, + "loss": 0.7677, + "mean_token_accuracy": 0.8155596914390723, + "num_tokens": 420927036.0, + "step": 2376 + }, + { + "entropy": 0.7580399426321188, + "epoch": 0.7370939115672506, + "grad_norm": 0.14509984850883484, + "learning_rate": 9.205807668553164e-05, + "loss": 0.7373, + "mean_token_accuracy": 0.8222843247155348, + "num_tokens": 423036030.0, + "step": 2388 + }, + { + "entropy": 0.7687507805724939, + "epoch": 0.7407979010726137, + "grad_norm": 0.15484097599983215, + "learning_rate": 9.197110685242034e-05, + "loss": 0.747, + "mean_token_accuracy": 0.8196417490641276, + "num_tokens": 425187345.0, + "step": 2400 + }, + { + "entropy": 0.764864676942428, + "epoch": 0.7445018905779767, + "grad_norm": 0.12323542684316635, + "learning_rate": 9.188370496677745e-05, + "loss": 0.7452, + "mean_token_accuracy": 0.8201479675869147, + "num_tokens": 427331828.0, + "step": 2412 + }, + { + "entropy": 0.7658702706297239, + "epoch": 0.7482058800833398, + "grad_norm": 0.12649372220039368, + "learning_rate": 9.179587192832209e-05, + "loss": 0.7474, + "mean_token_accuracy": 0.8195582069456577, + "num_tokens": 429448072.0, + "step": 2424 + }, + { + "entropy": 0.7575242506961027, + "epoch": 0.7519098695887029, + "grad_norm": 0.13158433139324188, + "learning_rate": 9.170760864121162e-05, + "loss": 0.7366, + "mean_token_accuracy": 0.8224081955850124, + "num_tokens": 431533949.0, + "step": 2436 + }, + { + "entropy": 0.7688459927837054, + "epoch": 0.7556138590940659, + "grad_norm": 0.12840636074543, + "learning_rate": 9.161891601403245e-05, + "loss": 0.7487, + "mean_token_accuracy": 0.8195722003777822, + "num_tokens": 433658430.0, + "step": 2448 + }, + { + "entropy": 0.7660383395850658, + "epoch": 0.759317848599429, + "grad_norm": 0.12874674797058105, + "learning_rate": 9.152979495979063e-05, + "loss": 0.7493, + "mean_token_accuracy": 0.8193290382623672, + "num_tokens": 435741465.0, + "step": 2460 + }, + { + "entropy": 0.7668347917497158, + "epoch": 0.7630218381047921, + "grad_norm": 0.12673811614513397, + "learning_rate": 9.144024639590245e-05, + "loss": 0.7473, + "mean_token_accuracy": 0.8204079742232958, + "num_tokens": 437908304.0, + "step": 2472 + }, + { + "entropy": 0.7687832986315092, + "epoch": 0.7667258276101551, + "grad_norm": 0.12845873832702637, + "learning_rate": 9.135027124418499e-05, + "loss": 0.7536, + "mean_token_accuracy": 0.8192412157853445, + "num_tokens": 440028485.0, + "step": 2484 + }, + { + "entropy": 0.7862997514506181, + "epoch": 0.7704298171155182, + "grad_norm": 0.13068512082099915, + "learning_rate": 9.125987043084665e-05, + "loss": 0.7642, + "mean_token_accuracy": 0.8162732509275278, + "num_tokens": 442179533.0, + "step": 2496 + }, + { + "entropy": 0.7685239054262638, + "epoch": 0.7741338066208813, + "grad_norm": 0.13230957090854645, + "learning_rate": 9.116904488647764e-05, + "loss": 0.75, + "mean_token_accuracy": 0.8193069696426392, + "num_tokens": 444264359.0, + "step": 2508 + }, + { + "entropy": 0.76015779748559, + "epoch": 0.7778377961262443, + "grad_norm": 0.13136623799800873, + "learning_rate": 9.107779554604035e-05, + "loss": 0.7418, + "mean_token_accuracy": 0.8217526078224182, + "num_tokens": 446400739.0, + "step": 2520 + }, + { + "entropy": 0.7768492065370083, + "epoch": 0.7815417856316074, + "grad_norm": 0.13927006721496582, + "learning_rate": 9.098612334885972e-05, + "loss": 0.7581, + "mean_token_accuracy": 0.8174782631297907, + "num_tokens": 448488906.0, + "step": 2532 + }, + { + "entropy": 0.7547270730137825, + "epoch": 0.7852457751369705, + "grad_norm": 0.14124959707260132, + "learning_rate": 9.089402923861366e-05, + "loss": 0.7356, + "mean_token_accuracy": 0.8228224813938141, + "num_tokens": 450609731.0, + "step": 2544 + }, + { + "entropy": 0.7677095470329126, + "epoch": 0.7889497646423335, + "grad_norm": 0.14042387902736664, + "learning_rate": 9.080151416332319e-05, + "loss": 0.747, + "mean_token_accuracy": 0.8197631984949112, + "num_tokens": 452710661.0, + "step": 2556 + }, + { + "entropy": 0.768154501914978, + "epoch": 0.7926537541476966, + "grad_norm": 0.12782582640647888, + "learning_rate": 9.070857907534287e-05, + "loss": 0.7473, + "mean_token_accuracy": 0.8194541210929552, + "num_tokens": 454884003.0, + "step": 2568 + }, + { + "entropy": 0.7609333768486977, + "epoch": 0.7963577436530597, + "grad_norm": 0.141867533326149, + "learning_rate": 9.061522493135079e-05, + "loss": 0.7407, + "mean_token_accuracy": 0.82136203845342, + "num_tokens": 457030737.0, + "step": 2580 + }, + { + "entropy": 0.7599871419370174, + "epoch": 0.8000617331584228, + "grad_norm": 0.13808685541152954, + "learning_rate": 9.052145269233887e-05, + "loss": 0.7402, + "mean_token_accuracy": 0.8215218285719553, + "num_tokens": 459177274.0, + "step": 2592 + }, + { + "entropy": 0.7648821386198202, + "epoch": 0.8037657226637858, + "grad_norm": 0.15076588094234467, + "learning_rate": 9.042726332360292e-05, + "loss": 0.7454, + "mean_token_accuracy": 0.8200170012811819, + "num_tokens": 461306463.0, + "step": 2604 + }, + { + "entropy": 0.7687475743393103, + "epoch": 0.8074697121691489, + "grad_norm": 0.12965603172779083, + "learning_rate": 9.033265779473268e-05, + "loss": 0.7492, + "mean_token_accuracy": 0.819452028721571, + "num_tokens": 463455408.0, + "step": 2616 + }, + { + "entropy": 0.7646373882889748, + "epoch": 0.811173701674512, + "grad_norm": 0.1345282644033432, + "learning_rate": 9.023763707960188e-05, + "loss": 0.7434, + "mean_token_accuracy": 0.8209528836111227, + "num_tokens": 465550715.0, + "step": 2628 + }, + { + "entropy": 0.76267567401131, + "epoch": 0.814877691179875, + "grad_norm": 0.1315702497959137, + "learning_rate": 9.01422021563582e-05, + "loss": 0.7427, + "mean_token_accuracy": 0.820941454420487, + "num_tokens": 467676160.0, + "step": 2640 + }, + { + "entropy": 0.7417604538301626, + "epoch": 0.8185816806852381, + "grad_norm": 0.12949636578559875, + "learning_rate": 9.00463540074132e-05, + "loss": 0.7243, + "mean_token_accuracy": 0.8254370614886284, + "num_tokens": 469811393.0, + "step": 2652 + }, + { + "entropy": 0.7587257462243239, + "epoch": 0.8222856701906012, + "grad_norm": 0.13278073072433472, + "learning_rate": 8.995009361943218e-05, + "loss": 0.7384, + "mean_token_accuracy": 0.8231843349834284, + "num_tokens": 471930105.0, + "step": 2664 + }, + { + "entropy": 0.7431893559793631, + "epoch": 0.8259896596959642, + "grad_norm": 0.14032749831676483, + "learning_rate": 8.985342198332407e-05, + "loss": 0.7247, + "mean_token_accuracy": 0.8248631035288175, + "num_tokens": 474075700.0, + "step": 2676 + }, + { + "entropy": 0.7719796399275461, + "epoch": 0.8296936492013273, + "grad_norm": 0.14027121663093567, + "learning_rate": 8.975634009423122e-05, + "loss": 0.752, + "mean_token_accuracy": 0.8185311630368233, + "num_tokens": 476204230.0, + "step": 2688 + }, + { + "entropy": 0.7616085906823477, + "epoch": 0.8333976387066904, + "grad_norm": 0.1393400877714157, + "learning_rate": 8.965884895151908e-05, + "loss": 0.7443, + "mean_token_accuracy": 0.8211442058285078, + "num_tokens": 478348559.0, + "step": 2700 + }, + { + "entropy": 0.7600347759823004, + "epoch": 0.8371016282120534, + "grad_norm": 0.1410454660654068, + "learning_rate": 8.956094955876607e-05, + "loss": 0.7373, + "mean_token_accuracy": 0.8219936129947504, + "num_tokens": 480446587.0, + "step": 2712 + }, + { + "entropy": 0.7613047709067663, + "epoch": 0.8408056177174165, + "grad_norm": 0.140048548579216, + "learning_rate": 8.946264292375306e-05, + "loss": 0.7426, + "mean_token_accuracy": 0.8213416139284769, + "num_tokens": 482581346.0, + "step": 2724 + }, + { + "entropy": 0.7594624037543932, + "epoch": 0.8445096072227796, + "grad_norm": 0.1277952641248703, + "learning_rate": 8.936393005845316e-05, + "loss": 0.739, + "mean_token_accuracy": 0.8218294816712538, + "num_tokens": 484695228.0, + "step": 2736 + }, + { + "entropy": 0.7544240554173788, + "epoch": 0.8482135967281426, + "grad_norm": 0.154397651553154, + "learning_rate": 8.926481197902122e-05, + "loss": 0.7341, + "mean_token_accuracy": 0.8225894048810005, + "num_tokens": 486862761.0, + "step": 2748 + }, + { + "entropy": 0.7661119649807612, + "epoch": 0.8519175862335057, + "grad_norm": 0.14195363223552704, + "learning_rate": 8.916528970578333e-05, + "loss": 0.746, + "mean_token_accuracy": 0.8203906156122684, + "num_tokens": 489014996.0, + "step": 2760 + }, + { + "entropy": 0.7686087116599083, + "epoch": 0.8556215757388688, + "grad_norm": 0.13611237704753876, + "learning_rate": 8.906536426322646e-05, + "loss": 0.7473, + "mean_token_accuracy": 0.8198287424941858, + "num_tokens": 491157432.0, + "step": 2772 + }, + { + "entropy": 0.7590742185711861, + "epoch": 0.8593255652442318, + "grad_norm": 0.12830516695976257, + "learning_rate": 8.896503667998777e-05, + "loss": 0.7414, + "mean_token_accuracy": 0.8214252628386021, + "num_tokens": 493327333.0, + "step": 2784 + }, + { + "entropy": 0.758670142541329, + "epoch": 0.8630295547495949, + "grad_norm": 0.13577738404273987, + "learning_rate": 8.886430798884406e-05, + "loss": 0.74, + "mean_token_accuracy": 0.8217101270953814, + "num_tokens": 495441259.0, + "step": 2796 + }, + { + "entropy": 0.7604162829617659, + "epoch": 0.866733544254958, + "grad_norm": 0.1389269232749939, + "learning_rate": 8.876317922670119e-05, + "loss": 0.7385, + "mean_token_accuracy": 0.8216200309495131, + "num_tokens": 497558061.0, + "step": 2808 + }, + { + "entropy": 0.7654056176543236, + "epoch": 0.870437533760321, + "grad_norm": 0.1383410394191742, + "learning_rate": 8.866165143458334e-05, + "loss": 0.7493, + "mean_token_accuracy": 0.819673765450716, + "num_tokens": 499689660.0, + "step": 2820 + }, + { + "entropy": 0.7578516465922197, + "epoch": 0.8741415232656841, + "grad_norm": 0.144080251455307, + "learning_rate": 8.855972565762236e-05, + "loss": 0.7373, + "mean_token_accuracy": 0.822392825037241, + "num_tokens": 501811384.0, + "step": 2832 + }, + { + "entropy": 0.7520838553706805, + "epoch": 0.8778455127710472, + "grad_norm": 0.1445675939321518, + "learning_rate": 8.845740294504691e-05, + "loss": 0.7321, + "mean_token_accuracy": 0.8238844747344652, + "num_tokens": 503931258.0, + "step": 2844 + }, + { + "entropy": 0.7716477451225122, + "epoch": 0.8815495022764103, + "grad_norm": 0.13848181068897247, + "learning_rate": 8.835468435017183e-05, + "loss": 0.75, + "mean_token_accuracy": 0.8195607153077921, + "num_tokens": 506070704.0, + "step": 2856 + }, + { + "entropy": 0.7465110706786314, + "epoch": 0.8852534917817733, + "grad_norm": 0.1347406953573227, + "learning_rate": 8.825157093038708e-05, + "loss": 0.7295, + "mean_token_accuracy": 0.8236632930735747, + "num_tokens": 508203294.0, + "step": 2868 + }, + { + "entropy": 0.7651778521637121, + "epoch": 0.8889574812871364, + "grad_norm": 0.13069529831409454, + "learning_rate": 8.814806374714702e-05, + "loss": 0.7449, + "mean_token_accuracy": 0.8204664116104444, + "num_tokens": 510395477.0, + "step": 2880 + }, + { + "entropy": 0.7543973810970783, + "epoch": 0.8926614707924995, + "grad_norm": 0.12907341122627258, + "learning_rate": 8.804416386595943e-05, + "loss": 0.733, + "mean_token_accuracy": 0.8230690496663252, + "num_tokens": 512509055.0, + "step": 2892 + }, + { + "entropy": 0.7565698722998301, + "epoch": 0.8963654602978625, + "grad_norm": 0.14323952794075012, + "learning_rate": 8.793987235637453e-05, + "loss": 0.7376, + "mean_token_accuracy": 0.8218328369160494, + "num_tokens": 514596287.0, + "step": 2904 + }, + { + "entropy": 0.7765912314256033, + "epoch": 0.9000694498032256, + "grad_norm": 0.13393579423427582, + "learning_rate": 8.783519029197398e-05, + "loss": 0.7567, + "mean_token_accuracy": 0.8180692394574484, + "num_tokens": 516697723.0, + "step": 2916 + }, + { + "entropy": 0.748100645840168, + "epoch": 0.9037734393085887, + "grad_norm": 0.1474265605211258, + "learning_rate": 8.773011875035983e-05, + "loss": 0.7294, + "mean_token_accuracy": 0.8240405097603798, + "num_tokens": 518833866.0, + "step": 2928 + }, + { + "entropy": 0.7694602260986964, + "epoch": 0.9074774288139517, + "grad_norm": 0.14337094128131866, + "learning_rate": 8.762465881314346e-05, + "loss": 0.7481, + "mean_token_accuracy": 0.8199893161654472, + "num_tokens": 520946745.0, + "step": 2940 + }, + { + "entropy": 0.7642699517309666, + "epoch": 0.9111814183193148, + "grad_norm": 0.1440202295780182, + "learning_rate": 8.751881156593434e-05, + "loss": 0.7471, + "mean_token_accuracy": 0.8201769590377808, + "num_tokens": 523082474.0, + "step": 2952 + }, + { + "entropy": 0.7573205133279165, + "epoch": 0.9148854078246779, + "grad_norm": 0.14229246973991394, + "learning_rate": 8.7412578098329e-05, + "loss": 0.7388, + "mean_token_accuracy": 0.8218303211033344, + "num_tokens": 525230014.0, + "step": 2964 + }, + { + "entropy": 0.774663812170426, + "epoch": 0.9185893973300409, + "grad_norm": 0.1373312920331955, + "learning_rate": 8.730595950389968e-05, + "loss": 0.7537, + "mean_token_accuracy": 0.8188973863919576, + "num_tokens": 527358964.0, + "step": 2976 + }, + { + "entropy": 0.7440223582088947, + "epoch": 0.922293386835404, + "grad_norm": 0.13869309425354004, + "learning_rate": 8.71989568801832e-05, + "loss": 0.7243, + "mean_token_accuracy": 0.8259313181042671, + "num_tokens": 529446921.0, + "step": 2988 + }, + { + "entropy": 0.7558054054776827, + "epoch": 0.9259973763407671, + "grad_norm": 0.14386014640331268, + "learning_rate": 8.709157132866954e-05, + "loss": 0.7371, + "mean_token_accuracy": 0.8218563583989938, + "num_tokens": 531572822.0, + "step": 3000 + }, + { + "entropy": 0.7559870270391306, + "epoch": 0.9297013658461301, + "grad_norm": 0.14299409091472626, + "learning_rate": 8.698380395479058e-05, + "loss": 0.7346, + "mean_token_accuracy": 0.8229275730748972, + "num_tokens": 533671253.0, + "step": 3012 + }, + { + "entropy": 0.7501622214913368, + "epoch": 0.9334053553514932, + "grad_norm": 0.1446056067943573, + "learning_rate": 8.68756558679087e-05, + "loss": 0.7307, + "mean_token_accuracy": 0.8238628643254439, + "num_tokens": 535799339.0, + "step": 3024 + }, + { + "entropy": 0.7404041352371374, + "epoch": 0.9371093448568563, + "grad_norm": 0.14985917508602142, + "learning_rate": 8.676712818130534e-05, + "loss": 0.7227, + "mean_token_accuracy": 0.8250208596388499, + "num_tokens": 537936260.0, + "step": 3036 + }, + { + "entropy": 0.7513143469889959, + "epoch": 0.9408133343622193, + "grad_norm": 0.13818083703517914, + "learning_rate": 8.665822201216958e-05, + "loss": 0.7306, + "mean_token_accuracy": 0.8236184107760588, + "num_tokens": 540092247.0, + "step": 3048 + }, + { + "entropy": 0.7599398357172807, + "epoch": 0.9445173238675824, + "grad_norm": 0.14524659514427185, + "learning_rate": 8.654893848158658e-05, + "loss": 0.7398, + "mean_token_accuracy": 0.8216308690607548, + "num_tokens": 542253810.0, + "step": 3060 + }, + { + "entropy": 0.7556075366834799, + "epoch": 0.9482213133729455, + "grad_norm": 0.14592809975147247, + "learning_rate": 8.643927871452611e-05, + "loss": 0.7387, + "mean_token_accuracy": 0.8219700244565805, + "num_tokens": 544371064.0, + "step": 3072 + }, + { + "entropy": 0.7573518306016922, + "epoch": 0.9519253028783086, + "grad_norm": 0.136802539229393, + "learning_rate": 8.632924383983096e-05, + "loss": 0.7383, + "mean_token_accuracy": 0.8221282474696636, + "num_tokens": 546525688.0, + "step": 3084 + }, + { + "entropy": 0.760393563657999, + "epoch": 0.9556292923836716, + "grad_norm": 0.14470767974853516, + "learning_rate": 8.621883499020523e-05, + "loss": 0.7405, + "mean_token_accuracy": 0.8215649748841921, + "num_tokens": 548650119.0, + "step": 3096 + }, + { + "entropy": 0.7766042277216911, + "epoch": 0.9593332818890347, + "grad_norm": 0.15178759396076202, + "learning_rate": 8.610805330220275e-05, + "loss": 0.755, + "mean_token_accuracy": 0.818018895884355, + "num_tokens": 550763680.0, + "step": 3108 + }, + { + "entropy": 0.7451968391736349, + "epoch": 0.9630372713943978, + "grad_norm": 0.16293193399906158, + "learning_rate": 8.599689991621543e-05, + "loss": 0.7257, + "mean_token_accuracy": 0.8251891148587068, + "num_tokens": 552911674.0, + "step": 3120 + }, + { + "entropy": 0.7404080505172411, + "epoch": 0.9667412608997608, + "grad_norm": 0.14965808391571045, + "learning_rate": 8.588537597646139e-05, + "loss": 0.7215, + "mean_token_accuracy": 0.8259551251928011, + "num_tokens": 555010287.0, + "step": 3132 + }, + { + "entropy": 0.7473993599414825, + "epoch": 0.9704452504051239, + "grad_norm": 0.13951192796230316, + "learning_rate": 8.577348263097324e-05, + "loss": 0.7281, + "mean_token_accuracy": 0.8238498754799366, + "num_tokens": 557140449.0, + "step": 3144 + }, + { + "entropy": 0.749595433473587, + "epoch": 0.974149239910487, + "grad_norm": 0.14272676408290863, + "learning_rate": 8.566122103158636e-05, + "loss": 0.7301, + "mean_token_accuracy": 0.8237165659666061, + "num_tokens": 559258810.0, + "step": 3156 + }, + { + "entropy": 0.7361926498512427, + "epoch": 0.97785322941585, + "grad_norm": 0.14749550819396973, + "learning_rate": 8.554859233392682e-05, + "loss": 0.7167, + "mean_token_accuracy": 0.8265771567821503, + "num_tokens": 561389026.0, + "step": 3168 + }, + { + "entropy": 0.7617857754230499, + "epoch": 0.9815572189212131, + "grad_norm": 0.14244970679283142, + "learning_rate": 8.543559769739974e-05, + "loss": 0.7436, + "mean_token_accuracy": 0.8205315048495928, + "num_tokens": 563506691.0, + "step": 3180 + }, + { + "entropy": 0.7460702173411846, + "epoch": 0.9852612084265762, + "grad_norm": 0.15570423007011414, + "learning_rate": 8.532223828517716e-05, + "loss": 0.7265, + "mean_token_accuracy": 0.8244366881748041, + "num_tokens": 565591234.0, + "step": 3192 + }, + { + "entropy": 0.7432848749061426, + "epoch": 0.9889651979319392, + "grad_norm": 0.1433694213628769, + "learning_rate": 8.520851526418614e-05, + "loss": 0.7252, + "mean_token_accuracy": 0.8244002535939217, + "num_tokens": 567734464.0, + "step": 3204 + }, + { + "entropy": 0.7543482234080633, + "epoch": 0.9926691874373023, + "grad_norm": 0.1491028368473053, + "learning_rate": 8.509442980509678e-05, + "loss": 0.7326, + "mean_token_accuracy": 0.8236272583405176, + "num_tokens": 569859027.0, + "step": 3216 + }, + { + "entropy": 0.7251827741662661, + "epoch": 0.9963731769426654, + "grad_norm": 0.14663253724575043, + "learning_rate": 8.497998308231012e-05, + "loss": 0.706, + "mean_token_accuracy": 0.8289556242525578, + "num_tokens": 571982689.0, + "step": 3228 + }, + { + "entropy": 0.7527169965683146, + "epoch": 1.0, + "grad_norm": 0.1720859855413437, + "learning_rate": 8.486517627394606e-05, + "loss": 0.7331, + "mean_token_accuracy": 0.823117880111045, + "num_tokens": 574035358.0, + "step": 3240 + }, + { + "entropy": 0.7543971252938112, + "epoch": 1.003703989505363, + "grad_norm": 0.14148341119289398, + "learning_rate": 8.475001056183124e-05, + "loss": 0.731, + "mean_token_accuracy": 0.8228112831711769, + "num_tokens": 576181278.0, + "step": 3252 + }, + { + "entropy": 0.7378036280473074, + "epoch": 1.0074079790107262, + "grad_norm": 0.13494503498077393, + "learning_rate": 8.463448713148687e-05, + "loss": 0.7178, + "mean_token_accuracy": 0.825627734263738, + "num_tokens": 578356052.0, + "step": 3264 + }, + { + "entropy": 0.7363200634717941, + "epoch": 1.0111119685160892, + "grad_norm": 0.14111706614494324, + "learning_rate": 8.451860717211653e-05, + "loss": 0.7176, + "mean_token_accuracy": 0.8264128789305687, + "num_tokens": 580483875.0, + "step": 3276 + }, + { + "entropy": 0.7324805557727814, + "epoch": 1.0148159580214522, + "grad_norm": 0.14493153989315033, + "learning_rate": 8.440237187659391e-05, + "loss": 0.7122, + "mean_token_accuracy": 0.8274283918241659, + "num_tokens": 582624687.0, + "step": 3288 + }, + { + "entropy": 0.7307334840297699, + "epoch": 1.0185199475268154, + "grad_norm": 0.14563046395778656, + "learning_rate": 8.42857824414506e-05, + "loss": 0.7103, + "mean_token_accuracy": 0.8277510106563568, + "num_tokens": 584748734.0, + "step": 3300 + }, + { + "entropy": 0.7457285039126873, + "epoch": 1.0222239370321784, + "grad_norm": 0.15820999443531036, + "learning_rate": 8.416884006686366e-05, + "loss": 0.7266, + "mean_token_accuracy": 0.8236699538926283, + "num_tokens": 586852542.0, + "step": 3312 + }, + { + "entropy": 0.7318388596177101, + "epoch": 1.0259279265375414, + "grad_norm": 0.15957045555114746, + "learning_rate": 8.405154595664332e-05, + "loss": 0.7107, + "mean_token_accuracy": 0.8276907838881016, + "num_tokens": 588984878.0, + "step": 3324 + }, + { + "entropy": 0.7418795588115851, + "epoch": 1.0296319160429046, + "grad_norm": 0.14202064275741577, + "learning_rate": 8.39339013182207e-05, + "loss": 0.7207, + "mean_token_accuracy": 0.8251368477940559, + "num_tokens": 591095187.0, + "step": 3336 + }, + { + "entropy": 0.7447669468820095, + "epoch": 1.0333359055482676, + "grad_norm": 0.15226991474628448, + "learning_rate": 8.381590736263512e-05, + "loss": 0.724, + "mean_token_accuracy": 0.8251153019567331, + "num_tokens": 593222885.0, + "step": 3348 + }, + { + "entropy": 0.7374236173927784, + "epoch": 1.0370398950536306, + "grad_norm": 0.14633263647556305, + "learning_rate": 8.369756530452191e-05, + "loss": 0.7177, + "mean_token_accuracy": 0.8263744947810968, + "num_tokens": 595340453.0, + "step": 3360 + }, + { + "entropy": 0.7381995966037115, + "epoch": 1.0407438845589938, + "grad_norm": 0.1505565196275711, + "learning_rate": 8.35788763620997e-05, + "loss": 0.7174, + "mean_token_accuracy": 0.8263110890984535, + "num_tokens": 597449422.0, + "step": 3372 + }, + { + "entropy": 0.7414120820661386, + "epoch": 1.0444478740643568, + "grad_norm": 0.143823504447937, + "learning_rate": 8.345984175715802e-05, + "loss": 0.7195, + "mean_token_accuracy": 0.8251637890934944, + "num_tokens": 599594017.0, + "step": 3384 + }, + { + "entropy": 0.7337635308504105, + "epoch": 1.0481518635697198, + "grad_norm": 0.14950762689113617, + "learning_rate": 8.334046271504465e-05, + "loss": 0.7151, + "mean_token_accuracy": 0.8268262160321077, + "num_tokens": 601723800.0, + "step": 3396 + }, + { + "entropy": 0.737778523315986, + "epoch": 1.051855853075083, + "grad_norm": 0.14158302545547485, + "learning_rate": 8.3220740464653e-05, + "loss": 0.7155, + "mean_token_accuracy": 0.8266446044047674, + "num_tokens": 603851467.0, + "step": 3408 + }, + { + "entropy": 0.7279616557061672, + "epoch": 1.055559842580446, + "grad_norm": 0.14092348515987396, + "learning_rate": 8.310067623840951e-05, + "loss": 0.7091, + "mean_token_accuracy": 0.8280529901385307, + "num_tokens": 605974553.0, + "step": 3420 + }, + { + "entropy": 0.7397043655316035, + "epoch": 1.059263832085809, + "grad_norm": 0.1458451896905899, + "learning_rate": 8.298027127226093e-05, + "loss": 0.7186, + "mean_token_accuracy": 0.8254076987504959, + "num_tokens": 608111416.0, + "step": 3432 + }, + { + "entropy": 0.7327801200250784, + "epoch": 1.0629678215911722, + "grad_norm": 0.14519493281841278, + "learning_rate": 8.28595268056616e-05, + "loss": 0.7121, + "mean_token_accuracy": 0.8273726465801398, + "num_tokens": 610224373.0, + "step": 3444 + }, + { + "entropy": 0.7508813291788101, + "epoch": 1.0666718110965352, + "grad_norm": 0.1475428193807602, + "learning_rate": 8.273844408156066e-05, + "loss": 0.7283, + "mean_token_accuracy": 0.8238155034681162, + "num_tokens": 612367458.0, + "step": 3456 + }, + { + "entropy": 0.7356340115269026, + "epoch": 1.0703758006018984, + "grad_norm": 0.15480412542819977, + "learning_rate": 8.261702434638936e-05, + "loss": 0.7165, + "mean_token_accuracy": 0.8265450286368529, + "num_tokens": 614500562.0, + "step": 3468 + }, + { + "entropy": 0.7494183418651422, + "epoch": 1.0740797901072614, + "grad_norm": 0.14762075245380402, + "learning_rate": 8.249526885004809e-05, + "loss": 0.7274, + "mean_token_accuracy": 0.8240708733598391, + "num_tokens": 616628140.0, + "step": 3480 + }, + { + "entropy": 0.7446132637560368, + "epoch": 1.0777837796126244, + "grad_norm": 0.15463441610336304, + "learning_rate": 8.237317884589361e-05, + "loss": 0.7247, + "mean_token_accuracy": 0.8243404676516851, + "num_tokens": 618719252.0, + "step": 3492 + }, + { + "entropy": 0.7410028763115406, + "epoch": 1.0814877691179876, + "grad_norm": 0.15182152390480042, + "learning_rate": 8.225075559072614e-05, + "loss": 0.7195, + "mean_token_accuracy": 0.8264857692023119, + "num_tokens": 620851273.0, + "step": 3504 + }, + { + "entropy": 0.7477393746376038, + "epoch": 1.0851917586233506, + "grad_norm": 0.14437253773212433, + "learning_rate": 8.212800034477637e-05, + "loss": 0.7256, + "mean_token_accuracy": 0.8241480415066084, + "num_tokens": 622956290.0, + "step": 3516 + }, + { + "entropy": 0.734037263939778, + "epoch": 1.0888957481287136, + "grad_norm": 0.15787097811698914, + "learning_rate": 8.200491437169251e-05, + "loss": 0.713, + "mean_token_accuracy": 0.8271950905521711, + "num_tokens": 625080917.0, + "step": 3528 + }, + { + "entropy": 0.7377768655618032, + "epoch": 1.0925997376340768, + "grad_norm": 0.14514772593975067, + "learning_rate": 8.188149893852732e-05, + "loss": 0.7162, + "mean_token_accuracy": 0.8267666287720203, + "num_tokens": 627213629.0, + "step": 3540 + }, + { + "entropy": 0.7321221468349298, + "epoch": 1.0963037271394398, + "grad_norm": 0.15606361627578735, + "learning_rate": 8.175775531572501e-05, + "loss": 0.7107, + "mean_token_accuracy": 0.8280278158684572, + "num_tokens": 629338939.0, + "step": 3552 + }, + { + "entropy": 0.7345453649759293, + "epoch": 1.1000077166448028, + "grad_norm": 0.14273642003536224, + "learning_rate": 8.163368477710825e-05, + "loss": 0.7143, + "mean_token_accuracy": 0.8269894048571587, + "num_tokens": 631433430.0, + "step": 3564 + }, + { + "entropy": 0.7408470747371515, + "epoch": 1.103711706150166, + "grad_norm": 0.16014736890792847, + "learning_rate": 8.150928859986488e-05, + "loss": 0.7194, + "mean_token_accuracy": 0.8258067518472672, + "num_tokens": 633560482.0, + "step": 3576 + }, + { + "entropy": 0.7225839781264464, + "epoch": 1.107415695655529, + "grad_norm": 0.15194731950759888, + "learning_rate": 8.138456806453503e-05, + "loss": 0.7012, + "mean_token_accuracy": 0.8298800686995188, + "num_tokens": 635693516.0, + "step": 3588 + }, + { + "entropy": 0.7254953247805437, + "epoch": 1.111119685160892, + "grad_norm": 0.15114106237888336, + "learning_rate": 8.125952445499765e-05, + "loss": 0.7036, + "mean_token_accuracy": 0.8291740665833155, + "num_tokens": 637840171.0, + "step": 3600 + }, + { + "entropy": 0.7438126876950264, + "epoch": 1.1148236746662552, + "grad_norm": 0.15659384429454803, + "learning_rate": 8.113415905845751e-05, + "loss": 0.7238, + "mean_token_accuracy": 0.8248664475977421, + "num_tokens": 639953063.0, + "step": 3612 + }, + { + "entropy": 0.7412439535061518, + "epoch": 1.1185276641716182, + "grad_norm": 0.14707545936107635, + "learning_rate": 8.100847316543185e-05, + "loss": 0.7207, + "mean_token_accuracy": 0.825270589441061, + "num_tokens": 642061202.0, + "step": 3624 + }, + { + "entropy": 0.7216433795789877, + "epoch": 1.1222316536769812, + "grad_norm": 0.14075824618339539, + "learning_rate": 8.088246806973712e-05, + "loss": 0.7012, + "mean_token_accuracy": 0.8303951819737753, + "num_tokens": 644170583.0, + "step": 3636 + }, + { + "entropy": 0.7379123046994209, + "epoch": 1.1259356431823444, + "grad_norm": 0.14188188314437866, + "learning_rate": 8.075614506847563e-05, + "loss": 0.7167, + "mean_token_accuracy": 0.8262317391733328, + "num_tokens": 646270524.0, + "step": 3648 + }, + { + "entropy": 0.7330505475401878, + "epoch": 1.1296396326877074, + "grad_norm": 0.14915235340595245, + "learning_rate": 8.062950546202228e-05, + "loss": 0.7143, + "mean_token_accuracy": 0.8266897139449915, + "num_tokens": 648408593.0, + "step": 3660 + }, + { + "entropy": 0.7359581738710403, + "epoch": 1.1333436221930704, + "grad_norm": 0.1471003293991089, + "learning_rate": 8.050255055401105e-05, + "loss": 0.7131, + "mean_token_accuracy": 0.8272727802395821, + "num_tokens": 650550976.0, + "step": 3672 + }, + { + "entropy": 0.7366929526130358, + "epoch": 1.1370476116984336, + "grad_norm": 0.14605937898159027, + "learning_rate": 8.03752816513217e-05, + "loss": 0.7169, + "mean_token_accuracy": 0.825990212460359, + "num_tokens": 652692582.0, + "step": 3684 + }, + { + "entropy": 0.7435482144355774, + "epoch": 1.1407516012037966, + "grad_norm": 0.15462058782577515, + "learning_rate": 8.024770006406628e-05, + "loss": 0.7251, + "mean_token_accuracy": 0.823987594495217, + "num_tokens": 654848825.0, + "step": 3696 + }, + { + "entropy": 0.7348724926511446, + "epoch": 1.1444555907091596, + "grad_norm": 0.1494790017604828, + "learning_rate": 8.011980710557554e-05, + "loss": 0.7139, + "mean_token_accuracy": 0.8266541995108128, + "num_tokens": 656948636.0, + "step": 3708 + }, + { + "entropy": 0.7282378102342287, + "epoch": 1.1481595802145228, + "grad_norm": 0.16433827579021454, + "learning_rate": 7.999160409238563e-05, + "loss": 0.7075, + "mean_token_accuracy": 0.8285117484629154, + "num_tokens": 659061066.0, + "step": 3720 + }, + { + "entropy": 0.738138652096192, + "epoch": 1.1518635697198858, + "grad_norm": 0.15777455270290375, + "learning_rate": 7.986309234422427e-05, + "loss": 0.7177, + "mean_token_accuracy": 0.8264009902874628, + "num_tokens": 661175184.0, + "step": 3732 + }, + { + "entropy": 0.730212123443683, + "epoch": 1.1555675592252488, + "grad_norm": 0.1504729837179184, + "learning_rate": 7.973427318399746e-05, + "loss": 0.7146, + "mean_token_accuracy": 0.827114554742972, + "num_tokens": 663279250.0, + "step": 3744 + }, + { + "entropy": 0.7331996709108353, + "epoch": 1.159271548730612, + "grad_norm": 0.15760678052902222, + "learning_rate": 7.960514793777559e-05, + "loss": 0.7107, + "mean_token_accuracy": 0.8281191426018873, + "num_tokens": 665384207.0, + "step": 3756 + }, + { + "entropy": 0.7236106917262077, + "epoch": 1.162975538235975, + "grad_norm": 0.15555773675441742, + "learning_rate": 7.947571793478e-05, + "loss": 0.7033, + "mean_token_accuracy": 0.8294017761945724, + "num_tokens": 667490801.0, + "step": 3768 + }, + { + "entropy": 0.7413870220383009, + "epoch": 1.166679527741338, + "grad_norm": 0.14847822487354279, + "learning_rate": 7.934598450736919e-05, + "loss": 0.7201, + "mean_token_accuracy": 0.8256005557874838, + "num_tokens": 669641877.0, + "step": 3780 + }, + { + "entropy": 0.7133117939035097, + "epoch": 1.1703835172467012, + "grad_norm": 0.14297045767307281, + "learning_rate": 7.921594899102505e-05, + "loss": 0.6898, + "mean_token_accuracy": 0.8321465166906515, + "num_tokens": 671760517.0, + "step": 3792 + }, + { + "entropy": 0.7260085244973501, + "epoch": 1.1740875067520642, + "grad_norm": 0.154536172747612, + "learning_rate": 7.908561272433932e-05, + "loss": 0.7061, + "mean_token_accuracy": 0.828648411979278, + "num_tokens": 673879141.0, + "step": 3804 + }, + { + "entropy": 0.7265941066046556, + "epoch": 1.1777914962574272, + "grad_norm": 0.15274272859096527, + "learning_rate": 7.895497704899957e-05, + "loss": 0.7083, + "mean_token_accuracy": 0.8286111789445082, + "num_tokens": 676023807.0, + "step": 3816 + }, + { + "entropy": 0.7224672473967075, + "epoch": 1.1814954857627904, + "grad_norm": 0.16113263368606567, + "learning_rate": 7.882404330977556e-05, + "loss": 0.7002, + "mean_token_accuracy": 0.8300817757844925, + "num_tokens": 678114097.0, + "step": 3828 + }, + { + "entropy": 0.731824230402708, + "epoch": 1.1851994752681534, + "grad_norm": 0.16188302636146545, + "learning_rate": 7.869281285450527e-05, + "loss": 0.7136, + "mean_token_accuracy": 0.827064195026954, + "num_tokens": 680225565.0, + "step": 3840 + }, + { + "entropy": 0.745010394603014, + "epoch": 1.1889034647735164, + "grad_norm": 0.1467347890138626, + "learning_rate": 7.856128703408118e-05, + "loss": 0.7239, + "mean_token_accuracy": 0.8247304347654184, + "num_tokens": 682344124.0, + "step": 3852 + }, + { + "entropy": 0.7087207237879435, + "epoch": 1.1926074542788796, + "grad_norm": 0.14708933234214783, + "learning_rate": 7.842946720243617e-05, + "loss": 0.6868, + "mean_token_accuracy": 0.8324318118393421, + "num_tokens": 684423002.0, + "step": 3864 + }, + { + "entropy": 0.7357664232452711, + "epoch": 1.1963114437842426, + "grad_norm": 0.13852429389953613, + "learning_rate": 7.829735471652978e-05, + "loss": 0.7142, + "mean_token_accuracy": 0.8267920973400275, + "num_tokens": 686526635.0, + "step": 3876 + }, + { + "entropy": 0.717096570879221, + "epoch": 1.2000154332896056, + "grad_norm": 0.15155161917209625, + "learning_rate": 7.816495093633405e-05, + "loss": 0.6966, + "mean_token_accuracy": 0.8317411964138349, + "num_tokens": 688658046.0, + "step": 3888 + }, + { + "entropy": 0.7423039426406225, + "epoch": 1.2037194227949688, + "grad_norm": 0.16276352107524872, + "learning_rate": 7.80322572248197e-05, + "loss": 0.7213, + "mean_token_accuracy": 0.8252335165937742, + "num_tokens": 690786172.0, + "step": 3900 + }, + { + "entropy": 0.711784016340971, + "epoch": 1.2074234123003318, + "grad_norm": 0.1595115065574646, + "learning_rate": 7.7899274947942e-05, + "loss": 0.6916, + "mean_token_accuracy": 0.8321591466665268, + "num_tokens": 692906988.0, + "step": 3912 + }, + { + "entropy": 0.7300878415505091, + "epoch": 1.2111274018056948, + "grad_norm": 0.14089138805866241, + "learning_rate": 7.77660054746267e-05, + "loss": 0.7072, + "mean_token_accuracy": 0.8286360974113146, + "num_tokens": 695033002.0, + "step": 3924 + }, + { + "entropy": 0.7247926443815231, + "epoch": 1.214831391311058, + "grad_norm": 0.1512719839811325, + "learning_rate": 7.763245017675596e-05, + "loss": 0.7032, + "mean_token_accuracy": 0.8293510377407074, + "num_tokens": 697153179.0, + "step": 3936 + }, + { + "entropy": 0.7274364518622557, + "epoch": 1.218535380816421, + "grad_norm": 0.14784738421440125, + "learning_rate": 7.749861042915424e-05, + "loss": 0.7094, + "mean_token_accuracy": 0.8280140981078148, + "num_tokens": 699267739.0, + "step": 3948 + }, + { + "entropy": 0.733709204941988, + "epoch": 1.222239370321784, + "grad_norm": 0.1493990421295166, + "learning_rate": 7.736448760957418e-05, + "loss": 0.7153, + "mean_token_accuracy": 0.826848104596138, + "num_tokens": 701376610.0, + "step": 3960 + }, + { + "entropy": 0.7143348765869936, + "epoch": 1.2259433598271472, + "grad_norm": 0.16152378916740417, + "learning_rate": 7.72300830986823e-05, + "loss": 0.692, + "mean_token_accuracy": 0.8318944051861763, + "num_tokens": 703516019.0, + "step": 3972 + }, + { + "entropy": 0.7241542227566242, + "epoch": 1.2296473493325102, + "grad_norm": 0.1497247815132141, + "learning_rate": 7.709539828004492e-05, + "loss": 0.7011, + "mean_token_accuracy": 0.8292028916378816, + "num_tokens": 705617753.0, + "step": 3984 + }, + { + "entropy": 0.7197034644583861, + "epoch": 1.2333513388378732, + "grad_norm": 0.1511349231004715, + "learning_rate": 7.696043454011387e-05, + "loss": 0.7001, + "mean_token_accuracy": 0.8302300423383713, + "num_tokens": 707732735.0, + "step": 3996 + }, + { + "entropy": 0.7363270024458567, + "epoch": 1.2370553283432364, + "grad_norm": 0.15632647275924683, + "learning_rate": 7.682519326821215e-05, + "loss": 0.716, + "mean_token_accuracy": 0.8261935313542684, + "num_tokens": 709835226.0, + "step": 4008 + }, + { + "entropy": 0.7174325250089169, + "epoch": 1.2407593178485994, + "grad_norm": 0.13901802897453308, + "learning_rate": 7.668967585651974e-05, + "loss": 0.6954, + "mean_token_accuracy": 0.8306252273420492, + "num_tokens": 711988818.0, + "step": 4020 + }, + { + "entropy": 0.7259048347671827, + "epoch": 1.2444633073539624, + "grad_norm": 0.1487317979335785, + "learning_rate": 7.65538837000592e-05, + "loss": 0.7057, + "mean_token_accuracy": 0.8284634724259377, + "num_tokens": 714096477.0, + "step": 4032 + }, + { + "entropy": 0.7280755999187628, + "epoch": 1.2481672968593256, + "grad_norm": 0.14473356306552887, + "learning_rate": 7.64178181966813e-05, + "loss": 0.7081, + "mean_token_accuracy": 0.8283463989694914, + "num_tokens": 716245082.0, + "step": 4044 + }, + { + "entropy": 0.7249644311765829, + "epoch": 1.2518712863646886, + "grad_norm": 0.1691472977399826, + "learning_rate": 7.62814807470507e-05, + "loss": 0.7051, + "mean_token_accuracy": 0.8292301384111246, + "num_tokens": 718361640.0, + "step": 4056 + }, + { + "entropy": 0.720829289406538, + "epoch": 1.2555752758700516, + "grad_norm": 0.14902262389659882, + "learning_rate": 7.614487275463143e-05, + "loss": 0.6981, + "mean_token_accuracy": 0.8298750383158525, + "num_tokens": 720489178.0, + "step": 4068 + }, + { + "entropy": 0.7293098631004492, + "epoch": 1.2592792653754148, + "grad_norm": 0.1404629945755005, + "learning_rate": 7.600799562567258e-05, + "loss": 0.7089, + "mean_token_accuracy": 0.8281568810343742, + "num_tokens": 722608223.0, + "step": 4080 + }, + { + "entropy": 0.722664033373197, + "epoch": 1.2629832548807778, + "grad_norm": 0.14373779296875, + "learning_rate": 7.587085076919369e-05, + "loss": 0.7024, + "mean_token_accuracy": 0.8302177861332893, + "num_tokens": 724757494.0, + "step": 4092 + }, + { + "entropy": 0.7234973087906837, + "epoch": 1.2666872443861408, + "grad_norm": 0.14617067575454712, + "learning_rate": 7.573343959697029e-05, + "loss": 0.7046, + "mean_token_accuracy": 0.8289206735789776, + "num_tokens": 726922696.0, + "step": 4104 + }, + { + "entropy": 0.7261980349818865, + "epoch": 1.270391233891504, + "grad_norm": 0.1495673805475235, + "learning_rate": 7.55957635235194e-05, + "loss": 0.706, + "mean_token_accuracy": 0.828493465979894, + "num_tokens": 729052352.0, + "step": 4116 + }, + { + "entropy": 0.7174485189219316, + "epoch": 1.274095223396867, + "grad_norm": 0.14732122421264648, + "learning_rate": 7.545782396608496e-05, + "loss": 0.6992, + "mean_token_accuracy": 0.8296906674901644, + "num_tokens": 731197978.0, + "step": 4128 + }, + { + "entropy": 0.7247343001266321, + "epoch": 1.27779921290223, + "grad_norm": 0.14333774149417877, + "learning_rate": 7.53196223446232e-05, + "loss": 0.7061, + "mean_token_accuracy": 0.8288652760287126, + "num_tokens": 733333853.0, + "step": 4140 + }, + { + "entropy": 0.7190474011003971, + "epoch": 1.2815032024075932, + "grad_norm": 0.15297874808311462, + "learning_rate": 7.518116008178805e-05, + "loss": 0.7012, + "mean_token_accuracy": 0.8297313017149767, + "num_tokens": 735431034.0, + "step": 4152 + }, + { + "entropy": 0.7267656040688356, + "epoch": 1.2852071919129562, + "grad_norm": 0.14727260172367096, + "learning_rate": 7.50424386029165e-05, + "loss": 0.7044, + "mean_token_accuracy": 0.8287480349342028, + "num_tokens": 737589354.0, + "step": 4164 + }, + { + "entropy": 0.7389783896505833, + "epoch": 1.2889111814183194, + "grad_norm": 0.15076899528503418, + "learning_rate": 7.490345933601395e-05, + "loss": 0.7211, + "mean_token_accuracy": 0.8257643903295199, + "num_tokens": 739720304.0, + "step": 4176 + }, + { + "entropy": 0.7206626770397028, + "epoch": 1.2926151709236824, + "grad_norm": 0.16268907487392426, + "learning_rate": 7.476422371173942e-05, + "loss": 0.6993, + "mean_token_accuracy": 0.8301717328528563, + "num_tokens": 741856208.0, + "step": 4188 + }, + { + "entropy": 0.7273670248687267, + "epoch": 1.2963191604290454, + "grad_norm": 0.158157080411911, + "learning_rate": 7.462473316339093e-05, + "loss": 0.7062, + "mean_token_accuracy": 0.8289822128911813, + "num_tokens": 743969037.0, + "step": 4200 + }, + { + "entropy": 0.7194506376981735, + "epoch": 1.3000231499344086, + "grad_norm": 0.16528646647930145, + "learning_rate": 7.44849891268907e-05, + "loss": 0.699, + "mean_token_accuracy": 0.8304046653211117, + "num_tokens": 746092753.0, + "step": 4212 + }, + { + "entropy": 0.7224392394224802, + "epoch": 1.3037271394397716, + "grad_norm": 0.15017327666282654, + "learning_rate": 7.434499304077036e-05, + "loss": 0.7023, + "mean_token_accuracy": 0.8293930192788442, + "num_tokens": 748204673.0, + "step": 4224 + }, + { + "entropy": 0.7125812520583471, + "epoch": 1.3074311289451346, + "grad_norm": 0.15608710050582886, + "learning_rate": 7.420474634615617e-05, + "loss": 0.6921, + "mean_token_accuracy": 0.8319937810301781, + "num_tokens": 750325027.0, + "step": 4236 + }, + { + "entropy": 0.7182938729723295, + "epoch": 1.3111351184504978, + "grad_norm": 0.15669798851013184, + "learning_rate": 7.406425048675409e-05, + "loss": 0.6978, + "mean_token_accuracy": 0.8300301494697729, + "num_tokens": 752470674.0, + "step": 4248 + }, + { + "entropy": 0.7105309578279654, + "epoch": 1.3148391079558608, + "grad_norm": 0.15128910541534424, + "learning_rate": 7.392350690883509e-05, + "loss": 0.6895, + "mean_token_accuracy": 0.8324328971405824, + "num_tokens": 754594955.0, + "step": 4260 + }, + { + "entropy": 0.7113361358642578, + "epoch": 1.3185430974612238, + "grad_norm": 0.15378808975219727, + "learning_rate": 7.378251706122013e-05, + "loss": 0.6903, + "mean_token_accuracy": 0.8316868332525095, + "num_tokens": 756717367.0, + "step": 4272 + }, + { + "entropy": 0.7267830123504003, + "epoch": 1.322247086966587, + "grad_norm": 0.151032954454422, + "learning_rate": 7.364128239526525e-05, + "loss": 0.7045, + "mean_token_accuracy": 0.8282621925075849, + "num_tokens": 758856430.0, + "step": 4284 + }, + { + "entropy": 0.721363440155983, + "epoch": 1.32595107647195, + "grad_norm": 0.16554951667785645, + "learning_rate": 7.349980436484672e-05, + "loss": 0.702, + "mean_token_accuracy": 0.8293261242409548, + "num_tokens": 761009855.0, + "step": 4296 + }, + { + "entropy": 0.717485940704743, + "epoch": 1.3296550659773132, + "grad_norm": 0.14937981963157654, + "learning_rate": 7.335808442634596e-05, + "loss": 0.6975, + "mean_token_accuracy": 0.8307504417995611, + "num_tokens": 763137834.0, + "step": 4308 + }, + { + "entropy": 0.7333954150478045, + "epoch": 1.3333590554826762, + "grad_norm": 0.14564256370067596, + "learning_rate": 7.321612403863465e-05, + "loss": 0.7109, + "mean_token_accuracy": 0.8272267108162245, + "num_tokens": 765243912.0, + "step": 4320 + }, + { + "entropy": 0.7177773999671141, + "epoch": 1.3370630449880392, + "grad_norm": 0.14997075498104095, + "learning_rate": 7.30739246630596e-05, + "loss": 0.6971, + "mean_token_accuracy": 0.8304961547255516, + "num_tokens": 767342231.0, + "step": 4332 + }, + { + "entropy": 0.7231075142820677, + "epoch": 1.3407670344934024, + "grad_norm": 0.15372461080551147, + "learning_rate": 7.293148776342787e-05, + "loss": 0.7022, + "mean_token_accuracy": 0.8295708782970905, + "num_tokens": 769459559.0, + "step": 4344 + }, + { + "entropy": 0.727438664684693, + "epoch": 1.3444710239987654, + "grad_norm": 0.1568097472190857, + "learning_rate": 7.278881480599151e-05, + "loss": 0.7063, + "mean_token_accuracy": 0.8285723961889744, + "num_tokens": 771586827.0, + "step": 4356 + }, + { + "entropy": 0.7256615745524565, + "epoch": 1.3481750135041284, + "grad_norm": 0.15446576476097107, + "learning_rate": 7.264590725943263e-05, + "loss": 0.7042, + "mean_token_accuracy": 0.8298846408724785, + "num_tokens": 773753359.0, + "step": 4368 + }, + { + "entropy": 0.716560627023379, + "epoch": 1.3518790030094916, + "grad_norm": 0.15649054944515228, + "learning_rate": 7.250276659484814e-05, + "loss": 0.6973, + "mean_token_accuracy": 0.8308561046918234, + "num_tokens": 775838605.0, + "step": 4380 + }, + { + "entropy": 0.7301006155709425, + "epoch": 1.3555829925148546, + "grad_norm": 0.15302646160125732, + "learning_rate": 7.235939428573473e-05, + "loss": 0.7084, + "mean_token_accuracy": 0.8278401518861452, + "num_tokens": 777947001.0, + "step": 4392 + }, + { + "entropy": 0.7269327379763126, + "epoch": 1.3592869820202176, + "grad_norm": 0.15263816714286804, + "learning_rate": 7.221579180797365e-05, + "loss": 0.7051, + "mean_token_accuracy": 0.8292628613611063, + "num_tokens": 780073853.0, + "step": 4404 + }, + { + "entropy": 0.7197130558391412, + "epoch": 1.3629909715255808, + "grad_norm": 0.1532004326581955, + "learning_rate": 7.207196063981552e-05, + "loss": 0.6983, + "mean_token_accuracy": 0.8306596241891384, + "num_tokens": 782197116.0, + "step": 4416 + }, + { + "entropy": 0.7153556197881699, + "epoch": 1.3666949610309438, + "grad_norm": 0.16322393715381622, + "learning_rate": 7.192790226186505e-05, + "loss": 0.694, + "mean_token_accuracy": 0.8311380048592886, + "num_tokens": 784336823.0, + "step": 4428 + }, + { + "entropy": 0.7153538229564825, + "epoch": 1.3703989505363068, + "grad_norm": 0.17276719212532043, + "learning_rate": 7.178361815706594e-05, + "loss": 0.6943, + "mean_token_accuracy": 0.830880576123794, + "num_tokens": 786476106.0, + "step": 4440 + }, + { + "entropy": 0.7140049921969572, + "epoch": 1.37410294004167, + "grad_norm": 0.1465296596288681, + "learning_rate": 7.163910981068547e-05, + "loss": 0.6914, + "mean_token_accuracy": 0.8320667843023936, + "num_tokens": 788592462.0, + "step": 4452 + }, + { + "entropy": 0.7188820218046507, + "epoch": 1.377806929547033, + "grad_norm": 0.1437097191810608, + "learning_rate": 7.14943787102993e-05, + "loss": 0.6992, + "mean_token_accuracy": 0.8301914831002554, + "num_tokens": 790710211.0, + "step": 4464 + }, + { + "entropy": 0.7231453433632851, + "epoch": 1.381510919052396, + "grad_norm": 0.15103840827941895, + "learning_rate": 7.134942634577614e-05, + "loss": 0.7028, + "mean_token_accuracy": 0.8289699579278628, + "num_tokens": 792853918.0, + "step": 4476 + }, + { + "entropy": 0.7152188581724962, + "epoch": 1.3852149085577592, + "grad_norm": 0.15957361459732056, + "learning_rate": 7.12042542092624e-05, + "loss": 0.6933, + "mean_token_accuracy": 0.8314299285411835, + "num_tokens": 794995486.0, + "step": 4488 + }, + { + "entropy": 0.7186589650809765, + "epoch": 1.3889188980631222, + "grad_norm": 0.15625914931297302, + "learning_rate": 7.105886379516679e-05, + "loss": 0.7, + "mean_token_accuracy": 0.8297496847808361, + "num_tokens": 797143590.0, + "step": 4500 + }, + { + "entropy": 0.7220030228296915, + "epoch": 1.3926228875684852, + "grad_norm": 0.15391391515731812, + "learning_rate": 7.091325660014505e-05, + "loss": 0.7001, + "mean_token_accuracy": 0.8300996559361616, + "num_tokens": 799290589.0, + "step": 4512 + }, + { + "entropy": 0.7087641693651676, + "epoch": 1.3963268770738484, + "grad_norm": 0.1555260866880417, + "learning_rate": 7.076743412308441e-05, + "loss": 0.6868, + "mean_token_accuracy": 0.8325341766079267, + "num_tokens": 801404554.0, + "step": 4524 + }, + { + "entropy": 0.734593483308951, + "epoch": 1.4000308665792114, + "grad_norm": 0.16026608645915985, + "learning_rate": 7.062139786508827e-05, + "loss": 0.7146, + "mean_token_accuracy": 0.8272804208099842, + "num_tokens": 803507633.0, + "step": 4536 + }, + { + "entropy": 0.7193813882768154, + "epoch": 1.4037348560845744, + "grad_norm": 0.15414464473724365, + "learning_rate": 7.047514932946068e-05, + "loss": 0.6982, + "mean_token_accuracy": 0.83012605458498, + "num_tokens": 805648024.0, + "step": 4548 + }, + { + "entropy": 0.7172121778130531, + "epoch": 1.4074388455899376, + "grad_norm": 0.15337833762168884, + "learning_rate": 7.032869002169088e-05, + "loss": 0.6981, + "mean_token_accuracy": 0.8307264360288779, + "num_tokens": 807789002.0, + "step": 4560 + }, + { + "entropy": 0.7184814922511578, + "epoch": 1.4111428350953006, + "grad_norm": 0.15992848575115204, + "learning_rate": 7.01820214494378e-05, + "loss": 0.6974, + "mean_token_accuracy": 0.8301586744685968, + "num_tokens": 809914470.0, + "step": 4572 + }, + { + "entropy": 0.7108126245439053, + "epoch": 1.4148468246006636, + "grad_norm": 0.15666691958904266, + "learning_rate": 7.00351451225146e-05, + "loss": 0.6883, + "mean_token_accuracy": 0.832117979725202, + "num_tokens": 812044112.0, + "step": 4584 + }, + { + "entropy": 0.7207119353115559, + "epoch": 1.4185508141060268, + "grad_norm": 0.162274569272995, + "learning_rate": 6.9888062552873e-05, + "loss": 0.6988, + "mean_token_accuracy": 0.830579354117314, + "num_tokens": 814207854.0, + "step": 4596 + }, + { + "entropy": 0.7099998084207376, + "epoch": 1.4222548036113898, + "grad_norm": 0.15512961149215698, + "learning_rate": 6.974077525458785e-05, + "loss": 0.6901, + "mean_token_accuracy": 0.8324882164597511, + "num_tokens": 816326311.0, + "step": 4608 + }, + { + "entropy": 0.7227374029656252, + "epoch": 1.4259587931167528, + "grad_norm": 0.15393690764904022, + "learning_rate": 6.95932847438415e-05, + "loss": 0.7023, + "mean_token_accuracy": 0.8297974628706773, + "num_tokens": 818475940.0, + "step": 4620 + }, + { + "entropy": 0.7139190497497717, + "epoch": 1.429662782622116, + "grad_norm": 0.14897596836090088, + "learning_rate": 6.944559253890809e-05, + "loss": 0.6923, + "mean_token_accuracy": 0.8322372958064079, + "num_tokens": 820570551.0, + "step": 4632 + }, + { + "entropy": 0.7113192056616148, + "epoch": 1.433366772127479, + "grad_norm": 0.16115012764930725, + "learning_rate": 6.92977001601381e-05, + "loss": 0.6919, + "mean_token_accuracy": 0.8324420315523943, + "num_tokens": 822687816.0, + "step": 4644 + }, + { + "entropy": 0.7261552785833677, + "epoch": 1.437070761632842, + "grad_norm": 0.17303551733493805, + "learning_rate": 6.914960912994257e-05, + "loss": 0.7047, + "mean_token_accuracy": 0.8293804277976354, + "num_tokens": 824846735.0, + "step": 4656 + }, + { + "entropy": 0.7077195967237154, + "epoch": 1.4407747511382052, + "grad_norm": 0.15039795637130737, + "learning_rate": 6.900132097277748e-05, + "loss": 0.687, + "mean_token_accuracy": 0.8330833079914252, + "num_tokens": 827013414.0, + "step": 4668 + }, + { + "entropy": 0.7014664622644583, + "epoch": 1.4444787406435682, + "grad_norm": 0.1548740714788437, + "learning_rate": 6.885283721512803e-05, + "loss": 0.6819, + "mean_token_accuracy": 0.834053193529447, + "num_tokens": 829191903.0, + "step": 4680 + }, + { + "entropy": 0.7116682541867098, + "epoch": 1.4481827301489312, + "grad_norm": 0.15452542901039124, + "learning_rate": 6.870415938549292e-05, + "loss": 0.6917, + "mean_token_accuracy": 0.8317574722071489, + "num_tokens": 831331900.0, + "step": 4692 + }, + { + "entropy": 0.7132287646333376, + "epoch": 1.4518867196542944, + "grad_norm": 0.15533322095870972, + "learning_rate": 6.855528901436871e-05, + "loss": 0.6894, + "mean_token_accuracy": 0.8324517086148262, + "num_tokens": 833489694.0, + "step": 4704 + }, + { + "entropy": 0.7113682615260283, + "epoch": 1.4555907091596574, + "grad_norm": 0.1611347496509552, + "learning_rate": 6.840622763423391e-05, + "loss": 0.6906, + "mean_token_accuracy": 0.8323202182849249, + "num_tokens": 835603454.0, + "step": 4716 + }, + { + "entropy": 0.6990887063244978, + "epoch": 1.4592946986650204, + "grad_norm": 0.1729213446378708, + "learning_rate": 6.825697677953332e-05, + "loss": 0.6819, + "mean_token_accuracy": 0.8341849880913893, + "num_tokens": 837735389.0, + "step": 4728 + }, + { + "entropy": 0.7129435998698076, + "epoch": 1.4629986881703836, + "grad_norm": 0.1657145619392395, + "learning_rate": 6.810753798666223e-05, + "loss": 0.6948, + "mean_token_accuracy": 0.8312515988945961, + "num_tokens": 839880792.0, + "step": 4740 + }, + { + "entropy": 0.7213788678248724, + "epoch": 1.4667026776757466, + "grad_norm": 0.1646701693534851, + "learning_rate": 6.795791279395052e-05, + "loss": 0.6996, + "mean_token_accuracy": 0.8302897252142429, + "num_tokens": 842005360.0, + "step": 4752 + }, + { + "entropy": 0.7203125320374966, + "epoch": 1.4704066671811096, + "grad_norm": 0.1686820685863495, + "learning_rate": 6.780810274164691e-05, + "loss": 0.6968, + "mean_token_accuracy": 0.8303221389651299, + "num_tokens": 844137345.0, + "step": 4764 + }, + { + "entropy": 0.7195424201587836, + "epoch": 1.4741106566864728, + "grad_norm": 0.1486455649137497, + "learning_rate": 6.765810937190306e-05, + "loss": 0.6999, + "mean_token_accuracy": 0.8301795323689779, + "num_tokens": 846276636.0, + "step": 4776 + }, + { + "entropy": 0.7059193042417368, + "epoch": 1.4778146461918358, + "grad_norm": 0.16239939630031586, + "learning_rate": 6.750793422875771e-05, + "loss": 0.6847, + "mean_token_accuracy": 0.833564005792141, + "num_tokens": 848408984.0, + "step": 4788 + }, + { + "entropy": 0.724933902422587, + "epoch": 1.4815186356971988, + "grad_norm": 0.16981275379657745, + "learning_rate": 6.73575788581208e-05, + "loss": 0.7054, + "mean_token_accuracy": 0.8286726363003254, + "num_tokens": 850546298.0, + "step": 4800 + }, + { + "entropy": 0.7115325927734375, + "epoch": 1.485222625202562, + "grad_norm": 0.17294248938560486, + "learning_rate": 6.720704480775753e-05, + "loss": 0.6899, + "mean_token_accuracy": 0.8321196387211481, + "num_tokens": 852661935.0, + "step": 4812 + }, + { + "entropy": 0.7177861680587133, + "epoch": 1.488926614707925, + "grad_norm": 0.160500168800354, + "learning_rate": 6.705633362727243e-05, + "loss": 0.6948, + "mean_token_accuracy": 0.8320040429631869, + "num_tokens": 854775802.0, + "step": 4824 + }, + { + "entropy": 0.704534916828076, + "epoch": 1.492630604213288, + "grad_norm": 0.16029898822307587, + "learning_rate": 6.690544686809342e-05, + "loss": 0.6824, + "mean_token_accuracy": 0.8342838796476523, + "num_tokens": 856867232.0, + "step": 4836 + }, + { + "entropy": 0.7108530278007189, + "epoch": 1.4963345937186512, + "grad_norm": 0.16785788536071777, + "learning_rate": 6.675438608345583e-05, + "loss": 0.69, + "mean_token_accuracy": 0.832078884045283, + "num_tokens": 859022001.0, + "step": 4848 + }, + { + "entropy": 0.7115449421107769, + "epoch": 1.5000385832240142, + "grad_norm": 0.15710817277431488, + "learning_rate": 6.660315282838643e-05, + "loss": 0.6926, + "mean_token_accuracy": 0.8314018162588278, + "num_tokens": 861145193.0, + "step": 4860 + }, + { + "entropy": 0.7248165036241213, + "epoch": 1.5037425727293772, + "grad_norm": 0.16734077036380768, + "learning_rate": 6.645174865968742e-05, + "loss": 0.7031, + "mean_token_accuracy": 0.8290742263197899, + "num_tokens": 863278162.0, + "step": 4872 + }, + { + "entropy": 0.7192536381383737, + "epoch": 1.5074465622347404, + "grad_norm": 0.15687040984630585, + "learning_rate": 6.630017513592035e-05, + "loss": 0.6977, + "mean_token_accuracy": 0.8306297038992246, + "num_tokens": 865417354.0, + "step": 4884 + }, + { + "entropy": 0.7058528165022532, + "epoch": 1.5111505517401034, + "grad_norm": 0.17756927013397217, + "learning_rate": 6.614843381739014e-05, + "loss": 0.6867, + "mean_token_accuracy": 0.8327944378058115, + "num_tokens": 867563022.0, + "step": 4896 + }, + { + "entropy": 0.7131319008767605, + "epoch": 1.5148545412454664, + "grad_norm": 0.15551723539829254, + "learning_rate": 6.5996526266129e-05, + "loss": 0.6927, + "mean_token_accuracy": 0.8321846400698026, + "num_tokens": 869656511.0, + "step": 4908 + }, + { + "entropy": 0.7125682967404524, + "epoch": 1.5185585307508296, + "grad_norm": 0.15141348540782928, + "learning_rate": 6.584445404588038e-05, + "loss": 0.6929, + "mean_token_accuracy": 0.831573948264122, + "num_tokens": 871824051.0, + "step": 4920 + }, + { + "entropy": 0.7077928557991982, + "epoch": 1.5222625202561926, + "grad_norm": 0.16390904784202576, + "learning_rate": 6.569221872208277e-05, + "loss": 0.6869, + "mean_token_accuracy": 0.8330778690675894, + "num_tokens": 873933807.0, + "step": 4932 + }, + { + "entropy": 0.7208421056469282, + "epoch": 1.5259665097615556, + "grad_norm": 0.1592690497636795, + "learning_rate": 6.553982186185374e-05, + "loss": 0.7006, + "mean_token_accuracy": 0.8300425770382086, + "num_tokens": 876062457.0, + "step": 4944 + }, + { + "entropy": 0.7219262520472208, + "epoch": 1.5296704992669188, + "grad_norm": 0.16316470503807068, + "learning_rate": 6.538726503397362e-05, + "loss": 0.6995, + "mean_token_accuracy": 0.8302759416401386, + "num_tokens": 878177430.0, + "step": 4956 + }, + { + "entropy": 0.7148293716212114, + "epoch": 1.5333744887722818, + "grad_norm": 0.16070803999900818, + "learning_rate": 6.523454980886957e-05, + "loss": 0.6935, + "mean_token_accuracy": 0.8319490551948547, + "num_tokens": 880334120.0, + "step": 4968 + }, + { + "entropy": 0.7131251158813635, + "epoch": 1.5370784782776448, + "grad_norm": 0.15670515596866608, + "learning_rate": 6.508167775859918e-05, + "loss": 0.692, + "mean_token_accuracy": 0.8319077553848425, + "num_tokens": 882447090.0, + "step": 4980 + }, + { + "entropy": 0.7086505778133869, + "epoch": 1.540782467783008, + "grad_norm": 0.15983329713344574, + "learning_rate": 6.49286504568345e-05, + "loss": 0.6881, + "mean_token_accuracy": 0.8326895671586195, + "num_tokens": 884569531.0, + "step": 4992 + }, + { + "entropy": 0.7085077924033006, + "epoch": 1.544486457288371, + "grad_norm": 0.15563714504241943, + "learning_rate": 6.477546947884572e-05, + "loss": 0.6866, + "mean_token_accuracy": 0.8332745619118214, + "num_tokens": 886683533.0, + "step": 5004 + }, + { + "entropy": 0.7201697329680125, + "epoch": 1.548190446793734, + "grad_norm": 0.15200090408325195, + "learning_rate": 6.462213640148495e-05, + "loss": 0.7011, + "mean_token_accuracy": 0.82986398289601, + "num_tokens": 888793750.0, + "step": 5016 + }, + { + "entropy": 0.7024264633655548, + "epoch": 1.5518944362990972, + "grad_norm": 0.17173556983470917, + "learning_rate": 6.446865280317005e-05, + "loss": 0.6812, + "mean_token_accuracy": 0.8346923552453518, + "num_tokens": 890929618.0, + "step": 5028 + }, + { + "entropy": 0.7268513465921084, + "epoch": 1.5555984258044602, + "grad_norm": 0.16845101118087769, + "learning_rate": 6.431502026386835e-05, + "loss": 0.7049, + "mean_token_accuracy": 0.8287280350923538, + "num_tokens": 893036881.0, + "step": 5040 + }, + { + "entropy": 0.7066880414883295, + "epoch": 1.5593024153098232, + "grad_norm": 0.15314914286136627, + "learning_rate": 6.416124036508035e-05, + "loss": 0.6848, + "mean_token_accuracy": 0.8333163931965828, + "num_tokens": 895190885.0, + "step": 5052 + }, + { + "entropy": 0.7045964933931828, + "epoch": 1.5630064048151864, + "grad_norm": 0.16949310898780823, + "learning_rate": 6.400731468982353e-05, + "loss": 0.6842, + "mean_token_accuracy": 0.8338618675867716, + "num_tokens": 897319125.0, + "step": 5064 + }, + { + "entropy": 0.7159998863935471, + "epoch": 1.5667103943205494, + "grad_norm": 0.15537752211093903, + "learning_rate": 6.385324482261597e-05, + "loss": 0.6948, + "mean_token_accuracy": 0.8313898133734862, + "num_tokens": 899443900.0, + "step": 5076 + }, + { + "entropy": 0.7135510904093584, + "epoch": 1.5704143838259124, + "grad_norm": 0.15910863876342773, + "learning_rate": 6.369903234946003e-05, + "loss": 0.6941, + "mean_token_accuracy": 0.8314098492264748, + "num_tokens": 901571980.0, + "step": 5088 + }, + { + "entropy": 0.7235615576306978, + "epoch": 1.5741183733312756, + "grad_norm": 0.15910740196704865, + "learning_rate": 6.354467885782614e-05, + "loss": 0.7035, + "mean_token_accuracy": 0.8294711311658224, + "num_tokens": 903733303.0, + "step": 5100 + }, + { + "entropy": 0.725119836628437, + "epoch": 1.5778223628366388, + "grad_norm": 0.15643955767154694, + "learning_rate": 6.339018593663633e-05, + "loss": 0.7048, + "mean_token_accuracy": 0.8289666138589382, + "num_tokens": 905831645.0, + "step": 5112 + }, + { + "entropy": 0.7218106302122275, + "epoch": 1.5815263523420016, + "grad_norm": 0.15788429975509644, + "learning_rate": 6.323555517624792e-05, + "loss": 0.6997, + "mean_token_accuracy": 0.8302192588647207, + "num_tokens": 907975789.0, + "step": 5124 + }, + { + "entropy": 0.7207488107184569, + "epoch": 1.5852303418473648, + "grad_norm": 0.17031899094581604, + "learning_rate": 6.308078816843721e-05, + "loss": 0.6983, + "mean_token_accuracy": 0.830046666165193, + "num_tokens": 910106098.0, + "step": 5136 + }, + { + "entropy": 0.7064071434239546, + "epoch": 1.588934331352728, + "grad_norm": 0.16523528099060059, + "learning_rate": 6.292588650638298e-05, + "loss": 0.6825, + "mean_token_accuracy": 0.8339592938621839, + "num_tokens": 912222945.0, + "step": 5148 + }, + { + "entropy": 0.6991514513889948, + "epoch": 1.5926383208580908, + "grad_norm": 0.1478477269411087, + "learning_rate": 6.277085178465015e-05, + "loss": 0.6804, + "mean_token_accuracy": 0.8346445349355539, + "num_tokens": 914355424.0, + "step": 5160 + }, + { + "entropy": 0.7049112543463707, + "epoch": 1.596342310363454, + "grad_norm": 0.16001063585281372, + "learning_rate": 6.261568559917336e-05, + "loss": 0.6867, + "mean_token_accuracy": 0.8334415753682455, + "num_tokens": 916470693.0, + "step": 5172 + }, + { + "entropy": 0.7129873298108578, + "epoch": 1.6000462998688172, + "grad_norm": 0.15970784425735474, + "learning_rate": 6.24603895472406e-05, + "loss": 0.6907, + "mean_token_accuracy": 0.8320851363241673, + "num_tokens": 918571794.0, + "step": 5184 + }, + { + "entropy": 0.6842332072556019, + "epoch": 1.60375028937418, + "grad_norm": 0.15537609159946442, + "learning_rate": 6.230496522747666e-05, + "loss": 0.6631, + "mean_token_accuracy": 0.8387776849170526, + "num_tokens": 920723475.0, + "step": 5196 + }, + { + "entropy": 0.7234001358350118, + "epoch": 1.6074542788795432, + "grad_norm": 0.1752452850341797, + "learning_rate": 6.214941423982674e-05, + "loss": 0.7009, + "mean_token_accuracy": 0.829645175486803, + "num_tokens": 922804826.0, + "step": 5208 + }, + { + "entropy": 0.7220541623731455, + "epoch": 1.6111582683849064, + "grad_norm": 0.2073037028312683, + "learning_rate": 6.199373818553996e-05, + "loss": 0.6998, + "mean_token_accuracy": 0.8298226619760195, + "num_tokens": 924951733.0, + "step": 5220 + }, + { + "entropy": 0.7145507372915745, + "epoch": 1.6148622578902692, + "grad_norm": 0.15807439386844635, + "learning_rate": 6.183793866715285e-05, + "loss": 0.6921, + "mean_token_accuracy": 0.8318213013311228, + "num_tokens": 927091486.0, + "step": 5232 + }, + { + "entropy": 0.7239471996823946, + "epoch": 1.6185662473956324, + "grad_norm": 0.14956313371658325, + "learning_rate": 6.168201728847298e-05, + "loss": 0.7061, + "mean_token_accuracy": 0.829018946737051, + "num_tokens": 929222367.0, + "step": 5244 + }, + { + "entropy": 0.7157949171960354, + "epoch": 1.6222702369009956, + "grad_norm": 0.15014518797397614, + "learning_rate": 6.152597565456225e-05, + "loss": 0.6951, + "mean_token_accuracy": 0.831564124673605, + "num_tokens": 931355312.0, + "step": 5256 + }, + { + "entropy": 0.7077574829260508, + "epoch": 1.6259742264063584, + "grad_norm": 0.1514291763305664, + "learning_rate": 6.136981537172054e-05, + "loss": 0.6871, + "mean_token_accuracy": 0.8334880148371061, + "num_tokens": 933487638.0, + "step": 5268 + }, + { + "entropy": 0.7066128787895044, + "epoch": 1.6296782159117216, + "grad_norm": 0.1583339273929596, + "learning_rate": 6.121353804746907e-05, + "loss": 0.6874, + "mean_token_accuracy": 0.8333603926002979, + "num_tokens": 935600348.0, + "step": 5280 + }, + { + "entropy": 0.716248465081056, + "epoch": 1.6333822054170848, + "grad_norm": 0.15541380643844604, + "learning_rate": 6.105714529053391e-05, + "loss": 0.6937, + "mean_token_accuracy": 0.8314491584897041, + "num_tokens": 937702410.0, + "step": 5292 + }, + { + "entropy": 0.7185319637258848, + "epoch": 1.6370861949224476, + "grad_norm": 0.1648872047662735, + "learning_rate": 6.090063871082941e-05, + "loss": 0.698, + "mean_token_accuracy": 0.8299818920592467, + "num_tokens": 939791729.0, + "step": 5304 + }, + { + "entropy": 0.7117224608858427, + "epoch": 1.6407901844278108, + "grad_norm": 0.16068147122859955, + "learning_rate": 6.0744019919441564e-05, + "loss": 0.6932, + "mean_token_accuracy": 0.8325706832110882, + "num_tokens": 941926316.0, + "step": 5316 + }, + { + "entropy": 0.703471340239048, + "epoch": 1.644494173933174, + "grad_norm": 0.15285624563694, + "learning_rate": 6.058729052861156e-05, + "loss": 0.6823, + "mean_token_accuracy": 0.8336320606370767, + "num_tokens": 944021627.0, + "step": 5328 + }, + { + "entropy": 0.7178343534469604, + "epoch": 1.648198163438537, + "grad_norm": 0.15492397546768188, + "learning_rate": 6.043045215171903e-05, + "loss": 0.6949, + "mean_token_accuracy": 0.8309007274607817, + "num_tokens": 946154913.0, + "step": 5340 + }, + { + "entropy": 0.710145698239406, + "epoch": 1.6519021529439, + "grad_norm": 0.15523739159107208, + "learning_rate": 6.0273506403265543e-05, + "loss": 0.6886, + "mean_token_accuracy": 0.832029789686203, + "num_tokens": 948306687.0, + "step": 5352 + }, + { + "entropy": 0.7194016513725122, + "epoch": 1.6556061424492632, + "grad_norm": 0.15659694373607635, + "learning_rate": 6.0116454898857974e-05, + "loss": 0.6978, + "mean_token_accuracy": 0.8309724852442741, + "num_tokens": 950424446.0, + "step": 5364 + }, + { + "entropy": 0.6990312064687411, + "epoch": 1.6593101319546262, + "grad_norm": 0.1696748435497284, + "learning_rate": 5.99592992551918e-05, + "loss": 0.6767, + "mean_token_accuracy": 0.835218321532011, + "num_tokens": 952534193.0, + "step": 5376 + }, + { + "entropy": 0.7059058484931787, + "epoch": 1.6630141214599892, + "grad_norm": 0.16559961438179016, + "learning_rate": 5.98020410900346e-05, + "loss": 0.6848, + "mean_token_accuracy": 0.8346103342870871, + "num_tokens": 954639431.0, + "step": 5388 + }, + { + "entropy": 0.7005784672995409, + "epoch": 1.6667181109653524, + "grad_norm": 0.1537712663412094, + "learning_rate": 5.964468202220919e-05, + "loss": 0.6797, + "mean_token_accuracy": 0.8347668411831061, + "num_tokens": 956782039.0, + "step": 5400 + }, + { + "entropy": 0.7165911011397839, + "epoch": 1.6704221004707154, + "grad_norm": 0.16163314878940582, + "learning_rate": 5.9487223671577206e-05, + "loss": 0.6932, + "mean_token_accuracy": 0.8314206625024477, + "num_tokens": 958932938.0, + "step": 5412 + }, + { + "entropy": 0.7195808986822764, + "epoch": 1.6741260899760784, + "grad_norm": 0.148748978972435, + "learning_rate": 5.932966765902225e-05, + "loss": 0.6988, + "mean_token_accuracy": 0.8302495119472345, + "num_tokens": 961062826.0, + "step": 5424 + }, + { + "entropy": 0.7168962607781092, + "epoch": 1.6778300794814416, + "grad_norm": 0.15100803971290588, + "learning_rate": 5.917201560643323e-05, + "loss": 0.6955, + "mean_token_accuracy": 0.8308056443929672, + "num_tokens": 963169661.0, + "step": 5436 + }, + { + "entropy": 0.7153752135733763, + "epoch": 1.6815340689868046, + "grad_norm": 0.17240118980407715, + "learning_rate": 5.901426913668777e-05, + "loss": 0.6941, + "mean_token_accuracy": 0.8311421684920788, + "num_tokens": 965320235.0, + "step": 5448 + }, + { + "entropy": 0.7057120179136595, + "epoch": 1.6852380584921676, + "grad_norm": 0.16611547768115997, + "learning_rate": 5.8856429873635366e-05, + "loss": 0.6831, + "mean_token_accuracy": 0.8337946385145187, + "num_tokens": 967435851.0, + "step": 5460 + }, + { + "entropy": 0.7108418879409631, + "epoch": 1.6889420479975308, + "grad_norm": 0.14767956733703613, + "learning_rate": 5.869849944208076e-05, + "loss": 0.6886, + "mean_token_accuracy": 0.8330086718002955, + "num_tokens": 969573643.0, + "step": 5472 + }, + { + "entropy": 0.7108013468484083, + "epoch": 1.6926460375028938, + "grad_norm": 0.17264387011528015, + "learning_rate": 5.854047946776717e-05, + "loss": 0.6897, + "mean_token_accuracy": 0.8324159942567348, + "num_tokens": 971716883.0, + "step": 5484 + }, + { + "entropy": 0.7093252340952555, + "epoch": 1.6963500270082568, + "grad_norm": 0.16304221749305725, + "learning_rate": 5.8382371577359584e-05, + "loss": 0.6887, + "mean_token_accuracy": 0.8329907159010569, + "num_tokens": 973827983.0, + "step": 5496 + }, + { + "entropy": 0.7014658078551292, + "epoch": 1.70005401651362, + "grad_norm": 0.15241508185863495, + "learning_rate": 5.8224177398428016e-05, + "loss": 0.6771, + "mean_token_accuracy": 0.8354054316878319, + "num_tokens": 975953202.0, + "step": 5508 + }, + { + "entropy": 0.7008350156247616, + "epoch": 1.703758006018983, + "grad_norm": 0.1691211462020874, + "learning_rate": 5.8065898559430706e-05, + "loss": 0.6805, + "mean_token_accuracy": 0.834287978708744, + "num_tokens": 978114142.0, + "step": 5520 + }, + { + "entropy": 0.7018342142303785, + "epoch": 1.707461995524346, + "grad_norm": 0.1586218923330307, + "learning_rate": 5.790753668969742e-05, + "loss": 0.6824, + "mean_token_accuracy": 0.8338803065319856, + "num_tokens": 980256061.0, + "step": 5532 + }, + { + "entropy": 0.7026222559312979, + "epoch": 1.7111659850297092, + "grad_norm": 0.15604816377162933, + "learning_rate": 5.7749093419412626e-05, + "loss": 0.6788, + "mean_token_accuracy": 0.8349755468467871, + "num_tokens": 982408166.0, + "step": 5544 + }, + { + "entropy": 0.6918922699987888, + "epoch": 1.7148699745350722, + "grad_norm": 0.1606925129890442, + "learning_rate": 5.759057037959872e-05, + "loss": 0.6716, + "mean_token_accuracy": 0.8362010270357132, + "num_tokens": 984543638.0, + "step": 5556 + }, + { + "entropy": 0.7039717622101307, + "epoch": 1.7185739640404352, + "grad_norm": 0.1536424607038498, + "learning_rate": 5.7431969202099287e-05, + "loss": 0.6839, + "mean_token_accuracy": 0.8334637098014355, + "num_tokens": 986649156.0, + "step": 5568 + }, + { + "entropy": 0.7010701584319273, + "epoch": 1.7222779535457984, + "grad_norm": 0.1718849092721939, + "learning_rate": 5.727329151956225e-05, + "loss": 0.6789, + "mean_token_accuracy": 0.8351405846575896, + "num_tokens": 988730105.0, + "step": 5580 + }, + { + "entropy": 0.7192123532295227, + "epoch": 1.7259819430511614, + "grad_norm": 0.15781332552433014, + "learning_rate": 5.711453896542307e-05, + "loss": 0.6987, + "mean_token_accuracy": 0.8300843487183253, + "num_tokens": 990834861.0, + "step": 5592 + }, + { + "entropy": 0.7151969609161218, + "epoch": 1.7296859325565244, + "grad_norm": 0.15374763309955597, + "learning_rate": 5.695571317388794e-05, + "loss": 0.6951, + "mean_token_accuracy": 0.8305746080974737, + "num_tokens": 992957801.0, + "step": 5604 + }, + { + "entropy": 0.7098864167928696, + "epoch": 1.7333899220618876, + "grad_norm": 0.18953058123588562, + "learning_rate": 5.679681577991694e-05, + "loss": 0.6886, + "mean_token_accuracy": 0.8322906543811163, + "num_tokens": 995069535.0, + "step": 5616 + }, + { + "entropy": 0.6933085806667805, + "epoch": 1.7370939115672506, + "grad_norm": 0.16794349253177643, + "learning_rate": 5.6637848419207305e-05, + "loss": 0.674, + "mean_token_accuracy": 0.8359851129353046, + "num_tokens": 997197992.0, + "step": 5628 + }, + { + "entropy": 0.7013748064637184, + "epoch": 1.7407979010726136, + "grad_norm": 0.1656210571527481, + "learning_rate": 5.6478812728176435e-05, + "loss": 0.6782, + "mean_token_accuracy": 0.8345577555398146, + "num_tokens": 999326233.0, + "step": 5640 + }, + { + "entropy": 0.716081328690052, + "epoch": 1.7445018905779768, + "grad_norm": 0.16797126829624176, + "learning_rate": 5.631971034394515e-05, + "loss": 0.6944, + "mean_token_accuracy": 0.8314446931083997, + "num_tokens": 1001429533.0, + "step": 5652 + }, + { + "entropy": 0.7122478174666563, + "epoch": 1.7482058800833398, + "grad_norm": 0.167856827378273, + "learning_rate": 5.616054290432082e-05, + "loss": 0.6916, + "mean_token_accuracy": 0.8324559306104978, + "num_tokens": 1003576389.0, + "step": 5664 + }, + { + "entropy": 0.7033972653249899, + "epoch": 1.7519098695887028, + "grad_norm": 0.15445369482040405, + "learning_rate": 5.6001312047780486e-05, + "loss": 0.682, + "mean_token_accuracy": 0.8342355067531267, + "num_tokens": 1005709667.0, + "step": 5676 + }, + { + "entropy": 0.6920518179734548, + "epoch": 1.755613859094066, + "grad_norm": 0.1603931039571762, + "learning_rate": 5.584201941345402e-05, + "loss": 0.6723, + "mean_token_accuracy": 0.8359791164596876, + "num_tokens": 1007876576.0, + "step": 5688 + }, + { + "entropy": 0.7032434691985449, + "epoch": 1.759317848599429, + "grad_norm": 0.15814517438411713, + "learning_rate": 5.568266664110722e-05, + "loss": 0.6818, + "mean_token_accuracy": 0.8343645632266998, + "num_tokens": 1010006821.0, + "step": 5700 + }, + { + "entropy": 0.710328691949447, + "epoch": 1.763021838104792, + "grad_norm": 0.17391884326934814, + "learning_rate": 5.552325537112497e-05, + "loss": 0.6875, + "mean_token_accuracy": 0.833385648826758, + "num_tokens": 1012148154.0, + "step": 5712 + }, + { + "entropy": 0.7074959563712279, + "epoch": 1.7667258276101552, + "grad_norm": 0.1595250517129898, + "learning_rate": 5.53637872444943e-05, + "loss": 0.688, + "mean_token_accuracy": 0.8327213364342848, + "num_tokens": 1014260818.0, + "step": 5724 + }, + { + "entropy": 0.6988821178674698, + "epoch": 1.7704298171155182, + "grad_norm": 0.16727086901664734, + "learning_rate": 5.5204263902787564e-05, + "loss": 0.6771, + "mean_token_accuracy": 0.8350557560722033, + "num_tokens": 1016337244.0, + "step": 5736 + }, + { + "entropy": 0.7048179631431898, + "epoch": 1.7741338066208812, + "grad_norm": 0.16889260709285736, + "learning_rate": 5.504468698814548e-05, + "loss": 0.6846, + "mean_token_accuracy": 0.8338587942222754, + "num_tokens": 1018453592.0, + "step": 5748 + }, + { + "entropy": 0.7078387662768364, + "epoch": 1.7778377961262444, + "grad_norm": 0.16895486414432526, + "learning_rate": 5.4885058143260227e-05, + "loss": 0.6869, + "mean_token_accuracy": 0.832654087493817, + "num_tokens": 1020585383.0, + "step": 5760 + }, + { + "entropy": 0.6954544944067796, + "epoch": 1.7815417856316074, + "grad_norm": 0.1606086641550064, + "learning_rate": 5.47253790113586e-05, + "loss": 0.6744, + "mean_token_accuracy": 0.8354161145786444, + "num_tokens": 1022716006.0, + "step": 5772 + }, + { + "entropy": 0.6906781146923701, + "epoch": 1.7852457751369704, + "grad_norm": 0.1552080512046814, + "learning_rate": 5.456565123618503e-05, + "loss": 0.6685, + "mean_token_accuracy": 0.8372346547742685, + "num_tokens": 1024830212.0, + "step": 5784 + }, + { + "entropy": 0.7113202723364035, + "epoch": 1.7889497646423336, + "grad_norm": 0.16079287230968475, + "learning_rate": 5.440587646198469e-05, + "loss": 0.6915, + "mean_token_accuracy": 0.8320710522433122, + "num_tokens": 1026944068.0, + "step": 5796 + }, + { + "entropy": 0.6925230113168558, + "epoch": 1.7926537541476966, + "grad_norm": 0.1665734201669693, + "learning_rate": 5.424605633348655e-05, + "loss": 0.671, + "mean_token_accuracy": 0.8368867188692093, + "num_tokens": 1029077844.0, + "step": 5808 + }, + { + "entropy": 0.7204868520299593, + "epoch": 1.7963577436530596, + "grad_norm": 0.1540994644165039, + "learning_rate": 5.408619249588644e-05, + "loss": 0.6991, + "mean_token_accuracy": 0.8302427244683107, + "num_tokens": 1031208806.0, + "step": 5820 + }, + { + "entropy": 0.6986957266926765, + "epoch": 1.8000617331584228, + "grad_norm": 0.15892110764980316, + "learning_rate": 5.392628659483021e-05, + "loss": 0.6767, + "mean_token_accuracy": 0.8347699269652367, + "num_tokens": 1033383899.0, + "step": 5832 + }, + { + "entropy": 0.7196601120134195, + "epoch": 1.8037657226637858, + "grad_norm": 0.16495993733406067, + "learning_rate": 5.3766340276396646e-05, + "loss": 0.7003, + "mean_token_accuracy": 0.8301363748808702, + "num_tokens": 1035511527.0, + "step": 5844 + }, + { + "entropy": 0.7001743813355764, + "epoch": 1.8074697121691488, + "grad_norm": 0.16940170526504517, + "learning_rate": 5.3606355187080595e-05, + "loss": 0.6812, + "mean_token_accuracy": 0.8345416759451231, + "num_tokens": 1037619457.0, + "step": 5856 + }, + { + "entropy": 0.7112437374889851, + "epoch": 1.811173701674512, + "grad_norm": 0.1692638248205185, + "learning_rate": 5.344633297377604e-05, + "loss": 0.6891, + "mean_token_accuracy": 0.832543725768725, + "num_tokens": 1039734230.0, + "step": 5868 + }, + { + "entropy": 0.7141083081563314, + "epoch": 1.814877691179875, + "grad_norm": 0.15631647408008575, + "learning_rate": 5.328627528375909e-05, + "loss": 0.6911, + "mean_token_accuracy": 0.832301444063584, + "num_tokens": 1041893850.0, + "step": 5880 + }, + { + "entropy": 0.7041296027600765, + "epoch": 1.818581680685238, + "grad_norm": 0.16098694503307343, + "learning_rate": 5.312618376467111e-05, + "loss": 0.6836, + "mean_token_accuracy": 0.8333825121323267, + "num_tokens": 1044042558.0, + "step": 5892 + }, + { + "entropy": 0.691443515320619, + "epoch": 1.8222856701906012, + "grad_norm": 0.1699233502149582, + "learning_rate": 5.2966060064501645e-05, + "loss": 0.6704, + "mean_token_accuracy": 0.8364453675846258, + "num_tokens": 1046172232.0, + "step": 5904 + }, + { + "entropy": 0.7075358144938946, + "epoch": 1.8259896596959642, + "grad_norm": 0.1754993498325348, + "learning_rate": 5.280590583157152e-05, + "loss": 0.685, + "mean_token_accuracy": 0.8332440021137396, + "num_tokens": 1048284789.0, + "step": 5916 + }, + { + "entropy": 0.7079668007791042, + "epoch": 1.8296936492013272, + "grad_norm": 0.1584164947271347, + "learning_rate": 5.264572271451591e-05, + "loss": 0.6874, + "mean_token_accuracy": 0.8330427743494511, + "num_tokens": 1050431870.0, + "step": 5928 + }, + { + "entropy": 0.7003171257674694, + "epoch": 1.8333976387066904, + "grad_norm": 0.15958057343959808, + "learning_rate": 5.248551236226724e-05, + "loss": 0.6776, + "mean_token_accuracy": 0.8346174930532774, + "num_tokens": 1052556525.0, + "step": 5940 + }, + { + "entropy": 0.7011785494784514, + "epoch": 1.8371016282120534, + "grad_norm": 0.15794256329536438, + "learning_rate": 5.232527642403841e-05, + "loss": 0.6802, + "mean_token_accuracy": 0.834118609627088, + "num_tokens": 1054656122.0, + "step": 5952 + }, + { + "entropy": 0.7017668796082338, + "epoch": 1.8408056177174164, + "grad_norm": 0.17382527887821198, + "learning_rate": 5.216501654930561e-05, + "loss": 0.6835, + "mean_token_accuracy": 0.8327813086410364, + "num_tokens": 1056796572.0, + "step": 5964 + }, + { + "entropy": 0.7102235816419125, + "epoch": 1.8445096072227796, + "grad_norm": 0.16711148619651794, + "learning_rate": 5.200473438779146e-05, + "loss": 0.6892, + "mean_token_accuracy": 0.8322862597803274, + "num_tokens": 1058920924.0, + "step": 5976 + }, + { + "entropy": 0.6888363808393478, + "epoch": 1.8482135967281426, + "grad_norm": 0.1576377898454666, + "learning_rate": 5.1844431589448025e-05, + "loss": 0.6666, + "mean_token_accuracy": 0.8380541230241457, + "num_tokens": 1061076284.0, + "step": 5988 + }, + { + "entropy": 0.7103967430690924, + "epoch": 1.8519175862335056, + "grad_norm": 0.16509290039539337, + "learning_rate": 5.1684109804439774e-05, + "loss": 0.6879, + "mean_token_accuracy": 0.8324746452271938, + "num_tokens": 1063152002.0, + "step": 6000 + }, + { + "entropy": 0.6895517694453398, + "epoch": 1.8556215757388688, + "grad_norm": 0.17304864525794983, + "learning_rate": 5.152377068312665e-05, + "loss": 0.6677, + "mean_token_accuracy": 0.8372837752103806, + "num_tokens": 1065304104.0, + "step": 6012 + }, + { + "entropy": 0.7109543569386005, + "epoch": 1.8593255652442318, + "grad_norm": 0.16532959043979645, + "learning_rate": 5.1363415876047036e-05, + "loss": 0.6926, + "mean_token_accuracy": 0.8319398574531078, + "num_tokens": 1067404193.0, + "step": 6024 + }, + { + "entropy": 0.7061434425413609, + "epoch": 1.8630295547495948, + "grad_norm": 0.1592184454202652, + "learning_rate": 5.1203047033900806e-05, + "loss": 0.6851, + "mean_token_accuracy": 0.8332473436991373, + "num_tokens": 1069527530.0, + "step": 6036 + }, + { + "entropy": 0.6931216232478619, + "epoch": 1.866733544254958, + "grad_norm": 0.15798227488994598, + "learning_rate": 5.10426658075323e-05, + "loss": 0.6708, + "mean_token_accuracy": 0.8361308053135872, + "num_tokens": 1071666406.0, + "step": 6048 + }, + { + "entropy": 0.6993834227323532, + "epoch": 1.870437533760321, + "grad_norm": 0.16762305796146393, + "learning_rate": 5.088227384791332e-05, + "loss": 0.6798, + "mean_token_accuracy": 0.8347605802118778, + "num_tokens": 1073823299.0, + "step": 6060 + }, + { + "entropy": 0.6961913978060087, + "epoch": 1.874141523265684, + "grad_norm": 0.1665721982717514, + "learning_rate": 5.072187280612621e-05, + "loss": 0.6726, + "mean_token_accuracy": 0.8361902994414171, + "num_tokens": 1075944213.0, + "step": 6072 + }, + { + "entropy": 0.702531552563111, + "epoch": 1.8778455127710472, + "grad_norm": 0.16553518176078796, + "learning_rate": 5.056146433334676e-05, + "loss": 0.6804, + "mean_token_accuracy": 0.8344750516116619, + "num_tokens": 1078033274.0, + "step": 6084 + }, + { + "entropy": 0.7059743218123913, + "epoch": 1.8815495022764104, + "grad_norm": 0.15529991686344147, + "learning_rate": 5.0401050080827297e-05, + "loss": 0.6855, + "mean_token_accuracy": 0.8335259656111399, + "num_tokens": 1080157323.0, + "step": 6096 + }, + { + "entropy": 0.7024221407870451, + "epoch": 1.8852534917817731, + "grad_norm": 0.15887407958507538, + "learning_rate": 5.024063169987958e-05, + "loss": 0.6806, + "mean_token_accuracy": 0.8346465714275837, + "num_tokens": 1082289229.0, + "step": 6108 + }, + { + "entropy": 0.7151320551832517, + "epoch": 1.8889574812871364, + "grad_norm": 0.1628350168466568, + "learning_rate": 5.008021084185791e-05, + "loss": 0.6921, + "mean_token_accuracy": 0.8322754862407843, + "num_tokens": 1084403486.0, + "step": 6120 + }, + { + "entropy": 0.7036687669654688, + "epoch": 1.8926614707924996, + "grad_norm": 0.16994571685791016, + "learning_rate": 4.99197891581421e-05, + "loss": 0.6836, + "mean_token_accuracy": 0.8344013380507628, + "num_tokens": 1086516462.0, + "step": 6132 + }, + { + "entropy": 0.7015910123785337, + "epoch": 1.8963654602978623, + "grad_norm": 0.15207666158676147, + "learning_rate": 4.975936830012043e-05, + "loss": 0.681, + "mean_token_accuracy": 0.8345483938852946, + "num_tokens": 1088624801.0, + "step": 6144 + }, + { + "entropy": 0.7084349157909552, + "epoch": 1.9000694498032256, + "grad_norm": 0.15862888097763062, + "learning_rate": 4.9598949919172715e-05, + "loss": 0.6858, + "mean_token_accuracy": 0.8337940263251463, + "num_tokens": 1090798611.0, + "step": 6156 + }, + { + "entropy": 0.6984652889271578, + "epoch": 1.9037734393085888, + "grad_norm": 0.16308139264583588, + "learning_rate": 4.9438535666653236e-05, + "loss": 0.678, + "mean_token_accuracy": 0.8351361577709516, + "num_tokens": 1092940043.0, + "step": 6168 + }, + { + "entropy": 0.7053580147524675, + "epoch": 1.9074774288139515, + "grad_norm": 0.17202384769916534, + "learning_rate": 4.927812719387378e-05, + "loss": 0.6823, + "mean_token_accuracy": 0.8341307466228803, + "num_tokens": 1095068183.0, + "step": 6180 + }, + { + "entropy": 0.7106709815561771, + "epoch": 1.9111814183193148, + "grad_norm": 0.1545022875070572, + "learning_rate": 4.9117726152086694e-05, + "loss": 0.689, + "mean_token_accuracy": 0.8321913021306196, + "num_tokens": 1097208618.0, + "step": 6192 + }, + { + "entropy": 0.7134841966132323, + "epoch": 1.914885407824678, + "grad_norm": 0.17459839582443237, + "learning_rate": 4.895733419246772e-05, + "loss": 0.6918, + "mean_token_accuracy": 0.8316825156410536, + "num_tokens": 1099320273.0, + "step": 6204 + }, + { + "entropy": 0.7126207016408443, + "epoch": 1.9185893973300407, + "grad_norm": 0.1696072369813919, + "learning_rate": 4.879695296609921e-05, + "loss": 0.6921, + "mean_token_accuracy": 0.8322778431077799, + "num_tokens": 1101435574.0, + "step": 6216 + }, + { + "entropy": 0.6995201781392097, + "epoch": 1.922293386835404, + "grad_norm": 0.16167153418064117, + "learning_rate": 4.863658412395297e-05, + "loss": 0.6779, + "mean_token_accuracy": 0.835229920844237, + "num_tokens": 1103574826.0, + "step": 6228 + }, + { + "entropy": 0.7126905384163061, + "epoch": 1.9259973763407672, + "grad_norm": 0.16883228719234467, + "learning_rate": 4.847622931687336e-05, + "loss": 0.6917, + "mean_token_accuracy": 0.8323102233310541, + "num_tokens": 1105701043.0, + "step": 6240 + }, + { + "entropy": 0.7085224725306034, + "epoch": 1.92970136584613, + "grad_norm": 0.1775846779346466, + "learning_rate": 4.831589019556024e-05, + "loss": 0.6842, + "mean_token_accuracy": 0.8335540195306143, + "num_tokens": 1107798646.0, + "step": 6252 + }, + { + "entropy": 0.7003039928774039, + "epoch": 1.9334053553514932, + "grad_norm": 0.15765723586082458, + "learning_rate": 4.815556841055198e-05, + "loss": 0.6806, + "mean_token_accuracy": 0.8348556930820147, + "num_tokens": 1109913551.0, + "step": 6264 + }, + { + "entropy": 0.7019528361658255, + "epoch": 1.9371093448568564, + "grad_norm": 0.15830156207084656, + "learning_rate": 4.799526561220855e-05, + "loss": 0.6835, + "mean_token_accuracy": 0.8335538630684217, + "num_tokens": 1112019237.0, + "step": 6276 + }, + { + "entropy": 0.7016400955617428, + "epoch": 1.9408133343622191, + "grad_norm": 0.18349109590053558, + "learning_rate": 4.7834983450694405e-05, + "loss": 0.6786, + "mean_token_accuracy": 0.8345780223608017, + "num_tokens": 1114144126.0, + "step": 6288 + }, + { + "entropy": 0.7051151817043623, + "epoch": 1.9445173238675824, + "grad_norm": 0.160055011510849, + "learning_rate": 4.767472357596159e-05, + "loss": 0.6865, + "mean_token_accuracy": 0.8335471215347449, + "num_tokens": 1116255204.0, + "step": 6300 + }, + { + "entropy": 0.6837548911571503, + "epoch": 1.9482213133729456, + "grad_norm": 0.1652652770280838, + "learning_rate": 4.751448763773275e-05, + "loss": 0.6624, + "mean_token_accuracy": 0.8393045626580715, + "num_tokens": 1118373731.0, + "step": 6312 + }, + { + "entropy": 0.6986660932501157, + "epoch": 1.9519253028783086, + "grad_norm": 0.16915340721607208, + "learning_rate": 4.735427728548412e-05, + "loss": 0.6762, + "mean_token_accuracy": 0.8349108224113783, + "num_tokens": 1120485114.0, + "step": 6324 + }, + { + "entropy": 0.7144566103816032, + "epoch": 1.9556292923836716, + "grad_norm": 0.16116300225257874, + "learning_rate": 4.7194094168428496e-05, + "loss": 0.6928, + "mean_token_accuracy": 0.8316774268945059, + "num_tokens": 1122622853.0, + "step": 6336 + }, + { + "entropy": 0.7057266781727473, + "epoch": 1.9593332818890348, + "grad_norm": 0.16374769806861877, + "learning_rate": 4.7033939935498366e-05, + "loss": 0.6834, + "mean_token_accuracy": 0.8338124553362528, + "num_tokens": 1124740405.0, + "step": 6348 + }, + { + "entropy": 0.7115621020396551, + "epoch": 1.9630372713943978, + "grad_norm": 0.17385482788085938, + "learning_rate": 4.6873816235328896e-05, + "loss": 0.6891, + "mean_token_accuracy": 0.8324099443852901, + "num_tokens": 1126901966.0, + "step": 6360 + }, + { + "entropy": 0.6988142629464468, + "epoch": 1.9667412608997608, + "grad_norm": 0.17653614282608032, + "learning_rate": 4.6713724716240915e-05, + "loss": 0.6779, + "mean_token_accuracy": 0.8349249089757601, + "num_tokens": 1128993855.0, + "step": 6372 + }, + { + "entropy": 0.7140787703295549, + "epoch": 1.970445250405124, + "grad_norm": 0.1632198989391327, + "learning_rate": 4.6553667026223975e-05, + "loss": 0.6932, + "mean_token_accuracy": 0.8312925559779009, + "num_tokens": 1131129546.0, + "step": 6384 + }, + { + "entropy": 0.7079446117083231, + "epoch": 1.974149239910487, + "grad_norm": 0.16609056293964386, + "learning_rate": 4.6393644812919416e-05, + "loss": 0.6874, + "mean_token_accuracy": 0.8327439626057943, + "num_tokens": 1133241525.0, + "step": 6396 + }, + { + "entropy": 0.6874182323614756, + "epoch": 1.97785322941585, + "grad_norm": 0.15895400941371918, + "learning_rate": 4.623365972360337e-05, + "loss": 0.6678, + "mean_token_accuracy": 0.8374094466368357, + "num_tokens": 1135375918.0, + "step": 6408 + }, + { + "entropy": 0.7057366843024889, + "epoch": 1.9815572189212132, + "grad_norm": 0.17353583872318268, + "learning_rate": 4.607371340516979e-05, + "loss": 0.6828, + "mean_token_accuracy": 0.8338442370295525, + "num_tokens": 1137469495.0, + "step": 6420 + }, + { + "entropy": 0.6886672539015611, + "epoch": 1.9852612084265762, + "grad_norm": 0.1662573218345642, + "learning_rate": 4.5913807504113556e-05, + "loss": 0.6675, + "mean_token_accuracy": 0.8377264539400736, + "num_tokens": 1139593432.0, + "step": 6432 + }, + { + "entropy": 0.692137303451697, + "epoch": 1.9889651979319392, + "grad_norm": 0.18068836629390717, + "learning_rate": 4.5753943666513455e-05, + "loss": 0.6698, + "mean_token_accuracy": 0.8362124636769295, + "num_tokens": 1141753650.0, + "step": 6444 + }, + { + "entropy": 0.697237261881431, + "epoch": 1.9926691874373024, + "grad_norm": 0.1671893298625946, + "learning_rate": 4.559412353801533e-05, + "loss": 0.6768, + "mean_token_accuracy": 0.8347001535197099, + "num_tokens": 1143892293.0, + "step": 6456 + }, + { + "entropy": 0.704800813148419, + "epoch": 1.9963731769426654, + "grad_norm": 0.1634693741798401, + "learning_rate": 4.543434876381497e-05, + "loss": 0.6824, + "mean_token_accuracy": 0.833663385361433, + "num_tokens": 1145995636.0, + "step": 6468 + }, + { + "entropy": 0.6973251129718537, + "epoch": 2.0, + "grad_norm": 0.1913071721792221, + "learning_rate": 4.52746209886414e-05, + "loss": 0.6774, + "mean_token_accuracy": 0.8352246829803954, + "num_tokens": 1148075160.0, + "step": 6480 + }, + { + "entropy": 0.6958606342474619, + "epoch": 2.003703989505363, + "grad_norm": 0.15933266282081604, + "learning_rate": 4.5114941856739785e-05, + "loss": 0.6727, + "mean_token_accuracy": 0.8364654754598936, + "num_tokens": 1150241776.0, + "step": 6492 + }, + { + "entropy": 0.6805713040133318, + "epoch": 2.007407979010726, + "grad_norm": 0.17767716944217682, + "learning_rate": 4.495531301185453e-05, + "loss": 0.6577, + "mean_token_accuracy": 0.8394694328308105, + "num_tokens": 1152398185.0, + "step": 6504 + }, + { + "entropy": 0.6998436215023199, + "epoch": 2.011111968516089, + "grad_norm": 0.176164448261261, + "learning_rate": 4.479573609721244e-05, + "loss": 0.678, + "mean_token_accuracy": 0.8348252177238464, + "num_tokens": 1154508126.0, + "step": 6516 + }, + { + "entropy": 0.6891536228358746, + "epoch": 2.0148159580214524, + "grad_norm": 0.16503383219242096, + "learning_rate": 4.463621275550571e-05, + "loss": 0.6645, + "mean_token_accuracy": 0.8373865385850271, + "num_tokens": 1156620964.0, + "step": 6528 + }, + { + "entropy": 0.6943193326393763, + "epoch": 2.018519947526815, + "grad_norm": 0.17573916912078857, + "learning_rate": 4.447674462887503e-05, + "loss": 0.6714, + "mean_token_accuracy": 0.836496909459432, + "num_tokens": 1158754680.0, + "step": 6540 + }, + { + "entropy": 0.692933322240909, + "epoch": 2.0222239370321784, + "grad_norm": 0.16031506657600403, + "learning_rate": 4.431733335889278e-05, + "loss": 0.6682, + "mean_token_accuracy": 0.8367396021882693, + "num_tokens": 1160874718.0, + "step": 6552 + }, + { + "entropy": 0.6829059620698293, + "epoch": 2.0259279265375416, + "grad_norm": 0.16216310858726501, + "learning_rate": 4.4157980586545974e-05, + "loss": 0.6582, + "mean_token_accuracy": 0.8389946781098843, + "num_tokens": 1163031354.0, + "step": 6564 + }, + { + "entropy": 0.6946393934388956, + "epoch": 2.0296319160429044, + "grad_norm": 0.18271124362945557, + "learning_rate": 4.399868795221951e-05, + "loss": 0.6707, + "mean_token_accuracy": 0.8366646692156792, + "num_tokens": 1165145296.0, + "step": 6576 + }, + { + "entropy": 0.6902418856819471, + "epoch": 2.0333359055482676, + "grad_norm": 0.15688392519950867, + "learning_rate": 4.383945709567919e-05, + "loss": 0.6682, + "mean_token_accuracy": 0.8372057154774666, + "num_tokens": 1167269330.0, + "step": 6588 + }, + { + "entropy": 0.6856662034988403, + "epoch": 2.037039895053631, + "grad_norm": 0.16447409987449646, + "learning_rate": 4.368028965605486e-05, + "loss": 0.6626, + "mean_token_accuracy": 0.8383788081506888, + "num_tokens": 1169406342.0, + "step": 6600 + }, + { + "entropy": 0.6875134222209454, + "epoch": 2.0407438845589936, + "grad_norm": 0.1646708846092224, + "learning_rate": 4.352118727182358e-05, + "loss": 0.663, + "mean_token_accuracy": 0.8381359900037447, + "num_tokens": 1171551058.0, + "step": 6612 + }, + { + "entropy": 0.6901310036579767, + "epoch": 2.044447874064357, + "grad_norm": 0.17784608900547028, + "learning_rate": 4.3362151580792707e-05, + "loss": 0.6689, + "mean_token_accuracy": 0.836744820078214, + "num_tokens": 1173700001.0, + "step": 6624 + }, + { + "entropy": 0.688694529235363, + "epoch": 2.04815186356972, + "grad_norm": 0.16807569563388824, + "learning_rate": 4.320318422008306e-05, + "loss": 0.6639, + "mean_token_accuracy": 0.838087435811758, + "num_tokens": 1175826029.0, + "step": 6636 + }, + { + "entropy": 0.6816125065088272, + "epoch": 2.051855853075083, + "grad_norm": 0.16541939973831177, + "learning_rate": 4.304428682611208e-05, + "loss": 0.659, + "mean_token_accuracy": 0.8384276715417703, + "num_tokens": 1177964197.0, + "step": 6648 + }, + { + "entropy": 0.6754038743674755, + "epoch": 2.055559842580446, + "grad_norm": 0.1831735372543335, + "learning_rate": 4.288546103457694e-05, + "loss": 0.6514, + "mean_token_accuracy": 0.8410420827567577, + "num_tokens": 1180073560.0, + "step": 6660 + }, + { + "entropy": 0.6953004685540994, + "epoch": 2.059263832085809, + "grad_norm": 0.16756285727024078, + "learning_rate": 4.272670848043776e-05, + "loss": 0.6715, + "mean_token_accuracy": 0.836195650200049, + "num_tokens": 1182227884.0, + "step": 6672 + }, + { + "entropy": 0.6704849277933439, + "epoch": 2.062967821591172, + "grad_norm": 0.16364024579524994, + "learning_rate": 4.256803079790071e-05, + "loss": 0.6504, + "mean_token_accuracy": 0.8407212706903616, + "num_tokens": 1184338894.0, + "step": 6684 + }, + { + "entropy": 0.6869217567145824, + "epoch": 2.066671811096535, + "grad_norm": 0.17596305906772614, + "learning_rate": 4.240942962040128e-05, + "loss": 0.6662, + "mean_token_accuracy": 0.8376976748307546, + "num_tokens": 1186473192.0, + "step": 6696 + }, + { + "entropy": 0.6874274127185345, + "epoch": 2.0703758006018984, + "grad_norm": 0.168784037232399, + "learning_rate": 4.22509065805874e-05, + "loss": 0.6635, + "mean_token_accuracy": 0.8380283936858177, + "num_tokens": 1188591972.0, + "step": 6708 + }, + { + "entropy": 0.6954148449003696, + "epoch": 2.074079790107261, + "grad_norm": 0.18569393455982208, + "learning_rate": 4.20924633103026e-05, + "loss": 0.6734, + "mean_token_accuracy": 0.8357961600025495, + "num_tokens": 1190735606.0, + "step": 6720 + }, + { + "entropy": 0.6922590111692747, + "epoch": 2.0777837796126244, + "grad_norm": 0.15867288410663605, + "learning_rate": 4.19341014405693e-05, + "loss": 0.6663, + "mean_token_accuracy": 0.8377314632137617, + "num_tokens": 1192850698.0, + "step": 6732 + }, + { + "entropy": 0.6765066559116045, + "epoch": 2.0814877691179876, + "grad_norm": 0.17693735659122467, + "learning_rate": 4.1775822601572e-05, + "loss": 0.6527, + "mean_token_accuracy": 0.8403165886799494, + "num_tokens": 1194969343.0, + "step": 6744 + }, + { + "entropy": 0.6881164262692133, + "epoch": 2.0851917586233504, + "grad_norm": 0.17512989044189453, + "learning_rate": 4.1617628422640434e-05, + "loss": 0.6631, + "mean_token_accuracy": 0.8381856555740038, + "num_tokens": 1197090640.0, + "step": 6756 + }, + { + "entropy": 0.6754906934996446, + "epoch": 2.0888957481287136, + "grad_norm": 0.17044971883296967, + "learning_rate": 4.145952053223284e-05, + "loss": 0.6526, + "mean_token_accuracy": 0.8403135687112808, + "num_tokens": 1199225773.0, + "step": 6768 + }, + { + "entropy": 0.6851841695606709, + "epoch": 2.092599737634077, + "grad_norm": 0.16464471817016602, + "learning_rate": 4.130150055791926e-05, + "loss": 0.6623, + "mean_token_accuracy": 0.8387955824534098, + "num_tokens": 1201348698.0, + "step": 6780 + }, + { + "entropy": 0.6839038406809171, + "epoch": 2.0963037271394396, + "grad_norm": 0.17445310950279236, + "learning_rate": 4.114357012636465e-05, + "loss": 0.6618, + "mean_token_accuracy": 0.8389583652218183, + "num_tokens": 1203501839.0, + "step": 6792 + }, + { + "entropy": 0.6837594596048197, + "epoch": 2.100007716644803, + "grad_norm": 0.16313707828521729, + "learning_rate": 4.0985730863312234e-05, + "loss": 0.6668, + "mean_token_accuracy": 0.8375067251423994, + "num_tokens": 1205659039.0, + "step": 6804 + }, + { + "entropy": 0.6823192238807678, + "epoch": 2.103711706150166, + "grad_norm": 0.18319004774093628, + "learning_rate": 4.082798439356677e-05, + "loss": 0.6571, + "mean_token_accuracy": 0.8394167373577753, + "num_tokens": 1207800655.0, + "step": 6816 + }, + { + "entropy": 0.6888291624685129, + "epoch": 2.1074156956555288, + "grad_norm": 0.16149847209453583, + "learning_rate": 4.0670332340977765e-05, + "loss": 0.6669, + "mean_token_accuracy": 0.8372188744445642, + "num_tokens": 1209942817.0, + "step": 6828 + }, + { + "entropy": 0.683031198879083, + "epoch": 2.111119685160892, + "grad_norm": 0.182169571518898, + "learning_rate": 4.051277632842281e-05, + "loss": 0.6586, + "mean_token_accuracy": 0.8390312877794107, + "num_tokens": 1212050353.0, + "step": 6840 + }, + { + "entropy": 0.6798936774333318, + "epoch": 2.114823674666255, + "grad_norm": 0.16732676327228546, + "learning_rate": 4.0355317977790815e-05, + "loss": 0.6555, + "mean_token_accuracy": 0.8397704648474852, + "num_tokens": 1214180248.0, + "step": 6852 + }, + { + "entropy": 0.6978944800794125, + "epoch": 2.118527664171618, + "grad_norm": 0.16686294972896576, + "learning_rate": 4.019795890996543e-05, + "loss": 0.6753, + "mean_token_accuracy": 0.8350128419697285, + "num_tokens": 1216254353.0, + "step": 6864 + }, + { + "entropy": 0.6932212188839912, + "epoch": 2.122231653676981, + "grad_norm": 0.1676548272371292, + "learning_rate": 4.0040700744808204e-05, + "loss": 0.6702, + "mean_token_accuracy": 0.8364811092615128, + "num_tokens": 1218387247.0, + "step": 6876 + }, + { + "entropy": 0.6828142193456491, + "epoch": 2.1259356431823444, + "grad_norm": 0.16696545481681824, + "learning_rate": 3.988354510114203e-05, + "loss": 0.6589, + "mean_token_accuracy": 0.8394288209577402, + "num_tokens": 1220553394.0, + "step": 6888 + }, + { + "entropy": 0.6806088127195835, + "epoch": 2.1296396326877076, + "grad_norm": 0.1975679099559784, + "learning_rate": 3.972649359673446e-05, + "loss": 0.6584, + "mean_token_accuracy": 0.8398889203866323, + "num_tokens": 1222724624.0, + "step": 6900 + }, + { + "entropy": 0.6907341443002224, + "epoch": 2.1333436221930704, + "grad_norm": 0.16510289907455444, + "learning_rate": 3.956954784828098e-05, + "loss": 0.6697, + "mean_token_accuracy": 0.836446632941564, + "num_tokens": 1224844469.0, + "step": 6912 + }, + { + "entropy": 0.6722525469958782, + "epoch": 2.1370476116984336, + "grad_norm": 0.17981542646884918, + "learning_rate": 3.9412709471388446e-05, + "loss": 0.6504, + "mean_token_accuracy": 0.8412780488530794, + "num_tokens": 1226985482.0, + "step": 6924 + }, + { + "entropy": 0.6808725881079832, + "epoch": 2.140751601203797, + "grad_norm": 0.17366532981395721, + "learning_rate": 3.925598008055844e-05, + "loss": 0.6588, + "mean_token_accuracy": 0.8392424657940865, + "num_tokens": 1229092079.0, + "step": 6936 + }, + { + "entropy": 0.6670869911710421, + "epoch": 2.1444555907091596, + "grad_norm": 0.1617341935634613, + "learning_rate": 3.90993612891706e-05, + "loss": 0.6455, + "mean_token_accuracy": 0.842083066701889, + "num_tokens": 1231212442.0, + "step": 6948 + }, + { + "entropy": 0.6818137591083845, + "epoch": 2.148159580214523, + "grad_norm": 0.16834302246570587, + "learning_rate": 3.894285470946609e-05, + "loss": 0.6584, + "mean_token_accuracy": 0.8394963393608729, + "num_tokens": 1233347325.0, + "step": 6960 + }, + { + "entropy": 0.6732950036724409, + "epoch": 2.151863569719886, + "grad_norm": 0.1831921488046646, + "learning_rate": 3.878646195253095e-05, + "loss": 0.649, + "mean_token_accuracy": 0.8414178465803465, + "num_tokens": 1235473249.0, + "step": 6972 + }, + { + "entropy": 0.6829912650088469, + "epoch": 2.155567559225249, + "grad_norm": 0.17452828586101532, + "learning_rate": 3.863018462827948e-05, + "loss": 0.6606, + "mean_token_accuracy": 0.8385308894018332, + "num_tokens": 1237606878.0, + "step": 6984 + }, + { + "entropy": 0.6933281992872556, + "epoch": 2.159271548730612, + "grad_norm": 0.1628619283437729, + "learning_rate": 3.847402434543777e-05, + "loss": 0.6721, + "mean_token_accuracy": 0.8357371079425017, + "num_tokens": 1239737890.0, + "step": 6996 + }, + { + "entropy": 0.6939899697899818, + "epoch": 2.162975538235975, + "grad_norm": 0.1830848604440689, + "learning_rate": 3.831798271152704e-05, + "loss": 0.6715, + "mean_token_accuracy": 0.8364391488333544, + "num_tokens": 1241820691.0, + "step": 7008 + }, + { + "entropy": 0.6830517227451006, + "epoch": 2.166679527741338, + "grad_norm": 0.1746508777141571, + "learning_rate": 3.816206133284716e-05, + "loss": 0.6595, + "mean_token_accuracy": 0.8387770056724548, + "num_tokens": 1243913932.0, + "step": 7020 + }, + { + "entropy": 0.6800741304953893, + "epoch": 2.170383517246701, + "grad_norm": 0.17165830731391907, + "learning_rate": 3.800626181446007e-05, + "loss": 0.6583, + "mean_token_accuracy": 0.8394532613456249, + "num_tokens": 1246054747.0, + "step": 7032 + }, + { + "entropy": 0.6898703599969546, + "epoch": 2.1740875067520644, + "grad_norm": 0.17083138227462769, + "learning_rate": 3.7850585760173275e-05, + "loss": 0.6656, + "mean_token_accuracy": 0.837770772476991, + "num_tokens": 1248153486.0, + "step": 7044 + }, + { + "entropy": 0.6814120809237162, + "epoch": 2.177791496257427, + "grad_norm": 0.16952723264694214, + "learning_rate": 3.769503477252335e-05, + "loss": 0.6586, + "mean_token_accuracy": 0.8393961998323599, + "num_tokens": 1250281780.0, + "step": 7056 + }, + { + "entropy": 0.6871961392462254, + "epoch": 2.1814954857627904, + "grad_norm": 0.16225868463516235, + "learning_rate": 3.75396104527594e-05, + "loss": 0.6635, + "mean_token_accuracy": 0.8381969705224037, + "num_tokens": 1252424662.0, + "step": 7068 + }, + { + "entropy": 0.6821385646859804, + "epoch": 2.1851994752681536, + "grad_norm": 0.16517527401447296, + "learning_rate": 3.738431440082665e-05, + "loss": 0.6593, + "mean_token_accuracy": 0.839236855506897, + "num_tokens": 1254557921.0, + "step": 7080 + }, + { + "entropy": 0.6806115495661894, + "epoch": 2.1889034647735164, + "grad_norm": 0.1751754730939865, + "learning_rate": 3.7229148215349884e-05, + "loss": 0.6595, + "mean_token_accuracy": 0.8389156634608904, + "num_tokens": 1256677173.0, + "step": 7092 + }, + { + "entropy": 0.6796896619101366, + "epoch": 2.1926074542788796, + "grad_norm": 0.18694189190864563, + "learning_rate": 3.707411349361705e-05, + "loss": 0.6551, + "mean_token_accuracy": 0.8396575612326463, + "num_tokens": 1258777752.0, + "step": 7104 + }, + { + "entropy": 0.6812881293396155, + "epoch": 2.196311443784243, + "grad_norm": 0.1656647026538849, + "learning_rate": 3.6919211831562806e-05, + "loss": 0.6573, + "mean_token_accuracy": 0.8394532203674316, + "num_tokens": 1260896416.0, + "step": 7116 + }, + { + "entropy": 0.6926145292818546, + "epoch": 2.2000154332896056, + "grad_norm": 0.1721578985452652, + "learning_rate": 3.676444482375208e-05, + "loss": 0.6692, + "mean_token_accuracy": 0.836610042800506, + "num_tokens": 1263001893.0, + "step": 7128 + }, + { + "entropy": 0.698531299829483, + "epoch": 2.203719422794969, + "grad_norm": 0.16790412366390228, + "learning_rate": 3.660981406336369e-05, + "loss": 0.6751, + "mean_token_accuracy": 0.8358333731691042, + "num_tokens": 1265170858.0, + "step": 7140 + }, + { + "entropy": 0.6893511265516281, + "epoch": 2.207423412300332, + "grad_norm": 0.17855829000473022, + "learning_rate": 3.645532114217387e-05, + "loss": 0.667, + "mean_token_accuracy": 0.837383933365345, + "num_tokens": 1267306591.0, + "step": 7152 + }, + { + "entropy": 0.6948994646469752, + "epoch": 2.211127401805695, + "grad_norm": 0.17504854500293732, + "learning_rate": 3.630096765053997e-05, + "loss": 0.6735, + "mean_token_accuracy": 0.8362486995756626, + "num_tokens": 1269448580.0, + "step": 7164 + }, + { + "entropy": 0.6904158741235733, + "epoch": 2.214831391311058, + "grad_norm": 0.16738073527812958, + "learning_rate": 3.614675517738405e-05, + "loss": 0.6678, + "mean_token_accuracy": 0.8369380322595438, + "num_tokens": 1271571840.0, + "step": 7176 + }, + { + "entropy": 0.6801648748417696, + "epoch": 2.218535380816421, + "grad_norm": 0.18572407960891724, + "learning_rate": 3.599268531017646e-05, + "loss": 0.658, + "mean_token_accuracy": 0.8388838407893976, + "num_tokens": 1273701179.0, + "step": 7188 + }, + { + "entropy": 0.6940938420593739, + "epoch": 2.222239370321784, + "grad_norm": 0.1790064424276352, + "learning_rate": 3.583875963491964e-05, + "loss": 0.6714, + "mean_token_accuracy": 0.8359104084471861, + "num_tokens": 1275846470.0, + "step": 7200 + }, + { + "entropy": 0.685243288675944, + "epoch": 2.225943359827147, + "grad_norm": 0.17031192779541016, + "learning_rate": 3.568497973613166e-05, + "loss": 0.6625, + "mean_token_accuracy": 0.8384867298106352, + "num_tokens": 1277981074.0, + "step": 7212 + }, + { + "entropy": 0.6832674543062845, + "epoch": 2.2296473493325104, + "grad_norm": 0.17609767615795135, + "learning_rate": 3.5531347196829964e-05, + "loss": 0.6602, + "mean_token_accuracy": 0.8384842636684576, + "num_tokens": 1280123692.0, + "step": 7224 + }, + { + "entropy": 0.6736332066357136, + "epoch": 2.233351338837873, + "grad_norm": 0.18072204291820526, + "learning_rate": 3.537786359851506e-05, + "loss": 0.6482, + "mean_token_accuracy": 0.8411118065317472, + "num_tokens": 1282280311.0, + "step": 7236 + }, + { + "entropy": 0.6906834666927656, + "epoch": 2.2370553283432364, + "grad_norm": 0.16962793469429016, + "learning_rate": 3.52245305211543e-05, + "loss": 0.669, + "mean_token_accuracy": 0.8369547414282957, + "num_tokens": 1284394136.0, + "step": 7248 + }, + { + "entropy": 0.681410993138949, + "epoch": 2.2407593178485996, + "grad_norm": 0.1630384773015976, + "learning_rate": 3.5071349543165513e-05, + "loss": 0.6591, + "mean_token_accuracy": 0.8389986592034498, + "num_tokens": 1286525608.0, + "step": 7260 + }, + { + "entropy": 0.6817056524256865, + "epoch": 2.2444633073539624, + "grad_norm": 0.17839214205741882, + "learning_rate": 3.491832224140083e-05, + "loss": 0.6592, + "mean_token_accuracy": 0.8388963937759399, + "num_tokens": 1288627106.0, + "step": 7272 + }, + { + "entropy": 0.6757811804612478, + "epoch": 2.2481672968593256, + "grad_norm": 0.17169944941997528, + "learning_rate": 3.476545019113046e-05, + "loss": 0.6562, + "mean_token_accuracy": 0.8402783883114656, + "num_tokens": 1290730620.0, + "step": 7284 + }, + { + "entropy": 0.6782712750136852, + "epoch": 2.251871286364689, + "grad_norm": 0.17680226266384125, + "learning_rate": 3.4612734966026387e-05, + "loss": 0.6531, + "mean_token_accuracy": 0.8399842207630476, + "num_tokens": 1292839997.0, + "step": 7296 + }, + { + "entropy": 0.6741428785026073, + "epoch": 2.2555752758700516, + "grad_norm": 0.1740843504667282, + "learning_rate": 3.446017813814627e-05, + "loss": 0.6539, + "mean_token_accuracy": 0.8401829078793526, + "num_tokens": 1294968879.0, + "step": 7308 + }, + { + "entropy": 0.6835886935393015, + "epoch": 2.259279265375415, + "grad_norm": 0.16604800522327423, + "learning_rate": 3.430778127791723e-05, + "loss": 0.6604, + "mean_token_accuracy": 0.8385746528704962, + "num_tokens": 1297112365.0, + "step": 7320 + }, + { + "entropy": 0.6774425928791364, + "epoch": 2.262983254880778, + "grad_norm": 0.16330775618553162, + "learning_rate": 3.415554595411963e-05, + "loss": 0.6549, + "mean_token_accuracy": 0.8393272707859675, + "num_tokens": 1299263272.0, + "step": 7332 + }, + { + "entropy": 0.6806386026243368, + "epoch": 2.2666872443861408, + "grad_norm": 0.17268003523349762, + "learning_rate": 3.400347373387099e-05, + "loss": 0.6591, + "mean_token_accuracy": 0.839379072189331, + "num_tokens": 1301389980.0, + "step": 7344 + }, + { + "entropy": 0.6829134225845337, + "epoch": 2.270391233891504, + "grad_norm": 0.17173625528812408, + "learning_rate": 3.385156618260988e-05, + "loss": 0.6615, + "mean_token_accuracy": 0.8388151476780573, + "num_tokens": 1303525932.0, + "step": 7356 + }, + { + "entropy": 0.6792828490336736, + "epoch": 2.274095223396867, + "grad_norm": 0.1721930056810379, + "learning_rate": 3.369982486407968e-05, + "loss": 0.6588, + "mean_token_accuracy": 0.8391289735833803, + "num_tokens": 1305664388.0, + "step": 7368 + }, + { + "entropy": 0.6927899805208048, + "epoch": 2.27779921290223, + "grad_norm": 0.17168404161930084, + "learning_rate": 3.354825134031261e-05, + "loss": 0.6682, + "mean_token_accuracy": 0.8373626172542572, + "num_tokens": 1307780556.0, + "step": 7380 + }, + { + "entropy": 0.6774941322704157, + "epoch": 2.281503202407593, + "grad_norm": 0.1669553965330124, + "learning_rate": 3.339684717161358e-05, + "loss": 0.6543, + "mean_token_accuracy": 0.8401380938788255, + "num_tokens": 1309926320.0, + "step": 7392 + }, + { + "entropy": 0.6856278690199057, + "epoch": 2.2852071919129564, + "grad_norm": 0.18108035624027252, + "learning_rate": 3.3245613916544186e-05, + "loss": 0.6643, + "mean_token_accuracy": 0.8376220539212227, + "num_tokens": 1312077047.0, + "step": 7404 + }, + { + "entropy": 0.6788114123046398, + "epoch": 2.288911181418319, + "grad_norm": 0.1684533655643463, + "learning_rate": 3.30945531319066e-05, + "loss": 0.6553, + "mean_token_accuracy": 0.8395294894774755, + "num_tokens": 1314210244.0, + "step": 7416 + }, + { + "entropy": 0.6847534601887068, + "epoch": 2.2926151709236824, + "grad_norm": 0.17420773208141327, + "learning_rate": 3.294366637272758e-05, + "loss": 0.664, + "mean_token_accuracy": 0.8375212388734022, + "num_tokens": 1316332813.0, + "step": 7428 + }, + { + "entropy": 0.67829688017567, + "epoch": 2.2963191604290456, + "grad_norm": 0.17259854078292847, + "learning_rate": 3.279295519224248e-05, + "loss": 0.6552, + "mean_token_accuracy": 0.840360164642334, + "num_tokens": 1318446966.0, + "step": 7440 + }, + { + "entropy": 0.682950338969628, + "epoch": 2.3000231499344084, + "grad_norm": 0.17080390453338623, + "learning_rate": 3.26424211418792e-05, + "loss": 0.6622, + "mean_token_accuracy": 0.838694820801417, + "num_tokens": 1320578326.0, + "step": 7452 + }, + { + "entropy": 0.6896775799492995, + "epoch": 2.3037271394397716, + "grad_norm": 0.17524172365665436, + "learning_rate": 3.2492065771242294e-05, + "loss": 0.6665, + "mean_token_accuracy": 0.8372247604032358, + "num_tokens": 1322730718.0, + "step": 7464 + }, + { + "entropy": 0.6799197805424532, + "epoch": 2.307431128945135, + "grad_norm": 0.17451880872249603, + "learning_rate": 3.234189062809695e-05, + "loss": 0.6573, + "mean_token_accuracy": 0.8392599709331989, + "num_tokens": 1324847293.0, + "step": 7476 + }, + { + "entropy": 0.667460153500239, + "epoch": 2.3111351184504976, + "grad_norm": 0.18068212270736694, + "learning_rate": 3.219189725835311e-05, + "loss": 0.6471, + "mean_token_accuracy": 0.841596515228351, + "num_tokens": 1326958062.0, + "step": 7488 + }, + { + "entropy": 0.6783041432499886, + "epoch": 2.314839107955861, + "grad_norm": 0.18286266922950745, + "learning_rate": 3.20420872060495e-05, + "loss": 0.6564, + "mean_token_accuracy": 0.8399129547178745, + "num_tokens": 1329086141.0, + "step": 7500 + }, + { + "entropy": 0.6802773574988047, + "epoch": 2.318543097461224, + "grad_norm": 0.17960713803768158, + "learning_rate": 3.189246201333778e-05, + "loss": 0.6568, + "mean_token_accuracy": 0.8396291807293892, + "num_tokens": 1331208721.0, + "step": 7512 + }, + { + "entropy": 0.6929339257379373, + "epoch": 2.3222470869665868, + "grad_norm": 0.1749773770570755, + "learning_rate": 3.174302322046669e-05, + "loss": 0.6688, + "mean_token_accuracy": 0.8368431739509106, + "num_tokens": 1333333100.0, + "step": 7524 + }, + { + "entropy": 0.6909980537990729, + "epoch": 2.32595107647195, + "grad_norm": 0.16345363855361938, + "learning_rate": 3.1593772365766105e-05, + "loss": 0.668, + "mean_token_accuracy": 0.8363487422466278, + "num_tokens": 1335449123.0, + "step": 7536 + }, + { + "entropy": 0.6702785243590673, + "epoch": 2.329655065977313, + "grad_norm": 0.17578427493572235, + "learning_rate": 3.144471098563131e-05, + "loss": 0.6476, + "mean_token_accuracy": 0.8414431102573872, + "num_tokens": 1337603503.0, + "step": 7548 + }, + { + "entropy": 0.6757986781497797, + "epoch": 2.333359055482676, + "grad_norm": 0.17358356714248657, + "learning_rate": 3.129584061450709e-05, + "loss": 0.6545, + "mean_token_accuracy": 0.8400241335233053, + "num_tokens": 1339719225.0, + "step": 7560 + }, + { + "entropy": 0.681422288219134, + "epoch": 2.337063044988039, + "grad_norm": 0.18540477752685547, + "learning_rate": 3.1147162784871986e-05, + "loss": 0.6578, + "mean_token_accuracy": 0.8393968902528286, + "num_tokens": 1341867232.0, + "step": 7572 + }, + { + "entropy": 0.6797411205867926, + "epoch": 2.3407670344934024, + "grad_norm": 0.17587818205356598, + "learning_rate": 3.0998679027222535e-05, + "loss": 0.6608, + "mean_token_accuracy": 0.8387786311407884, + "num_tokens": 1343972966.0, + "step": 7584 + }, + { + "entropy": 0.676243698845307, + "epoch": 2.344471023998765, + "grad_norm": 0.1766098141670227, + "learning_rate": 3.0850390870057435e-05, + "loss": 0.6539, + "mean_token_accuracy": 0.8406179261704286, + "num_tokens": 1346124725.0, + "step": 7596 + }, + { + "entropy": 0.683370524396499, + "epoch": 2.3481750135041284, + "grad_norm": 0.18542231619358063, + "learning_rate": 3.0702299839861915e-05, + "loss": 0.6598, + "mean_token_accuracy": 0.8393504520257314, + "num_tokens": 1348237486.0, + "step": 7608 + }, + { + "entropy": 0.6867342926561832, + "epoch": 2.3518790030094916, + "grad_norm": 0.17579790949821472, + "learning_rate": 3.0554407461091925e-05, + "loss": 0.6611, + "mean_token_accuracy": 0.8385164514183998, + "num_tokens": 1350337121.0, + "step": 7620 + }, + { + "entropy": 0.682259933402141, + "epoch": 2.3555829925148544, + "grad_norm": 0.1693212389945984, + "learning_rate": 3.0406715256158524e-05, + "loss": 0.66, + "mean_token_accuracy": 0.8392416251202425, + "num_tokens": 1352466879.0, + "step": 7632 + }, + { + "entropy": 0.6840335975090662, + "epoch": 2.3592869820202176, + "grad_norm": 0.16809053719043732, + "learning_rate": 3.0259224745412157e-05, + "loss": 0.6618, + "mean_token_accuracy": 0.8384669708708922, + "num_tokens": 1354609035.0, + "step": 7644 + }, + { + "entropy": 0.6911391988396645, + "epoch": 2.362990971525581, + "grad_norm": 0.16995146870613098, + "learning_rate": 3.011193744712701e-05, + "loss": 0.6697, + "mean_token_accuracy": 0.8367390433947245, + "num_tokens": 1356726626.0, + "step": 7656 + }, + { + "entropy": 0.6708775386214256, + "epoch": 2.3666949610309436, + "grad_norm": 0.18156154453754425, + "learning_rate": 2.996485487748542e-05, + "loss": 0.6465, + "mean_token_accuracy": 0.841506606588761, + "num_tokens": 1358868598.0, + "step": 7668 + }, + { + "entropy": 0.6810227843622366, + "epoch": 2.370398950536307, + "grad_norm": 0.18769192695617676, + "learning_rate": 2.9817978550562208e-05, + "loss": 0.6577, + "mean_token_accuracy": 0.8396956846117973, + "num_tokens": 1360988797.0, + "step": 7680 + }, + { + "entropy": 0.6622078480819861, + "epoch": 2.37410294004167, + "grad_norm": 0.18112178146839142, + "learning_rate": 2.9671309978309132e-05, + "loss": 0.6407, + "mean_token_accuracy": 0.8430601954460144, + "num_tokens": 1363110692.0, + "step": 7692 + }, + { + "entropy": 0.6774277985095978, + "epoch": 2.3778069295470328, + "grad_norm": 0.17796562612056732, + "learning_rate": 2.9524850670539327e-05, + "loss": 0.6551, + "mean_token_accuracy": 0.8401034958660603, + "num_tokens": 1365200120.0, + "step": 7704 + }, + { + "entropy": 0.6842349829773108, + "epoch": 2.381510919052396, + "grad_norm": 0.1720336526632309, + "learning_rate": 2.937860213491173e-05, + "loss": 0.6617, + "mean_token_accuracy": 0.8388624712824821, + "num_tokens": 1367347094.0, + "step": 7716 + }, + { + "entropy": 0.6832546703517437, + "epoch": 2.385214908557759, + "grad_norm": 0.1650596261024475, + "learning_rate": 2.9232565876915585e-05, + "loss": 0.6605, + "mean_token_accuracy": 0.8394694936772188, + "num_tokens": 1369448080.0, + "step": 7728 + }, + { + "entropy": 0.6817479953169823, + "epoch": 2.388918898063122, + "grad_norm": 0.17462390661239624, + "learning_rate": 2.9086743399854972e-05, + "loss": 0.6581, + "mean_token_accuracy": 0.839547095199426, + "num_tokens": 1371582172.0, + "step": 7740 + }, + { + "entropy": 0.6731888850529989, + "epoch": 2.392622887568485, + "grad_norm": 0.18051554262638092, + "learning_rate": 2.894113620483323e-05, + "loss": 0.6517, + "mean_token_accuracy": 0.8407465231915315, + "num_tokens": 1373684475.0, + "step": 7752 + }, + { + "entropy": 0.6890167209009329, + "epoch": 2.3963268770738484, + "grad_norm": 0.1616596132516861, + "learning_rate": 2.8795745790737612e-05, + "loss": 0.6663, + "mean_token_accuracy": 0.8371898656090101, + "num_tokens": 1375839368.0, + "step": 7764 + }, + { + "entropy": 0.6708348778386911, + "epoch": 2.400030866579211, + "grad_norm": 0.1778324991464615, + "learning_rate": 2.865057365422386e-05, + "loss": 0.6503, + "mean_token_accuracy": 0.8408366913596789, + "num_tokens": 1377951641.0, + "step": 7776 + }, + { + "entropy": 0.6847917512059212, + "epoch": 2.4037348560845744, + "grad_norm": 0.17857876420021057, + "learning_rate": 2.8505621289700705e-05, + "loss": 0.6618, + "mean_token_accuracy": 0.8389340154826641, + "num_tokens": 1380072849.0, + "step": 7788 + }, + { + "entropy": 0.6858414982755979, + "epoch": 2.4074388455899376, + "grad_norm": 0.17925503849983215, + "learning_rate": 2.8360890189314548e-05, + "loss": 0.6617, + "mean_token_accuracy": 0.8384112877150377, + "num_tokens": 1382150591.0, + "step": 7800 + }, + { + "entropy": 0.6786315503219763, + "epoch": 2.4111428350953004, + "grad_norm": 0.16545531153678894, + "learning_rate": 2.821638184293408e-05, + "loss": 0.6566, + "mean_token_accuracy": 0.8399387697378794, + "num_tokens": 1384273874.0, + "step": 7812 + }, + { + "entropy": 0.6717840371032556, + "epoch": 2.4148468246006636, + "grad_norm": 0.18790796399116516, + "learning_rate": 2.8072097738134946e-05, + "loss": 0.6487, + "mean_token_accuracy": 0.8416854639848074, + "num_tokens": 1386421067.0, + "step": 7824 + }, + { + "entropy": 0.6912858026723067, + "epoch": 2.418550814106027, + "grad_norm": 0.19037222862243652, + "learning_rate": 2.792803936018449e-05, + "loss": 0.6669, + "mean_token_accuracy": 0.8376653293768564, + "num_tokens": 1388555640.0, + "step": 7836 + }, + { + "entropy": 0.6990839106341203, + "epoch": 2.4222548036113896, + "grad_norm": 0.177109956741333, + "learning_rate": 2.778420819202634e-05, + "loss": 0.6791, + "mean_token_accuracy": 0.8342719053228697, + "num_tokens": 1390701540.0, + "step": 7848 + }, + { + "entropy": 0.6801580414175987, + "epoch": 2.4259587931167528, + "grad_norm": 0.18449810147285461, + "learning_rate": 2.764060571426527e-05, + "loss": 0.6581, + "mean_token_accuracy": 0.839683381219705, + "num_tokens": 1392847168.0, + "step": 7860 + }, + { + "entropy": 0.677167497575283, + "epoch": 2.429662782622116, + "grad_norm": 0.17673510313034058, + "learning_rate": 2.7497233405151873e-05, + "loss": 0.6546, + "mean_token_accuracy": 0.8402298440535864, + "num_tokens": 1394982282.0, + "step": 7872 + }, + { + "entropy": 0.6832474445303282, + "epoch": 2.4333667721274788, + "grad_norm": 0.1732633411884308, + "learning_rate": 2.7354092740567395e-05, + "loss": 0.6585, + "mean_token_accuracy": 0.8391941326359907, + "num_tokens": 1397146217.0, + "step": 7884 + }, + { + "entropy": 0.6827088569601377, + "epoch": 2.437070761632842, + "grad_norm": 0.1737031638622284, + "learning_rate": 2.7211185194008492e-05, + "loss": 0.6635, + "mean_token_accuracy": 0.8383902385830879, + "num_tokens": 1399291383.0, + "step": 7896 + }, + { + "entropy": 0.68471717958649, + "epoch": 2.440774751138205, + "grad_norm": 0.2073935717344284, + "learning_rate": 2.706851223657214e-05, + "loss": 0.6628, + "mean_token_accuracy": 0.8384527899324894, + "num_tokens": 1401422348.0, + "step": 7908 + }, + { + "entropy": 0.6754190859695276, + "epoch": 2.444478740643568, + "grad_norm": 0.18220025300979614, + "learning_rate": 2.6926075336940404e-05, + "loss": 0.651, + "mean_token_accuracy": 0.8403765348096689, + "num_tokens": 1403549979.0, + "step": 7920 + }, + { + "entropy": 0.6639910079538822, + "epoch": 2.448182730148931, + "grad_norm": 0.1840001493692398, + "learning_rate": 2.6783875961365378e-05, + "loss": 0.6427, + "mean_token_accuracy": 0.842850154886643, + "num_tokens": 1405631673.0, + "step": 7932 + }, + { + "entropy": 0.6794233421484629, + "epoch": 2.4518867196542944, + "grad_norm": 0.1842009425163269, + "learning_rate": 2.664191557365404e-05, + "loss": 0.6593, + "mean_token_accuracy": 0.8393058615426222, + "num_tokens": 1407746726.0, + "step": 7944 + }, + { + "entropy": 0.696602446337541, + "epoch": 2.455590709159657, + "grad_norm": 0.20143558084964752, + "learning_rate": 2.6500195635153287e-05, + "loss": 0.6741, + "mean_token_accuracy": 0.835796115299066, + "num_tokens": 1409892372.0, + "step": 7956 + }, + { + "entropy": 0.6946029762427012, + "epoch": 2.4592946986650204, + "grad_norm": 0.17518438398838043, + "learning_rate": 2.635871760473475e-05, + "loss": 0.6691, + "mean_token_accuracy": 0.8366143380602201, + "num_tokens": 1412008656.0, + "step": 7968 + }, + { + "entropy": 0.6815679098169009, + "epoch": 2.4629986881703836, + "grad_norm": 0.18056343495845795, + "learning_rate": 2.6217482938779882e-05, + "loss": 0.6573, + "mean_token_accuracy": 0.8400058262050152, + "num_tokens": 1414116887.0, + "step": 7980 + }, + { + "entropy": 0.6889658160507679, + "epoch": 2.4667026776757464, + "grad_norm": 0.184426948428154, + "learning_rate": 2.607649309116492e-05, + "loss": 0.6678, + "mean_token_accuracy": 0.8373049336175123, + "num_tokens": 1416227606.0, + "step": 7992 + }, + { + "entropy": 0.6798962478836378, + "epoch": 2.4704066671811096, + "grad_norm": 0.1737222820520401, + "learning_rate": 2.5935749513245927e-05, + "loss": 0.655, + "mean_token_accuracy": 0.8400146377583345, + "num_tokens": 1418343060.0, + "step": 8004 + }, + { + "entropy": 0.6849433220922947, + "epoch": 2.474110656686473, + "grad_norm": 0.17856653034687042, + "learning_rate": 2.579525365384386e-05, + "loss": 0.6641, + "mean_token_accuracy": 0.8383715438346068, + "num_tokens": 1420448289.0, + "step": 8016 + }, + { + "entropy": 0.6673534649113814, + "epoch": 2.4778146461918356, + "grad_norm": 0.18288525938987732, + "learning_rate": 2.5655006959229655e-05, + "loss": 0.6439, + "mean_token_accuracy": 0.8429509215056896, + "num_tokens": 1422567600.0, + "step": 8028 + }, + { + "entropy": 0.6816601393123468, + "epoch": 2.4815186356971988, + "grad_norm": 0.17290998995304108, + "learning_rate": 2.5515010873109313e-05, + "loss": 0.6588, + "mean_token_accuracy": 0.8389557711780071, + "num_tokens": 1424653554.0, + "step": 8040 + }, + { + "entropy": 0.6713767026861509, + "epoch": 2.485222625202562, + "grad_norm": 0.16905297338962555, + "learning_rate": 2.537526683660909e-05, + "loss": 0.6482, + "mean_token_accuracy": 0.841906680415074, + "num_tokens": 1426786823.0, + "step": 8052 + }, + { + "entropy": 0.6828158932427565, + "epoch": 2.4889266147079248, + "grad_norm": 0.17191940546035767, + "learning_rate": 2.5235776288260593e-05, + "loss": 0.6614, + "mean_token_accuracy": 0.8386979786058267, + "num_tokens": 1428924792.0, + "step": 8064 + }, + { + "entropy": 0.6837918112675349, + "epoch": 2.492630604213288, + "grad_norm": 0.16740551590919495, + "learning_rate": 2.5096540663986067e-05, + "loss": 0.6631, + "mean_token_accuracy": 0.8389793510238329, + "num_tokens": 1431082569.0, + "step": 8076 + }, + { + "entropy": 0.6886960230767727, + "epoch": 2.496334593718651, + "grad_norm": 0.17835856974124908, + "learning_rate": 2.4957561397083506e-05, + "loss": 0.6635, + "mean_token_accuracy": 0.8381286015113195, + "num_tokens": 1433227615.0, + "step": 8088 + }, + { + "entropy": 0.6757631773749987, + "epoch": 2.500038583224014, + "grad_norm": 0.1775360405445099, + "learning_rate": 2.4818839918211962e-05, + "loss": 0.6531, + "mean_token_accuracy": 0.8405404339234034, + "num_tokens": 1435347410.0, + "step": 8100 + }, + { + "entropy": 0.6918977871537209, + "epoch": 2.503742572729377, + "grad_norm": 0.17831064760684967, + "learning_rate": 2.4680377655376814e-05, + "loss": 0.6699, + "mean_token_accuracy": 0.8364169212679068, + "num_tokens": 1437513213.0, + "step": 8112 + }, + { + "entropy": 0.6668169523278872, + "epoch": 2.5074465622347404, + "grad_norm": 0.1669531911611557, + "learning_rate": 2.4542176033915048e-05, + "loss": 0.6448, + "mean_token_accuracy": 0.8414871692657471, + "num_tokens": 1439633767.0, + "step": 8124 + }, + { + "entropy": 0.6661192377408346, + "epoch": 2.511150551740103, + "grad_norm": 0.18062232434749603, + "learning_rate": 2.4404236476480608e-05, + "loss": 0.6454, + "mean_token_accuracy": 0.8420434022943178, + "num_tokens": 1441737922.0, + "step": 8136 + }, + { + "entropy": 0.678036684791247, + "epoch": 2.5148545412454664, + "grad_norm": 0.2435832917690277, + "learning_rate": 2.426656040302973e-05, + "loss": 0.654, + "mean_token_accuracy": 0.8405960587163767, + "num_tokens": 1443852135.0, + "step": 8148 + }, + { + "entropy": 0.6822113630672296, + "epoch": 2.5185585307508296, + "grad_norm": 0.16967307031154633, + "learning_rate": 2.4129149230806337e-05, + "loss": 0.6602, + "mean_token_accuracy": 0.8390215809146563, + "num_tokens": 1445946633.0, + "step": 8160 + }, + { + "entropy": 0.6625789310783148, + "epoch": 2.5222625202561924, + "grad_norm": 0.16657769680023193, + "learning_rate": 2.399200437432744e-05, + "loss": 0.642, + "mean_token_accuracy": 0.8431124811371168, + "num_tokens": 1448087716.0, + "step": 8172 + }, + { + "entropy": 0.6771047189831734, + "epoch": 2.5259665097615556, + "grad_norm": 0.1971009075641632, + "learning_rate": 2.385512724536857e-05, + "loss": 0.6527, + "mean_token_accuracy": 0.8407648056745529, + "num_tokens": 1450243227.0, + "step": 8184 + }, + { + "entropy": 0.6659577091534933, + "epoch": 2.529670499266919, + "grad_norm": 0.17768594622612, + "learning_rate": 2.3718519252949316e-05, + "loss": 0.6443, + "mean_token_accuracy": 0.8426598384976387, + "num_tokens": 1452367356.0, + "step": 8196 + }, + { + "entropy": 0.6778051033616066, + "epoch": 2.5333744887722816, + "grad_norm": 0.1780218631029129, + "learning_rate": 2.358218180331871e-05, + "loss": 0.6549, + "mean_token_accuracy": 0.840303952495257, + "num_tokens": 1454523799.0, + "step": 8208 + }, + { + "entropy": 0.6757925574978193, + "epoch": 2.5370784782776448, + "grad_norm": 0.175310879945755, + "learning_rate": 2.3446116299940814e-05, + "loss": 0.6527, + "mean_token_accuracy": 0.840245266755422, + "num_tokens": 1456682468.0, + "step": 8220 + }, + { + "entropy": 0.6883031961818536, + "epoch": 2.540782467783008, + "grad_norm": 0.17527994513511658, + "learning_rate": 2.331032414348026e-05, + "loss": 0.668, + "mean_token_accuracy": 0.837483424693346, + "num_tokens": 1458745387.0, + "step": 8232 + }, + { + "entropy": 0.6782544503609339, + "epoch": 2.5444864572883708, + "grad_norm": 0.18310600519180298, + "learning_rate": 2.3174806731787833e-05, + "loss": 0.6579, + "mean_token_accuracy": 0.8392157728473345, + "num_tokens": 1460852728.0, + "step": 8244 + }, + { + "entropy": 0.678227295478185, + "epoch": 2.548190446793734, + "grad_norm": 0.16748745739459991, + "learning_rate": 2.3039565459886144e-05, + "loss": 0.6544, + "mean_token_accuracy": 0.8402854837477207, + "num_tokens": 1462984834.0, + "step": 8256 + }, + { + "entropy": 0.6784488918880621, + "epoch": 2.551894436299097, + "grad_norm": 0.17468595504760742, + "learning_rate": 2.290460171995508e-05, + "loss": 0.6579, + "mean_token_accuracy": 0.8394933181504408, + "num_tokens": 1465111628.0, + "step": 8268 + }, + { + "entropy": 0.68077094728748, + "epoch": 2.55559842580446, + "grad_norm": 0.17412585020065308, + "learning_rate": 2.2769916901317717e-05, + "loss": 0.6588, + "mean_token_accuracy": 0.8396103344857693, + "num_tokens": 1467237410.0, + "step": 8280 + }, + { + "entropy": 0.6686192614336809, + "epoch": 2.559302415309823, + "grad_norm": 0.18562890589237213, + "learning_rate": 2.2635512390425834e-05, + "loss": 0.6453, + "mean_token_accuracy": 0.8421800620853901, + "num_tokens": 1469391192.0, + "step": 8292 + }, + { + "entropy": 0.6803597112496694, + "epoch": 2.5630064048151864, + "grad_norm": 0.17133885622024536, + "learning_rate": 2.2501389570845767e-05, + "loss": 0.6557, + "mean_token_accuracy": 0.8401055956880251, + "num_tokens": 1471504796.0, + "step": 8304 + }, + { + "entropy": 0.67287752404809, + "epoch": 2.566710394320549, + "grad_norm": 0.17606225609779358, + "learning_rate": 2.236754982324406e-05, + "loss": 0.6517, + "mean_token_accuracy": 0.8406522087752819, + "num_tokens": 1473637509.0, + "step": 8316 + }, + { + "entropy": 0.6758027486503124, + "epoch": 2.5704143838259124, + "grad_norm": 0.16274742782115936, + "learning_rate": 2.223399452537332e-05, + "loss": 0.6514, + "mean_token_accuracy": 0.8412458946307501, + "num_tokens": 1475756831.0, + "step": 8328 + }, + { + "entropy": 0.6769342236220837, + "epoch": 2.5741183733312756, + "grad_norm": 0.173589825630188, + "learning_rate": 2.2100725052058006e-05, + "loss": 0.6535, + "mean_token_accuracy": 0.8408135734498501, + "num_tokens": 1477899347.0, + "step": 8340 + }, + { + "entropy": 0.6732165577510992, + "epoch": 2.577822362836639, + "grad_norm": 0.18250826001167297, + "learning_rate": 2.1967742775180306e-05, + "loss": 0.6503, + "mean_token_accuracy": 0.8415469999114672, + "num_tokens": 1480035263.0, + "step": 8352 + }, + { + "entropy": 0.6719749377419552, + "epoch": 2.5815263523420016, + "grad_norm": 0.174708753824234, + "learning_rate": 2.183504906366595e-05, + "loss": 0.6495, + "mean_token_accuracy": 0.8410700398186842, + "num_tokens": 1482156466.0, + "step": 8364 + }, + { + "entropy": 0.6818638009329637, + "epoch": 2.5852303418473648, + "grad_norm": 0.18486975133419037, + "learning_rate": 2.1702645283470236e-05, + "loss": 0.6595, + "mean_token_accuracy": 0.8392534951368967, + "num_tokens": 1484290285.0, + "step": 8376 + }, + { + "entropy": 0.683278722067674, + "epoch": 2.588934331352728, + "grad_norm": 0.17580677568912506, + "learning_rate": 2.1570532797563852e-05, + "loss": 0.6613, + "mean_token_accuracy": 0.8384759289522966, + "num_tokens": 1486400150.0, + "step": 8388 + }, + { + "entropy": 0.6725541402896246, + "epoch": 2.5926383208580908, + "grad_norm": 0.19462420046329498, + "learning_rate": 2.143871296591885e-05, + "loss": 0.6482, + "mean_token_accuracy": 0.8413905973235766, + "num_tokens": 1488514217.0, + "step": 8400 + }, + { + "entropy": 0.6712416261434555, + "epoch": 2.596342310363454, + "grad_norm": 0.17859038710594177, + "learning_rate": 2.1307187145494727e-05, + "loss": 0.6476, + "mean_token_accuracy": 0.8417484226326147, + "num_tokens": 1490640601.0, + "step": 8412 + }, + { + "entropy": 0.6808934770524502, + "epoch": 2.600046299868817, + "grad_norm": 0.16942450404167175, + "learning_rate": 2.1175956690224453e-05, + "loss": 0.6565, + "mean_token_accuracy": 0.8403363898396492, + "num_tokens": 1492760780.0, + "step": 8424 + }, + { + "entropy": 0.688660574456056, + "epoch": 2.60375028937418, + "grad_norm": 0.17530061304569244, + "learning_rate": 2.1045022951000437e-05, + "loss": 0.6658, + "mean_token_accuracy": 0.8372594018777212, + "num_tokens": 1494877831.0, + "step": 8436 + }, + { + "entropy": 0.6738361803193887, + "epoch": 2.607454278879543, + "grad_norm": 0.169802725315094, + "learning_rate": 2.091438727566069e-05, + "loss": 0.6533, + "mean_token_accuracy": 0.8404905224839846, + "num_tokens": 1497011614.0, + "step": 8448 + }, + { + "entropy": 0.6933514513075352, + "epoch": 2.6111582683849064, + "grad_norm": 0.16949047148227692, + "learning_rate": 2.0784051008974957e-05, + "loss": 0.6672, + "mean_token_accuracy": 0.8373734119037787, + "num_tokens": 1499130158.0, + "step": 8460 + }, + { + "entropy": 0.676812877257665, + "epoch": 2.614862257890269, + "grad_norm": 0.16732563078403473, + "learning_rate": 2.0654015492630842e-05, + "loss": 0.6561, + "mean_token_accuracy": 0.8405400899549326, + "num_tokens": 1501274725.0, + "step": 8472 + }, + { + "entropy": 0.6772327373425165, + "epoch": 2.6185662473956324, + "grad_norm": 0.1803075224161148, + "learning_rate": 2.0524282065219995e-05, + "loss": 0.6551, + "mean_token_accuracy": 0.839802881081899, + "num_tokens": 1503401542.0, + "step": 8484 + }, + { + "entropy": 0.6690971752007803, + "epoch": 2.6222702369009956, + "grad_norm": 0.182891383767128, + "learning_rate": 2.0394852062224408e-05, + "loss": 0.6458, + "mean_token_accuracy": 0.84250757843256, + "num_tokens": 1505486844.0, + "step": 8496 + }, + { + "entropy": 0.6797614445288976, + "epoch": 2.6259742264063584, + "grad_norm": 0.1719876229763031, + "learning_rate": 2.026572681600257e-05, + "loss": 0.6556, + "mean_token_accuracy": 0.8399543985724449, + "num_tokens": 1507602524.0, + "step": 8508 + }, + { + "entropy": 0.6750110884507498, + "epoch": 2.6296782159117216, + "grad_norm": 0.16174942255020142, + "learning_rate": 2.0136907655775743e-05, + "loss": 0.6544, + "mean_token_accuracy": 0.8410836247106394, + "num_tokens": 1509738493.0, + "step": 8520 + }, + { + "entropy": 0.6792822231849035, + "epoch": 2.633382205417085, + "grad_norm": 0.18377965688705444, + "learning_rate": 2.000839590761439e-05, + "loss": 0.6583, + "mean_token_accuracy": 0.8397029526531696, + "num_tokens": 1511827824.0, + "step": 8532 + }, + { + "entropy": 0.6866883126397928, + "epoch": 2.6370861949224476, + "grad_norm": 0.17742934823036194, + "learning_rate": 1.988019289442446e-05, + "loss": 0.6644, + "mean_token_accuracy": 0.8386820207039515, + "num_tokens": 1513923734.0, + "step": 8544 + }, + { + "entropy": 0.6748844794929028, + "epoch": 2.6407901844278108, + "grad_norm": 0.1755974441766739, + "learning_rate": 1.9752299935933732e-05, + "loss": 0.6528, + "mean_token_accuracy": 0.8406555938223997, + "num_tokens": 1516031350.0, + "step": 8556 + }, + { + "entropy": 0.6851431752244631, + "epoch": 2.644494173933174, + "grad_norm": 0.17667120695114136, + "learning_rate": 1.9624718348678302e-05, + "loss": 0.659, + "mean_token_accuracy": 0.8388123251497746, + "num_tokens": 1518147388.0, + "step": 8568 + }, + { + "entropy": 0.6720710135996342, + "epoch": 2.648198163438537, + "grad_norm": 0.1714988797903061, + "learning_rate": 1.9497449445988964e-05, + "loss": 0.6477, + "mean_token_accuracy": 0.8416066728532314, + "num_tokens": 1520240737.0, + "step": 8580 + }, + { + "entropy": 0.6758585870265961, + "epoch": 2.6519021529439, + "grad_norm": 0.2017345130443573, + "learning_rate": 1.9370494537977724e-05, + "loss": 0.6538, + "mean_token_accuracy": 0.8405953471859297, + "num_tokens": 1522358544.0, + "step": 8592 + }, + { + "entropy": 0.6837990420560042, + "epoch": 2.655606142449263, + "grad_norm": 0.18071149289608002, + "learning_rate": 1.9243854931524363e-05, + "loss": 0.6622, + "mean_token_accuracy": 0.8380974618097147, + "num_tokens": 1524491885.0, + "step": 8604 + }, + { + "entropy": 0.6755645809074243, + "epoch": 2.6593101319546264, + "grad_norm": 0.1797918975353241, + "learning_rate": 1.911753193026288e-05, + "loss": 0.6517, + "mean_token_accuracy": 0.8409853937725226, + "num_tokens": 1526599435.0, + "step": 8616 + }, + { + "entropy": 0.6866617997487386, + "epoch": 2.663014121459989, + "grad_norm": 0.1666131615638733, + "learning_rate": 1.8991526834568146e-05, + "loss": 0.6633, + "mean_token_accuracy": 0.8389130185047785, + "num_tokens": 1528740068.0, + "step": 8628 + }, + { + "entropy": 0.6782820535202821, + "epoch": 2.6667181109653524, + "grad_norm": 0.1680057942867279, + "learning_rate": 1.8865840941542506e-05, + "loss": 0.6564, + "mean_token_accuracy": 0.8396158503989378, + "num_tokens": 1530887262.0, + "step": 8640 + }, + { + "entropy": 0.6818838343024254, + "epoch": 2.6704221004707156, + "grad_norm": 0.18536606431007385, + "learning_rate": 1.8740475545002357e-05, + "loss": 0.6589, + "mean_token_accuracy": 0.8392697448531786, + "num_tokens": 1533019334.0, + "step": 8652 + }, + { + "entropy": 0.6834930268426737, + "epoch": 2.6741260899760784, + "grad_norm": 0.16942651569843292, + "learning_rate": 1.8615431935464982e-05, + "loss": 0.6633, + "mean_token_accuracy": 0.8381645083427429, + "num_tokens": 1535141514.0, + "step": 8664 + }, + { + "entropy": 0.6799823269248009, + "epoch": 2.6778300794814416, + "grad_norm": 0.18408891558647156, + "learning_rate": 1.8490711400135118e-05, + "loss": 0.6584, + "mean_token_accuracy": 0.8390690212448438, + "num_tokens": 1537222924.0, + "step": 8676 + }, + { + "entropy": 0.6704868152737617, + "epoch": 2.681534068986805, + "grad_norm": 0.17771634459495544, + "learning_rate": 1.8366315222891772e-05, + "loss": 0.649, + "mean_token_accuracy": 0.8417287928362688, + "num_tokens": 1539361332.0, + "step": 8688 + }, + { + "entropy": 0.6824560165405273, + "epoch": 2.6852380584921676, + "grad_norm": 0.1805276870727539, + "learning_rate": 1.8242244684274994e-05, + "loss": 0.662, + "mean_token_accuracy": 0.8386474835375944, + "num_tokens": 1541489251.0, + "step": 8700 + }, + { + "entropy": 0.6674566405514876, + "epoch": 2.688942047997531, + "grad_norm": 0.17611855268478394, + "learning_rate": 1.8118501061472686e-05, + "loss": 0.6428, + "mean_token_accuracy": 0.8432766410211722, + "num_tokens": 1543623822.0, + "step": 8712 + }, + { + "entropy": 0.6744134798645973, + "epoch": 2.692646037502894, + "grad_norm": 0.17920047044754028, + "learning_rate": 1.7995085628307494e-05, + "loss": 0.6498, + "mean_token_accuracy": 0.8411453490455946, + "num_tokens": 1545739068.0, + "step": 8724 + }, + { + "entropy": 0.6672421346108118, + "epoch": 2.6963500270082568, + "grad_norm": 0.17886677384376526, + "learning_rate": 1.7871999655223636e-05, + "loss": 0.6447, + "mean_token_accuracy": 0.8418544096251329, + "num_tokens": 1547875527.0, + "step": 8736 + }, + { + "entropy": 0.6673632053037485, + "epoch": 2.70005401651362, + "grad_norm": 0.18474158644676208, + "learning_rate": 1.774924440927386e-05, + "loss": 0.6458, + "mean_token_accuracy": 0.841834536443154, + "num_tokens": 1549985851.0, + "step": 8748 + }, + { + "entropy": 0.6826143587629, + "epoch": 2.703758006018983, + "grad_norm": 0.1684853732585907, + "learning_rate": 1.7626821154106382e-05, + "loss": 0.6594, + "mean_token_accuracy": 0.8393335652848085, + "num_tokens": 1552149497.0, + "step": 8760 + }, + { + "entropy": 0.6779943940540155, + "epoch": 2.707461995524346, + "grad_norm": 0.16713795065879822, + "learning_rate": 1.7504731149951913e-05, + "loss": 0.6564, + "mean_token_accuracy": 0.8403484423955282, + "num_tokens": 1554311306.0, + "step": 8772 + }, + { + "entropy": 0.6786763022343317, + "epoch": 2.711165985029709, + "grad_norm": 0.18719340860843658, + "learning_rate": 1.738297565361065e-05, + "loss": 0.6533, + "mean_token_accuracy": 0.8405405295391878, + "num_tokens": 1556410114.0, + "step": 8784 + }, + { + "entropy": 0.6622005340953668, + "epoch": 2.7148699745350724, + "grad_norm": 0.1840929239988327, + "learning_rate": 1.7261555918439347e-05, + "loss": 0.6393, + "mean_token_accuracy": 0.8439254661401113, + "num_tokens": 1558582974.0, + "step": 8796 + }, + { + "entropy": 0.6725914205114046, + "epoch": 2.718573964040435, + "grad_norm": 0.17707188427448273, + "learning_rate": 1.714047319433842e-05, + "loss": 0.6485, + "mean_token_accuracy": 0.8418245787421862, + "num_tokens": 1560714505.0, + "step": 8808 + }, + { + "entropy": 0.6861369709173838, + "epoch": 2.7222779535457984, + "grad_norm": 0.18081851303577423, + "learning_rate": 1.7019728727739082e-05, + "loss": 0.6616, + "mean_token_accuracy": 0.8398788695534071, + "num_tokens": 1562841818.0, + "step": 8820 + }, + { + "entropy": 0.676196767638127, + "epoch": 2.7259819430511616, + "grad_norm": 0.17770478129386902, + "learning_rate": 1.6899323761590486e-05, + "loss": 0.6529, + "mean_token_accuracy": 0.8412167988717556, + "num_tokens": 1564958639.0, + "step": 8832 + }, + { + "entropy": 0.6725533629457155, + "epoch": 2.7296859325565244, + "grad_norm": 0.18487904965877533, + "learning_rate": 1.6779259535346996e-05, + "loss": 0.6497, + "mean_token_accuracy": 0.8410086408257484, + "num_tokens": 1567108439.0, + "step": 8844 + }, + { + "entropy": 0.6586884421606859, + "epoch": 2.7333899220618876, + "grad_norm": 0.16857771575450897, + "learning_rate": 1.6659537284955358e-05, + "loss": 0.6388, + "mean_token_accuracy": 0.8442123333613077, + "num_tokens": 1569262392.0, + "step": 8856 + }, + { + "entropy": 0.6725297843416532, + "epoch": 2.737093911567251, + "grad_norm": 0.17279498279094696, + "learning_rate": 1.654015824284198e-05, + "loss": 0.6484, + "mean_token_accuracy": 0.8418673065801462, + "num_tokens": 1571400293.0, + "step": 8868 + }, + { + "entropy": 0.665387824177742, + "epoch": 2.7407979010726136, + "grad_norm": 0.19026906788349152, + "learning_rate": 1.642112363790031e-05, + "loss": 0.6424, + "mean_token_accuracy": 0.8429265518983206, + "num_tokens": 1573528403.0, + "step": 8880 + }, + { + "entropy": 0.6819708024462064, + "epoch": 2.7445018905779768, + "grad_norm": 0.17325517535209656, + "learning_rate": 1.6302434695478107e-05, + "loss": 0.6607, + "mean_token_accuracy": 0.8388937674462795, + "num_tokens": 1575627080.0, + "step": 8892 + }, + { + "entropy": 0.6820539807279905, + "epoch": 2.74820588008334, + "grad_norm": 0.1792973279953003, + "learning_rate": 1.618409263736489e-05, + "loss": 0.658, + "mean_token_accuracy": 0.8399195012946924, + "num_tokens": 1577728236.0, + "step": 8904 + }, + { + "entropy": 0.6730578330655893, + "epoch": 2.7519098695887028, + "grad_norm": 0.18479055166244507, + "learning_rate": 1.606609868177932e-05, + "loss": 0.6522, + "mean_token_accuracy": 0.8412285819649696, + "num_tokens": 1579821668.0, + "step": 8916 + }, + { + "entropy": 0.6686800830066204, + "epoch": 2.755613859094066, + "grad_norm": 0.17226146161556244, + "learning_rate": 1.594845404335668e-05, + "loss": 0.644, + "mean_token_accuracy": 0.842714703331391, + "num_tokens": 1581941943.0, + "step": 8928 + }, + { + "entropy": 0.6718707966307799, + "epoch": 2.759317848599429, + "grad_norm": 0.17666469514369965, + "learning_rate": 1.583115993313637e-05, + "loss": 0.6497, + "mean_token_accuracy": 0.8410539838174979, + "num_tokens": 1584072385.0, + "step": 8940 + }, + { + "entropy": 0.6763085102041563, + "epoch": 2.763021838104792, + "grad_norm": 0.17432096600532532, + "learning_rate": 1.571421755854941e-05, + "loss": 0.6526, + "mean_token_accuracy": 0.8408862004677454, + "num_tokens": 1586203502.0, + "step": 8952 + }, + { + "entropy": 0.6714817471802235, + "epoch": 2.766725827610155, + "grad_norm": 0.1722986400127411, + "learning_rate": 1.5597628123406095e-05, + "loss": 0.6489, + "mean_token_accuracy": 0.8418647820750872, + "num_tokens": 1588322897.0, + "step": 8964 + }, + { + "entropy": 0.6764586915572485, + "epoch": 2.7704298171155184, + "grad_norm": 0.16954948008060455, + "learning_rate": 1.548139282788349e-05, + "loss": 0.6549, + "mean_token_accuracy": 0.8404773225386938, + "num_tokens": 1590459142.0, + "step": 8976 + }, + { + "entropy": 0.6723885672787825, + "epoch": 2.774133806620881, + "grad_norm": 0.18865634500980377, + "learning_rate": 1.5365512868513144e-05, + "loss": 0.6509, + "mean_token_accuracy": 0.8412475138902664, + "num_tokens": 1592577106.0, + "step": 8988 + }, + { + "entropy": 0.669227180381616, + "epoch": 2.7778377961262444, + "grad_norm": 0.17359550297260284, + "learning_rate": 1.5249989438168771e-05, + "loss": 0.6457, + "mean_token_accuracy": 0.8424140512943268, + "num_tokens": 1594699366.0, + "step": 9000 + } + ], + "logging_steps": 12, + "max_steps": 12000, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0682543308592801e+20, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}