{ "best_global_step": null, "best_metric": 2.08726167678833, "best_model_checkpoint": null, "epoch": 0.9993247805536799, "eval_steps": 10, "global_step": 370, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 2.397214651107788, "eval_mean_token_accuracy": 0.5059000539779663, "eval_num_tokens": 0.0, "eval_runtime": 45.6648, "eval_samples_per_second": 4.226, "eval_steps_per_second": 0.547, "step": 0 }, { "epoch": 0.0027008777852802163, "grad_norm": 74.73389434814453, "learning_rate": 0.0, "loss": 36.528, "mean_token_accuracy": 0.5154154784977436, "num_tokens": 30979.0, "step": 1 }, { "epoch": 0.0054017555705604325, "grad_norm": 71.96607971191406, "learning_rate": 8.333333333333333e-07, "loss": 38.3279, "mean_token_accuracy": 0.5002705305814743, "num_tokens": 59875.0, "step": 2 }, { "epoch": 0.008102633355840648, "grad_norm": 89.63040924072266, "learning_rate": 1.6666666666666667e-06, "loss": 34.9865, "mean_token_accuracy": 0.5186122097074986, "num_tokens": 94813.0, "step": 3 }, { "epoch": 0.010803511141120865, "grad_norm": 62.724388122558594, "learning_rate": 2.5e-06, "loss": 37.4605, "mean_token_accuracy": 0.5071384683251381, "num_tokens": 122943.0, "step": 4 }, { "epoch": 0.01350438892640108, "grad_norm": 58.08222198486328, "learning_rate": 3.3333333333333333e-06, "loss": 38.544, "mean_token_accuracy": 0.49695252999663353, "num_tokens": 155196.0, "step": 5 }, { "epoch": 0.016205266711681297, "grad_norm": 62.1136589050293, "learning_rate": 4.166666666666667e-06, "loss": 38.5181, "mean_token_accuracy": 0.5059397108852863, "num_tokens": 185828.0, "step": 6 }, { "epoch": 0.018906144496961513, "grad_norm": 58.79863739013672, "learning_rate": 5e-06, "loss": 40.3843, "mean_token_accuracy": 0.47813237458467484, "num_tokens": 215412.0, "step": 7 }, { "epoch": 0.02160702228224173, "grad_norm": 57.702178955078125, "learning_rate": 5.833333333333334e-06, "loss": 37.7204, "mean_token_accuracy": 0.5041601024568081, "num_tokens": 239108.0, "step": 8 }, { "epoch": 0.024307900067521943, "grad_norm": 50.49614334106445, "learning_rate": 6.666666666666667e-06, "loss": 39.7766, "mean_token_accuracy": 0.472469225525856, "num_tokens": 266446.0, "step": 9 }, { "epoch": 0.02700877785280216, "grad_norm": 47.13991928100586, "learning_rate": 7.500000000000001e-06, "loss": 38.3749, "mean_token_accuracy": 0.4879019036889076, "num_tokens": 295285.0, "step": 10 }, { "epoch": 0.02700877785280216, "eval_loss": 2.312359571456909, "eval_mean_token_accuracy": 0.5222688019275665, "eval_num_tokens": 295285.0, "eval_runtime": 36.6088, "eval_samples_per_second": 5.272, "eval_steps_per_second": 0.683, "step": 10 }, { "epoch": 0.029709655638082377, "grad_norm": 45.16763687133789, "learning_rate": 8.333333333333334e-06, "loss": 37.4436, "mean_token_accuracy": 0.5303923562169075, "num_tokens": 328556.0, "step": 11 }, { "epoch": 0.03241053342336259, "grad_norm": 48.777076721191406, "learning_rate": 9.166666666666666e-06, "loss": 35.9707, "mean_token_accuracy": 0.5227997153997421, "num_tokens": 354607.0, "step": 12 }, { "epoch": 0.035111411208642807, "grad_norm": 44.566070556640625, "learning_rate": 1e-05, "loss": 37.8243, "mean_token_accuracy": 0.5070199966430664, "num_tokens": 386672.0, "step": 13 }, { "epoch": 0.03781228899392303, "grad_norm": 43.15651321411133, "learning_rate": 9.999827697890387e-06, "loss": 38.5462, "mean_token_accuracy": 0.5086139552295208, "num_tokens": 411869.0, "step": 14 }, { "epoch": 0.04051316677920324, "grad_norm": 39.41759490966797, "learning_rate": 9.99931080475622e-06, "loss": 35.3692, "mean_token_accuracy": 0.5319068878889084, "num_tokens": 439155.0, "step": 15 }, { "epoch": 0.04321404456448346, "grad_norm": 38.14093017578125, "learning_rate": 9.998449360180513e-06, "loss": 39.2616, "mean_token_accuracy": 0.4879924915730953, "num_tokens": 475091.0, "step": 16 }, { "epoch": 0.04591492234976367, "grad_norm": 34.54646682739258, "learning_rate": 9.997243430131587e-06, "loss": 36.0486, "mean_token_accuracy": 0.5397874899208546, "num_tokens": 510644.0, "step": 17 }, { "epoch": 0.048615800135043886, "grad_norm": 37.82185363769531, "learning_rate": 9.995693106958009e-06, "loss": 40.5855, "mean_token_accuracy": 0.48904992267489433, "num_tokens": 542249.0, "step": 18 }, { "epoch": 0.05131667792032411, "grad_norm": 43.19975280761719, "learning_rate": 9.993798509381542e-06, "loss": 35.7794, "mean_token_accuracy": 0.5416266955435276, "num_tokens": 569535.0, "step": 19 }, { "epoch": 0.05401755570560432, "grad_norm": 42.238162994384766, "learning_rate": 9.991559782488031e-06, "loss": 34.7708, "mean_token_accuracy": 0.5208283551037312, "num_tokens": 605789.0, "step": 20 }, { "epoch": 0.05401755570560432, "eval_loss": 2.2481446266174316, "eval_mean_token_accuracy": 0.534355319738388, "eval_num_tokens": 605789.0, "eval_runtime": 36.4364, "eval_samples_per_second": 5.297, "eval_steps_per_second": 0.686, "step": 20 }, { "epoch": 0.05671843349088454, "grad_norm": 36.71121597290039, "learning_rate": 9.98897709771631e-06, "loss": 39.087, "mean_token_accuracy": 0.506351962685585, "num_tokens": 633020.0, "step": 21 }, { "epoch": 0.05941931127616475, "grad_norm": 42.23619842529297, "learning_rate": 9.986050652845058e-06, "loss": 38.4557, "mean_token_accuracy": 0.507073100656271, "num_tokens": 658345.0, "step": 22 }, { "epoch": 0.062120189061444966, "grad_norm": 37.17399597167969, "learning_rate": 9.982780671977665e-06, "loss": 35.6222, "mean_token_accuracy": 0.5312147065997124, "num_tokens": 683932.0, "step": 23 }, { "epoch": 0.06482106684672519, "grad_norm": 34.07923889160156, "learning_rate": 9.979167405525063e-06, "loss": 35.1447, "mean_token_accuracy": 0.5398494191467762, "num_tokens": 712590.0, "step": 24 }, { "epoch": 0.0675219446320054, "grad_norm": 32.862369537353516, "learning_rate": 9.97521113018656e-06, "loss": 37.8009, "mean_token_accuracy": 0.5293546989560127, "num_tokens": 741958.0, "step": 25 }, { "epoch": 0.07022282241728561, "grad_norm": 38.0152587890625, "learning_rate": 9.970912148928634e-06, "loss": 35.4103, "mean_token_accuracy": 0.5297193340957165, "num_tokens": 768827.0, "step": 26 }, { "epoch": 0.07292370020256583, "grad_norm": 35.030948638916016, "learning_rate": 9.966270790961747e-06, "loss": 39.028, "mean_token_accuracy": 0.5193862244486809, "num_tokens": 806591.0, "step": 27 }, { "epoch": 0.07562457798784605, "grad_norm": 35.34138107299805, "learning_rate": 9.961287411715135e-06, "loss": 35.4406, "mean_token_accuracy": 0.5286831967532635, "num_tokens": 839283.0, "step": 28 }, { "epoch": 0.07832545577312626, "grad_norm": 36.31604766845703, "learning_rate": 9.955962392809575e-06, "loss": 37.2577, "mean_token_accuracy": 0.5075593329966068, "num_tokens": 867848.0, "step": 29 }, { "epoch": 0.08102633355840648, "grad_norm": 32.090091705322266, "learning_rate": 9.950296142028175e-06, "loss": 35.8017, "mean_token_accuracy": 0.5367580950260162, "num_tokens": 905786.0, "step": 30 }, { "epoch": 0.08102633355840648, "eval_loss": 2.2169125080108643, "eval_mean_token_accuracy": 0.538977221250534, "eval_num_tokens": 905786.0, "eval_runtime": 35.9569, "eval_samples_per_second": 5.368, "eval_steps_per_second": 0.695, "step": 30 }, { "epoch": 0.0837272113436867, "grad_norm": 34.76291275024414, "learning_rate": 9.944289093285142e-06, "loss": 39.8727, "mean_token_accuracy": 0.5054902620613575, "num_tokens": 943504.0, "step": 31 }, { "epoch": 0.08642808912896692, "grad_norm": 38.11785125732422, "learning_rate": 9.93794170659255e-06, "loss": 38.6825, "mean_token_accuracy": 0.4918428808450699, "num_tokens": 975301.0, "step": 32 }, { "epoch": 0.08912896691424713, "grad_norm": 34.85907745361328, "learning_rate": 9.93125446802512e-06, "loss": 30.3906, "mean_token_accuracy": 0.5750567205250263, "num_tokens": 1000967.0, "step": 33 }, { "epoch": 0.09182984469952735, "grad_norm": 40.81071472167969, "learning_rate": 9.924227889682989e-06, "loss": 36.6105, "mean_token_accuracy": 0.5123943611979485, "num_tokens": 1024795.0, "step": 34 }, { "epoch": 0.09453072248480757, "grad_norm": 35.881874084472656, "learning_rate": 9.9168625096525e-06, "loss": 38.7566, "mean_token_accuracy": 0.5056329034268856, "num_tokens": 1055588.0, "step": 35 }, { "epoch": 0.09723160027008777, "grad_norm": 36.923789978027344, "learning_rate": 9.909158891964996e-06, "loss": 34.4608, "mean_token_accuracy": 0.5372067280113697, "num_tokens": 1084319.0, "step": 36 }, { "epoch": 0.099932478055368, "grad_norm": 36.05925750732422, "learning_rate": 9.901117626553624e-06, "loss": 40.9943, "mean_token_accuracy": 0.49258575960993767, "num_tokens": 1116276.0, "step": 37 }, { "epoch": 0.10263335584064821, "grad_norm": 32.34959030151367, "learning_rate": 9.892739329208153e-06, "loss": 34.6149, "mean_token_accuracy": 0.5492901094257832, "num_tokens": 1146878.0, "step": 38 }, { "epoch": 0.10533423362592843, "grad_norm": 37.472251892089844, "learning_rate": 9.88402464152784e-06, "loss": 33.4951, "mean_token_accuracy": 0.5433274656534195, "num_tokens": 1171782.0, "step": 39 }, { "epoch": 0.10803511141120864, "grad_norm": 33.00364685058594, "learning_rate": 9.874974230872265e-06, "loss": 33.8645, "mean_token_accuracy": 0.5462823025882244, "num_tokens": 1201042.0, "step": 40 }, { "epoch": 0.10803511141120864, "eval_loss": 2.197446584701538, "eval_mean_token_accuracy": 0.541057585477829, "eval_num_tokens": 1201042.0, "eval_runtime": 36.2206, "eval_samples_per_second": 5.328, "eval_steps_per_second": 0.69, "step": 40 }, { "epoch": 0.11073598919648886, "grad_norm": 34.35288619995117, "learning_rate": 9.865588790310254e-06, "loss": 38.0077, "mean_token_accuracy": 0.5095454230904579, "num_tokens": 1236544.0, "step": 41 }, { "epoch": 0.11343686698176908, "grad_norm": 33.34933090209961, "learning_rate": 9.855869038566786e-06, "loss": 36.6855, "mean_token_accuracy": 0.5215939655900002, "num_tokens": 1266441.0, "step": 42 }, { "epoch": 0.11613774476704929, "grad_norm": 39.579246520996094, "learning_rate": 9.845815719967965e-06, "loss": 39.3874, "mean_token_accuracy": 0.4953847639262676, "num_tokens": 1290116.0, "step": 43 }, { "epoch": 0.1188386225523295, "grad_norm": 34.852294921875, "learning_rate": 9.835429604384015e-06, "loss": 34.1387, "mean_token_accuracy": 0.540221817791462, "num_tokens": 1316530.0, "step": 44 }, { "epoch": 0.12153950033760973, "grad_norm": 31.712799072265625, "learning_rate": 9.824711487170325e-06, "loss": 37.4883, "mean_token_accuracy": 0.5219819694757462, "num_tokens": 1353008.0, "step": 45 }, { "epoch": 0.12424037812288993, "grad_norm": 36.45340347290039, "learning_rate": 9.81366218910654e-06, "loss": 32.1224, "mean_token_accuracy": 0.5743309520184994, "num_tokens": 1382544.0, "step": 46 }, { "epoch": 0.12694125590817015, "grad_norm": 31.250629425048828, "learning_rate": 9.802282556333713e-06, "loss": 37.0191, "mean_token_accuracy": 0.514617059379816, "num_tokens": 1417132.0, "step": 47 }, { "epoch": 0.12964213369345037, "grad_norm": 33.27100372314453, "learning_rate": 9.790573460289505e-06, "loss": 39.471, "mean_token_accuracy": 0.49816175177693367, "num_tokens": 1450372.0, "step": 48 }, { "epoch": 0.1323430114787306, "grad_norm": 33.606319427490234, "learning_rate": 9.77853579764145e-06, "loss": 36.5512, "mean_token_accuracy": 0.5297598242759705, "num_tokens": 1486211.0, "step": 49 }, { "epoch": 0.1350438892640108, "grad_norm": 37.80785369873047, "learning_rate": 9.766170490218285e-06, "loss": 35.2075, "mean_token_accuracy": 0.5218806006014347, "num_tokens": 1513960.0, "step": 50 }, { "epoch": 0.1350438892640108, "eval_loss": 2.1853604316711426, "eval_mean_token_accuracy": 0.542732949256897, "eval_num_tokens": 1513960.0, "eval_runtime": 35.9988, "eval_samples_per_second": 5.361, "eval_steps_per_second": 0.694, "step": 50 }, { "epoch": 0.137744767049291, "grad_norm": 34.045265197753906, "learning_rate": 9.753478484939373e-06, "loss": 40.1782, "mean_token_accuracy": 0.4985695630311966, "num_tokens": 1545326.0, "step": 51 }, { "epoch": 0.14044564483457123, "grad_norm": 39.414608001708984, "learning_rate": 9.74046075374217e-06, "loss": 35.9163, "mean_token_accuracy": 0.5289335921406746, "num_tokens": 1569170.0, "step": 52 }, { "epoch": 0.14314652261985145, "grad_norm": 35.90230178833008, "learning_rate": 9.72711829350781e-06, "loss": 39.0823, "mean_token_accuracy": 0.49731169641017914, "num_tokens": 1595110.0, "step": 53 }, { "epoch": 0.14584740040513167, "grad_norm": 34.87423324584961, "learning_rate": 9.71345212598476e-06, "loss": 32.0765, "mean_token_accuracy": 0.5693522170186043, "num_tokens": 1620612.0, "step": 54 }, { "epoch": 0.1485482781904119, "grad_norm": 38.736793518066406, "learning_rate": 9.699463297710568e-06, "loss": 35.1758, "mean_token_accuracy": 0.5582655668258667, "num_tokens": 1644702.0, "step": 55 }, { "epoch": 0.1512491559756921, "grad_norm": 29.255271911621094, "learning_rate": 9.68515287993174e-06, "loss": 32.9944, "mean_token_accuracy": 0.5587775930762291, "num_tokens": 1687142.0, "step": 56 }, { "epoch": 0.15395003376097233, "grad_norm": 34.386444091796875, "learning_rate": 9.670521968521677e-06, "loss": 33.6106, "mean_token_accuracy": 0.5457474626600742, "num_tokens": 1715839.0, "step": 57 }, { "epoch": 0.15665091154625252, "grad_norm": 32.34107971191406, "learning_rate": 9.655571683896789e-06, "loss": 38.4568, "mean_token_accuracy": 0.5071151182055473, "num_tokens": 1751420.0, "step": 58 }, { "epoch": 0.15935178933153274, "grad_norm": 36.9901237487793, "learning_rate": 9.64030317093066e-06, "loss": 34.244, "mean_token_accuracy": 0.5432365871965885, "num_tokens": 1779635.0, "step": 59 }, { "epoch": 0.16205266711681296, "grad_norm": 33.886173248291016, "learning_rate": 9.624717598866405e-06, "loss": 37.6426, "mean_token_accuracy": 0.513043649494648, "num_tokens": 1813837.0, "step": 60 }, { "epoch": 0.16205266711681296, "eval_loss": 2.1764111518859863, "eval_mean_token_accuracy": 0.5435922384262085, "eval_num_tokens": 1813837.0, "eval_runtime": 36.167, "eval_samples_per_second": 5.336, "eval_steps_per_second": 0.691, "step": 60 }, { "epoch": 0.16475354490209318, "grad_norm": 33.836387634277344, "learning_rate": 9.608816161227105e-06, "loss": 35.2374, "mean_token_accuracy": 0.5427614711225033, "num_tokens": 1843297.0, "step": 61 }, { "epoch": 0.1674544226873734, "grad_norm": 37.59563064575195, "learning_rate": 9.59260007572443e-06, "loss": 30.4221, "mean_token_accuracy": 0.5771517679095268, "num_tokens": 1877362.0, "step": 62 }, { "epoch": 0.17015530047265362, "grad_norm": 35.81209945678711, "learning_rate": 9.576070584165379e-06, "loss": 34.8951, "mean_token_accuracy": 0.5191254131495953, "num_tokens": 1906069.0, "step": 63 }, { "epoch": 0.17285617825793384, "grad_norm": 36.29964065551758, "learning_rate": 9.559228952357174e-06, "loss": 33.9633, "mean_token_accuracy": 0.5233625434339046, "num_tokens": 1934070.0, "step": 64 }, { "epoch": 0.17555705604321403, "grad_norm": 31.55278968811035, "learning_rate": 9.542076470010353e-06, "loss": 30.8588, "mean_token_accuracy": 0.5758149437606335, "num_tokens": 1962537.0, "step": 65 }, { "epoch": 0.17825793382849425, "grad_norm": 33.825042724609375, "learning_rate": 9.524614450639975e-06, "loss": 34.2436, "mean_token_accuracy": 0.5403826721012592, "num_tokens": 1998232.0, "step": 66 }, { "epoch": 0.18095881161377447, "grad_norm": 39.315399169921875, "learning_rate": 9.50684423146505e-06, "loss": 35.1627, "mean_token_accuracy": 0.5102917179465294, "num_tokens": 2025844.0, "step": 67 }, { "epoch": 0.1836596893990547, "grad_norm": 35.144981384277344, "learning_rate": 9.488767173306139e-06, "loss": 35.4336, "mean_token_accuracy": 0.5367797575891018, "num_tokens": 2052200.0, "step": 68 }, { "epoch": 0.1863605671843349, "grad_norm": 41.185211181640625, "learning_rate": 9.470384660481128e-06, "loss": 37.7336, "mean_token_accuracy": 0.5066433399915695, "num_tokens": 2082792.0, "step": 69 }, { "epoch": 0.18906144496961513, "grad_norm": 31.638525009155273, "learning_rate": 9.451698100699242e-06, "loss": 34.8277, "mean_token_accuracy": 0.5439408719539642, "num_tokens": 2117297.0, "step": 70 }, { "epoch": 0.18906144496961513, "eval_loss": 2.170250415802002, "eval_mean_token_accuracy": 0.5445958733558655, "eval_num_tokens": 2117297.0, "eval_runtime": 36.0291, "eval_samples_per_second": 5.357, "eval_steps_per_second": 0.694, "step": 70 }, { "epoch": 0.19176232275489535, "grad_norm": 33.62677001953125, "learning_rate": 9.432708924953216e-06, "loss": 34.3854, "mean_token_accuracy": 0.566694688051939, "num_tokens": 2148131.0, "step": 71 }, { "epoch": 0.19446320054017555, "grad_norm": 36.49077606201172, "learning_rate": 9.41341858740974e-06, "loss": 34.7133, "mean_token_accuracy": 0.5199744664132595, "num_tokens": 2173221.0, "step": 72 }, { "epoch": 0.19716407832545577, "grad_norm": 37.14122772216797, "learning_rate": 9.393828565298081e-06, "loss": 35.3098, "mean_token_accuracy": 0.5229644030332565, "num_tokens": 2209764.0, "step": 73 }, { "epoch": 0.199864956110736, "grad_norm": 39.197479248046875, "learning_rate": 9.373940358796968e-06, "loss": 37.8829, "mean_token_accuracy": 0.4989491365849972, "num_tokens": 2236361.0, "step": 74 }, { "epoch": 0.2025658338960162, "grad_norm": 37.38964080810547, "learning_rate": 9.353755490919702e-06, "loss": 35.9907, "mean_token_accuracy": 0.5204923562705517, "num_tokens": 2269694.0, "step": 75 }, { "epoch": 0.20526671168129643, "grad_norm": 36.209815979003906, "learning_rate": 9.33327550739754e-06, "loss": 35.1031, "mean_token_accuracy": 0.5467646308243275, "num_tokens": 2305437.0, "step": 76 }, { "epoch": 0.20796758946657665, "grad_norm": 37.16224670410156, "learning_rate": 9.31250197656131e-06, "loss": 36.3165, "mean_token_accuracy": 0.5429669097065926, "num_tokens": 2336565.0, "step": 77 }, { "epoch": 0.21066846725185687, "grad_norm": 41.85482406616211, "learning_rate": 9.291436489221316e-06, "loss": 36.3102, "mean_token_accuracy": 0.530618391931057, "num_tokens": 2366292.0, "step": 78 }, { "epoch": 0.21336934503713706, "grad_norm": 33.55143356323242, "learning_rate": 9.27008065854552e-06, "loss": 37.1768, "mean_token_accuracy": 0.5360248871147633, "num_tokens": 2394155.0, "step": 79 }, { "epoch": 0.21607022282241728, "grad_norm": 37.05324935913086, "learning_rate": 9.248436119935996e-06, "loss": 31.4134, "mean_token_accuracy": 0.5698137208819389, "num_tokens": 2419744.0, "step": 80 }, { "epoch": 0.21607022282241728, "eval_loss": 2.1641621589660645, "eval_mean_token_accuracy": 0.5452725517749787, "eval_num_tokens": 2419744.0, "eval_runtime": 36.1675, "eval_samples_per_second": 5.336, "eval_steps_per_second": 0.691, "step": 80 }, { "epoch": 0.2187711006076975, "grad_norm": 35.2697868347168, "learning_rate": 9.22650453090371e-06, "loss": 33.1763, "mean_token_accuracy": 0.552621815353632, "num_tokens": 2450738.0, "step": 81 }, { "epoch": 0.22147197839297772, "grad_norm": 33.75497055053711, "learning_rate": 9.204287570941578e-06, "loss": 35.7061, "mean_token_accuracy": 0.5318405367434025, "num_tokens": 2476596.0, "step": 82 }, { "epoch": 0.22417285617825794, "grad_norm": 35.982200622558594, "learning_rate": 9.181786941395849e-06, "loss": 40.0557, "mean_token_accuracy": 0.5097162947058678, "num_tokens": 2512914.0, "step": 83 }, { "epoch": 0.22687373396353816, "grad_norm": 36.56319808959961, "learning_rate": 9.15900436533583e-06, "loss": 36.9495, "mean_token_accuracy": 0.521889578551054, "num_tokens": 2542829.0, "step": 84 }, { "epoch": 0.22957461174881835, "grad_norm": 35.2000732421875, "learning_rate": 9.135941587421932e-06, "loss": 33.7668, "mean_token_accuracy": 0.5471107661724091, "num_tokens": 2569827.0, "step": 85 }, { "epoch": 0.23227548953409857, "grad_norm": 41.712135314941406, "learning_rate": 9.112600373772056e-06, "loss": 29.314, "mean_token_accuracy": 0.585636280477047, "num_tokens": 2590572.0, "step": 86 }, { "epoch": 0.2349763673193788, "grad_norm": 40.713172912597656, "learning_rate": 9.088982511826365e-06, "loss": 35.9162, "mean_token_accuracy": 0.538209792226553, "num_tokens": 2621266.0, "step": 87 }, { "epoch": 0.237677245104659, "grad_norm": 35.17522430419922, "learning_rate": 9.065089810210381e-06, "loss": 35.8204, "mean_token_accuracy": 0.5450333692133427, "num_tokens": 2656561.0, "step": 88 }, { "epoch": 0.24037812288993923, "grad_norm": 39.51090621948242, "learning_rate": 9.040924098596504e-06, "loss": 33.3189, "mean_token_accuracy": 0.5844779461622238, "num_tokens": 2691268.0, "step": 89 }, { "epoch": 0.24307900067521945, "grad_norm": 36.341941833496094, "learning_rate": 9.016487227563885e-06, "loss": 34.5514, "mean_token_accuracy": 0.5403696186840534, "num_tokens": 2715002.0, "step": 90 }, { "epoch": 0.24307900067521945, "eval_loss": 2.1585216522216797, "eval_mean_token_accuracy": 0.5465755295753479, "eval_num_tokens": 2715002.0, "eval_runtime": 36.3583, "eval_samples_per_second": 5.308, "eval_steps_per_second": 0.688, "step": 90 }, { "epoch": 0.24577987846049967, "grad_norm": 34.934417724609375, "learning_rate": 8.991781068456715e-06, "loss": 34.7165, "mean_token_accuracy": 0.5262048728764057, "num_tokens": 2741874.0, "step": 91 }, { "epoch": 0.24848075624577987, "grad_norm": 36.20458984375, "learning_rate": 8.966807513240921e-06, "loss": 36.3895, "mean_token_accuracy": 0.5340331271290779, "num_tokens": 2774701.0, "step": 92 }, { "epoch": 0.2511816340310601, "grad_norm": 36.3608512878418, "learning_rate": 8.94156847435928e-06, "loss": 33.8819, "mean_token_accuracy": 0.5468890182673931, "num_tokens": 2803595.0, "step": 93 }, { "epoch": 0.2538825118163403, "grad_norm": 34.25827407836914, "learning_rate": 8.916065884584969e-06, "loss": 34.5669, "mean_token_accuracy": 0.5531870052218437, "num_tokens": 2829228.0, "step": 94 }, { "epoch": 0.2565833896016205, "grad_norm": 35.43453598022461, "learning_rate": 8.89030169687355e-06, "loss": 35.7638, "mean_token_accuracy": 0.5375570505857468, "num_tokens": 2862333.0, "step": 95 }, { "epoch": 0.25928426738690075, "grad_norm": 34.045860290527344, "learning_rate": 8.864277884213419e-06, "loss": 33.9517, "mean_token_accuracy": 0.548635296523571, "num_tokens": 2893596.0, "step": 96 }, { "epoch": 0.26198514517218097, "grad_norm": 36.67363739013672, "learning_rate": 8.837996439474722e-06, "loss": 37.9136, "mean_token_accuracy": 0.5133212320506573, "num_tokens": 2921678.0, "step": 97 }, { "epoch": 0.2646860229574612, "grad_norm": 35.61439514160156, "learning_rate": 8.811459375256734e-06, "loss": 35.9335, "mean_token_accuracy": 0.515679482370615, "num_tokens": 2946413.0, "step": 98 }, { "epoch": 0.2673869007427414, "grad_norm": 30.285968780517578, "learning_rate": 8.784668723733744e-06, "loss": 34.6207, "mean_token_accuracy": 0.5512839630246162, "num_tokens": 2983163.0, "step": 99 }, { "epoch": 0.2700877785280216, "grad_norm": 30.727275848388672, "learning_rate": 8.757626536499427e-06, "loss": 37.5607, "mean_token_accuracy": 0.519566722214222, "num_tokens": 3018566.0, "step": 100 }, { "epoch": 0.2700877785280216, "eval_loss": 2.1530356407165527, "eval_mean_token_accuracy": 0.5469204211235046, "eval_num_tokens": 3018566.0, "eval_runtime": 35.7427, "eval_samples_per_second": 5.4, "eval_steps_per_second": 0.699, "step": 100 }, { "epoch": 0.27278865631330185, "grad_norm": 29.972339630126953, "learning_rate": 8.730334884409746e-06, "loss": 33.4215, "mean_token_accuracy": 0.5411897301673889, "num_tokens": 3051443.0, "step": 101 }, { "epoch": 0.275489534098582, "grad_norm": 37.05634689331055, "learning_rate": 8.702795857424358e-06, "loss": 34.3702, "mean_token_accuracy": 0.5186247676610947, "num_tokens": 3080727.0, "step": 102 }, { "epoch": 0.27819041188386223, "grad_norm": 34.092227935791016, "learning_rate": 8.675011564446572e-06, "loss": 36.106, "mean_token_accuracy": 0.5261654853820801, "num_tokens": 3110937.0, "step": 103 }, { "epoch": 0.28089128966914245, "grad_norm": 33.42031478881836, "learning_rate": 8.646984133161851e-06, "loss": 36.901, "mean_token_accuracy": 0.5126399099826813, "num_tokens": 3140898.0, "step": 104 }, { "epoch": 0.2835921674544227, "grad_norm": 34.97120666503906, "learning_rate": 8.61871570987488e-06, "loss": 29.9555, "mean_token_accuracy": 0.5955347269773483, "num_tokens": 3173196.0, "step": 105 }, { "epoch": 0.2862930452397029, "grad_norm": 34.68533706665039, "learning_rate": 8.5902084593452e-06, "loss": 32.6639, "mean_token_accuracy": 0.546476673334837, "num_tokens": 3202833.0, "step": 106 }, { "epoch": 0.2889939230249831, "grad_norm": 36.51945495605469, "learning_rate": 8.561464564621433e-06, "loss": 36.5502, "mean_token_accuracy": 0.5163783058524132, "num_tokens": 3229103.0, "step": 107 }, { "epoch": 0.29169480081026333, "grad_norm": 37.32801818847656, "learning_rate": 8.532486226874114e-06, "loss": 33.6549, "mean_token_accuracy": 0.5386885888874531, "num_tokens": 3253862.0, "step": 108 }, { "epoch": 0.29439567859554355, "grad_norm": 33.45566940307617, "learning_rate": 8.503275665227126e-06, "loss": 40.3254, "mean_token_accuracy": 0.4978017285466194, "num_tokens": 3284178.0, "step": 109 }, { "epoch": 0.2970965563808238, "grad_norm": 35.25465393066406, "learning_rate": 8.473835116587749e-06, "loss": 33.03, "mean_token_accuracy": 0.5586917027831078, "num_tokens": 3310874.0, "step": 110 }, { "epoch": 0.2970965563808238, "eval_loss": 2.147336006164551, "eval_mean_token_accuracy": 0.5475206339359283, "eval_num_tokens": 3310874.0, "eval_runtime": 36.2618, "eval_samples_per_second": 5.322, "eval_steps_per_second": 0.689, "step": 110 }, { "epoch": 0.299797434166104, "grad_norm": 33.06233596801758, "learning_rate": 8.444166835475379e-06, "loss": 39.3369, "mean_token_accuracy": 0.49359146505594254, "num_tokens": 3343475.0, "step": 111 }, { "epoch": 0.3024983119513842, "grad_norm": 29.681777954101562, "learning_rate": 8.414273093848875e-06, "loss": 33.0762, "mean_token_accuracy": 0.5716694034636021, "num_tokens": 3376722.0, "step": 112 }, { "epoch": 0.30519918973666443, "grad_norm": 36.85416793823242, "learning_rate": 8.384156180932566e-06, "loss": 33.9501, "mean_token_accuracy": 0.5419654734432697, "num_tokens": 3407235.0, "step": 113 }, { "epoch": 0.30790006752194465, "grad_norm": 35.34343338012695, "learning_rate": 8.353818403040954e-06, "loss": 33.7944, "mean_token_accuracy": 0.5648217089474201, "num_tokens": 3440419.0, "step": 114 }, { "epoch": 0.3106009453072249, "grad_norm": 33.71561050415039, "learning_rate": 8.323262083402101e-06, "loss": 34.8947, "mean_token_accuracy": 0.5803968720138073, "num_tokens": 3475672.0, "step": 115 }, { "epoch": 0.31330182309250504, "grad_norm": 30.479001998901367, "learning_rate": 8.292489561979707e-06, "loss": 36.8288, "mean_token_accuracy": 0.5175724886357784, "num_tokens": 3508728.0, "step": 116 }, { "epoch": 0.31600270087778526, "grad_norm": 37.75738525390625, "learning_rate": 8.261503195293934e-06, "loss": 36.0005, "mean_token_accuracy": 0.5403750203549862, "num_tokens": 3543974.0, "step": 117 }, { "epoch": 0.3187035786630655, "grad_norm": 34.52747344970703, "learning_rate": 8.230305356240936e-06, "loss": 30.8057, "mean_token_accuracy": 0.5758753754198551, "num_tokens": 3573337.0, "step": 118 }, { "epoch": 0.3214044564483457, "grad_norm": 32.336307525634766, "learning_rate": 8.198898433911155e-06, "loss": 33.8297, "mean_token_accuracy": 0.5259153321385384, "num_tokens": 3602008.0, "step": 119 }, { "epoch": 0.3241053342336259, "grad_norm": 36.36445617675781, "learning_rate": 8.16728483340635e-06, "loss": 35.2108, "mean_token_accuracy": 0.541402067989111, "num_tokens": 3629931.0, "step": 120 }, { "epoch": 0.3241053342336259, "eval_loss": 2.142936944961548, "eval_mean_token_accuracy": 0.5485082387924194, "eval_num_tokens": 3629931.0, "eval_runtime": 36.1169, "eval_samples_per_second": 5.344, "eval_steps_per_second": 0.692, "step": 120 }, { "epoch": 0.32680621201890614, "grad_norm": 34.380401611328125, "learning_rate": 8.135466975655443e-06, "loss": 33.8549, "mean_token_accuracy": 0.5417294539511204, "num_tokens": 3656934.0, "step": 121 }, { "epoch": 0.32950708980418636, "grad_norm": 28.74480438232422, "learning_rate": 8.103447297229102e-06, "loss": 37.8196, "mean_token_accuracy": 0.5115389674901962, "num_tokens": 3695154.0, "step": 122 }, { "epoch": 0.3322079675894666, "grad_norm": 28.74443817138672, "learning_rate": 8.071228250153171e-06, "loss": 36.9908, "mean_token_accuracy": 0.5405142866075039, "num_tokens": 3735717.0, "step": 123 }, { "epoch": 0.3349088453747468, "grad_norm": 35.23930358886719, "learning_rate": 8.038812301720884e-06, "loss": 31.9768, "mean_token_accuracy": 0.559939332306385, "num_tokens": 3758974.0, "step": 124 }, { "epoch": 0.337609723160027, "grad_norm": 28.688379287719727, "learning_rate": 8.00620193430393e-06, "loss": 33.6419, "mean_token_accuracy": 0.5499699637293816, "num_tokens": 3794373.0, "step": 125 }, { "epoch": 0.34031060094530724, "grad_norm": 34.50121307373047, "learning_rate": 7.973399645162356e-06, "loss": 35.1553, "mean_token_accuracy": 0.5216336958110332, "num_tokens": 3835044.0, "step": 126 }, { "epoch": 0.34301147873058746, "grad_norm": 37.575050354003906, "learning_rate": 7.940407946253325e-06, "loss": 32.6452, "mean_token_accuracy": 0.5582419410347939, "num_tokens": 3867527.0, "step": 127 }, { "epoch": 0.3457123565158677, "grad_norm": 33.72121047973633, "learning_rate": 7.907229364038757e-06, "loss": 32.687, "mean_token_accuracy": 0.5616066604852676, "num_tokens": 3901128.0, "step": 128 }, { "epoch": 0.3484132343011479, "grad_norm": 33.48974609375, "learning_rate": 7.873866439291855e-06, "loss": 36.6975, "mean_token_accuracy": 0.5192348062992096, "num_tokens": 3932725.0, "step": 129 }, { "epoch": 0.35111411208642807, "grad_norm": 34.740665435791016, "learning_rate": 7.840321726902541e-06, "loss": 33.6712, "mean_token_accuracy": 0.5390357933938503, "num_tokens": 3965466.0, "step": 130 }, { "epoch": 0.35111411208642807, "eval_loss": 2.1384716033935547, "eval_mean_token_accuracy": 0.5488432574272156, "eval_num_tokens": 3965466.0, "eval_runtime": 37.399, "eval_samples_per_second": 5.161, "eval_steps_per_second": 0.668, "step": 130 }, { "epoch": 0.3538149898717083, "grad_norm": 37.99197006225586, "learning_rate": 7.806597795681796e-06, "loss": 33.0943, "mean_token_accuracy": 0.5497070737183094, "num_tokens": 3992396.0, "step": 131 }, { "epoch": 0.3565158676569885, "grad_norm": 34.24283218383789, "learning_rate": 7.772697228164951e-06, "loss": 33.6122, "mean_token_accuracy": 0.5541913397610188, "num_tokens": 4030485.0, "step": 132 }, { "epoch": 0.3592167454422687, "grad_norm": 36.40911102294922, "learning_rate": 7.738622620413916e-06, "loss": 36.3135, "mean_token_accuracy": 0.51901014149189, "num_tokens": 4056899.0, "step": 133 }, { "epoch": 0.36191762322754895, "grad_norm": 39.211021423339844, "learning_rate": 7.70437658181838e-06, "loss": 30.7125, "mean_token_accuracy": 0.5674076676368713, "num_tokens": 4082420.0, "step": 134 }, { "epoch": 0.36461850101282917, "grad_norm": 34.15367889404297, "learning_rate": 7.66996173489599e-06, "loss": 35.0459, "mean_token_accuracy": 0.5370287075638771, "num_tokens": 4111214.0, "step": 135 }, { "epoch": 0.3673193787981094, "grad_norm": 43.82987976074219, "learning_rate": 7.635380715091504e-06, "loss": 32.0111, "mean_token_accuracy": 0.5593460202217102, "num_tokens": 4139608.0, "step": 136 }, { "epoch": 0.3700202565833896, "grad_norm": 36.09122848510742, "learning_rate": 7.6006361705750035e-06, "loss": 38.0412, "mean_token_accuracy": 0.5068509392440319, "num_tokens": 4169141.0, "step": 137 }, { "epoch": 0.3727211343686698, "grad_norm": 40.09010696411133, "learning_rate": 7.565730762039072e-06, "loss": 36.6723, "mean_token_accuracy": 0.5209071189165115, "num_tokens": 4195527.0, "step": 138 }, { "epoch": 0.37542201215395005, "grad_norm": 34.21112060546875, "learning_rate": 7.530667162495054e-06, "loss": 35.6671, "mean_token_accuracy": 0.5345174558460712, "num_tokens": 4225345.0, "step": 139 }, { "epoch": 0.37812288993923027, "grad_norm": 33.34288787841797, "learning_rate": 7.495448057068361e-06, "loss": 35.1221, "mean_token_accuracy": 0.5357744507491589, "num_tokens": 4252036.0, "step": 140 }, { "epoch": 0.37812288993923027, "eval_loss": 2.1338346004486084, "eval_mean_token_accuracy": 0.5494685709476471, "eval_num_tokens": 4252036.0, "eval_runtime": 36.2414, "eval_samples_per_second": 5.325, "eval_steps_per_second": 0.69, "step": 140 }, { "epoch": 0.3808237677245105, "grad_norm": 31.513835906982422, "learning_rate": 7.460076142792842e-06, "loss": 37.2946, "mean_token_accuracy": 0.5160485915839672, "num_tokens": 4284827.0, "step": 141 }, { "epoch": 0.3835246455097907, "grad_norm": 38.01262283325195, "learning_rate": 7.424554128404253e-06, "loss": 32.7539, "mean_token_accuracy": 0.5608501322567463, "num_tokens": 4310382.0, "step": 142 }, { "epoch": 0.38622552329507087, "grad_norm": 33.40518569946289, "learning_rate": 7.388884734132825e-06, "loss": 37.2924, "mean_token_accuracy": 0.524672869592905, "num_tokens": 4336622.0, "step": 143 }, { "epoch": 0.3889264010803511, "grad_norm": 33.23485565185547, "learning_rate": 7.353070691494949e-06, "loss": 33.0679, "mean_token_accuracy": 0.5497001633048058, "num_tokens": 4364603.0, "step": 144 }, { "epoch": 0.3916272788656313, "grad_norm": 32.9569206237793, "learning_rate": 7.317114743084004e-06, "loss": 36.8441, "mean_token_accuracy": 0.5169048607349396, "num_tokens": 4400904.0, "step": 145 }, { "epoch": 0.39432815665091153, "grad_norm": 31.794025421142578, "learning_rate": 7.281019642360326e-06, "loss": 36.3152, "mean_token_accuracy": 0.5194016620516777, "num_tokens": 4430111.0, "step": 146 }, { "epoch": 0.39702903443619175, "grad_norm": 32.69416809082031, "learning_rate": 7.244788153440365e-06, "loss": 33.9435, "mean_token_accuracy": 0.5554650984704494, "num_tokens": 4458443.0, "step": 147 }, { "epoch": 0.399729912221472, "grad_norm": 34.474151611328125, "learning_rate": 7.208423050884996e-06, "loss": 32.2953, "mean_token_accuracy": 0.5748496502637863, "num_tokens": 4485706.0, "step": 148 }, { "epoch": 0.4024307900067522, "grad_norm": 32.290000915527344, "learning_rate": 7.171927119487059e-06, "loss": 34.8319, "mean_token_accuracy": 0.5433229058980942, "num_tokens": 4518550.0, "step": 149 }, { "epoch": 0.4051316677920324, "grad_norm": 31.469343185424805, "learning_rate": 7.135303154058094e-06, "loss": 35.5133, "mean_token_accuracy": 0.5323488153517246, "num_tokens": 4549116.0, "step": 150 }, { "epoch": 0.4051316677920324, "eval_loss": 2.1300270557403564, "eval_mean_token_accuracy": 0.5498071122169494, "eval_num_tokens": 4549116.0, "eval_runtime": 36.4718, "eval_samples_per_second": 5.292, "eval_steps_per_second": 0.685, "step": 150 }, { "epoch": 0.40783254557731263, "grad_norm": 35.01815414428711, "learning_rate": 7.0985539592143295e-06, "loss": 33.8798, "mean_token_accuracy": 0.5323333479464054, "num_tokens": 4577809.0, "step": 151 }, { "epoch": 0.41053342336259285, "grad_norm": 37.759735107421875, "learning_rate": 7.061682349161898e-06, "loss": 34.0646, "mean_token_accuracy": 0.5341670103371143, "num_tokens": 4606190.0, "step": 152 }, { "epoch": 0.4132343011478731, "grad_norm": 40.53055953979492, "learning_rate": 7.024691147481328e-06, "loss": 32.3435, "mean_token_accuracy": 0.5441778153181076, "num_tokens": 4630728.0, "step": 153 }, { "epoch": 0.4159351789331533, "grad_norm": 31.96356964111328, "learning_rate": 6.987583186911327e-06, "loss": 35.9188, "mean_token_accuracy": 0.5264074876904488, "num_tokens": 4662606.0, "step": 154 }, { "epoch": 0.4186360567184335, "grad_norm": 33.261566162109375, "learning_rate": 6.950361309131848e-06, "loss": 30.8171, "mean_token_accuracy": 0.5653699412941933, "num_tokens": 4695737.0, "step": 155 }, { "epoch": 0.42133693450371373, "grad_norm": 33.04492950439453, "learning_rate": 6.9130283645464715e-06, "loss": 30.9808, "mean_token_accuracy": 0.5676438994705677, "num_tokens": 4723926.0, "step": 156 }, { "epoch": 0.4240378122889939, "grad_norm": 35.770809173583984, "learning_rate": 6.875587212064138e-06, "loss": 35.4103, "mean_token_accuracy": 0.5413705073297024, "num_tokens": 4748289.0, "step": 157 }, { "epoch": 0.4267386900742741, "grad_norm": 38.495601654052734, "learning_rate": 6.838040718880205e-06, "loss": 35.1233, "mean_token_accuracy": 0.5298267938196659, "num_tokens": 4771440.0, "step": 158 }, { "epoch": 0.42943956785955434, "grad_norm": 35.45297622680664, "learning_rate": 6.800391760256889e-06, "loss": 37.5557, "mean_token_accuracy": 0.5189447142183781, "num_tokens": 4800138.0, "step": 159 }, { "epoch": 0.43214044564483456, "grad_norm": 34.43162155151367, "learning_rate": 6.762643219303079e-06, "loss": 34.2271, "mean_token_accuracy": 0.5525719411671162, "num_tokens": 4833882.0, "step": 160 }, { "epoch": 0.43214044564483456, "eval_loss": 2.1280150413513184, "eval_mean_token_accuracy": 0.5499992990493774, "eval_num_tokens": 4833882.0, "eval_runtime": 36.7383, "eval_samples_per_second": 5.253, "eval_steps_per_second": 0.68, "step": 160 }, { "epoch": 0.4348413234301148, "grad_norm": 38.99521255493164, "learning_rate": 6.724797986753544e-06, "loss": 29.5404, "mean_token_accuracy": 0.5754740871489048, "num_tokens": 4871976.0, "step": 161 }, { "epoch": 0.437542201215395, "grad_norm": 32.28839111328125, "learning_rate": 6.686858960747581e-06, "loss": 32.8044, "mean_token_accuracy": 0.5634505487978458, "num_tokens": 4907192.0, "step": 162 }, { "epoch": 0.4402430790006752, "grad_norm": 37.342369079589844, "learning_rate": 6.64882904660706e-06, "loss": 31.9463, "mean_token_accuracy": 0.5631079636514187, "num_tokens": 4933884.0, "step": 163 }, { "epoch": 0.44294395678595544, "grad_norm": 30.216907501220703, "learning_rate": 6.610711156613956e-06, "loss": 31.5662, "mean_token_accuracy": 0.5566225573420525, "num_tokens": 4966876.0, "step": 164 }, { "epoch": 0.44564483457123566, "grad_norm": 40.20173263549805, "learning_rate": 6.572508209787316e-06, "loss": 33.3505, "mean_token_accuracy": 0.5285554938018322, "num_tokens": 4994687.0, "step": 165 }, { "epoch": 0.4483457123565159, "grad_norm": 34.483619689941406, "learning_rate": 6.5342231316597305e-06, "loss": 34.1092, "mean_token_accuracy": 0.5458228774368763, "num_tokens": 5030299.0, "step": 166 }, { "epoch": 0.4510465901417961, "grad_norm": 36.758453369140625, "learning_rate": 6.495858854053303e-06, "loss": 33.5874, "mean_token_accuracy": 0.5403504222631454, "num_tokens": 5061349.0, "step": 167 }, { "epoch": 0.4537474679270763, "grad_norm": 36.65058517456055, "learning_rate": 6.457418314855131e-06, "loss": 33.6991, "mean_token_accuracy": 0.5556437149643898, "num_tokens": 5089978.0, "step": 168 }, { "epoch": 0.45644834571235654, "grad_norm": 31.584121704101562, "learning_rate": 6.418904457792319e-06, "loss": 31.5041, "mean_token_accuracy": 0.5657464414834976, "num_tokens": 5119725.0, "step": 169 }, { "epoch": 0.4591492234976367, "grad_norm": 35.44051742553711, "learning_rate": 6.3803202322065696e-06, "loss": 36.2456, "mean_token_accuracy": 0.5201266296207905, "num_tokens": 5146599.0, "step": 170 }, { "epoch": 0.4591492234976367, "eval_loss": 2.1237289905548096, "eval_mean_token_accuracy": 0.5506399416923523, "eval_num_tokens": 5146599.0, "eval_runtime": 36.1534, "eval_samples_per_second": 5.338, "eval_steps_per_second": 0.691, "step": 170 }, { "epoch": 0.4618501012829169, "grad_norm": 33.61663055419922, "learning_rate": 6.341668592828312e-06, "loss": 37.3738, "mean_token_accuracy": 0.5134607069194317, "num_tokens": 5175358.0, "step": 171 }, { "epoch": 0.46455097906819715, "grad_norm": 33.51021957397461, "learning_rate": 6.302952499550437e-06, "loss": 36.7182, "mean_token_accuracy": 0.5284128822386265, "num_tokens": 5207993.0, "step": 172 }, { "epoch": 0.46725185685347737, "grad_norm": 37.06759262084961, "learning_rate": 6.264174917201632e-06, "loss": 32.3966, "mean_token_accuracy": 0.560367189347744, "num_tokens": 5236975.0, "step": 173 }, { "epoch": 0.4699527346387576, "grad_norm": 36.46392059326172, "learning_rate": 6.2253388153193376e-06, "loss": 33.2234, "mean_token_accuracy": 0.552173301577568, "num_tokens": 5259686.0, "step": 174 }, { "epoch": 0.4726536124240378, "grad_norm": 36.44706344604492, "learning_rate": 6.186447167922349e-06, "loss": 30.8265, "mean_token_accuracy": 0.5908371061086655, "num_tokens": 5282094.0, "step": 175 }, { "epoch": 0.475354490209318, "grad_norm": 34.210418701171875, "learning_rate": 6.147502953283064e-06, "loss": 37.5325, "mean_token_accuracy": 0.517979085445404, "num_tokens": 5312683.0, "step": 176 }, { "epoch": 0.47805536799459825, "grad_norm": 32.863704681396484, "learning_rate": 6.1085091536994075e-06, "loss": 31.7282, "mean_token_accuracy": 0.5739234499633312, "num_tokens": 5340445.0, "step": 177 }, { "epoch": 0.48075624577987847, "grad_norm": 35.57159423828125, "learning_rate": 6.0694687552664625e-06, "loss": 36.3493, "mean_token_accuracy": 0.539178866893053, "num_tokens": 5369151.0, "step": 178 }, { "epoch": 0.4834571235651587, "grad_norm": 30.459449768066406, "learning_rate": 6.030384747647786e-06, "loss": 29.8327, "mean_token_accuracy": 0.5987524092197418, "num_tokens": 5400705.0, "step": 179 }, { "epoch": 0.4861580013504389, "grad_norm": 33.08839416503906, "learning_rate": 5.9912601238464765e-06, "loss": 33.7659, "mean_token_accuracy": 0.5521523095667362, "num_tokens": 5428339.0, "step": 180 }, { "epoch": 0.4861580013504389, "eval_loss": 2.1207644939422607, "eval_mean_token_accuracy": 0.5510402059555054, "eval_num_tokens": 5428339.0, "eval_runtime": 36.036, "eval_samples_per_second": 5.356, "eval_steps_per_second": 0.694, "step": 180 }, { "epoch": 0.4888588791357191, "grad_norm": 34.80035400390625, "learning_rate": 5.952097879975965e-06, "loss": 35.1741, "mean_token_accuracy": 0.5513673648238182, "num_tokens": 5455447.0, "step": 181 }, { "epoch": 0.49155975692099935, "grad_norm": 35.057674407958984, "learning_rate": 5.912901015030575e-06, "loss": 35.3602, "mean_token_accuracy": 0.5500445552170277, "num_tokens": 5486985.0, "step": 182 }, { "epoch": 0.49426063470627957, "grad_norm": 33.302162170410156, "learning_rate": 5.873672530655874e-06, "loss": 33.5979, "mean_token_accuracy": 0.5720045119524002, "num_tokens": 5520594.0, "step": 183 }, { "epoch": 0.49696151249155973, "grad_norm": 32.105648040771484, "learning_rate": 5.834415430918802e-06, "loss": 31.3504, "mean_token_accuracy": 0.5655222907662392, "num_tokens": 5550979.0, "step": 184 }, { "epoch": 0.49966239027683995, "grad_norm": 36.847652435302734, "learning_rate": 5.795132722077625e-06, "loss": 33.2057, "mean_token_accuracy": 0.5620989352464676, "num_tokens": 5581925.0, "step": 185 }, { "epoch": 0.5023632680621202, "grad_norm": 37.06019592285156, "learning_rate": 5.755827412351724e-06, "loss": 33.1479, "mean_token_accuracy": 0.557017132639885, "num_tokens": 5610615.0, "step": 186 }, { "epoch": 0.5050641458474004, "grad_norm": 34.89840316772461, "learning_rate": 5.716502511691224e-06, "loss": 35.6245, "mean_token_accuracy": 0.5443479046225548, "num_tokens": 5644856.0, "step": 187 }, { "epoch": 0.5077650236326806, "grad_norm": 30.693859100341797, "learning_rate": 5.677161031546502e-06, "loss": 32.7316, "mean_token_accuracy": 0.5558462180197239, "num_tokens": 5678322.0, "step": 188 }, { "epoch": 0.5104659014179609, "grad_norm": 33.81172561645508, "learning_rate": 5.6378059846375695e-06, "loss": 33.786, "mean_token_accuracy": 0.5528871193528175, "num_tokens": 5706038.0, "step": 189 }, { "epoch": 0.513166779203241, "grad_norm": 33.86280822753906, "learning_rate": 5.598440384723359e-06, "loss": 33.4285, "mean_token_accuracy": 0.5394289828836918, "num_tokens": 5738386.0, "step": 190 }, { "epoch": 0.513166779203241, "eval_loss": 2.117305040359497, "eval_mean_token_accuracy": 0.5516872084140778, "eval_num_tokens": 5738386.0, "eval_runtime": 35.9063, "eval_samples_per_second": 5.375, "eval_steps_per_second": 0.696, "step": 190 }, { "epoch": 0.5158676569885212, "grad_norm": 31.8300724029541, "learning_rate": 5.559067246370946e-06, "loss": 34.3171, "mean_token_accuracy": 0.547035563737154, "num_tokens": 5775397.0, "step": 191 }, { "epoch": 0.5185685347738015, "grad_norm": 33.112220764160156, "learning_rate": 5.5196895847246835e-06, "loss": 37.0536, "mean_token_accuracy": 0.5280914977192879, "num_tokens": 5805124.0, "step": 192 }, { "epoch": 0.5212694125590817, "grad_norm": 33.582523345947266, "learning_rate": 5.480310415275317e-06, "loss": 31.8587, "mean_token_accuracy": 0.576672401279211, "num_tokens": 5834371.0, "step": 193 }, { "epoch": 0.5239702903443619, "grad_norm": 33.5911979675293, "learning_rate": 5.440932753629055e-06, "loss": 36.4298, "mean_token_accuracy": 0.5219374448060989, "num_tokens": 5868814.0, "step": 194 }, { "epoch": 0.5266711681296421, "grad_norm": 38.97636032104492, "learning_rate": 5.4015596152766425e-06, "loss": 33.012, "mean_token_accuracy": 0.5398210883140564, "num_tokens": 5892035.0, "step": 195 }, { "epoch": 0.5293720459149224, "grad_norm": 34.776519775390625, "learning_rate": 5.3621940153624345e-06, "loss": 30.8443, "mean_token_accuracy": 0.5654129348695278, "num_tokens": 5920208.0, "step": 196 }, { "epoch": 0.5320729237002025, "grad_norm": 32.7991943359375, "learning_rate": 5.322838968453499e-06, "loss": 32.5586, "mean_token_accuracy": 0.558869332075119, "num_tokens": 5949134.0, "step": 197 }, { "epoch": 0.5347738014854828, "grad_norm": 33.91230392456055, "learning_rate": 5.283497488308778e-06, "loss": 33.2294, "mean_token_accuracy": 0.5437094829976559, "num_tokens": 5987338.0, "step": 198 }, { "epoch": 0.537474679270763, "grad_norm": 35.91592788696289, "learning_rate": 5.244172587648279e-06, "loss": 36.6458, "mean_token_accuracy": 0.5238574855029583, "num_tokens": 6012530.0, "step": 199 }, { "epoch": 0.5401755570560433, "grad_norm": 30.558795928955078, "learning_rate": 5.204867277922376e-06, "loss": 34.0649, "mean_token_accuracy": 0.5468941740691662, "num_tokens": 6044255.0, "step": 200 }, { "epoch": 0.5401755570560433, "eval_loss": 2.114649772644043, "eval_mean_token_accuracy": 0.5514435756206513, "eval_num_tokens": 6044255.0, "eval_runtime": 36.3613, "eval_samples_per_second": 5.308, "eval_steps_per_second": 0.688, "step": 200 }, { "epoch": 0.5428764348413234, "grad_norm": 33.02035140991211, "learning_rate": 5.1655845690812e-06, "loss": 36.4362, "mean_token_accuracy": 0.5108117498457432, "num_tokens": 6071802.0, "step": 201 }, { "epoch": 0.5455773126266037, "grad_norm": 35.454627990722656, "learning_rate": 5.1263274693441266e-06, "loss": 36.2943, "mean_token_accuracy": 0.5278307497501373, "num_tokens": 6098292.0, "step": 202 }, { "epoch": 0.5482781904118839, "grad_norm": 28.900753021240234, "learning_rate": 5.087098984969426e-06, "loss": 35.7501, "mean_token_accuracy": 0.5413194857537746, "num_tokens": 6134699.0, "step": 203 }, { "epoch": 0.550979068197164, "grad_norm": 38.361167907714844, "learning_rate": 5.0479021200240376e-06, "loss": 33.152, "mean_token_accuracy": 0.5451154075562954, "num_tokens": 6161243.0, "step": 204 }, { "epoch": 0.5536799459824443, "grad_norm": 34.21308517456055, "learning_rate": 5.008739876153524e-06, "loss": 36.3784, "mean_token_accuracy": 0.529074665158987, "num_tokens": 6192802.0, "step": 205 }, { "epoch": 0.5563808237677245, "grad_norm": 33.641265869140625, "learning_rate": 4.9696152523522154e-06, "loss": 37.4078, "mean_token_accuracy": 0.5083318874239922, "num_tokens": 6225383.0, "step": 206 }, { "epoch": 0.5590817015530047, "grad_norm": 32.44288635253906, "learning_rate": 4.930531244733541e-06, "loss": 37.2304, "mean_token_accuracy": 0.5130453407764435, "num_tokens": 6261632.0, "step": 207 }, { "epoch": 0.5617825793382849, "grad_norm": 35.67074203491211, "learning_rate": 4.891490846300595e-06, "loss": 33.3719, "mean_token_accuracy": 0.5488149896264076, "num_tokens": 6284527.0, "step": 208 }, { "epoch": 0.5644834571235652, "grad_norm": 36.547908782958984, "learning_rate": 4.852497046716938e-06, "loss": 32.6247, "mean_token_accuracy": 0.5457115955650806, "num_tokens": 6307067.0, "step": 209 }, { "epoch": 0.5671843349088453, "grad_norm": 33.31892776489258, "learning_rate": 4.81355283207765e-06, "loss": 34.1281, "mean_token_accuracy": 0.5441075004637241, "num_tokens": 6338037.0, "step": 210 }, { "epoch": 0.5671843349088453, "eval_loss": 2.111090898513794, "eval_mean_token_accuracy": 0.5522787737846374, "eval_num_tokens": 6338037.0, "eval_runtime": 36.3826, "eval_samples_per_second": 5.305, "eval_steps_per_second": 0.687, "step": 210 }, { "epoch": 0.5698852126941256, "grad_norm": 34.43659591674805, "learning_rate": 4.774661184680664e-06, "loss": 32.9328, "mean_token_accuracy": 0.5641088709235191, "num_tokens": 6364909.0, "step": 211 }, { "epoch": 0.5725860904794058, "grad_norm": 31.227294921875, "learning_rate": 4.735825082798371e-06, "loss": 33.3527, "mean_token_accuracy": 0.5522276423871517, "num_tokens": 6397160.0, "step": 212 }, { "epoch": 0.5752869682646861, "grad_norm": 37.88279342651367, "learning_rate": 4.6970475004495645e-06, "loss": 35.3766, "mean_token_accuracy": 0.5384668968617916, "num_tokens": 6425247.0, "step": 213 }, { "epoch": 0.5779878460499662, "grad_norm": 37.55131530761719, "learning_rate": 4.658331407171689e-06, "loss": 32.1668, "mean_token_accuracy": 0.5646487064659595, "num_tokens": 6452344.0, "step": 214 }, { "epoch": 0.5806887238352465, "grad_norm": 31.42235565185547, "learning_rate": 4.619679767793431e-06, "loss": 33.5628, "mean_token_accuracy": 0.5600736439228058, "num_tokens": 6481836.0, "step": 215 }, { "epoch": 0.5833896016205267, "grad_norm": 34.976295471191406, "learning_rate": 4.581095542207683e-06, "loss": 32.6679, "mean_token_accuracy": 0.5608968362212181, "num_tokens": 6508560.0, "step": 216 }, { "epoch": 0.5860904794058069, "grad_norm": 36.642757415771484, "learning_rate": 4.542581685144872e-06, "loss": 33.0759, "mean_token_accuracy": 0.5415187776088715, "num_tokens": 6532351.0, "step": 217 }, { "epoch": 0.5887913571910871, "grad_norm": 36.19927978515625, "learning_rate": 4.504141145946698e-06, "loss": 34.6595, "mean_token_accuracy": 0.5488590188324451, "num_tokens": 6561667.0, "step": 218 }, { "epoch": 0.5914922349763673, "grad_norm": 34.810150146484375, "learning_rate": 4.46577686834027e-06, "loss": 32.2549, "mean_token_accuracy": 0.5848142206668854, "num_tokens": 6588504.0, "step": 219 }, { "epoch": 0.5941931127616475, "grad_norm": 36.03407287597656, "learning_rate": 4.427491790212685e-06, "loss": 36.3079, "mean_token_accuracy": 0.5366330035030842, "num_tokens": 6619419.0, "step": 220 }, { "epoch": 0.5941931127616475, "eval_loss": 2.1080169677734375, "eval_mean_token_accuracy": 0.5526835465431214, "eval_num_tokens": 6619419.0, "eval_runtime": 36.1795, "eval_samples_per_second": 5.335, "eval_steps_per_second": 0.691, "step": 220 }, { "epoch": 0.5968939905469277, "grad_norm": 32.30882263183594, "learning_rate": 4.389288843386046e-06, "loss": 36.0002, "mean_token_accuracy": 0.5268178954720497, "num_tokens": 6645308.0, "step": 221 }, { "epoch": 0.599594868332208, "grad_norm": 35.183467864990234, "learning_rate": 4.351170953392941e-06, "loss": 36.7108, "mean_token_accuracy": 0.517642181366682, "num_tokens": 6672733.0, "step": 222 }, { "epoch": 0.6022957461174882, "grad_norm": 30.575868606567383, "learning_rate": 4.31314103925242e-06, "loss": 33.246, "mean_token_accuracy": 0.5619362033903599, "num_tokens": 6706433.0, "step": 223 }, { "epoch": 0.6049966239027684, "grad_norm": 38.29409408569336, "learning_rate": 4.2752020132464575e-06, "loss": 35.1915, "mean_token_accuracy": 0.5246193036437035, "num_tokens": 6729575.0, "step": 224 }, { "epoch": 0.6076975016880486, "grad_norm": 31.32200050354004, "learning_rate": 4.237356780696924e-06, "loss": 35.6308, "mean_token_accuracy": 0.5413262136280537, "num_tokens": 6762622.0, "step": 225 }, { "epoch": 0.6103983794733289, "grad_norm": 36.57932662963867, "learning_rate": 4.199608239743112e-06, "loss": 32.5361, "mean_token_accuracy": 0.5666808746755123, "num_tokens": 6786772.0, "step": 226 }, { "epoch": 0.613099257258609, "grad_norm": 33.47692108154297, "learning_rate": 4.161959281119796e-06, "loss": 36.3793, "mean_token_accuracy": 0.5364098139107227, "num_tokens": 6825965.0, "step": 227 }, { "epoch": 0.6158001350438893, "grad_norm": 33.4936408996582, "learning_rate": 4.1244127879358644e-06, "loss": 35.9367, "mean_token_accuracy": 0.5211806520819664, "num_tokens": 6851261.0, "step": 228 }, { "epoch": 0.6185010128291695, "grad_norm": 35.78506851196289, "learning_rate": 4.086971635453529e-06, "loss": 27.107, "mean_token_accuracy": 0.5930493995547295, "num_tokens": 6877386.0, "step": 229 }, { "epoch": 0.6212018906144497, "grad_norm": 37.56957244873047, "learning_rate": 4.049638690868154e-06, "loss": 34.0885, "mean_token_accuracy": 0.5284733846783638, "num_tokens": 6903734.0, "step": 230 }, { "epoch": 0.6212018906144497, "eval_loss": 2.1059694290161133, "eval_mean_token_accuracy": 0.5528413355350494, "eval_num_tokens": 6903734.0, "eval_runtime": 36.4868, "eval_samples_per_second": 5.29, "eval_steps_per_second": 0.685, "step": 230 }, { "epoch": 0.6239027683997299, "grad_norm": 33.32976150512695, "learning_rate": 4.012416813088673e-06, "loss": 36.4343, "mean_token_accuracy": 0.5360178723931313, "num_tokens": 6937106.0, "step": 231 }, { "epoch": 0.6266036461850101, "grad_norm": 36.342777252197266, "learning_rate": 3.975308852518673e-06, "loss": 35.8903, "mean_token_accuracy": 0.5370488092303276, "num_tokens": 6963834.0, "step": 232 }, { "epoch": 0.6293045239702904, "grad_norm": 30.61956214904785, "learning_rate": 3.938317650838105e-06, "loss": 36.9583, "mean_token_accuracy": 0.5260182470083237, "num_tokens": 6995978.0, "step": 233 }, { "epoch": 0.6320054017555705, "grad_norm": 38.61687469482422, "learning_rate": 3.901446040785671e-06, "loss": 33.7734, "mean_token_accuracy": 0.5394391231238842, "num_tokens": 7020067.0, "step": 234 }, { "epoch": 0.6347062795408508, "grad_norm": 35.18577575683594, "learning_rate": 3.8646968459419065e-06, "loss": 34.8349, "mean_token_accuracy": 0.5318831242620945, "num_tokens": 7047521.0, "step": 235 }, { "epoch": 0.637407157326131, "grad_norm": 35.326602935791016, "learning_rate": 3.828072880512944e-06, "loss": 34.2787, "mean_token_accuracy": 0.5627218596637249, "num_tokens": 7078241.0, "step": 236 }, { "epoch": 0.6401080351114112, "grad_norm": 31.163745880126953, "learning_rate": 3.7915769491150046e-06, "loss": 34.5556, "mean_token_accuracy": 0.5384848527610302, "num_tokens": 7106784.0, "step": 237 }, { "epoch": 0.6428089128966914, "grad_norm": 31.930444717407227, "learning_rate": 3.7552118465596364e-06, "loss": 31.1109, "mean_token_accuracy": 0.5878217443823814, "num_tokens": 7137123.0, "step": 238 }, { "epoch": 0.6455097906819717, "grad_norm": 31.309144973754883, "learning_rate": 3.7189803576396743e-06, "loss": 32.7886, "mean_token_accuracy": 0.5596463643014431, "num_tokens": 7177127.0, "step": 239 }, { "epoch": 0.6482106684672518, "grad_norm": 35.24614334106445, "learning_rate": 3.6828852569159977e-06, "loss": 34.71, "mean_token_accuracy": 0.5278642177581787, "num_tokens": 7204498.0, "step": 240 }, { "epoch": 0.6482106684672518, "eval_loss": 2.1038832664489746, "eval_mean_token_accuracy": 0.5533580040931702, "eval_num_tokens": 7204498.0, "eval_runtime": 36.0922, "eval_samples_per_second": 5.347, "eval_steps_per_second": 0.693, "step": 240 }, { "epoch": 0.6509115462525321, "grad_norm": 32.8538932800293, "learning_rate": 3.6469293085050516e-06, "loss": 35.516, "mean_token_accuracy": 0.5260259509086609, "num_tokens": 7237533.0, "step": 241 }, { "epoch": 0.6536124240378123, "grad_norm": 42.09233474731445, "learning_rate": 3.6111152658671744e-06, "loss": 35.0674, "mean_token_accuracy": 0.5364427417516708, "num_tokens": 7264923.0, "step": 242 }, { "epoch": 0.6563133018230926, "grad_norm": 33.587406158447266, "learning_rate": 3.5754458715957485e-06, "loss": 36.4777, "mean_token_accuracy": 0.524031039327383, "num_tokens": 7294443.0, "step": 243 }, { "epoch": 0.6590141796083727, "grad_norm": 37.58925247192383, "learning_rate": 3.539923857207159e-06, "loss": 35.2353, "mean_token_accuracy": 0.5326221697032452, "num_tokens": 7320501.0, "step": 244 }, { "epoch": 0.6617150573936529, "grad_norm": 33.682735443115234, "learning_rate": 3.50455194293164e-06, "loss": 34.7487, "mean_token_accuracy": 0.540156327188015, "num_tokens": 7348223.0, "step": 245 }, { "epoch": 0.6644159351789332, "grad_norm": 32.76019287109375, "learning_rate": 3.4693328375049472e-06, "loss": 35.7718, "mean_token_accuracy": 0.5343315489590168, "num_tokens": 7377839.0, "step": 246 }, { "epoch": 0.6671168129642133, "grad_norm": 32.861392974853516, "learning_rate": 3.434269237960929e-06, "loss": 30.9004, "mean_token_accuracy": 0.5879909247159958, "num_tokens": 7411436.0, "step": 247 }, { "epoch": 0.6698176907494936, "grad_norm": 40.85888671875, "learning_rate": 3.3993638294249975e-06, "loss": 34.8619, "mean_token_accuracy": 0.5253729037940502, "num_tokens": 7436176.0, "step": 248 }, { "epoch": 0.6725185685347738, "grad_norm": 37.22172927856445, "learning_rate": 3.364619284908497e-06, "loss": 32.7447, "mean_token_accuracy": 0.5523360446095467, "num_tokens": 7469116.0, "step": 249 }, { "epoch": 0.675219446320054, "grad_norm": 33.47834777832031, "learning_rate": 3.330038265104014e-06, "loss": 34.9445, "mean_token_accuracy": 0.5251070633530617, "num_tokens": 7498872.0, "step": 250 }, { "epoch": 0.675219446320054, "eval_loss": 2.1019482612609863, "eval_mean_token_accuracy": 0.5536054599285126, "eval_num_tokens": 7498872.0, "eval_runtime": 36.3436, "eval_samples_per_second": 5.31, "eval_steps_per_second": 0.688, "step": 250 }, { "epoch": 0.6779203241053342, "grad_norm": 33.24917984008789, "learning_rate": 3.2956234181816215e-06, "loss": 33.6479, "mean_token_accuracy": 0.5462458655238152, "num_tokens": 7523235.0, "step": 251 }, { "epoch": 0.6806212018906145, "grad_norm": 27.41337776184082, "learning_rate": 3.261377379586085e-06, "loss": 34.9183, "mean_token_accuracy": 0.5495909862220287, "num_tokens": 7560469.0, "step": 252 }, { "epoch": 0.6833220796758946, "grad_norm": 33.81147384643555, "learning_rate": 3.2273027718350504e-06, "loss": 34.0592, "mean_token_accuracy": 0.5564935505390167, "num_tokens": 7586680.0, "step": 253 }, { "epoch": 0.6860229574611749, "grad_norm": 31.468517303466797, "learning_rate": 3.1934022043182054e-06, "loss": 33.2399, "mean_token_accuracy": 0.5463755503296852, "num_tokens": 7618058.0, "step": 254 }, { "epoch": 0.6887238352464551, "grad_norm": 35.69879150390625, "learning_rate": 3.1596782730974586e-06, "loss": 31.8585, "mean_token_accuracy": 0.5690927989780903, "num_tokens": 7649451.0, "step": 255 }, { "epoch": 0.6914247130317354, "grad_norm": 32.49742889404297, "learning_rate": 3.126133560708147e-06, "loss": 32.857, "mean_token_accuracy": 0.560420136898756, "num_tokens": 7678938.0, "step": 256 }, { "epoch": 0.6941255908170155, "grad_norm": 33.171897888183594, "learning_rate": 3.092770635961246e-06, "loss": 35.3575, "mean_token_accuracy": 0.5336680859327316, "num_tokens": 7709849.0, "step": 257 }, { "epoch": 0.6968264686022958, "grad_norm": 36.92119216918945, "learning_rate": 3.0595920537466767e-06, "loss": 36.0429, "mean_token_accuracy": 0.5323917642235756, "num_tokens": 7733756.0, "step": 258 }, { "epoch": 0.699527346387576, "grad_norm": 32.853572845458984, "learning_rate": 3.0266003548376455e-06, "loss": 29.8182, "mean_token_accuracy": 0.6018063575029373, "num_tokens": 7760789.0, "step": 259 }, { "epoch": 0.7022282241728561, "grad_norm": 31.29108428955078, "learning_rate": 2.9937980656960698e-06, "loss": 35.7206, "mean_token_accuracy": 0.5477294139564037, "num_tokens": 7790314.0, "step": 260 }, { "epoch": 0.7022282241728561, "eval_loss": 2.099925994873047, "eval_mean_token_accuracy": 0.5539823198318481, "eval_num_tokens": 7790314.0, "eval_runtime": 36.2408, "eval_samples_per_second": 5.325, "eval_steps_per_second": 0.69, "step": 260 }, { "epoch": 0.7049291019581364, "grad_norm": 31.41156578063965, "learning_rate": 2.9611876982791166e-06, "loss": 33.21, "mean_token_accuracy": 0.5604838952422142, "num_tokens": 7817429.0, "step": 261 }, { "epoch": 0.7076299797434166, "grad_norm": 32.671878814697266, "learning_rate": 2.9287717498468306e-06, "loss": 33.6561, "mean_token_accuracy": 0.5613893494009972, "num_tokens": 7845505.0, "step": 262 }, { "epoch": 0.7103308575286968, "grad_norm": 31.6097469329834, "learning_rate": 2.8965527027708996e-06, "loss": 33.5515, "mean_token_accuracy": 0.568169392645359, "num_tokens": 7883615.0, "step": 263 }, { "epoch": 0.713031735313977, "grad_norm": 28.50749969482422, "learning_rate": 2.8645330243445592e-06, "loss": 38.7651, "mean_token_accuracy": 0.4980784021317959, "num_tokens": 7923277.0, "step": 264 }, { "epoch": 0.7157326130992573, "grad_norm": 37.514503479003906, "learning_rate": 2.8327151665936513e-06, "loss": 36.2043, "mean_token_accuracy": 0.5105516910552979, "num_tokens": 7949225.0, "step": 265 }, { "epoch": 0.7184334908845375, "grad_norm": 36.01073455810547, "learning_rate": 2.8011015660888475e-06, "loss": 33.5025, "mean_token_accuracy": 0.5363100245594978, "num_tokens": 7979280.0, "step": 266 }, { "epoch": 0.7211343686698177, "grad_norm": 30.460277557373047, "learning_rate": 2.7696946437590644e-06, "loss": 31.9682, "mean_token_accuracy": 0.5711518712341785, "num_tokens": 8011947.0, "step": 267 }, { "epoch": 0.7238352464550979, "grad_norm": 32.15833282470703, "learning_rate": 2.7384968047060667e-06, "loss": 33.9796, "mean_token_accuracy": 0.5551204010844231, "num_tokens": 8041244.0, "step": 268 }, { "epoch": 0.7265361242403782, "grad_norm": 35.379356384277344, "learning_rate": 2.707510438020296e-06, "loss": 33.4958, "mean_token_accuracy": 0.5553654357790947, "num_tokens": 8072916.0, "step": 269 }, { "epoch": 0.7292370020256583, "grad_norm": 33.38249588012695, "learning_rate": 2.676737916597903e-06, "loss": 32.3767, "mean_token_accuracy": 0.5678667202591896, "num_tokens": 8104715.0, "step": 270 }, { "epoch": 0.7292370020256583, "eval_loss": 2.098012924194336, "eval_mean_token_accuracy": 0.554110803604126, "eval_num_tokens": 8104715.0, "eval_runtime": 36.209, "eval_samples_per_second": 5.33, "eval_steps_per_second": 0.69, "step": 270 }, { "epoch": 0.7319378798109386, "grad_norm": 31.15945053100586, "learning_rate": 2.6461815969590466e-06, "loss": 30.8953, "mean_token_accuracy": 0.5976364612579346, "num_tokens": 8141758.0, "step": 271 }, { "epoch": 0.7346387575962188, "grad_norm": 30.994585037231445, "learning_rate": 2.6158438190674355e-06, "loss": 37.4373, "mean_token_accuracy": 0.5256949178874493, "num_tokens": 8175477.0, "step": 272 }, { "epoch": 0.7373396353814989, "grad_norm": 29.04298973083496, "learning_rate": 2.5857269061511264e-06, "loss": 31.8416, "mean_token_accuracy": 0.5794625282287598, "num_tokens": 8215754.0, "step": 273 }, { "epoch": 0.7400405131667792, "grad_norm": 34.691158294677734, "learning_rate": 2.555833164524621e-06, "loss": 34.6707, "mean_token_accuracy": 0.5313400216400623, "num_tokens": 8247816.0, "step": 274 }, { "epoch": 0.7427413909520594, "grad_norm": 36.317115783691406, "learning_rate": 2.5261648834122525e-06, "loss": 37.1844, "mean_token_accuracy": 0.5230943337082863, "num_tokens": 8276832.0, "step": 275 }, { "epoch": 0.7454422687373397, "grad_norm": 34.50430679321289, "learning_rate": 2.496724334772876e-06, "loss": 33.1776, "mean_token_accuracy": 0.5589844100177288, "num_tokens": 8309113.0, "step": 276 }, { "epoch": 0.7481431465226198, "grad_norm": 34.186431884765625, "learning_rate": 2.467513773125886e-06, "loss": 35.7932, "mean_token_accuracy": 0.5354582667350769, "num_tokens": 8343267.0, "step": 277 }, { "epoch": 0.7508440243079001, "grad_norm": 33.04575729370117, "learning_rate": 2.4385354353785686e-06, "loss": 32.9948, "mean_token_accuracy": 0.5602673552930355, "num_tokens": 8380273.0, "step": 278 }, { "epoch": 0.7535449020931803, "grad_norm": 30.902019500732422, "learning_rate": 2.4097915406548022e-06, "loss": 35.586, "mean_token_accuracy": 0.5362657532095909, "num_tokens": 8415808.0, "step": 279 }, { "epoch": 0.7562457798784605, "grad_norm": 29.209392547607422, "learning_rate": 2.381284290125122e-06, "loss": 36.0241, "mean_token_accuracy": 0.5266930237412453, "num_tokens": 8453175.0, "step": 280 }, { "epoch": 0.7562457798784605, "eval_loss": 2.0966179370880127, "eval_mean_token_accuracy": 0.5541414618492126, "eval_num_tokens": 8453175.0, "eval_runtime": 36.6601, "eval_samples_per_second": 5.265, "eval_steps_per_second": 0.682, "step": 280 }, { "epoch": 0.7589466576637407, "grad_norm": 32.1947135925293, "learning_rate": 2.3530158668381488e-06, "loss": 33.9757, "mean_token_accuracy": 0.5407238230109215, "num_tokens": 8481129.0, "step": 281 }, { "epoch": 0.761647535449021, "grad_norm": 36.1117057800293, "learning_rate": 2.3249884355534287e-06, "loss": 31.8849, "mean_token_accuracy": 0.560691338032484, "num_tokens": 8505163.0, "step": 282 }, { "epoch": 0.7643484132343011, "grad_norm": 33.53237533569336, "learning_rate": 2.297204142575644e-06, "loss": 31.5556, "mean_token_accuracy": 0.5559076778590679, "num_tokens": 8534560.0, "step": 283 }, { "epoch": 0.7670492910195814, "grad_norm": 30.95838165283203, "learning_rate": 2.2696651155902543e-06, "loss": 34.8573, "mean_token_accuracy": 0.5212380439043045, "num_tokens": 8567816.0, "step": 284 }, { "epoch": 0.7697501688048616, "grad_norm": 32.302490234375, "learning_rate": 2.2423734635005735e-06, "loss": 32.5954, "mean_token_accuracy": 0.5536459907889366, "num_tokens": 8595346.0, "step": 285 }, { "epoch": 0.7724510465901417, "grad_norm": 30.771692276000977, "learning_rate": 2.215331276266258e-06, "loss": 32.1524, "mean_token_accuracy": 0.5622397661209106, "num_tokens": 8628639.0, "step": 286 }, { "epoch": 0.775151924375422, "grad_norm": 33.84574890136719, "learning_rate": 2.1885406247432666e-06, "loss": 37.9864, "mean_token_accuracy": 0.5149603337049484, "num_tokens": 8659547.0, "step": 287 }, { "epoch": 0.7778528021607022, "grad_norm": 34.21319580078125, "learning_rate": 2.162003560525279e-06, "loss": 33.3636, "mean_token_accuracy": 0.5604279115796089, "num_tokens": 8685980.0, "step": 288 }, { "epoch": 0.7805536799459825, "grad_norm": 30.261869430541992, "learning_rate": 2.135722115786582e-06, "loss": 30.5025, "mean_token_accuracy": 0.5800755359232426, "num_tokens": 8718249.0, "step": 289 }, { "epoch": 0.7832545577312626, "grad_norm": 29.031036376953125, "learning_rate": 2.1096983031264525e-06, "loss": 33.0431, "mean_token_accuracy": 0.5630405880510807, "num_tokens": 8754982.0, "step": 290 }, { "epoch": 0.7832545577312626, "eval_loss": 2.09511661529541, "eval_mean_token_accuracy": 0.5544790041446686, "eval_num_tokens": 8754982.0, "eval_runtime": 36.1303, "eval_samples_per_second": 5.342, "eval_steps_per_second": 0.692, "step": 290 }, { "epoch": 0.7859554355165429, "grad_norm": 31.977375030517578, "learning_rate": 2.083934115415032e-06, "loss": 30.9131, "mean_token_accuracy": 0.5798417702317238, "num_tokens": 8780201.0, "step": 291 }, { "epoch": 0.7886563133018231, "grad_norm": 30.645751953125, "learning_rate": 2.058431525640719e-06, "loss": 36.2473, "mean_token_accuracy": 0.5279138945043087, "num_tokens": 8809300.0, "step": 292 }, { "epoch": 0.7913571910871033, "grad_norm": 34.21016311645508, "learning_rate": 2.0331924867590795e-06, "loss": 33.0684, "mean_token_accuracy": 0.5504949688911438, "num_tokens": 8838501.0, "step": 293 }, { "epoch": 0.7940580688723835, "grad_norm": 32.018638610839844, "learning_rate": 2.008218931543286e-06, "loss": 33.325, "mean_token_accuracy": 0.5460036881268024, "num_tokens": 8868056.0, "step": 294 }, { "epoch": 0.7967589466576638, "grad_norm": 37.11056900024414, "learning_rate": 1.983512772436116e-06, "loss": 33.2696, "mean_token_accuracy": 0.5541720166802406, "num_tokens": 8896219.0, "step": 295 }, { "epoch": 0.799459824442944, "grad_norm": 33.04764175415039, "learning_rate": 1.9590759014034983e-06, "loss": 36.4369, "mean_token_accuracy": 0.5230924636125565, "num_tokens": 8925580.0, "step": 296 }, { "epoch": 0.8021607022282242, "grad_norm": 31.53118133544922, "learning_rate": 1.93491018978962e-06, "loss": 35.5526, "mean_token_accuracy": 0.5302544310688972, "num_tokens": 8954679.0, "step": 297 }, { "epoch": 0.8048615800135044, "grad_norm": 33.37092971801758, "learning_rate": 1.911017488173637e-06, "loss": 33.8277, "mean_token_accuracy": 0.5305632688105106, "num_tokens": 8982582.0, "step": 298 }, { "epoch": 0.8075624577987845, "grad_norm": 29.140186309814453, "learning_rate": 1.8873996262279449e-06, "loss": 29.7419, "mean_token_accuracy": 0.5824360325932503, "num_tokens": 9022151.0, "step": 299 }, { "epoch": 0.8102633355840648, "grad_norm": 32.51633834838867, "learning_rate": 1.8640584125780695e-06, "loss": 35.8525, "mean_token_accuracy": 0.5333940424025059, "num_tokens": 9049816.0, "step": 300 }, { "epoch": 0.8102633355840648, "eval_loss": 2.0937230587005615, "eval_mean_token_accuracy": 0.5548382639884949, "eval_num_tokens": 9049816.0, "eval_runtime": 35.8048, "eval_samples_per_second": 5.39, "eval_steps_per_second": 0.698, "step": 300 }, { "epoch": 0.812964213369345, "grad_norm": 33.11845779418945, "learning_rate": 1.8409956346641711e-06, "loss": 37.6469, "mean_token_accuracy": 0.5174382068216801, "num_tokens": 9078392.0, "step": 301 }, { "epoch": 0.8156650911546253, "grad_norm": 37.55961990356445, "learning_rate": 1.8182130586041532e-06, "loss": 31.841, "mean_token_accuracy": 0.559474378824234, "num_tokens": 9106295.0, "step": 302 }, { "epoch": 0.8183659689399054, "grad_norm": 41.26569747924805, "learning_rate": 1.7957124290584246e-06, "loss": 34.8936, "mean_token_accuracy": 0.5113974660634995, "num_tokens": 9133426.0, "step": 303 }, { "epoch": 0.8210668467251857, "grad_norm": 36.07694625854492, "learning_rate": 1.7734954690962907e-06, "loss": 38.0633, "mean_token_accuracy": 0.5195271596312523, "num_tokens": 9161575.0, "step": 304 }, { "epoch": 0.8237677245104659, "grad_norm": 36.4818000793457, "learning_rate": 1.751563880064005e-06, "loss": 35.4923, "mean_token_accuracy": 0.5254251956939697, "num_tokens": 9188560.0, "step": 305 }, { "epoch": 0.8264686022957461, "grad_norm": 32.5327262878418, "learning_rate": 1.7299193414544824e-06, "loss": 28.6696, "mean_token_accuracy": 0.593474805355072, "num_tokens": 9220974.0, "step": 306 }, { "epoch": 0.8291694800810263, "grad_norm": 29.985431671142578, "learning_rate": 1.7085635107786852e-06, "loss": 35.074, "mean_token_accuracy": 0.5393181852996349, "num_tokens": 9256443.0, "step": 307 }, { "epoch": 0.8318703578663066, "grad_norm": 32.53824234008789, "learning_rate": 1.68749802343869e-06, "loss": 34.66, "mean_token_accuracy": 0.5329767391085625, "num_tokens": 9288774.0, "step": 308 }, { "epoch": 0.8345712356515868, "grad_norm": 30.13478660583496, "learning_rate": 1.6667244926024596e-06, "loss": 34.5152, "mean_token_accuracy": 0.5427077859640121, "num_tokens": 9321438.0, "step": 309 }, { "epoch": 0.837272113436867, "grad_norm": 29.59137725830078, "learning_rate": 1.646244509080298e-06, "loss": 34.3903, "mean_token_accuracy": 0.5382002554833889, "num_tokens": 9354192.0, "step": 310 }, { "epoch": 0.837272113436867, "eval_loss": 2.092498540878296, "eval_mean_token_accuracy": 0.5547536420822143, "eval_num_tokens": 9354192.0, "eval_runtime": 36.6136, "eval_samples_per_second": 5.271, "eval_steps_per_second": 0.683, "step": 310 }, { "epoch": 0.8399729912221472, "grad_norm": 30.583677291870117, "learning_rate": 1.6260596412030338e-06, "loss": 36.9061, "mean_token_accuracy": 0.5144309028983116, "num_tokens": 9386006.0, "step": 311 }, { "epoch": 0.8426738690074275, "grad_norm": 41.30168533325195, "learning_rate": 1.6061714347019202e-06, "loss": 33.9611, "mean_token_accuracy": 0.5292498059570789, "num_tokens": 9415343.0, "step": 312 }, { "epoch": 0.8453747467927076, "grad_norm": 30.707265853881836, "learning_rate": 1.586581412590261e-06, "loss": 36.798, "mean_token_accuracy": 0.5260944701731205, "num_tokens": 9452683.0, "step": 313 }, { "epoch": 0.8480756245779878, "grad_norm": 37.20066452026367, "learning_rate": 1.5672910750467852e-06, "loss": 32.8137, "mean_token_accuracy": 0.5507222265005112, "num_tokens": 9482484.0, "step": 314 }, { "epoch": 0.8507765023632681, "grad_norm": 29.6890869140625, "learning_rate": 1.548301899300761e-06, "loss": 38.557, "mean_token_accuracy": 0.5040196180343628, "num_tokens": 9517582.0, "step": 315 }, { "epoch": 0.8534773801485482, "grad_norm": 31.31963348388672, "learning_rate": 1.5296153395188727e-06, "loss": 38.265, "mean_token_accuracy": 0.5037461705505848, "num_tokens": 9548412.0, "step": 316 }, { "epoch": 0.8561782579338285, "grad_norm": 31.79639434814453, "learning_rate": 1.5112328266938637e-06, "loss": 34.4414, "mean_token_accuracy": 0.5635125301778316, "num_tokens": 9581771.0, "step": 317 }, { "epoch": 0.8588791357191087, "grad_norm": 29.19704246520996, "learning_rate": 1.4931557685349507e-06, "loss": 35.8184, "mean_token_accuracy": 0.5325366817414761, "num_tokens": 9621108.0, "step": 318 }, { "epoch": 0.861580013504389, "grad_norm": 33.64675521850586, "learning_rate": 1.475385549360027e-06, "loss": 32.1758, "mean_token_accuracy": 0.5607290975749493, "num_tokens": 9650068.0, "step": 319 }, { "epoch": 0.8642808912896691, "grad_norm": 34.78401184082031, "learning_rate": 1.4579235299896485e-06, "loss": 31.7521, "mean_token_accuracy": 0.5685227960348129, "num_tokens": 9681623.0, "step": 320 }, { "epoch": 0.8642808912896691, "eval_loss": 2.091031551361084, "eval_mean_token_accuracy": 0.5548200500011444, "eval_num_tokens": 9681623.0, "eval_runtime": 36.1678, "eval_samples_per_second": 5.336, "eval_steps_per_second": 0.691, "step": 320 }, { "epoch": 0.8669817690749494, "grad_norm": 33.14302444458008, "learning_rate": 1.4407710476428255e-06, "loss": 34.4481, "mean_token_accuracy": 0.5331101752817631, "num_tokens": 9711841.0, "step": 321 }, { "epoch": 0.8696826468602296, "grad_norm": 34.452632904052734, "learning_rate": 1.4239294158346236e-06, "loss": 32.6911, "mean_token_accuracy": 0.567806776612997, "num_tokens": 9745335.0, "step": 322 }, { "epoch": 0.8723835246455098, "grad_norm": 35.28751754760742, "learning_rate": 1.407399924275572e-06, "loss": 35.6433, "mean_token_accuracy": 0.5379412919282913, "num_tokens": 9777444.0, "step": 323 }, { "epoch": 0.87508440243079, "grad_norm": 33.17687225341797, "learning_rate": 1.3911838387728972e-06, "loss": 37.061, "mean_token_accuracy": 0.5195654146373272, "num_tokens": 9813160.0, "step": 324 }, { "epoch": 0.8777852802160703, "grad_norm": 33.07435989379883, "learning_rate": 1.3752824011335976e-06, "loss": 36.5411, "mean_token_accuracy": 0.5400748178362846, "num_tokens": 9839662.0, "step": 325 }, { "epoch": 0.8804861580013504, "grad_norm": 42.2828483581543, "learning_rate": 1.35969682906934e-06, "loss": 32.8082, "mean_token_accuracy": 0.5310876406729221, "num_tokens": 9869375.0, "step": 326 }, { "epoch": 0.8831870357866306, "grad_norm": 31.91793441772461, "learning_rate": 1.3444283161032122e-06, "loss": 26.1468, "mean_token_accuracy": 0.6453410163521767, "num_tokens": 9907212.0, "step": 327 }, { "epoch": 0.8858879135719109, "grad_norm": 32.09194564819336, "learning_rate": 1.3294780314783229e-06, "loss": 37.8104, "mean_token_accuracy": 0.4997388496994972, "num_tokens": 9937601.0, "step": 328 }, { "epoch": 0.888588791357191, "grad_norm": 29.958486557006836, "learning_rate": 1.3148471200682625e-06, "loss": 33.4813, "mean_token_accuracy": 0.5626767501235008, "num_tokens": 9974730.0, "step": 329 }, { "epoch": 0.8912896691424713, "grad_norm": 33.71958923339844, "learning_rate": 1.3005367022894318e-06, "loss": 29.9562, "mean_token_accuracy": 0.6012796461582184, "num_tokens": 9999327.0, "step": 330 }, { "epoch": 0.8912896691424713, "eval_loss": 2.090074062347412, "eval_mean_token_accuracy": 0.5552214550971984, "eval_num_tokens": 9999327.0, "eval_runtime": 36.1097, "eval_samples_per_second": 5.345, "eval_steps_per_second": 0.692, "step": 330 }, { "epoch": 0.8939905469277515, "grad_norm": 41.03810119628906, "learning_rate": 1.2865478740152418e-06, "loss": 30.057, "mean_token_accuracy": 0.5762356519699097, "num_tokens": 10031842.0, "step": 331 }, { "epoch": 0.8966914247130318, "grad_norm": 32.73988342285156, "learning_rate": 1.2728817064921897e-06, "loss": 34.1446, "mean_token_accuracy": 0.5440133661031723, "num_tokens": 10062514.0, "step": 332 }, { "epoch": 0.8993923024983119, "grad_norm": 27.51495933532715, "learning_rate": 1.2595392462578314e-06, "loss": 29.0438, "mean_token_accuracy": 0.6018290631473064, "num_tokens": 10108245.0, "step": 333 }, { "epoch": 0.9020931802835922, "grad_norm": 35.75312805175781, "learning_rate": 1.2465215150606284e-06, "loss": 33.8071, "mean_token_accuracy": 0.5359045565128326, "num_tokens": 10134674.0, "step": 334 }, { "epoch": 0.9047940580688724, "grad_norm": 28.75173568725586, "learning_rate": 1.2338295097817152e-06, "loss": 30.145, "mean_token_accuracy": 0.586941484361887, "num_tokens": 10172393.0, "step": 335 }, { "epoch": 0.9074949358541526, "grad_norm": 38.10531997680664, "learning_rate": 1.2214642023585529e-06, "loss": 36.2288, "mean_token_accuracy": 0.5035588406026363, "num_tokens": 10203088.0, "step": 336 }, { "epoch": 0.9101958136394328, "grad_norm": 30.347970962524414, "learning_rate": 1.2094265397104953e-06, "loss": 33.9351, "mean_token_accuracy": 0.5529263578355312, "num_tokens": 10234293.0, "step": 337 }, { "epoch": 0.9128966914247131, "grad_norm": 29.981414794921875, "learning_rate": 1.1977174436662878e-06, "loss": 36.8807, "mean_token_accuracy": 0.5269143581390381, "num_tokens": 10266920.0, "step": 338 }, { "epoch": 0.9155975692099932, "grad_norm": 32.27985763549805, "learning_rate": 1.186337810893462e-06, "loss": 35.1172, "mean_token_accuracy": 0.5343122817575932, "num_tokens": 10293519.0, "step": 339 }, { "epoch": 0.9182984469952734, "grad_norm": 37.25572967529297, "learning_rate": 1.175288512829677e-06, "loss": 33.2276, "mean_token_accuracy": 0.5559127852320671, "num_tokens": 10322074.0, "step": 340 }, { "epoch": 0.9182984469952734, "eval_loss": 2.0892035961151123, "eval_mean_token_accuracy": 0.555384691953659, "eval_num_tokens": 10322074.0, "eval_runtime": 36.5828, "eval_samples_per_second": 5.276, "eval_steps_per_second": 0.683, "step": 340 }, { "epoch": 0.9209993247805537, "grad_norm": 32.46873474121094, "learning_rate": 1.1645703956159862e-06, "loss": 37.1182, "mean_token_accuracy": 0.5203519575297832, "num_tokens": 10353554.0, "step": 341 }, { "epoch": 0.9237002025658338, "grad_norm": 36.3703727722168, "learning_rate": 1.1541842800320353e-06, "loss": 32.6077, "mean_token_accuracy": 0.545674841850996, "num_tokens": 10377040.0, "step": 342 }, { "epoch": 0.9264010803511141, "grad_norm": 31.539222717285156, "learning_rate": 1.1441309614332147e-06, "loss": 31.5152, "mean_token_accuracy": 0.5626633390784264, "num_tokens": 10407844.0, "step": 343 }, { "epoch": 0.9291019581363943, "grad_norm": 34.05552673339844, "learning_rate": 1.1344112096897478e-06, "loss": 34.321, "mean_token_accuracy": 0.5334953516721725, "num_tokens": 10438078.0, "step": 344 }, { "epoch": 0.9318028359216746, "grad_norm": 35.00425720214844, "learning_rate": 1.1250257691277358e-06, "loss": 34.4051, "mean_token_accuracy": 0.5464885123074055, "num_tokens": 10469694.0, "step": 345 }, { "epoch": 0.9345037137069547, "grad_norm": 34.7364616394043, "learning_rate": 1.115975358472162e-06, "loss": 30.7082, "mean_token_accuracy": 0.5882319211959839, "num_tokens": 10495976.0, "step": 346 }, { "epoch": 0.937204591492235, "grad_norm": 36.294376373291016, "learning_rate": 1.1072606707918464e-06, "loss": 35.1527, "mean_token_accuracy": 0.5179415792226791, "num_tokens": 10519518.0, "step": 347 }, { "epoch": 0.9399054692775152, "grad_norm": 31.10394859313965, "learning_rate": 1.0988823734463784e-06, "loss": 34.5518, "mean_token_accuracy": 0.5454081632196903, "num_tokens": 10549556.0, "step": 348 }, { "epoch": 0.9426063470627954, "grad_norm": 33.66935729980469, "learning_rate": 1.090841108035005e-06, "loss": 33.5837, "mean_token_accuracy": 0.5329492837190628, "num_tokens": 10575704.0, "step": 349 }, { "epoch": 0.9453072248480756, "grad_norm": 28.914289474487305, "learning_rate": 1.0831374903475009e-06, "loss": 34.3631, "mean_token_accuracy": 0.5375620424747467, "num_tokens": 10607537.0, "step": 350 }, { "epoch": 0.9453072248480756, "eval_loss": 2.088322639465332, "eval_mean_token_accuracy": 0.5555643165111541, "eval_num_tokens": 10607537.0, "eval_runtime": 36.7646, "eval_samples_per_second": 5.25, "eval_steps_per_second": 0.68, "step": 350 }, { "epoch": 0.9480081026333559, "grad_norm": 32.65071487426758, "learning_rate": 1.075772110317013e-06, "loss": 31.9288, "mean_token_accuracy": 0.5655727498233318, "num_tokens": 10634772.0, "step": 351 }, { "epoch": 0.950708980418636, "grad_norm": 36.29484558105469, "learning_rate": 1.068745531974882e-06, "loss": 33.6489, "mean_token_accuracy": 0.5422691218554974, "num_tokens": 10660174.0, "step": 352 }, { "epoch": 0.9534098582039163, "grad_norm": 33.30977249145508, "learning_rate": 1.0620582934074512e-06, "loss": 31.6845, "mean_token_accuracy": 0.5767942070960999, "num_tokens": 10698717.0, "step": 353 }, { "epoch": 0.9561107359891965, "grad_norm": 35.180477142333984, "learning_rate": 1.0557109067148597e-06, "loss": 33.2479, "mean_token_accuracy": 0.5502138063311577, "num_tokens": 10723116.0, "step": 354 }, { "epoch": 0.9588116137744767, "grad_norm": 27.213275909423828, "learning_rate": 1.0497038579718255e-06, "loss": 32.5942, "mean_token_accuracy": 0.5732335671782494, "num_tokens": 10773330.0, "step": 355 }, { "epoch": 0.9615124915597569, "grad_norm": 32.196250915527344, "learning_rate": 1.044037607190426e-06, "loss": 35.136, "mean_token_accuracy": 0.5395778194069862, "num_tokens": 10803748.0, "step": 356 }, { "epoch": 0.9642133693450371, "grad_norm": 33.37202072143555, "learning_rate": 1.0387125882848665e-06, "loss": 34.3062, "mean_token_accuracy": 0.5422671809792519, "num_tokens": 10831049.0, "step": 357 }, { "epoch": 0.9669142471303174, "grad_norm": 27.47268295288086, "learning_rate": 1.0337292090382532e-06, "loss": 32.9391, "mean_token_accuracy": 0.5572673715651035, "num_tokens": 10871029.0, "step": 358 }, { "epoch": 0.9696151249155975, "grad_norm": 34.746891021728516, "learning_rate": 1.0290878510713683e-06, "loss": 34.3936, "mean_token_accuracy": 0.5453428328037262, "num_tokens": 10902364.0, "step": 359 }, { "epoch": 0.9723160027008778, "grad_norm": 33.34050369262695, "learning_rate": 1.0247888698134422e-06, "loss": 33.8804, "mean_token_accuracy": 0.5376629531383514, "num_tokens": 10929833.0, "step": 360 }, { "epoch": 0.9723160027008778, "eval_loss": 2.0879721641540527, "eval_mean_token_accuracy": 0.5556378149986267, "eval_num_tokens": 10929833.0, "eval_runtime": 36.5089, "eval_samples_per_second": 5.286, "eval_steps_per_second": 0.685, "step": 360 }, { "epoch": 0.975016880486158, "grad_norm": 33.2555046081543, "learning_rate": 1.0208325944749372e-06, "loss": 31.6532, "mean_token_accuracy": 0.5595313608646393, "num_tokens": 10961716.0, "step": 361 }, { "epoch": 0.9777177582714383, "grad_norm": 34.11859893798828, "learning_rate": 1.017219328022336e-06, "loss": 34.9486, "mean_token_accuracy": 0.545049749314785, "num_tokens": 10988681.0, "step": 362 }, { "epoch": 0.9804186360567184, "grad_norm": 30.698774337768555, "learning_rate": 1.0139493471549426e-06, "loss": 32.8165, "mean_token_accuracy": 0.546560000628233, "num_tokens": 11016207.0, "step": 363 }, { "epoch": 0.9831195138419987, "grad_norm": 32.19876480102539, "learning_rate": 1.0110229022836908e-06, "loss": 33.841, "mean_token_accuracy": 0.5365865342319012, "num_tokens": 11045469.0, "step": 364 }, { "epoch": 0.9858203916272789, "grad_norm": 32.29637145996094, "learning_rate": 1.008440217511969e-06, "loss": 32.5542, "mean_token_accuracy": 0.554278589785099, "num_tokens": 11077332.0, "step": 365 }, { "epoch": 0.9885212694125591, "grad_norm": 35.50790023803711, "learning_rate": 1.0062014906184595e-06, "loss": 29.0268, "mean_token_accuracy": 0.6077466309070587, "num_tokens": 11119106.0, "step": 366 }, { "epoch": 0.9912221471978393, "grad_norm": 38.41935348510742, "learning_rate": 1.0043068930419925e-06, "loss": 33.5489, "mean_token_accuracy": 0.5493052676320076, "num_tokens": 11138449.0, "step": 367 }, { "epoch": 0.9939230249831195, "grad_norm": 32.61334228515625, "learning_rate": 1.002756569868416e-06, "loss": 36.0695, "mean_token_accuracy": 0.5341297462582588, "num_tokens": 11166476.0, "step": 368 }, { "epoch": 0.9966239027683997, "grad_norm": 30.612293243408203, "learning_rate": 1.001550639819487e-06, "loss": 30.2004, "mean_token_accuracy": 0.5847633183002472, "num_tokens": 11196265.0, "step": 369 }, { "epoch": 0.9993247805536799, "grad_norm": 31.51392364501953, "learning_rate": 1.00068919524378e-06, "loss": 36.1897, "mean_token_accuracy": 0.5242423750460148, "num_tokens": 11225018.0, "step": 370 }, { "epoch": 0.9993247805536799, "eval_loss": 2.08726167678833, "eval_mean_token_accuracy": 0.5559327018260956, "eval_num_tokens": 11225018.0, "eval_runtime": 36.2368, "eval_samples_per_second": 5.326, "eval_steps_per_second": 0.69, "step": 370 }, { "epoch": 0.9993247805536799, "eval_loss": 2.08726167678833, "eval_mean_token_accuracy": 0.5559327018260956, "eval_num_tokens": 0.0, "eval_runtime": 190.1539, "eval_samples_per_second": 1.015, "eval_steps_per_second": 0.131, "step": 370 }, { "epoch": 0.9993247805536799, "mean_token_accuracy": 0.503634586930275, "num_tokens": 5485.0, "step": 370, "total_flos": 5.200773558933914e+16, "train_loss": 0.02867867237812764, "train_runtime": 251.2644, "train_samples_per_second": 23.573, "train_steps_per_second": 1.477 } ], "logging_steps": 1, "max_steps": 371, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.200773558933914e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }