{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21, "eval_steps": 500, "global_step": 455000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002, "grad_norm": 1.603668212890625, "learning_rate": 0.0001999604, "loss": 2.0926, "step": 100 }, { "epoch": 0.0004, "grad_norm": 1.1335899829864502, "learning_rate": 0.0001999204, "loss": 1.5915, "step": 200 }, { "epoch": 0.0006, "grad_norm": 0.9146980047225952, "learning_rate": 0.00019988040000000002, "loss": 1.413, "step": 300 }, { "epoch": 0.0008, "grad_norm": 1.1228628158569336, "learning_rate": 0.0001998404, "loss": 1.3208, "step": 400 }, { "epoch": 0.001, "grad_norm": 0.623985230922699, "learning_rate": 0.0001998004, "loss": 1.2557, "step": 500 }, { "epoch": 0.0012, "grad_norm": 0.7038842439651489, "learning_rate": 0.0001997604, "loss": 1.1387, "step": 600 }, { "epoch": 0.0014, "grad_norm": 0.820369303226471, "learning_rate": 0.00019972040000000002, "loss": 1.1653, "step": 700 }, { "epoch": 0.0016, "grad_norm": 0.6667912006378174, "learning_rate": 0.0001996804, "loss": 1.1342, "step": 800 }, { "epoch": 0.0018, "grad_norm": 0.616651713848114, "learning_rate": 0.0001996404, "loss": 1.1548, "step": 900 }, { "epoch": 0.002, "grad_norm": 0.5994811058044434, "learning_rate": 0.0001996004, "loss": 1.0464, "step": 1000 }, { "epoch": 0.0022, "grad_norm": 0.5694602131843567, "learning_rate": 0.0001995604, "loss": 1.1394, "step": 1100 }, { "epoch": 0.0024, "grad_norm": 0.5835832357406616, "learning_rate": 0.00019952040000000002, "loss": 1.0209, "step": 1200 }, { "epoch": 0.0026, "grad_norm": 1.4923994541168213, "learning_rate": 0.0001994804, "loss": 1.0708, "step": 1300 }, { "epoch": 0.0028, "grad_norm": 0.669029712677002, "learning_rate": 0.00019944040000000003, "loss": 0.9359, "step": 1400 }, { "epoch": 0.003, "grad_norm": 1.1236016750335693, "learning_rate": 0.00019940040000000002, "loss": 0.9963, "step": 
1500 }, { "epoch": 0.0032, "grad_norm": 0.7666842341423035, "learning_rate": 0.0001993604, "loss": 1.0306, "step": 1600 }, { "epoch": 0.0034, "grad_norm": 1.5545086860656738, "learning_rate": 0.0001993204, "loss": 0.9544, "step": 1700 }, { "epoch": 0.0036, "grad_norm": 0.7571123242378235, "learning_rate": 0.0001992804, "loss": 0.9473, "step": 1800 }, { "epoch": 0.0038, "grad_norm": 0.5189706683158875, "learning_rate": 0.00019924040000000002, "loss": 0.9743, "step": 1900 }, { "epoch": 0.004, "grad_norm": 0.9225242137908936, "learning_rate": 0.00019920040000000002, "loss": 0.9029, "step": 2000 }, { "epoch": 0.0042, "grad_norm": 0.6427626609802246, "learning_rate": 0.0001991604, "loss": 0.9111, "step": 2100 }, { "epoch": 0.0044, "grad_norm": 0.5414828658103943, "learning_rate": 0.0001991204, "loss": 0.9591, "step": 2200 }, { "epoch": 0.0046, "grad_norm": 0.5731277465820312, "learning_rate": 0.0001990804, "loss": 0.9327, "step": 2300 }, { "epoch": 0.0048, "grad_norm": 0.6296040415763855, "learning_rate": 0.00019904040000000002, "loss": 0.8148, "step": 2400 }, { "epoch": 0.005, "grad_norm": 0.5396831631660461, "learning_rate": 0.0001990004, "loss": 0.8056, "step": 2500 }, { "epoch": 0.0052, "grad_norm": 0.5632336735725403, "learning_rate": 0.0001989604, "loss": 0.8224, "step": 2600 }, { "epoch": 0.0054, "grad_norm": 0.7668570280075073, "learning_rate": 0.0001989204, "loss": 0.8137, "step": 2700 }, { "epoch": 0.0056, "grad_norm": 0.5585981607437134, "learning_rate": 0.00019888040000000002, "loss": 0.8658, "step": 2800 }, { "epoch": 0.0058, "grad_norm": 0.5207762122154236, "learning_rate": 0.00019884040000000001, "loss": 0.796, "step": 2900 }, { "epoch": 0.006, "grad_norm": 0.6384051442146301, "learning_rate": 0.0001988004, "loss": 0.979, "step": 3000 }, { "epoch": 0.0062, "grad_norm": 0.4824763834476471, "learning_rate": 0.0001987604, "loss": 0.7749, "step": 3100 }, { "epoch": 0.0064, "grad_norm": 0.5262443423271179, "learning_rate": 0.0001987204, "loss": 0.7486, "step": 
3200 }, { "epoch": 0.0066, "grad_norm": 3.371579170227051, "learning_rate": 0.00019868040000000002, "loss": 0.8155, "step": 3300 }, { "epoch": 0.0068, "grad_norm": 0.46106863021850586, "learning_rate": 0.0001986404, "loss": 0.7683, "step": 3400 }, { "epoch": 0.007, "grad_norm": 0.5194998979568481, "learning_rate": 0.00019860040000000003, "loss": 0.7919, "step": 3500 }, { "epoch": 0.0072, "grad_norm": 0.5358483195304871, "learning_rate": 0.0001985604, "loss": 0.7655, "step": 3600 }, { "epoch": 0.0074, "grad_norm": 0.4279918372631073, "learning_rate": 0.00019852040000000002, "loss": 0.7611, "step": 3700 }, { "epoch": 0.0076, "grad_norm": 0.3322402834892273, "learning_rate": 0.0001984804, "loss": 0.8131, "step": 3800 }, { "epoch": 0.0078, "grad_norm": 0.5196714401245117, "learning_rate": 0.0001984404, "loss": 0.7348, "step": 3900 }, { "epoch": 0.008, "grad_norm": 0.6269809603691101, "learning_rate": 0.00019840040000000003, "loss": 0.7274, "step": 4000 }, { "epoch": 0.0082, "grad_norm": 0.7291073203086853, "learning_rate": 0.0001983604, "loss": 0.8001, "step": 4100 }, { "epoch": 0.0084, "grad_norm": 1.0613459348678589, "learning_rate": 0.0001983204, "loss": 0.7169, "step": 4200 }, { "epoch": 0.0086, "grad_norm": 0.46023017168045044, "learning_rate": 0.0001982804, "loss": 0.7456, "step": 4300 }, { "epoch": 0.0088, "grad_norm": 0.3527376651763916, "learning_rate": 0.00019824040000000003, "loss": 0.699, "step": 4400 }, { "epoch": 0.009, "grad_norm": 0.5758853554725647, "learning_rate": 0.00019820040000000002, "loss": 0.7226, "step": 4500 }, { "epoch": 0.0092, "grad_norm": 0.500385582447052, "learning_rate": 0.00019816040000000001, "loss": 0.753, "step": 4600 }, { "epoch": 0.0094, "grad_norm": 0.8873578906059265, "learning_rate": 0.0001981204, "loss": 0.7047, "step": 4700 }, { "epoch": 0.0096, "grad_norm": 0.6691761612892151, "learning_rate": 0.0001980804, "loss": 0.7139, "step": 4800 }, { "epoch": 0.0098, "grad_norm": 0.593289852142334, "learning_rate": 
0.00019804040000000002, "loss": 0.7075, "step": 4900 }, { "epoch": 0.01, "grad_norm": 0.7012218236923218, "learning_rate": 0.00019800040000000002, "loss": 0.6878, "step": 5000 }, { "epoch": 0.0102, "grad_norm": 0.5074142217636108, "learning_rate": 0.0001979604, "loss": 0.7574, "step": 5100 }, { "epoch": 0.0104, "grad_norm": 0.9699320197105408, "learning_rate": 0.0001979204, "loss": 0.6619, "step": 5200 }, { "epoch": 0.0106, "grad_norm": 0.44478604197502136, "learning_rate": 0.0001978804, "loss": 0.7029, "step": 5300 }, { "epoch": 0.0108, "grad_norm": 0.43934938311576843, "learning_rate": 0.00019784040000000002, "loss": 0.6991, "step": 5400 }, { "epoch": 0.011, "grad_norm": 0.4505891501903534, "learning_rate": 0.0001978004, "loss": 0.7019, "step": 5500 }, { "epoch": 0.0112, "grad_norm": 0.40732213854789734, "learning_rate": 0.0001977604, "loss": 0.7085, "step": 5600 }, { "epoch": 0.0114, "grad_norm": 0.5004173517227173, "learning_rate": 0.0001977204, "loss": 0.6757, "step": 5700 }, { "epoch": 0.0116, "grad_norm": 0.5011471509933472, "learning_rate": 0.00019768040000000002, "loss": 0.6752, "step": 5800 }, { "epoch": 0.0118, "grad_norm": 0.3992891311645508, "learning_rate": 0.0001976404, "loss": 0.6733, "step": 5900 }, { "epoch": 0.012, "grad_norm": 0.3974108099937439, "learning_rate": 0.0001976004, "loss": 0.6737, "step": 6000 }, { "epoch": 0.0122, "grad_norm": 0.4213123917579651, "learning_rate": 0.0001975604, "loss": 0.7769, "step": 6100 }, { "epoch": 0.0124, "grad_norm": 0.4016417860984802, "learning_rate": 0.0001975204, "loss": 0.6731, "step": 6200 }, { "epoch": 0.0126, "grad_norm": 0.45475897192955017, "learning_rate": 0.00019748040000000001, "loss": 0.6566, "step": 6300 }, { "epoch": 0.0128, "grad_norm": 0.298532098531723, "learning_rate": 0.0001974404, "loss": 0.6221, "step": 6400 }, { "epoch": 0.013, "grad_norm": 0.36682987213134766, "learning_rate": 0.00019740040000000003, "loss": 0.6402, "step": 6500 }, { "epoch": 0.0132, "grad_norm": 0.35242322087287903, 
"learning_rate": 0.0001973604, "loss": 0.6069, "step": 6600 }, { "epoch": 0.0134, "grad_norm": 0.49721652269363403, "learning_rate": 0.00019732040000000002, "loss": 0.6847, "step": 6700 }, { "epoch": 0.0136, "grad_norm": 0.371697336435318, "learning_rate": 0.0001972804, "loss": 0.617, "step": 6800 }, { "epoch": 0.0138, "grad_norm": 0.498140424489975, "learning_rate": 0.0001972404, "loss": 0.7371, "step": 6900 }, { "epoch": 0.014, "grad_norm": 0.4353884160518646, "learning_rate": 0.00019720040000000002, "loss": 0.6746, "step": 7000 }, { "epoch": 0.0142, "grad_norm": 0.47410452365875244, "learning_rate": 0.0001971604, "loss": 0.6462, "step": 7100 }, { "epoch": 0.0144, "grad_norm": 0.558781087398529, "learning_rate": 0.0001971204, "loss": 0.6459, "step": 7200 }, { "epoch": 0.0146, "grad_norm": 0.41540494561195374, "learning_rate": 0.0001970804, "loss": 0.6269, "step": 7300 }, { "epoch": 0.0148, "grad_norm": 0.5316945910453796, "learning_rate": 0.00019704040000000003, "loss": 0.6097, "step": 7400 }, { "epoch": 0.015, "grad_norm": 0.5045424103736877, "learning_rate": 0.00019700040000000002, "loss": 0.6496, "step": 7500 }, { "epoch": 0.0152, "grad_norm": 0.3779171407222748, "learning_rate": 0.0001969604, "loss": 0.632, "step": 7600 }, { "epoch": 0.0154, "grad_norm": 0.31508123874664307, "learning_rate": 0.0001969204, "loss": 0.595, "step": 7700 }, { "epoch": 0.0156, "grad_norm": 0.5666154623031616, "learning_rate": 0.0001968804, "loss": 0.6193, "step": 7800 }, { "epoch": 0.0158, "grad_norm": 0.36079517006874084, "learning_rate": 0.00019684040000000002, "loss": 0.6154, "step": 7900 }, { "epoch": 0.016, "grad_norm": 0.716064989566803, "learning_rate": 0.00019680040000000001, "loss": 0.6018, "step": 8000 }, { "epoch": 0.0162, "grad_norm": 0.3534316122531891, "learning_rate": 0.0001967604, "loss": 0.6298, "step": 8100 }, { "epoch": 0.0164, "grad_norm": 0.31048017740249634, "learning_rate": 0.0001967204, "loss": 0.6118, "step": 8200 }, { "epoch": 0.0166, "grad_norm": 
0.7001683115959167, "learning_rate": 0.00019668040000000002, "loss": 0.6154, "step": 8300 }, { "epoch": 0.0168, "grad_norm": 0.577167809009552, "learning_rate": 0.00019664040000000002, "loss": 0.6081, "step": 8400 }, { "epoch": 0.017, "grad_norm": 0.48630014061927795, "learning_rate": 0.0001966004, "loss": 0.6225, "step": 8500 }, { "epoch": 0.0172, "grad_norm": 0.3509364426136017, "learning_rate": 0.0001965604, "loss": 0.6283, "step": 8600 }, { "epoch": 0.0174, "grad_norm": 0.5807685256004333, "learning_rate": 0.0001965204, "loss": 0.5904, "step": 8700 }, { "epoch": 0.0176, "grad_norm": 0.40310025215148926, "learning_rate": 0.00019648040000000002, "loss": 0.6043, "step": 8800 }, { "epoch": 0.0178, "grad_norm": 0.4283163845539093, "learning_rate": 0.0001964404, "loss": 0.5648, "step": 8900 }, { "epoch": 0.018, "grad_norm": 0.33734825253486633, "learning_rate": 0.0001964004, "loss": 0.5728, "step": 9000 }, { "epoch": 0.0182, "grad_norm": 0.2931211292743683, "learning_rate": 0.0001963604, "loss": 0.5937, "step": 9100 }, { "epoch": 0.0184, "grad_norm": 0.4612201154232025, "learning_rate": 0.0001963204, "loss": 0.5934, "step": 9200 }, { "epoch": 0.0186, "grad_norm": 0.3644779622554779, "learning_rate": 0.0001962804, "loss": 0.5766, "step": 9300 }, { "epoch": 0.0188, "grad_norm": 0.30560925602912903, "learning_rate": 0.0001962404, "loss": 0.5464, "step": 9400 }, { "epoch": 0.019, "grad_norm": 0.6216826438903809, "learning_rate": 0.00019620040000000003, "loss": 0.6206, "step": 9500 }, { "epoch": 0.0192, "grad_norm": 0.3877085745334625, "learning_rate": 0.0001961604, "loss": 0.5825, "step": 9600 }, { "epoch": 0.0194, "grad_norm": 0.4004894495010376, "learning_rate": 0.00019612040000000001, "loss": 0.5918, "step": 9700 }, { "epoch": 0.0196, "grad_norm": 0.49157753586769104, "learning_rate": 0.0001960804, "loss": 0.5891, "step": 9800 }, { "epoch": 0.0198, "grad_norm": 1.031232237815857, "learning_rate": 0.0001960404, "loss": 0.5748, "step": 9900 }, { "epoch": 0.02, 
"grad_norm": 0.36149775981903076, "learning_rate": 0.00019600040000000002, "loss": 0.5844, "step": 10000 }, { "epoch": 0.0202, "grad_norm": 0.29936593770980835, "learning_rate": 0.0001959604, "loss": 0.5687, "step": 10100 }, { "epoch": 0.0204, "grad_norm": 4.51532506942749, "learning_rate": 0.0001959204, "loss": 0.6077, "step": 10200 }, { "epoch": 0.0206, "grad_norm": 0.44001829624176025, "learning_rate": 0.0001958804, "loss": 0.6138, "step": 10300 }, { "epoch": 0.0208, "grad_norm": 0.26877814531326294, "learning_rate": 0.00019584040000000002, "loss": 0.5787, "step": 10400 }, { "epoch": 0.021, "grad_norm": 0.441687673330307, "learning_rate": 0.00019580040000000002, "loss": 0.5353, "step": 10500 }, { "epoch": 0.0212, "grad_norm": 0.4635355472564697, "learning_rate": 0.0001957604, "loss": 0.5686, "step": 10600 }, { "epoch": 0.0214, "grad_norm": 0.3567267060279846, "learning_rate": 0.0001957204, "loss": 0.5363, "step": 10700 }, { "epoch": 0.0216, "grad_norm": 0.5282173156738281, "learning_rate": 0.0001956804, "loss": 0.582, "step": 10800 }, { "epoch": 0.0218, "grad_norm": 0.32945743203163147, "learning_rate": 0.00019564040000000002, "loss": 0.5695, "step": 10900 }, { "epoch": 0.022, "grad_norm": 1.3247750997543335, "learning_rate": 0.0001956004, "loss": 0.5665, "step": 11000 }, { "epoch": 0.0222, "grad_norm": 0.3688965439796448, "learning_rate": 0.0001955604, "loss": 0.5835, "step": 11100 }, { "epoch": 0.0224, "grad_norm": 0.4591335356235504, "learning_rate": 0.0001955204, "loss": 0.5638, "step": 11200 }, { "epoch": 0.0226, "grad_norm": 0.4654744267463684, "learning_rate": 0.00019548040000000002, "loss": 0.5821, "step": 11300 }, { "epoch": 0.0228, "grad_norm": 1.729272484779358, "learning_rate": 0.0001954404, "loss": 0.6013, "step": 11400 }, { "epoch": 0.023, "grad_norm": 0.3169005513191223, "learning_rate": 0.0001954004, "loss": 0.5678, "step": 11500 }, { "epoch": 0.0232, "grad_norm": 0.4935179650783539, "learning_rate": 0.0001953604, "loss": 0.5517, "step": 11600 }, 
{ "epoch": 0.0234, "grad_norm": 0.3395957350730896, "learning_rate": 0.0001953204, "loss": 0.5387, "step": 11700 }, { "epoch": 0.0236, "grad_norm": 0.4479553997516632, "learning_rate": 0.00019528040000000001, "loss": 0.5748, "step": 11800 }, { "epoch": 0.0238, "grad_norm": 0.36801132559776306, "learning_rate": 0.0001952404, "loss": 0.6047, "step": 11900 }, { "epoch": 0.024, "grad_norm": 0.5720965266227722, "learning_rate": 0.00019520040000000003, "loss": 0.5458, "step": 12000 }, { "epoch": 0.0242, "grad_norm": 0.37826624512672424, "learning_rate": 0.0001951604, "loss": 0.5512, "step": 12100 }, { "epoch": 0.0244, "grad_norm": 0.39435476064682007, "learning_rate": 0.00019512040000000002, "loss": 0.5388, "step": 12200 }, { "epoch": 0.0246, "grad_norm": 0.4711388349533081, "learning_rate": 0.0001950804, "loss": 0.535, "step": 12300 }, { "epoch": 0.0248, "grad_norm": 0.38097721338272095, "learning_rate": 0.0001950404, "loss": 0.5675, "step": 12400 }, { "epoch": 0.025, "grad_norm": 0.3961537778377533, "learning_rate": 0.00019500040000000002, "loss": 0.5727, "step": 12500 }, { "epoch": 0.0252, "grad_norm": 0.5444319248199463, "learning_rate": 0.0001949604, "loss": 0.5879, "step": 12600 }, { "epoch": 0.0254, "grad_norm": 0.5849190354347229, "learning_rate": 0.0001949204, "loss": 0.5635, "step": 12700 }, { "epoch": 0.0256, "grad_norm": 0.6113494038581848, "learning_rate": 0.0001948804, "loss": 0.5467, "step": 12800 }, { "epoch": 0.0258, "grad_norm": 0.4276942312717438, "learning_rate": 0.0001948404, "loss": 0.5554, "step": 12900 }, { "epoch": 0.026, "grad_norm": 0.48870015144348145, "learning_rate": 0.00019480040000000002, "loss": 0.5986, "step": 13000 }, { "epoch": 0.0262, "grad_norm": 0.27192604541778564, "learning_rate": 0.00019476039999999999, "loss": 0.5551, "step": 13100 }, { "epoch": 0.0264, "grad_norm": 0.6044579148292542, "learning_rate": 0.0001947204, "loss": 0.5477, "step": 13200 }, { "epoch": 0.0266, "grad_norm": 0.4254133105278015, "learning_rate": 
0.0001946804, "loss": 0.547, "step": 13300 }, { "epoch": 0.0268, "grad_norm": 1.369423747062683, "learning_rate": 0.00019464040000000002, "loss": 0.5492, "step": 13400 }, { "epoch": 0.027, "grad_norm": 0.38343751430511475, "learning_rate": 0.00019460040000000001, "loss": 0.5275, "step": 13500 }, { "epoch": 0.0272, "grad_norm": 0.45244088768959045, "learning_rate": 0.0001945604, "loss": 0.4827, "step": 13600 }, { "epoch": 0.0274, "grad_norm": 0.3227088451385498, "learning_rate": 0.0001945204, "loss": 0.4863, "step": 13700 }, { "epoch": 0.0276, "grad_norm": 0.45713531970977783, "learning_rate": 0.0001944804, "loss": 0.4933, "step": 13800 }, { "epoch": 0.0278, "grad_norm": 0.42994385957717896, "learning_rate": 0.00019444040000000002, "loss": 0.484, "step": 13900 }, { "epoch": 0.028, "grad_norm": 0.3284667134284973, "learning_rate": 0.0001944004, "loss": 0.4956, "step": 14000 }, { "epoch": 0.0282, "grad_norm": 0.44729650020599365, "learning_rate": 0.00019436040000000003, "loss": 0.4595, "step": 14100 }, { "epoch": 0.0284, "grad_norm": 0.31246569752693176, "learning_rate": 0.0001943204, "loss": 0.4796, "step": 14200 }, { "epoch": 0.0286, "grad_norm": 0.46601220965385437, "learning_rate": 0.00019428040000000002, "loss": 0.4956, "step": 14300 }, { "epoch": 0.0288, "grad_norm": 0.33735036849975586, "learning_rate": 0.0001942404, "loss": 0.4689, "step": 14400 }, { "epoch": 0.029, "grad_norm": 0.44884735345840454, "learning_rate": 0.0001942004, "loss": 0.4913, "step": 14500 }, { "epoch": 0.0292, "grad_norm": 1.72175133228302, "learning_rate": 0.00019416040000000003, "loss": 0.4932, "step": 14600 }, { "epoch": 0.0294, "grad_norm": 0.8482571244239807, "learning_rate": 0.0001941204, "loss": 0.4834, "step": 14700 }, { "epoch": 0.0296, "grad_norm": 0.35360008478164673, "learning_rate": 0.0001940804, "loss": 0.498, "step": 14800 }, { "epoch": 0.0298, "grad_norm": 0.32927367091178894, "learning_rate": 0.0001940404, "loss": 0.4824, "step": 14900 }, { "epoch": 0.03, "grad_norm": 
0.4251364469528198, "learning_rate": 0.00019400040000000003, "loss": 0.4941, "step": 15000 }, { "epoch": 0.0302, "grad_norm": 0.3267940878868103, "learning_rate": 0.00019396040000000002, "loss": 0.4943, "step": 15100 }, { "epoch": 0.0304, "grad_norm": 0.29757174849510193, "learning_rate": 0.00019392040000000001, "loss": 0.4856, "step": 15200 }, { "epoch": 0.0306, "grad_norm": 0.4625142216682434, "learning_rate": 0.0001938804, "loss": 0.4759, "step": 15300 }, { "epoch": 0.0308, "grad_norm": 0.3954981565475464, "learning_rate": 0.0001938404, "loss": 0.4731, "step": 15400 }, { "epoch": 0.031, "grad_norm": 0.42979204654693604, "learning_rate": 0.00019380040000000002, "loss": 0.518, "step": 15500 }, { "epoch": 0.0312, "grad_norm": 0.5260553956031799, "learning_rate": 0.00019376040000000002, "loss": 0.4934, "step": 15600 }, { "epoch": 0.0314, "grad_norm": 0.6792057156562805, "learning_rate": 0.0001937204, "loss": 0.478, "step": 15700 }, { "epoch": 0.0316, "grad_norm": 0.3626260459423065, "learning_rate": 0.0001936804, "loss": 0.4859, "step": 15800 }, { "epoch": 0.0318, "grad_norm": 0.6549792885780334, "learning_rate": 0.00019364040000000002, "loss": 0.4843, "step": 15900 }, { "epoch": 0.032, "grad_norm": 0.40819457173347473, "learning_rate": 0.00019360040000000002, "loss": 0.4876, "step": 16000 }, { "epoch": 0.0322, "grad_norm": 0.46695244312286377, "learning_rate": 0.0001935604, "loss": 0.493, "step": 16100 }, { "epoch": 0.0324, "grad_norm": 0.29340672492980957, "learning_rate": 0.0001935204, "loss": 0.4822, "step": 16200 }, { "epoch": 0.0326, "grad_norm": 0.4895103871822357, "learning_rate": 0.0001934804, "loss": 0.4775, "step": 16300 }, { "epoch": 0.0328, "grad_norm": 0.38141900300979614, "learning_rate": 0.00019344040000000002, "loss": 0.5175, "step": 16400 }, { "epoch": 0.033, "grad_norm": 0.2875644862651825, "learning_rate": 0.0001934004, "loss": 0.5004, "step": 16500 }, { "epoch": 0.0332, "grad_norm": 1.0335603952407837, "learning_rate": 0.0001933604, "loss": 
0.4917, "step": 16600 }, { "epoch": 0.0334, "grad_norm": 0.3046559691429138, "learning_rate": 0.0001933204, "loss": 0.5966, "step": 16700 }, { "epoch": 0.0336, "grad_norm": 0.3726640045642853, "learning_rate": 0.0001932804, "loss": 0.4882, "step": 16800 }, { "epoch": 0.0338, "grad_norm": 0.3167787194252014, "learning_rate": 0.00019324040000000001, "loss": 0.4979, "step": 16900 }, { "epoch": 0.034, "grad_norm": 0.23184147477149963, "learning_rate": 0.0001932004, "loss": 0.4873, "step": 17000 }, { "epoch": 0.0342, "grad_norm": 0.27494415640830994, "learning_rate": 0.00019316040000000003, "loss": 0.4735, "step": 17100 }, { "epoch": 0.0344, "grad_norm": 2.1850268840789795, "learning_rate": 0.0001931204, "loss": 0.4706, "step": 17200 }, { "epoch": 0.0346, "grad_norm": 0.6089339852333069, "learning_rate": 0.00019308040000000002, "loss": 0.4736, "step": 17300 }, { "epoch": 0.0348, "grad_norm": 0.33707159757614136, "learning_rate": 0.0001930404, "loss": 0.4639, "step": 17400 }, { "epoch": 0.035, "grad_norm": 0.3079136610031128, "learning_rate": 0.0001930004, "loss": 0.4576, "step": 17500 }, { "epoch": 0.0352, "grad_norm": 0.9146600961685181, "learning_rate": 0.00019296040000000002, "loss": 0.4713, "step": 17600 }, { "epoch": 0.0354, "grad_norm": 0.3352242708206177, "learning_rate": 0.0001929204, "loss": 0.4628, "step": 17700 }, { "epoch": 0.0356, "grad_norm": 0.3531840443611145, "learning_rate": 0.0001928804, "loss": 0.4814, "step": 17800 }, { "epoch": 0.0358, "grad_norm": 0.3614370822906494, "learning_rate": 0.0001928404, "loss": 0.4786, "step": 17900 }, { "epoch": 0.036, "grad_norm": 0.3174838721752167, "learning_rate": 0.00019280040000000003, "loss": 0.4618, "step": 18000 }, { "epoch": 0.0362, "grad_norm": 0.5685095191001892, "learning_rate": 0.00019276040000000002, "loss": 0.4678, "step": 18100 }, { "epoch": 0.0364, "grad_norm": 0.35674697160720825, "learning_rate": 0.0001927204, "loss": 0.4709, "step": 18200 }, { "epoch": 0.0366, "grad_norm": 0.3839879631996155, 
"learning_rate": 0.0001926804, "loss": 0.4709, "step": 18300 }, { "epoch": 0.0368, "grad_norm": 0.401214599609375, "learning_rate": 0.0001926404, "loss": 0.4716, "step": 18400 }, { "epoch": 0.037, "grad_norm": 0.4122023582458496, "learning_rate": 0.00019260040000000002, "loss": 0.481, "step": 18500 }, { "epoch": 0.0372, "grad_norm": 0.42760711908340454, "learning_rate": 0.00019256040000000001, "loss": 0.4534, "step": 18600 }, { "epoch": 0.0374, "grad_norm": 4.317193508148193, "learning_rate": 0.0001925204, "loss": 0.4838, "step": 18700 }, { "epoch": 0.0376, "grad_norm": 0.3325408697128296, "learning_rate": 0.0001924804, "loss": 0.4756, "step": 18800 }, { "epoch": 0.0378, "grad_norm": 0.4391893744468689, "learning_rate": 0.00019244040000000002, "loss": 0.4625, "step": 18900 }, { "epoch": 0.038, "grad_norm": 0.3271416127681732, "learning_rate": 0.00019240040000000002, "loss": 0.452, "step": 19000 }, { "epoch": 0.0382, "grad_norm": 0.2682945132255554, "learning_rate": 0.0001923604, "loss": 0.4677, "step": 19100 }, { "epoch": 0.0384, "grad_norm": 0.2958075702190399, "learning_rate": 0.0001923204, "loss": 0.4715, "step": 19200 }, { "epoch": 0.0386, "grad_norm": 0.6007779240608215, "learning_rate": 0.0001922804, "loss": 0.4581, "step": 19300 }, { "epoch": 0.0388, "grad_norm": 0.40477871894836426, "learning_rate": 0.00019224040000000002, "loss": 0.4675, "step": 19400 }, { "epoch": 0.039, "grad_norm": 0.44704627990722656, "learning_rate": 0.0001922004, "loss": 0.4846, "step": 19500 }, { "epoch": 0.0392, "grad_norm": 0.4317639172077179, "learning_rate": 0.00019216040000000003, "loss": 0.4681, "step": 19600 }, { "epoch": 0.0394, "grad_norm": 0.274308443069458, "learning_rate": 0.0001921204, "loss": 0.4577, "step": 19700 }, { "epoch": 0.0396, "grad_norm": 0.3873860239982605, "learning_rate": 0.0001920804, "loss": 0.4505, "step": 19800 }, { "epoch": 0.0398, "grad_norm": 0.4344724714756012, "learning_rate": 0.0001920404, "loss": 0.4531, "step": 19900 }, { "epoch": 0.04, 
"grad_norm": 0.3240549564361572, "learning_rate": 0.0001920004, "loss": 0.4616, "step": 20000 }, { "epoch": 0.0402, "grad_norm": 0.3235743045806885, "learning_rate": 0.00019196040000000003, "loss": 0.4435, "step": 20100 }, { "epoch": 0.0404, "grad_norm": 0.6763646602630615, "learning_rate": 0.0001919204, "loss": 0.4806, "step": 20200 }, { "epoch": 0.0406, "grad_norm": 0.3265356123447418, "learning_rate": 0.00019188040000000001, "loss": 0.4674, "step": 20300 }, { "epoch": 0.0408, "grad_norm": 0.41400811076164246, "learning_rate": 0.0001918404, "loss": 0.4687, "step": 20400 }, { "epoch": 0.041, "grad_norm": 1.1848185062408447, "learning_rate": 0.0001918004, "loss": 0.4602, "step": 20500 }, { "epoch": 0.0412, "grad_norm": 0.4341808259487152, "learning_rate": 0.00019176040000000002, "loss": 0.4559, "step": 20600 }, { "epoch": 0.0414, "grad_norm": 0.40686967968940735, "learning_rate": 0.0001917204, "loss": 0.4408, "step": 20700 }, { "epoch": 0.0416, "grad_norm": 0.28525295853614807, "learning_rate": 0.0001916804, "loss": 0.4841, "step": 20800 }, { "epoch": 0.0418, "grad_norm": 0.29934221506118774, "learning_rate": 0.0001916404, "loss": 0.4562, "step": 20900 }, { "epoch": 0.042, "grad_norm": 0.6304417848587036, "learning_rate": 0.00019160040000000002, "loss": 0.4609, "step": 21000 }, { "epoch": 0.0422, "grad_norm": 0.2794235348701477, "learning_rate": 0.00019156040000000002, "loss": 0.4578, "step": 21100 }, { "epoch": 0.0424, "grad_norm": 0.9360523223876953, "learning_rate": 0.0001915204, "loss": 0.4563, "step": 21200 }, { "epoch": 0.0426, "grad_norm": 0.3164953291416168, "learning_rate": 0.0001914804, "loss": 0.4563, "step": 21300 }, { "epoch": 0.0428, "grad_norm": 0.2829827070236206, "learning_rate": 0.0001914404, "loss": 0.4509, "step": 21400 }, { "epoch": 0.043, "grad_norm": 0.3804665803909302, "learning_rate": 0.00019140040000000002, "loss": 0.4523, "step": 21500 }, { "epoch": 0.0432, "grad_norm": 0.2858908176422119, "learning_rate": 0.0001913604, "loss": 0.4669, 
"step": 21600 }, { "epoch": 0.0434, "grad_norm": 0.29328569769859314, "learning_rate": 0.0001913204, "loss": 0.4571, "step": 21700 }, { "epoch": 0.0436, "grad_norm": 0.3529110252857208, "learning_rate": 0.0001912804, "loss": 0.4546, "step": 21800 }, { "epoch": 0.0438, "grad_norm": 0.2541522681713104, "learning_rate": 0.00019124040000000002, "loss": 0.4618, "step": 21900 }, { "epoch": 0.044, "grad_norm": 0.3136732280254364, "learning_rate": 0.0001912004, "loss": 0.4579, "step": 22000 }, { "epoch": 0.0442, "grad_norm": 0.35282132029533386, "learning_rate": 0.0001911604, "loss": 0.4801, "step": 22100 }, { "epoch": 0.0444, "grad_norm": 0.32009491324424744, "learning_rate": 0.0001911204, "loss": 0.4422, "step": 22200 }, { "epoch": 0.0446, "grad_norm": 0.25222790241241455, "learning_rate": 0.0001910804, "loss": 0.5024, "step": 22300 }, { "epoch": 0.0448, "grad_norm": 0.24487198889255524, "learning_rate": 0.00019104040000000001, "loss": 0.44, "step": 22400 }, { "epoch": 0.045, "grad_norm": 0.25609472393989563, "learning_rate": 0.0001910004, "loss": 0.4403, "step": 22500 }, { "epoch": 0.0452, "grad_norm": 0.24061603844165802, "learning_rate": 0.00019096040000000003, "loss": 0.434, "step": 22600 }, { "epoch": 0.0454, "grad_norm": 0.24821767210960388, "learning_rate": 0.0001909204, "loss": 0.4383, "step": 22700 }, { "epoch": 0.0456, "grad_norm": 0.3346046507358551, "learning_rate": 0.00019088040000000002, "loss": 0.4337, "step": 22800 }, { "epoch": 0.0458, "grad_norm": 0.4589807093143463, "learning_rate": 0.0001908404, "loss": 0.436, "step": 22900 }, { "epoch": 0.046, "grad_norm": 0.24371834099292755, "learning_rate": 0.0001908004, "loss": 0.4374, "step": 23000 }, { "epoch": 0.0462, "grad_norm": 0.30695632100105286, "learning_rate": 0.00019076040000000002, "loss": 0.4507, "step": 23100 }, { "epoch": 0.0464, "grad_norm": 0.3627791702747345, "learning_rate": 0.0001907204, "loss": 0.4853, "step": 23200 }, { "epoch": 0.0466, "grad_norm": 0.4218233525753021, "learning_rate": 
0.0001906804, "loss": 0.4422, "step": 23300 }, { "epoch": 0.0468, "grad_norm": 0.3387954831123352, "learning_rate": 0.0001906404, "loss": 0.4443, "step": 23400 }, { "epoch": 0.047, "grad_norm": 0.3261500597000122, "learning_rate": 0.0001906004, "loss": 0.4438, "step": 23500 }, { "epoch": 0.0472, "grad_norm": 0.27388980984687805, "learning_rate": 0.00019056040000000002, "loss": 0.4383, "step": 23600 }, { "epoch": 0.0474, "grad_norm": 0.2767557203769684, "learning_rate": 0.00019052039999999999, "loss": 0.444, "step": 23700 }, { "epoch": 0.0476, "grad_norm": 0.37184062600135803, "learning_rate": 0.0001904804, "loss": 0.443, "step": 23800 }, { "epoch": 0.0478, "grad_norm": 0.33180898427963257, "learning_rate": 0.0001904404, "loss": 0.4601, "step": 23900 }, { "epoch": 0.048, "grad_norm": 0.3857232630252838, "learning_rate": 0.00019040040000000002, "loss": 0.4645, "step": 24000 }, { "epoch": 0.0482, "grad_norm": 0.4319915175437927, "learning_rate": 0.00019036040000000001, "loss": 0.4464, "step": 24100 }, { "epoch": 0.0484, "grad_norm": 1.3287200927734375, "learning_rate": 0.0001903204, "loss": 0.4419, "step": 24200 }, { "epoch": 0.0486, "grad_norm": 0.2736344337463379, "learning_rate": 0.0001902804, "loss": 0.4541, "step": 24300 }, { "epoch": 0.0488, "grad_norm": 0.2820674777030945, "learning_rate": 0.0001902404, "loss": 0.4284, "step": 24400 }, { "epoch": 0.049, "grad_norm": 0.4350157678127289, "learning_rate": 0.00019020040000000002, "loss": 0.4487, "step": 24500 }, { "epoch": 0.0492, "grad_norm": 0.29447850584983826, "learning_rate": 0.0001901604, "loss": 0.4321, "step": 24600 }, { "epoch": 0.0494, "grad_norm": 0.5137728452682495, "learning_rate": 0.0001901204, "loss": 0.4482, "step": 24700 }, { "epoch": 0.0496, "grad_norm": 0.33691874146461487, "learning_rate": 0.0001900804, "loss": 0.4471, "step": 24800 }, { "epoch": 0.0498, "grad_norm": 0.4733399748802185, "learning_rate": 0.00019004040000000002, "loss": 0.4448, "step": 24900 }, { "epoch": 0.05, "grad_norm": 
0.2643273174762726, "learning_rate": 0.0001900004, "loss": 0.4413, "step": 25000 }, { "epoch": 0.0502, "grad_norm": 0.33353281021118164, "learning_rate": 0.0001899604, "loss": 0.4519, "step": 25100 }, { "epoch": 0.0504, "grad_norm": 0.7628878355026245, "learning_rate": 0.0001899204, "loss": 0.4843, "step": 25200 }, { "epoch": 0.0506, "grad_norm": 0.33647382259368896, "learning_rate": 0.0001898804, "loss": 0.4402, "step": 25300 }, { "epoch": 0.0508, "grad_norm": 0.2926424741744995, "learning_rate": 0.0001898404, "loss": 0.4441, "step": 25400 }, { "epoch": 0.051, "grad_norm": 0.24321198463439941, "learning_rate": 0.0001898004, "loss": 0.4485, "step": 25500 }, { "epoch": 0.0512, "grad_norm": 0.2866199314594269, "learning_rate": 0.00018976040000000003, "loss": 0.4311, "step": 25600 }, { "epoch": 0.0514, "grad_norm": 0.27528929710388184, "learning_rate": 0.0001897204, "loss": 0.4478, "step": 25700 }, { "epoch": 0.0516, "grad_norm": 0.27291688323020935, "learning_rate": 0.00018968040000000001, "loss": 0.4504, "step": 25800 }, { "epoch": 0.0518, "grad_norm": 0.36198413372039795, "learning_rate": 0.0001896404, "loss": 0.4304, "step": 25900 }, { "epoch": 0.052, "grad_norm": 1.4908273220062256, "learning_rate": 0.0001896004, "loss": 0.4516, "step": 26000 }, { "epoch": 0.0522, "grad_norm": 0.36965224146842957, "learning_rate": 0.00018956040000000002, "loss": 0.4227, "step": 26100 }, { "epoch": 0.0524, "grad_norm": 0.3467487394809723, "learning_rate": 0.00018952040000000002, "loss": 0.4352, "step": 26200 }, { "epoch": 0.0526, "grad_norm": 0.5308062434196472, "learning_rate": 0.0001894804, "loss": 0.4636, "step": 26300 }, { "epoch": 0.0528, "grad_norm": 0.30632877349853516, "learning_rate": 0.0001894404, "loss": 0.4416, "step": 26400 }, { "epoch": 0.053, "grad_norm": 0.9236398339271545, "learning_rate": 0.00018940040000000002, "loss": 0.4302, "step": 26500 }, { "epoch": 0.0532, "grad_norm": 0.37798625230789185, "learning_rate": 0.00018936040000000002, "loss": 0.4479, "step": 
26600 }, { "epoch": 0.0534, "grad_norm": 0.20882748067378998, "learning_rate": 0.0001893204, "loss": 0.4256, "step": 26700 }, { "epoch": 0.0536, "grad_norm": 0.3316884934902191, "learning_rate": 0.0001892804, "loss": 0.4281, "step": 26800 }, { "epoch": 0.0538, "grad_norm": 0.2426268756389618, "learning_rate": 0.0001892404, "loss": 0.4328, "step": 26900 }, { "epoch": 0.054, "grad_norm": 0.263194739818573, "learning_rate": 0.00018920040000000002, "loss": 0.4321, "step": 27000 }, { "epoch": 0.0542, "grad_norm": 0.2998006343841553, "learning_rate": 0.0001891604, "loss": 0.4322, "step": 27100 }, { "epoch": 0.0544, "grad_norm": 0.32721447944641113, "learning_rate": 0.00018912040000000003, "loss": 0.4483, "step": 27200 }, { "epoch": 0.0546, "grad_norm": 0.33025509119033813, "learning_rate": 0.0001890804, "loss": 0.4429, "step": 27300 }, { "epoch": 0.0548, "grad_norm": 0.2779247462749481, "learning_rate": 0.0001890404, "loss": 0.4243, "step": 27400 }, { "epoch": 0.055, "grad_norm": 0.4281350374221802, "learning_rate": 0.00018900040000000001, "loss": 0.43, "step": 27500 }, { "epoch": 0.0552, "grad_norm": 0.3283655345439911, "learning_rate": 0.0001889604, "loss": 0.4358, "step": 27600 }, { "epoch": 0.0554, "grad_norm": 0.4739823341369629, "learning_rate": 0.00018892040000000003, "loss": 0.4729, "step": 27700 }, { "epoch": 0.0556, "grad_norm": 0.5889720916748047, "learning_rate": 0.0001888804, "loss": 0.4436, "step": 27800 }, { "epoch": 0.0558, "grad_norm": 0.6123796105384827, "learning_rate": 0.00018884040000000002, "loss": 0.4222, "step": 27900 }, { "epoch": 0.056, "grad_norm": 0.24797767400741577, "learning_rate": 0.0001888004, "loss": 0.4527, "step": 28000 }, { "epoch": 0.0562, "grad_norm": 0.2619722783565521, "learning_rate": 0.0001887604, "loss": 0.4307, "step": 28100 }, { "epoch": 0.0564, "grad_norm": 0.309741348028183, "learning_rate": 0.00018872040000000002, "loss": 0.4343, "step": 28200 }, { "epoch": 0.0566, "grad_norm": 0.6912897825241089, "learning_rate": 
0.0001886804, "loss": 0.4137, "step": 28300 }, { "epoch": 0.0568, "grad_norm": 0.40055686235427856, "learning_rate": 0.0001886404, "loss": 0.4395, "step": 28400 }, { "epoch": 0.057, "grad_norm": 0.19984565675258636, "learning_rate": 0.0001886004, "loss": 0.4418, "step": 28500 }, { "epoch": 0.0572, "grad_norm": 0.4430752694606781, "learning_rate": 0.00018856040000000003, "loss": 0.4269, "step": 28600 }, { "epoch": 0.0574, "grad_norm": 0.37985754013061523, "learning_rate": 0.00018852040000000002, "loss": 0.4349, "step": 28700 }, { "epoch": 0.0576, "grad_norm": 0.29130980372428894, "learning_rate": 0.0001884804, "loss": 0.4394, "step": 28800 }, { "epoch": 0.0578, "grad_norm": 0.5696423053741455, "learning_rate": 0.0001884404, "loss": 0.442, "step": 28900 }, { "epoch": 0.058, "grad_norm": 0.25018632411956787, "learning_rate": 0.0001884004, "loss": 0.4433, "step": 29000 }, { "epoch": 0.0582, "grad_norm": 0.2951875627040863, "learning_rate": 0.00018836040000000002, "loss": 0.4188, "step": 29100 }, { "epoch": 0.0584, "grad_norm": 0.23352210223674774, "learning_rate": 0.00018832040000000001, "loss": 0.4222, "step": 29200 }, { "epoch": 0.0586, "grad_norm": 0.3203904330730438, "learning_rate": 0.0001882804, "loss": 0.4261, "step": 29300 }, { "epoch": 0.0588, "grad_norm": 0.34108293056488037, "learning_rate": 0.0001882404, "loss": 0.4341, "step": 29400 }, { "epoch": 0.059, "grad_norm": 0.45176732540130615, "learning_rate": 0.00018820040000000002, "loss": 0.4667, "step": 29500 }, { "epoch": 0.0592, "grad_norm": 0.4928261637687683, "learning_rate": 0.00018816040000000002, "loss": 0.4546, "step": 29600 }, { "epoch": 0.0594, "grad_norm": 0.31960615515708923, "learning_rate": 0.0001881204, "loss": 0.4346, "step": 29700 }, { "epoch": 0.0596, "grad_norm": 0.2550900876522064, "learning_rate": 0.0001880804, "loss": 0.4181, "step": 29800 }, { "epoch": 0.0598, "grad_norm": 0.21990257501602173, "learning_rate": 0.0001880404, "loss": 0.4103, "step": 29900 }, { "epoch": 0.06, "grad_norm": 
0.2592441439628601, "learning_rate": 0.00018800040000000002, "loss": 0.4092, "step": 30000 }, { "epoch": 0.0602, "grad_norm": 0.2601667046546936, "learning_rate": 0.0001879604, "loss": 0.3927, "step": 30100 }, { "epoch": 0.0604, "grad_norm": 0.24636943638324738, "learning_rate": 0.00018792040000000003, "loss": 0.3929, "step": 30200 }, { "epoch": 0.0606, "grad_norm": 0.3238808810710907, "learning_rate": 0.0001878804, "loss": 0.3894, "step": 30300 }, { "epoch": 0.0608, "grad_norm": 0.2241670787334442, "learning_rate": 0.00018784040000000002, "loss": 0.3862, "step": 30400 }, { "epoch": 0.061, "grad_norm": 0.26162776350975037, "learning_rate": 0.0001878004, "loss": 0.3836, "step": 30500 }, { "epoch": 0.0612, "grad_norm": 0.21369214355945587, "learning_rate": 0.0001877604, "loss": 0.3829, "step": 30600 }, { "epoch": 0.0614, "grad_norm": 0.311394602060318, "learning_rate": 0.00018772040000000003, "loss": 0.384, "step": 30700 }, { "epoch": 0.0616, "grad_norm": 0.2316521257162094, "learning_rate": 0.0001876804, "loss": 0.3934, "step": 30800 }, { "epoch": 0.0618, "grad_norm": 0.4280303120613098, "learning_rate": 0.00018764040000000001, "loss": 0.3983, "step": 30900 }, { "epoch": 0.062, "grad_norm": 0.26460182666778564, "learning_rate": 0.0001876004, "loss": 0.3815, "step": 31000 }, { "epoch": 0.0622, "grad_norm": 0.2175382673740387, "learning_rate": 0.0001875604, "loss": 0.393, "step": 31100 }, { "epoch": 0.0624, "grad_norm": 0.2641454041004181, "learning_rate": 0.00018752040000000002, "loss": 0.3982, "step": 31200 }, { "epoch": 0.0626, "grad_norm": 0.28961893916130066, "learning_rate": 0.0001874804, "loss": 0.393, "step": 31300 }, { "epoch": 0.0628, "grad_norm": 0.2512940764427185, "learning_rate": 0.0001874404, "loss": 0.3896, "step": 31400 }, { "epoch": 0.063, "grad_norm": 0.26094546914100647, "learning_rate": 0.0001874004, "loss": 0.3861, "step": 31500 }, { "epoch": 0.0632, "grad_norm": 1.4088988304138184, "learning_rate": 0.00018736040000000002, "loss": 0.3894, "step": 
31600 }, { "epoch": 0.0634, "grad_norm": 0.21915282309055328, "learning_rate": 0.00018732040000000002, "loss": 0.3895, "step": 31700 }, { "epoch": 0.0636, "grad_norm": 0.282810240983963, "learning_rate": 0.0001872804, "loss": 0.3814, "step": 31800 }, { "epoch": 0.0638, "grad_norm": 0.24024060368537903, "learning_rate": 0.0001872404, "loss": 0.3802, "step": 31900 }, { "epoch": 0.064, "grad_norm": 0.254407674074173, "learning_rate": 0.0001872004, "loss": 0.3916, "step": 32000 }, { "epoch": 0.0642, "grad_norm": 0.22265967726707458, "learning_rate": 0.00018716040000000002, "loss": 0.3941, "step": 32100 }, { "epoch": 0.0644, "grad_norm": 0.2248506098985672, "learning_rate": 0.0001871204, "loss": 0.3878, "step": 32200 }, { "epoch": 0.0646, "grad_norm": 0.30514708161354065, "learning_rate": 0.0001870804, "loss": 0.3861, "step": 32300 }, { "epoch": 0.0648, "grad_norm": 0.24802802503108978, "learning_rate": 0.0001870404, "loss": 0.3802, "step": 32400 }, { "epoch": 0.065, "grad_norm": 0.25993114709854126, "learning_rate": 0.00018700040000000002, "loss": 0.3832, "step": 32500 }, { "epoch": 0.0652, "grad_norm": 0.2758074104785919, "learning_rate": 0.0001869604, "loss": 0.3898, "step": 32600 }, { "epoch": 0.0654, "grad_norm": 0.2821817696094513, "learning_rate": 0.0001869204, "loss": 0.3901, "step": 32700 }, { "epoch": 0.0656, "grad_norm": 0.3593108057975769, "learning_rate": 0.0001868804, "loss": 0.3835, "step": 32800 }, { "epoch": 0.0658, "grad_norm": 0.21605613827705383, "learning_rate": 0.0001868404, "loss": 0.3862, "step": 32900 }, { "epoch": 0.066, "grad_norm": 0.34763646125793457, "learning_rate": 0.00018680040000000001, "loss": 0.3861, "step": 33000 }, { "epoch": 0.0662, "grad_norm": 0.2341729700565338, "learning_rate": 0.0001867604, "loss": 0.3876, "step": 33100 }, { "epoch": 0.0664, "grad_norm": 0.6066027879714966, "learning_rate": 0.00018672040000000003, "loss": 0.3904, "step": 33200 }, { "epoch": 0.0666, "grad_norm": 0.2845059037208557, "learning_rate": 
0.0001866804, "loss": 0.387, "step": 33300 }, { "epoch": 0.0668, "grad_norm": 0.23616880178451538, "learning_rate": 0.00018664040000000002, "loss": 0.3862, "step": 33400 }, { "epoch": 0.067, "grad_norm": 0.21980926394462585, "learning_rate": 0.0001866004, "loss": 0.3888, "step": 33500 }, { "epoch": 0.0672, "grad_norm": 0.2976222336292267, "learning_rate": 0.0001865604, "loss": 0.3873, "step": 33600 }, { "epoch": 0.0674, "grad_norm": 0.23099921643733978, "learning_rate": 0.00018652040000000002, "loss": 0.3816, "step": 33700 }, { "epoch": 0.0676, "grad_norm": 0.2811383306980133, "learning_rate": 0.0001864804, "loss": 0.3849, "step": 33800 }, { "epoch": 0.0678, "grad_norm": 0.2094060480594635, "learning_rate": 0.0001864404, "loss": 0.3761, "step": 33900 }, { "epoch": 0.068, "grad_norm": 0.2863057553768158, "learning_rate": 0.0001864004, "loss": 0.384, "step": 34000 }, { "epoch": 0.0682, "grad_norm": 0.23131683468818665, "learning_rate": 0.00018636040000000003, "loss": 0.3898, "step": 34100 }, { "epoch": 0.0684, "grad_norm": 0.2947816848754883, "learning_rate": 0.00018632040000000002, "loss": 0.3853, "step": 34200 }, { "epoch": 0.0686, "grad_norm": 0.2566851079463959, "learning_rate": 0.00018628039999999999, "loss": 0.3743, "step": 34300 }, { "epoch": 0.0688, "grad_norm": 0.23540231585502625, "learning_rate": 0.0001862404, "loss": 0.3777, "step": 34400 }, { "epoch": 0.069, "grad_norm": 0.3022463917732239, "learning_rate": 0.0001862004, "loss": 0.3865, "step": 34500 }, { "epoch": 0.0692, "grad_norm": 0.2179107815027237, "learning_rate": 0.00018616040000000002, "loss": 0.3802, "step": 34600 }, { "epoch": 0.0694, "grad_norm": 0.3824068009853363, "learning_rate": 0.00018612040000000001, "loss": 0.385, "step": 34700 }, { "epoch": 0.0696, "grad_norm": 0.2288985550403595, "learning_rate": 0.0001860804, "loss": 0.3767, "step": 34800 }, { "epoch": 0.0698, "grad_norm": 0.228486567735672, "learning_rate": 0.0001860404, "loss": 0.3827, "step": 34900 }, { "epoch": 0.07, 
"grad_norm": 0.2173689603805542, "learning_rate": 0.0001860004, "loss": 0.3865, "step": 35000 }, { "epoch": 0.0702, "grad_norm": 0.27074554562568665, "learning_rate": 0.00018596040000000002, "loss": 0.3756, "step": 35100 }, { "epoch": 0.0704, "grad_norm": 0.3166576623916626, "learning_rate": 0.0001859204, "loss": 0.3889, "step": 35200 }, { "epoch": 0.0706, "grad_norm": 0.3322249948978424, "learning_rate": 0.0001858804, "loss": 0.3793, "step": 35300 }, { "epoch": 0.0708, "grad_norm": 0.2725512981414795, "learning_rate": 0.0001858404, "loss": 0.3828, "step": 35400 }, { "epoch": 0.071, "grad_norm": 0.20953910052776337, "learning_rate": 0.00018580040000000002, "loss": 0.3771, "step": 35500 }, { "epoch": 0.0712, "grad_norm": 0.2445133924484253, "learning_rate": 0.0001857604, "loss": 0.3737, "step": 35600 }, { "epoch": 0.0714, "grad_norm": 0.5710493922233582, "learning_rate": 0.0001857204, "loss": 0.3712, "step": 35700 }, { "epoch": 0.0716, "grad_norm": 0.3139127790927887, "learning_rate": 0.0001856804, "loss": 0.3787, "step": 35800 }, { "epoch": 0.0718, "grad_norm": 0.23770715296268463, "learning_rate": 0.0001856404, "loss": 0.3814, "step": 35900 }, { "epoch": 0.072, "grad_norm": 0.3806588351726532, "learning_rate": 0.0001856004, "loss": 0.373, "step": 36000 }, { "epoch": 0.0722, "grad_norm": 0.25435832142829895, "learning_rate": 0.0001855604, "loss": 0.3789, "step": 36100 }, { "epoch": 0.0724, "grad_norm": 0.26886075735092163, "learning_rate": 0.00018552040000000003, "loss": 0.3837, "step": 36200 }, { "epoch": 0.0726, "grad_norm": 0.22618788480758667, "learning_rate": 0.0001854804, "loss": 0.3775, "step": 36300 }, { "epoch": 0.0728, "grad_norm": 0.1857277899980545, "learning_rate": 0.00018544040000000001, "loss": 0.3759, "step": 36400 }, { "epoch": 0.073, "grad_norm": 0.34357950091362, "learning_rate": 0.0001854004, "loss": 0.3698, "step": 36500 }, { "epoch": 0.0732, "grad_norm": 0.26498422026634216, "learning_rate": 0.0001853604, "loss": 0.3702, "step": 36600 }, { 
"epoch": 0.0734, "grad_norm": 0.21762557327747345, "learning_rate": 0.00018532040000000002, "loss": 0.3756, "step": 36700 }, { "epoch": 0.0736, "grad_norm": 0.2890067398548126, "learning_rate": 0.0001852804, "loss": 0.3707, "step": 36800 }, { "epoch": 0.0738, "grad_norm": 0.24609336256980896, "learning_rate": 0.0001852404, "loss": 0.3766, "step": 36900 }, { "epoch": 0.074, "grad_norm": 0.262658953666687, "learning_rate": 0.0001852004, "loss": 0.3736, "step": 37000 }, { "epoch": 0.0742, "grad_norm": 0.20918047428131104, "learning_rate": 0.00018516040000000002, "loss": 0.3756, "step": 37100 }, { "epoch": 0.0744, "grad_norm": 0.324136883020401, "learning_rate": 0.00018512040000000002, "loss": 0.3737, "step": 37200 }, { "epoch": 0.0746, "grad_norm": 0.3206271827220917, "learning_rate": 0.0001850804, "loss": 0.3725, "step": 37300 }, { "epoch": 0.0748, "grad_norm": 0.24515919387340546, "learning_rate": 0.0001850404, "loss": 0.369, "step": 37400 }, { "epoch": 0.075, "grad_norm": 0.441785991191864, "learning_rate": 0.0001850004, "loss": 0.3843, "step": 37500 }, { "epoch": 0.0752, "grad_norm": 0.20712384581565857, "learning_rate": 0.00018496040000000002, "loss": 0.3758, "step": 37600 }, { "epoch": 0.0754, "grad_norm": 0.2728515565395355, "learning_rate": 0.0001849204, "loss": 0.3674, "step": 37700 }, { "epoch": 0.0756, "grad_norm": 0.34958529472351074, "learning_rate": 0.0001848804, "loss": 0.3713, "step": 37800 }, { "epoch": 0.0758, "grad_norm": 0.2223168909549713, "learning_rate": 0.0001848404, "loss": 0.3724, "step": 37900 }, { "epoch": 0.076, "grad_norm": 0.22961725294589996, "learning_rate": 0.0001848004, "loss": 0.3768, "step": 38000 }, { "epoch": 0.0762, "grad_norm": 0.23851755261421204, "learning_rate": 0.00018476040000000001, "loss": 0.3715, "step": 38100 }, { "epoch": 0.0764, "grad_norm": 0.3652637004852295, "learning_rate": 0.0001847204, "loss": 0.3585, "step": 38200 }, { "epoch": 0.0766, "grad_norm": 0.2556198835372925, "learning_rate": 0.0001846804, "loss": 
0.3682, "step": 38300 }, { "epoch": 0.0768, "grad_norm": 0.2592809498310089, "learning_rate": 0.0001846404, "loss": 0.3697, "step": 38400 }, { "epoch": 0.077, "grad_norm": 0.217251256108284, "learning_rate": 0.00018460040000000002, "loss": 0.3657, "step": 38500 }, { "epoch": 0.0772, "grad_norm": 0.22854512929916382, "learning_rate": 0.0001845604, "loss": 0.3669, "step": 38600 }, { "epoch": 0.0774, "grad_norm": 0.4855661690235138, "learning_rate": 0.0001845204, "loss": 0.3672, "step": 38700 }, { "epoch": 0.0776, "grad_norm": 0.26462578773498535, "learning_rate": 0.00018448040000000002, "loss": 0.3642, "step": 38800 }, { "epoch": 0.0778, "grad_norm": 0.24063749611377716, "learning_rate": 0.0001844404, "loss": 0.3682, "step": 38900 }, { "epoch": 0.078, "grad_norm": 0.21736368536949158, "learning_rate": 0.0001844004, "loss": 0.3813, "step": 39000 }, { "epoch": 0.0782, "grad_norm": 0.2315133512020111, "learning_rate": 0.0001843604, "loss": 0.3735, "step": 39100 }, { "epoch": 0.0784, "grad_norm": 0.23136012256145477, "learning_rate": 0.00018432040000000003, "loss": 0.3603, "step": 39200 }, { "epoch": 0.0786, "grad_norm": 0.2924899756908417, "learning_rate": 0.00018428040000000002, "loss": 0.3632, "step": 39300 }, { "epoch": 0.0788, "grad_norm": 0.307910293340683, "learning_rate": 0.0001842404, "loss": 0.3671, "step": 39400 }, { "epoch": 0.079, "grad_norm": 0.22759227454662323, "learning_rate": 0.0001842004, "loss": 0.3707, "step": 39500 }, { "epoch": 0.0792, "grad_norm": 0.21660542488098145, "learning_rate": 0.0001841604, "loss": 0.3728, "step": 39600 }, { "epoch": 0.0794, "grad_norm": 0.2659781575202942, "learning_rate": 0.00018412040000000002, "loss": 0.3729, "step": 39700 }, { "epoch": 0.0796, "grad_norm": 0.25910884141921997, "learning_rate": 0.00018408040000000001, "loss": 0.3681, "step": 39800 }, { "epoch": 0.0798, "grad_norm": 0.2730049788951874, "learning_rate": 0.0001840404, "loss": 0.3716, "step": 39900 }, { "epoch": 0.08, "grad_norm": 0.23555901646614075, 
"learning_rate": 0.0001840004, "loss": 0.3683, "step": 40000 }, { "epoch": 0.0802, "grad_norm": 0.22227054834365845, "learning_rate": 0.00018396040000000002, "loss": 0.371, "step": 40100 }, { "epoch": 0.0804, "grad_norm": 0.21306991577148438, "learning_rate": 0.00018392040000000002, "loss": 0.3666, "step": 40200 }, { "epoch": 0.0806, "grad_norm": 0.37939247488975525, "learning_rate": 0.0001838804, "loss": 0.3782, "step": 40300 }, { "epoch": 0.0808, "grad_norm": 0.26984167098999023, "learning_rate": 0.0001838404, "loss": 0.3717, "step": 40400 }, { "epoch": 0.081, "grad_norm": 0.22309809923171997, "learning_rate": 0.0001838004, "loss": 0.3761, "step": 40500 }, { "epoch": 0.0812, "grad_norm": 0.22144988179206848, "learning_rate": 0.00018376040000000002, "loss": 0.3609, "step": 40600 }, { "epoch": 0.0814, "grad_norm": 0.23661202192306519, "learning_rate": 0.0001837204, "loss": 0.364, "step": 40700 }, { "epoch": 0.0816, "grad_norm": 0.23225858807563782, "learning_rate": 0.00018368040000000003, "loss": 0.3608, "step": 40800 }, { "epoch": 0.0818, "grad_norm": 0.21633361279964447, "learning_rate": 0.0001836404, "loss": 0.3677, "step": 40900 }, { "epoch": 0.082, "grad_norm": 0.20861107110977173, "learning_rate": 0.00018360040000000002, "loss": 0.3586, "step": 41000 }, { "epoch": 0.0822, "grad_norm": 0.22681254148483276, "learning_rate": 0.0001835604, "loss": 0.3503, "step": 41100 }, { "epoch": 0.0824, "grad_norm": 0.29530957341194153, "learning_rate": 0.0001835204, "loss": 0.3546, "step": 41200 }, { "epoch": 0.0826, "grad_norm": 0.22965680062770844, "learning_rate": 0.00018348040000000003, "loss": 0.3581, "step": 41300 }, { "epoch": 0.0828, "grad_norm": 0.24958209693431854, "learning_rate": 0.0001834404, "loss": 0.3601, "step": 41400 }, { "epoch": 0.083, "grad_norm": 0.34926220774650574, "learning_rate": 0.0001834004, "loss": 0.3546, "step": 41500 }, { "epoch": 0.0832, "grad_norm": 0.23350180685520172, "learning_rate": 0.0001833604, "loss": 0.3591, "step": 41600 }, { 
"epoch": 0.0834, "grad_norm": 0.28490149974823, "learning_rate": 0.00018332040000000003, "loss": 0.3595, "step": 41700 }, { "epoch": 0.0836, "grad_norm": 0.37000980973243713, "learning_rate": 0.00018328040000000002, "loss": 0.357, "step": 41800 }, { "epoch": 0.0838, "grad_norm": 0.25736093521118164, "learning_rate": 0.0001832404, "loss": 0.3584, "step": 41900 }, { "epoch": 0.084, "grad_norm": 0.3347998559474945, "learning_rate": 0.0001832004, "loss": 0.3668, "step": 42000 }, { "epoch": 0.0842, "grad_norm": 0.259397029876709, "learning_rate": 0.0001831604, "loss": 0.358, "step": 42100 }, { "epoch": 0.0844, "grad_norm": 0.26411810517311096, "learning_rate": 0.00018312040000000002, "loss": 0.3678, "step": 42200 }, { "epoch": 0.0846, "grad_norm": 0.24818368256092072, "learning_rate": 0.00018308040000000002, "loss": 0.3563, "step": 42300 }, { "epoch": 0.0848, "grad_norm": 0.27950188517570496, "learning_rate": 0.0001830404, "loss": 0.3561, "step": 42400 }, { "epoch": 0.085, "grad_norm": 0.21346434950828552, "learning_rate": 0.0001830004, "loss": 0.3608, "step": 42500 }, { "epoch": 0.0852, "grad_norm": 0.20952267944812775, "learning_rate": 0.0001829604, "loss": 0.3565, "step": 42600 }, { "epoch": 0.0854, "grad_norm": 0.3930506706237793, "learning_rate": 0.00018292040000000002, "loss": 0.3591, "step": 42700 }, { "epoch": 0.0856, "grad_norm": 0.19581203162670135, "learning_rate": 0.0001828804, "loss": 0.3575, "step": 42800 }, { "epoch": 0.0858, "grad_norm": 0.20917409658432007, "learning_rate": 0.0001828404, "loss": 0.3609, "step": 42900 }, { "epoch": 0.086, "grad_norm": 0.47267964482307434, "learning_rate": 0.0001828004, "loss": 0.3629, "step": 43000 }, { "epoch": 0.0862, "grad_norm": 0.3048167824745178, "learning_rate": 0.00018276040000000002, "loss": 0.3535, "step": 43100 }, { "epoch": 0.0864, "grad_norm": 0.23811852931976318, "learning_rate": 0.0001827204, "loss": 0.362, "step": 43200 }, { "epoch": 0.0866, "grad_norm": 0.21190153062343597, "learning_rate": 0.0001826804, 
"loss": 0.3856, "step": 43300 }, { "epoch": 0.0868, "grad_norm": 0.2275034487247467, "learning_rate": 0.0001826404, "loss": 0.3586, "step": 43400 }, { "epoch": 0.087, "grad_norm": 0.19929227232933044, "learning_rate": 0.0001826004, "loss": 0.3518, "step": 43500 }, { "epoch": 0.0872, "grad_norm": 0.25546959042549133, "learning_rate": 0.00018256040000000001, "loss": 0.3543, "step": 43600 }, { "epoch": 0.0874, "grad_norm": 0.20379209518432617, "learning_rate": 0.0001825204, "loss": 0.3599, "step": 43700 }, { "epoch": 0.0876, "grad_norm": 0.27397245168685913, "learning_rate": 0.00018248040000000003, "loss": 0.3573, "step": 43800 }, { "epoch": 0.0878, "grad_norm": 0.23391596972942352, "learning_rate": 0.0001824404, "loss": 0.3538, "step": 43900 }, { "epoch": 0.088, "grad_norm": 0.23923568427562714, "learning_rate": 0.00018240040000000002, "loss": 0.3529, "step": 44000 }, { "epoch": 0.0882, "grad_norm": 0.2521684467792511, "learning_rate": 0.0001823604, "loss": 0.3691, "step": 44100 }, { "epoch": 0.0884, "grad_norm": 0.2728547155857086, "learning_rate": 0.0001823204, "loss": 0.3596, "step": 44200 }, { "epoch": 0.0886, "grad_norm": 0.28586891293525696, "learning_rate": 0.00018228040000000002, "loss": 0.3634, "step": 44300 }, { "epoch": 0.0888, "grad_norm": 0.2821336090564728, "learning_rate": 0.0001822404, "loss": 0.3554, "step": 44400 }, { "epoch": 0.089, "grad_norm": 0.38594385981559753, "learning_rate": 0.0001822004, "loss": 0.3559, "step": 44500 }, { "epoch": 0.0892, "grad_norm": 0.235568568110466, "learning_rate": 0.0001821604, "loss": 0.3514, "step": 44600 }, { "epoch": 0.0894, "grad_norm": 0.2375953644514084, "learning_rate": 0.00018212040000000003, "loss": 0.3576, "step": 44700 }, { "epoch": 0.0896, "grad_norm": 0.20530065894126892, "learning_rate": 0.00018208040000000002, "loss": 0.3574, "step": 44800 }, { "epoch": 0.0898, "grad_norm": 0.1840105950832367, "learning_rate": 0.0001820404, "loss": 0.351, "step": 44900 }, { "epoch": 0.09, "grad_norm": 
0.25125375390052795, "learning_rate": 0.0001820004, "loss": 0.3533, "step": 45000 }, { "epoch": 0.0902, "grad_norm": 0.2656700313091278, "learning_rate": 0.0001819604, "loss": 0.3604, "step": 45100 }, { "epoch": 0.0904, "grad_norm": 0.27208247780799866, "learning_rate": 0.00018192040000000002, "loss": 0.3552, "step": 45200 }, { "epoch": 0.0906, "grad_norm": 0.2230006903409958, "learning_rate": 0.00018188040000000001, "loss": 0.3515, "step": 45300 }, { "epoch": 0.0908, "grad_norm": 0.28219157457351685, "learning_rate": 0.0001818404, "loss": 0.357, "step": 45400 }, { "epoch": 0.091, "grad_norm": 0.21550622582435608, "learning_rate": 0.0001818004, "loss": 0.3512, "step": 45500 }, { "epoch": 0.0912, "grad_norm": 0.23631860315799713, "learning_rate": 0.0001817604, "loss": 0.3471, "step": 45600 }, { "epoch": 0.0914, "grad_norm": 0.1854236125946045, "learning_rate": 0.00018172040000000002, "loss": 0.3509, "step": 45700 }, { "epoch": 0.0916, "grad_norm": 0.2366824895143509, "learning_rate": 0.0001816804, "loss": 0.3583, "step": 45800 }, { "epoch": 0.0918, "grad_norm": 0.37892627716064453, "learning_rate": 0.0001816404, "loss": 0.3479, "step": 45900 }, { "epoch": 0.092, "grad_norm": 0.2608516216278076, "learning_rate": 0.0001816004, "loss": 0.3478, "step": 46000 }, { "epoch": 0.0922, "grad_norm": 0.23194310069084167, "learning_rate": 0.00018156040000000002, "loss": 0.3654, "step": 46100 }, { "epoch": 0.0924, "grad_norm": 0.2503475248813629, "learning_rate": 0.0001815204, "loss": 0.3508, "step": 46200 }, { "epoch": 0.0926, "grad_norm": 0.25393983721733093, "learning_rate": 0.0001814804, "loss": 0.3512, "step": 46300 }, { "epoch": 0.0928, "grad_norm": 0.19703878462314606, "learning_rate": 0.0001814404, "loss": 0.3468, "step": 46400 }, { "epoch": 0.093, "grad_norm": 0.18077781796455383, "learning_rate": 0.0001814004, "loss": 0.3529, "step": 46500 }, { "epoch": 0.0932, "grad_norm": 0.21559618413448334, "learning_rate": 0.0001813604, "loss": 0.3454, "step": 46600 }, { "epoch": 
0.0934, "grad_norm": 0.223658949136734, "learning_rate": 0.0001813204, "loss": 0.3545, "step": 46700 }, { "epoch": 0.0936, "grad_norm": 0.2453765571117401, "learning_rate": 0.00018128040000000003, "loss": 0.3493, "step": 46800 }, { "epoch": 0.0938, "grad_norm": 0.2784949541091919, "learning_rate": 0.0001812404, "loss": 0.3432, "step": 46900 }, { "epoch": 0.094, "grad_norm": 0.21154820919036865, "learning_rate": 0.00018120040000000001, "loss": 0.347, "step": 47000 }, { "epoch": 0.0942, "grad_norm": 0.2772074341773987, "learning_rate": 0.0001811604, "loss": 0.3495, "step": 47100 }, { "epoch": 0.0944, "grad_norm": 0.20805895328521729, "learning_rate": 0.0001811204, "loss": 0.361, "step": 47200 }, { "epoch": 0.0946, "grad_norm": 0.1839285045862198, "learning_rate": 0.00018108040000000002, "loss": 0.3557, "step": 47300 }, { "epoch": 0.0948, "grad_norm": 0.23907071352005005, "learning_rate": 0.0001810404, "loss": 0.3416, "step": 47400 }, { "epoch": 0.095, "grad_norm": 0.3368641138076782, "learning_rate": 0.0001810004, "loss": 0.3483, "step": 47500 }, { "epoch": 0.0952, "grad_norm": 0.24331678450107574, "learning_rate": 0.0001809604, "loss": 0.3494, "step": 47600 }, { "epoch": 0.0954, "grad_norm": 0.20715878903865814, "learning_rate": 0.00018092040000000002, "loss": 0.3457, "step": 47700 }, { "epoch": 0.0956, "grad_norm": 0.1883363276720047, "learning_rate": 0.00018088040000000002, "loss": 0.3556, "step": 47800 }, { "epoch": 0.0958, "grad_norm": 0.24465380609035492, "learning_rate": 0.0001808404, "loss": 0.3447, "step": 47900 }, { "epoch": 0.096, "grad_norm": 0.24823075532913208, "learning_rate": 0.0001808004, "loss": 0.3431, "step": 48000 }, { "epoch": 0.0962, "grad_norm": 0.3323017358779907, "learning_rate": 0.0001807604, "loss": 0.3515, "step": 48100 }, { "epoch": 0.0964, "grad_norm": 0.2804560363292694, "learning_rate": 0.00018072040000000002, "loss": 0.351, "step": 48200 }, { "epoch": 0.0966, "grad_norm": 0.26708027720451355, "learning_rate": 0.0001806804, "loss": 
0.338, "step": 48300 }, { "epoch": 0.0968, "grad_norm": 0.20081064105033875, "learning_rate": 0.0001806404, "loss": 0.3424, "step": 48400 }, { "epoch": 0.097, "grad_norm": 0.19008444249629974, "learning_rate": 0.0001806004, "loss": 0.3449, "step": 48500 }, { "epoch": 0.0972, "grad_norm": 0.2081468105316162, "learning_rate": 0.00018056040000000002, "loss": 0.3419, "step": 48600 }, { "epoch": 0.0974, "grad_norm": 0.41435888409614563, "learning_rate": 0.00018052040000000001, "loss": 0.345, "step": 48700 }, { "epoch": 0.0976, "grad_norm": 0.30877628922462463, "learning_rate": 0.0001804804, "loss": 0.3457, "step": 48800 }, { "epoch": 0.0978, "grad_norm": 0.25084757804870605, "learning_rate": 0.0001804404, "loss": 0.3436, "step": 48900 }, { "epoch": 0.098, "grad_norm": 0.26010265946388245, "learning_rate": 0.0001804004, "loss": 0.3522, "step": 49000 }, { "epoch": 0.0982, "grad_norm": 0.45441487431526184, "learning_rate": 0.00018036040000000002, "loss": 0.3366, "step": 49100 }, { "epoch": 0.0984, "grad_norm": 0.2062908411026001, "learning_rate": 0.0001803204, "loss": 0.3415, "step": 49200 }, { "epoch": 0.0986, "grad_norm": 0.24297113716602325, "learning_rate": 0.0001802804, "loss": 0.3403, "step": 49300 }, { "epoch": 0.0988, "grad_norm": 0.2683410942554474, "learning_rate": 0.0001802404, "loss": 0.3234, "step": 49400 }, { "epoch": 0.099, "grad_norm": 0.2585706114768982, "learning_rate": 0.0001802004, "loss": 0.3402, "step": 49500 }, { "epoch": 0.0992, "grad_norm": 0.2423756718635559, "learning_rate": 0.0001801604, "loss": 0.3394, "step": 49600 }, { "epoch": 0.0994, "grad_norm": 0.20874212682247162, "learning_rate": 0.0001801204, "loss": 0.3469, "step": 49700 }, { "epoch": 0.0996, "grad_norm": 0.26174086332321167, "learning_rate": 0.00018008040000000003, "loss": 0.3424, "step": 49800 }, { "epoch": 0.0998, "grad_norm": 0.258007675409317, "learning_rate": 0.0001800404, "loss": 0.333, "step": 49900 }, { "epoch": 0.1, "grad_norm": 0.26558035612106323, "learning_rate": 
0.0001800004, "loss": 0.3371, "step": 50000 }, { "epoch": 0.1002, "grad_norm": 0.2108493149280548, "learning_rate": 0.0001799604, "loss": 0.3439, "step": 50100 }, { "epoch": 0.1004, "grad_norm": 0.2356238216161728, "learning_rate": 0.0001799204, "loss": 0.3479, "step": 50200 }, { "epoch": 0.1006, "grad_norm": 0.22705116868019104, "learning_rate": 0.00017988040000000002, "loss": 0.3384, "step": 50300 }, { "epoch": 0.1008, "grad_norm": 0.22554484009742737, "learning_rate": 0.0001798404, "loss": 0.3398, "step": 50400 }, { "epoch": 0.101, "grad_norm": 0.23426580429077148, "learning_rate": 0.0001798004, "loss": 0.3463, "step": 50500 }, { "epoch": 0.1012, "grad_norm": 0.22517123818397522, "learning_rate": 0.0001797604, "loss": 0.3386, "step": 50600 }, { "epoch": 0.1014, "grad_norm": 0.24184340238571167, "learning_rate": 0.00017972040000000002, "loss": 0.3437, "step": 50700 }, { "epoch": 0.1016, "grad_norm": 0.3470735251903534, "learning_rate": 0.00017968040000000002, "loss": 0.3407, "step": 50800 }, { "epoch": 0.1018, "grad_norm": 0.35720571875572205, "learning_rate": 0.0001796404, "loss": 0.3379, "step": 50900 }, { "epoch": 0.102, "grad_norm": 0.25200462341308594, "learning_rate": 0.0001796004, "loss": 0.3454, "step": 51000 }, { "epoch": 0.1022, "grad_norm": 0.19085712730884552, "learning_rate": 0.0001795604, "loss": 0.3406, "step": 51100 }, { "epoch": 0.1024, "grad_norm": 0.21733929216861725, "learning_rate": 0.00017952040000000002, "loss": 0.3361, "step": 51200 }, { "epoch": 0.1026, "grad_norm": 0.2199040800333023, "learning_rate": 0.0001794804, "loss": 0.3381, "step": 51300 }, { "epoch": 0.1028, "grad_norm": 0.21806757152080536, "learning_rate": 0.00017944040000000003, "loss": 0.3391, "step": 51400 }, { "epoch": 0.103, "grad_norm": 0.27425751090049744, "learning_rate": 0.0001794004, "loss": 0.3336, "step": 51500 }, { "epoch": 0.1032, "grad_norm": 0.24505355954170227, "learning_rate": 0.00017936040000000002, "loss": 0.3384, "step": 51600 }, { "epoch": 0.1034, 
"grad_norm": 0.19927924871444702, "learning_rate": 0.0001793204, "loss": 0.3559, "step": 51700 }, { "epoch": 0.1036, "grad_norm": 0.31233856081962585, "learning_rate": 0.0001792804, "loss": 0.3369, "step": 51800 }, { "epoch": 0.1038, "grad_norm": 0.30651381611824036, "learning_rate": 0.00017924040000000003, "loss": 0.3579, "step": 51900 }, { "epoch": 0.104, "grad_norm": 0.18664880096912384, "learning_rate": 0.0001792004, "loss": 0.343, "step": 52000 }, { "epoch": 0.1042, "grad_norm": 0.48104581236839294, "learning_rate": 0.0001791604, "loss": 0.3705, "step": 52100 }, { "epoch": 0.1044, "grad_norm": 0.24804629385471344, "learning_rate": 0.0001791204, "loss": 0.3563, "step": 52200 }, { "epoch": 0.1046, "grad_norm": 0.2167651504278183, "learning_rate": 0.00017908040000000003, "loss": 0.3396, "step": 52300 }, { "epoch": 0.1048, "grad_norm": 0.5007449984550476, "learning_rate": 0.00017904040000000002, "loss": 0.3436, "step": 52400 }, { "epoch": 0.105, "grad_norm": 0.2310677170753479, "learning_rate": 0.0001790004, "loss": 0.3355, "step": 52500 }, { "epoch": 0.1052, "grad_norm": 0.26683372259140015, "learning_rate": 0.0001789604, "loss": 0.3472, "step": 52600 }, { "epoch": 0.1054, "grad_norm": 0.22053886950016022, "learning_rate": 0.0001789204, "loss": 0.3393, "step": 52700 }, { "epoch": 0.1056, "grad_norm": 0.24362322688102722, "learning_rate": 0.00017888040000000002, "loss": 0.3478, "step": 52800 }, { "epoch": 0.1058, "grad_norm": 0.28843867778778076, "learning_rate": 0.00017884040000000002, "loss": 0.3347, "step": 52900 }, { "epoch": 0.106, "grad_norm": 0.24268370866775513, "learning_rate": 0.0001788004, "loss": 0.3479, "step": 53000 }, { "epoch": 0.1062, "grad_norm": 0.25353243947029114, "learning_rate": 0.0001787604, "loss": 0.3503, "step": 53100 }, { "epoch": 0.1064, "grad_norm": 0.34019261598587036, "learning_rate": 0.0001787204, "loss": 0.3564, "step": 53200 }, { "epoch": 0.1066, "grad_norm": 0.28113237023353577, "learning_rate": 0.00017868040000000002, "loss": 
0.3409, "step": 53300 }, { "epoch": 0.1068, "grad_norm": 0.33453458547592163, "learning_rate": 0.0001786404, "loss": 0.3393, "step": 53400 }, { "epoch": 0.107, "grad_norm": 0.25770464539527893, "learning_rate": 0.0001786004, "loss": 0.3448, "step": 53500 }, { "epoch": 0.1072, "grad_norm": 0.2797645628452301, "learning_rate": 0.0001785604, "loss": 0.3463, "step": 53600 }, { "epoch": 0.1074, "grad_norm": 0.26500073075294495, "learning_rate": 0.00017852040000000002, "loss": 0.3391, "step": 53700 }, { "epoch": 0.1076, "grad_norm": 0.23151659965515137, "learning_rate": 0.0001784804, "loss": 0.3369, "step": 53800 }, { "epoch": 0.1078, "grad_norm": 0.27178940176963806, "learning_rate": 0.0001784404, "loss": 0.3364, "step": 53900 }, { "epoch": 0.108, "grad_norm": 0.2767798900604248, "learning_rate": 0.0001784004, "loss": 0.3379, "step": 54000 }, { "epoch": 0.1082, "grad_norm": 0.21509261429309845, "learning_rate": 0.0001783604, "loss": 0.3382, "step": 54100 }, { "epoch": 0.1084, "grad_norm": 0.24864695966243744, "learning_rate": 0.00017832040000000001, "loss": 0.3369, "step": 54200 }, { "epoch": 0.1086, "grad_norm": 0.2107497751712799, "learning_rate": 0.0001782804, "loss": 0.333, "step": 54300 }, { "epoch": 0.1088, "grad_norm": 0.20573535561561584, "learning_rate": 0.00017824040000000003, "loss": 0.3397, "step": 54400 }, { "epoch": 0.109, "grad_norm": 0.25302717089653015, "learning_rate": 0.0001782004, "loss": 0.3384, "step": 54500 }, { "epoch": 0.1092, "grad_norm": 0.22738224267959595, "learning_rate": 0.00017816040000000002, "loss": 0.3398, "step": 54600 }, { "epoch": 0.1094, "grad_norm": 0.28848886489868164, "learning_rate": 0.0001781204, "loss": 0.3361, "step": 54700 }, { "epoch": 0.1096, "grad_norm": 0.23477931320667267, "learning_rate": 0.0001780804, "loss": 0.3414, "step": 54800 }, { "epoch": 0.1098, "grad_norm": 0.24742473661899567, "learning_rate": 0.00017804040000000002, "loss": 0.3359, "step": 54900 }, { "epoch": 0.11, "grad_norm": 0.24106016755104065, 
"learning_rate": 0.0001780004, "loss": 0.3428, "step": 55000 }, { "epoch": 0.1102, "grad_norm": 0.22427873313426971, "learning_rate": 0.0001779604, "loss": 0.346, "step": 55100 }, { "epoch": 0.1104, "grad_norm": 0.31361493468284607, "learning_rate": 0.0001779204, "loss": 0.3454, "step": 55200 }, { "epoch": 0.1106, "grad_norm": 0.2146797627210617, "learning_rate": 0.00017788040000000003, "loss": 0.3368, "step": 55300 }, { "epoch": 0.1108, "grad_norm": 0.2894279658794403, "learning_rate": 0.00017784040000000002, "loss": 0.339, "step": 55400 }, { "epoch": 0.111, "grad_norm": 0.3044085204601288, "learning_rate": 0.0001778004, "loss": 0.3411, "step": 55500 }, { "epoch": 0.1112, "grad_norm": 0.18939179182052612, "learning_rate": 0.0001777604, "loss": 0.3471, "step": 55600 }, { "epoch": 0.1114, "grad_norm": 0.24099594354629517, "learning_rate": 0.0001777204, "loss": 0.3296, "step": 55700 }, { "epoch": 0.1116, "grad_norm": 0.2313549667596817, "learning_rate": 0.00017768040000000002, "loss": 0.3384, "step": 55800 }, { "epoch": 0.1118, "grad_norm": 0.20545341074466705, "learning_rate": 0.00017764040000000001, "loss": 0.331, "step": 55900 }, { "epoch": 0.112, "grad_norm": 0.22012929618358612, "learning_rate": 0.0001776004, "loss": 0.3383, "step": 56000 }, { "epoch": 0.1122, "grad_norm": 0.2095547914505005, "learning_rate": 0.0001775604, "loss": 0.3355, "step": 56100 }, { "epoch": 0.1124, "grad_norm": 0.2438439428806305, "learning_rate": 0.00017752040000000002, "loss": 0.3305, "step": 56200 }, { "epoch": 0.1126, "grad_norm": 0.5170906186103821, "learning_rate": 0.00017748040000000002, "loss": 0.3385, "step": 56300 }, { "epoch": 0.1128, "grad_norm": 0.1860545575618744, "learning_rate": 0.0001774404, "loss": 0.3336, "step": 56400 }, { "epoch": 0.113, "grad_norm": 0.24801748991012573, "learning_rate": 0.0001774004, "loss": 0.3286, "step": 56500 }, { "epoch": 0.1132, "grad_norm": 0.25407829880714417, "learning_rate": 0.0001773604, "loss": 0.3402, "step": 56600 }, { "epoch": 
0.1134, "grad_norm": 0.20441265404224396, "learning_rate": 0.00017732040000000002, "loss": 0.3336, "step": 56700 }, { "epoch": 0.1136, "grad_norm": 0.2255195528268814, "learning_rate": 0.0001772804, "loss": 0.3283, "step": 56800 }, { "epoch": 0.1138, "grad_norm": 0.3453531265258789, "learning_rate": 0.0001772404, "loss": 0.3325, "step": 56900 }, { "epoch": 0.114, "grad_norm": 0.1892947405576706, "learning_rate": 0.0001772004, "loss": 0.3298, "step": 57000 }, { "epoch": 0.1142, "grad_norm": 0.3136577308177948, "learning_rate": 0.0001771604, "loss": 0.3324, "step": 57100 }, { "epoch": 0.1144, "grad_norm": 0.2848574221134186, "learning_rate": 0.0001771204, "loss": 0.3266, "step": 57200 }, { "epoch": 0.1146, "grad_norm": 0.3240659832954407, "learning_rate": 0.0001770804, "loss": 0.3413, "step": 57300 }, { "epoch": 0.1148, "grad_norm": 0.2988292872905731, "learning_rate": 0.00017704040000000003, "loss": 0.3335, "step": 57400 }, { "epoch": 0.115, "grad_norm": 0.31577932834625244, "learning_rate": 0.0001770004, "loss": 0.3294, "step": 57500 }, { "epoch": 0.1152, "grad_norm": 0.32162484526634216, "learning_rate": 0.00017696040000000001, "loss": 0.3521, "step": 57600 }, { "epoch": 0.1154, "grad_norm": 0.661695122718811, "learning_rate": 0.0001769204, "loss": 0.3324, "step": 57700 }, { "epoch": 0.1156, "grad_norm": 0.25184112787246704, "learning_rate": 0.0001768804, "loss": 0.3438, "step": 57800 }, { "epoch": 0.1158, "grad_norm": 0.24458423256874084, "learning_rate": 0.00017684040000000002, "loss": 0.3297, "step": 57900 }, { "epoch": 0.116, "grad_norm": 0.23055191338062286, "learning_rate": 0.0001768004, "loss": 0.328, "step": 58000 }, { "epoch": 0.1162, "grad_norm": 0.22799059748649597, "learning_rate": 0.0001767604, "loss": 0.3305, "step": 58100 }, { "epoch": 0.1164, "grad_norm": 0.17628814280033112, "learning_rate": 0.0001767204, "loss": 0.3351, "step": 58200 }, { "epoch": 0.1166, "grad_norm": 0.29074835777282715, "learning_rate": 0.00017668040000000002, "loss": 0.3267, 
"step": 58300 }, { "epoch": 0.1168, "grad_norm": 0.26170602440834045, "learning_rate": 0.00017664040000000002, "loss": 0.332, "step": 58400 }, { "epoch": 0.117, "grad_norm": 0.18410225212574005, "learning_rate": 0.0001766004, "loss": 0.3291, "step": 58500 }, { "epoch": 0.1172, "grad_norm": 0.3183979392051697, "learning_rate": 0.0001765604, "loss": 0.3311, "step": 58600 }, { "epoch": 0.1174, "grad_norm": 0.21326270699501038, "learning_rate": 0.0001765204, "loss": 0.3279, "step": 58700 }, { "epoch": 0.1176, "grad_norm": 0.2310231775045395, "learning_rate": 0.00017648040000000002, "loss": 0.3267, "step": 58800 }, { "epoch": 0.1178, "grad_norm": 0.30372264981269836, "learning_rate": 0.0001764404, "loss": 0.3411, "step": 58900 }, { "epoch": 0.118, "grad_norm": 0.22354651987552643, "learning_rate": 0.0001764004, "loss": 0.3323, "step": 59000 }, { "epoch": 0.1182, "grad_norm": 0.22137747704982758, "learning_rate": 0.0001763604, "loss": 0.3261, "step": 59100 }, { "epoch": 0.1184, "grad_norm": 0.20340581238269806, "learning_rate": 0.00017632040000000002, "loss": 0.3291, "step": 59200 }, { "epoch": 0.1186, "grad_norm": 0.21525894105434418, "learning_rate": 0.00017628040000000001, "loss": 0.324, "step": 59300 }, { "epoch": 0.1188, "grad_norm": 0.21832208335399628, "learning_rate": 0.0001762404, "loss": 0.3319, "step": 59400 }, { "epoch": 0.119, "grad_norm": 0.2369861602783203, "learning_rate": 0.0001762004, "loss": 0.3298, "step": 59500 }, { "epoch": 0.1192, "grad_norm": 0.4107494652271271, "learning_rate": 0.0001761604, "loss": 0.3306, "step": 59600 }, { "epoch": 0.1194, "grad_norm": 0.23435452580451965, "learning_rate": 0.00017612040000000002, "loss": 0.3239, "step": 59700 }, { "epoch": 0.1196, "grad_norm": 0.20295970141887665, "learning_rate": 0.0001760804, "loss": 0.335, "step": 59800 }, { "epoch": 0.1198, "grad_norm": 0.29473334550857544, "learning_rate": 0.00017604040000000003, "loss": 0.332, "step": 59900 }, { "epoch": 0.12, "grad_norm": 0.22703693807125092, 
"learning_rate": 0.0001760004, "loss": 0.3291, "step": 60000 }, { "epoch": 0.1202, "grad_norm": 0.2531866729259491, "learning_rate": 0.0001759604, "loss": 0.3326, "step": 60100 }, { "epoch": 0.1204, "grad_norm": 0.20017871260643005, "learning_rate": 0.0001759204, "loss": 0.3304, "step": 60200 }, { "epoch": 0.1206, "grad_norm": 0.22206462919712067, "learning_rate": 0.0001758804, "loss": 0.3442, "step": 60300 }, { "epoch": 0.1208, "grad_norm": 0.24844832718372345, "learning_rate": 0.00017584040000000002, "loss": 0.3301, "step": 60400 }, { "epoch": 0.121, "grad_norm": 0.19882357120513916, "learning_rate": 0.0001758004, "loss": 0.3296, "step": 60500 }, { "epoch": 0.1212, "grad_norm": 0.2944953441619873, "learning_rate": 0.0001757604, "loss": 0.3283, "step": 60600 }, { "epoch": 0.1214, "grad_norm": 0.21407614648342133, "learning_rate": 0.0001757204, "loss": 0.328, "step": 60700 }, { "epoch": 0.1216, "grad_norm": 0.19046856462955475, "learning_rate": 0.0001756804, "loss": 0.3292, "step": 60800 }, { "epoch": 0.1218, "grad_norm": 0.22615882754325867, "learning_rate": 0.00017564040000000002, "loss": 0.3316, "step": 60900 }, { "epoch": 0.122, "grad_norm": 0.1709730625152588, "learning_rate": 0.0001756004, "loss": 0.3318, "step": 61000 }, { "epoch": 0.1222, "grad_norm": 0.1718514859676361, "learning_rate": 0.0001755604, "loss": 0.3201, "step": 61100 }, { "epoch": 0.1224, "grad_norm": 0.23949329555034637, "learning_rate": 0.0001755204, "loss": 0.3329, "step": 61200 }, { "epoch": 0.1226, "grad_norm": 0.2090582698583603, "learning_rate": 0.00017548040000000002, "loss": 0.3354, "step": 61300 }, { "epoch": 0.1228, "grad_norm": 0.2083326280117035, "learning_rate": 0.00017544040000000002, "loss": 0.3293, "step": 61400 }, { "epoch": 0.123, "grad_norm": 0.21610133349895477, "learning_rate": 0.0001754004, "loss": 0.3252, "step": 61500 }, { "epoch": 0.1232, "grad_norm": 0.2022552639245987, "learning_rate": 0.0001753604, "loss": 0.3322, "step": 61600 }, { "epoch": 0.1234, "grad_norm": 
0.24811524152755737, "learning_rate": 0.0001753204, "loss": 0.3405, "step": 61700 }, { "epoch": 0.1236, "grad_norm": 0.25753867626190186, "learning_rate": 0.00017528040000000002, "loss": 0.3359, "step": 61800 }, { "epoch": 0.1238, "grad_norm": 0.32304272055625916, "learning_rate": 0.0001752404, "loss": 0.3254, "step": 61900 }, { "epoch": 0.124, "grad_norm": 0.18287380039691925, "learning_rate": 0.0001752004, "loss": 0.3298, "step": 62000 }, { "epoch": 0.1242, "grad_norm": 0.19973132014274597, "learning_rate": 0.0001751604, "loss": 0.3363, "step": 62100 }, { "epoch": 0.1244, "grad_norm": 0.23819290101528168, "learning_rate": 0.00017512040000000002, "loss": 0.3238, "step": 62200 }, { "epoch": 0.1246, "grad_norm": 0.27806952595710754, "learning_rate": 0.0001750804, "loss": 0.3286, "step": 62300 }, { "epoch": 0.1248, "grad_norm": 0.17436453700065613, "learning_rate": 0.0001750404, "loss": 0.3266, "step": 62400 }, { "epoch": 0.125, "grad_norm": 0.2046598643064499, "learning_rate": 0.0001750004, "loss": 0.32, "step": 62500 }, { "epoch": 0.1252, "grad_norm": 0.23056712746620178, "learning_rate": 0.0001749604, "loss": 0.325, "step": 62600 }, { "epoch": 0.1254, "grad_norm": 0.32476136088371277, "learning_rate": 0.0001749204, "loss": 0.3301, "step": 62700 }, { "epoch": 0.1256, "grad_norm": 0.24279266595840454, "learning_rate": 0.0001748804, "loss": 0.3253, "step": 62800 }, { "epoch": 0.1258, "grad_norm": 0.1884712129831314, "learning_rate": 0.00017484040000000003, "loss": 0.3309, "step": 62900 }, { "epoch": 0.126, "grad_norm": 0.21700552105903625, "learning_rate": 0.0001748004, "loss": 0.3299, "step": 63000 }, { "epoch": 0.1262, "grad_norm": 0.22113928198814392, "learning_rate": 0.00017476040000000001, "loss": 0.3283, "step": 63100 }, { "epoch": 0.1264, "grad_norm": 0.2137170284986496, "learning_rate": 0.0001747204, "loss": 0.3243, "step": 63200 }, { "epoch": 0.1266, "grad_norm": 0.273173063993454, "learning_rate": 0.0001746804, "loss": 0.3337, "step": 63300 }, { "epoch": 
0.1268, "grad_norm": 0.2576025724411011, "learning_rate": 0.00017464040000000002, "loss": 0.329, "step": 63400 }, { "epoch": 0.127, "grad_norm": 0.3533582389354706, "learning_rate": 0.0001746004, "loss": 0.3322, "step": 63500 }, { "epoch": 0.1272, "grad_norm": 0.373040109872818, "learning_rate": 0.0001745604, "loss": 0.3262, "step": 63600 }, { "epoch": 0.1274, "grad_norm": 0.2242802381515503, "learning_rate": 0.0001745204, "loss": 0.328, "step": 63700 }, { "epoch": 0.1276, "grad_norm": 0.4904221296310425, "learning_rate": 0.0001744804, "loss": 0.3253, "step": 63800 }, { "epoch": 0.1278, "grad_norm": 0.20114104449748993, "learning_rate": 0.00017444040000000002, "loss": 0.3272, "step": 63900 }, { "epoch": 0.128, "grad_norm": 0.18482904136180878, "learning_rate": 0.0001744004, "loss": 0.3254, "step": 64000 }, { "epoch": 0.1282, "grad_norm": 0.19531016051769257, "learning_rate": 0.0001743604, "loss": 0.3206, "step": 64100 }, { "epoch": 0.1284, "grad_norm": 0.2820129990577698, "learning_rate": 0.0001743204, "loss": 0.3324, "step": 64200 }, { "epoch": 0.1286, "grad_norm": 0.23063960671424866, "learning_rate": 0.00017428040000000002, "loss": 0.3262, "step": 64300 }, { "epoch": 0.1288, "grad_norm": 0.2425381988286972, "learning_rate": 0.0001742404, "loss": 0.3221, "step": 64400 }, { "epoch": 0.129, "grad_norm": 0.1685548573732376, "learning_rate": 0.0001742004, "loss": 0.3225, "step": 64500 }, { "epoch": 0.1292, "grad_norm": 0.23022134602069855, "learning_rate": 0.0001741604, "loss": 0.3297, "step": 64600 }, { "epoch": 0.1294, "grad_norm": 0.2650277018547058, "learning_rate": 0.0001741204, "loss": 0.3293, "step": 64700 }, { "epoch": 0.1296, "grad_norm": 0.2417951077222824, "learning_rate": 0.00017408040000000001, "loss": 0.328, "step": 64800 }, { "epoch": 0.1298, "grad_norm": 0.22391964495182037, "learning_rate": 0.0001740404, "loss": 0.3266, "step": 64900 }, { "epoch": 0.13, "grad_norm": 0.2012777179479599, "learning_rate": 0.00017400040000000003, "loss": 0.3278, "step": 
65000 }, { "epoch": 0.1302, "grad_norm": 0.18540987372398376, "learning_rate": 0.0001739604, "loss": 0.337, "step": 65100 }, { "epoch": 0.1304, "grad_norm": 0.19150865077972412, "learning_rate": 0.00017392040000000002, "loss": 0.3319, "step": 65200 }, { "epoch": 0.1306, "grad_norm": 0.24447111785411835, "learning_rate": 0.0001738804, "loss": 0.3312, "step": 65300 }, { "epoch": 0.1308, "grad_norm": 0.20017050206661224, "learning_rate": 0.0001738404, "loss": 0.3375, "step": 65400 }, { "epoch": 0.131, "grad_norm": 0.2571297585964203, "learning_rate": 0.00017380040000000002, "loss": 0.3268, "step": 65500 }, { "epoch": 0.1312, "grad_norm": 0.18792451918125153, "learning_rate": 0.0001737604, "loss": 0.3261, "step": 65600 }, { "epoch": 0.1314, "grad_norm": 0.4205167889595032, "learning_rate": 0.0001737204, "loss": 0.33, "step": 65700 }, { "epoch": 0.1316, "grad_norm": 0.2724183201789856, "learning_rate": 0.0001736804, "loss": 0.3273, "step": 65800 }, { "epoch": 0.1318, "grad_norm": 0.21143561601638794, "learning_rate": 0.00017364040000000003, "loss": 0.3281, "step": 65900 }, { "epoch": 0.132, "grad_norm": 0.1903173327445984, "learning_rate": 0.00017360040000000002, "loss": 0.3185, "step": 66000 }, { "epoch": 0.1322, "grad_norm": 0.3626430928707123, "learning_rate": 0.0001735604, "loss": 0.3411, "step": 66100 }, { "epoch": 0.1324, "grad_norm": 0.1814277172088623, "learning_rate": 0.0001735204, "loss": 0.3301, "step": 66200 }, { "epoch": 0.1326, "grad_norm": 0.22784367203712463, "learning_rate": 0.0001734804, "loss": 0.3244, "step": 66300 }, { "epoch": 0.1328, "grad_norm": 0.2449122965335846, "learning_rate": 0.00017344040000000002, "loss": 0.3281, "step": 66400 }, { "epoch": 0.133, "grad_norm": 0.17180882394313812, "learning_rate": 0.00017340040000000001, "loss": 0.3237, "step": 66500 }, { "epoch": 0.1332, "grad_norm": 0.24725696444511414, "learning_rate": 0.0001733604, "loss": 0.3281, "step": 66600 }, { "epoch": 0.1334, "grad_norm": 0.31062814593315125, "learning_rate": 
0.0001733204, "loss": 0.3312, "step": 66700 }, { "epoch": 0.1336, "grad_norm": 0.20241935551166534, "learning_rate": 0.00017328040000000002, "loss": 0.3214, "step": 66800 }, { "epoch": 0.1338, "grad_norm": 0.319897323846817, "learning_rate": 0.00017324040000000002, "loss": 0.3262, "step": 66900 }, { "epoch": 0.134, "grad_norm": 0.36917874217033386, "learning_rate": 0.0001732004, "loss": 0.3232, "step": 67000 }, { "epoch": 0.1342, "grad_norm": 0.24121050536632538, "learning_rate": 0.0001731604, "loss": 0.3333, "step": 67100 }, { "epoch": 0.1344, "grad_norm": 0.21649840474128723, "learning_rate": 0.0001731204, "loss": 0.3246, "step": 67200 }, { "epoch": 0.1346, "grad_norm": 0.21219566464424133, "learning_rate": 0.00017308040000000002, "loss": 0.3263, "step": 67300 }, { "epoch": 0.1348, "grad_norm": 0.20997853577136993, "learning_rate": 0.0001730404, "loss": 0.321, "step": 67400 }, { "epoch": 0.135, "grad_norm": 0.20444467663764954, "learning_rate": 0.0001730004, "loss": 0.3242, "step": 67500 }, { "epoch": 0.1352, "grad_norm": 0.28162842988967896, "learning_rate": 0.0001729604, "loss": 0.3191, "step": 67600 }, { "epoch": 0.1354, "grad_norm": 0.266450971364975, "learning_rate": 0.0001729204, "loss": 0.3201, "step": 67700 }, { "epoch": 0.1356, "grad_norm": 0.22291947901248932, "learning_rate": 0.0001728804, "loss": 0.3171, "step": 67800 }, { "epoch": 0.1358, "grad_norm": 0.46510088443756104, "learning_rate": 0.0001728404, "loss": 0.3276, "step": 67900 }, { "epoch": 0.136, "grad_norm": 0.2612142562866211, "learning_rate": 0.00017280040000000003, "loss": 0.3313, "step": 68000 }, { "epoch": 0.1362, "grad_norm": 0.19849452376365662, "learning_rate": 0.0001727604, "loss": 0.3155, "step": 68100 }, { "epoch": 0.1364, "grad_norm": 0.32336533069610596, "learning_rate": 0.00017272040000000001, "loss": 0.3225, "step": 68200 }, { "epoch": 0.1366, "grad_norm": 0.26454541087150574, "learning_rate": 0.0001726804, "loss": 0.333, "step": 68300 }, { "epoch": 0.1368, "grad_norm": 
0.2315870225429535, "learning_rate": 0.0001726404, "loss": 0.3297, "step": 68400 }, { "epoch": 0.137, "grad_norm": 0.3298985958099365, "learning_rate": 0.00017260040000000002, "loss": 0.3261, "step": 68500 }, { "epoch": 0.1372, "grad_norm": 0.2369878590106964, "learning_rate": 0.0001725604, "loss": 0.3287, "step": 68600 }, { "epoch": 0.1374, "grad_norm": 0.18581421673297882, "learning_rate": 0.0001725204, "loss": 0.3221, "step": 68700 }, { "epoch": 0.1376, "grad_norm": 0.3875245153903961, "learning_rate": 0.0001724804, "loss": 0.3181, "step": 68800 }, { "epoch": 0.1378, "grad_norm": 0.216262087225914, "learning_rate": 0.00017244040000000002, "loss": 0.3169, "step": 68900 }, { "epoch": 0.138, "grad_norm": 0.21017438173294067, "learning_rate": 0.00017240040000000002, "loss": 0.3184, "step": 69000 }, { "epoch": 0.1382, "grad_norm": 0.20590710639953613, "learning_rate": 0.0001723604, "loss": 0.3287, "step": 69100 }, { "epoch": 0.1384, "grad_norm": 0.2776513695716858, "learning_rate": 0.0001723204, "loss": 0.3195, "step": 69200 }, { "epoch": 0.1386, "grad_norm": 0.21307912468910217, "learning_rate": 0.0001722804, "loss": 0.3277, "step": 69300 }, { "epoch": 0.1388, "grad_norm": 0.2287779301404953, "learning_rate": 0.00017224040000000002, "loss": 0.3199, "step": 69400 }, { "epoch": 0.139, "grad_norm": 0.33188334107398987, "learning_rate": 0.0001722004, "loss": 0.3167, "step": 69500 }, { "epoch": 0.1392, "grad_norm": 0.2860109508037567, "learning_rate": 0.0001721604, "loss": 0.3151, "step": 69600 }, { "epoch": 0.1394, "grad_norm": 0.19062092900276184, "learning_rate": 0.0001721204, "loss": 0.3237, "step": 69700 }, { "epoch": 0.1396, "grad_norm": 0.21721702814102173, "learning_rate": 0.00017208040000000002, "loss": 0.3173, "step": 69800 }, { "epoch": 0.1398, "grad_norm": 0.20423859357833862, "learning_rate": 0.00017204040000000001, "loss": 0.3197, "step": 69900 }, { "epoch": 0.14, "grad_norm": 0.21749666333198547, "learning_rate": 0.0001720004, "loss": 0.3274, "step": 70000 
}, { "epoch": 0.1402, "grad_norm": 0.2770256996154785, "learning_rate": 0.0001719604, "loss": 0.3189, "step": 70100 }, { "epoch": 0.1404, "grad_norm": 0.300563782453537, "learning_rate": 0.0001719204, "loss": 0.3158, "step": 70200 }, { "epoch": 0.1406, "grad_norm": 0.21529285609722137, "learning_rate": 0.00017188040000000002, "loss": 0.3234, "step": 70300 }, { "epoch": 0.1408, "grad_norm": 0.30113551020622253, "learning_rate": 0.0001718404, "loss": 0.3188, "step": 70400 }, { "epoch": 0.141, "grad_norm": 0.22355583310127258, "learning_rate": 0.00017180040000000003, "loss": 0.321, "step": 70500 }, { "epoch": 0.1412, "grad_norm": 0.23251134157180786, "learning_rate": 0.0001717604, "loss": 0.3215, "step": 70600 }, { "epoch": 0.1414, "grad_norm": 0.2621996998786926, "learning_rate": 0.00017172040000000002, "loss": 0.328, "step": 70700 }, { "epoch": 0.1416, "grad_norm": 0.24400697648525238, "learning_rate": 0.0001716804, "loss": 0.3182, "step": 70800 }, { "epoch": 0.1418, "grad_norm": 0.182724729180336, "learning_rate": 0.0001716404, "loss": 0.3104, "step": 70900 }, { "epoch": 0.142, "grad_norm": 0.2504395544528961, "learning_rate": 0.00017160040000000002, "loss": 0.3186, "step": 71000 }, { "epoch": 0.1422, "grad_norm": 0.3078869581222534, "learning_rate": 0.0001715604, "loss": 0.3175, "step": 71100 }, { "epoch": 0.1424, "grad_norm": 0.2186155915260315, "learning_rate": 0.0001715204, "loss": 0.32, "step": 71200 }, { "epoch": 0.1426, "grad_norm": 0.18938790261745453, "learning_rate": 0.0001714804, "loss": 0.3138, "step": 71300 }, { "epoch": 0.1428, "grad_norm": 0.21720623970031738, "learning_rate": 0.0001714404, "loss": 0.3192, "step": 71400 }, { "epoch": 0.143, "grad_norm": 0.18728028237819672, "learning_rate": 0.00017140040000000002, "loss": 0.3261, "step": 71500 }, { "epoch": 0.1432, "grad_norm": 0.23028717935085297, "learning_rate": 0.0001713604, "loss": 0.3167, "step": 71600 }, { "epoch": 0.1434, "grad_norm": 0.22258538007736206, "learning_rate": 0.0001713204, 
"loss": 0.3138, "step": 71700 }, { "epoch": 0.1436, "grad_norm": 0.3108178675174713, "learning_rate": 0.0001712804, "loss": 0.3261, "step": 71800 }, { "epoch": 0.1438, "grad_norm": 0.2817421853542328, "learning_rate": 0.00017124040000000002, "loss": 0.3153, "step": 71900 }, { "epoch": 0.144, "grad_norm": 0.2490064799785614, "learning_rate": 0.00017120040000000002, "loss": 0.314, "step": 72000 }, { "epoch": 0.1442, "grad_norm": 0.18532483279705048, "learning_rate": 0.0001711604, "loss": 0.3162, "step": 72100 }, { "epoch": 0.1444, "grad_norm": 0.23727016150951385, "learning_rate": 0.0001711204, "loss": 0.3278, "step": 72200 }, { "epoch": 0.1446, "grad_norm": 0.18576467037200928, "learning_rate": 0.0001710804, "loss": 0.3186, "step": 72300 }, { "epoch": 0.1448, "grad_norm": 0.21316267549991608, "learning_rate": 0.00017104040000000002, "loss": 0.3198, "step": 72400 }, { "epoch": 0.145, "grad_norm": 0.21670052409172058, "learning_rate": 0.0001710004, "loss": 0.3187, "step": 72500 }, { "epoch": 0.1452, "grad_norm": 0.260593056678772, "learning_rate": 0.0001709604, "loss": 0.3119, "step": 72600 }, { "epoch": 0.1454, "grad_norm": 0.24345095455646515, "learning_rate": 0.0001709204, "loss": 0.3081, "step": 72700 }, { "epoch": 0.1456, "grad_norm": 0.21282540261745453, "learning_rate": 0.00017088040000000002, "loss": 0.3222, "step": 72800 }, { "epoch": 0.1458, "grad_norm": 0.22302696108818054, "learning_rate": 0.0001708404, "loss": 0.3133, "step": 72900 }, { "epoch": 0.146, "grad_norm": 0.24244046211242676, "learning_rate": 0.0001708004, "loss": 0.3215, "step": 73000 }, { "epoch": 0.1462, "grad_norm": 0.651138961315155, "learning_rate": 0.0001707604, "loss": 0.3123, "step": 73100 }, { "epoch": 0.1464, "grad_norm": 0.27869105339050293, "learning_rate": 0.0001707204, "loss": 0.3184, "step": 73200 }, { "epoch": 0.1466, "grad_norm": 0.2049490511417389, "learning_rate": 0.0001706804, "loss": 0.3157, "step": 73300 }, { "epoch": 0.1468, "grad_norm": 0.2697393298149109, 
"learning_rate": 0.0001706404, "loss": 0.334, "step": 73400 }, { "epoch": 0.147, "grad_norm": 0.17334036529064178, "learning_rate": 0.00017060040000000003, "loss": 0.3145, "step": 73500 }, { "epoch": 0.1472, "grad_norm": 0.2551487386226654, "learning_rate": 0.0001705604, "loss": 0.3138, "step": 73600 }, { "epoch": 0.1474, "grad_norm": 0.23012271523475647, "learning_rate": 0.00017052040000000001, "loss": 0.3133, "step": 73700 }, { "epoch": 0.1476, "grad_norm": 0.21023279428482056, "learning_rate": 0.0001704804, "loss": 0.3096, "step": 73800 }, { "epoch": 0.1478, "grad_norm": 0.24528542160987854, "learning_rate": 0.0001704404, "loss": 0.313, "step": 73900 }, { "epoch": 0.148, "grad_norm": 0.24039429426193237, "learning_rate": 0.00017040040000000002, "loss": 0.3155, "step": 74000 }, { "epoch": 0.1482, "grad_norm": 0.20780912041664124, "learning_rate": 0.0001703604, "loss": 0.3131, "step": 74100 }, { "epoch": 0.1484, "grad_norm": 0.18914741277694702, "learning_rate": 0.0001703204, "loss": 0.3125, "step": 74200 }, { "epoch": 0.1486, "grad_norm": 0.18693341314792633, "learning_rate": 0.0001702804, "loss": 0.3185, "step": 74300 }, { "epoch": 0.1488, "grad_norm": 0.2793961465358734, "learning_rate": 0.00017024040000000002, "loss": 0.3209, "step": 74400 }, { "epoch": 0.149, "grad_norm": 0.3118375539779663, "learning_rate": 0.00017020040000000002, "loss": 0.3173, "step": 74500 }, { "epoch": 0.1492, "grad_norm": 0.23574410378932953, "learning_rate": 0.00017016039999999998, "loss": 0.3167, "step": 74600 }, { "epoch": 0.1494, "grad_norm": 0.33424112200737, "learning_rate": 0.0001701204, "loss": 0.3196, "step": 74700 }, { "epoch": 0.1496, "grad_norm": 0.24209052324295044, "learning_rate": 0.0001700804, "loss": 0.3201, "step": 74800 }, { "epoch": 0.1498, "grad_norm": 0.21331624686717987, "learning_rate": 0.00017004040000000002, "loss": 0.3163, "step": 74900 }, { "epoch": 0.15, "grad_norm": 0.20186734199523926, "learning_rate": 0.0001700004, "loss": 0.3174, "step": 75000 }, { 
"epoch": 0.1502, "grad_norm": 0.17939358949661255, "learning_rate": 0.0001699604, "loss": 0.3185, "step": 75100 }, { "epoch": 0.1504, "grad_norm": 0.23708830773830414, "learning_rate": 0.0001699204, "loss": 0.3239, "step": 75200 }, { "epoch": 0.1506, "grad_norm": 0.23293326795101166, "learning_rate": 0.0001698804, "loss": 0.3211, "step": 75300 }, { "epoch": 0.1508, "grad_norm": 0.18440894782543182, "learning_rate": 0.00016984040000000001, "loss": 0.3118, "step": 75400 }, { "epoch": 0.151, "grad_norm": 0.2222815454006195, "learning_rate": 0.0001698004, "loss": 0.3108, "step": 75500 }, { "epoch": 0.1512, "grad_norm": 0.24183017015457153, "learning_rate": 0.0001697604, "loss": 0.3145, "step": 75600 }, { "epoch": 0.1514, "grad_norm": 0.2108313888311386, "learning_rate": 0.0001697204, "loss": 0.3128, "step": 75700 }, { "epoch": 0.1516, "grad_norm": 0.25066909193992615, "learning_rate": 0.00016968040000000002, "loss": 0.3223, "step": 75800 }, { "epoch": 0.1518, "grad_norm": 0.2033839225769043, "learning_rate": 0.0001696404, "loss": 0.3253, "step": 75900 }, { "epoch": 0.152, "grad_norm": 0.21795319020748138, "learning_rate": 0.0001696004, "loss": 0.3186, "step": 76000 }, { "epoch": 0.1522, "grad_norm": 0.2567816972732544, "learning_rate": 0.0001695604, "loss": 0.3184, "step": 76100 }, { "epoch": 0.1524, "grad_norm": 0.2667364478111267, "learning_rate": 0.0001695204, "loss": 0.3174, "step": 76200 }, { "epoch": 0.1526, "grad_norm": 0.24945732951164246, "learning_rate": 0.0001694804, "loss": 0.3166, "step": 76300 }, { "epoch": 0.1528, "grad_norm": 0.3093933165073395, "learning_rate": 0.0001694404, "loss": 0.3089, "step": 76400 }, { "epoch": 0.153, "grad_norm": 0.2426934689283371, "learning_rate": 0.00016940040000000003, "loss": 0.3128, "step": 76500 }, { "epoch": 0.1532, "grad_norm": 0.20812727510929108, "learning_rate": 0.00016936040000000002, "loss": 0.3093, "step": 76600 }, { "epoch": 0.1534, "grad_norm": 0.4342077672481537, "learning_rate": 0.0001693204, "loss": 0.3295, 
"step": 76700 }, { "epoch": 0.1536, "grad_norm": 0.2487853765487671, "learning_rate": 0.0001692804, "loss": 0.3202, "step": 76800 }, { "epoch": 0.1538, "grad_norm": 0.21004439890384674, "learning_rate": 0.0001692404, "loss": 0.3136, "step": 76900 }, { "epoch": 0.154, "grad_norm": 0.20426279306411743, "learning_rate": 0.00016920040000000002, "loss": 0.3154, "step": 77000 }, { "epoch": 0.1542, "grad_norm": 0.2324894517660141, "learning_rate": 0.00016916040000000001, "loss": 0.3124, "step": 77100 }, { "epoch": 0.1544, "grad_norm": 0.24114584922790527, "learning_rate": 0.0001691204, "loss": 0.3218, "step": 77200 }, { "epoch": 0.1546, "grad_norm": 0.21619604527950287, "learning_rate": 0.0001690804, "loss": 0.3081, "step": 77300 }, { "epoch": 0.1548, "grad_norm": 0.3266576826572418, "learning_rate": 0.00016904040000000002, "loss": 0.3154, "step": 77400 }, { "epoch": 0.155, "grad_norm": 0.19163742661476135, "learning_rate": 0.00016900040000000002, "loss": 0.3112, "step": 77500 }, { "epoch": 0.1552, "grad_norm": 0.24338412284851074, "learning_rate": 0.0001689604, "loss": 0.309, "step": 77600 }, { "epoch": 0.1554, "grad_norm": 0.2039482295513153, "learning_rate": 0.0001689204, "loss": 0.3196, "step": 77700 }, { "epoch": 0.1556, "grad_norm": 0.2261604517698288, "learning_rate": 0.0001688804, "loss": 0.3157, "step": 77800 }, { "epoch": 0.1558, "grad_norm": 0.23354065418243408, "learning_rate": 0.00016884040000000002, "loss": 0.3091, "step": 77900 }, { "epoch": 0.156, "grad_norm": 0.3552130162715912, "learning_rate": 0.0001688004, "loss": 0.3138, "step": 78000 }, { "epoch": 0.1562, "grad_norm": 0.3127024471759796, "learning_rate": 0.00016876040000000003, "loss": 0.3134, "step": 78100 }, { "epoch": 0.1564, "grad_norm": 0.2056431621313095, "learning_rate": 0.0001687204, "loss": 0.3082, "step": 78200 }, { "epoch": 0.1566, "grad_norm": 0.22769121825695038, "learning_rate": 0.0001686804, "loss": 0.318, "step": 78300 }, { "epoch": 0.1568, "grad_norm": 0.1975649893283844, 
"learning_rate": 0.0001686404, "loss": 0.3284, "step": 78400 }, { "epoch": 0.157, "grad_norm": 0.25407645106315613, "learning_rate": 0.0001686004, "loss": 0.3124, "step": 78500 }, { "epoch": 0.1572, "grad_norm": 0.18915730714797974, "learning_rate": 0.00016856040000000003, "loss": 0.3125, "step": 78600 }, { "epoch": 0.1574, "grad_norm": 0.20961833000183105, "learning_rate": 0.0001685204, "loss": 0.3113, "step": 78700 }, { "epoch": 0.1576, "grad_norm": 0.20440556108951569, "learning_rate": 0.00016848040000000001, "loss": 0.3133, "step": 78800 }, { "epoch": 0.1578, "grad_norm": 0.20298384130001068, "learning_rate": 0.0001684404, "loss": 0.3195, "step": 78900 }, { "epoch": 0.158, "grad_norm": 0.2182042896747589, "learning_rate": 0.0001684004, "loss": 0.3091, "step": 79000 }, { "epoch": 0.1582, "grad_norm": 0.22923989593982697, "learning_rate": 0.00016836040000000002, "loss": 0.3181, "step": 79100 }, { "epoch": 0.1584, "grad_norm": 0.3075358271598816, "learning_rate": 0.0001683204, "loss": 0.3143, "step": 79200 }, { "epoch": 0.1586, "grad_norm": 0.2053437978029251, "learning_rate": 0.0001682804, "loss": 0.3086, "step": 79300 }, { "epoch": 0.1588, "grad_norm": 0.7424286007881165, "learning_rate": 0.0001682404, "loss": 0.3103, "step": 79400 }, { "epoch": 0.159, "grad_norm": 0.6188491582870483, "learning_rate": 0.00016820040000000002, "loss": 0.3183, "step": 79500 }, { "epoch": 0.1592, "grad_norm": 0.276163786649704, "learning_rate": 0.00016816040000000002, "loss": 0.3191, "step": 79600 }, { "epoch": 0.1594, "grad_norm": 0.18703459203243256, "learning_rate": 0.0001681204, "loss": 0.3139, "step": 79700 }, { "epoch": 0.1596, "grad_norm": 0.17597968876361847, "learning_rate": 0.0001680804, "loss": 0.3108, "step": 79800 }, { "epoch": 0.1598, "grad_norm": 0.24276955425739288, "learning_rate": 0.0001680404, "loss": 0.3068, "step": 79900 }, { "epoch": 0.16, "grad_norm": 0.21838606894016266, "learning_rate": 0.00016800040000000002, "loss": 0.3132, "step": 80000 }, { "epoch": 
0.1602, "grad_norm": 0.2652193009853363, "learning_rate": 0.0001679604, "loss": 0.3107, "step": 80100 }, { "epoch": 0.1604, "grad_norm": 0.22899410128593445, "learning_rate": 0.0001679204, "loss": 0.3124, "step": 80200 }, { "epoch": 0.1606, "grad_norm": 0.19513286650180817, "learning_rate": 0.0001678804, "loss": 0.3125, "step": 80300 }, { "epoch": 0.1608, "grad_norm": 0.25317785143852234, "learning_rate": 0.00016784040000000002, "loss": 0.3075, "step": 80400 }, { "epoch": 0.161, "grad_norm": 0.21707969903945923, "learning_rate": 0.00016780040000000001, "loss": 0.3203, "step": 80500 }, { "epoch": 0.1612, "grad_norm": 0.22994515299797058, "learning_rate": 0.0001677604, "loss": 0.3136, "step": 80600 }, { "epoch": 0.1614, "grad_norm": 0.29428768157958984, "learning_rate": 0.0001677204, "loss": 0.3201, "step": 80700 }, { "epoch": 0.1616, "grad_norm": 0.24848338961601257, "learning_rate": 0.0001676804, "loss": 0.3169, "step": 80800 }, { "epoch": 0.1618, "grad_norm": 0.2289322018623352, "learning_rate": 0.00016764040000000002, "loss": 0.3055, "step": 80900 }, { "epoch": 0.162, "grad_norm": 0.20457616448402405, "learning_rate": 0.0001676004, "loss": 0.3115, "step": 81000 }, { "epoch": 0.1622, "grad_norm": 0.21032114326953888, "learning_rate": 0.00016756040000000003, "loss": 0.3116, "step": 81100 }, { "epoch": 0.1624, "grad_norm": 0.20674587786197662, "learning_rate": 0.0001675204, "loss": 0.3184, "step": 81200 }, { "epoch": 0.1626, "grad_norm": 0.4695895314216614, "learning_rate": 0.00016748040000000002, "loss": 0.3165, "step": 81300 }, { "epoch": 0.1628, "grad_norm": 0.26248136162757874, "learning_rate": 0.0001674404, "loss": 0.3092, "step": 81400 }, { "epoch": 0.163, "grad_norm": 0.20731079578399658, "learning_rate": 0.0001674004, "loss": 0.3141, "step": 81500 }, { "epoch": 0.1632, "grad_norm": 0.2757965922355652, "learning_rate": 0.00016736040000000002, "loss": 0.3124, "step": 81600 }, { "epoch": 0.1634, "grad_norm": 0.23029279708862305, "learning_rate": 0.0001673204, 
"loss": 0.3079, "step": 81700 }, { "epoch": 0.1636, "grad_norm": 0.22975629568099976, "learning_rate": 0.0001672804, "loss": 0.3157, "step": 81800 }, { "epoch": 0.1638, "grad_norm": 0.27681633830070496, "learning_rate": 0.0001672404, "loss": 0.31, "step": 81900 }, { "epoch": 0.164, "grad_norm": 0.20509104430675507, "learning_rate": 0.0001672004, "loss": 0.3141, "step": 82000 }, { "epoch": 0.1642, "grad_norm": 0.22510622441768646, "learning_rate": 0.00016716040000000002, "loss": 0.3046, "step": 82100 }, { "epoch": 0.1644, "grad_norm": 0.24418208003044128, "learning_rate": 0.0001671204, "loss": 0.324, "step": 82200 }, { "epoch": 0.1646, "grad_norm": 0.2196940779685974, "learning_rate": 0.0001670804, "loss": 0.3079, "step": 82300 }, { "epoch": 0.1648, "grad_norm": 0.19491960108280182, "learning_rate": 0.0001670404, "loss": 0.3114, "step": 82400 }, { "epoch": 0.165, "grad_norm": 0.18604259192943573, "learning_rate": 0.00016700040000000002, "loss": 0.3087, "step": 82500 }, { "epoch": 0.1652, "grad_norm": 0.22354719042778015, "learning_rate": 0.00016696040000000002, "loss": 0.3059, "step": 82600 }, { "epoch": 0.1654, "grad_norm": 0.19884774088859558, "learning_rate": 0.0001669204, "loss": 0.3094, "step": 82700 }, { "epoch": 0.1656, "grad_norm": 0.17430974543094635, "learning_rate": 0.0001668804, "loss": 0.3062, "step": 82800 }, { "epoch": 0.1658, "grad_norm": 0.32356834411621094, "learning_rate": 0.0001668404, "loss": 0.3127, "step": 82900 }, { "epoch": 0.166, "grad_norm": 0.19034580886363983, "learning_rate": 0.00016680040000000002, "loss": 0.308, "step": 83000 }, { "epoch": 0.1662, "grad_norm": 0.2106025516986847, "learning_rate": 0.0001667604, "loss": 0.3065, "step": 83100 }, { "epoch": 0.1664, "grad_norm": 0.22621573507785797, "learning_rate": 0.0001667204, "loss": 0.327, "step": 83200 }, { "epoch": 0.1666, "grad_norm": 0.19813230633735657, "learning_rate": 0.0001666804, "loss": 0.3081, "step": 83300 }, { "epoch": 0.1668, "grad_norm": 0.20063921809196472, 
"learning_rate": 0.00016664040000000002, "loss": 0.3069, "step": 83400 }, { "epoch": 0.167, "grad_norm": 0.1986655443906784, "learning_rate": 0.0001666004, "loss": 0.316, "step": 83500 }, { "epoch": 0.1672, "grad_norm": 0.22707831859588623, "learning_rate": 0.0001665604, "loss": 0.3103, "step": 83600 }, { "epoch": 0.1674, "grad_norm": 0.233692929148674, "learning_rate": 0.0001665204, "loss": 0.3196, "step": 83700 }, { "epoch": 0.1676, "grad_norm": 0.35872554779052734, "learning_rate": 0.0001664804, "loss": 0.3179, "step": 83800 }, { "epoch": 0.1678, "grad_norm": 0.3032999336719513, "learning_rate": 0.0001664404, "loss": 0.3123, "step": 83900 }, { "epoch": 0.168, "grad_norm": 0.27240657806396484, "learning_rate": 0.0001664004, "loss": 0.312, "step": 84000 }, { "epoch": 0.1682, "grad_norm": 0.20436140894889832, "learning_rate": 0.00016636040000000003, "loss": 0.3079, "step": 84100 }, { "epoch": 0.1684, "grad_norm": 0.26683861017227173, "learning_rate": 0.0001663204, "loss": 0.3107, "step": 84200 }, { "epoch": 0.1686, "grad_norm": 0.3275362253189087, "learning_rate": 0.00016628040000000001, "loss": 0.3066, "step": 84300 }, { "epoch": 0.1688, "grad_norm": 0.24865290522575378, "learning_rate": 0.0001662404, "loss": 0.3103, "step": 84400 }, { "epoch": 0.169, "grad_norm": 0.24508464336395264, "learning_rate": 0.0001662004, "loss": 0.3084, "step": 84500 }, { "epoch": 0.1692, "grad_norm": 0.19160589575767517, "learning_rate": 0.00016616040000000002, "loss": 0.3086, "step": 84600 }, { "epoch": 0.1694, "grad_norm": 0.21846391260623932, "learning_rate": 0.0001661204, "loss": 0.3038, "step": 84700 }, { "epoch": 0.1696, "grad_norm": 0.3486625850200653, "learning_rate": 0.0001660804, "loss": 0.3157, "step": 84800 }, { "epoch": 0.1698, "grad_norm": 0.281613826751709, "learning_rate": 0.0001660404, "loss": 0.3051, "step": 84900 }, { "epoch": 0.17, "grad_norm": 0.2179608792066574, "learning_rate": 0.00016600040000000002, "loss": 0.3258, "step": 85000 }, { "epoch": 0.1702, 
"grad_norm": 0.24830220639705658, "learning_rate": 0.00016596040000000002, "loss": 0.312, "step": 85100 }, { "epoch": 0.1704, "grad_norm": 0.2119312733411789, "learning_rate": 0.0001659204, "loss": 0.3119, "step": 85200 }, { "epoch": 0.1706, "grad_norm": 0.34956997632980347, "learning_rate": 0.0001658804, "loss": 0.3004, "step": 85300 }, { "epoch": 0.1708, "grad_norm": 0.2335246205329895, "learning_rate": 0.0001658404, "loss": 0.3102, "step": 85400 }, { "epoch": 0.171, "grad_norm": 0.2509993612766266, "learning_rate": 0.00016580040000000002, "loss": 0.3118, "step": 85500 }, { "epoch": 0.1712, "grad_norm": 0.23735851049423218, "learning_rate": 0.0001657604, "loss": 0.3073, "step": 85600 }, { "epoch": 0.1714, "grad_norm": 0.1960500031709671, "learning_rate": 0.0001657204, "loss": 0.3061, "step": 85700 }, { "epoch": 0.1716, "grad_norm": 0.17898212373256683, "learning_rate": 0.0001656804, "loss": 0.3056, "step": 85800 }, { "epoch": 0.1718, "grad_norm": 0.207002654671669, "learning_rate": 0.0001656404, "loss": 0.3114, "step": 85900 }, { "epoch": 0.172, "grad_norm": 0.19042788445949554, "learning_rate": 0.00016560040000000001, "loss": 0.3109, "step": 86000 }, { "epoch": 0.1722, "grad_norm": 0.2865552604198456, "learning_rate": 0.0001655604, "loss": 0.3185, "step": 86100 }, { "epoch": 0.1724, "grad_norm": 0.19299517571926117, "learning_rate": 0.0001655204, "loss": 0.3329, "step": 86200 }, { "epoch": 0.1726, "grad_norm": 0.22495882213115692, "learning_rate": 0.0001654804, "loss": 0.3175, "step": 86300 }, { "epoch": 0.1728, "grad_norm": 0.23224885761737823, "learning_rate": 0.00016544040000000002, "loss": 0.3076, "step": 86400 }, { "epoch": 0.173, "grad_norm": 0.24318882822990417, "learning_rate": 0.0001654004, "loss": 0.3123, "step": 86500 }, { "epoch": 0.1732, "grad_norm": 0.24434731900691986, "learning_rate": 0.0001653604, "loss": 0.3092, "step": 86600 }, { "epoch": 0.1734, "grad_norm": 0.23535099625587463, "learning_rate": 0.0001653204, "loss": 0.3091, "step": 86700 }, 
{ "epoch": 0.1736, "grad_norm": 0.3537319600582123, "learning_rate": 0.0001652804, "loss": 0.3187, "step": 86800 }, { "epoch": 0.1738, "grad_norm": 0.2087029367685318, "learning_rate": 0.0001652404, "loss": 0.3096, "step": 86900 }, { "epoch": 0.174, "grad_norm": 0.20568256080150604, "learning_rate": 0.0001652004, "loss": 0.3114, "step": 87000 }, { "epoch": 0.1742, "grad_norm": 0.19157211482524872, "learning_rate": 0.00016516040000000003, "loss": 0.3124, "step": 87100 }, { "epoch": 0.1744, "grad_norm": 0.20994393527507782, "learning_rate": 0.0001651204, "loss": 0.3047, "step": 87200 }, { "epoch": 0.1746, "grad_norm": 0.4066809117794037, "learning_rate": 0.0001650804, "loss": 0.3105, "step": 87300 }, { "epoch": 0.1748, "grad_norm": 0.21257740259170532, "learning_rate": 0.0001650404, "loss": 0.3084, "step": 87400 }, { "epoch": 0.175, "grad_norm": 0.21162879467010498, "learning_rate": 0.0001650004, "loss": 0.3105, "step": 87500 }, { "epoch": 0.1752, "grad_norm": 0.21106883883476257, "learning_rate": 0.00016496040000000002, "loss": 0.3098, "step": 87600 }, { "epoch": 0.1754, "grad_norm": 0.21054783463478088, "learning_rate": 0.0001649204, "loss": 0.3018, "step": 87700 }, { "epoch": 0.1756, "grad_norm": 0.1708284318447113, "learning_rate": 0.0001648804, "loss": 0.3049, "step": 87800 }, { "epoch": 0.1758, "grad_norm": 0.20066872239112854, "learning_rate": 0.0001648404, "loss": 0.3055, "step": 87900 }, { "epoch": 0.176, "grad_norm": 0.3257821798324585, "learning_rate": 0.00016480040000000002, "loss": 0.3102, "step": 88000 }, { "epoch": 0.1762, "grad_norm": 0.1875178962945938, "learning_rate": 0.00016476040000000002, "loss": 0.311, "step": 88100 }, { "epoch": 0.1764, "grad_norm": 0.3387661874294281, "learning_rate": 0.0001647204, "loss": 0.3101, "step": 88200 }, { "epoch": 0.1766, "grad_norm": 0.3336031436920166, "learning_rate": 0.0001646804, "loss": 0.3154, "step": 88300 }, { "epoch": 0.1768, "grad_norm": 0.21378177404403687, "learning_rate": 0.0001646404, "loss": 0.3195, 
"step": 88400 }, { "epoch": 0.177, "grad_norm": 0.2274891436100006, "learning_rate": 0.00016460040000000002, "loss": 0.3096, "step": 88500 }, { "epoch": 0.1772, "grad_norm": 0.23640379309654236, "learning_rate": 0.0001645604, "loss": 0.3089, "step": 88600 }, { "epoch": 0.1774, "grad_norm": 0.19889909029006958, "learning_rate": 0.00016452040000000003, "loss": 0.3058, "step": 88700 }, { "epoch": 0.1776, "grad_norm": 0.20005521178245544, "learning_rate": 0.0001644804, "loss": 0.3059, "step": 88800 }, { "epoch": 0.1778, "grad_norm": 0.18955092132091522, "learning_rate": 0.00016444040000000002, "loss": 0.3026, "step": 88900 }, { "epoch": 0.178, "grad_norm": 0.23005805909633636, "learning_rate": 0.0001644004, "loss": 0.3067, "step": 89000 }, { "epoch": 0.1782, "grad_norm": 0.1776682585477829, "learning_rate": 0.0001643604, "loss": 0.3029, "step": 89100 }, { "epoch": 0.1784, "grad_norm": 0.29056572914123535, "learning_rate": 0.00016432040000000003, "loss": 0.3025, "step": 89200 }, { "epoch": 0.1786, "grad_norm": 0.2611645460128784, "learning_rate": 0.0001642804, "loss": 0.3022, "step": 89300 }, { "epoch": 0.1788, "grad_norm": 0.22479821741580963, "learning_rate": 0.00016424040000000001, "loss": 0.3102, "step": 89400 }, { "epoch": 0.179, "grad_norm": 0.3284640908241272, "learning_rate": 0.0001642004, "loss": 0.3078, "step": 89500 }, { "epoch": 0.1792, "grad_norm": 0.19706085324287415, "learning_rate": 0.0001641604, "loss": 0.3005, "step": 89600 }, { "epoch": 0.1794, "grad_norm": 0.1871166229248047, "learning_rate": 0.00016412040000000002, "loss": 0.3067, "step": 89700 }, { "epoch": 0.1796, "grad_norm": 0.18414820730686188, "learning_rate": 0.0001640804, "loss": 0.3065, "step": 89800 }, { "epoch": 0.1798, "grad_norm": 0.22224344313144684, "learning_rate": 0.0001640404, "loss": 0.3023, "step": 89900 }, { "epoch": 0.18, "grad_norm": 0.1839105784893036, "learning_rate": 0.0001640004, "loss": 0.3044, "step": 90000 }, { "epoch": 0.1802, "grad_norm": 0.3029592037200928, 
"learning_rate": 0.00016396040000000002, "loss": 0.3002, "step": 90100 }, { "epoch": 0.1804, "grad_norm": 0.21777479350566864, "learning_rate": 0.00016392040000000002, "loss": 0.3055, "step": 90200 }, { "epoch": 0.1806, "grad_norm": 0.1881372481584549, "learning_rate": 0.0001638804, "loss": 0.3015, "step": 90300 }, { "epoch": 0.1808, "grad_norm": 0.20605388283729553, "learning_rate": 0.0001638404, "loss": 0.3086, "step": 90400 }, { "epoch": 0.181, "grad_norm": 0.21422967314720154, "learning_rate": 0.0001638004, "loss": 0.3107, "step": 90500 }, { "epoch": 0.1812, "grad_norm": 0.18776142597198486, "learning_rate": 0.00016376040000000002, "loss": 0.297, "step": 90600 }, { "epoch": 0.1814, "grad_norm": 0.22885388135910034, "learning_rate": 0.0001637204, "loss": 0.2897, "step": 90700 }, { "epoch": 0.1816, "grad_norm": 0.16700296103954315, "learning_rate": 0.0001636804, "loss": 0.298, "step": 90800 }, { "epoch": 0.1818, "grad_norm": 0.2172263115644455, "learning_rate": 0.0001636404, "loss": 0.2995, "step": 90900 }, { "epoch": 0.182, "grad_norm": 0.25213050842285156, "learning_rate": 0.00016360040000000002, "loss": 0.3077, "step": 91000 }, { "epoch": 0.1822, "grad_norm": 0.22334252297878265, "learning_rate": 0.00016356040000000001, "loss": 0.3077, "step": 91100 }, { "epoch": 0.1824, "grad_norm": 0.25537583231925964, "learning_rate": 0.0001635204, "loss": 0.3074, "step": 91200 }, { "epoch": 0.1826, "grad_norm": 0.24679423868656158, "learning_rate": 0.0001634804, "loss": 0.3139, "step": 91300 }, { "epoch": 0.1828, "grad_norm": 0.26473841071128845, "learning_rate": 0.0001634404, "loss": 0.3155, "step": 91400 }, { "epoch": 0.183, "grad_norm": 0.23032037913799286, "learning_rate": 0.00016340040000000002, "loss": 0.3191, "step": 91500 }, { "epoch": 0.1832, "grad_norm": 0.4667571485042572, "learning_rate": 0.0001633604, "loss": 0.3006, "step": 91600 }, { "epoch": 0.1834, "grad_norm": 0.1938803791999817, "learning_rate": 0.00016332040000000003, "loss": 0.3034, "step": 91700 }, { 
"epoch": 0.1836, "grad_norm": 0.1822306215763092, "learning_rate": 0.0001632804, "loss": 0.306, "step": 91800 }, { "epoch": 0.1838, "grad_norm": 0.29107385873794556, "learning_rate": 0.00016324040000000002, "loss": 0.304, "step": 91900 }, { "epoch": 0.184, "grad_norm": 0.3339613676071167, "learning_rate": 0.0001632004, "loss": 0.2963, "step": 92000 }, { "epoch": 0.1842, "grad_norm": 0.2274635285139084, "learning_rate": 0.0001631604, "loss": 0.2997, "step": 92100 }, { "epoch": 0.1844, "grad_norm": 0.19457891583442688, "learning_rate": 0.00016312040000000002, "loss": 0.306, "step": 92200 }, { "epoch": 0.1846, "grad_norm": 0.2524142265319824, "learning_rate": 0.0001630804, "loss": 0.3096, "step": 92300 }, { "epoch": 0.1848, "grad_norm": 0.25092560052871704, "learning_rate": 0.0001630404, "loss": 0.2999, "step": 92400 }, { "epoch": 0.185, "grad_norm": 0.323590487241745, "learning_rate": 0.0001630004, "loss": 0.2996, "step": 92500 }, { "epoch": 0.1852, "grad_norm": 0.4235788881778717, "learning_rate": 0.00016296040000000003, "loss": 0.3031, "step": 92600 }, { "epoch": 0.1854, "grad_norm": 0.19289268553256989, "learning_rate": 0.00016292040000000002, "loss": 0.3026, "step": 92700 }, { "epoch": 0.1856, "grad_norm": 0.1985141932964325, "learning_rate": 0.00016288039999999999, "loss": 0.302, "step": 92800 }, { "epoch": 0.1858, "grad_norm": 0.3089956045150757, "learning_rate": 0.0001628404, "loss": 0.3002, "step": 92900 }, { "epoch": 0.186, "grad_norm": 0.2129218727350235, "learning_rate": 0.0001628004, "loss": 0.3036, "step": 93000 }, { "epoch": 0.1862, "grad_norm": 0.3660809397697449, "learning_rate": 0.00016276040000000002, "loss": 0.3017, "step": 93100 }, { "epoch": 0.1864, "grad_norm": 0.22182439267635345, "learning_rate": 0.00016272040000000002, "loss": 0.3012, "step": 93200 }, { "epoch": 0.1866, "grad_norm": 0.22655236721038818, "learning_rate": 0.0001626804, "loss": 0.3054, "step": 93300 }, { "epoch": 0.1868, "grad_norm": 0.19816626608371735, "learning_rate": 
0.0001626404, "loss": 0.2978, "step": 93400 }, { "epoch": 0.187, "grad_norm": 0.19407741725444794, "learning_rate": 0.0001626004, "loss": 0.3013, "step": 93500 }, { "epoch": 0.1872, "grad_norm": 0.15464486181735992, "learning_rate": 0.00016256040000000002, "loss": 0.3073, "step": 93600 }, { "epoch": 0.1874, "grad_norm": 0.34153929352760315, "learning_rate": 0.0001625204, "loss": 0.3023, "step": 93700 }, { "epoch": 0.1876, "grad_norm": 0.35526177287101746, "learning_rate": 0.0001624804, "loss": 0.311, "step": 93800 }, { "epoch": 0.1878, "grad_norm": 0.2251167744398117, "learning_rate": 0.0001624404, "loss": 0.3127, "step": 93900 }, { "epoch": 0.188, "grad_norm": 0.25468969345092773, "learning_rate": 0.00016240040000000002, "loss": 0.3004, "step": 94000 }, { "epoch": 0.1882, "grad_norm": 0.17790032923221588, "learning_rate": 0.0001623604, "loss": 0.3096, "step": 94100 }, { "epoch": 0.1884, "grad_norm": 0.23429210484027863, "learning_rate": 0.0001623204, "loss": 0.3023, "step": 94200 }, { "epoch": 0.1886, "grad_norm": 0.3114053010940552, "learning_rate": 0.0001622804, "loss": 0.3043, "step": 94300 }, { "epoch": 0.1888, "grad_norm": 0.19942690432071686, "learning_rate": 0.0001622404, "loss": 0.2985, "step": 94400 }, { "epoch": 0.189, "grad_norm": 0.22419995069503784, "learning_rate": 0.0001622004, "loss": 0.3014, "step": 94500 }, { "epoch": 0.1892, "grad_norm": 0.19408832490444183, "learning_rate": 0.0001621604, "loss": 0.3031, "step": 94600 }, { "epoch": 0.1894, "grad_norm": 0.24643263220787048, "learning_rate": 0.00016212040000000003, "loss": 0.3042, "step": 94700 }, { "epoch": 0.1896, "grad_norm": 0.22279608249664307, "learning_rate": 0.0001620804, "loss": 0.3022, "step": 94800 }, { "epoch": 0.1898, "grad_norm": 0.340925931930542, "learning_rate": 0.00016204040000000001, "loss": 0.2986, "step": 94900 }, { "epoch": 0.19, "grad_norm": 0.1957314908504486, "learning_rate": 0.0001620004, "loss": 0.3014, "step": 95000 }, { "epoch": 0.1902, "grad_norm": 
0.23643195629119873, "learning_rate": 0.0001619604, "loss": 0.3103, "step": 95100 }, { "epoch": 0.1904, "grad_norm": 0.2049199789762497, "learning_rate": 0.00016192040000000002, "loss": 0.3039, "step": 95200 }, { "epoch": 0.1906, "grad_norm": 0.19225084781646729, "learning_rate": 0.0001618804, "loss": 0.2963, "step": 95300 }, { "epoch": 0.1908, "grad_norm": 0.2920241355895996, "learning_rate": 0.0001618404, "loss": 0.2996, "step": 95400 }, { "epoch": 0.191, "grad_norm": 0.21930663287639618, "learning_rate": 0.0001618004, "loss": 0.3014, "step": 95500 }, { "epoch": 0.1912, "grad_norm": 0.2423180788755417, "learning_rate": 0.00016176040000000002, "loss": 0.3032, "step": 95600 }, { "epoch": 0.1914, "grad_norm": 0.37270796298980713, "learning_rate": 0.00016172040000000002, "loss": 0.3054, "step": 95700 }, { "epoch": 0.1916, "grad_norm": 0.26363104581832886, "learning_rate": 0.0001616804, "loss": 0.3049, "step": 95800 }, { "epoch": 0.1918, "grad_norm": 0.2359706610441208, "learning_rate": 0.0001616404, "loss": 0.3016, "step": 95900 }, { "epoch": 0.192, "grad_norm": 0.2633230984210968, "learning_rate": 0.0001616004, "loss": 0.3027, "step": 96000 }, { "epoch": 0.1922, "grad_norm": 0.16779562830924988, "learning_rate": 0.00016156040000000002, "loss": 0.3006, "step": 96100 }, { "epoch": 0.1924, "grad_norm": 0.17921751737594604, "learning_rate": 0.0001615204, "loss": 0.2997, "step": 96200 }, { "epoch": 0.1926, "grad_norm": 0.1813753992319107, "learning_rate": 0.0001614804, "loss": 0.2989, "step": 96300 }, { "epoch": 0.1928, "grad_norm": 0.2046331912279129, "learning_rate": 0.0001614404, "loss": 0.2992, "step": 96400 }, { "epoch": 0.193, "grad_norm": 0.1848832368850708, "learning_rate": 0.0001614004, "loss": 0.301, "step": 96500 }, { "epoch": 0.1932, "grad_norm": 0.27407926321029663, "learning_rate": 0.00016136040000000001, "loss": 0.3084, "step": 96600 }, { "epoch": 0.1934, "grad_norm": 0.17408868670463562, "learning_rate": 0.0001613204, "loss": 0.2984, "step": 96700 }, { 
"epoch": 0.1936, "grad_norm": 0.3170103132724762, "learning_rate": 0.0001612804, "loss": 0.3098, "step": 96800 }, { "epoch": 0.1938, "grad_norm": 0.1906721144914627, "learning_rate": 0.0001612404, "loss": 0.3002, "step": 96900 }, { "epoch": 0.194, "grad_norm": 0.22694748640060425, "learning_rate": 0.00016120040000000002, "loss": 0.3056, "step": 97000 }, { "epoch": 0.1942, "grad_norm": 0.2381366342306137, "learning_rate": 0.0001611604, "loss": 0.3112, "step": 97100 }, { "epoch": 0.1944, "grad_norm": 0.2619655132293701, "learning_rate": 0.0001611204, "loss": 0.3044, "step": 97200 }, { "epoch": 0.1946, "grad_norm": 0.22410011291503906, "learning_rate": 0.0001610804, "loss": 0.2976, "step": 97300 }, { "epoch": 0.1948, "grad_norm": 0.17326219379901886, "learning_rate": 0.0001610404, "loss": 0.2977, "step": 97400 }, { "epoch": 0.195, "grad_norm": 0.16517135500907898, "learning_rate": 0.0001610004, "loss": 0.3074, "step": 97500 }, { "epoch": 0.1952, "grad_norm": 0.2561172544956207, "learning_rate": 0.0001609604, "loss": 0.3074, "step": 97600 }, { "epoch": 0.1954, "grad_norm": 0.19643421471118927, "learning_rate": 0.00016092040000000003, "loss": 0.3031, "step": 97700 }, { "epoch": 0.1956, "grad_norm": 0.26884031295776367, "learning_rate": 0.0001608804, "loss": 0.298, "step": 97800 }, { "epoch": 0.1958, "grad_norm": 0.1689986288547516, "learning_rate": 0.0001608404, "loss": 0.303, "step": 97900 }, { "epoch": 0.196, "grad_norm": 0.22177661955356598, "learning_rate": 0.0001608004, "loss": 0.3113, "step": 98000 }, { "epoch": 0.1962, "grad_norm": 0.23559048771858215, "learning_rate": 0.0001607604, "loss": 0.3019, "step": 98100 }, { "epoch": 0.1964, "grad_norm": 0.2539811432361603, "learning_rate": 0.00016072040000000002, "loss": 0.3017, "step": 98200 }, { "epoch": 0.1966, "grad_norm": 0.2040734440088272, "learning_rate": 0.0001606804, "loss": 0.294, "step": 98300 }, { "epoch": 0.1968, "grad_norm": 0.18892860412597656, "learning_rate": 0.0001606404, "loss": 0.3082, "step": 98400 
}, { "epoch": 0.197, "grad_norm": 0.23716984689235687, "learning_rate": 0.0001606004, "loss": 0.3005, "step": 98500 }, { "epoch": 0.1972, "grad_norm": 0.28783920407295227, "learning_rate": 0.00016056040000000002, "loss": 0.303, "step": 98600 }, { "epoch": 0.1974, "grad_norm": 0.31491243839263916, "learning_rate": 0.00016052040000000002, "loss": 0.2989, "step": 98700 }, { "epoch": 0.1976, "grad_norm": 0.2184073030948639, "learning_rate": 0.0001604804, "loss": 0.2992, "step": 98800 }, { "epoch": 0.1978, "grad_norm": 0.19414541125297546, "learning_rate": 0.0001604404, "loss": 0.3067, "step": 98900 }, { "epoch": 0.198, "grad_norm": 0.18975822627544403, "learning_rate": 0.0001604004, "loss": 0.2994, "step": 99000 }, { "epoch": 0.1982, "grad_norm": 0.24121256172657013, "learning_rate": 0.00016036040000000002, "loss": 0.2967, "step": 99100 }, { "epoch": 0.1984, "grad_norm": 0.19773977994918823, "learning_rate": 0.0001603204, "loss": 0.2955, "step": 99200 }, { "epoch": 0.1986, "grad_norm": 0.22694402933120728, "learning_rate": 0.0001602804, "loss": 0.2977, "step": 99300 }, { "epoch": 0.1988, "grad_norm": 0.3497297465801239, "learning_rate": 0.0001602404, "loss": 0.3044, "step": 99400 }, { "epoch": 0.199, "grad_norm": 0.18374694883823395, "learning_rate": 0.00016020040000000002, "loss": 0.2965, "step": 99500 }, { "epoch": 0.1992, "grad_norm": 0.33236002922058105, "learning_rate": 0.0001601604, "loss": 0.3007, "step": 99600 }, { "epoch": 0.1994, "grad_norm": 0.17745955288410187, "learning_rate": 0.0001601204, "loss": 0.2924, "step": 99700 }, { "epoch": 0.1996, "grad_norm": 0.3132670223712921, "learning_rate": 0.0001600804, "loss": 0.3021, "step": 99800 }, { "epoch": 0.1998, "grad_norm": 0.2375771552324295, "learning_rate": 0.0001600404, "loss": 0.3016, "step": 99900 }, { "epoch": 0.2, "grad_norm": 0.31381866335868835, "learning_rate": 0.00016000040000000001, "loss": 0.2965, "step": 100000 }, { "epoch": 0.0002, "grad_norm": 0.24663914740085602, "learning_rate": 0.0001599604, 
"loss": 0.3381, "step": 100100 }, { "epoch": 0.0004, "grad_norm": 0.6509382128715515, "learning_rate": 0.0001599204, "loss": 0.3687, "step": 100200 }, { "epoch": 0.0006, "grad_norm": 0.8887121081352234, "learning_rate": 0.0001598804, "loss": 0.3573, "step": 100300 }, { "epoch": 0.0008, "grad_norm": 0.366519033908844, "learning_rate": 0.0001598404, "loss": 0.3624, "step": 100400 }, { "epoch": 0.001, "grad_norm": 0.2282172590494156, "learning_rate": 0.0001598004, "loss": 0.3336, "step": 100500 }, { "epoch": 0.0012, "grad_norm": 0.46760937571525574, "learning_rate": 0.0001597604, "loss": 0.3279, "step": 100600 }, { "epoch": 0.0014, "grad_norm": 0.38649675250053406, "learning_rate": 0.00015972040000000002, "loss": 0.3385, "step": 100700 }, { "epoch": 0.0016, "grad_norm": 0.4251730740070343, "learning_rate": 0.0001596804, "loss": 0.335, "step": 100800 }, { "epoch": 0.0018, "grad_norm": 0.3479570150375366, "learning_rate": 0.0001596404, "loss": 0.3238, "step": 100900 }, { "epoch": 0.002, "grad_norm": 0.3096480667591095, "learning_rate": 0.0001596004, "loss": 0.3159, "step": 101000 }, { "epoch": 0.0022, "grad_norm": 0.36655187606811523, "learning_rate": 0.0001595604, "loss": 0.3457, "step": 101100 }, { "epoch": 0.0024, "grad_norm": 0.6139711141586304, "learning_rate": 0.00015952040000000002, "loss": 0.3249, "step": 101200 }, { "epoch": 0.0026, "grad_norm": 0.6844519376754761, "learning_rate": 0.0001594804, "loss": 0.3306, "step": 101300 }, { "epoch": 0.0028, "grad_norm": 0.16951784491539001, "learning_rate": 0.0001594404, "loss": 0.3109, "step": 101400 }, { "epoch": 0.003, "grad_norm": 0.4543248414993286, "learning_rate": 0.0001594004, "loss": 0.3241, "step": 101500 }, { "epoch": 0.0032, "grad_norm": 0.6316892504692078, "learning_rate": 0.00015936040000000002, "loss": 0.3479, "step": 101600 }, { "epoch": 0.0034, "grad_norm": 0.8181749582290649, "learning_rate": 0.00015932040000000001, "loss": 0.3215, "step": 101700 }, { "epoch": 0.0036, "grad_norm": 0.33392763137817383, 
"learning_rate": 0.0001592804, "loss": 0.3253, "step": 101800 }, { "epoch": 0.0038, "grad_norm": 0.27364447712898254, "learning_rate": 0.0001592404, "loss": 0.3252, "step": 101900 }, { "epoch": 0.004, "grad_norm": 0.443889319896698, "learning_rate": 0.0001592004, "loss": 0.3214, "step": 102000 }, { "epoch": 0.0042, "grad_norm": 0.23544713854789734, "learning_rate": 0.00015916040000000002, "loss": 0.3227, "step": 102100 }, { "epoch": 0.0044, "grad_norm": 0.24548670649528503, "learning_rate": 0.0001591204, "loss": 0.3491, "step": 102200 }, { "epoch": 0.0046, "grad_norm": 0.4170477092266083, "learning_rate": 0.00015908040000000003, "loss": 0.3318, "step": 102300 }, { "epoch": 0.0048, "grad_norm": 0.24510768055915833, "learning_rate": 0.0001590404, "loss": 0.3116, "step": 102400 }, { "epoch": 0.005, "grad_norm": 0.2921444773674011, "learning_rate": 0.00015900040000000002, "loss": 0.3049, "step": 102500 }, { "epoch": 0.0052, "grad_norm": 0.2453673630952835, "learning_rate": 0.0001589604, "loss": 0.3052, "step": 102600 }, { "epoch": 0.0054, "grad_norm": 0.3682481348514557, "learning_rate": 0.0001589204, "loss": 0.3123, "step": 102700 }, { "epoch": 0.0056, "grad_norm": 0.6065927743911743, "learning_rate": 0.00015888040000000002, "loss": 0.3278, "step": 102800 }, { "epoch": 0.0058, "grad_norm": 0.27878880500793457, "learning_rate": 0.0001588404, "loss": 0.3095, "step": 102900 }, { "epoch": 0.006, "grad_norm": 0.3071708083152771, "learning_rate": 0.0001588004, "loss": 0.3729, "step": 103000 }, { "epoch": 0.0062, "grad_norm": 0.27451980113983154, "learning_rate": 0.0001587604, "loss": 0.3234, "step": 103100 }, { "epoch": 0.0064, "grad_norm": 0.1992095708847046, "learning_rate": 0.00015872040000000003, "loss": 0.3142, "step": 103200 }, { "epoch": 0.0066, "grad_norm": 1.1765793561935425, "learning_rate": 0.00015868040000000002, "loss": 0.3365, "step": 103300 }, { "epoch": 0.0068, "grad_norm": 0.22701187431812286, "learning_rate": 0.0001586404, "loss": 0.3165, "step": 103400 }, 
{ "epoch": 0.007, "grad_norm": 0.30635154247283936, "learning_rate": 0.0001586004, "loss": 0.3271, "step": 103500 }, { "epoch": 0.0072, "grad_norm": 0.3161128759384155, "learning_rate": 0.0001585604, "loss": 0.3246, "step": 103600 }, { "epoch": 0.0074, "grad_norm": 0.2176142781972885, "learning_rate": 0.00015852040000000002, "loss": 0.3342, "step": 103700 }, { "epoch": 0.0076, "grad_norm": 0.15987540781497955, "learning_rate": 0.00015848040000000001, "loss": 0.3345, "step": 103800 }, { "epoch": 0.0078, "grad_norm": 0.2533618211746216, "learning_rate": 0.0001584404, "loss": 0.3176, "step": 103900 }, { "epoch": 0.008, "grad_norm": 0.28857430815696716, "learning_rate": 0.0001584004, "loss": 0.3287, "step": 104000 }, { "epoch": 0.0082, "grad_norm": 0.4161205291748047, "learning_rate": 0.0001583604, "loss": 0.3342, "step": 104100 }, { "epoch": 0.0084, "grad_norm": 0.851681649684906, "learning_rate": 0.00015832040000000002, "loss": 0.318, "step": 104200 }, { "epoch": 0.0086, "grad_norm": 0.21251384913921356, "learning_rate": 0.0001582804, "loss": 0.3125, "step": 104300 }, { "epoch": 0.0088, "grad_norm": 0.19260574877262115, "learning_rate": 0.0001582404, "loss": 0.3137, "step": 104400 }, { "epoch": 0.009, "grad_norm": 0.3808644413948059, "learning_rate": 0.0001582004, "loss": 0.33, "step": 104500 }, { "epoch": 0.0092, "grad_norm": 0.23479130864143372, "learning_rate": 0.00015816040000000002, "loss": 0.3368, "step": 104600 }, { "epoch": 0.0094, "grad_norm": 0.3280342221260071, "learning_rate": 0.0001581204, "loss": 0.3242, "step": 104700 }, { "epoch": 0.0096, "grad_norm": 0.3940763473510742, "learning_rate": 0.0001580804, "loss": 0.3362, "step": 104800 }, { "epoch": 0.0098, "grad_norm": 0.23537495732307434, "learning_rate": 0.0001580404, "loss": 0.3342, "step": 104900 }, { "epoch": 0.01, "grad_norm": 0.3302271366119385, "learning_rate": 0.0001580004, "loss": 0.3262, "step": 105000 }, { "epoch": 0.0102, "grad_norm": 0.3009653389453888, "learning_rate": 0.0001579604, 
"loss": 0.3353, "step": 105100 }, { "epoch": 0.0104, "grad_norm": 0.28532806038856506, "learning_rate": 0.0001579204, "loss": 0.3156, "step": 105200 }, { "epoch": 0.0106, "grad_norm": 0.24950698018074036, "learning_rate": 0.00015788040000000003, "loss": 0.3294, "step": 105300 }, { "epoch": 0.0108, "grad_norm": 0.26212894916534424, "learning_rate": 0.0001578404, "loss": 0.3329, "step": 105400 }, { "epoch": 0.011, "grad_norm": 0.22611194849014282, "learning_rate": 0.00015780040000000001, "loss": 0.3311, "step": 105500 }, { "epoch": 0.0112, "grad_norm": 0.2197837382555008, "learning_rate": 0.0001577604, "loss": 0.327, "step": 105600 }, { "epoch": 0.0114, "grad_norm": 0.31932878494262695, "learning_rate": 0.0001577204, "loss": 0.316, "step": 105700 }, { "epoch": 0.0116, "grad_norm": 0.2394053041934967, "learning_rate": 0.00015768040000000002, "loss": 0.331, "step": 105800 }, { "epoch": 0.0118, "grad_norm": 0.20572052896022797, "learning_rate": 0.0001576404, "loss": 0.3324, "step": 105900 }, { "epoch": 0.012, "grad_norm": 0.19541539251804352, "learning_rate": 0.0001576004, "loss": 0.3201, "step": 106000 }, { "epoch": 0.0122, "grad_norm": 0.2593480348587036, "learning_rate": 0.0001575604, "loss": 0.3545, "step": 106100 }, { "epoch": 0.0124, "grad_norm": 0.2632744014263153, "learning_rate": 0.00015752040000000002, "loss": 0.3217, "step": 106200 }, { "epoch": 0.0126, "grad_norm": 0.2666124701499939, "learning_rate": 0.00015748040000000002, "loss": 0.3302, "step": 106300 }, { "epoch": 0.0128, "grad_norm": 0.17219951748847961, "learning_rate": 0.0001574404, "loss": 0.3136, "step": 106400 }, { "epoch": 0.013, "grad_norm": 0.22360770404338837, "learning_rate": 0.0001574004, "loss": 0.3171, "step": 106500 }, { "epoch": 0.0132, "grad_norm": 0.24799229204654694, "learning_rate": 0.0001573604, "loss": 0.3081, "step": 106600 }, { "epoch": 0.0134, "grad_norm": 0.20889222621917725, "learning_rate": 0.00015732040000000002, "loss": 0.3466, "step": 106700 }, { "epoch": 0.0136, 
"grad_norm": 0.2627999186515808, "learning_rate": 0.0001572804, "loss": 0.3139, "step": 106800 }, { "epoch": 0.0138, "grad_norm": 0.3904314339160919, "learning_rate": 0.0001572404, "loss": 0.3438, "step": 106900 }, { "epoch": 0.014, "grad_norm": 0.2342156618833542, "learning_rate": 0.0001572004, "loss": 0.335, "step": 107000 }, { "epoch": 0.0142, "grad_norm": 0.17536994814872742, "learning_rate": 0.00015716040000000002, "loss": 0.3221, "step": 107100 }, { "epoch": 0.0144, "grad_norm": 0.3388653099536896, "learning_rate": 0.00015712040000000001, "loss": 0.3233, "step": 107200 }, { "epoch": 0.0146, "grad_norm": 0.1892201155424118, "learning_rate": 0.0001570804, "loss": 0.3233, "step": 107300 }, { "epoch": 0.0148, "grad_norm": 0.4015616178512573, "learning_rate": 0.0001570404, "loss": 0.3135, "step": 107400 }, { "epoch": 0.015, "grad_norm": 0.2883925139904022, "learning_rate": 0.0001570004, "loss": 0.3252, "step": 107500 }, { "epoch": 0.0152, "grad_norm": 0.27161914110183716, "learning_rate": 0.00015696040000000002, "loss": 0.3202, "step": 107600 }, { "epoch": 0.0154, "grad_norm": 0.1823039948940277, "learning_rate": 0.0001569204, "loss": 0.3132, "step": 107700 }, { "epoch": 0.0156, "grad_norm": 0.23866082727909088, "learning_rate": 0.0001568804, "loss": 0.3158, "step": 107800 }, { "epoch": 0.0158, "grad_norm": 0.3076813519001007, "learning_rate": 0.0001568404, "loss": 0.3097, "step": 107900 }, { "epoch": 0.016, "grad_norm": 0.3990881145000458, "learning_rate": 0.0001568004, "loss": 0.3117, "step": 108000 }, { "epoch": 0.0162, "grad_norm": 0.2077111005783081, "learning_rate": 0.0001567604, "loss": 0.3304, "step": 108100 }, { "epoch": 0.0164, "grad_norm": 0.20977669954299927, "learning_rate": 0.0001567204, "loss": 0.3177, "step": 108200 }, { "epoch": 0.0166, "grad_norm": 0.33695122599601746, "learning_rate": 0.00015668040000000003, "loss": 0.3127, "step": 108300 }, { "epoch": 0.0168, "grad_norm": 0.3933875262737274, "learning_rate": 0.0001566404, "loss": 0.3187, 
"step": 108400 }, { "epoch": 0.017, "grad_norm": 0.24768926203250885, "learning_rate": 0.0001566004, "loss": 0.3279, "step": 108500 }, { "epoch": 0.0172, "grad_norm": 0.23914366960525513, "learning_rate": 0.0001565604, "loss": 0.3389, "step": 108600 }, { "epoch": 0.0174, "grad_norm": 0.2776049077510834, "learning_rate": 0.0001565204, "loss": 0.3202, "step": 108700 }, { "epoch": 0.0176, "grad_norm": 0.2476639598608017, "learning_rate": 0.00015648040000000002, "loss": 0.3203, "step": 108800 }, { "epoch": 0.0178, "grad_norm": 0.25279584527015686, "learning_rate": 0.0001564404, "loss": 0.3031, "step": 108900 }, { "epoch": 0.018, "grad_norm": 0.19141827523708344, "learning_rate": 0.0001564004, "loss": 0.3146, "step": 109000 }, { "epoch": 0.0182, "grad_norm": 0.1972874402999878, "learning_rate": 0.0001563604, "loss": 0.3188, "step": 109100 }, { "epoch": 0.0184, "grad_norm": 0.21893855929374695, "learning_rate": 0.00015632040000000002, "loss": 0.3208, "step": 109200 }, { "epoch": 0.0186, "grad_norm": 0.2527053654193878, "learning_rate": 0.00015628040000000002, "loss": 0.3135, "step": 109300 }, { "epoch": 0.0188, "grad_norm": 0.21851500868797302, "learning_rate": 0.0001562404, "loss": 0.2979, "step": 109400 }, { "epoch": 0.019, "grad_norm": 0.41574743390083313, "learning_rate": 0.0001562004, "loss": 0.3395, "step": 109500 }, { "epoch": 0.0192, "grad_norm": 0.23641952872276306, "learning_rate": 0.0001561604, "loss": 0.3052, "step": 109600 }, { "epoch": 0.0194, "grad_norm": 0.2123720645904541, "learning_rate": 0.00015612040000000002, "loss": 0.3294, "step": 109700 }, { "epoch": 0.0196, "grad_norm": 0.2654692530632019, "learning_rate": 0.0001560804, "loss": 0.3133, "step": 109800 }, { "epoch": 0.0198, "grad_norm": 0.7307865023612976, "learning_rate": 0.0001560404, "loss": 0.3231, "step": 109900 }, { "epoch": 0.02, "grad_norm": 0.23842667043209076, "learning_rate": 0.0001560004, "loss": 0.3278, "step": 110000 }, { "epoch": 0.0202, "grad_norm": 0.16572842001914978, 
"learning_rate": 0.00015596040000000002, "loss": 0.3166, "step": 110100 }, { "epoch": 0.0204, "grad_norm": 1.4259867668151855, "learning_rate": 0.0001559204, "loss": 0.3491, "step": 110200 }, { "epoch": 0.0206, "grad_norm": 0.2205478399991989, "learning_rate": 0.0001558804, "loss": 0.3302, "step": 110300 }, { "epoch": 0.0208, "grad_norm": 0.20075049996376038, "learning_rate": 0.0001558404, "loss": 0.3316, "step": 110400 }, { "epoch": 0.021, "grad_norm": 0.2577056884765625, "learning_rate": 0.0001558004, "loss": 0.2999, "step": 110500 }, { "epoch": 0.0212, "grad_norm": 0.2297847867012024, "learning_rate": 0.00015576040000000001, "loss": 0.3269, "step": 110600 }, { "epoch": 0.0214, "grad_norm": 0.21519100666046143, "learning_rate": 0.0001557204, "loss": 0.3017, "step": 110700 }, { "epoch": 0.0216, "grad_norm": 0.3626148998737335, "learning_rate": 0.00015568040000000003, "loss": 0.3238, "step": 110800 }, { "epoch": 0.0218, "grad_norm": 0.20477981865406036, "learning_rate": 0.0001556404, "loss": 0.3233, "step": 110900 }, { "epoch": 0.022, "grad_norm": 0.593943178653717, "learning_rate": 0.0001556004, "loss": 0.3187, "step": 111000 }, { "epoch": 0.0222, "grad_norm": 0.22717879712581635, "learning_rate": 0.0001555604, "loss": 0.329, "step": 111100 }, { "epoch": 0.0224, "grad_norm": 0.24597914516925812, "learning_rate": 0.0001555204, "loss": 0.3204, "step": 111200 }, { "epoch": 0.0226, "grad_norm": 0.6169671416282654, "learning_rate": 0.00015548040000000002, "loss": 0.3318, "step": 111300 }, { "epoch": 0.0228, "grad_norm": 0.512597918510437, "learning_rate": 0.0001554404, "loss": 0.3495, "step": 111400 }, { "epoch": 0.023, "grad_norm": 0.1982726901769638, "learning_rate": 0.0001554004, "loss": 0.3256, "step": 111500 }, { "epoch": 0.0232, "grad_norm": 0.24446536600589752, "learning_rate": 0.0001553604, "loss": 0.3182, "step": 111600 }, { "epoch": 0.0234, "grad_norm": 0.1990106999874115, "learning_rate": 0.0001553204, "loss": 0.3103, "step": 111700 }, { "epoch": 0.0236, 
"grad_norm": 0.32062387466430664, "learning_rate": 0.00015528040000000002, "loss": 0.3336, "step": 111800 }, { "epoch": 0.0238, "grad_norm": 0.24834854900836945, "learning_rate": 0.00015524039999999999, "loss": 0.3718, "step": 111900 }, { "epoch": 0.024, "grad_norm": 0.431271493434906, "learning_rate": 0.0001552004, "loss": 0.3148, "step": 112000 }, { "epoch": 0.0242, "grad_norm": 0.24593783915042877, "learning_rate": 0.0001551604, "loss": 0.3206, "step": 112100 }, { "epoch": 0.0244, "grad_norm": 0.22774197161197662, "learning_rate": 0.00015512040000000002, "loss": 0.323, "step": 112200 }, { "epoch": 0.0246, "grad_norm": 0.28193041682243347, "learning_rate": 0.00015508040000000001, "loss": 0.321, "step": 112300 }, { "epoch": 0.0248, "grad_norm": 0.27978020906448364, "learning_rate": 0.0001550404, "loss": 0.3293, "step": 112400 }, { "epoch": 0.025, "grad_norm": 0.33567145466804504, "learning_rate": 0.0001550004, "loss": 0.3398, "step": 112500 }, { "epoch": 0.0252, "grad_norm": 0.31685078144073486, "learning_rate": 0.0001549604, "loss": 0.3468, "step": 112600 }, { "epoch": 0.0254, "grad_norm": 0.23281008005142212, "learning_rate": 0.00015492040000000002, "loss": 0.3415, "step": 112700 }, { "epoch": 0.0256, "grad_norm": 0.29617008566856384, "learning_rate": 0.0001548804, "loss": 0.3215, "step": 112800 }, { "epoch": 0.0258, "grad_norm": 0.28596845269203186, "learning_rate": 0.0001548404, "loss": 0.3273, "step": 112900 }, { "epoch": 0.026, "grad_norm": 0.2520228922367096, "learning_rate": 0.0001548004, "loss": 0.3656, "step": 113000 }, { "epoch": 0.0262, "grad_norm": 0.1868925839662552, "learning_rate": 0.00015476040000000002, "loss": 0.3305, "step": 113100 }, { "epoch": 0.0264, "grad_norm": 0.2512012720108032, "learning_rate": 0.0001547204, "loss": 0.3331, "step": 113200 }, { "epoch": 0.0266, "grad_norm": 0.24037250876426697, "learning_rate": 0.0001546804, "loss": 0.3265, "step": 113300 }, { "epoch": 0.0268, "grad_norm": 0.44156473875045776, "learning_rate": 
0.0001546404, "loss": 0.325, "step": 113400 }, { "epoch": 0.027, "grad_norm": 0.212955042719841, "learning_rate": 0.0001546004, "loss": 0.3103, "step": 113500 }, { "epoch": 0.0272, "grad_norm": 0.3389400839805603, "learning_rate": 0.0001545604, "loss": 0.2959, "step": 113600 }, { "epoch": 0.0274, "grad_norm": 0.21144139766693115, "learning_rate": 0.0001545204, "loss": 0.298, "step": 113700 }, { "epoch": 0.0276, "grad_norm": 0.2982296049594879, "learning_rate": 0.00015448040000000003, "loss": 0.2967, "step": 113800 }, { "epoch": 0.0278, "grad_norm": 0.23678374290466309, "learning_rate": 0.00015444040000000002, "loss": 0.2932, "step": 113900 }, { "epoch": 0.028, "grad_norm": 0.18785041570663452, "learning_rate": 0.0001544004, "loss": 0.3059, "step": 114000 }, { "epoch": 0.0282, "grad_norm": 0.27205973863601685, "learning_rate": 0.0001543604, "loss": 0.2817, "step": 114100 }, { "epoch": 0.0284, "grad_norm": 0.18749140202999115, "learning_rate": 0.0001543204, "loss": 0.2911, "step": 114200 }, { "epoch": 0.0286, "grad_norm": 0.21401409804821014, "learning_rate": 0.00015428040000000002, "loss": 0.3041, "step": 114300 }, { "epoch": 0.0288, "grad_norm": 0.20857536792755127, "learning_rate": 0.00015424040000000001, "loss": 0.2904, "step": 114400 }, { "epoch": 0.029, "grad_norm": 0.3695431053638458, "learning_rate": 0.0001542004, "loss": 0.3039, "step": 114500 }, { "epoch": 0.0292, "grad_norm": 0.42280298471450806, "learning_rate": 0.0001541604, "loss": 0.2926, "step": 114600 }, { "epoch": 0.0294, "grad_norm": 0.3620811998844147, "learning_rate": 0.0001541204, "loss": 0.29, "step": 114700 }, { "epoch": 0.0296, "grad_norm": 0.23109310865402222, "learning_rate": 0.00015408040000000002, "loss": 0.3008, "step": 114800 }, { "epoch": 0.0298, "grad_norm": 0.17469018697738647, "learning_rate": 0.0001540404, "loss": 0.2955, "step": 114900 }, { "epoch": 0.03, "grad_norm": 0.25855448842048645, "learning_rate": 0.0001540004, "loss": 0.3033, "step": 115000 }, { "epoch": 0.0302, 
"grad_norm": 0.2488468438386917, "learning_rate": 0.0001539604, "loss": 0.3022, "step": 115100 }, { "epoch": 0.0304, "grad_norm": 0.19968360662460327, "learning_rate": 0.00015392040000000002, "loss": 0.2934, "step": 115200 }, { "epoch": 0.0306, "grad_norm": 0.22593122720718384, "learning_rate": 0.0001538804, "loss": 0.292, "step": 115300 }, { "epoch": 0.0308, "grad_norm": 0.27579984068870544, "learning_rate": 0.0001538404, "loss": 0.295, "step": 115400 }, { "epoch": 0.031, "grad_norm": 0.25232911109924316, "learning_rate": 0.0001538004, "loss": 0.3041, "step": 115500 }, { "epoch": 0.0312, "grad_norm": 0.295006662607193, "learning_rate": 0.0001537604, "loss": 0.3088, "step": 115600 }, { "epoch": 0.0314, "grad_norm": 0.520390510559082, "learning_rate": 0.0001537204, "loss": 0.2972, "step": 115700 }, { "epoch": 0.0316, "grad_norm": 0.24330058693885803, "learning_rate": 0.0001536804, "loss": 0.2937, "step": 115800 }, { "epoch": 0.0318, "grad_norm": 0.4119510054588318, "learning_rate": 0.00015364040000000003, "loss": 0.298, "step": 115900 }, { "epoch": 0.032, "grad_norm": 0.309059202671051, "learning_rate": 0.0001536004, "loss": 0.3025, "step": 116000 }, { "epoch": 0.0322, "grad_norm": 0.30650094151496887, "learning_rate": 0.00015356040000000001, "loss": 0.3046, "step": 116100 }, { "epoch": 0.0324, "grad_norm": 0.24224722385406494, "learning_rate": 0.0001535204, "loss": 0.299, "step": 116200 }, { "epoch": 0.0326, "grad_norm": 0.32729941606521606, "learning_rate": 0.0001534804, "loss": 0.3008, "step": 116300 }, { "epoch": 0.0328, "grad_norm": 0.1765100657939911, "learning_rate": 0.00015344040000000002, "loss": 0.3152, "step": 116400 }, { "epoch": 0.033, "grad_norm": 0.25575095415115356, "learning_rate": 0.0001534004, "loss": 0.306, "step": 116500 }, { "epoch": 0.0332, "grad_norm": 0.6274484992027283, "learning_rate": 0.0001533604, "loss": 0.3092, "step": 116600 }, { "epoch": 0.0334, "grad_norm": 0.21123872697353363, "learning_rate": 0.0001533204, "loss": 0.3978, "step": 
116700 }, { "epoch": 0.0336, "grad_norm": 0.29489830136299133, "learning_rate": 0.00015328040000000002, "loss": 0.3038, "step": 116800 }, { "epoch": 0.0338, "grad_norm": 0.2510158121585846, "learning_rate": 0.00015324040000000002, "loss": 0.3114, "step": 116900 }, { "epoch": 0.034, "grad_norm": 0.23414947092533112, "learning_rate": 0.0001532004, "loss": 0.3052, "step": 117000 }, { "epoch": 0.0342, "grad_norm": 0.19055521488189697, "learning_rate": 0.0001531604, "loss": 0.3013, "step": 117100 }, { "epoch": 0.0344, "grad_norm": 0.8105829954147339, "learning_rate": 0.0001531204, "loss": 0.2967, "step": 117200 }, { "epoch": 0.0346, "grad_norm": 0.4188934564590454, "learning_rate": 0.00015308040000000002, "loss": 0.2951, "step": 117300 }, { "epoch": 0.0348, "grad_norm": 0.21309562027454376, "learning_rate": 0.0001530404, "loss": 0.2981, "step": 117400 }, { "epoch": 0.035, "grad_norm": 0.18121813237667084, "learning_rate": 0.0001530004, "loss": 0.2931, "step": 117500 }, { "epoch": 0.0352, "grad_norm": 0.632888674736023, "learning_rate": 0.0001529604, "loss": 0.2981, "step": 117600 }, { "epoch": 0.0354, "grad_norm": 0.3604109287261963, "learning_rate": 0.00015292040000000002, "loss": 0.2895, "step": 117700 }, { "epoch": 0.0356, "grad_norm": 0.2515547573566437, "learning_rate": 0.00015288040000000001, "loss": 0.3134, "step": 117800 }, { "epoch": 0.0358, "grad_norm": 0.26086676120758057, "learning_rate": 0.0001528404, "loss": 0.3068, "step": 117900 }, { "epoch": 0.036, "grad_norm": 0.20093345642089844, "learning_rate": 0.0001528004, "loss": 0.2985, "step": 118000 }, { "epoch": 0.0362, "grad_norm": 0.43283843994140625, "learning_rate": 0.0001527604, "loss": 0.304, "step": 118100 }, { "epoch": 0.0364, "grad_norm": 0.2671961784362793, "learning_rate": 0.00015272040000000002, "loss": 0.2987, "step": 118200 }, { "epoch": 0.0366, "grad_norm": 0.2851164638996124, "learning_rate": 0.0001526804, "loss": 0.3012, "step": 118300 }, { "epoch": 0.0368, "grad_norm": 0.27135607600212097, 
"learning_rate": 0.00015264040000000003, "loss": 0.3035, "step": 118400 }, { "epoch": 0.037, "grad_norm": 0.2493288666009903, "learning_rate": 0.0001526004, "loss": 0.3087, "step": 118500 }, { "epoch": 0.0372, "grad_norm": 0.2523597478866577, "learning_rate": 0.0001525604, "loss": 0.2925, "step": 118600 }, { "epoch": 0.0374, "grad_norm": 2.79110050201416, "learning_rate": 0.0001525204, "loss": 0.3105, "step": 118700 }, { "epoch": 0.0376, "grad_norm": 0.20949222147464752, "learning_rate": 0.0001524804, "loss": 0.3069, "step": 118800 }, { "epoch": 0.0378, "grad_norm": 0.2129492163658142, "learning_rate": 0.00015244040000000003, "loss": 0.2957, "step": 118900 }, { "epoch": 0.038, "grad_norm": 0.2193364053964615, "learning_rate": 0.0001524004, "loss": 0.2939, "step": 119000 }, { "epoch": 0.0382, "grad_norm": 0.21624450385570526, "learning_rate": 0.0001523604, "loss": 0.3033, "step": 119100 }, { "epoch": 0.0384, "grad_norm": 0.16359418630599976, "learning_rate": 0.0001523204, "loss": 0.3051, "step": 119200 }, { "epoch": 0.0386, "grad_norm": 0.38990601897239685, "learning_rate": 0.0001522804, "loss": 0.2948, "step": 119300 }, { "epoch": 0.0388, "grad_norm": 0.25620323419570923, "learning_rate": 0.00015224040000000002, "loss": 0.3069, "step": 119400 }, { "epoch": 0.039, "grad_norm": 0.3383728563785553, "learning_rate": 0.0001522004, "loss": 0.3168, "step": 119500 }, { "epoch": 0.0392, "grad_norm": 0.296837717294693, "learning_rate": 0.0001521604, "loss": 0.3068, "step": 119600 }, { "epoch": 0.0394, "grad_norm": 0.17246387898921967, "learning_rate": 0.0001521204, "loss": 0.2959, "step": 119700 }, { "epoch": 0.0396, "grad_norm": 0.27589595317840576, "learning_rate": 0.00015208040000000002, "loss": 0.2961, "step": 119800 }, { "epoch": 0.0398, "grad_norm": 0.2941451668739319, "learning_rate": 0.00015204040000000002, "loss": 0.294, "step": 119900 }, { "epoch": 0.04, "grad_norm": 0.18005476891994476, "learning_rate": 0.0001520004, "loss": 0.3013, "step": 120000 }, { "epoch": 
0.0002, "grad_norm": 0.21327625215053558, "learning_rate": 0.0001519604, "loss": 0.2806, "step": 120100 }, { "epoch": 0.0004, "grad_norm": 0.4669700562953949, "learning_rate": 0.0001519204, "loss": 0.2849, "step": 120200 }, { "epoch": 0.0006, "grad_norm": 0.3588303029537201, "learning_rate": 0.00015188040000000002, "loss": 0.2861, "step": 120300 }, { "epoch": 0.0008, "grad_norm": 0.28102320432662964, "learning_rate": 0.0001518404, "loss": 0.2878, "step": 120400 }, { "epoch": 0.001, "grad_norm": 0.1880110502243042, "learning_rate": 0.0001518004, "loss": 0.2811, "step": 120500 }, { "epoch": 0.0012, "grad_norm": 0.35984283685684204, "learning_rate": 0.0001517604, "loss": 0.277, "step": 120600 }, { "epoch": 0.0014, "grad_norm": 0.46658045053482056, "learning_rate": 0.00015172040000000002, "loss": 0.2813, "step": 120700 }, { "epoch": 0.0016, "grad_norm": 0.3781064748764038, "learning_rate": 0.0001516804, "loss": 0.2833, "step": 120800 }, { "epoch": 0.0018, "grad_norm": 0.30914661288261414, "learning_rate": 0.0001516404, "loss": 0.2806, "step": 120900 }, { "epoch": 0.002, "grad_norm": 0.32875189185142517, "learning_rate": 0.0001516004, "loss": 0.2778, "step": 121000 }, { "epoch": 0.0022, "grad_norm": 0.21213066577911377, "learning_rate": 0.0001515604, "loss": 0.2951, "step": 121100 }, { "epoch": 0.0024, "grad_norm": 0.31731417775154114, "learning_rate": 0.00015152040000000001, "loss": 0.2819, "step": 121200 }, { "epoch": 0.0026, "grad_norm": 0.5732041597366333, "learning_rate": 0.0001514804, "loss": 0.2908, "step": 121300 }, { "epoch": 0.0028, "grad_norm": 0.1619558483362198, "learning_rate": 0.00015144040000000003, "loss": 0.2796, "step": 121400 }, { "epoch": 0.003, "grad_norm": 0.4144911766052246, "learning_rate": 0.0001514004, "loss": 0.2892, "step": 121500 }, { "epoch": 0.0032, "grad_norm": 0.2565133273601532, "learning_rate": 0.00015136040000000002, "loss": 0.3034, "step": 121600 }, { "epoch": 0.0034, "grad_norm": 0.5545548796653748, "learning_rate": 0.0001513204, 
"loss": 0.2899, "step": 121700 }, { "epoch": 0.0036, "grad_norm": 0.21982091665267944, "learning_rate": 0.0001512804, "loss": 0.2901, "step": 121800 }, { "epoch": 0.0038, "grad_norm": 0.20543520152568817, "learning_rate": 0.00015124040000000002, "loss": 0.2925, "step": 121900 }, { "epoch": 0.004, "grad_norm": 0.27426043152809143, "learning_rate": 0.0001512004, "loss": 0.2911, "step": 122000 }, { "epoch": 0.0042, "grad_norm": 0.20343656837940216, "learning_rate": 0.0001511604, "loss": 0.2918, "step": 122100 }, { "epoch": 0.0044, "grad_norm": 0.21032807230949402, "learning_rate": 0.0001511204, "loss": 0.3072, "step": 122200 }, { "epoch": 0.0046, "grad_norm": 0.3014376163482666, "learning_rate": 0.0001510804, "loss": 0.2942, "step": 122300 }, { "epoch": 0.0048, "grad_norm": 0.2240341305732727, "learning_rate": 0.00015104040000000002, "loss": 0.2799, "step": 122400 }, { "epoch": 0.005, "grad_norm": 0.2607118785381317, "learning_rate": 0.00015100039999999998, "loss": 0.2782, "step": 122500 }, { "epoch": 0.0052, "grad_norm": 0.2203282117843628, "learning_rate": 0.0001509604, "loss": 0.28, "step": 122600 }, { "epoch": 0.0054, "grad_norm": 0.275749534368515, "learning_rate": 0.0001509204, "loss": 0.2859, "step": 122700 }, { "epoch": 0.0056, "grad_norm": 0.2172224372625351, "learning_rate": 0.00015088040000000002, "loss": 0.2916, "step": 122800 }, { "epoch": 0.0058, "grad_norm": 0.24563243985176086, "learning_rate": 0.00015084040000000001, "loss": 0.2784, "step": 122900 }, { "epoch": 0.006, "grad_norm": 0.25981712341308594, "learning_rate": 0.0001508004, "loss": 0.3223, "step": 123000 }, { "epoch": 0.0062, "grad_norm": 0.22978562116622925, "learning_rate": 0.0001507604, "loss": 0.2904, "step": 123100 }, { "epoch": 0.0064, "grad_norm": 0.18407663702964783, "learning_rate": 0.0001507204, "loss": 0.2847, "step": 123200 }, { "epoch": 0.0066, "grad_norm": 0.697337806224823, "learning_rate": 0.00015068040000000002, "loss": 0.2996, "step": 123300 }, { "epoch": 0.0068, "grad_norm": 
0.17664720118045807, "learning_rate": 0.0001506404, "loss": 0.2878, "step": 123400 }, { "epoch": 0.007, "grad_norm": 0.24168753623962402, "learning_rate": 0.0001506004, "loss": 0.2988, "step": 123500 }, { "epoch": 0.0072, "grad_norm": 0.20818035304546356, "learning_rate": 0.0001505604, "loss": 0.2945, "step": 123600 }, { "epoch": 0.0074, "grad_norm": 0.22152996063232422, "learning_rate": 0.00015052040000000002, "loss": 0.3007, "step": 123700 }, { "epoch": 0.0076, "grad_norm": 0.15268588066101074, "learning_rate": 0.0001504804, "loss": 0.3035, "step": 123800 }, { "epoch": 0.0078, "grad_norm": 0.19523349404335022, "learning_rate": 0.0001504404, "loss": 0.2886, "step": 123900 }, { "epoch": 0.008, "grad_norm": 0.25065121054649353, "learning_rate": 0.0001504004, "loss": 0.2931, "step": 124000 }, { "epoch": 0.0082, "grad_norm": 0.2577344477176666, "learning_rate": 0.0001503604, "loss": 0.3029, "step": 124100 }, { "epoch": 0.0084, "grad_norm": 0.49803975224494934, "learning_rate": 0.0001503204, "loss": 0.2879, "step": 124200 }, { "epoch": 0.0086, "grad_norm": 0.2067602127790451, "learning_rate": 0.0001502804, "loss": 0.2863, "step": 124300 }, { "epoch": 0.0088, "grad_norm": 0.14126543700695038, "learning_rate": 0.00015024040000000003, "loss": 0.2845, "step": 124400 }, { "epoch": 0.009, "grad_norm": 0.24404196441173553, "learning_rate": 0.0001502004, "loss": 0.2985, "step": 124500 }, { "epoch": 0.0092, "grad_norm": 0.2484026700258255, "learning_rate": 0.0001501604, "loss": 0.3004, "step": 124600 }, { "epoch": 0.0094, "grad_norm": 0.3001592457294464, "learning_rate": 0.0001501204, "loss": 0.2953, "step": 124700 }, { "epoch": 0.0096, "grad_norm": 0.2282063364982605, "learning_rate": 0.0001500804, "loss": 0.3028, "step": 124800 }, { "epoch": 0.0098, "grad_norm": 0.20628437399864197, "learning_rate": 0.00015004040000000002, "loss": 0.2982, "step": 124900 }, { "epoch": 0.01, "grad_norm": 0.25722581148147583, "learning_rate": 0.0001500004, "loss": 0.2941, "step": 125000 }, { 
"epoch": 0.0102, "grad_norm": 0.23121199011802673, "learning_rate": 0.0001499604, "loss": 0.3042, "step": 125100 }, { "epoch": 0.0104, "grad_norm": 0.26357603073120117, "learning_rate": 0.0001499204, "loss": 0.2879, "step": 125200 }, { "epoch": 0.0106, "grad_norm": 0.24245673418045044, "learning_rate": 0.00014988040000000002, "loss": 0.2984, "step": 125300 }, { "epoch": 0.0108, "grad_norm": 0.23373626172542572, "learning_rate": 0.00014984040000000002, "loss": 0.3001, "step": 125400 }, { "epoch": 0.011, "grad_norm": 0.22621707618236542, "learning_rate": 0.00014980039999999998, "loss": 0.2982, "step": 125500 }, { "epoch": 0.0112, "grad_norm": 0.1624283492565155, "learning_rate": 0.0001497604, "loss": 0.2955, "step": 125600 }, { "epoch": 0.0114, "grad_norm": 0.2670327126979828, "learning_rate": 0.0001497204, "loss": 0.2883, "step": 125700 }, { "epoch": 0.0116, "grad_norm": 0.23716334998607635, "learning_rate": 0.00014968040000000002, "loss": 0.3013, "step": 125800 }, { "epoch": 0.0118, "grad_norm": 0.19362571835517883, "learning_rate": 0.0001496404, "loss": 0.3015, "step": 125900 }, { "epoch": 0.012, "grad_norm": 0.16306428611278534, "learning_rate": 0.0001496004, "loss": 0.2924, "step": 126000 }, { "epoch": 0.0122, "grad_norm": 0.25153741240501404, "learning_rate": 0.0001495604, "loss": 0.3193, "step": 126100 }, { "epoch": 0.0124, "grad_norm": 0.2298291176557541, "learning_rate": 0.0001495204, "loss": 0.2925, "step": 126200 }, { "epoch": 0.0126, "grad_norm": 0.2596379220485687, "learning_rate": 0.0001494804, "loss": 0.2992, "step": 126300 }, { "epoch": 0.0128, "grad_norm": 0.18484342098236084, "learning_rate": 0.0001494404, "loss": 0.2869, "step": 126400 }, { "epoch": 0.013, "grad_norm": 0.21150384843349457, "learning_rate": 0.00014940040000000003, "loss": 0.2917, "step": 126500 }, { "epoch": 0.0132, "grad_norm": 0.210966095328331, "learning_rate": 0.0001493604, "loss": 0.2858, "step": 126600 }, { "epoch": 0.0134, "grad_norm": 0.19671154022216797, "learning_rate": 
0.00014932040000000001, "loss": 0.3121, "step": 126700 }, { "epoch": 0.0136, "grad_norm": 0.24427378177642822, "learning_rate": 0.0001492804, "loss": 0.2878, "step": 126800 }, { "epoch": 0.0138, "grad_norm": 0.278323233127594, "learning_rate": 0.0001492404, "loss": 0.3073, "step": 126900 }, { "epoch": 0.014, "grad_norm": 0.2476424276828766, "learning_rate": 0.00014920040000000002, "loss": 0.3057, "step": 127000 }, { "epoch": 0.0142, "grad_norm": 0.18065540492534637, "learning_rate": 0.0001491604, "loss": 0.2981, "step": 127100 }, { "epoch": 0.0144, "grad_norm": 0.28167441487312317, "learning_rate": 0.0001491204, "loss": 0.2962, "step": 127200 }, { "epoch": 0.0146, "grad_norm": 0.19970493018627167, "learning_rate": 0.0001490804, "loss": 0.2969, "step": 127300 }, { "epoch": 0.0148, "grad_norm": 0.40125706791877747, "learning_rate": 0.00014904040000000002, "loss": 0.2901, "step": 127400 }, { "epoch": 0.015, "grad_norm": 0.24552318453788757, "learning_rate": 0.00014900040000000002, "loss": 0.3009, "step": 127500 }, { "epoch": 0.0152, "grad_norm": 0.22551432251930237, "learning_rate": 0.0001489604, "loss": 0.2938, "step": 127600 }, { "epoch": 0.0154, "grad_norm": 0.21164937317371368, "learning_rate": 0.0001489204, "loss": 0.2914, "step": 127700 }, { "epoch": 0.0156, "grad_norm": 0.22755509614944458, "learning_rate": 0.0001488804, "loss": 0.2918, "step": 127800 }, { "epoch": 0.0158, "grad_norm": 0.28624242544174194, "learning_rate": 0.00014884040000000002, "loss": 0.2889, "step": 127900 }, { "epoch": 0.016, "grad_norm": 0.42901697754859924, "learning_rate": 0.0001488004, "loss": 0.2893, "step": 128000 }, { "epoch": 0.0162, "grad_norm": 0.2009563446044922, "learning_rate": 0.0001487604, "loss": 0.3049, "step": 128100 }, { "epoch": 0.0164, "grad_norm": 0.2092672884464264, "learning_rate": 0.0001487204, "loss": 0.2929, "step": 128200 }, { "epoch": 0.0166, "grad_norm": 0.2817251682281494, "learning_rate": 0.00014868040000000002, "loss": 0.2916, "step": 128300 }, { "epoch": 
0.0168, "grad_norm": 0.3064669668674469, "learning_rate": 0.00014864040000000001, "loss": 0.2961, "step": 128400 }, { "epoch": 0.017, "grad_norm": 0.23615750670433044, "learning_rate": 0.0001486004, "loss": 0.3013, "step": 128500 }, { "epoch": 0.0172, "grad_norm": 0.2097761034965515, "learning_rate": 0.0001485604, "loss": 0.3128, "step": 128600 }, { "epoch": 0.0174, "grad_norm": 0.20526331663131714, "learning_rate": 0.0001485204, "loss": 0.2986, "step": 128700 }, { "epoch": 0.0176, "grad_norm": 0.24395328760147095, "learning_rate": 0.00014848040000000002, "loss": 0.2977, "step": 128800 }, { "epoch": 0.0178, "grad_norm": 0.31493526697158813, "learning_rate": 0.0001484404, "loss": 0.2832, "step": 128900 }, { "epoch": 0.018, "grad_norm": 0.19086699187755585, "learning_rate": 0.00014840040000000003, "loss": 0.2905, "step": 129000 }, { "epoch": 0.0182, "grad_norm": 0.16701176762580872, "learning_rate": 0.0001483604, "loss": 0.2968, "step": 129100 }, { "epoch": 0.0184, "grad_norm": 0.18984141945838928, "learning_rate": 0.0001483204, "loss": 0.2944, "step": 129200 }, { "epoch": 0.0186, "grad_norm": 0.21061408519744873, "learning_rate": 0.0001482804, "loss": 0.2907, "step": 129300 }, { "epoch": 0.0188, "grad_norm": 0.18577519059181213, "learning_rate": 0.0001482404, "loss": 0.2783, "step": 129400 }, { "epoch": 0.019, "grad_norm": 0.36443671584129333, "learning_rate": 0.00014820040000000003, "loss": 0.3125, "step": 129500 }, { "epoch": 0.0192, "grad_norm": 0.25582483410835266, "learning_rate": 0.0001481604, "loss": 0.2863, "step": 129600 }, { "epoch": 0.0194, "grad_norm": 0.16937601566314697, "learning_rate": 0.0001481204, "loss": 0.3062, "step": 129700 }, { "epoch": 0.0196, "grad_norm": 0.2256769835948944, "learning_rate": 0.0001480804, "loss": 0.2894, "step": 129800 }, { "epoch": 0.0198, "grad_norm": 0.5192826390266418, "learning_rate": 0.0001480404, "loss": 0.2953, "step": 129900 }, { "epoch": 0.02, "grad_norm": 0.21238556504249573, "learning_rate": 
0.00014800040000000002, "loss": 0.3024, "step": 130000 }, { "epoch": 0.0202, "grad_norm": 0.15953408181667328, "learning_rate": 0.0001479604, "loss": 0.2966, "step": 130100 }, { "epoch": 0.0204, "grad_norm": 0.6439893841743469, "learning_rate": 0.0001479204, "loss": 0.3231, "step": 130200 }, { "epoch": 0.0206, "grad_norm": 0.2107216715812683, "learning_rate": 0.0001478804, "loss": 0.3037, "step": 130300 }, { "epoch": 0.0208, "grad_norm": 0.2086646556854248, "learning_rate": 0.00014784040000000002, "loss": 0.3024, "step": 130400 }, { "epoch": 0.021, "grad_norm": 0.23454782366752625, "learning_rate": 0.00014780040000000002, "loss": 0.2821, "step": 130500 }, { "epoch": 0.0212, "grad_norm": 0.195747971534729, "learning_rate": 0.0001477604, "loss": 0.2976, "step": 130600 }, { "epoch": 0.0214, "grad_norm": 0.22037339210510254, "learning_rate": 0.0001477204, "loss": 0.2796, "step": 130700 }, { "epoch": 0.0216, "grad_norm": 0.28433117270469666, "learning_rate": 0.0001476804, "loss": 0.3009, "step": 130800 }, { "epoch": 0.0218, "grad_norm": 0.5101615786552429, "learning_rate": 0.00014764040000000002, "loss": 0.3018, "step": 130900 }, { "epoch": 0.022, "grad_norm": 0.4728710353374481, "learning_rate": 0.0001476004, "loss": 0.2966, "step": 131000 }, { "epoch": 0.0222, "grad_norm": 0.2941736876964569, "learning_rate": 0.0001475604, "loss": 0.3063, "step": 131100 }, { "epoch": 0.0224, "grad_norm": 0.26847612857818604, "learning_rate": 0.0001475204, "loss": 0.3034, "step": 131200 }, { "epoch": 0.0226, "grad_norm": 0.30397793650627136, "learning_rate": 0.00014748040000000002, "loss": 0.3112, "step": 131300 }, { "epoch": 0.0228, "grad_norm": 0.48798418045043945, "learning_rate": 0.0001474404, "loss": 0.3263, "step": 131400 }, { "epoch": 0.023, "grad_norm": 0.20954224467277527, "learning_rate": 0.0001474004, "loss": 0.3032, "step": 131500 }, { "epoch": 0.0232, "grad_norm": 0.26696014404296875, "learning_rate": 0.0001473604, "loss": 0.2978, "step": 131600 }, { "epoch": 0.0234, 
"grad_norm": 0.1869426965713501, "learning_rate": 0.0001473204, "loss": 0.2897, "step": 131700 }, { "epoch": 0.0236, "grad_norm": 0.2813512980937958, "learning_rate": 0.00014728040000000001, "loss": 0.3065, "step": 131800 }, { "epoch": 0.0238, "grad_norm": 0.2745366096496582, "learning_rate": 0.0001472404, "loss": 0.3405, "step": 131900 }, { "epoch": 0.024, "grad_norm": 0.3381493091583252, "learning_rate": 0.00014720040000000003, "loss": 0.2944, "step": 132000 }, { "epoch": 0.0242, "grad_norm": 0.20233681797981262, "learning_rate": 0.0001471604, "loss": 0.3009, "step": 132100 }, { "epoch": 0.0244, "grad_norm": 0.24026724696159363, "learning_rate": 0.00014712040000000002, "loss": 0.2996, "step": 132200 }, { "epoch": 0.0246, "grad_norm": 0.24572235345840454, "learning_rate": 0.0001470804, "loss": 0.2985, "step": 132300 }, { "epoch": 0.0248, "grad_norm": 0.26701977849006653, "learning_rate": 0.0001470404, "loss": 0.3037, "step": 132400 }, { "epoch": 0.025, "grad_norm": 0.24025647342205048, "learning_rate": 0.00014700040000000002, "loss": 0.3154, "step": 132500 }, { "epoch": 0.0252, "grad_norm": 0.3687751889228821, "learning_rate": 0.0001469604, "loss": 0.3188, "step": 132600 }, { "epoch": 0.0254, "grad_norm": 0.215996652841568, "learning_rate": 0.0001469204, "loss": 0.3143, "step": 132700 }, { "epoch": 0.0256, "grad_norm": 0.2471071183681488, "learning_rate": 0.0001468804, "loss": 0.3019, "step": 132800 }, { "epoch": 0.0258, "grad_norm": 0.21399539709091187, "learning_rate": 0.00014684040000000002, "loss": 0.3064, "step": 132900 }, { "epoch": 0.026, "grad_norm": 0.2229408621788025, "learning_rate": 0.00014680040000000002, "loss": 0.3383, "step": 133000 }, { "epoch": 0.0262, "grad_norm": 0.1846669614315033, "learning_rate": 0.00014676039999999998, "loss": 0.3073, "step": 133100 }, { "epoch": 0.0264, "grad_norm": 0.2286502569913864, "learning_rate": 0.0001467204, "loss": 0.3073, "step": 133200 }, { "epoch": 0.0266, "grad_norm": 0.21924042701721191, "learning_rate": 
0.0001466804, "loss": 0.3055, "step": 133300 }, { "epoch": 0.0268, "grad_norm": 0.3561740517616272, "learning_rate": 0.00014664040000000002, "loss": 0.3023, "step": 133400 }, { "epoch": 0.027, "grad_norm": 0.2278672158718109, "learning_rate": 0.0001466004, "loss": 0.2862, "step": 133500 }, { "epoch": 0.0272, "grad_norm": 0.28851574659347534, "learning_rate": 0.0001465604, "loss": 0.2724, "step": 133600 }, { "epoch": 0.0274, "grad_norm": 0.20886564254760742, "learning_rate": 0.0001465204, "loss": 0.2766, "step": 133700 }, { "epoch": 0.0276, "grad_norm": 0.2499471753835678, "learning_rate": 0.0001464804, "loss": 0.2761, "step": 133800 }, { "epoch": 0.0278, "grad_norm": 0.1883542537689209, "learning_rate": 0.00014644040000000002, "loss": 0.2739, "step": 133900 }, { "epoch": 0.028, "grad_norm": 0.16889332234859467, "learning_rate": 0.0001464004, "loss": 0.284, "step": 134000 }, { "epoch": 0.0282, "grad_norm": 0.23524640500545502, "learning_rate": 0.0001463604, "loss": 0.265, "step": 134100 }, { "epoch": 0.0284, "grad_norm": 0.18847911059856415, "learning_rate": 0.0001463204, "loss": 0.2718, "step": 134200 }, { "epoch": 0.0286, "grad_norm": 0.1972835212945938, "learning_rate": 0.00014628040000000002, "loss": 0.2858, "step": 134300 }, { "epoch": 0.0288, "grad_norm": 0.2064981758594513, "learning_rate": 0.0001462404, "loss": 0.2715, "step": 134400 }, { "epoch": 0.029, "grad_norm": 0.39654669165611267, "learning_rate": 0.0001462004, "loss": 0.2831, "step": 134500 }, { "epoch": 0.0292, "grad_norm": 0.7286176085472107, "learning_rate": 0.0001461604, "loss": 0.2753, "step": 134600 }, { "epoch": 0.0294, "grad_norm": 0.2998080849647522, "learning_rate": 0.0001461204, "loss": 0.2737, "step": 134700 }, { "epoch": 0.0296, "grad_norm": 0.22182480990886688, "learning_rate": 0.0001460804, "loss": 0.2809, "step": 134800 }, { "epoch": 0.0298, "grad_norm": 0.3183176517486572, "learning_rate": 0.0001460404, "loss": 0.278, "step": 134900 }, { "epoch": 0.03, "grad_norm": 
0.24670830368995667, "learning_rate": 0.00014600040000000003, "loss": 0.284, "step": 135000 }, { "epoch": 0.0302, "grad_norm": 0.22563083469867706, "learning_rate": 0.0001459604, "loss": 0.2843, "step": 135100 }, { "epoch": 0.0304, "grad_norm": 0.2058597058057785, "learning_rate": 0.0001459204, "loss": 0.2767, "step": 135200 }, { "epoch": 0.0306, "grad_norm": 0.21196039021015167, "learning_rate": 0.0001458804, "loss": 0.2765, "step": 135300 }, { "epoch": 0.0308, "grad_norm": 0.24366965889930725, "learning_rate": 0.0001458404, "loss": 0.2783, "step": 135400 }, { "epoch": 0.031, "grad_norm": 0.24236196279525757, "learning_rate": 0.00014580040000000002, "loss": 0.2873, "step": 135500 }, { "epoch": 0.0312, "grad_norm": 0.2742379903793335, "learning_rate": 0.0001457604, "loss": 0.2895, "step": 135600 }, { "epoch": 0.0314, "grad_norm": 0.40432196855545044, "learning_rate": 0.0001457204, "loss": 0.2802, "step": 135700 }, { "epoch": 0.0316, "grad_norm": 0.21296831965446472, "learning_rate": 0.0001456804, "loss": 0.2787, "step": 135800 }, { "epoch": 0.0318, "grad_norm": 0.38033196330070496, "learning_rate": 0.00014564040000000002, "loss": 0.2832, "step": 135900 }, { "epoch": 0.032, "grad_norm": 0.28289541602134705, "learning_rate": 0.00014560040000000002, "loss": 0.2853, "step": 136000 }, { "epoch": 0.0322, "grad_norm": 0.21910668909549713, "learning_rate": 0.0001455604, "loss": 0.2872, "step": 136100 }, { "epoch": 0.0324, "grad_norm": 0.23765510320663452, "learning_rate": 0.0001455204, "loss": 0.2826, "step": 136200 }, { "epoch": 0.0326, "grad_norm": 0.27492231130599976, "learning_rate": 0.0001454804, "loss": 0.2825, "step": 136300 }, { "epoch": 0.0328, "grad_norm": 0.16266077756881714, "learning_rate": 0.00014544040000000002, "loss": 0.2949, "step": 136400 }, { "epoch": 0.033, "grad_norm": 0.21813906729221344, "learning_rate": 0.0001454004, "loss": 0.2898, "step": 136500 }, { "epoch": 0.0332, "grad_norm": 0.6559190154075623, "learning_rate": 0.0001453604, "loss": 0.2917, 
"step": 136600 }, { "epoch": 0.0334, "grad_norm": 0.21168918907642365, "learning_rate": 0.0001453204, "loss": 0.3655, "step": 136700 }, { "epoch": 0.0336, "grad_norm": 0.2966395616531372, "learning_rate": 0.0001452804, "loss": 0.2859, "step": 136800 }, { "epoch": 0.0338, "grad_norm": 0.20525045692920685, "learning_rate": 0.0001452404, "loss": 0.293, "step": 136900 }, { "epoch": 0.034, "grad_norm": 0.17695209383964539, "learning_rate": 0.0001452004, "loss": 0.2869, "step": 137000 }, { "epoch": 0.0342, "grad_norm": 0.182274729013443, "learning_rate": 0.0001451604, "loss": 0.2843, "step": 137100 }, { "epoch": 0.0344, "grad_norm": 0.8087812662124634, "learning_rate": 0.0001451204, "loss": 0.28, "step": 137200 }, { "epoch": 0.0346, "grad_norm": 0.33212119340896606, "learning_rate": 0.00014508040000000001, "loss": 0.2787, "step": 137300 }, { "epoch": 0.0348, "grad_norm": 0.18305575847625732, "learning_rate": 0.0001450404, "loss": 0.2812, "step": 137400 }, { "epoch": 0.035, "grad_norm": 0.17764481902122498, "learning_rate": 0.0001450004, "loss": 0.2773, "step": 137500 }, { "epoch": 0.0352, "grad_norm": 0.33276671171188354, "learning_rate": 0.0001449604, "loss": 0.2829, "step": 137600 }, { "epoch": 0.0354, "grad_norm": 0.2309039831161499, "learning_rate": 0.0001449204, "loss": 0.2744, "step": 137700 }, { "epoch": 0.0356, "grad_norm": 0.21504448354244232, "learning_rate": 0.0001448804, "loss": 0.2975, "step": 137800 }, { "epoch": 0.0358, "grad_norm": 0.23908798396587372, "learning_rate": 0.0001448404, "loss": 0.2891, "step": 137900 }, { "epoch": 0.036, "grad_norm": 0.18185609579086304, "learning_rate": 0.00014480040000000002, "loss": 0.2821, "step": 138000 }, { "epoch": 0.0362, "grad_norm": 0.3852616548538208, "learning_rate": 0.0001447604, "loss": 0.2854, "step": 138100 }, { "epoch": 0.0364, "grad_norm": 0.2340957075357437, "learning_rate": 0.0001447204, "loss": 0.2819, "step": 138200 }, { "epoch": 0.0366, "grad_norm": 0.23798440396785736, "learning_rate": 0.0001446804, 
"loss": 0.2838, "step": 138300 }, { "epoch": 0.0368, "grad_norm": 0.2655554413795471, "learning_rate": 0.0001446404, "loss": 0.2847, "step": 138400 }, { "epoch": 0.037, "grad_norm": 0.20889754593372345, "learning_rate": 0.00014460040000000002, "loss": 0.2906, "step": 138500 }, { "epoch": 0.0372, "grad_norm": 0.2111879587173462, "learning_rate": 0.00014456039999999999, "loss": 0.2766, "step": 138600 }, { "epoch": 0.0374, "grad_norm": 1.0969158411026, "learning_rate": 0.0001445204, "loss": 0.2879, "step": 138700 }, { "epoch": 0.0376, "grad_norm": 0.20623628795146942, "learning_rate": 0.0001444804, "loss": 0.288, "step": 138800 }, { "epoch": 0.0378, "grad_norm": 0.23224857449531555, "learning_rate": 0.00014444040000000002, "loss": 0.2794, "step": 138900 }, { "epoch": 0.038, "grad_norm": 0.20052120089530945, "learning_rate": 0.00014440040000000001, "loss": 0.2779, "step": 139000 }, { "epoch": 0.0382, "grad_norm": 0.21525472402572632, "learning_rate": 0.0001443604, "loss": 0.2861, "step": 139100 }, { "epoch": 0.0384, "grad_norm": 0.155807763338089, "learning_rate": 0.0001443204, "loss": 0.2868, "step": 139200 }, { "epoch": 0.0386, "grad_norm": 0.3168600797653198, "learning_rate": 0.0001442804, "loss": 0.2789, "step": 139300 }, { "epoch": 0.0388, "grad_norm": 0.23451729118824005, "learning_rate": 0.00014424040000000002, "loss": 0.2891, "step": 139400 }, { "epoch": 0.039, "grad_norm": 0.29926997423171997, "learning_rate": 0.0001442004, "loss": 0.2977, "step": 139500 }, { "epoch": 0.0392, "grad_norm": 0.27708712220191956, "learning_rate": 0.00014416040000000003, "loss": 0.2897, "step": 139600 }, { "epoch": 0.0394, "grad_norm": 0.17544563114643097, "learning_rate": 0.0001441204, "loss": 0.282, "step": 139700 }, { "epoch": 0.0396, "grad_norm": 0.24782776832580566, "learning_rate": 0.00014408040000000002, "loss": 0.2789, "step": 139800 }, { "epoch": 0.0398, "grad_norm": 0.23590870201587677, "learning_rate": 0.0001440404, "loss": 0.2794, "step": 139900 }, { "epoch": 0.04, 
"grad_norm": 0.17239037156105042, "learning_rate": 0.0001440004, "loss": 0.2843, "step": 140000 }, { "epoch": 0.0402, "grad_norm": 0.22361674904823303, "learning_rate": 0.00014396040000000003, "loss": 0.2794, "step": 140100 }, { "epoch": 0.0404, "grad_norm": 0.3022303581237793, "learning_rate": 0.0001439204, "loss": 0.3019, "step": 140200 }, { "epoch": 0.0406, "grad_norm": 0.2557013928890228, "learning_rate": 0.0001438804, "loss": 0.2996, "step": 140300 }, { "epoch": 0.0408, "grad_norm": 0.278455913066864, "learning_rate": 0.0001438404, "loss": 0.2935, "step": 140400 }, { "epoch": 0.041, "grad_norm": 0.37940022349357605, "learning_rate": 0.0001438004, "loss": 0.2886, "step": 140500 }, { "epoch": 0.0412, "grad_norm": 0.34311047196388245, "learning_rate": 0.00014376040000000002, "loss": 0.2892, "step": 140600 }, { "epoch": 0.0414, "grad_norm": 0.3010990619659424, "learning_rate": 0.0001437204, "loss": 0.2841, "step": 140700 }, { "epoch": 0.0416, "grad_norm": 0.19272881746292114, "learning_rate": 0.0001436804, "loss": 0.3048, "step": 140800 }, { "epoch": 0.0418, "grad_norm": 0.20627625286579132, "learning_rate": 0.0001436404, "loss": 0.295, "step": 140900 }, { "epoch": 0.042, "grad_norm": 0.4096256494522095, "learning_rate": 0.00014360040000000002, "loss": 0.296, "step": 141000 }, { "epoch": 0.0422, "grad_norm": 0.20318618416786194, "learning_rate": 0.00014356040000000002, "loss": 0.2917, "step": 141100 }, { "epoch": 0.0424, "grad_norm": 0.5557262897491455, "learning_rate": 0.0001435204, "loss": 0.2922, "step": 141200 }, { "epoch": 0.0426, "grad_norm": 0.2203870415687561, "learning_rate": 0.0001434804, "loss": 0.2947, "step": 141300 }, { "epoch": 0.0428, "grad_norm": 0.23622627556324005, "learning_rate": 0.0001434404, "loss": 0.2935, "step": 141400 }, { "epoch": 0.043, "grad_norm": 0.24292029440402985, "learning_rate": 0.00014340040000000002, "loss": 0.2956, "step": 141500 }, { "epoch": 0.0432, "grad_norm": 0.18479755520820618, "learning_rate": 0.0001433604, "loss": 
0.2997, "step": 141600 }, { "epoch": 0.0434, "grad_norm": 0.2119598090648651, "learning_rate": 0.0001433204, "loss": 0.2957, "step": 141700 }, { "epoch": 0.0436, "grad_norm": 0.2696900963783264, "learning_rate": 0.0001432804, "loss": 0.293, "step": 141800 }, { "epoch": 0.0438, "grad_norm": 0.17380273342132568, "learning_rate": 0.00014324040000000002, "loss": 0.2979, "step": 141900 }, { "epoch": 0.044, "grad_norm": 0.20531192421913147, "learning_rate": 0.0001432004, "loss": 0.2969, "step": 142000 }, { "epoch": 0.0442, "grad_norm": 0.2422923445701599, "learning_rate": 0.0001431604, "loss": 0.3016, "step": 142100 }, { "epoch": 0.0444, "grad_norm": 0.16883300244808197, "learning_rate": 0.0001431204, "loss": 0.2839, "step": 142200 }, { "epoch": 0.0446, "grad_norm": 0.1872025579214096, "learning_rate": 0.0001430804, "loss": 0.3367, "step": 142300 }, { "epoch": 0.0448, "grad_norm": 0.18559499084949493, "learning_rate": 0.00014304040000000001, "loss": 0.2872, "step": 142400 }, { "epoch": 0.045, "grad_norm": 0.22933410108089447, "learning_rate": 0.0001430004, "loss": 0.2856, "step": 142500 }, { "epoch": 0.0452, "grad_norm": 0.1616288274526596, "learning_rate": 0.00014296040000000003, "loss": 0.2832, "step": 142600 }, { "epoch": 0.0454, "grad_norm": 0.190198615193367, "learning_rate": 0.0001429204, "loss": 0.2811, "step": 142700 }, { "epoch": 0.0456, "grad_norm": 0.2478422075510025, "learning_rate": 0.00014288040000000002, "loss": 0.2808, "step": 142800 }, { "epoch": 0.0458, "grad_norm": 0.3362259566783905, "learning_rate": 0.0001428404, "loss": 0.285, "step": 142900 }, { "epoch": 0.046, "grad_norm": 0.1876300573348999, "learning_rate": 0.0001428004, "loss": 0.2844, "step": 143000 }, { "epoch": 0.0462, "grad_norm": 0.267817884683609, "learning_rate": 0.00014276040000000002, "loss": 0.2967, "step": 143100 }, { "epoch": 0.0464, "grad_norm": 0.2802794277667999, "learning_rate": 0.0001427204, "loss": 0.3118, "step": 143200 }, { "epoch": 0.0466, "grad_norm": 0.27438464760780334, 
"learning_rate": 0.0001426804, "loss": 0.2868, "step": 143300 }, { "epoch": 0.0468, "grad_norm": 0.22925788164138794, "learning_rate": 0.0001426404, "loss": 0.2945, "step": 143400 }, { "epoch": 0.047, "grad_norm": 0.17650224268436432, "learning_rate": 0.00014260040000000002, "loss": 0.2908, "step": 143500 }, { "epoch": 0.0472, "grad_norm": 0.1927453577518463, "learning_rate": 0.00014256040000000002, "loss": 0.2882, "step": 143600 }, { "epoch": 0.0474, "grad_norm": 0.2470424622297287, "learning_rate": 0.00014252039999999998, "loss": 0.2918, "step": 143700 }, { "epoch": 0.0476, "grad_norm": 0.2224511355161667, "learning_rate": 0.0001424804, "loss": 0.2924, "step": 143800 }, { "epoch": 0.0478, "grad_norm": 0.25327613949775696, "learning_rate": 0.0001424404, "loss": 0.2966, "step": 143900 }, { "epoch": 0.048, "grad_norm": 0.2762177288532257, "learning_rate": 0.00014240040000000002, "loss": 0.2976, "step": 144000 }, { "epoch": 0.0482, "grad_norm": 0.32673609256744385, "learning_rate": 0.0001423604, "loss": 0.2974, "step": 144100 }, { "epoch": 0.0484, "grad_norm": 0.4225119352340698, "learning_rate": 0.0001423204, "loss": 0.2931, "step": 144200 }, { "epoch": 0.0486, "grad_norm": 0.20474287867546082, "learning_rate": 0.0001422804, "loss": 0.2992, "step": 144300 }, { "epoch": 0.0488, "grad_norm": 0.210563525557518, "learning_rate": 0.0001422404, "loss": 0.2848, "step": 144400 }, { "epoch": 0.049, "grad_norm": 0.21854105591773987, "learning_rate": 0.00014220040000000001, "loss": 0.2965, "step": 144500 }, { "epoch": 0.0492, "grad_norm": 0.2694493532180786, "learning_rate": 0.0001421604, "loss": 0.2847, "step": 144600 }, { "epoch": 0.0494, "grad_norm": 0.33580654859542847, "learning_rate": 0.0001421204, "loss": 0.2971, "step": 144700 }, { "epoch": 0.0496, "grad_norm": 0.22318777441978455, "learning_rate": 0.0001420804, "loss": 0.2951, "step": 144800 }, { "epoch": 0.0498, "grad_norm": 0.21913912892341614, "learning_rate": 0.00014204040000000002, "loss": 0.2943, "step": 144900 
}, { "epoch": 0.05, "grad_norm": 0.1860969066619873, "learning_rate": 0.0001420004, "loss": 0.2937, "step": 145000 }, { "epoch": 0.0502, "grad_norm": 0.18611373007297516, "learning_rate": 0.0001419604, "loss": 0.2915, "step": 145100 }, { "epoch": 0.0504, "grad_norm": 0.3601659834384918, "learning_rate": 0.0001419204, "loss": 0.3132, "step": 145200 }, { "epoch": 0.0506, "grad_norm": 0.22689419984817505, "learning_rate": 0.0001418804, "loss": 0.2991, "step": 145300 }, { "epoch": 0.0508, "grad_norm": 0.22691553831100464, "learning_rate": 0.0001418404, "loss": 0.2964, "step": 145400 }, { "epoch": 0.051, "grad_norm": 0.18553423881530762, "learning_rate": 0.0001418004, "loss": 0.2975, "step": 145500 }, { "epoch": 0.0512, "grad_norm": 0.21025227010250092, "learning_rate": 0.00014176040000000003, "loss": 0.2923, "step": 145600 }, { "epoch": 0.0514, "grad_norm": 0.17990227043628693, "learning_rate": 0.0001417204, "loss": 0.3035, "step": 145700 }, { "epoch": 0.0516, "grad_norm": 0.18237918615341187, "learning_rate": 0.0001416804, "loss": 0.3022, "step": 145800 }, { "epoch": 0.0518, "grad_norm": 0.20058906078338623, "learning_rate": 0.0001416404, "loss": 0.2902, "step": 145900 }, { "epoch": 0.052, "grad_norm": 0.40441182255744934, "learning_rate": 0.0001416004, "loss": 0.3083, "step": 146000 }, { "epoch": 0.0522, "grad_norm": 0.2583122253417969, "learning_rate": 0.00014156040000000002, "loss": 0.285, "step": 146100 }, { "epoch": 0.0524, "grad_norm": 0.18074502050876617, "learning_rate": 0.0001415204, "loss": 0.2932, "step": 146200 }, { "epoch": 0.0526, "grad_norm": 0.3700363337993622, "learning_rate": 0.0001414804, "loss": 0.314, "step": 146300 }, { "epoch": 0.0528, "grad_norm": 0.20429983735084534, "learning_rate": 0.0001414404, "loss": 0.2994, "step": 146400 }, { "epoch": 0.053, "grad_norm": 0.4449596405029297, "learning_rate": 0.00014140040000000002, "loss": 0.2922, "step": 146500 }, { "epoch": 0.0532, "grad_norm": 0.27035078406333923, "learning_rate": 
0.00014136040000000002, "loss": 0.3025, "step": 146600 }, { "epoch": 0.0534, "grad_norm": 0.1620853841304779, "learning_rate": 0.0001413204, "loss": 0.2883, "step": 146700 }, { "epoch": 0.0536, "grad_norm": 0.20872916281223297, "learning_rate": 0.0001412804, "loss": 0.2899, "step": 146800 }, { "epoch": 0.0538, "grad_norm": 0.171489417552948, "learning_rate": 0.0001412404, "loss": 0.293, "step": 146900 }, { "epoch": 0.054, "grad_norm": 0.19745928049087524, "learning_rate": 0.00014120040000000002, "loss": 0.2898, "step": 147000 }, { "epoch": 0.0542, "grad_norm": 0.18130256235599518, "learning_rate": 0.0001411604, "loss": 0.2923, "step": 147100 }, { "epoch": 0.0544, "grad_norm": 0.29075926542282104, "learning_rate": 0.0001411204, "loss": 0.3032, "step": 147200 }, { "epoch": 0.0546, "grad_norm": 0.21553675830364227, "learning_rate": 0.0001410804, "loss": 0.3012, "step": 147300 }, { "epoch": 0.0548, "grad_norm": 0.23423010110855103, "learning_rate": 0.00014104040000000002, "loss": 0.2911, "step": 147400 }, { "epoch": 0.055, "grad_norm": 0.2579517364501953, "learning_rate": 0.0001410004, "loss": 0.292, "step": 147500 }, { "epoch": 0.0552, "grad_norm": 0.23888039588928223, "learning_rate": 0.0001409604, "loss": 0.3001, "step": 147600 }, { "epoch": 0.0554, "grad_norm": 0.32866355776786804, "learning_rate": 0.0001409204, "loss": 0.318, "step": 147700 }, { "epoch": 0.0556, "grad_norm": 0.30911415815353394, "learning_rate": 0.0001408804, "loss": 0.3005, "step": 147800 }, { "epoch": 0.0558, "grad_norm": 0.2983124852180481, "learning_rate": 0.00014084040000000001, "loss": 0.2895, "step": 147900 }, { "epoch": 0.056, "grad_norm": 0.18074147403240204, "learning_rate": 0.0001408004, "loss": 0.3108, "step": 148000 }, { "epoch": 0.0562, "grad_norm": 0.17594757676124573, "learning_rate": 0.0001407604, "loss": 0.2967, "step": 148100 }, { "epoch": 0.0564, "grad_norm": 0.21774758398532867, "learning_rate": 0.0001407204, "loss": 0.299, "step": 148200 }, { "epoch": 0.0566, "grad_norm": 
0.3890017569065094, "learning_rate": 0.0001406804, "loss": 0.2822, "step": 148300 }, { "epoch": 0.0568, "grad_norm": 0.23971693217754364, "learning_rate": 0.0001406404, "loss": 0.3042, "step": 148400 }, { "epoch": 0.057, "grad_norm": 0.14672812819480896, "learning_rate": 0.0001406004, "loss": 0.3023, "step": 148500 }, { "epoch": 0.0572, "grad_norm": 0.22197787463665009, "learning_rate": 0.00014056040000000002, "loss": 0.2946, "step": 148600 }, { "epoch": 0.0574, "grad_norm": 0.34892770648002625, "learning_rate": 0.0001405204, "loss": 0.3027, "step": 148700 }, { "epoch": 0.0576, "grad_norm": 0.22468386590480804, "learning_rate": 0.0001404804, "loss": 0.3036, "step": 148800 }, { "epoch": 0.0578, "grad_norm": 0.3529873192310333, "learning_rate": 0.0001404404, "loss": 0.3103, "step": 148900 }, { "epoch": 0.058, "grad_norm": 0.22541065514087677, "learning_rate": 0.0001404004, "loss": 0.3031, "step": 149000 }, { "epoch": 0.0582, "grad_norm": 0.19883927702903748, "learning_rate": 0.00014036040000000002, "loss": 0.2875, "step": 149100 }, { "epoch": 0.0584, "grad_norm": 0.16821545362472534, "learning_rate": 0.00014032039999999999, "loss": 0.2906, "step": 149200 }, { "epoch": 0.0586, "grad_norm": 0.2122853845357895, "learning_rate": 0.0001402804, "loss": 0.294, "step": 149300 }, { "epoch": 0.0588, "grad_norm": 0.21868352591991425, "learning_rate": 0.0001402404, "loss": 0.3014, "step": 149400 }, { "epoch": 0.059, "grad_norm": 0.26323187351226807, "learning_rate": 0.00014020040000000002, "loss": 0.3239, "step": 149500 }, { "epoch": 0.0592, "grad_norm": 0.28496459126472473, "learning_rate": 0.00014016040000000001, "loss": 0.3198, "step": 149600 }, { "epoch": 0.0594, "grad_norm": 0.21418221294879913, "learning_rate": 0.0001401204, "loss": 0.3037, "step": 149700 }, { "epoch": 0.0596, "grad_norm": 0.18050341308116913, "learning_rate": 0.0001400804, "loss": 0.2864, "step": 149800 }, { "epoch": 0.0598, "grad_norm": 0.18909691274166107, "learning_rate": 0.0001400404, "loss": 0.2765, 
"step": 149900 }, { "epoch": 0.06, "grad_norm": 0.1939447522163391, "learning_rate": 0.00014000040000000002, "loss": 0.2759, "step": 150000 }, { "epoch": 0.0602, "grad_norm": 0.2437305450439453, "learning_rate": 0.0001399604, "loss": 0.2684, "step": 150100 }, { "epoch": 0.0604, "grad_norm": 0.18825781345367432, "learning_rate": 0.0001399204, "loss": 0.2675, "step": 150200 }, { "epoch": 0.0606, "grad_norm": 0.17045961320400238, "learning_rate": 0.0001398804, "loss": 0.2668, "step": 150300 }, { "epoch": 0.0608, "grad_norm": 0.16906027495861053, "learning_rate": 0.00013984040000000002, "loss": 0.2641, "step": 150400 }, { "epoch": 0.061, "grad_norm": 0.19279079139232635, "learning_rate": 0.0001398004, "loss": 0.2656, "step": 150500 }, { "epoch": 0.0612, "grad_norm": 0.16379393637180328, "learning_rate": 0.0001397604, "loss": 0.2643, "step": 150600 }, { "epoch": 0.0614, "grad_norm": 0.20136988162994385, "learning_rate": 0.0001397204, "loss": 0.2644, "step": 150700 }, { "epoch": 0.0616, "grad_norm": 0.16851016879081726, "learning_rate": 0.0001396804, "loss": 0.2688, "step": 150800 }, { "epoch": 0.0618, "grad_norm": 0.3253178596496582, "learning_rate": 0.0001396404, "loss": 0.2745, "step": 150900 }, { "epoch": 0.062, "grad_norm": 0.22693248093128204, "learning_rate": 0.0001396004, "loss": 0.264, "step": 151000 }, { "epoch": 0.0622, "grad_norm": 0.16674602031707764, "learning_rate": 0.00013956040000000003, "loss": 0.2702, "step": 151100 }, { "epoch": 0.0624, "grad_norm": 0.17631779611110687, "learning_rate": 0.00013952040000000002, "loss": 0.2707, "step": 151200 }, { "epoch": 0.0626, "grad_norm": 0.18675994873046875, "learning_rate": 0.0001394804, "loss": 0.2697, "step": 151300 }, { "epoch": 0.0628, "grad_norm": 0.19959695637226105, "learning_rate": 0.0001394404, "loss": 0.2674, "step": 151400 }, { "epoch": 0.063, "grad_norm": 0.1849915087223053, "learning_rate": 0.0001394004, "loss": 0.2662, "step": 151500 }, { "epoch": 0.0632, "grad_norm": 0.9266983866691589, 
"learning_rate": 0.00013936040000000002, "loss": 0.2693, "step": 151600 }, { "epoch": 0.0634, "grad_norm": 0.1627340465784073, "learning_rate": 0.00013932040000000002, "loss": 0.2692, "step": 151700 }, { "epoch": 0.0636, "grad_norm": 0.20373468101024628, "learning_rate": 0.0001392804, "loss": 0.263, "step": 151800 }, { "epoch": 0.0638, "grad_norm": 0.18756113946437836, "learning_rate": 0.0001392404, "loss": 0.2629, "step": 151900 }, { "epoch": 0.064, "grad_norm": 0.21463698148727417, "learning_rate": 0.0001392004, "loss": 0.2727, "step": 152000 }, { "epoch": 0.0642, "grad_norm": 0.18175888061523438, "learning_rate": 0.00013916040000000002, "loss": 0.2732, "step": 152100 }, { "epoch": 0.0644, "grad_norm": 0.19156472384929657, "learning_rate": 0.0001391204, "loss": 0.2681, "step": 152200 }, { "epoch": 0.0646, "grad_norm": 0.1933964043855667, "learning_rate": 0.0001390804, "loss": 0.2669, "step": 152300 }, { "epoch": 0.0648, "grad_norm": 0.16370782256126404, "learning_rate": 0.0001390404, "loss": 0.2637, "step": 152400 }, { "epoch": 0.065, "grad_norm": 0.1704418957233429, "learning_rate": 0.00013900040000000002, "loss": 0.2651, "step": 152500 }, { "epoch": 0.0652, "grad_norm": 0.16683322191238403, "learning_rate": 0.0001389604, "loss": 0.2704, "step": 152600 }, { "epoch": 0.0654, "grad_norm": 0.2169724553823471, "learning_rate": 0.0001389204, "loss": 0.2701, "step": 152700 }, { "epoch": 0.0656, "grad_norm": 0.16984739899635315, "learning_rate": 0.0001388804, "loss": 0.267, "step": 152800 }, { "epoch": 0.0658, "grad_norm": 0.1774764508008957, "learning_rate": 0.0001388404, "loss": 0.2694, "step": 152900 }, { "epoch": 0.066, "grad_norm": 0.25803592801094055, "learning_rate": 0.00013880040000000001, "loss": 0.2692, "step": 153000 }, { "epoch": 0.0662, "grad_norm": 0.16305796802043915, "learning_rate": 0.0001387604, "loss": 0.2666, "step": 153100 }, { "epoch": 0.0664, "grad_norm": 0.28455424308776855, "learning_rate": 0.00013872040000000003, "loss": 0.2692, "step": 153200 
}, { "epoch": 0.0666, "grad_norm": 0.527635931968689, "learning_rate": 0.0001386804, "loss": 0.2709, "step": 153300 }, { "epoch": 0.0668, "grad_norm": 0.15370622277259827, "learning_rate": 0.00013864040000000002, "loss": 0.2709, "step": 153400 }, { "epoch": 0.067, "grad_norm": 0.17640765011310577, "learning_rate": 0.0001386004, "loss": 0.271, "step": 153500 }, { "epoch": 0.0672, "grad_norm": 0.19771219789981842, "learning_rate": 0.0001385604, "loss": 0.2709, "step": 153600 }, { "epoch": 0.0674, "grad_norm": 0.1551404446363449, "learning_rate": 0.00013852040000000002, "loss": 0.2661, "step": 153700 }, { "epoch": 0.0676, "grad_norm": 0.17845940589904785, "learning_rate": 0.0001384804, "loss": 0.2696, "step": 153800 }, { "epoch": 0.0678, "grad_norm": 0.1720665991306305, "learning_rate": 0.0001384404, "loss": 0.2648, "step": 153900 }, { "epoch": 0.068, "grad_norm": 0.1719100922346115, "learning_rate": 0.0001384004, "loss": 0.2688, "step": 154000 }, { "epoch": 0.0682, "grad_norm": 0.16675078868865967, "learning_rate": 0.00013836040000000002, "loss": 0.274, "step": 154100 }, { "epoch": 0.0684, "grad_norm": 0.21262894570827484, "learning_rate": 0.00013832040000000002, "loss": 0.2699, "step": 154200 }, { "epoch": 0.0686, "grad_norm": 0.21415004134178162, "learning_rate": 0.0001382804, "loss": 0.2634, "step": 154300 }, { "epoch": 0.0688, "grad_norm": 0.16432279348373413, "learning_rate": 0.0001382404, "loss": 0.2671, "step": 154400 }, { "epoch": 0.069, "grad_norm": 0.2584374248981476, "learning_rate": 0.0001382004, "loss": 0.2731, "step": 154500 }, { "epoch": 0.0692, "grad_norm": 0.1635134071111679, "learning_rate": 0.00013816040000000002, "loss": 0.2686, "step": 154600 }, { "epoch": 0.0694, "grad_norm": 0.23434379696846008, "learning_rate": 0.0001381204, "loss": 0.2749, "step": 154700 }, { "epoch": 0.0696, "grad_norm": 0.16734665632247925, "learning_rate": 0.0001380804, "loss": 0.2663, "step": 154800 }, { "epoch": 0.0698, "grad_norm": 0.17818975448608398, "learning_rate": 
0.0001380404, "loss": 0.2702, "step": 154900 }, { "epoch": 0.07, "grad_norm": 0.17465141415596008, "learning_rate": 0.0001380004, "loss": 0.2728, "step": 155000 }, { "epoch": 0.0702, "grad_norm": 0.24960975348949432, "learning_rate": 0.00013796040000000001, "loss": 0.2663, "step": 155100 }, { "epoch": 0.0704, "grad_norm": 0.22631630301475525, "learning_rate": 0.0001379204, "loss": 0.2728, "step": 155200 }, { "epoch": 0.0706, "grad_norm": 0.35504263639450073, "learning_rate": 0.0001378804, "loss": 0.2704, "step": 155300 }, { "epoch": 0.0708, "grad_norm": 0.19621668756008148, "learning_rate": 0.0001378404, "loss": 0.2723, "step": 155400 }, { "epoch": 0.071, "grad_norm": 0.14076755940914154, "learning_rate": 0.00013780040000000002, "loss": 0.2676, "step": 155500 }, { "epoch": 0.0712, "grad_norm": 0.19740091264247894, "learning_rate": 0.0001377604, "loss": 0.2654, "step": 155600 }, { "epoch": 0.0714, "grad_norm": 0.21776890754699707, "learning_rate": 0.0001377204, "loss": 0.2648, "step": 155700 }, { "epoch": 0.0716, "grad_norm": 0.19757398962974548, "learning_rate": 0.0001376804, "loss": 0.2706, "step": 155800 }, { "epoch": 0.0718, "grad_norm": 0.17804448306560516, "learning_rate": 0.0001376404, "loss": 0.2692, "step": 155900 }, { "epoch": 0.072, "grad_norm": 0.28381094336509705, "learning_rate": 0.0001376004, "loss": 0.2658, "step": 156000 }, { "epoch": 0.0722, "grad_norm": 0.19585195183753967, "learning_rate": 0.0001375604, "loss": 0.27, "step": 156100 }, { "epoch": 0.0724, "grad_norm": 0.1824631541967392, "learning_rate": 0.00013752040000000003, "loss": 0.2726, "step": 156200 }, { "epoch": 0.0726, "grad_norm": 0.16430000960826874, "learning_rate": 0.0001374804, "loss": 0.2702, "step": 156300 }, { "epoch": 0.0728, "grad_norm": 0.14714659750461578, "learning_rate": 0.0001374404, "loss": 0.2704, "step": 156400 }, { "epoch": 0.073, "grad_norm": 0.2480650097131729, "learning_rate": 0.0001374004, "loss": 0.2651, "step": 156500 }, { "epoch": 0.0732, "grad_norm": 
0.1619512140750885, "learning_rate": 0.0001373604, "loss": 0.2661, "step": 156600 }, { "epoch": 0.0734, "grad_norm": 0.16349893808364868, "learning_rate": 0.00013732040000000002, "loss": 0.2684, "step": 156700 }, { "epoch": 0.0736, "grad_norm": 0.31547239422798157, "learning_rate": 0.0001372804, "loss": 0.267, "step": 156800 }, { "epoch": 0.0738, "grad_norm": 0.17574888467788696, "learning_rate": 0.0001372404, "loss": 0.2712, "step": 156900 }, { "epoch": 0.074, "grad_norm": 0.21195268630981445, "learning_rate": 0.0001372004, "loss": 0.2689, "step": 157000 }, { "epoch": 0.0742, "grad_norm": 0.16163906455039978, "learning_rate": 0.00013716040000000002, "loss": 0.2696, "step": 157100 }, { "epoch": 0.0744, "grad_norm": 0.22510552406311035, "learning_rate": 0.00013712040000000002, "loss": 0.2691, "step": 157200 }, { "epoch": 0.0746, "grad_norm": 0.27251046895980835, "learning_rate": 0.0001370804, "loss": 0.2678, "step": 157300 }, { "epoch": 0.0748, "grad_norm": 0.24684669077396393, "learning_rate": 0.0001370404, "loss": 0.267, "step": 157400 }, { "epoch": 0.075, "grad_norm": 0.2159154713153839, "learning_rate": 0.0001370004, "loss": 0.2792, "step": 157500 }, { "epoch": 0.0752, "grad_norm": 0.16219884157180786, "learning_rate": 0.00013696040000000002, "loss": 0.2714, "step": 157600 }, { "epoch": 0.0754, "grad_norm": 0.28837162256240845, "learning_rate": 0.0001369204, "loss": 0.2668, "step": 157700 }, { "epoch": 0.0756, "grad_norm": 0.23692043125629425, "learning_rate": 0.0001368804, "loss": 0.2673, "step": 157800 }, { "epoch": 0.0758, "grad_norm": 0.1836722046136856, "learning_rate": 0.0001368404, "loss": 0.2685, "step": 157900 }, { "epoch": 0.076, "grad_norm": 0.18108327686786652, "learning_rate": 0.00013680040000000002, "loss": 0.2725, "step": 158000 }, { "epoch": 0.0762, "grad_norm": 0.14798210561275482, "learning_rate": 0.0001367604, "loss": 0.2682, "step": 158100 }, { "epoch": 0.0764, "grad_norm": 0.3181903064250946, "learning_rate": 0.0001367204, "loss": 0.2608, 
"step": 158200 }, { "epoch": 0.0766, "grad_norm": 0.15039204061031342, "learning_rate": 0.0001366804, "loss": 0.2676, "step": 158300 }, { "epoch": 0.0768, "grad_norm": 0.18739774823188782, "learning_rate": 0.0001366404, "loss": 0.269, "step": 158400 }, { "epoch": 0.077, "grad_norm": 0.28927820920944214, "learning_rate": 0.00013660040000000001, "loss": 0.2656, "step": 158500 }, { "epoch": 0.0772, "grad_norm": 0.174117773771286, "learning_rate": 0.0001365604, "loss": 0.2639, "step": 158600 }, { "epoch": 0.0774, "grad_norm": 0.20544083416461945, "learning_rate": 0.0001365204, "loss": 0.2686, "step": 158700 }, { "epoch": 0.0776, "grad_norm": 0.219607412815094, "learning_rate": 0.0001364804, "loss": 0.2665, "step": 158800 }, { "epoch": 0.0778, "grad_norm": 0.16388162970542908, "learning_rate": 0.0001364404, "loss": 0.2676, "step": 158900 }, { "epoch": 0.078, "grad_norm": 0.2054087072610855, "learning_rate": 0.0001364004, "loss": 0.2767, "step": 159000 }, { "epoch": 0.0782, "grad_norm": 0.15745876729488373, "learning_rate": 0.0001363604, "loss": 0.2713, "step": 159100 }, { "epoch": 0.0784, "grad_norm": 0.17080196738243103, "learning_rate": 0.00013632040000000002, "loss": 0.2622, "step": 159200 }, { "epoch": 0.0786, "grad_norm": 0.2554245591163635, "learning_rate": 0.0001362804, "loss": 0.2658, "step": 159300 }, { "epoch": 0.0788, "grad_norm": 0.25696536898612976, "learning_rate": 0.0001362404, "loss": 0.2684, "step": 159400 }, { "epoch": 0.079, "grad_norm": 0.15896977484226227, "learning_rate": 0.0001362004, "loss": 0.2698, "step": 159500 }, { "epoch": 0.0792, "grad_norm": 0.17716248333454132, "learning_rate": 0.0001361604, "loss": 0.2713, "step": 159600 }, { "epoch": 0.0794, "grad_norm": 0.2042681723833084, "learning_rate": 0.00013612040000000002, "loss": 0.272, "step": 159700 }, { "epoch": 0.0796, "grad_norm": 0.21027176082134247, "learning_rate": 0.00013608039999999999, "loss": 0.2701, "step": 159800 }, { "epoch": 0.0798, "grad_norm": 0.17798981070518494, 
"learning_rate": 0.0001360404, "loss": 0.2722, "step": 159900 }, { "epoch": 0.08, "grad_norm": 0.21452707052230835, "learning_rate": 0.0001360004, "loss": 0.2704, "step": 160000 }, { "epoch": 0.0802, "grad_norm": 0.19280506670475006, "learning_rate": 0.00013596040000000002, "loss": 0.2714, "step": 160100 }, { "epoch": 0.0804, "grad_norm": 0.1743934005498886, "learning_rate": 0.00013592040000000001, "loss": 0.2681, "step": 160200 }, { "epoch": 0.0806, "grad_norm": 0.20632795989513397, "learning_rate": 0.0001358804, "loss": 0.2767, "step": 160300 }, { "epoch": 0.0808, "grad_norm": 0.18461468815803528, "learning_rate": 0.0001358404, "loss": 0.2716, "step": 160400 }, { "epoch": 0.081, "grad_norm": 0.18317373096942902, "learning_rate": 0.0001358004, "loss": 0.2742, "step": 160500 }, { "epoch": 0.0812, "grad_norm": 0.18154622614383698, "learning_rate": 0.00013576040000000002, "loss": 0.2647, "step": 160600 }, { "epoch": 0.0814, "grad_norm": 0.20829536020755768, "learning_rate": 0.0001357204, "loss": 0.2679, "step": 160700 }, { "epoch": 0.0816, "grad_norm": 0.19264093041419983, "learning_rate": 0.0001356804, "loss": 0.2655, "step": 160800 }, { "epoch": 0.0818, "grad_norm": 0.22138293087482452, "learning_rate": 0.0001356404, "loss": 0.272, "step": 160900 }, { "epoch": 0.082, "grad_norm": 0.18807452917099, "learning_rate": 0.00013560040000000002, "loss": 0.265, "step": 161000 }, { "epoch": 0.0822, "grad_norm": 0.1932212859392166, "learning_rate": 0.0001355604, "loss": 0.259, "step": 161100 }, { "epoch": 0.0824, "grad_norm": 0.1871035099029541, "learning_rate": 0.0001355204, "loss": 0.2631, "step": 161200 }, { "epoch": 0.0826, "grad_norm": 0.16003726422786713, "learning_rate": 0.0001354804, "loss": 0.2654, "step": 161300 }, { "epoch": 0.0828, "grad_norm": 0.1761871874332428, "learning_rate": 0.0001354404, "loss": 0.2669, "step": 161400 }, { "epoch": 0.083, "grad_norm": 0.27383115887641907, "learning_rate": 0.0001354004, "loss": 0.2621, "step": 161500 }, { "epoch": 0.0832, 
"grad_norm": 0.1668674498796463, "learning_rate": 0.0001353604, "loss": 0.2665, "step": 161600 }, { "epoch": 0.0834, "grad_norm": 0.19219066202640533, "learning_rate": 0.00013532040000000003, "loss": 0.2672, "step": 161700 }, { "epoch": 0.0836, "grad_norm": 0.2823545038700104, "learning_rate": 0.0001352804, "loss": 0.2655, "step": 161800 }, { "epoch": 0.0838, "grad_norm": 0.18881294131278992, "learning_rate": 0.00013524040000000001, "loss": 0.266, "step": 161900 }, { "epoch": 0.084, "grad_norm": 0.1976308971643448, "learning_rate": 0.0001352004, "loss": 0.2691, "step": 162000 }, { "epoch": 0.0842, "grad_norm": 0.22563625872135162, "learning_rate": 0.0001351604, "loss": 0.2655, "step": 162100 }, { "epoch": 0.0844, "grad_norm": 0.18993031978607178, "learning_rate": 0.00013512040000000002, "loss": 0.269, "step": 162200 }, { "epoch": 0.0846, "grad_norm": 0.2097342163324356, "learning_rate": 0.0001350804, "loss": 0.2652, "step": 162300 }, { "epoch": 0.0848, "grad_norm": 0.1664929836988449, "learning_rate": 0.0001350404, "loss": 0.2654, "step": 162400 }, { "epoch": 0.085, "grad_norm": 0.1798952966928482, "learning_rate": 0.0001350004, "loss": 0.2661, "step": 162500 }, { "epoch": 0.0852, "grad_norm": 0.17582497000694275, "learning_rate": 0.0001349604, "loss": 0.2651, "step": 162600 }, { "epoch": 0.0854, "grad_norm": 0.28709277510643005, "learning_rate": 0.00013492040000000002, "loss": 0.2669, "step": 162700 }, { "epoch": 0.0856, "grad_norm": 0.1520322561264038, "learning_rate": 0.00013488039999999998, "loss": 0.2652, "step": 162800 }, { "epoch": 0.0858, "grad_norm": 0.1492086499929428, "learning_rate": 0.0001348404, "loss": 0.2667, "step": 162900 }, { "epoch": 0.086, "grad_norm": 0.25710374116897583, "learning_rate": 0.0001348004, "loss": 0.2697, "step": 163000 }, { "epoch": 0.0862, "grad_norm": 0.20499125123023987, "learning_rate": 0.00013476040000000002, "loss": 0.264, "step": 163100 }, { "epoch": 0.0864, "grad_norm": 0.19236227869987488, "learning_rate": 0.0001347204, 
"loss": 0.2692, "step": 163200 }, { "epoch": 0.0866, "grad_norm": 0.1704840511083603, "learning_rate": 0.0001346804, "loss": 0.2857, "step": 163300 }, { "epoch": 0.0868, "grad_norm": 0.3140028417110443, "learning_rate": 0.0001346404, "loss": 0.2689, "step": 163400 }, { "epoch": 0.087, "grad_norm": 0.16201360523700714, "learning_rate": 0.0001346004, "loss": 0.2646, "step": 163500 }, { "epoch": 0.0872, "grad_norm": 0.22757022082805634, "learning_rate": 0.00013456040000000001, "loss": 0.266, "step": 163600 }, { "epoch": 0.0874, "grad_norm": 0.15154384076595306, "learning_rate": 0.0001345204, "loss": 0.2688, "step": 163700 }, { "epoch": 0.0876, "grad_norm": 0.2439052164554596, "learning_rate": 0.00013448040000000003, "loss": 0.2656, "step": 163800 }, { "epoch": 0.0878, "grad_norm": 0.1832723617553711, "learning_rate": 0.0001344404, "loss": 0.2657, "step": 163900 }, { "epoch": 0.088, "grad_norm": 0.2097846120595932, "learning_rate": 0.00013440040000000002, "loss": 0.2633, "step": 164000 }, { "epoch": 0.0882, "grad_norm": 0.2136882096529007, "learning_rate": 0.0001343604, "loss": 0.2756, "step": 164100 }, { "epoch": 0.0884, "grad_norm": 0.20696696639060974, "learning_rate": 0.0001343204, "loss": 0.2675, "step": 164200 }, { "epoch": 0.0886, "grad_norm": 0.2125898003578186, "learning_rate": 0.00013428040000000002, "loss": 0.2728, "step": 164300 }, { "epoch": 0.0888, "grad_norm": 0.16671323776245117, "learning_rate": 0.0001342404, "loss": 0.2667, "step": 164400 }, { "epoch": 0.089, "grad_norm": 0.18992967903614044, "learning_rate": 0.0001342004, "loss": 0.2666, "step": 164500 }, { "epoch": 0.0892, "grad_norm": 0.167401522397995, "learning_rate": 0.0001341604, "loss": 0.2649, "step": 164600 }, { "epoch": 0.0894, "grad_norm": 0.18029069900512695, "learning_rate": 0.00013412040000000002, "loss": 0.2683, "step": 164700 }, { "epoch": 0.0896, "grad_norm": 0.17287403345108032, "learning_rate": 0.00013408040000000002, "loss": 0.2697, "step": 164800 }, { "epoch": 0.0898, 
"grad_norm": 0.15097787976264954, "learning_rate": 0.0001340404, "loss": 0.265, "step": 164900 }, { "epoch": 0.09, "grad_norm": 0.1807047575712204, "learning_rate": 0.0001340004, "loss": 0.2669, "step": 165000 }, { "epoch": 0.0902, "grad_norm": 0.20332248508930206, "learning_rate": 0.0001339604, "loss": 0.2699, "step": 165100 }, { "epoch": 0.0904, "grad_norm": 0.24715851247310638, "learning_rate": 0.00013392040000000002, "loss": 0.2661, "step": 165200 }, { "epoch": 0.0906, "grad_norm": 0.1838693916797638, "learning_rate": 0.0001338804, "loss": 0.2645, "step": 165300 }, { "epoch": 0.0908, "grad_norm": 0.29177427291870117, "learning_rate": 0.0001338404, "loss": 0.2686, "step": 165400 }, { "epoch": 0.091, "grad_norm": 0.16363517940044403, "learning_rate": 0.0001338004, "loss": 0.2663, "step": 165500 }, { "epoch": 0.0912, "grad_norm": 0.17923811078071594, "learning_rate": 0.00013376040000000002, "loss": 0.2637, "step": 165600 }, { "epoch": 0.0914, "grad_norm": 0.14745573699474335, "learning_rate": 0.00013372040000000001, "loss": 0.2644, "step": 165700 }, { "epoch": 0.0916, "grad_norm": 0.17744286358356476, "learning_rate": 0.0001336804, "loss": 0.2697, "step": 165800 }, { "epoch": 0.0918, "grad_norm": 0.2632181942462921, "learning_rate": 0.0001336404, "loss": 0.2622, "step": 165900 }, { "epoch": 0.092, "grad_norm": 0.22580741345882416, "learning_rate": 0.0001336004, "loss": 0.2623, "step": 166000 }, { "epoch": 0.0922, "grad_norm": 0.2017957717180252, "learning_rate": 0.00013356040000000002, "loss": 0.2723, "step": 166100 }, { "epoch": 0.0924, "grad_norm": 0.20650072395801544, "learning_rate": 0.0001335204, "loss": 0.2646, "step": 166200 }, { "epoch": 0.0926, "grad_norm": 0.19651727378368378, "learning_rate": 0.0001334804, "loss": 0.2638, "step": 166300 }, { "epoch": 0.0928, "grad_norm": 0.16937686502933502, "learning_rate": 0.0001334404, "loss": 0.2627, "step": 166400 }, { "epoch": 0.093, "grad_norm": 0.16137665510177612, "learning_rate": 0.0001334004, "loss": 0.2643, 
"step": 166500 }, { "epoch": 0.0932, "grad_norm": 0.18096907436847687, "learning_rate": 0.0001333604, "loss": 0.2623, "step": 166600 }, { "epoch": 0.0934, "grad_norm": 0.1687513142824173, "learning_rate": 0.0001333204, "loss": 0.2679, "step": 166700 }, { "epoch": 0.0936, "grad_norm": 0.18491438031196594, "learning_rate": 0.00013328040000000003, "loss": 0.2663, "step": 166800 }, { "epoch": 0.0938, "grad_norm": 0.21767160296440125, "learning_rate": 0.0001332404, "loss": 0.2614, "step": 166900 }, { "epoch": 0.094, "grad_norm": 0.18175816535949707, "learning_rate": 0.0001332004, "loss": 0.2638, "step": 167000 }, { "epoch": 0.0942, "grad_norm": 0.17034652829170227, "learning_rate": 0.0001331604, "loss": 0.2656, "step": 167100 }, { "epoch": 0.0944, "grad_norm": 0.1829775869846344, "learning_rate": 0.0001331204, "loss": 0.2723, "step": 167200 }, { "epoch": 0.0946, "grad_norm": 0.16462214291095734, "learning_rate": 0.00013308040000000002, "loss": 0.2678, "step": 167300 }, { "epoch": 0.0948, "grad_norm": 0.18933749198913574, "learning_rate": 0.0001330404, "loss": 0.2598, "step": 167400 }, { "epoch": 0.095, "grad_norm": 0.2435947209596634, "learning_rate": 0.0001330004, "loss": 0.2648, "step": 167500 }, { "epoch": 0.0952, "grad_norm": 0.20233914256095886, "learning_rate": 0.0001329604, "loss": 0.2659, "step": 167600 }, { "epoch": 0.0954, "grad_norm": 0.17714782059192657, "learning_rate": 0.00013292040000000002, "loss": 0.2644, "step": 167700 }, { "epoch": 0.0956, "grad_norm": 0.14922687411308289, "learning_rate": 0.00013288040000000002, "loss": 0.2695, "step": 167800 }, { "epoch": 0.0958, "grad_norm": 0.1976463347673416, "learning_rate": 0.0001328404, "loss": 0.2628, "step": 167900 }, { "epoch": 0.096, "grad_norm": 0.18765133619308472, "learning_rate": 0.0001328004, "loss": 0.2612, "step": 168000 }, { "epoch": 0.0962, "grad_norm": 0.2899118959903717, "learning_rate": 0.0001327604, "loss": 0.2677, "step": 168100 }, { "epoch": 0.0964, "grad_norm": 0.20916017889976501, 
"learning_rate": 0.00013272040000000002, "loss": 0.2664, "step": 168200 }, { "epoch": 0.0966, "grad_norm": 0.20737506449222565, "learning_rate": 0.0001326804, "loss": 0.2585, "step": 168300 }, { "epoch": 0.0968, "grad_norm": 0.16495130956172943, "learning_rate": 0.0001326404, "loss": 0.2618, "step": 168400 }, { "epoch": 0.097, "grad_norm": 0.3457525074481964, "learning_rate": 0.0001326004, "loss": 0.2603, "step": 168500 }, { "epoch": 0.0972, "grad_norm": 0.16713711619377136, "learning_rate": 0.00013256040000000002, "loss": 0.2631, "step": 168600 }, { "epoch": 0.0974, "grad_norm": 0.4487501382827759, "learning_rate": 0.0001325204, "loss": 0.2655, "step": 168700 }, { "epoch": 0.0976, "grad_norm": 0.16808322072029114, "learning_rate": 0.0001324804, "loss": 0.265, "step": 168800 }, { "epoch": 0.0978, "grad_norm": 0.1778227984905243, "learning_rate": 0.0001324404, "loss": 0.2629, "step": 168900 }, { "epoch": 0.098, "grad_norm": 0.30036282539367676, "learning_rate": 0.0001324004, "loss": 0.269, "step": 169000 }, { "epoch": 0.0982, "grad_norm": 0.5122041702270508, "learning_rate": 0.00013236040000000001, "loss": 0.2575, "step": 169100 }, { "epoch": 0.0984, "grad_norm": 0.14579719305038452, "learning_rate": 0.0001323204, "loss": 0.2615, "step": 169200 }, { "epoch": 0.0986, "grad_norm": 0.2066614180803299, "learning_rate": 0.00013228040000000003, "loss": 0.2618, "step": 169300 }, { "epoch": 0.0988, "grad_norm": 0.20652034878730774, "learning_rate": 0.0001322404, "loss": 0.2491, "step": 169400 }, { "epoch": 0.099, "grad_norm": 0.23203743994235992, "learning_rate": 0.0001322004, "loss": 0.2618, "step": 169500 }, { "epoch": 0.0992, "grad_norm": 0.2068849503993988, "learning_rate": 0.0001321604, "loss": 0.2606, "step": 169600 }, { "epoch": 0.0994, "grad_norm": 0.19553183019161224, "learning_rate": 0.0001321204, "loss": 0.2661, "step": 169700 }, { "epoch": 0.0996, "grad_norm": 0.17411251366138458, "learning_rate": 0.00013208040000000002, "loss": 0.2633, "step": 169800 }, { 
"epoch": 0.0998, "grad_norm": 0.23143243789672852, "learning_rate": 0.0001320404, "loss": 0.2562, "step": 169900 }, { "epoch": 0.1, "grad_norm": 0.18598131835460663, "learning_rate": 0.0001320004, "loss": 0.2585, "step": 170000 }, { "epoch": 0.1002, "grad_norm": 0.2250274121761322, "learning_rate": 0.0001319604, "loss": 0.2631, "step": 170100 }, { "epoch": 0.1004, "grad_norm": 0.2246299684047699, "learning_rate": 0.0001319204, "loss": 0.2663, "step": 170200 }, { "epoch": 0.1006, "grad_norm": 0.18941205739974976, "learning_rate": 0.00013188040000000002, "loss": 0.2609, "step": 170300 }, { "epoch": 0.1008, "grad_norm": 0.16915076971054077, "learning_rate": 0.00013184039999999999, "loss": 0.2615, "step": 170400 }, { "epoch": 0.101, "grad_norm": 0.18507012724876404, "learning_rate": 0.0001318004, "loss": 0.2668, "step": 170500 }, { "epoch": 0.1012, "grad_norm": 0.19404035806655884, "learning_rate": 0.0001317604, "loss": 0.2604, "step": 170600 }, { "epoch": 0.1014, "grad_norm": 0.20818071067333221, "learning_rate": 0.00013172040000000002, "loss": 0.2635, "step": 170700 }, { "epoch": 0.1016, "grad_norm": 0.20802633464336395, "learning_rate": 0.00013168040000000001, "loss": 0.2608, "step": 170800 }, { "epoch": 0.1018, "grad_norm": 0.25072330236434937, "learning_rate": 0.0001316404, "loss": 0.2589, "step": 170900 }, { "epoch": 0.102, "grad_norm": 0.1490134745836258, "learning_rate": 0.0001316004, "loss": 0.2642, "step": 171000 }, { "epoch": 0.1022, "grad_norm": 0.15050648152828217, "learning_rate": 0.0001315604, "loss": 0.2625, "step": 171100 }, { "epoch": 0.1024, "grad_norm": 0.16798128187656403, "learning_rate": 0.00013152040000000002, "loss": 0.2601, "step": 171200 }, { "epoch": 0.1026, "grad_norm": 0.1618351936340332, "learning_rate": 0.0001314804, "loss": 0.2618, "step": 171300 }, { "epoch": 0.1028, "grad_norm": 0.19324688613414764, "learning_rate": 0.0001314404, "loss": 0.2617, "step": 171400 }, { "epoch": 0.103, "grad_norm": 0.16977021098136902, "learning_rate": 
0.0001314004, "loss": 0.2583, "step": 171500 }, { "epoch": 0.1032, "grad_norm": 0.1766977161169052, "learning_rate": 0.00013136040000000002, "loss": 0.2617, "step": 171600 }, { "epoch": 0.1034, "grad_norm": 0.16178765892982483, "learning_rate": 0.0001313204, "loss": 0.2735, "step": 171700 }, { "epoch": 0.1036, "grad_norm": 0.2680456340312958, "learning_rate": 0.0001312804, "loss": 0.261, "step": 171800 }, { "epoch": 0.1038, "grad_norm": 0.1756458580493927, "learning_rate": 0.0001312404, "loss": 0.2736, "step": 171900 }, { "epoch": 0.104, "grad_norm": 0.14756278693675995, "learning_rate": 0.0001312004, "loss": 0.2646, "step": 172000 }, { "epoch": 0.1042, "grad_norm": 0.4272075593471527, "learning_rate": 0.0001311604, "loss": 0.2837, "step": 172100 }, { "epoch": 0.1044, "grad_norm": 0.2615519165992737, "learning_rate": 0.0001311204, "loss": 0.2742, "step": 172200 }, { "epoch": 0.1046, "grad_norm": 0.15839534997940063, "learning_rate": 0.00013108040000000003, "loss": 0.2642, "step": 172300 }, { "epoch": 0.1048, "grad_norm": 0.20636843144893646, "learning_rate": 0.0001310404, "loss": 0.2663, "step": 172400 }, { "epoch": 0.105, "grad_norm": 0.16299419105052948, "learning_rate": 0.00013100040000000001, "loss": 0.2599, "step": 172500 }, { "epoch": 0.1052, "grad_norm": 0.19725507497787476, "learning_rate": 0.0001309604, "loss": 0.2705, "step": 172600 }, { "epoch": 0.1054, "grad_norm": 0.20907683670520782, "learning_rate": 0.0001309204, "loss": 0.2637, "step": 172700 }, { "epoch": 0.1056, "grad_norm": 0.1777603030204773, "learning_rate": 0.00013088040000000002, "loss": 0.2696, "step": 172800 }, { "epoch": 0.1058, "grad_norm": 0.2649913430213928, "learning_rate": 0.0001308404, "loss": 0.2602, "step": 172900 }, { "epoch": 0.106, "grad_norm": 0.21683943271636963, "learning_rate": 0.0001308004, "loss": 0.268, "step": 173000 }, { "epoch": 0.1062, "grad_norm": 0.1863240897655487, "learning_rate": 0.0001307604, "loss": 0.2699, "step": 173100 }, { "epoch": 0.1064, "grad_norm": 
0.38173478841781616, "learning_rate": 0.0001307204, "loss": 0.2741, "step": 173200 }, { "epoch": 0.1066, "grad_norm": 0.19350466132164001, "learning_rate": 0.00013068040000000002, "loss": 0.264, "step": 173300 }, { "epoch": 0.1068, "grad_norm": 0.21752768754959106, "learning_rate": 0.00013064039999999998, "loss": 0.2638, "step": 173400 }, { "epoch": 0.107, "grad_norm": 0.22547826170921326, "learning_rate": 0.0001306004, "loss": 0.2669, "step": 173500 }, { "epoch": 0.1072, "grad_norm": 0.17793551087379456, "learning_rate": 0.0001305604, "loss": 0.2668, "step": 173600 }, { "epoch": 0.1074, "grad_norm": 0.18250034749507904, "learning_rate": 0.00013052040000000002, "loss": 0.2634, "step": 173700 }, { "epoch": 0.1076, "grad_norm": 0.17843498289585114, "learning_rate": 0.0001304804, "loss": 0.2632, "step": 173800 }, { "epoch": 0.1078, "grad_norm": 0.2799086570739746, "learning_rate": 0.0001304404, "loss": 0.2619, "step": 173900 }, { "epoch": 0.108, "grad_norm": 0.2233753800392151, "learning_rate": 0.0001304004, "loss": 0.2625, "step": 174000 }, { "epoch": 0.1082, "grad_norm": 0.18286100029945374, "learning_rate": 0.0001303604, "loss": 0.2634, "step": 174100 }, { "epoch": 0.1084, "grad_norm": 0.1985454261302948, "learning_rate": 0.0001303204, "loss": 0.2626, "step": 174200 }, { "epoch": 0.1086, "grad_norm": 0.191930890083313, "learning_rate": 0.0001302804, "loss": 0.261, "step": 174300 }, { "epoch": 0.1088, "grad_norm": 0.17980682849884033, "learning_rate": 0.0001302404, "loss": 0.264, "step": 174400 }, { "epoch": 0.109, "grad_norm": 0.16460399329662323, "learning_rate": 0.0001302004, "loss": 0.2657, "step": 174500 }, { "epoch": 0.1092, "grad_norm": 0.22608372569084167, "learning_rate": 0.00013016040000000002, "loss": 0.2671, "step": 174600 }, { "epoch": 0.1094, "grad_norm": 0.22313453257083893, "learning_rate": 0.0001301204, "loss": 0.2633, "step": 174700 }, { "epoch": 0.1096, "grad_norm": 0.20686650276184082, "learning_rate": 0.0001300804, "loss": 0.2655, "step": 174800 
}, { "epoch": 0.1098, "grad_norm": 0.23883718252182007, "learning_rate": 0.0001300404, "loss": 0.2633, "step": 174900 }, { "epoch": 0.11, "grad_norm": 0.19800545275211334, "learning_rate": 0.0001300004, "loss": 0.268, "step": 175000 }, { "epoch": 0.1102, "grad_norm": 0.18685345351696014, "learning_rate": 0.0001299604, "loss": 0.2678, "step": 175100 }, { "epoch": 0.1104, "grad_norm": 0.22192412614822388, "learning_rate": 0.0001299204, "loss": 0.2689, "step": 175200 }, { "epoch": 0.1106, "grad_norm": 0.20726250112056732, "learning_rate": 0.00012988040000000002, "loss": 0.2613, "step": 175300 }, { "epoch": 0.1108, "grad_norm": 0.2257356494665146, "learning_rate": 0.0001298404, "loss": 0.264, "step": 175400 }, { "epoch": 0.111, "grad_norm": 0.21950094401836395, "learning_rate": 0.0001298004, "loss": 0.2664, "step": 175500 }, { "epoch": 0.1112, "grad_norm": 0.18882116675376892, "learning_rate": 0.0001297604, "loss": 0.2719, "step": 175600 }, { "epoch": 0.1114, "grad_norm": 0.2492358237504959, "learning_rate": 0.0001297204, "loss": 0.2588, "step": 175700 }, { "epoch": 0.1116, "grad_norm": 0.15519103407859802, "learning_rate": 0.00012968040000000002, "loss": 0.2629, "step": 175800 }, { "epoch": 0.1118, "grad_norm": 0.16526469588279724, "learning_rate": 0.00012964039999999999, "loss": 0.2594, "step": 175900 }, { "epoch": 0.112, "grad_norm": 0.1781369298696518, "learning_rate": 0.0001296004, "loss": 0.2638, "step": 176000 }, { "epoch": 0.1122, "grad_norm": 0.19480067491531372, "learning_rate": 0.0001295604, "loss": 0.263, "step": 176100 }, { "epoch": 0.1124, "grad_norm": 0.20331743359565735, "learning_rate": 0.00012952040000000002, "loss": 0.2595, "step": 176200 }, { "epoch": 0.1126, "grad_norm": 0.404156357049942, "learning_rate": 0.00012948040000000001, "loss": 0.264, "step": 176300 }, { "epoch": 0.1128, "grad_norm": 0.17432494461536407, "learning_rate": 0.0001294404, "loss": 0.2616, "step": 176400 }, { "epoch": 0.113, "grad_norm": 0.17324478924274445, "learning_rate": 
0.0001294004, "loss": 0.2566, "step": 176500 }, { "epoch": 0.1132, "grad_norm": 0.20663540065288544, "learning_rate": 0.0001293604, "loss": 0.2659, "step": 176600 }, { "epoch": 0.1134, "grad_norm": 0.16922618448734283, "learning_rate": 0.00012932040000000002, "loss": 0.2621, "step": 176700 }, { "epoch": 0.1136, "grad_norm": 0.1731468141078949, "learning_rate": 0.0001292804, "loss": 0.2583, "step": 176800 }, { "epoch": 0.1138, "grad_norm": 0.3375247120857239, "learning_rate": 0.0001292404, "loss": 0.2624, "step": 176900 }, { "epoch": 0.114, "grad_norm": 0.14527542889118195, "learning_rate": 0.0001292004, "loss": 0.2598, "step": 177000 }, { "epoch": 0.1142, "grad_norm": 0.18705546855926514, "learning_rate": 0.0001291604, "loss": 0.2602, "step": 177100 }, { "epoch": 0.1144, "grad_norm": 0.23862403631210327, "learning_rate": 0.0001291204, "loss": 0.2568, "step": 177200 }, { "epoch": 0.1146, "grad_norm": 0.21297907829284668, "learning_rate": 0.0001290804, "loss": 0.2687, "step": 177300 }, { "epoch": 0.1148, "grad_norm": 0.24559397995471954, "learning_rate": 0.00012904040000000003, "loss": 0.2638, "step": 177400 }, { "epoch": 0.115, "grad_norm": 0.1739484816789627, "learning_rate": 0.0001290004, "loss": 0.26, "step": 177500 }, { "epoch": 0.1152, "grad_norm": 0.1745755672454834, "learning_rate": 0.0001289604, "loss": 0.2781, "step": 177600 }, { "epoch": 0.1154, "grad_norm": 0.4448339343070984, "learning_rate": 0.0001289204, "loss": 0.2606, "step": 177700 }, { "epoch": 0.1156, "grad_norm": 0.18532177805900574, "learning_rate": 0.0001288804, "loss": 0.2695, "step": 177800 }, { "epoch": 0.1158, "grad_norm": 0.20550072193145752, "learning_rate": 0.00012884040000000002, "loss": 0.2586, "step": 177900 }, { "epoch": 0.116, "grad_norm": 0.18768703937530518, "learning_rate": 0.0001288004, "loss": 0.2581, "step": 178000 }, { "epoch": 0.1162, "grad_norm": 0.21779786050319672, "learning_rate": 0.0001287604, "loss": 0.2598, "step": 178100 }, { "epoch": 0.1164, "grad_norm": 
0.16268861293792725, "learning_rate": 0.0001287204, "loss": 0.2639, "step": 178200 }, { "epoch": 0.1166, "grad_norm": 0.18373139202594757, "learning_rate": 0.00012868040000000002, "loss": 0.2573, "step": 178300 }, { "epoch": 0.1168, "grad_norm": 0.17486025393009186, "learning_rate": 0.00012864040000000002, "loss": 0.2628, "step": 178400 }, { "epoch": 0.117, "grad_norm": 0.17271597683429718, "learning_rate": 0.0001286004, "loss": 0.2603, "step": 178500 }, { "epoch": 0.1172, "grad_norm": 0.15884092450141907, "learning_rate": 0.0001285604, "loss": 0.2616, "step": 178600 }, { "epoch": 0.1174, "grad_norm": 0.22521570324897766, "learning_rate": 0.0001285204, "loss": 0.2596, "step": 178700 }, { "epoch": 0.1176, "grad_norm": 0.1907728910446167, "learning_rate": 0.00012848040000000002, "loss": 0.2585, "step": 178800 }, { "epoch": 0.1178, "grad_norm": 0.24702465534210205, "learning_rate": 0.0001284404, "loss": 0.268, "step": 178900 }, { "epoch": 0.118, "grad_norm": 0.15964145958423615, "learning_rate": 0.0001284004, "loss": 0.2633, "step": 179000 }, { "epoch": 0.1182, "grad_norm": 0.17996646463871002, "learning_rate": 0.0001283604, "loss": 0.2581, "step": 179100 }, { "epoch": 0.1184, "grad_norm": 0.1755184382200241, "learning_rate": 0.00012832040000000002, "loss": 0.26, "step": 179200 }, { "epoch": 0.1186, "grad_norm": 0.16567598283290863, "learning_rate": 0.0001282804, "loss": 0.2568, "step": 179300 }, { "epoch": 0.1188, "grad_norm": 0.16007539629936218, "learning_rate": 0.0001282404, "loss": 0.262, "step": 179400 }, { "epoch": 0.119, "grad_norm": 0.28034693002700806, "learning_rate": 0.0001282004, "loss": 0.2603, "step": 179500 }, { "epoch": 0.1192, "grad_norm": 0.28950920701026917, "learning_rate": 0.0001281604, "loss": 0.2611, "step": 179600 }, { "epoch": 0.1194, "grad_norm": 0.18529054522514343, "learning_rate": 0.00012812040000000001, "loss": 0.2574, "step": 179700 }, { "epoch": 0.1196, "grad_norm": 0.18269339203834534, "learning_rate": 0.0001280804, "loss": 0.2644, 
"step": 179800 }, { "epoch": 0.1198, "grad_norm": 0.27880364656448364, "learning_rate": 0.00012804040000000003, "loss": 0.2618, "step": 179900 }, { "epoch": 0.12, "grad_norm": 0.1725231260061264, "learning_rate": 0.0001280004, "loss": 0.2599, "step": 180000 }, { "epoch": 0.1202, "grad_norm": 0.2305331975221634, "learning_rate": 0.00012796040000000002, "loss": 0.2625, "step": 180100 }, { "epoch": 0.1204, "grad_norm": 0.14379462599754333, "learning_rate": 0.0001279204, "loss": 0.2627, "step": 180200 }, { "epoch": 0.1206, "grad_norm": 0.16537553071975708, "learning_rate": 0.0001278804, "loss": 0.2721, "step": 180300 }, { "epoch": 0.1208, "grad_norm": 0.21422922611236572, "learning_rate": 0.00012784040000000002, "loss": 0.2627, "step": 180400 }, { "epoch": 0.121, "grad_norm": 0.1767064481973648, "learning_rate": 0.0001278004, "loss": 0.2594, "step": 180500 }, { "epoch": 0.1212, "grad_norm": 0.23920567333698273, "learning_rate": 0.0001277604, "loss": 0.2595, "step": 180600 }, { "epoch": 0.1214, "grad_norm": 0.16521887481212616, "learning_rate": 0.0001277204, "loss": 0.2612, "step": 180700 }, { "epoch": 0.1216, "grad_norm": 0.1697589010000229, "learning_rate": 0.0001276804, "loss": 0.2619, "step": 180800 }, { "epoch": 0.1218, "grad_norm": 0.20626257359981537, "learning_rate": 0.00012764040000000002, "loss": 0.2642, "step": 180900 }, { "epoch": 0.122, "grad_norm": 0.16152922809123993, "learning_rate": 0.00012760039999999999, "loss": 0.2637, "step": 181000 }, { "epoch": 0.1222, "grad_norm": 0.15324938297271729, "learning_rate": 0.0001275604, "loss": 0.2557, "step": 181100 }, { "epoch": 0.1224, "grad_norm": 0.18400631844997406, "learning_rate": 0.0001275204, "loss": 0.2643, "step": 181200 }, { "epoch": 0.1226, "grad_norm": 0.1637372076511383, "learning_rate": 0.00012748040000000002, "loss": 0.2673, "step": 181300 }, { "epoch": 0.1228, "grad_norm": 0.23258568346500397, "learning_rate": 0.00012744040000000001, "loss": 0.2619, "step": 181400 }, { "epoch": 0.123, "grad_norm": 
0.16462406516075134, "learning_rate": 0.0001274004, "loss": 0.2589, "step": 181500 }, { "epoch": 0.1232, "grad_norm": 0.19165945053100586, "learning_rate": 0.0001273604, "loss": 0.2649, "step": 181600 }, { "epoch": 0.1234, "grad_norm": 0.23923306167125702, "learning_rate": 0.0001273204, "loss": 0.2711, "step": 181700 }, { "epoch": 0.1236, "grad_norm": 0.23697294294834137, "learning_rate": 0.00012728040000000002, "loss": 0.2658, "step": 181800 }, { "epoch": 0.1238, "grad_norm": 0.24161309003829956, "learning_rate": 0.0001272404, "loss": 0.258, "step": 181900 }, { "epoch": 0.124, "grad_norm": 0.1704692542552948, "learning_rate": 0.0001272004, "loss": 0.2615, "step": 182000 }, { "epoch": 0.1242, "grad_norm": 0.16872115433216095, "learning_rate": 0.0001271604, "loss": 0.2674, "step": 182100 }, { "epoch": 0.1244, "grad_norm": 0.20729516446590424, "learning_rate": 0.00012712040000000002, "loss": 0.2585, "step": 182200 }, { "epoch": 0.1246, "grad_norm": 0.20898325741291046, "learning_rate": 0.0001270804, "loss": 0.2627, "step": 182300 }, { "epoch": 0.1248, "grad_norm": 0.1441413015127182, "learning_rate": 0.0001270404, "loss": 0.2603, "step": 182400 }, { "epoch": 0.125, "grad_norm": 0.18905945122241974, "learning_rate": 0.0001270004, "loss": 0.254, "step": 182500 }, { "epoch": 0.1252, "grad_norm": 0.17578963935375214, "learning_rate": 0.0001269604, "loss": 0.2608, "step": 182600 }, { "epoch": 0.1254, "grad_norm": 0.23273304104804993, "learning_rate": 0.0001269204, "loss": 0.2625, "step": 182700 }, { "epoch": 0.1256, "grad_norm": 0.228327676653862, "learning_rate": 0.0001268804, "loss": 0.2598, "step": 182800 }, { "epoch": 0.1258, "grad_norm": 0.1630987524986267, "learning_rate": 0.00012684040000000003, "loss": 0.2645, "step": 182900 }, { "epoch": 0.126, "grad_norm": 0.18932506442070007, "learning_rate": 0.0001268004, "loss": 0.2633, "step": 183000 }, { "epoch": 0.1262, "grad_norm": 0.18664585053920746, "learning_rate": 0.00012676040000000001, "loss": 0.2627, "step": 
183100 }, { "epoch": 0.1264, "grad_norm": 0.2066875547170639, "learning_rate": 0.0001267204, "loss": 0.2598, "step": 183200 }, { "epoch": 0.1266, "grad_norm": 0.20927661657333374, "learning_rate": 0.0001266804, "loss": 0.2659, "step": 183300 }, { "epoch": 0.1268, "grad_norm": 0.204673171043396, "learning_rate": 0.00012664040000000002, "loss": 0.2605, "step": 183400 }, { "epoch": 0.127, "grad_norm": 0.23662003874778748, "learning_rate": 0.0001266004, "loss": 0.2638, "step": 183500 }, { "epoch": 0.1272, "grad_norm": 0.30359625816345215, "learning_rate": 0.0001265604, "loss": 0.2597, "step": 183600 }, { "epoch": 0.1274, "grad_norm": 0.2385796308517456, "learning_rate": 0.0001265204, "loss": 0.2633, "step": 183700 }, { "epoch": 0.1276, "grad_norm": 0.3185318112373352, "learning_rate": 0.00012648040000000002, "loss": 0.2609, "step": 183800 }, { "epoch": 0.1278, "grad_norm": 0.17162935435771942, "learning_rate": 0.00012644040000000002, "loss": 0.2624, "step": 183900 }, { "epoch": 0.128, "grad_norm": 0.16286444664001465, "learning_rate": 0.00012640039999999998, "loss": 0.2602, "step": 184000 }, { "epoch": 0.1282, "grad_norm": 0.15972454845905304, "learning_rate": 0.0001263604, "loss": 0.2578, "step": 184100 }, { "epoch": 0.1284, "grad_norm": 0.1734430193901062, "learning_rate": 0.0001263204, "loss": 0.2639, "step": 184200 }, { "epoch": 0.1286, "grad_norm": 0.16511160135269165, "learning_rate": 0.00012628040000000002, "loss": 0.2606, "step": 184300 }, { "epoch": 0.1288, "grad_norm": 0.22905713319778442, "learning_rate": 0.0001262404, "loss": 0.258, "step": 184400 }, { "epoch": 0.129, "grad_norm": 0.16669723391532898, "learning_rate": 0.0001262004, "loss": 0.2582, "step": 184500 }, { "epoch": 0.1292, "grad_norm": 0.18425410985946655, "learning_rate": 0.0001261604, "loss": 0.2649, "step": 184600 }, { "epoch": 0.1294, "grad_norm": 0.18333709239959717, "learning_rate": 0.0001261204, "loss": 0.2636, "step": 184700 }, { "epoch": 0.1296, "grad_norm": 0.19844196736812592, 
"learning_rate": 0.0001260804, "loss": 0.263, "step": 184800 }, { "epoch": 0.1298, "grad_norm": 0.21080031991004944, "learning_rate": 0.0001260404, "loss": 0.2617, "step": 184900 }, { "epoch": 0.13, "grad_norm": 0.16929513216018677, "learning_rate": 0.0001260004, "loss": 0.2637, "step": 185000 }, { "epoch": 0.1302, "grad_norm": 0.17420923709869385, "learning_rate": 0.0001259604, "loss": 0.2655, "step": 185100 }, { "epoch": 0.1304, "grad_norm": 0.15296240150928497, "learning_rate": 0.00012592040000000001, "loss": 0.2654, "step": 185200 }, { "epoch": 0.1306, "grad_norm": 0.18833494186401367, "learning_rate": 0.0001258804, "loss": 0.2657, "step": 185300 }, { "epoch": 0.1308, "grad_norm": 0.1517093926668167, "learning_rate": 0.0001258404, "loss": 0.2694, "step": 185400 }, { "epoch": 0.131, "grad_norm": 0.22326596081256866, "learning_rate": 0.0001258004, "loss": 0.2616, "step": 185500 }, { "epoch": 0.1312, "grad_norm": 0.17264465987682343, "learning_rate": 0.0001257604, "loss": 0.2618, "step": 185600 }, { "epoch": 0.1314, "grad_norm": 0.28218379616737366, "learning_rate": 0.0001257204, "loss": 0.2641, "step": 185700 }, { "epoch": 0.1316, "grad_norm": 0.19472543895244598, "learning_rate": 0.0001256804, "loss": 0.2638, "step": 185800 }, { "epoch": 0.1318, "grad_norm": 0.17265281081199646, "learning_rate": 0.00012564040000000002, "loss": 0.2624, "step": 185900 }, { "epoch": 0.132, "grad_norm": 0.16616639494895935, "learning_rate": 0.0001256004, "loss": 0.2569, "step": 186000 }, { "epoch": 0.1322, "grad_norm": 0.2015598714351654, "learning_rate": 0.0001255604, "loss": 0.2739, "step": 186100 }, { "epoch": 0.1324, "grad_norm": 0.16873787343502045, "learning_rate": 0.0001255204, "loss": 0.2643, "step": 186200 }, { "epoch": 0.1326, "grad_norm": 0.3382190763950348, "learning_rate": 0.0001254804, "loss": 0.2612, "step": 186300 }, { "epoch": 0.1328, "grad_norm": 0.1990320235490799, "learning_rate": 0.00012544040000000002, "loss": 0.2641, "step": 186400 }, { "epoch": 0.133, 
"grad_norm": 0.17715241014957428, "learning_rate": 0.00012540039999999999, "loss": 0.2619, "step": 186500 }, { "epoch": 0.1332, "grad_norm": 0.18182724714279175, "learning_rate": 0.0001253604, "loss": 0.2632, "step": 186600 }, { "epoch": 0.1334, "grad_norm": 0.24784061312675476, "learning_rate": 0.0001253204, "loss": 0.2643, "step": 186700 }, { "epoch": 0.1336, "grad_norm": 0.1730160117149353, "learning_rate": 0.00012528040000000002, "loss": 0.2599, "step": 186800 }, { "epoch": 0.1338, "grad_norm": 0.25901031494140625, "learning_rate": 0.00012524040000000001, "loss": 0.2627, "step": 186900 }, { "epoch": 0.134, "grad_norm": 0.2513553500175476, "learning_rate": 0.0001252004, "loss": 0.2601, "step": 187000 }, { "epoch": 0.1342, "grad_norm": 0.18127425014972687, "learning_rate": 0.0001251604, "loss": 0.2688, "step": 187100 }, { "epoch": 0.1344, "grad_norm": 0.16403953731060028, "learning_rate": 0.0001251204, "loss": 0.2604, "step": 187200 }, { "epoch": 0.1346, "grad_norm": 0.19153177738189697, "learning_rate": 0.00012508040000000002, "loss": 0.2607, "step": 187300 }, { "epoch": 0.1348, "grad_norm": 0.17384052276611328, "learning_rate": 0.0001250404, "loss": 0.2579, "step": 187400 }, { "epoch": 0.135, "grad_norm": 0.17870062589645386, "learning_rate": 0.0001250004, "loss": 0.2605, "step": 187500 }, { "epoch": 0.1352, "grad_norm": 0.16759884357452393, "learning_rate": 0.0001249604, "loss": 0.2579, "step": 187600 }, { "epoch": 0.1354, "grad_norm": 0.19198445975780487, "learning_rate": 0.0001249204, "loss": 0.2583, "step": 187700 }, { "epoch": 0.1356, "grad_norm": 0.17366041243076324, "learning_rate": 0.0001248804, "loss": 0.2566, "step": 187800 }, { "epoch": 0.1358, "grad_norm": 0.3349681794643402, "learning_rate": 0.0001248404, "loss": 0.264, "step": 187900 }, { "epoch": 0.136, "grad_norm": 0.21217289566993713, "learning_rate": 0.0001248004, "loss": 0.2657, "step": 188000 }, { "epoch": 0.1362, "grad_norm": 0.17328070104122162, "learning_rate": 0.0001247604, "loss": 
0.2552, "step": 188100 }, { "epoch": 0.1364, "grad_norm": 0.3321928381919861, "learning_rate": 0.0001247204, "loss": 0.2617, "step": 188200 }, { "epoch": 0.1366, "grad_norm": 0.16838262975215912, "learning_rate": 0.0001246804, "loss": 0.2674, "step": 188300 }, { "epoch": 0.1368, "grad_norm": 0.19490031898021698, "learning_rate": 0.0001246404, "loss": 0.2651, "step": 188400 }, { "epoch": 0.137, "grad_norm": 0.2795969247817993, "learning_rate": 0.0001246004, "loss": 0.2598, "step": 188500 }, { "epoch": 0.1372, "grad_norm": 0.18094158172607422, "learning_rate": 0.0001245604, "loss": 0.2631, "step": 188600 }, { "epoch": 0.1374, "grad_norm": 0.17936332523822784, "learning_rate": 0.0001245204, "loss": 0.2599, "step": 188700 }, { "epoch": 0.1376, "grad_norm": 0.4011989235877991, "learning_rate": 0.0001244804, "loss": 0.2577, "step": 188800 }, { "epoch": 0.1378, "grad_norm": 0.1783379763364792, "learning_rate": 0.00012444040000000002, "loss": 0.2562, "step": 188900 }, { "epoch": 0.138, "grad_norm": 0.18040119111537933, "learning_rate": 0.00012440040000000002, "loss": 0.2579, "step": 189000 }, { "epoch": 0.1382, "grad_norm": 0.181350976228714, "learning_rate": 0.0001243604, "loss": 0.2662, "step": 189100 }, { "epoch": 0.1384, "grad_norm": 0.18030929565429688, "learning_rate": 0.0001243204, "loss": 0.2588, "step": 189200 }, { "epoch": 0.1386, "grad_norm": 0.19084039330482483, "learning_rate": 0.0001242804, "loss": 0.2654, "step": 189300 }, { "epoch": 0.1388, "grad_norm": 0.23569531738758087, "learning_rate": 0.00012424040000000002, "loss": 0.259, "step": 189400 }, { "epoch": 0.139, "grad_norm": 0.2429647147655487, "learning_rate": 0.0001242004, "loss": 0.2581, "step": 189500 }, { "epoch": 0.1392, "grad_norm": 0.20208866894245148, "learning_rate": 0.0001241604, "loss": 0.2563, "step": 189600 }, { "epoch": 0.1394, "grad_norm": 0.1706075370311737, "learning_rate": 0.0001241204, "loss": 0.2634, "step": 189700 }, { "epoch": 0.1396, "grad_norm": 0.1640571802854538, 
"learning_rate": 0.00012408040000000002, "loss": 0.2582, "step": 189800 }, { "epoch": 0.1398, "grad_norm": 0.22723062336444855, "learning_rate": 0.0001240404, "loss": 0.2603, "step": 189900 }, { "epoch": 0.14, "grad_norm": 0.16876673698425293, "learning_rate": 0.0001240004, "loss": 0.2636, "step": 190000 }, { "epoch": 0.1402, "grad_norm": 0.2153484970331192, "learning_rate": 0.0001239604, "loss": 0.2586, "step": 190100 }, { "epoch": 0.1404, "grad_norm": 0.21755315363407135, "learning_rate": 0.0001239204, "loss": 0.2565, "step": 190200 }, { "epoch": 0.1406, "grad_norm": 0.18556717038154602, "learning_rate": 0.00012388040000000001, "loss": 0.2631, "step": 190300 }, { "epoch": 0.1408, "grad_norm": 0.19718804955482483, "learning_rate": 0.0001238404, "loss": 0.2593, "step": 190400 }, { "epoch": 0.141, "grad_norm": 0.2352330982685089, "learning_rate": 0.00012380040000000003, "loss": 0.2606, "step": 190500 }, { "epoch": 0.1412, "grad_norm": 0.22349053621292114, "learning_rate": 0.0001237604, "loss": 0.2614, "step": 190600 }, { "epoch": 0.1414, "grad_norm": 0.27996236085891724, "learning_rate": 0.00012372040000000002, "loss": 0.2651, "step": 190700 }, { "epoch": 0.1416, "grad_norm": 0.21903091669082642, "learning_rate": 0.0001236804, "loss": 0.2577, "step": 190800 }, { "epoch": 0.1418, "grad_norm": 0.1741315722465515, "learning_rate": 0.0001236404, "loss": 0.2526, "step": 190900 }, { "epoch": 0.142, "grad_norm": 0.2520033121109009, "learning_rate": 0.00012360040000000002, "loss": 0.259, "step": 191000 }, { "epoch": 0.1422, "grad_norm": 0.23828014731407166, "learning_rate": 0.0001235604, "loss": 0.2594, "step": 191100 }, { "epoch": 0.1424, "grad_norm": 0.16912546753883362, "learning_rate": 0.0001235204, "loss": 0.2616, "step": 191200 }, { "epoch": 0.1426, "grad_norm": 0.17198620736598969, "learning_rate": 0.0001234804, "loss": 0.256, "step": 191300 }, { "epoch": 0.1428, "grad_norm": 0.17525877058506012, "learning_rate": 0.0001234404, "loss": 0.2601, "step": 191400 }, { 
"epoch": 0.143, "grad_norm": 0.16216328740119934, "learning_rate": 0.00012340040000000002, "loss": 0.2641, "step": 191500 }, { "epoch": 0.1432, "grad_norm": 0.1828595995903015, "learning_rate": 0.00012336039999999999, "loss": 0.2582, "step": 191600 }, { "epoch": 0.1434, "grad_norm": 0.16869176924228668, "learning_rate": 0.0001233204, "loss": 0.2554, "step": 191700 }, { "epoch": 0.1436, "grad_norm": 0.19655516743659973, "learning_rate": 0.0001232804, "loss": 0.2663, "step": 191800 }, { "epoch": 0.1438, "grad_norm": 0.20977061986923218, "learning_rate": 0.00012324040000000002, "loss": 0.2565, "step": 191900 }, { "epoch": 0.144, "grad_norm": 0.22906339168548584, "learning_rate": 0.00012320040000000001, "loss": 0.2553, "step": 192000 }, { "epoch": 0.1442, "grad_norm": 0.1683686375617981, "learning_rate": 0.0001231604, "loss": 0.2581, "step": 192100 }, { "epoch": 0.1444, "grad_norm": 0.20973190665245056, "learning_rate": 0.0001231204, "loss": 0.2642, "step": 192200 }, { "epoch": 0.1446, "grad_norm": 0.14577773213386536, "learning_rate": 0.0001230804, "loss": 0.26, "step": 192300 }, { "epoch": 0.1448, "grad_norm": 0.19426365196704865, "learning_rate": 0.00012304040000000002, "loss": 0.2622, "step": 192400 }, { "epoch": 0.145, "grad_norm": 0.17555978894233704, "learning_rate": 0.0001230004, "loss": 0.2598, "step": 192500 }, { "epoch": 0.1452, "grad_norm": 0.2241448312997818, "learning_rate": 0.0001229604, "loss": 0.2549, "step": 192600 }, { "epoch": 0.1454, "grad_norm": 0.24879616498947144, "learning_rate": 0.0001229204, "loss": 0.2525, "step": 192700 }, { "epoch": 0.1456, "grad_norm": 0.18659988045692444, "learning_rate": 0.00012288040000000002, "loss": 0.2635, "step": 192800 }, { "epoch": 0.1458, "grad_norm": 0.19412420690059662, "learning_rate": 0.0001228404, "loss": 0.2553, "step": 192900 }, { "epoch": 0.146, "grad_norm": 0.16501417756080627, "learning_rate": 0.0001228004, "loss": 0.2621, "step": 193000 }, { "epoch": 0.1462, "grad_norm": 0.18333154916763306, 
"learning_rate": 0.0001227604, "loss": 0.2545, "step": 193100 }, { "epoch": 0.1464, "grad_norm": 0.25169798731803894, "learning_rate": 0.0001227204, "loss": 0.26, "step": 193200 }, { "epoch": 0.1466, "grad_norm": 0.18788355588912964, "learning_rate": 0.0001226804, "loss": 0.2575, "step": 193300 }, { "epoch": 0.1468, "grad_norm": 0.23078390955924988, "learning_rate": 0.0001226404, "loss": 0.2717, "step": 193400 }, { "epoch": 0.147, "grad_norm": 0.18893170356750488, "learning_rate": 0.00012260040000000003, "loss": 0.2572, "step": 193500 }, { "epoch": 0.1472, "grad_norm": 0.20808455348014832, "learning_rate": 0.0001225604, "loss": 0.2576, "step": 193600 }, { "epoch": 0.1474, "grad_norm": 0.1902913898229599, "learning_rate": 0.00012252040000000001, "loss": 0.2568, "step": 193700 }, { "epoch": 0.1476, "grad_norm": 0.17236173152923584, "learning_rate": 0.0001224804, "loss": 0.2541, "step": 193800 }, { "epoch": 0.1478, "grad_norm": 0.22261063754558563, "learning_rate": 0.0001224404, "loss": 0.2572, "step": 193900 }, { "epoch": 0.148, "grad_norm": 0.21105404198169708, "learning_rate": 0.00012240040000000002, "loss": 0.2595, "step": 194000 }, { "epoch": 0.1482, "grad_norm": 0.1775384098291397, "learning_rate": 0.0001223604, "loss": 0.2566, "step": 194100 }, { "epoch": 0.1484, "grad_norm": 0.16679984331130981, "learning_rate": 0.0001223204, "loss": 0.2567, "step": 194200 }, { "epoch": 0.1486, "grad_norm": 0.15811361372470856, "learning_rate": 0.0001222804, "loss": 0.2627, "step": 194300 }, { "epoch": 0.1488, "grad_norm": 0.1975756138563156, "learning_rate": 0.00012224040000000002, "loss": 0.2629, "step": 194400 }, { "epoch": 0.149, "grad_norm": 0.2405877411365509, "learning_rate": 0.00012220040000000002, "loss": 0.2591, "step": 194500 }, { "epoch": 0.1492, "grad_norm": 0.1887446641921997, "learning_rate": 0.0001221604, "loss": 0.26, "step": 194600 }, { "epoch": 0.1494, "grad_norm": 0.23812128603458405, "learning_rate": 0.0001221204, "loss": 0.2602, "step": 194700 }, { 
"epoch": 0.1496, "grad_norm": 0.2019837647676468, "learning_rate": 0.0001220804, "loss": 0.2614, "step": 194800 }, { "epoch": 0.1498, "grad_norm": 0.18969768285751343, "learning_rate": 0.0001220404, "loss": 0.2597, "step": 194900 }, { "epoch": 0.15, "grad_norm": 0.2558167278766632, "learning_rate": 0.00012200040000000001, "loss": 0.2601, "step": 195000 }, { "epoch": 0.1502, "grad_norm": 0.16301903128623962, "learning_rate": 0.00012196039999999999, "loss": 0.2613, "step": 195100 }, { "epoch": 0.1504, "grad_norm": 0.19650568068027496, "learning_rate": 0.0001219204, "loss": 0.2643, "step": 195200 }, { "epoch": 0.1506, "grad_norm": 0.19182758033275604, "learning_rate": 0.0001218804, "loss": 0.2622, "step": 195300 }, { "epoch": 0.1508, "grad_norm": 0.16109509766101837, "learning_rate": 0.00012184040000000001, "loss": 0.2567, "step": 195400 }, { "epoch": 0.151, "grad_norm": 0.20243091881275177, "learning_rate": 0.00012180040000000002, "loss": 0.2561, "step": 195500 }, { "epoch": 0.1512, "grad_norm": 0.18910594284534454, "learning_rate": 0.0001217604, "loss": 0.2589, "step": 195600 }, { "epoch": 0.1514, "grad_norm": 0.17565961182117462, "learning_rate": 0.00012172040000000001, "loss": 0.2575, "step": 195700 }, { "epoch": 0.1516, "grad_norm": 0.18943890929222107, "learning_rate": 0.0001216804, "loss": 0.262, "step": 195800 }, { "epoch": 0.1518, "grad_norm": 0.1772325187921524, "learning_rate": 0.00012164040000000001, "loss": 0.2659, "step": 195900 }, { "epoch": 0.152, "grad_norm": 0.17579589784145355, "learning_rate": 0.00012160040000000002, "loss": 0.2611, "step": 196000 }, { "epoch": 0.1522, "grad_norm": 0.1820620447397232, "learning_rate": 0.0001215604, "loss": 0.2621, "step": 196100 }, { "epoch": 0.1524, "grad_norm": 0.20846767723560333, "learning_rate": 0.0001215204, "loss": 0.2607, "step": 196200 }, { "epoch": 0.1526, "grad_norm": 0.20666386187076569, "learning_rate": 0.00012148040000000001, "loss": 0.2587, "step": 196300 }, { "epoch": 0.1528, "grad_norm": 
0.3281834125518799, "learning_rate": 0.0001214404, "loss": 0.2539, "step": 196400 }, { "epoch": 0.153, "grad_norm": 0.20208649337291718, "learning_rate": 0.00012140040000000001, "loss": 0.2578, "step": 196500 }, { "epoch": 0.1532, "grad_norm": 0.28220322728157043, "learning_rate": 0.00012136039999999999, "loss": 0.2558, "step": 196600 }, { "epoch": 0.1534, "grad_norm": 0.33225101232528687, "learning_rate": 0.0001213204, "loss": 0.2661, "step": 196700 }, { "epoch": 0.1536, "grad_norm": 0.24750415980815887, "learning_rate": 0.0001212804, "loss": 0.2624, "step": 196800 }, { "epoch": 0.1538, "grad_norm": 0.1799037754535675, "learning_rate": 0.00012124040000000001, "loss": 0.2584, "step": 196900 }, { "epoch": 0.154, "grad_norm": 0.20016971230506897, "learning_rate": 0.00012120040000000002, "loss": 0.2599, "step": 197000 }, { "epoch": 0.1542, "grad_norm": 0.18166713416576385, "learning_rate": 0.0001211604, "loss": 0.2567, "step": 197100 }, { "epoch": 0.1544, "grad_norm": 0.1939893662929535, "learning_rate": 0.0001211204, "loss": 0.2646, "step": 197200 }, { "epoch": 0.1546, "grad_norm": 0.18318818509578705, "learning_rate": 0.0001210804, "loss": 0.2552, "step": 197300 }, { "epoch": 0.1548, "grad_norm": 0.27068910002708435, "learning_rate": 0.00012104040000000001, "loss": 0.2594, "step": 197400 }, { "epoch": 0.155, "grad_norm": 0.20049890875816345, "learning_rate": 0.00012100040000000001, "loss": 0.2569, "step": 197500 }, { "epoch": 0.1552, "grad_norm": 0.1976662427186966, "learning_rate": 0.0001209604, "loss": 0.2544, "step": 197600 }, { "epoch": 0.1554, "grad_norm": 0.17978644371032715, "learning_rate": 0.0001209204, "loss": 0.2626, "step": 197700 }, { "epoch": 0.1556, "grad_norm": 0.18449051678180695, "learning_rate": 0.00012088040000000001, "loss": 0.2609, "step": 197800 }, { "epoch": 0.1558, "grad_norm": 0.21777702867984772, "learning_rate": 0.0001208404, "loss": 0.2557, "step": 197900 }, { "epoch": 0.156, "grad_norm": 0.34685173630714417, "learning_rate": 
0.00012080040000000001, "loss": 0.2598, "step": 198000 }, { "epoch": 0.1562, "grad_norm": 0.21102797985076904, "learning_rate": 0.00012076039999999999, "loss": 0.258, "step": 198100 }, { "epoch": 0.1564, "grad_norm": 0.1801084280014038, "learning_rate": 0.0001207204, "loss": 0.2545, "step": 198200 }, { "epoch": 0.1566, "grad_norm": 0.24270586669445038, "learning_rate": 0.0001206804, "loss": 0.2594, "step": 198300 }, { "epoch": 0.1568, "grad_norm": 0.1699536144733429, "learning_rate": 0.00012064040000000001, "loss": 0.2665, "step": 198400 }, { "epoch": 0.157, "grad_norm": 0.15559914708137512, "learning_rate": 0.00012060040000000002, "loss": 0.2577, "step": 198500 }, { "epoch": 0.1572, "grad_norm": 0.1730017513036728, "learning_rate": 0.0001205604, "loss": 0.2578, "step": 198600 }, { "epoch": 0.1574, "grad_norm": 0.18403738737106323, "learning_rate": 0.0001205204, "loss": 0.257, "step": 198700 }, { "epoch": 0.1576, "grad_norm": 0.17950795590877533, "learning_rate": 0.0001204804, "loss": 0.2581, "step": 198800 }, { "epoch": 0.1578, "grad_norm": 0.18229971826076508, "learning_rate": 0.0001204404, "loss": 0.2611, "step": 198900 }, { "epoch": 0.158, "grad_norm": 0.1665981113910675, "learning_rate": 0.00012040040000000001, "loss": 0.2562, "step": 199000 }, { "epoch": 0.1582, "grad_norm": 0.1657390296459198, "learning_rate": 0.0001203604, "loss": 0.2629, "step": 199100 }, { "epoch": 0.1584, "grad_norm": 0.18978402018547058, "learning_rate": 0.0001203204, "loss": 0.2613, "step": 199200 }, { "epoch": 0.1586, "grad_norm": 0.18768629431724548, "learning_rate": 0.00012028040000000001, "loss": 0.2564, "step": 199300 }, { "epoch": 0.1588, "grad_norm": 0.3750452697277069, "learning_rate": 0.00012024040000000002, "loss": 0.2577, "step": 199400 }, { "epoch": 0.159, "grad_norm": 0.25771307945251465, "learning_rate": 0.00012020040000000001, "loss": 0.2642, "step": 199500 }, { "epoch": 0.1592, "grad_norm": 0.27435046434402466, "learning_rate": 0.00012016039999999999, "loss": 0.263, 
"step": 199600 }, { "epoch": 0.1594, "grad_norm": 0.1829531192779541, "learning_rate": 0.0001201204, "loss": 0.2606, "step": 199700 }, { "epoch": 0.1596, "grad_norm": 0.14673271775245667, "learning_rate": 0.0001200804, "loss": 0.2575, "step": 199800 }, { "epoch": 0.1598, "grad_norm": 0.19495797157287598, "learning_rate": 0.00012004040000000001, "loss": 0.2545, "step": 199900 }, { "epoch": 0.16, "grad_norm": 0.20886088907718658, "learning_rate": 0.00012000040000000002, "loss": 0.2584, "step": 200000 }, { "epoch": 0.1602, "grad_norm": 0.18301694095134735, "learning_rate": 0.0001199604, "loss": 0.2581, "step": 200100 }, { "epoch": 0.1604, "grad_norm": 0.1781015843153, "learning_rate": 0.0001199204, "loss": 0.2583, "step": 200200 }, { "epoch": 0.1606, "grad_norm": 0.15711505711078644, "learning_rate": 0.0001198804, "loss": 0.2594, "step": 200300 }, { "epoch": 0.1608, "grad_norm": 0.21567769348621368, "learning_rate": 0.0001198404, "loss": 0.2553, "step": 200400 }, { "epoch": 0.161, "grad_norm": 0.18019573390483856, "learning_rate": 0.00011980040000000001, "loss": 0.2653, "step": 200500 }, { "epoch": 0.1612, "grad_norm": 0.1789156198501587, "learning_rate": 0.00011976039999999999, "loss": 0.2604, "step": 200600 }, { "epoch": 0.1614, "grad_norm": 0.26856061816215515, "learning_rate": 0.0001197204, "loss": 0.2625, "step": 200700 }, { "epoch": 0.1616, "grad_norm": 0.16721145808696747, "learning_rate": 0.00011968040000000001, "loss": 0.2628, "step": 200800 }, { "epoch": 0.1618, "grad_norm": 0.1759210228919983, "learning_rate": 0.00011964040000000001, "loss": 0.2534, "step": 200900 }, { "epoch": 0.162, "grad_norm": 0.15787266194820404, "learning_rate": 0.00011960040000000001, "loss": 0.2586, "step": 201000 }, { "epoch": 0.1622, "grad_norm": 0.1680240035057068, "learning_rate": 0.0001195604, "loss": 0.2583, "step": 201100 }, { "epoch": 0.1624, "grad_norm": 0.19923582673072815, "learning_rate": 0.0001195204, "loss": 0.2637, "step": 201200 }, { "epoch": 0.1626, "grad_norm": 
0.33889421820640564, "learning_rate": 0.0001194804, "loss": 0.261, "step": 201300 }, { "epoch": 0.1628, "grad_norm": 0.22548066079616547, "learning_rate": 0.00011944040000000001, "loss": 0.2557, "step": 201400 }, { "epoch": 0.163, "grad_norm": 0.2182902842760086, "learning_rate": 0.00011940040000000002, "loss": 0.2593, "step": 201500 }, { "epoch": 0.1632, "grad_norm": 0.27033495903015137, "learning_rate": 0.00011936040000000001, "loss": 0.2588, "step": 201600 }, { "epoch": 0.1634, "grad_norm": 0.3093793988227844, "learning_rate": 0.0001193204, "loss": 0.2561, "step": 201700 }, { "epoch": 0.1636, "grad_norm": 0.20741653442382812, "learning_rate": 0.0001192804, "loss": 0.2623, "step": 201800 }, { "epoch": 0.1638, "grad_norm": 0.2333420366048813, "learning_rate": 0.0001192404, "loss": 0.2574, "step": 201900 }, { "epoch": 0.164, "grad_norm": 0.20007722079753876, "learning_rate": 0.00011920040000000001, "loss": 0.2618, "step": 202000 }, { "epoch": 0.1642, "grad_norm": 0.18958450853824615, "learning_rate": 0.00011916040000000002, "loss": 0.2545, "step": 202100 }, { "epoch": 0.1644, "grad_norm": 0.22179192304611206, "learning_rate": 0.0001191204, "loss": 0.2637, "step": 202200 }, { "epoch": 0.1646, "grad_norm": 0.1823706179857254, "learning_rate": 0.0001190804, "loss": 0.2567, "step": 202300 }, { "epoch": 0.1648, "grad_norm": 0.19839805364608765, "learning_rate": 0.00011904040000000001, "loss": 0.2582, "step": 202400 }, { "epoch": 0.165, "grad_norm": 0.1794578582048416, "learning_rate": 0.0001190004, "loss": 0.2563, "step": 202500 }, { "epoch": 0.1652, "grad_norm": 0.1632799506187439, "learning_rate": 0.00011896040000000001, "loss": 0.2544, "step": 202600 }, { "epoch": 0.1654, "grad_norm": 0.14698299765586853, "learning_rate": 0.0001189204, "loss": 0.2583, "step": 202700 }, { "epoch": 0.1656, "grad_norm": 0.1564948856830597, "learning_rate": 0.0001188804, "loss": 0.2544, "step": 202800 }, { "epoch": 0.1658, "grad_norm": 0.27698883414268494, "learning_rate": 
0.00011884040000000001, "loss": 0.2603, "step": 202900 }, { "epoch": 0.166, "grad_norm": 0.17376329004764557, "learning_rate": 0.00011880040000000002, "loss": 0.2569, "step": 203000 }, { "epoch": 0.1662, "grad_norm": 0.21339082717895508, "learning_rate": 0.00011876040000000002, "loss": 0.2554, "step": 203100 }, { "epoch": 0.1664, "grad_norm": 0.19938743114471436, "learning_rate": 0.0001187204, "loss": 0.2703, "step": 203200 }, { "epoch": 0.1666, "grad_norm": 0.1865181177854538, "learning_rate": 0.0001186804, "loss": 0.2559, "step": 203300 }, { "epoch": 0.1668, "grad_norm": 0.18764150142669678, "learning_rate": 0.0001186404, "loss": 0.2565, "step": 203400 }, { "epoch": 0.167, "grad_norm": 0.16691163182258606, "learning_rate": 0.00011860040000000001, "loss": 0.2616, "step": 203500 }, { "epoch": 0.1672, "grad_norm": 0.20289692282676697, "learning_rate": 0.00011856040000000002, "loss": 0.2585, "step": 203600 }, { "epoch": 0.1674, "grad_norm": 0.2008928805589676, "learning_rate": 0.0001185204, "loss": 0.2654, "step": 203700 }, { "epoch": 0.1676, "grad_norm": 0.29621559381484985, "learning_rate": 0.0001184804, "loss": 0.2649, "step": 203800 }, { "epoch": 0.1678, "grad_norm": 0.23792728781700134, "learning_rate": 0.00011844040000000001, "loss": 0.2606, "step": 203900 }, { "epoch": 0.168, "grad_norm": 0.1951097548007965, "learning_rate": 0.0001184004, "loss": 0.2603, "step": 204000 }, { "epoch": 0.1682, "grad_norm": 0.18603862822055817, "learning_rate": 0.00011836040000000001, "loss": 0.2567, "step": 204100 }, { "epoch": 0.1684, "grad_norm": 0.248612180352211, "learning_rate": 0.00011832039999999999, "loss": 0.2588, "step": 204200 }, { "epoch": 0.1686, "grad_norm": 0.3616507351398468, "learning_rate": 0.0001182804, "loss": 0.2568, "step": 204300 }, { "epoch": 0.1688, "grad_norm": 0.22224590182304382, "learning_rate": 0.00011824040000000001, "loss": 0.2589, "step": 204400 }, { "epoch": 0.169, "grad_norm": 0.22203755378723145, "learning_rate": 0.00011820040000000001, "loss": 
0.2572, "step": 204500 }, { "epoch": 0.1692, "grad_norm": 0.16411060094833374, "learning_rate": 0.00011816040000000002, "loss": 0.2578, "step": 204600 }, { "epoch": 0.1694, "grad_norm": 0.17382003366947174, "learning_rate": 0.0001181204, "loss": 0.2546, "step": 204700 }, { "epoch": 0.1696, "grad_norm": 0.21096429228782654, "learning_rate": 0.00011808040000000001, "loss": 0.2619, "step": 204800 }, { "epoch": 0.1698, "grad_norm": 0.2046499401330948, "learning_rate": 0.0001180404, "loss": 0.2538, "step": 204900 }, { "epoch": 0.17, "grad_norm": 0.1903340220451355, "learning_rate": 0.00011800040000000001, "loss": 0.2695, "step": 205000 }, { "epoch": 0.1702, "grad_norm": 0.19105155766010284, "learning_rate": 0.00011796040000000002, "loss": 0.2592, "step": 205100 }, { "epoch": 0.1704, "grad_norm": 0.18848542869091034, "learning_rate": 0.0001179204, "loss": 0.2607, "step": 205200 }, { "epoch": 0.1706, "grad_norm": 0.19775767624378204, "learning_rate": 0.0001178804, "loss": 0.2514, "step": 205300 }, { "epoch": 0.1708, "grad_norm": 0.2048797607421875, "learning_rate": 0.00011784040000000001, "loss": 0.2584, "step": 205400 }, { "epoch": 0.171, "grad_norm": 0.17378486692905426, "learning_rate": 0.0001178004, "loss": 0.2611, "step": 205500 }, { "epoch": 0.1712, "grad_norm": 0.19337409734725952, "learning_rate": 0.00011776040000000001, "loss": 0.2552, "step": 205600 }, { "epoch": 0.1714, "grad_norm": 0.1871696412563324, "learning_rate": 0.00011772039999999999, "loss": 0.2555, "step": 205700 }, { "epoch": 0.1716, "grad_norm": 0.15516479313373566, "learning_rate": 0.0001176804, "loss": 0.255, "step": 205800 }, { "epoch": 0.1718, "grad_norm": 0.18520411849021912, "learning_rate": 0.0001176404, "loss": 0.2607, "step": 205900 }, { "epoch": 0.172, "grad_norm": 0.1725122481584549, "learning_rate": 0.00011760040000000001, "loss": 0.2599, "step": 206000 }, { "epoch": 0.1722, "grad_norm": 0.24798348546028137, "learning_rate": 0.00011756040000000002, "loss": 0.2654, "step": 206100 }, { 
"epoch": 0.1724, "grad_norm": 0.15769024193286896, "learning_rate": 0.0001175204, "loss": 0.2753, "step": 206200 }, { "epoch": 0.1726, "grad_norm": 0.2662038207054138, "learning_rate": 0.00011748040000000001, "loss": 0.2642, "step": 206300 }, { "epoch": 0.1728, "grad_norm": 0.22056128084659576, "learning_rate": 0.0001174404, "loss": 0.2573, "step": 206400 }, { "epoch": 0.173, "grad_norm": 0.24194368720054626, "learning_rate": 0.00011740040000000001, "loss": 0.2605, "step": 206500 }, { "epoch": 0.1732, "grad_norm": 0.2201591581106186, "learning_rate": 0.00011736040000000002, "loss": 0.26, "step": 206600 }, { "epoch": 0.1734, "grad_norm": 0.17984557151794434, "learning_rate": 0.0001173204, "loss": 0.2588, "step": 206700 }, { "epoch": 0.1736, "grad_norm": 0.2862537205219269, "learning_rate": 0.0001172804, "loss": 0.2656, "step": 206800 }, { "epoch": 0.1738, "grad_norm": 0.20301680266857147, "learning_rate": 0.00011724040000000001, "loss": 0.2596, "step": 206900 }, { "epoch": 0.174, "grad_norm": 0.16194941103458405, "learning_rate": 0.0001172004, "loss": 0.2604, "step": 207000 }, { "epoch": 0.1742, "grad_norm": 0.20305371284484863, "learning_rate": 0.00011716040000000001, "loss": 0.2605, "step": 207100 }, { "epoch": 0.1744, "grad_norm": 0.17961016297340393, "learning_rate": 0.00011712039999999999, "loss": 0.2553, "step": 207200 }, { "epoch": 0.1746, "grad_norm": 0.15147832036018372, "learning_rate": 0.0001170804, "loss": 0.2608, "step": 207300 }, { "epoch": 0.1748, "grad_norm": 0.17695245146751404, "learning_rate": 0.0001170404, "loss": 0.2577, "step": 207400 }, { "epoch": 0.175, "grad_norm": 0.16684728860855103, "learning_rate": 0.00011700040000000001, "loss": 0.2602, "step": 207500 }, { "epoch": 0.1752, "grad_norm": 0.1773560494184494, "learning_rate": 0.00011696040000000002, "loss": 0.2592, "step": 207600 }, { "epoch": 0.1754, "grad_norm": 0.1829720139503479, "learning_rate": 0.0001169204, "loss": 0.2533, "step": 207700 }, { "epoch": 0.1756, "grad_norm": 
0.14604096114635468, "learning_rate": 0.0001168804, "loss": 0.2559, "step": 207800 }, { "epoch": 0.1758, "grad_norm": 0.16805389523506165, "learning_rate": 0.0001168404, "loss": 0.2565, "step": 207900 }, { "epoch": 0.176, "grad_norm": 0.2917552888393402, "learning_rate": 0.00011680040000000001, "loss": 0.2596, "step": 208000 }, { "epoch": 0.1762, "grad_norm": 0.1552281528711319, "learning_rate": 0.00011676040000000001, "loss": 0.2595, "step": 208100 }, { "epoch": 0.1764, "grad_norm": 0.21594230830669403, "learning_rate": 0.0001167204, "loss": 0.26, "step": 208200 }, { "epoch": 0.1766, "grad_norm": 0.2598268389701843, "learning_rate": 0.0001166804, "loss": 0.2634, "step": 208300 }, { "epoch": 0.1768, "grad_norm": 0.17285478115081787, "learning_rate": 0.00011664040000000001, "loss": 0.2663, "step": 208400 }, { "epoch": 0.177, "grad_norm": 0.2128245234489441, "learning_rate": 0.00011660040000000002, "loss": 0.2596, "step": 208500 }, { "epoch": 0.1772, "grad_norm": 0.17592047154903412, "learning_rate": 0.00011656040000000001, "loss": 0.259, "step": 208600 }, { "epoch": 0.1774, "grad_norm": 0.1863638013601303, "learning_rate": 0.00011652039999999999, "loss": 0.2574, "step": 208700 }, { "epoch": 0.1776, "grad_norm": 0.1951245218515396, "learning_rate": 0.0001164804, "loss": 0.2569, "step": 208800 }, { "epoch": 0.1778, "grad_norm": 0.20469717681407928, "learning_rate": 0.0001164404, "loss": 0.2548, "step": 208900 }, { "epoch": 0.178, "grad_norm": 0.18521174788475037, "learning_rate": 0.00011640040000000001, "loss": 0.2581, "step": 209000 }, { "epoch": 0.1782, "grad_norm": 0.16419340670108795, "learning_rate": 0.00011636040000000002, "loss": 0.2557, "step": 209100 }, { "epoch": 0.1784, "grad_norm": 0.2025509625673294, "learning_rate": 0.0001163204, "loss": 0.2553, "step": 209200 }, { "epoch": 0.1786, "grad_norm": 0.2792588174343109, "learning_rate": 0.0001162804, "loss": 0.2554, "step": 209300 }, { "epoch": 0.1788, "grad_norm": 0.18840503692626953, "learning_rate": 
0.0001162404, "loss": 0.2612, "step": 209400 }, { "epoch": 0.179, "grad_norm": 0.3353501558303833, "learning_rate": 0.0001162004, "loss": 0.2575, "step": 209500 }, { "epoch": 0.1792, "grad_norm": 0.17007525265216827, "learning_rate": 0.00011616040000000001, "loss": 0.2534, "step": 209600 }, { "epoch": 0.1794, "grad_norm": 0.1712578982114792, "learning_rate": 0.0001161204, "loss": 0.2579, "step": 209700 }, { "epoch": 0.1796, "grad_norm": 0.16284547746181488, "learning_rate": 0.0001160804, "loss": 0.2578, "step": 209800 }, { "epoch": 0.1798, "grad_norm": 0.22904619574546814, "learning_rate": 0.00011604040000000001, "loss": 0.2554, "step": 209900 }, { "epoch": 0.18, "grad_norm": 0.1613014191389084, "learning_rate": 0.00011600040000000002, "loss": 0.2569, "step": 210000 }, { "epoch": 0.1802, "grad_norm": 0.24920882284641266, "learning_rate": 0.00011596040000000001, "loss": 0.2523, "step": 210100 }, { "epoch": 0.1804, "grad_norm": 0.16219836473464966, "learning_rate": 0.0001159204, "loss": 0.257, "step": 210200 }, { "epoch": 0.1806, "grad_norm": 0.14738017320632935, "learning_rate": 0.0001158804, "loss": 0.2534, "step": 210300 }, { "epoch": 0.1808, "grad_norm": 0.16476276516914368, "learning_rate": 0.0001158404, "loss": 0.261, "step": 210400 }, { "epoch": 0.181, "grad_norm": 0.20898793637752533, "learning_rate": 0.00011580040000000001, "loss": 0.261, "step": 210500 }, { "epoch": 0.1812, "grad_norm": 0.16828548908233643, "learning_rate": 0.00011576040000000002, "loss": 0.2502, "step": 210600 }, { "epoch": 0.1814, "grad_norm": 0.17808620631694794, "learning_rate": 0.0001157204, "loss": 0.2445, "step": 210700 }, { "epoch": 0.1816, "grad_norm": 0.16437947750091553, "learning_rate": 0.0001156804, "loss": 0.2521, "step": 210800 }, { "epoch": 0.1818, "grad_norm": 0.2212170660495758, "learning_rate": 0.0001156404, "loss": 0.2532, "step": 210900 }, { "epoch": 0.182, "grad_norm": 0.22296178340911865, "learning_rate": 0.0001156004, "loss": 0.2594, "step": 211000 }, { "epoch": 
0.1822, "grad_norm": 0.19012941420078278, "learning_rate": 0.00011556040000000001, "loss": 0.2601, "step": 211100 }, { "epoch": 0.1824, "grad_norm": 0.2223205268383026, "learning_rate": 0.00011552039999999999, "loss": 0.2571, "step": 211200 }, { "epoch": 0.1826, "grad_norm": 0.193226620554924, "learning_rate": 0.0001154804, "loss": 0.2635, "step": 211300 }, { "epoch": 0.1828, "grad_norm": 0.19742873311042786, "learning_rate": 0.00011544040000000001, "loss": 0.2635, "step": 211400 }, { "epoch": 0.183, "grad_norm": 0.2180163711309433, "learning_rate": 0.00011540040000000001, "loss": 0.2669, "step": 211500 }, { "epoch": 0.1832, "grad_norm": 0.20754875242710114, "learning_rate": 0.00011536040000000001, "loss": 0.252, "step": 211600 }, { "epoch": 0.1834, "grad_norm": 0.16631507873535156, "learning_rate": 0.0001153204, "loss": 0.256, "step": 211700 }, { "epoch": 0.1836, "grad_norm": 0.1716802716255188, "learning_rate": 0.0001152804, "loss": 0.2578, "step": 211800 }, { "epoch": 0.1838, "grad_norm": 0.22143660485744476, "learning_rate": 0.0001152404, "loss": 0.2566, "step": 211900 }, { "epoch": 0.184, "grad_norm": 0.2684420347213745, "learning_rate": 0.00011520040000000001, "loss": 0.2515, "step": 212000 }, { "epoch": 0.1842, "grad_norm": 0.22609145939350128, "learning_rate": 0.00011516040000000002, "loss": 0.2532, "step": 212100 }, { "epoch": 0.1844, "grad_norm": 0.1780252754688263, "learning_rate": 0.0001151204, "loss": 0.2579, "step": 212200 }, { "epoch": 0.1846, "grad_norm": 0.256816029548645, "learning_rate": 0.0001150804, "loss": 0.2628, "step": 212300 }, { "epoch": 0.1848, "grad_norm": 0.22887232899665833, "learning_rate": 0.0001150404, "loss": 0.254, "step": 212400 }, { "epoch": 0.185, "grad_norm": 0.17870065569877625, "learning_rate": 0.0001150004, "loss": 0.2525, "step": 212500 }, { "epoch": 0.1852, "grad_norm": 0.4473213255405426, "learning_rate": 0.00011496040000000001, "loss": 0.2565, "step": 212600 }, { "epoch": 0.1854, "grad_norm": 0.1810619980096817, 
"learning_rate": 0.00011492039999999999, "loss": 0.2565, "step": 212700 }, { "epoch": 0.1856, "grad_norm": 0.1680411398410797, "learning_rate": 0.0001148804, "loss": 0.2555, "step": 212800 }, { "epoch": 0.1858, "grad_norm": 0.22519907355308533, "learning_rate": 0.0001148404, "loss": 0.252, "step": 212900 }, { "epoch": 0.186, "grad_norm": 0.18328481912612915, "learning_rate": 0.00011480040000000001, "loss": 0.2567, "step": 213000 }, { "epoch": 0.1862, "grad_norm": 0.4796239733695984, "learning_rate": 0.0001147604, "loss": 0.2552, "step": 213100 }, { "epoch": 0.1864, "grad_norm": 0.17960181832313538, "learning_rate": 0.0001147204, "loss": 0.2552, "step": 213200 }, { "epoch": 0.1866, "grad_norm": 0.21092750132083893, "learning_rate": 0.0001146804, "loss": 0.2591, "step": 213300 }, { "epoch": 0.1868, "grad_norm": 0.1919945329427719, "learning_rate": 0.0001146404, "loss": 0.2524, "step": 213400 }, { "epoch": 0.187, "grad_norm": 0.1630232334136963, "learning_rate": 0.00011460040000000001, "loss": 0.2548, "step": 213500 }, { "epoch": 0.1872, "grad_norm": 0.15087801218032837, "learning_rate": 0.00011456040000000002, "loss": 0.2595, "step": 213600 }, { "epoch": 0.1874, "grad_norm": 0.31527626514434814, "learning_rate": 0.00011452040000000002, "loss": 0.2553, "step": 213700 }, { "epoch": 0.1876, "grad_norm": 0.27339211106300354, "learning_rate": 0.0001144804, "loss": 0.2617, "step": 213800 }, { "epoch": 0.1878, "grad_norm": 0.19266119599342346, "learning_rate": 0.00011444040000000001, "loss": 0.2628, "step": 213900 }, { "epoch": 0.188, "grad_norm": 0.26474958658218384, "learning_rate": 0.0001144004, "loss": 0.2533, "step": 214000 }, { "epoch": 0.1882, "grad_norm": 0.16048365831375122, "learning_rate": 0.00011436040000000001, "loss": 0.261, "step": 214100 }, { "epoch": 0.1884, "grad_norm": 0.20829501748085022, "learning_rate": 0.00011432040000000002, "loss": 0.2562, "step": 214200 }, { "epoch": 0.1886, "grad_norm": 0.2848805785179138, "learning_rate": 0.0001142804, "loss": 
0.2586, "step": 214300 }, { "epoch": 0.1888, "grad_norm": 0.2071709930896759, "learning_rate": 0.0001142404, "loss": 0.2534, "step": 214400 }, { "epoch": 0.189, "grad_norm": 0.20445095002651215, "learning_rate": 0.00011420040000000001, "loss": 0.2553, "step": 214500 }, { "epoch": 0.1892, "grad_norm": 0.1799619346857071, "learning_rate": 0.0001141604, "loss": 0.2571, "step": 214600 }, { "epoch": 0.1894, "grad_norm": 0.23064669966697693, "learning_rate": 0.00011412040000000001, "loss": 0.2581, "step": 214700 }, { "epoch": 0.1896, "grad_norm": 0.20549724996089935, "learning_rate": 0.00011408039999999999, "loss": 0.2563, "step": 214800 }, { "epoch": 0.1898, "grad_norm": 0.3230568766593933, "learning_rate": 0.0001140404, "loss": 0.2549, "step": 214900 }, { "epoch": 0.19, "grad_norm": 0.15154241025447845, "learning_rate": 0.00011400040000000001, "loss": 0.256, "step": 215000 }, { "epoch": 0.1902, "grad_norm": 0.18913942575454712, "learning_rate": 0.00011396040000000001, "loss": 0.2632, "step": 215100 }, { "epoch": 0.1904, "grad_norm": 0.19355298578739166, "learning_rate": 0.00011392040000000002, "loss": 0.2577, "step": 215200 }, { "epoch": 0.1906, "grad_norm": 0.16174174845218658, "learning_rate": 0.0001138804, "loss": 0.2504, "step": 215300 }, { "epoch": 0.1908, "grad_norm": 0.24760814011096954, "learning_rate": 0.00011384040000000001, "loss": 0.2541, "step": 215400 }, { "epoch": 0.191, "grad_norm": 0.19982276856899261, "learning_rate": 0.0001138004, "loss": 0.2553, "step": 215500 }, { "epoch": 0.1912, "grad_norm": 0.22562120854854584, "learning_rate": 0.00011376040000000001, "loss": 0.2564, "step": 215600 }, { "epoch": 0.1914, "grad_norm": 0.2680492401123047, "learning_rate": 0.00011372040000000002, "loss": 0.2582, "step": 215700 }, { "epoch": 0.1916, "grad_norm": 0.22115637362003326, "learning_rate": 0.0001136804, "loss": 0.2576, "step": 215800 }, { "epoch": 0.1918, "grad_norm": 0.19587989151477814, "learning_rate": 0.0001136404, "loss": 0.2566, "step": 215900 }, { 
"epoch": 0.192, "grad_norm": 0.25761643052101135, "learning_rate": 0.00011360040000000001, "loss": 0.2572, "step": 216000 }, { "epoch": 0.1922, "grad_norm": 0.1627892553806305, "learning_rate": 0.0001135604, "loss": 0.255, "step": 216100 }, { "epoch": 0.1924, "grad_norm": 0.18419532477855682, "learning_rate": 0.00011352040000000001, "loss": 0.2558, "step": 216200 }, { "epoch": 0.1926, "grad_norm": 0.18171828985214233, "learning_rate": 0.00011348039999999999, "loss": 0.2548, "step": 216300 }, { "epoch": 0.1928, "grad_norm": 0.16602705419063568, "learning_rate": 0.0001134404, "loss": 0.2542, "step": 216400 }, { "epoch": 0.193, "grad_norm": 0.17187577486038208, "learning_rate": 0.0001134004, "loss": 0.2559, "step": 216500 }, { "epoch": 0.1932, "grad_norm": 0.2111077755689621, "learning_rate": 0.00011336040000000001, "loss": 0.2578, "step": 216600 }, { "epoch": 0.1934, "grad_norm": 0.1766299456357956, "learning_rate": 0.00011332040000000002, "loss": 0.254, "step": 216700 }, { "epoch": 0.1936, "grad_norm": 0.2486806958913803, "learning_rate": 0.0001132804, "loss": 0.2642, "step": 216800 }, { "epoch": 0.1938, "grad_norm": 0.1899750679731369, "learning_rate": 0.00011324040000000001, "loss": 0.2542, "step": 216900 }, { "epoch": 0.194, "grad_norm": 0.18645033240318298, "learning_rate": 0.0001132004, "loss": 0.2589, "step": 217000 }, { "epoch": 0.1942, "grad_norm": 0.1879652589559555, "learning_rate": 0.00011316040000000001, "loss": 0.2623, "step": 217100 }, { "epoch": 0.1944, "grad_norm": 0.21320343017578125, "learning_rate": 0.00011312040000000002, "loss": 0.2572, "step": 217200 }, { "epoch": 0.1946, "grad_norm": 0.19344399869441986, "learning_rate": 0.0001130804, "loss": 0.2525, "step": 217300 }, { "epoch": 0.1948, "grad_norm": 0.16314131021499634, "learning_rate": 0.0001130404, "loss": 0.2532, "step": 217400 }, { "epoch": 0.195, "grad_norm": 0.153633713722229, "learning_rate": 0.00011300040000000001, "loss": 0.2613, "step": 217500 }, { "epoch": 0.1952, "grad_norm": 
0.231405109167099, "learning_rate": 0.00011296040000000002, "loss": 0.2613, "step": 217600 }, { "epoch": 0.1954, "grad_norm": 0.17982491850852966, "learning_rate": 0.00011292040000000001, "loss": 0.2568, "step": 217700 }, { "epoch": 0.1956, "grad_norm": 0.21746788918972015, "learning_rate": 0.00011288039999999999, "loss": 0.2552, "step": 217800 }, { "epoch": 0.1958, "grad_norm": 0.1724511682987213, "learning_rate": 0.0001128404, "loss": 0.2593, "step": 217900 }, { "epoch": 0.196, "grad_norm": 0.1525942087173462, "learning_rate": 0.0001128004, "loss": 0.2647, "step": 218000 }, { "epoch": 0.1962, "grad_norm": 0.20681273937225342, "learning_rate": 0.00011276040000000001, "loss": 0.2569, "step": 218100 }, { "epoch": 0.1964, "grad_norm": 0.25222960114479065, "learning_rate": 0.00011272040000000002, "loss": 0.256, "step": 218200 }, { "epoch": 0.1966, "grad_norm": 0.19064846634864807, "learning_rate": 0.0001126804, "loss": 0.2507, "step": 218300 }, { "epoch": 0.1968, "grad_norm": 0.1997394561767578, "learning_rate": 0.0001126404, "loss": 0.2612, "step": 218400 }, { "epoch": 0.197, "grad_norm": 0.1820058673620224, "learning_rate": 0.0001126004, "loss": 0.2559, "step": 218500 }, { "epoch": 0.1972, "grad_norm": 0.23284319043159485, "learning_rate": 0.00011256040000000001, "loss": 0.2578, "step": 218600 }, { "epoch": 0.1974, "grad_norm": 0.3271949291229248, "learning_rate": 0.00011252040000000001, "loss": 0.2555, "step": 218700 }, { "epoch": 0.1976, "grad_norm": 0.18260078132152557, "learning_rate": 0.0001124804, "loss": 0.255, "step": 218800 }, { "epoch": 0.1978, "grad_norm": 0.1611817330121994, "learning_rate": 0.0001124404, "loss": 0.2601, "step": 218900 }, { "epoch": 0.198, "grad_norm": 0.22105377912521362, "learning_rate": 0.00011240040000000001, "loss": 0.2551, "step": 219000 }, { "epoch": 0.1982, "grad_norm": 0.22783438861370087, "learning_rate": 0.00011236040000000002, "loss": 0.2536, "step": 219100 }, { "epoch": 0.1984, "grad_norm": 0.14097750186920166, 
"learning_rate": 0.00011232040000000001, "loss": 0.251, "step": 219200 }, { "epoch": 0.1986, "grad_norm": 0.16217005252838135, "learning_rate": 0.0001122804, "loss": 0.2546, "step": 219300 }, { "epoch": 0.1988, "grad_norm": 0.28094884753227234, "learning_rate": 0.0001122404, "loss": 0.2597, "step": 219400 }, { "epoch": 0.199, "grad_norm": 0.1714087426662445, "learning_rate": 0.0001122004, "loss": 0.2532, "step": 219500 }, { "epoch": 0.1992, "grad_norm": 0.2341964989900589, "learning_rate": 0.00011216040000000001, "loss": 0.2566, "step": 219600 }, { "epoch": 0.1994, "grad_norm": 0.16600032150745392, "learning_rate": 0.00011212040000000002, "loss": 0.2502, "step": 219700 }, { "epoch": 0.1996, "grad_norm": 0.3427354693412781, "learning_rate": 0.0001120804, "loss": 0.2568, "step": 219800 }, { "epoch": 0.1998, "grad_norm": 0.2476433664560318, "learning_rate": 0.0001120404, "loss": 0.2579, "step": 219900 }, { "epoch": 0.2, "grad_norm": 0.24420465528964996, "learning_rate": 0.0001120004, "loss": 0.2519, "step": 220000 }, { "epoch": 0.2002, "grad_norm": 0.4817318618297577, "learning_rate": 0.0001119604, "loss": 0.3021, "step": 220100 }, { "epoch": 0.2004, "grad_norm": 0.42361992597579956, "learning_rate": 0.00011192040000000001, "loss": 0.3307, "step": 220200 }, { "epoch": 0.2006, "grad_norm": 0.20908264815807343, "learning_rate": 0.0001118804, "loss": 0.316, "step": 220300 }, { "epoch": 0.2008, "grad_norm": 0.4234218895435333, "learning_rate": 0.0001118404, "loss": 0.3001, "step": 220400 }, { "epoch": 0.201, "grad_norm": 0.5726023316383362, "learning_rate": 0.00011180040000000001, "loss": 0.3016, "step": 220500 }, { "epoch": 0.2012, "grad_norm": 0.28417375683784485, "learning_rate": 0.00011176040000000002, "loss": 0.3389, "step": 220600 }, { "epoch": 0.2014, "grad_norm": 0.3290591537952423, "learning_rate": 0.00011172040000000001, "loss": 0.3019, "step": 220700 }, { "epoch": 0.2016, "grad_norm": 0.5273597836494446, "learning_rate": 0.0001116804, "loss": 0.3008, "step": 
220800 }, { "epoch": 0.2018, "grad_norm": 0.19379092752933502, "learning_rate": 0.0001116404, "loss": 0.3282, "step": 220900 }, { "epoch": 0.202, "grad_norm": 0.19746637344360352, "learning_rate": 0.0001116004, "loss": 0.2928, "step": 221000 }, { "epoch": 0.2022, "grad_norm": 0.2760812044143677, "learning_rate": 0.00011156040000000001, "loss": 0.2961, "step": 221100 }, { "epoch": 0.2024, "grad_norm": 0.21008889377117157, "learning_rate": 0.00011152040000000002, "loss": 0.2986, "step": 221200 }, { "epoch": 0.2026, "grad_norm": 0.4908127784729004, "learning_rate": 0.0001114804, "loss": 0.3456, "step": 221300 }, { "epoch": 0.2028, "grad_norm": 0.1982090175151825, "learning_rate": 0.0001114404, "loss": 0.3186, "step": 221400 }, { "epoch": 0.203, "grad_norm": 0.37534865736961365, "learning_rate": 0.0001114004, "loss": 0.2995, "step": 221500 }, { "epoch": 0.2032, "grad_norm": 0.24973396956920624, "learning_rate": 0.0001113604, "loss": 0.2996, "step": 221600 }, { "epoch": 0.2034, "grad_norm": 0.21024543046951294, "learning_rate": 0.00011132040000000001, "loss": 0.2907, "step": 221700 }, { "epoch": 0.2036, "grad_norm": 0.3225703537464142, "learning_rate": 0.00011128039999999999, "loss": 0.2911, "step": 221800 }, { "epoch": 0.2038, "grad_norm": 0.4581950306892395, "learning_rate": 0.0001112404, "loss": 0.3637, "step": 221900 }, { "epoch": 0.204, "grad_norm": 0.5433754920959473, "learning_rate": 0.00011120040000000001, "loss": 0.3487, "step": 222000 }, { "epoch": 0.2042, "grad_norm": 0.22788307070732117, "learning_rate": 0.00011116040000000001, "loss": 0.3004, "step": 222100 }, { "epoch": 0.2044, "grad_norm": 0.16002947092056274, "learning_rate": 0.00011112040000000001, "loss": 0.2993, "step": 222200 }, { "epoch": 0.2046, "grad_norm": 0.28923583030700684, "learning_rate": 0.0001110804, "loss": 0.3036, "step": 222300 }, { "epoch": 0.2048, "grad_norm": 0.3555202782154083, "learning_rate": 0.0001110404, "loss": 0.338, "step": 222400 }, { "epoch": 0.205, "grad_norm": 
0.1331017166376114, "learning_rate": 0.0001110004, "loss": 0.2904, "step": 222500 }, { "epoch": 0.2052, "grad_norm": 0.269748717546463, "learning_rate": 0.00011096040000000001, "loss": 0.3133, "step": 222600 }, { "epoch": 0.2054, "grad_norm": 0.919607937335968, "learning_rate": 0.00011092040000000002, "loss": 0.3133, "step": 222700 }, { "epoch": 0.2056, "grad_norm": 0.27156761288642883, "learning_rate": 0.0001108804, "loss": 0.3065, "step": 222800 }, { "epoch": 0.2058, "grad_norm": 0.48692193627357483, "learning_rate": 0.0001108404, "loss": 0.3025, "step": 222900 }, { "epoch": 0.206, "grad_norm": 0.8623552322387695, "learning_rate": 0.00011080040000000001, "loss": 0.296, "step": 223000 }, { "epoch": 0.2062, "grad_norm": 0.18269303441047668, "learning_rate": 0.0001107604, "loss": 0.277, "step": 223100 }, { "epoch": 0.2064, "grad_norm": 0.37067535519599915, "learning_rate": 0.00011072040000000001, "loss": 0.3043, "step": 223200 }, { "epoch": 0.2066, "grad_norm": 0.25582921504974365, "learning_rate": 0.00011068039999999999, "loss": 0.2949, "step": 223300 }, { "epoch": 0.2068, "grad_norm": 0.19148407876491547, "learning_rate": 0.0001106404, "loss": 0.2955, "step": 223400 }, { "epoch": 0.207, "grad_norm": 0.24919286370277405, "learning_rate": 0.0001106004, "loss": 0.3014, "step": 223500 }, { "epoch": 0.2072, "grad_norm": 0.24463938176631927, "learning_rate": 0.00011056040000000001, "loss": 0.3057, "step": 223600 }, { "epoch": 0.2074, "grad_norm": 0.35586199164390564, "learning_rate": 0.0001105204, "loss": 0.29, "step": 223700 }, { "epoch": 0.2076, "grad_norm": 0.319210946559906, "learning_rate": 0.0001104804, "loss": 0.368, "step": 223800 }, { "epoch": 0.2078, "grad_norm": 0.282400906085968, "learning_rate": 0.0001104404, "loss": 0.292, "step": 223900 }, { "epoch": 0.208, "grad_norm": 0.17433995008468628, "learning_rate": 0.0001104004, "loss": 0.2892, "step": 224000 }, { "epoch": 0.2082, "grad_norm": 0.5151282548904419, "learning_rate": 0.00011036040000000001, "loss": 
0.2962, "step": 224100 }, { "epoch": 0.2084, "grad_norm": 0.2073827087879181, "learning_rate": 0.00011032040000000002, "loss": 0.2928, "step": 224200 }, { "epoch": 0.2086, "grad_norm": 0.40052372217178345, "learning_rate": 0.0001102804, "loss": 0.297, "step": 224300 }, { "epoch": 0.2088, "grad_norm": 0.26308512687683105, "learning_rate": 0.0001102404, "loss": 0.3246, "step": 224400 }, { "epoch": 0.209, "grad_norm": 0.35187768936157227, "learning_rate": 0.00011020040000000001, "loss": 0.3031, "step": 224500 }, { "epoch": 0.2092, "grad_norm": 0.1798812747001648, "learning_rate": 0.0001101604, "loss": 0.2947, "step": 224600 }, { "epoch": 0.2094, "grad_norm": 0.2565075159072876, "learning_rate": 0.00011012040000000001, "loss": 0.3018, "step": 224700 }, { "epoch": 0.2096, "grad_norm": 0.2061193436384201, "learning_rate": 0.00011008039999999999, "loss": 0.2986, "step": 224800 }, { "epoch": 0.2098, "grad_norm": 0.15753225982189178, "learning_rate": 0.0001100404, "loss": 0.2993, "step": 224900 }, { "epoch": 0.21, "grad_norm": 0.18000277876853943, "learning_rate": 0.0001100004, "loss": 0.2809, "step": 225000 }, { "epoch": 0.2102, "grad_norm": 0.17245367169380188, "learning_rate": 0.00010996040000000001, "loss": 0.3003, "step": 225100 }, { "epoch": 0.2104, "grad_norm": 0.27620208263397217, "learning_rate": 0.0001099204, "loss": 0.2863, "step": 225200 }, { "epoch": 0.2106, "grad_norm": 0.1691516935825348, "learning_rate": 0.0001098804, "loss": 0.3085, "step": 225300 }, { "epoch": 0.2108, "grad_norm": 0.25270703434944153, "learning_rate": 0.00010984039999999999, "loss": 0.3018, "step": 225400 }, { "epoch": 0.211, "grad_norm": 0.26797574758529663, "learning_rate": 0.0001098004, "loss": 0.2919, "step": 225500 }, { "epoch": 0.2112, "grad_norm": 0.1746404618024826, "learning_rate": 0.00010976040000000001, "loss": 0.282, "step": 225600 }, { "epoch": 0.2114, "grad_norm": 0.2220151722431183, "learning_rate": 0.00010972040000000001, "loss": 0.3083, "step": 225700 }, { "epoch": 0.2116, 
"grad_norm": 0.3341788947582245, "learning_rate": 0.0001096804, "loss": 0.2846, "step": 225800 }, { "epoch": 0.2118, "grad_norm": 0.2519269585609436, "learning_rate": 0.0001096404, "loss": 0.3022, "step": 225900 }, { "epoch": 0.212, "grad_norm": 0.4205581545829773, "learning_rate": 0.00010960040000000001, "loss": 0.3163, "step": 226000 }, { "epoch": 0.2122, "grad_norm": 0.2838189899921417, "learning_rate": 0.0001095604, "loss": 0.2892, "step": 226100 }, { "epoch": 0.2124, "grad_norm": 0.25370433926582336, "learning_rate": 0.00010952040000000001, "loss": 0.282, "step": 226200 }, { "epoch": 0.2126, "grad_norm": 0.26368477940559387, "learning_rate": 0.00010948040000000002, "loss": 0.2784, "step": 226300 }, { "epoch": 0.2128, "grad_norm": 0.46951645612716675, "learning_rate": 0.0001094404, "loss": 0.3009, "step": 226400 }, { "epoch": 0.213, "grad_norm": 0.4232272207736969, "learning_rate": 0.0001094004, "loss": 0.2887, "step": 226500 }, { "epoch": 0.2132, "grad_norm": 0.19953791797161102, "learning_rate": 0.00010936040000000001, "loss": 0.2851, "step": 226600 }, { "epoch": 0.2134, "grad_norm": 0.28893914818763733, "learning_rate": 0.00010932040000000002, "loss": 0.2904, "step": 226700 }, { "epoch": 0.2136, "grad_norm": 0.18542243540287018, "learning_rate": 0.00010928040000000001, "loss": 0.2925, "step": 226800 }, { "epoch": 0.2138, "grad_norm": 0.2543767988681793, "learning_rate": 0.00010924039999999999, "loss": 0.2859, "step": 226900 }, { "epoch": 0.214, "grad_norm": 0.380341500043869, "learning_rate": 0.0001092004, "loss": 0.2948, "step": 227000 }, { "epoch": 0.2142, "grad_norm": 0.6200868487358093, "learning_rate": 0.0001091604, "loss": 0.3034, "step": 227100 }, { "epoch": 0.2144, "grad_norm": 0.40341097116470337, "learning_rate": 0.00010912040000000001, "loss": 0.2803, "step": 227200 }, { "epoch": 0.2146, "grad_norm": 0.27767351269721985, "learning_rate": 0.00010908040000000002, "loss": 0.2922, "step": 227300 }, { "epoch": 0.2148, "grad_norm": 0.18957166373729706, 
"learning_rate": 0.0001090404, "loss": 0.3052, "step": 227400 }, { "epoch": 0.215, "grad_norm": 0.21950003504753113, "learning_rate": 0.00010900040000000001, "loss": 0.3, "step": 227500 }, { "epoch": 0.2152, "grad_norm": 0.36292436718940735, "learning_rate": 0.0001089604, "loss": 0.291, "step": 227600 }, { "epoch": 0.2154, "grad_norm": 0.4383956789970398, "learning_rate": 0.00010892040000000001, "loss": 0.302, "step": 227700 }, { "epoch": 0.2156, "grad_norm": 0.18310613930225372, "learning_rate": 0.00010888040000000002, "loss": 0.3081, "step": 227800 }, { "epoch": 0.2158, "grad_norm": 0.21227945387363434, "learning_rate": 0.0001088404, "loss": 0.2791, "step": 227900 }, { "epoch": 0.216, "grad_norm": 0.25162750482559204, "learning_rate": 0.0001088004, "loss": 0.2922, "step": 228000 }, { "epoch": 0.2162, "grad_norm": 0.4016803205013275, "learning_rate": 0.00010876040000000001, "loss": 0.325, "step": 228100 }, { "epoch": 0.2164, "grad_norm": 0.24426665902137756, "learning_rate": 0.00010872040000000002, "loss": 0.2946, "step": 228200 }, { "epoch": 0.2166, "grad_norm": 0.35735762119293213, "learning_rate": 0.00010868040000000001, "loss": 0.2953, "step": 228300 }, { "epoch": 0.2168, "grad_norm": 0.17666518688201904, "learning_rate": 0.0001086404, "loss": 0.2933, "step": 228400 }, { "epoch": 0.217, "grad_norm": 0.4070577323436737, "learning_rate": 0.0001086004, "loss": 0.2955, "step": 228500 }, { "epoch": 0.2172, "grad_norm": 0.24427427351474762, "learning_rate": 0.0001085604, "loss": 0.2858, "step": 228600 }, { "epoch": 0.2174, "grad_norm": 0.21364831924438477, "learning_rate": 0.00010852040000000001, "loss": 0.2919, "step": 228700 }, { "epoch": 0.2176, "grad_norm": 0.20407788455486298, "learning_rate": 0.00010848040000000002, "loss": 0.2898, "step": 228800 }, { "epoch": 0.2178, "grad_norm": 0.31540021300315857, "learning_rate": 0.0001084404, "loss": 0.2987, "step": 228900 }, { "epoch": 0.218, "grad_norm": 0.15270252525806427, "learning_rate": 0.0001084004, "loss": 
0.2872, "step": 229000 }, { "epoch": 0.2182, "grad_norm": 0.23393623530864716, "learning_rate": 0.0001083604, "loss": 0.3058, "step": 229100 }, { "epoch": 0.2184, "grad_norm": 0.5334755778312683, "learning_rate": 0.00010832040000000001, "loss": 0.29, "step": 229200 }, { "epoch": 0.2186, "grad_norm": 0.1923484355211258, "learning_rate": 0.00010828040000000001, "loss": 0.2873, "step": 229300 }, { "epoch": 0.2188, "grad_norm": 0.23377379775047302, "learning_rate": 0.0001082404, "loss": 0.2823, "step": 229400 }, { "epoch": 0.219, "grad_norm": 0.22919674217700958, "learning_rate": 0.0001082004, "loss": 0.2836, "step": 229500 }, { "epoch": 0.2192, "grad_norm": 0.20516985654830933, "learning_rate": 0.00010816040000000001, "loss": 0.29, "step": 229600 }, { "epoch": 0.2194, "grad_norm": 0.19901040196418762, "learning_rate": 0.00010812040000000002, "loss": 0.2963, "step": 229700 }, { "epoch": 0.2196, "grad_norm": 0.24688464403152466, "learning_rate": 0.00010808040000000001, "loss": 0.2855, "step": 229800 }, { "epoch": 0.2198, "grad_norm": 0.23144333064556122, "learning_rate": 0.0001080404, "loss": 0.2964, "step": 229900 }, { "epoch": 0.22, "grad_norm": 0.2202957421541214, "learning_rate": 0.0001080004, "loss": 0.2925, "step": 230000 }, { "epoch": 0.2202, "grad_norm": 0.23589707911014557, "learning_rate": 0.0001079604, "loss": 0.2759, "step": 230100 }, { "epoch": 0.2204, "grad_norm": 0.31147778034210205, "learning_rate": 0.00010792040000000001, "loss": 0.2668, "step": 230200 }, { "epoch": 0.2206, "grad_norm": 0.36505284905433655, "learning_rate": 0.00010788040000000002, "loss": 0.2807, "step": 230300 }, { "epoch": 0.2208, "grad_norm": 0.24104100465774536, "learning_rate": 0.0001078404, "loss": 0.2802, "step": 230400 }, { "epoch": 0.221, "grad_norm": 0.2541787922382355, "learning_rate": 0.0001078004, "loss": 0.2738, "step": 230500 }, { "epoch": 0.2212, "grad_norm": 0.18488843739032745, "learning_rate": 0.0001077604, "loss": 0.2781, "step": 230600 }, { "epoch": 0.2214, 
"grad_norm": 0.30733954906463623, "learning_rate": 0.0001077204, "loss": 0.2689, "step": 230700 }, { "epoch": 0.2216, "grad_norm": 0.4184741675853729, "learning_rate": 0.00010768040000000001, "loss": 0.2821, "step": 230800 }, { "epoch": 0.2218, "grad_norm": 0.234775573015213, "learning_rate": 0.0001076404, "loss": 0.2788, "step": 230900 }, { "epoch": 0.222, "grad_norm": 0.1866404116153717, "learning_rate": 0.0001076004, "loss": 0.2805, "step": 231000 }, { "epoch": 0.2222, "grad_norm": 0.20000393688678741, "learning_rate": 0.00010756040000000001, "loss": 0.2786, "step": 231100 }, { "epoch": 0.2224, "grad_norm": 0.2550050914287567, "learning_rate": 0.00010752040000000001, "loss": 0.2808, "step": 231200 }, { "epoch": 0.2226, "grad_norm": 0.19352710247039795, "learning_rate": 0.00010748040000000001, "loss": 0.2794, "step": 231300 }, { "epoch": 0.2228, "grad_norm": 0.33794355392456055, "learning_rate": 0.0001074404, "loss": 0.2654, "step": 231400 }, { "epoch": 0.223, "grad_norm": 0.1755092740058899, "learning_rate": 0.0001074004, "loss": 0.2777, "step": 231500 }, { "epoch": 0.2232, "grad_norm": 0.2609795331954956, "learning_rate": 0.0001073604, "loss": 0.2749, "step": 231600 }, { "epoch": 0.2234, "grad_norm": 0.23666484653949738, "learning_rate": 0.00010732040000000001, "loss": 0.2837, "step": 231700 }, { "epoch": 0.2236, "grad_norm": 0.28469526767730713, "learning_rate": 0.00010728040000000002, "loss": 0.283, "step": 231800 }, { "epoch": 0.2238, "grad_norm": 0.21141879260540009, "learning_rate": 0.0001072404, "loss": 0.2922, "step": 231900 }, { "epoch": 0.224, "grad_norm": 0.19850678741931915, "learning_rate": 0.0001072004, "loss": 0.2771, "step": 232000 }, { "epoch": 0.2242, "grad_norm": 0.24743816256523132, "learning_rate": 0.00010716040000000001, "loss": 0.2749, "step": 232100 }, { "epoch": 0.2244, "grad_norm": 0.3401179015636444, "learning_rate": 0.0001071204, "loss": 0.2781, "step": 232200 }, { "epoch": 0.2246, "grad_norm": 0.2130950391292572, "learning_rate": 
0.00010708040000000001, "loss": 0.27, "step": 232300 }, { "epoch": 0.2248, "grad_norm": 0.37853744626045227, "learning_rate": 0.00010704039999999999, "loss": 0.2838, "step": 232400 }, { "epoch": 0.225, "grad_norm": 0.20562714338302612, "learning_rate": 0.0001070004, "loss": 0.2653, "step": 232500 }, { "epoch": 0.2252, "grad_norm": 0.17086434364318848, "learning_rate": 0.00010696040000000001, "loss": 0.2765, "step": 232600 }, { "epoch": 0.2254, "grad_norm": 0.3876626193523407, "learning_rate": 0.00010692040000000001, "loss": 0.2758, "step": 232700 }, { "epoch": 0.2256, "grad_norm": 0.29403626918792725, "learning_rate": 0.00010688040000000001, "loss": 0.2783, "step": 232800 }, { "epoch": 0.2258, "grad_norm": 0.3147401809692383, "learning_rate": 0.0001068404, "loss": 0.2843, "step": 232900 }, { "epoch": 0.226, "grad_norm": 0.38840705156326294, "learning_rate": 0.0001068004, "loss": 0.3095, "step": 233000 }, { "epoch": 0.2262, "grad_norm": 0.2079588621854782, "learning_rate": 0.0001067604, "loss": 0.2971, "step": 233100 }, { "epoch": 0.2264, "grad_norm": 0.293052077293396, "learning_rate": 0.00010672040000000001, "loss": 0.2787, "step": 233200 }, { "epoch": 0.2266, "grad_norm": 0.1692897379398346, "learning_rate": 0.00010668040000000002, "loss": 0.2939, "step": 233300 }, { "epoch": 0.2268, "grad_norm": 0.19356147944927216, "learning_rate": 0.0001066404, "loss": 0.2901, "step": 233400 }, { "epoch": 0.227, "grad_norm": 0.2273617833852768, "learning_rate": 0.0001066004, "loss": 0.2794, "step": 233500 }, { "epoch": 0.2272, "grad_norm": 0.21963585913181305, "learning_rate": 0.00010656040000000001, "loss": 0.2882, "step": 233600 }, { "epoch": 0.2274, "grad_norm": 0.3202444911003113, "learning_rate": 0.0001065204, "loss": 0.2735, "step": 233700 }, { "epoch": 0.2276, "grad_norm": 0.2695014774799347, "learning_rate": 0.00010648040000000001, "loss": 0.288, "step": 233800 }, { "epoch": 0.2278, "grad_norm": 0.1968889981508255, "learning_rate": 0.00010644039999999999, "loss": 
0.2818, "step": 233900 }, { "epoch": 0.228, "grad_norm": 0.2076377421617508, "learning_rate": 0.0001064004, "loss": 0.2806, "step": 234000 }, { "epoch": 0.2282, "grad_norm": 0.17942437529563904, "learning_rate": 0.0001063604, "loss": 0.2874, "step": 234100 }, { "epoch": 0.2284, "grad_norm": 0.20164412260055542, "learning_rate": 0.00010632040000000001, "loss": 0.271, "step": 234200 }, { "epoch": 0.2286, "grad_norm": 0.3018455505371094, "learning_rate": 0.0001062804, "loss": 0.2787, "step": 234300 }, { "epoch": 0.2288, "grad_norm": 0.18581052124500275, "learning_rate": 0.0001062404, "loss": 0.271, "step": 234400 }, { "epoch": 0.229, "grad_norm": 0.36110278964042664, "learning_rate": 0.0001062004, "loss": 0.2856, "step": 234500 }, { "epoch": 0.2292, "grad_norm": 0.21756871044635773, "learning_rate": 0.0001061604, "loss": 0.2872, "step": 234600 }, { "epoch": 0.2294, "grad_norm": 0.1998421549797058, "learning_rate": 0.00010612040000000001, "loss": 0.28, "step": 234700 }, { "epoch": 0.2296, "grad_norm": 0.17958033084869385, "learning_rate": 0.00010608040000000002, "loss": 0.2811, "step": 234800 }, { "epoch": 0.2298, "grad_norm": 0.6013987064361572, "learning_rate": 0.0001060404, "loss": 0.2732, "step": 234900 }, { "epoch": 0.23, "grad_norm": 0.20178425312042236, "learning_rate": 0.0001060004, "loss": 0.2713, "step": 235000 }, { "epoch": 0.2302, "grad_norm": 0.2956780791282654, "learning_rate": 0.00010596040000000001, "loss": 0.2775, "step": 235100 }, { "epoch": 0.2304, "grad_norm": 0.2311795949935913, "learning_rate": 0.0001059204, "loss": 0.2807, "step": 235200 }, { "epoch": 0.2306, "grad_norm": 0.24748362600803375, "learning_rate": 0.00010588040000000001, "loss": 0.2657, "step": 235300 }, { "epoch": 0.2308, "grad_norm": 0.38484346866607666, "learning_rate": 0.00010584039999999999, "loss": 0.2714, "step": 235400 }, { "epoch": 0.231, "grad_norm": 0.20243670046329498, "learning_rate": 0.0001058004, "loss": 0.2783, "step": 235500 }, { "epoch": 0.2312, "grad_norm": 
0.5278353095054626, "learning_rate": 0.0001057604, "loss": 0.2779, "step": 235600 }, { "epoch": 0.2314, "grad_norm": 0.27309587597846985, "learning_rate": 0.00010572040000000001, "loss": 0.2695, "step": 235700 }, { "epoch": 0.2316, "grad_norm": 0.16521087288856506, "learning_rate": 0.00010568040000000002, "loss": 0.2732, "step": 235800 }, { "epoch": 0.2318, "grad_norm": 0.5062386393547058, "learning_rate": 0.0001056404, "loss": 0.2878, "step": 235900 }, { "epoch": 0.232, "grad_norm": 0.19394774734973907, "learning_rate": 0.00010560039999999999, "loss": 0.2851, "step": 236000 }, { "epoch": 0.2322, "grad_norm": 0.171609029173851, "learning_rate": 0.0001055604, "loss": 0.2809, "step": 236100 }, { "epoch": 0.2324, "grad_norm": 0.30568891763687134, "learning_rate": 0.00010552040000000001, "loss": 0.2952, "step": 236200 }, { "epoch": 0.2326, "grad_norm": 0.32132649421691895, "learning_rate": 0.00010548040000000001, "loss": 0.2751, "step": 236300 }, { "epoch": 0.2328, "grad_norm": 0.30104103684425354, "learning_rate": 0.0001054404, "loss": 0.285, "step": 236400 }, { "epoch": 0.233, "grad_norm": 0.19023063778877258, "learning_rate": 0.0001054004, "loss": 0.3591, "step": 236500 }, { "epoch": 0.2332, "grad_norm": 0.1921398788690567, "learning_rate": 0.00010536040000000001, "loss": 0.2813, "step": 236600 }, { "epoch": 0.2334, "grad_norm": 0.15040914714336395, "learning_rate": 0.0001053204, "loss": 0.2873, "step": 236700 }, { "epoch": 0.2336, "grad_norm": 0.18588508665561676, "learning_rate": 0.00010528040000000001, "loss": 0.2722, "step": 236800 }, { "epoch": 0.2338, "grad_norm": 0.3410860002040863, "learning_rate": 0.00010524039999999999, "loss": 0.2811, "step": 236900 }, { "epoch": 0.234, "grad_norm": 0.21273022890090942, "learning_rate": 0.0001052004, "loss": 0.2892, "step": 237000 }, { "epoch": 0.2342, "grad_norm": 0.20495010912418365, "learning_rate": 0.0001051604, "loss": 0.2826, "step": 237100 }, { "epoch": 0.2344, "grad_norm": 0.3774225413799286, "learning_rate": 
0.00010512040000000001, "loss": 0.2704, "step": 237200 }, { "epoch": 0.2346, "grad_norm": 0.1908106654882431, "learning_rate": 0.00010508040000000002, "loss": 0.2854, "step": 237300 }, { "epoch": 0.2348, "grad_norm": 0.333554208278656, "learning_rate": 0.0001050404, "loss": 0.276, "step": 237400 }, { "epoch": 0.235, "grad_norm": 0.2886081337928772, "learning_rate": 0.0001050004, "loss": 0.2737, "step": 237500 }, { "epoch": 0.2352, "grad_norm": 0.2619887888431549, "learning_rate": 0.0001049604, "loss": 0.2718, "step": 237600 }, { "epoch": 0.2354, "grad_norm": 0.2604043483734131, "learning_rate": 0.0001049204, "loss": 0.2757, "step": 237700 }, { "epoch": 0.2356, "grad_norm": 0.20619331300258636, "learning_rate": 0.00010488040000000001, "loss": 0.2722, "step": 237800 }, { "epoch": 0.2358, "grad_norm": 0.1808670163154602, "learning_rate": 0.00010484039999999999, "loss": 0.2715, "step": 237900 }, { "epoch": 0.236, "grad_norm": 0.39049023389816284, "learning_rate": 0.0001048004, "loss": 0.271, "step": 238000 }, { "epoch": 0.2362, "grad_norm": 0.2106139063835144, "learning_rate": 0.00010476040000000001, "loss": 0.2775, "step": 238100 }, { "epoch": 0.2364, "grad_norm": 0.36072278022766113, "learning_rate": 0.0001047204, "loss": 0.2719, "step": 238200 }, { "epoch": 0.2366, "grad_norm": 0.1794702261686325, "learning_rate": 0.00010468040000000001, "loss": 0.2865, "step": 238300 }, { "epoch": 0.2368, "grad_norm": 0.3740937113761902, "learning_rate": 0.00010464039999999999, "loss": 0.2725, "step": 238400 }, { "epoch": 0.237, "grad_norm": 0.2138763815164566, "learning_rate": 0.0001046004, "loss": 0.2776, "step": 238500 }, { "epoch": 0.2372, "grad_norm": 0.21753883361816406, "learning_rate": 0.0001045604, "loss": 0.2854, "step": 238600 }, { "epoch": 0.2374, "grad_norm": 0.19092372059822083, "learning_rate": 0.00010452040000000001, "loss": 0.2785, "step": 238700 }, { "epoch": 0.2376, "grad_norm": 0.18898741900920868, "learning_rate": 0.00010448040000000002, "loss": 0.2747, "step": 
238800 }, { "epoch": 0.2378, "grad_norm": 0.3324822187423706, "learning_rate": 0.00010444040000000001, "loss": 0.2685, "step": 238900 }, { "epoch": 0.238, "grad_norm": 0.24389317631721497, "learning_rate": 0.0001044004, "loss": 0.2748, "step": 239000 }, { "epoch": 0.2382, "grad_norm": 0.23654966056346893, "learning_rate": 0.0001043604, "loss": 0.2845, "step": 239100 }, { "epoch": 0.2384, "grad_norm": 0.3116486072540283, "learning_rate": 0.0001043204, "loss": 0.2732, "step": 239200 }, { "epoch": 0.2386, "grad_norm": 0.33123424649238586, "learning_rate": 0.00010428040000000001, "loss": 0.2974, "step": 239300 }, { "epoch": 0.2388, "grad_norm": 0.2512767016887665, "learning_rate": 0.00010424040000000002, "loss": 0.2799, "step": 239400 }, { "epoch": 0.239, "grad_norm": 0.1622258871793747, "learning_rate": 0.0001042004, "loss": 0.2889, "step": 239500 }, { "epoch": 0.2392, "grad_norm": 0.33309224247932434, "learning_rate": 0.0001041604, "loss": 0.2861, "step": 239600 }, { "epoch": 0.2394, "grad_norm": 0.657027542591095, "learning_rate": 0.0001041204, "loss": 0.2772, "step": 239700 }, { "epoch": 0.2396, "grad_norm": 0.33285826444625854, "learning_rate": 0.00010408040000000001, "loss": 0.2832, "step": 239800 }, { "epoch": 0.2398, "grad_norm": 0.3333481550216675, "learning_rate": 0.00010404040000000001, "loss": 0.2798, "step": 239900 }, { "epoch": 0.24, "grad_norm": 0.17029236257076263, "learning_rate": 0.0001040004, "loss": 0.2781, "step": 240000 }, { "epoch": 0.2402, "grad_norm": 0.2667362689971924, "learning_rate": 0.0001039604, "loss": 0.2745, "step": 240100 }, { "epoch": 0.2404, "grad_norm": 0.2005651295185089, "learning_rate": 0.00010392040000000001, "loss": 0.2708, "step": 240200 }, { "epoch": 0.2406, "grad_norm": 0.22746022045612335, "learning_rate": 0.00010388040000000002, "loss": 0.2826, "step": 240300 }, { "epoch": 0.2408, "grad_norm": 0.4452439546585083, "learning_rate": 0.00010384040000000001, "loss": 0.2786, "step": 240400 }, { "epoch": 0.241, "grad_norm": 
0.2846798002719879, "learning_rate": 0.0001038004, "loss": 0.2759, "step": 240500 }, { "epoch": 0.2412, "grad_norm": 0.5280228853225708, "learning_rate": 0.0001037604, "loss": 0.2831, "step": 240600 }, { "epoch": 0.2414, "grad_norm": 0.18407422304153442, "learning_rate": 0.0001037204, "loss": 0.2861, "step": 240700 }, { "epoch": 0.2416, "grad_norm": 0.27900245785713196, "learning_rate": 0.00010368040000000001, "loss": 0.2761, "step": 240800 }, { "epoch": 0.2418, "grad_norm": 0.4371165931224823, "learning_rate": 0.00010364040000000002, "loss": 0.2741, "step": 240900 }, { "epoch": 0.242, "grad_norm": 0.1729346215724945, "learning_rate": 0.0001036004, "loss": 0.2749, "step": 241000 }, { "epoch": 0.2422, "grad_norm": 0.3014446794986725, "learning_rate": 0.0001035604, "loss": 0.2757, "step": 241100 }, { "epoch": 0.2424, "grad_norm": 0.2890118956565857, "learning_rate": 0.00010352040000000001, "loss": 0.2748, "step": 241200 }, { "epoch": 0.2426, "grad_norm": 0.18363769352436066, "learning_rate": 0.0001034804, "loss": 0.2786, "step": 241300 }, { "epoch": 0.2428, "grad_norm": 0.23119623959064484, "learning_rate": 0.00010344040000000001, "loss": 0.2763, "step": 241400 }, { "epoch": 0.243, "grad_norm": 0.23859907686710358, "learning_rate": 0.0001034004, "loss": 0.2766, "step": 241500 }, { "epoch": 0.2432, "grad_norm": 0.2567295432090759, "learning_rate": 0.0001033604, "loss": 0.2733, "step": 241600 }, { "epoch": 0.2434, "grad_norm": 0.3161749243736267, "learning_rate": 0.00010332040000000001, "loss": 0.2697, "step": 241700 }, { "epoch": 0.2436, "grad_norm": 0.31291186809539795, "learning_rate": 0.00010328040000000001, "loss": 0.2612, "step": 241800 }, { "epoch": 0.2438, "grad_norm": 0.23509889841079712, "learning_rate": 0.00010324040000000001, "loss": 0.2825, "step": 241900 }, { "epoch": 0.244, "grad_norm": 0.21681292355060577, "learning_rate": 0.0001032004, "loss": 0.2797, "step": 242000 }, { "epoch": 0.2442, "grad_norm": 0.2644917070865631, "learning_rate": 0.0001031604, 
"loss": 0.271, "step": 242100 }, { "epoch": 0.2444, "grad_norm": 0.536569356918335, "learning_rate": 0.0001031204, "loss": 0.2731, "step": 242200 }, { "epoch": 0.2446, "grad_norm": 0.44594717025756836, "learning_rate": 0.00010308040000000001, "loss": 0.2774, "step": 242300 }, { "epoch": 0.2448, "grad_norm": 0.309821754693985, "learning_rate": 0.00010304040000000002, "loss": 0.2899, "step": 242400 }, { "epoch": 0.245, "grad_norm": 0.28232014179229736, "learning_rate": 0.0001030004, "loss": 0.2944, "step": 242500 }, { "epoch": 0.2452, "grad_norm": 0.29917338490486145, "learning_rate": 0.0001029604, "loss": 0.2795, "step": 242600 }, { "epoch": 0.2454, "grad_norm": 0.23704950511455536, "learning_rate": 0.00010292040000000001, "loss": 0.2796, "step": 242700 }, { "epoch": 0.2456, "grad_norm": 0.21638484299182892, "learning_rate": 0.0001028804, "loss": 0.2636, "step": 242800 }, { "epoch": 0.2458, "grad_norm": 0.18547122180461884, "learning_rate": 0.00010284040000000001, "loss": 0.2669, "step": 242900 }, { "epoch": 0.246, "grad_norm": 0.17270751297473907, "learning_rate": 0.00010280039999999999, "loss": 0.2593, "step": 243000 }, { "epoch": 0.2462, "grad_norm": 0.16788358986377716, "learning_rate": 0.0001027604, "loss": 0.2592, "step": 243100 }, { "epoch": 0.2464, "grad_norm": 0.1689780205488205, "learning_rate": 0.0001027204, "loss": 0.2647, "step": 243200 }, { "epoch": 0.2466, "grad_norm": 0.28541648387908936, "learning_rate": 0.00010268040000000001, "loss": 0.2598, "step": 243300 }, { "epoch": 0.2468, "grad_norm": 0.24988971650600433, "learning_rate": 0.00010264040000000002, "loss": 0.2606, "step": 243400 }, { "epoch": 0.247, "grad_norm": 0.17981068789958954, "learning_rate": 0.0001026004, "loss": 0.2581, "step": 243500 }, { "epoch": 0.2472, "grad_norm": 0.19057908654212952, "learning_rate": 0.0001025604, "loss": 0.2645, "step": 243600 }, { "epoch": 0.2474, "grad_norm": 0.1446358561515808, "learning_rate": 0.0001025204, "loss": 0.2513, "step": 243700 }, { "epoch": 
0.2476, "grad_norm": 0.18903489410877228, "learning_rate": 0.00010248040000000001, "loss": 0.2577, "step": 243800 }, { "epoch": 0.2478, "grad_norm": 0.18427778780460358, "learning_rate": 0.00010244040000000002, "loss": 0.2549, "step": 243900 }, { "epoch": 0.248, "grad_norm": 0.17634162306785583, "learning_rate": 0.0001024004, "loss": 0.2544, "step": 244000 }, { "epoch": 0.2482, "grad_norm": 0.15997561812400818, "learning_rate": 0.0001023604, "loss": 0.257, "step": 244100 }, { "epoch": 0.2484, "grad_norm": 0.13847199082374573, "learning_rate": 0.00010232040000000001, "loss": 0.2518, "step": 244200 }, { "epoch": 0.2486, "grad_norm": 0.19378510117530823, "learning_rate": 0.0001022804, "loss": 0.2608, "step": 244300 }, { "epoch": 0.2488, "grad_norm": 0.2129637449979782, "learning_rate": 0.00010224040000000001, "loss": 0.2586, "step": 244400 }, { "epoch": 0.249, "grad_norm": 0.28681081533432007, "learning_rate": 0.00010220039999999999, "loss": 0.259, "step": 244500 }, { "epoch": 0.2492, "grad_norm": 0.20721031725406647, "learning_rate": 0.0001021604, "loss": 0.2585, "step": 244600 }, { "epoch": 0.2494, "grad_norm": 0.2532545328140259, "learning_rate": 0.0001021204, "loss": 0.2532, "step": 244700 }, { "epoch": 0.2496, "grad_norm": 0.28852516412734985, "learning_rate": 0.00010208040000000001, "loss": 0.255, "step": 244800 }, { "epoch": 0.2498, "grad_norm": 0.1656378209590912, "learning_rate": 0.00010204040000000002, "loss": 0.2537, "step": 244900 }, { "epoch": 0.25, "grad_norm": 0.2153751105070114, "learning_rate": 0.0001020004, "loss": 0.259, "step": 245000 }, { "epoch": 0.2502, "grad_norm": 0.19507664442062378, "learning_rate": 0.0001019604, "loss": 0.2587, "step": 245100 }, { "epoch": 0.2504, "grad_norm": 0.1736658811569214, "learning_rate": 0.0001019204, "loss": 0.2612, "step": 245200 }, { "epoch": 0.2506, "grad_norm": 0.18013814091682434, "learning_rate": 0.00010188040000000001, "loss": 0.2554, "step": 245300 }, { "epoch": 0.2508, "grad_norm": 0.2120230495929718, 
"learning_rate": 0.00010184040000000002, "loss": 0.2519, "step": 245400 }, { "epoch": 0.251, "grad_norm": 0.16198742389678955, "learning_rate": 0.0001018004, "loss": 0.254, "step": 245500 }, { "epoch": 0.2512, "grad_norm": 0.16579264402389526, "learning_rate": 0.0001017604, "loss": 0.258, "step": 245600 }, { "epoch": 0.2514, "grad_norm": 0.23009906709194183, "learning_rate": 0.00010172040000000001, "loss": 0.254, "step": 245700 }, { "epoch": 0.2516, "grad_norm": 0.15772415697574615, "learning_rate": 0.0001016804, "loss": 0.2532, "step": 245800 }, { "epoch": 0.2518, "grad_norm": 0.20444506406784058, "learning_rate": 0.00010164040000000001, "loss": 0.2524, "step": 245900 }, { "epoch": 0.252, "grad_norm": 0.19540543854236603, "learning_rate": 0.00010160039999999999, "loss": 0.2538, "step": 246000 }, { "epoch": 0.2522, "grad_norm": 0.17972715198993683, "learning_rate": 0.0001015604, "loss": 0.259, "step": 246100 }, { "epoch": 0.2524, "grad_norm": 0.17533235251903534, "learning_rate": 0.0001015204, "loss": 0.2564, "step": 246200 }, { "epoch": 0.2526, "grad_norm": 0.1778615266084671, "learning_rate": 0.00010148040000000001, "loss": 0.2579, "step": 246300 }, { "epoch": 0.2528, "grad_norm": 0.27196893095970154, "learning_rate": 0.00010144040000000002, "loss": 0.2631, "step": 246400 }, { "epoch": 0.253, "grad_norm": 0.20036780834197998, "learning_rate": 0.0001014004, "loss": 0.2619, "step": 246500 }, { "epoch": 0.2532, "grad_norm": 0.18973121047019958, "learning_rate": 0.0001013604, "loss": 0.2637, "step": 246600 }, { "epoch": 0.2534, "grad_norm": 0.17328648269176483, "learning_rate": 0.0001013204, "loss": 0.2516, "step": 246700 }, { "epoch": 0.2536, "grad_norm": 0.1752815842628479, "learning_rate": 0.00010128040000000001, "loss": 0.2572, "step": 246800 }, { "epoch": 0.2538, "grad_norm": 0.24637199938297272, "learning_rate": 0.00010124040000000001, "loss": 0.2543, "step": 246900 }, { "epoch": 0.254, "grad_norm": 0.19035212695598602, "learning_rate": 0.0001012004, "loss": 
0.2513, "step": 247000 }, { "epoch": 0.2542, "grad_norm": 0.14904499053955078, "learning_rate": 0.0001011604, "loss": 0.2579, "step": 247100 }, { "epoch": 0.2544, "grad_norm": 0.6997934579849243, "learning_rate": 0.00010112040000000001, "loss": 0.2545, "step": 247200 }, { "epoch": 0.2546, "grad_norm": 0.1693519502878189, "learning_rate": 0.0001010804, "loss": 0.2575, "step": 247300 }, { "epoch": 0.2548, "grad_norm": 0.1666778028011322, "learning_rate": 0.00010104040000000001, "loss": 0.2645, "step": 247400 }, { "epoch": 0.255, "grad_norm": 0.24935069680213928, "learning_rate": 0.00010100039999999999, "loss": 0.2571, "step": 247500 }, { "epoch": 0.2552, "grad_norm": 0.1595294177532196, "learning_rate": 0.0001009604, "loss": 0.2556, "step": 247600 }, { "epoch": 0.2554, "grad_norm": 0.1594139188528061, "learning_rate": 0.0001009204, "loss": 0.2612, "step": 247700 }, { "epoch": 0.2556, "grad_norm": 0.2499750703573227, "learning_rate": 0.00010088040000000001, "loss": 0.2679, "step": 247800 }, { "epoch": 0.2558, "grad_norm": 0.1567823588848114, "learning_rate": 0.00010084040000000002, "loss": 0.2573, "step": 247900 }, { "epoch": 0.256, "grad_norm": 0.17192710936069489, "learning_rate": 0.0001008004, "loss": 0.2543, "step": 248000 }, { "epoch": 0.2562, "grad_norm": 0.18040655553340912, "learning_rate": 0.0001007604, "loss": 0.2632, "step": 248100 }, { "epoch": 0.2564, "grad_norm": 0.29884371161460876, "learning_rate": 0.0001007204, "loss": 0.2545, "step": 248200 }, { "epoch": 0.2566, "grad_norm": 0.2246190458536148, "learning_rate": 0.0001006804, "loss": 0.2507, "step": 248300 }, { "epoch": 0.2568, "grad_norm": 0.21513308584690094, "learning_rate": 0.00010064040000000001, "loss": 0.2517, "step": 248400 }, { "epoch": 0.257, "grad_norm": 0.17849339544773102, "learning_rate": 0.00010060039999999999, "loss": 0.253, "step": 248500 }, { "epoch": 0.2572, "grad_norm": 0.2598421573638916, "learning_rate": 0.0001005604, "loss": 0.2546, "step": 248600 }, { "epoch": 0.2574, 
"grad_norm": 0.21260765194892883, "learning_rate": 0.00010052040000000001, "loss": 0.2523, "step": 248700 }, { "epoch": 0.2576, "grad_norm": 0.9292067289352417, "learning_rate": 0.00010048040000000001, "loss": 0.2514, "step": 248800 }, { "epoch": 0.2578, "grad_norm": 0.1865740567445755, "learning_rate": 0.00010044040000000001, "loss": 0.2583, "step": 248900 }, { "epoch": 0.258, "grad_norm": 0.22997444868087769, "learning_rate": 0.00010040039999999999, "loss": 0.2542, "step": 249000 }, { "epoch": 0.2582, "grad_norm": 0.23303471505641937, "learning_rate": 0.0001003604, "loss": 0.2538, "step": 249100 }, { "epoch": 0.2584, "grad_norm": 0.26385053992271423, "learning_rate": 0.0001003204, "loss": 0.254, "step": 249200 }, { "epoch": 0.2586, "grad_norm": 0.36450210213661194, "learning_rate": 0.00010028040000000001, "loss": 0.261, "step": 249300 }, { "epoch": 0.2588, "grad_norm": 0.20195017755031586, "learning_rate": 0.00010024040000000002, "loss": 0.2497, "step": 249400 }, { "epoch": 0.259, "grad_norm": 0.24559420347213745, "learning_rate": 0.0001002004, "loss": 0.2556, "step": 249500 }, { "epoch": 0.2592, "grad_norm": 0.42150649428367615, "learning_rate": 0.0001001604, "loss": 0.257, "step": 249600 }, { "epoch": 0.2594, "grad_norm": 0.19844648241996765, "learning_rate": 0.0001001204, "loss": 0.2574, "step": 249700 }, { "epoch": 0.2596, "grad_norm": 0.18794132769107819, "learning_rate": 0.0001000804, "loss": 0.2507, "step": 249800 }, { "epoch": 0.2598, "grad_norm": 0.17367449402809143, "learning_rate": 0.00010004040000000001, "loss": 0.2526, "step": 249900 }, { "epoch": 0.26, "grad_norm": 0.23622073233127594, "learning_rate": 0.00010000039999999999, "loss": 0.2523, "step": 250000 }, { "epoch": 0.2602, "grad_norm": 0.1616039127111435, "learning_rate": 9.996040000000001e-05, "loss": 0.2555, "step": 250100 }, { "epoch": 0.2604, "grad_norm": 0.1847952902317047, "learning_rate": 9.99204e-05, "loss": 0.2545, "step": 250200 }, { "epoch": 0.2606, "grad_norm": 0.2138368785381317, 
"learning_rate": 9.988040000000001e-05, "loss": 0.2485, "step": 250300 }, { "epoch": 0.2608, "grad_norm": 0.17640897631645203, "learning_rate": 9.984040000000001e-05, "loss": 0.2585, "step": 250400 }, { "epoch": 0.261, "grad_norm": 0.20163944363594055, "learning_rate": 9.98004e-05, "loss": 0.2486, "step": 250500 }, { "epoch": 0.2612, "grad_norm": 0.23935359716415405, "learning_rate": 9.976040000000001e-05, "loss": 0.2603, "step": 250600 }, { "epoch": 0.2614, "grad_norm": 0.14974956214427948, "learning_rate": 9.97204e-05, "loss": 0.2557, "step": 250700 }, { "epoch": 0.2616, "grad_norm": 0.17297634482383728, "learning_rate": 9.968040000000001e-05, "loss": 0.2562, "step": 250800 }, { "epoch": 0.2618, "grad_norm": 0.31046923995018005, "learning_rate": 9.96404e-05, "loss": 0.2569, "step": 250900 }, { "epoch": 0.262, "grad_norm": 0.1529819816350937, "learning_rate": 9.960040000000001e-05, "loss": 0.2559, "step": 251000 }, { "epoch": 0.2622, "grad_norm": 0.22081358730793, "learning_rate": 9.95604e-05, "loss": 0.2595, "step": 251100 }, { "epoch": 0.2624, "grad_norm": 0.2596571445465088, "learning_rate": 9.95204e-05, "loss": 0.2621, "step": 251200 }, { "epoch": 0.2626, "grad_norm": 0.20315712690353394, "learning_rate": 9.94804e-05, "loss": 0.2556, "step": 251300 }, { "epoch": 0.2628, "grad_norm": 0.16273678839206696, "learning_rate": 9.94404e-05, "loss": 0.2538, "step": 251400 }, { "epoch": 0.263, "grad_norm": 0.3179258108139038, "learning_rate": 9.94004e-05, "loss": 0.2547, "step": 251500 }, { "epoch": 0.2632, "grad_norm": 0.18335247039794922, "learning_rate": 9.936040000000001e-05, "loss": 0.275, "step": 251600 }, { "epoch": 0.2634, "grad_norm": 0.19399204850196838, "learning_rate": 9.93204e-05, "loss": 0.253, "step": 251700 }, { "epoch": 0.2636, "grad_norm": 0.16826485097408295, "learning_rate": 9.928040000000001e-05, "loss": 0.2511, "step": 251800 }, { "epoch": 0.2638, "grad_norm": 0.16970914602279663, "learning_rate": 9.92404e-05, "loss": 0.2485, "step": 251900 }, { 
"epoch": 0.264, "grad_norm": 0.19796758890151978, "learning_rate": 9.92004e-05, "loss": 0.25, "step": 252000 }, { "epoch": 0.2642, "grad_norm": 0.23124122619628906, "learning_rate": 9.916040000000001e-05, "loss": 0.2517, "step": 252100 }, { "epoch": 0.2644, "grad_norm": 0.18028624355793, "learning_rate": 9.91204e-05, "loss": 0.2549, "step": 252200 }, { "epoch": 0.2646, "grad_norm": 0.24230654537677765, "learning_rate": 9.908040000000001e-05, "loss": 0.2511, "step": 252300 }, { "epoch": 0.2648, "grad_norm": 0.16083365678787231, "learning_rate": 9.90404e-05, "loss": 0.2491, "step": 252400 }, { "epoch": 0.265, "grad_norm": 0.3353089392185211, "learning_rate": 9.900040000000001e-05, "loss": 0.2586, "step": 252500 }, { "epoch": 0.2652, "grad_norm": 0.29795363545417786, "learning_rate": 9.89604e-05, "loss": 0.2581, "step": 252600 }, { "epoch": 0.2654, "grad_norm": 0.18846677243709564, "learning_rate": 9.89204e-05, "loss": 0.2596, "step": 252700 }, { "epoch": 0.2656, "grad_norm": 0.16122601926326752, "learning_rate": 9.88804e-05, "loss": 0.2517, "step": 252800 }, { "epoch": 0.2658, "grad_norm": 0.19372184574604034, "learning_rate": 9.88404e-05, "loss": 0.2474, "step": 252900 }, { "epoch": 0.266, "grad_norm": 0.1936195194721222, "learning_rate": 9.88004e-05, "loss": 0.2546, "step": 253000 }, { "epoch": 0.2662, "grad_norm": 0.2815191149711609, "learning_rate": 9.876040000000001e-05, "loss": 0.2507, "step": 253100 }, { "epoch": 0.2664, "grad_norm": 0.16391977667808533, "learning_rate": 9.87204e-05, "loss": 0.2538, "step": 253200 }, { "epoch": 0.2666, "grad_norm": 0.17948247492313385, "learning_rate": 9.868040000000001e-05, "loss": 0.253, "step": 253300 }, { "epoch": 0.2668, "grad_norm": 0.18989241123199463, "learning_rate": 9.86404e-05, "loss": 0.2547, "step": 253400 }, { "epoch": 0.267, "grad_norm": 0.20617620646953583, "learning_rate": 9.86004e-05, "loss": 0.2549, "step": 253500 }, { "epoch": 0.2672, "grad_norm": 0.2014150768518448, "learning_rate": 9.85604e-05, "loss": 
0.2588, "step": 253600 }, { "epoch": 0.2674, "grad_norm": 0.22781629860401154, "learning_rate": 9.85204e-05, "loss": 0.2625, "step": 253700 }, { "epoch": 0.2676, "grad_norm": 0.179660364985466, "learning_rate": 9.84804e-05, "loss": 0.2547, "step": 253800 }, { "epoch": 0.2678, "grad_norm": 0.18104757368564606, "learning_rate": 9.84404e-05, "loss": 0.2569, "step": 253900 }, { "epoch": 0.268, "grad_norm": 0.42554691433906555, "learning_rate": 9.840040000000001e-05, "loss": 0.2606, "step": 254000 }, { "epoch": 0.2682, "grad_norm": 0.17360180616378784, "learning_rate": 9.836040000000001e-05, "loss": 0.2583, "step": 254100 }, { "epoch": 0.2684, "grad_norm": 0.21442458033561707, "learning_rate": 9.83204e-05, "loss": 0.2525, "step": 254200 }, { "epoch": 0.2686, "grad_norm": 0.17544832825660706, "learning_rate": 9.82804e-05, "loss": 0.2512, "step": 254300 }, { "epoch": 0.2688, "grad_norm": 0.18496903777122498, "learning_rate": 9.824040000000001e-05, "loss": 0.2501, "step": 254400 }, { "epoch": 0.269, "grad_norm": 0.20425285398960114, "learning_rate": 9.82004e-05, "loss": 0.2529, "step": 254500 }, { "epoch": 0.2692, "grad_norm": 0.16543030738830566, "learning_rate": 9.816040000000001e-05, "loss": 0.2572, "step": 254600 }, { "epoch": 0.2694, "grad_norm": 0.18236897885799408, "learning_rate": 9.81204e-05, "loss": 0.2546, "step": 254700 }, { "epoch": 0.2696, "grad_norm": 0.35450029373168945, "learning_rate": 9.808040000000001e-05, "loss": 0.2519, "step": 254800 }, { "epoch": 0.2698, "grad_norm": 0.1963033676147461, "learning_rate": 9.80404e-05, "loss": 0.274, "step": 254900 }, { "epoch": 0.27, "grad_norm": 0.24699929356575012, "learning_rate": 9.80004e-05, "loss": 0.2546, "step": 255000 }, { "epoch": 0.2702, "grad_norm": 0.2196625918149948, "learning_rate": 9.79604e-05, "loss": 0.2565, "step": 255100 }, { "epoch": 0.2704, "grad_norm": 0.2058703750371933, "learning_rate": 9.79204e-05, "loss": 0.2576, "step": 255200 }, { "epoch": 0.2706, "grad_norm": 0.16371659934520721, 
"learning_rate": 9.78804e-05, "loss": 0.2534, "step": 255300 }, { "epoch": 0.2708, "grad_norm": 0.249703511595726, "learning_rate": 9.784040000000001e-05, "loss": 0.251, "step": 255400 }, { "epoch": 0.271, "grad_norm": 0.18516069650650024, "learning_rate": 9.78004e-05, "loss": 0.2562, "step": 255500 }, { "epoch": 0.2712, "grad_norm": 0.17773735523223877, "learning_rate": 9.776040000000001e-05, "loss": 0.2583, "step": 255600 }, { "epoch": 0.2714, "grad_norm": 0.18748611211776733, "learning_rate": 9.772040000000001e-05, "loss": 0.2588, "step": 255700 }, { "epoch": 0.2716, "grad_norm": 0.16686369478702545, "learning_rate": 9.76804e-05, "loss": 0.2603, "step": 255800 }, { "epoch": 0.2718, "grad_norm": 0.17523817718029022, "learning_rate": 9.764040000000001e-05, "loss": 0.2466, "step": 255900 }, { "epoch": 0.272, "grad_norm": 0.2433302402496338, "learning_rate": 9.76004e-05, "loss": 0.2528, "step": 256000 }, { "epoch": 0.2722, "grad_norm": 0.14348742365837097, "learning_rate": 9.756040000000001e-05, "loss": 0.2494, "step": 256100 }, { "epoch": 0.2724, "grad_norm": 0.2618894577026367, "learning_rate": 9.75204e-05, "loss": 0.2606, "step": 256200 }, { "epoch": 0.2726, "grad_norm": 0.2586542069911957, "learning_rate": 9.748040000000001e-05, "loss": 0.2476, "step": 256300 }, { "epoch": 0.2728, "grad_norm": 0.17183682322502136, "learning_rate": 9.74404e-05, "loss": 0.253, "step": 256400 }, { "epoch": 0.273, "grad_norm": 0.15170541405677795, "learning_rate": 9.74004e-05, "loss": 0.253, "step": 256500 }, { "epoch": 0.2732, "grad_norm": 0.20044559240341187, "learning_rate": 9.73604e-05, "loss": 0.2594, "step": 256600 }, { "epoch": 0.2734, "grad_norm": 0.1726810783147812, "learning_rate": 9.73204e-05, "loss": 0.2544, "step": 256700 }, { "epoch": 0.2736, "grad_norm": 0.17345348000526428, "learning_rate": 9.72804e-05, "loss": 0.2543, "step": 256800 }, { "epoch": 0.2738, "grad_norm": 0.1733126938343048, "learning_rate": 9.724040000000001e-05, "loss": 0.2516, "step": 256900 }, { 
"epoch": 0.274, "grad_norm": 0.2917121946811676, "learning_rate": 9.72004e-05, "loss": 0.2605, "step": 257000 }, { "epoch": 0.2742, "grad_norm": 0.1882348358631134, "learning_rate": 9.716040000000001e-05, "loss": 0.2584, "step": 257100 }, { "epoch": 0.2744, "grad_norm": 0.1900627166032791, "learning_rate": 9.71204e-05, "loss": 0.2614, "step": 257200 }, { "epoch": 0.2746, "grad_norm": 0.21451936662197113, "learning_rate": 9.70804e-05, "loss": 0.255, "step": 257300 }, { "epoch": 0.2748, "grad_norm": 0.24926543235778809, "learning_rate": 9.704040000000001e-05, "loss": 0.2502, "step": 257400 }, { "epoch": 0.275, "grad_norm": 0.19679979979991913, "learning_rate": 9.70004e-05, "loss": 0.2504, "step": 257500 }, { "epoch": 0.2752, "grad_norm": 0.27472445368766785, "learning_rate": 9.696040000000001e-05, "loss": 0.2499, "step": 257600 }, { "epoch": 0.2754, "grad_norm": 0.1651807725429535, "learning_rate": 9.69204e-05, "loss": 0.2574, "step": 257700 }, { "epoch": 0.2756, "grad_norm": 0.2149466574192047, "learning_rate": 9.688040000000001e-05, "loss": 0.2466, "step": 257800 }, { "epoch": 0.2758, "grad_norm": 0.21232870221138, "learning_rate": 9.684040000000002e-05, "loss": 0.2539, "step": 257900 }, { "epoch": 0.276, "grad_norm": 0.20928317308425903, "learning_rate": 9.68004e-05, "loss": 0.2496, "step": 258000 }, { "epoch": 0.2762, "grad_norm": 0.2095847725868225, "learning_rate": 9.67604e-05, "loss": 0.2552, "step": 258100 }, { "epoch": 0.2764, "grad_norm": 0.1795085370540619, "learning_rate": 9.67204e-05, "loss": 0.2495, "step": 258200 }, { "epoch": 0.2766, "grad_norm": 0.17618554830551147, "learning_rate": 9.66804e-05, "loss": 0.252, "step": 258300 }, { "epoch": 0.2768, "grad_norm": 0.16858777403831482, "learning_rate": 9.664040000000001e-05, "loss": 0.2585, "step": 258400 }, { "epoch": 0.277, "grad_norm": 0.21656472980976105, "learning_rate": 9.66004e-05, "loss": 0.2498, "step": 258500 }, { "epoch": 0.2772, "grad_norm": 0.18165180087089539, "learning_rate": 
9.656040000000001e-05, "loss": 0.2529, "step": 258600 }, { "epoch": 0.2774, "grad_norm": 0.2290857881307602, "learning_rate": 9.65204e-05, "loss": 0.25, "step": 258700 }, { "epoch": 0.2776, "grad_norm": 0.1842830330133438, "learning_rate": 9.64804e-05, "loss": 0.2526, "step": 258800 }, { "epoch": 0.2778, "grad_norm": 0.16289664804935455, "learning_rate": 9.64404e-05, "loss": 0.2546, "step": 258900 }, { "epoch": 0.278, "grad_norm": 0.22200186550617218, "learning_rate": 9.64004e-05, "loss": 0.2487, "step": 259000 }, { "epoch": 0.2782, "grad_norm": 0.15263180434703827, "learning_rate": 9.63604e-05, "loss": 0.2532, "step": 259100 }, { "epoch": 0.2784, "grad_norm": 0.19079795479774475, "learning_rate": 9.63204e-05, "loss": 0.2463, "step": 259200 }, { "epoch": 0.2786, "grad_norm": 0.21255415678024292, "learning_rate": 9.628040000000001e-05, "loss": 0.2485, "step": 259300 }, { "epoch": 0.2788, "grad_norm": 0.15365895628929138, "learning_rate": 9.624040000000001e-05, "loss": 0.2472, "step": 259400 }, { "epoch": 0.279, "grad_norm": 0.1517648547887802, "learning_rate": 9.620040000000001e-05, "loss": 0.2538, "step": 259500 }, { "epoch": 0.2792, "grad_norm": 0.2428562343120575, "learning_rate": 9.61604e-05, "loss": 0.2455, "step": 259600 }, { "epoch": 0.2794, "grad_norm": 0.15415999293327332, "learning_rate": 9.61204e-05, "loss": 0.2472, "step": 259700 }, { "epoch": 0.2796, "grad_norm": 0.19108308851718903, "learning_rate": 9.60804e-05, "loss": 0.2482, "step": 259800 }, { "epoch": 0.2798, "grad_norm": 0.1723984181880951, "learning_rate": 9.604040000000001e-05, "loss": 0.2502, "step": 259900 }, { "epoch": 0.28, "grad_norm": 0.24995626509189606, "learning_rate": 9.60004e-05, "loss": 0.2578, "step": 260000 }, { "epoch": 0.2802, "grad_norm": 0.17275862395763397, "learning_rate": 9.596040000000001e-05, "loss": 0.2583, "step": 260100 }, { "epoch": 0.2804, "grad_norm": 0.20496828854084015, "learning_rate": 9.59204e-05, "loss": 0.2502, "step": 260200 }, { "epoch": 0.2806, "grad_norm": 
0.15254418551921844, "learning_rate": 9.58804e-05, "loss": 0.2453, "step": 260300 }, { "epoch": 0.2808, "grad_norm": 0.22347435355186462, "learning_rate": 9.58404e-05, "loss": 0.2655, "step": 260400 }, { "epoch": 0.281, "grad_norm": 0.27912843227386475, "learning_rate": 9.58004e-05, "loss": 0.2595, "step": 260500 }, { "epoch": 0.2812, "grad_norm": 0.17391923069953918, "learning_rate": 9.57604e-05, "loss": 0.2441, "step": 260600 }, { "epoch": 0.2814, "grad_norm": 0.20242680609226227, "learning_rate": 9.572040000000001e-05, "loss": 0.2513, "step": 260700 }, { "epoch": 0.2816, "grad_norm": 0.19099730253219604, "learning_rate": 9.56804e-05, "loss": 0.2474, "step": 260800 }, { "epoch": 0.2818, "grad_norm": 0.17173656821250916, "learning_rate": 9.564040000000001e-05, "loss": 0.2468, "step": 260900 }, { "epoch": 0.282, "grad_norm": 0.2755275368690491, "learning_rate": 9.560040000000001e-05, "loss": 0.2579, "step": 261000 }, { "epoch": 0.2822, "grad_norm": 0.1447450965642929, "learning_rate": 9.55604e-05, "loss": 0.2549, "step": 261100 }, { "epoch": 0.2824, "grad_norm": 0.17367517948150635, "learning_rate": 9.552040000000001e-05, "loss": 0.2493, "step": 261200 }, { "epoch": 0.2826, "grad_norm": 0.19174928963184357, "learning_rate": 9.54804e-05, "loss": 0.2518, "step": 261300 }, { "epoch": 0.2828, "grad_norm": 0.20758305490016937, "learning_rate": 9.544040000000001e-05, "loss": 0.2522, "step": 261400 }, { "epoch": 0.283, "grad_norm": 0.172040194272995, "learning_rate": 9.54004e-05, "loss": 0.2527, "step": 261500 }, { "epoch": 0.2832, "grad_norm": 0.19083595275878906, "learning_rate": 9.536040000000001e-05, "loss": 0.2439, "step": 261600 }, { "epoch": 0.2834, "grad_norm": 0.161021426320076, "learning_rate": 9.53204e-05, "loss": 0.2529, "step": 261700 }, { "epoch": 0.2836, "grad_norm": 0.1945277452468872, "learning_rate": 9.52804e-05, "loss": 0.2508, "step": 261800 }, { "epoch": 0.2838, "grad_norm": 0.18637241423130035, "learning_rate": 9.52404e-05, "loss": 0.2504, "step": 
261900 }, { "epoch": 0.284, "grad_norm": 0.6245195865631104, "learning_rate": 9.52004e-05, "loss": 0.2512, "step": 262000 }, { "epoch": 0.2842, "grad_norm": 0.26768702268600464, "learning_rate": 9.51604e-05, "loss": 0.2504, "step": 262100 }, { "epoch": 0.2844, "grad_norm": 0.19558322429656982, "learning_rate": 9.512040000000001e-05, "loss": 0.2543, "step": 262200 }, { "epoch": 0.2846, "grad_norm": 0.19235068559646606, "learning_rate": 9.50804e-05, "loss": 0.2491, "step": 262300 }, { "epoch": 0.2848, "grad_norm": 0.19615627825260162, "learning_rate": 9.504040000000001e-05, "loss": 0.2486, "step": 262400 }, { "epoch": 0.285, "grad_norm": 0.4205188751220703, "learning_rate": 9.50004e-05, "loss": 0.2507, "step": 262500 }, { "epoch": 0.2852, "grad_norm": 0.2430608868598938, "learning_rate": 9.49604e-05, "loss": 0.2509, "step": 262600 }, { "epoch": 0.2854, "grad_norm": 0.17347165942192078, "learning_rate": 9.492040000000001e-05, "loss": 0.2495, "step": 262700 }, { "epoch": 0.2856, "grad_norm": 0.256759375333786, "learning_rate": 9.48804e-05, "loss": 0.2503, "step": 262800 }, { "epoch": 0.2858, "grad_norm": 0.16245199739933014, "learning_rate": 9.484040000000001e-05, "loss": 0.2465, "step": 262900 }, { "epoch": 0.286, "grad_norm": 0.17991042137145996, "learning_rate": 9.48004e-05, "loss": 0.2503, "step": 263000 }, { "epoch": 0.2862, "grad_norm": 0.1637287586927414, "learning_rate": 9.476040000000001e-05, "loss": 0.2472, "step": 263100 }, { "epoch": 0.2864, "grad_norm": 0.2055928260087967, "learning_rate": 9.472040000000002e-05, "loss": 0.2495, "step": 263200 }, { "epoch": 0.2866, "grad_norm": 0.19248582422733307, "learning_rate": 9.468040000000001e-05, "loss": 0.2465, "step": 263300 }, { "epoch": 0.2868, "grad_norm": 0.19688032567501068, "learning_rate": 9.46404e-05, "loss": 0.2486, "step": 263400 }, { "epoch": 0.287, "grad_norm": 0.1787942796945572, "learning_rate": 9.46004e-05, "loss": 0.2483, "step": 263500 }, { "epoch": 0.2872, "grad_norm": 0.18415433168411255, 
"learning_rate": 9.45604e-05, "loss": 0.2566, "step": 263600 }, { "epoch": 0.2874, "grad_norm": 0.19202789664268494, "learning_rate": 9.452040000000001e-05, "loss": 0.2498, "step": 263700 }, { "epoch": 0.2876, "grad_norm": 0.27513137459754944, "learning_rate": 9.44804e-05, "loss": 0.2508, "step": 263800 }, { "epoch": 0.2878, "grad_norm": 0.17416657507419586, "learning_rate": 9.444040000000001e-05, "loss": 0.2501, "step": 263900 }, { "epoch": 0.288, "grad_norm": 0.15521082282066345, "learning_rate": 9.44004e-05, "loss": 0.2462, "step": 264000 }, { "epoch": 0.2882, "grad_norm": 0.17850640416145325, "learning_rate": 9.43604e-05, "loss": 0.2577, "step": 264100 }, { "epoch": 0.2884, "grad_norm": 0.28766483068466187, "learning_rate": 9.43204e-05, "loss": 0.2539, "step": 264200 }, { "epoch": 0.2886, "grad_norm": 0.1784902662038803, "learning_rate": 9.42804e-05, "loss": 0.2531, "step": 264300 }, { "epoch": 0.2888, "grad_norm": 0.19980043172836304, "learning_rate": 9.42404e-05, "loss": 0.2467, "step": 264400 }, { "epoch": 0.289, "grad_norm": 1.2848162651062012, "learning_rate": 9.42004e-05, "loss": 0.2493, "step": 264500 }, { "epoch": 0.2892, "grad_norm": 0.20343329012393951, "learning_rate": 9.416040000000001e-05, "loss": 0.248, "step": 264600 }, { "epoch": 0.2894, "grad_norm": 0.22436699271202087, "learning_rate": 9.412040000000001e-05, "loss": 0.2492, "step": 264700 }, { "epoch": 0.2896, "grad_norm": 0.19231826066970825, "learning_rate": 9.408040000000001e-05, "loss": 0.2457, "step": 264800 }, { "epoch": 0.2898, "grad_norm": 0.20982171595096588, "learning_rate": 9.40404e-05, "loss": 0.2461, "step": 264900 }, { "epoch": 0.29, "grad_norm": 0.25590869784355164, "learning_rate": 9.40004e-05, "loss": 0.2511, "step": 265000 }, { "epoch": 0.2902, "grad_norm": 0.19312362372875214, "learning_rate": 9.39604e-05, "loss": 0.2466, "step": 265100 }, { "epoch": 0.2904, "grad_norm": 0.23638297617435455, "learning_rate": 9.392040000000001e-05, "loss": 0.2466, "step": 265200 }, { "epoch": 
0.2906, "grad_norm": 0.16785204410552979, "learning_rate": 9.38804e-05, "loss": 0.2469, "step": 265300 }, { "epoch": 0.2908, "grad_norm": 0.2154064029455185, "learning_rate": 9.384040000000001e-05, "loss": 0.2706, "step": 265400 }, { "epoch": 0.291, "grad_norm": 0.18391206860542297, "learning_rate": 9.38004e-05, "loss": 0.243, "step": 265500 }, { "epoch": 0.2912, "grad_norm": 0.20186170935630798, "learning_rate": 9.37604e-05, "loss": 0.2555, "step": 265600 }, { "epoch": 0.2914, "grad_norm": 0.19941025972366333, "learning_rate": 9.37204e-05, "loss": 0.248, "step": 265700 }, { "epoch": 0.2916, "grad_norm": 0.18308115005493164, "learning_rate": 9.36804e-05, "loss": 0.2464, "step": 265800 }, { "epoch": 0.2918, "grad_norm": 0.24809174239635468, "learning_rate": 9.36404e-05, "loss": 0.2536, "step": 265900 }, { "epoch": 0.292, "grad_norm": 0.2352696657180786, "learning_rate": 9.36004e-05, "loss": 0.2515, "step": 266000 }, { "epoch": 0.2922, "grad_norm": 0.20043088495731354, "learning_rate": 9.35604e-05, "loss": 0.249, "step": 266100 }, { "epoch": 0.2924, "grad_norm": 0.33282342553138733, "learning_rate": 9.352040000000001e-05, "loss": 0.2588, "step": 266200 }, { "epoch": 0.2926, "grad_norm": 0.1646975576877594, "learning_rate": 9.348040000000001e-05, "loss": 0.2571, "step": 266300 }, { "epoch": 0.2928, "grad_norm": 0.2019079327583313, "learning_rate": 9.34404e-05, "loss": 0.2501, "step": 266400 }, { "epoch": 0.293, "grad_norm": 0.3303513526916504, "learning_rate": 9.34004e-05, "loss": 0.2547, "step": 266500 }, { "epoch": 0.2932, "grad_norm": 0.1550179123878479, "learning_rate": 9.33604e-05, "loss": 0.2554, "step": 266600 }, { "epoch": 0.2934, "grad_norm": 0.3752208948135376, "learning_rate": 9.332040000000001e-05, "loss": 0.2493, "step": 266700 }, { "epoch": 0.2936, "grad_norm": 0.18411359190940857, "learning_rate": 9.32804e-05, "loss": 0.2569, "step": 266800 }, { "epoch": 0.2938, "grad_norm": 0.17852115631103516, "learning_rate": 9.324040000000001e-05, "loss": 0.2479, 
"step": 266900 }, { "epoch": 0.294, "grad_norm": 0.17118707299232483, "learning_rate": 9.320040000000002e-05, "loss": 0.254, "step": 267000 }, { "epoch": 0.2942, "grad_norm": 0.35995370149612427, "learning_rate": 9.31604e-05, "loss": 0.2512, "step": 267100 }, { "epoch": 0.2944, "grad_norm": 0.27355167269706726, "learning_rate": 9.31204e-05, "loss": 0.2549, "step": 267200 }, { "epoch": 0.2946, "grad_norm": 0.1623479276895523, "learning_rate": 9.30804e-05, "loss": 0.2522, "step": 267300 }, { "epoch": 0.2948, "grad_norm": 0.26281481981277466, "learning_rate": 9.30404e-05, "loss": 0.2505, "step": 267400 }, { "epoch": 0.295, "grad_norm": 0.17927655577659607, "learning_rate": 9.300040000000001e-05, "loss": 0.2497, "step": 267500 }, { "epoch": 0.2952, "grad_norm": 0.1781659871339798, "learning_rate": 9.29604e-05, "loss": 0.2497, "step": 267600 }, { "epoch": 0.2954, "grad_norm": 0.16235920786857605, "learning_rate": 9.292040000000001e-05, "loss": 0.2567, "step": 267700 }, { "epoch": 0.2956, "grad_norm": 0.1638641655445099, "learning_rate": 9.28804e-05, "loss": 0.2469, "step": 267800 }, { "epoch": 0.2958, "grad_norm": 0.2075788974761963, "learning_rate": 9.28404e-05, "loss": 0.2482, "step": 267900 }, { "epoch": 0.296, "grad_norm": 0.1806693822145462, "learning_rate": 9.280040000000001e-05, "loss": 0.2516, "step": 268000 }, { "epoch": 0.2962, "grad_norm": 0.22953234612941742, "learning_rate": 9.27604e-05, "loss": 0.2631, "step": 268100 }, { "epoch": 0.2964, "grad_norm": 0.22682808339595795, "learning_rate": 9.272040000000001e-05, "loss": 0.2532, "step": 268200 }, { "epoch": 0.2966, "grad_norm": 0.3644724488258362, "learning_rate": 9.26804e-05, "loss": 0.2521, "step": 268300 }, { "epoch": 0.2968, "grad_norm": 0.18433807790279388, "learning_rate": 9.264040000000001e-05, "loss": 0.2578, "step": 268400 }, { "epoch": 0.297, "grad_norm": 0.1955367773771286, "learning_rate": 9.260040000000002e-05, "loss": 0.2493, "step": 268500 }, { "epoch": 0.2972, "grad_norm": 0.2631000578403473, 
"learning_rate": 9.256040000000001e-05, "loss": 0.2487, "step": 268600 }, { "epoch": 0.2974, "grad_norm": 0.24312226474285126, "learning_rate": 9.25204e-05, "loss": 0.2501, "step": 268700 }, { "epoch": 0.2976, "grad_norm": 0.19950313866138458, "learning_rate": 9.24804e-05, "loss": 0.2504, "step": 268800 }, { "epoch": 0.2978, "grad_norm": 0.21298249065876007, "learning_rate": 9.24404e-05, "loss": 0.2492, "step": 268900 }, { "epoch": 0.298, "grad_norm": 0.27059680223464966, "learning_rate": 9.240040000000001e-05, "loss": 0.2585, "step": 269000 }, { "epoch": 0.2982, "grad_norm": 0.17913617193698883, "learning_rate": 9.23604e-05, "loss": 0.2426, "step": 269100 }, { "epoch": 0.2984, "grad_norm": 0.19647064805030823, "learning_rate": 9.232040000000001e-05, "loss": 0.2485, "step": 269200 }, { "epoch": 0.2986, "grad_norm": 0.14924323558807373, "learning_rate": 9.22804e-05, "loss": 0.2514, "step": 269300 }, { "epoch": 0.2988, "grad_norm": 0.18103660643100739, "learning_rate": 9.22404e-05, "loss": 0.253, "step": 269400 }, { "epoch": 0.299, "grad_norm": 0.16981446743011475, "learning_rate": 9.22004e-05, "loss": 0.2506, "step": 269500 }, { "epoch": 0.2992, "grad_norm": 0.17313528060913086, "learning_rate": 9.21604e-05, "loss": 0.255, "step": 269600 }, { "epoch": 0.2994, "grad_norm": 0.19472962617874146, "learning_rate": 9.21204e-05, "loss": 0.2462, "step": 269700 }, { "epoch": 0.2996, "grad_norm": 0.20072679221630096, "learning_rate": 9.20804e-05, "loss": 0.2499, "step": 269800 }, { "epoch": 0.2998, "grad_norm": 0.20446142554283142, "learning_rate": 9.204040000000001e-05, "loss": 0.2484, "step": 269900 }, { "epoch": 0.3, "grad_norm": 0.23795530200004578, "learning_rate": 9.200040000000001e-05, "loss": 0.2488, "step": 270000 }, { "epoch": 0.3002, "grad_norm": 0.5230748057365417, "learning_rate": 9.196040000000001e-05, "loss": 0.2517, "step": 270100 }, { "epoch": 0.3004, "grad_norm": 0.22013844549655914, "learning_rate": 9.19204e-05, "loss": 0.2482, "step": 270200 }, { "epoch": 
0.3006, "grad_norm": 0.20569878816604614, "learning_rate": 9.18804e-05, "loss": 0.2592, "step": 270300 }, { "epoch": 0.3008, "grad_norm": 0.16592691838741302, "learning_rate": 9.18404e-05, "loss": 0.2539, "step": 270400 }, { "epoch": 0.301, "grad_norm": 0.20824261009693146, "learning_rate": 9.180040000000001e-05, "loss": 0.2523, "step": 270500 }, { "epoch": 0.3012, "grad_norm": 0.21853414177894592, "learning_rate": 9.17604e-05, "loss": 0.2476, "step": 270600 }, { "epoch": 0.3014, "grad_norm": 0.15546807646751404, "learning_rate": 9.172040000000001e-05, "loss": 0.2489, "step": 270700 }, { "epoch": 0.3016, "grad_norm": 0.19030147790908813, "learning_rate": 9.16804e-05, "loss": 0.2444, "step": 270800 }, { "epoch": 0.3018, "grad_norm": 0.20716990530490875, "learning_rate": 9.16404e-05, "loss": 0.2479, "step": 270900 }, { "epoch": 0.302, "grad_norm": 0.1584189087152481, "learning_rate": 9.16004e-05, "loss": 0.2461, "step": 271000 }, { "epoch": 0.3022, "grad_norm": 0.15803031623363495, "learning_rate": 9.15604e-05, "loss": 0.2492, "step": 271100 }, { "epoch": 0.3024, "grad_norm": 0.18134865164756775, "learning_rate": 9.15204e-05, "loss": 0.2458, "step": 271200 }, { "epoch": 0.3026, "grad_norm": 0.17175163328647614, "learning_rate": 9.14804e-05, "loss": 0.2507, "step": 271300 }, { "epoch": 0.3028, "grad_norm": 0.13778148591518402, "learning_rate": 9.14404e-05, "loss": 0.2517, "step": 271400 }, { "epoch": 0.303, "grad_norm": 0.18867234885692596, "learning_rate": 9.140040000000001e-05, "loss": 0.2477, "step": 271500 }, { "epoch": 0.3032, "grad_norm": 0.18188384175300598, "learning_rate": 9.136040000000001e-05, "loss": 0.2564, "step": 271600 }, { "epoch": 0.3034, "grad_norm": 0.18826958537101746, "learning_rate": 9.13204e-05, "loss": 0.2468, "step": 271700 }, { "epoch": 0.3036, "grad_norm": 0.18992917239665985, "learning_rate": 9.12804e-05, "loss": 0.2521, "step": 271800 }, { "epoch": 0.3038, "grad_norm": 0.2327507585287094, "learning_rate": 9.12404e-05, "loss": 0.2514, 
"step": 271900 }, { "epoch": 0.304, "grad_norm": 0.24102598428726196, "learning_rate": 9.120040000000001e-05, "loss": 0.2539, "step": 272000 }, { "epoch": 0.3042, "grad_norm": 0.16849716007709503, "learning_rate": 9.11604e-05, "loss": 0.2486, "step": 272100 }, { "epoch": 0.3044, "grad_norm": 0.34158793091773987, "learning_rate": 9.112040000000001e-05, "loss": 0.2524, "step": 272200 }, { "epoch": 0.3046, "grad_norm": 0.29653236269950867, "learning_rate": 9.10804e-05, "loss": 0.259, "step": 272300 }, { "epoch": 0.3048, "grad_norm": 0.19480286538600922, "learning_rate": 9.104040000000001e-05, "loss": 0.2505, "step": 272400 }, { "epoch": 0.305, "grad_norm": 0.19145742058753967, "learning_rate": 9.10004e-05, "loss": 0.2539, "step": 272500 }, { "epoch": 0.3052, "grad_norm": 0.5713980793952942, "learning_rate": 9.09604e-05, "loss": 0.2558, "step": 272600 }, { "epoch": 0.3054, "grad_norm": 0.2311251163482666, "learning_rate": 9.09204e-05, "loss": 0.2525, "step": 272700 }, { "epoch": 0.3056, "grad_norm": 0.17040377855300903, "learning_rate": 9.08804e-05, "loss": 0.2611, "step": 272800 }, { "epoch": 0.3058, "grad_norm": 0.16640836000442505, "learning_rate": 9.08404e-05, "loss": 0.2485, "step": 272900 }, { "epoch": 0.306, "grad_norm": 0.18391956388950348, "learning_rate": 9.080040000000001e-05, "loss": 0.2515, "step": 273000 }, { "epoch": 0.3062, "grad_norm": 0.20270375907421112, "learning_rate": 9.07604e-05, "loss": 0.2495, "step": 273100 }, { "epoch": 0.3064, "grad_norm": 0.1664918065071106, "learning_rate": 9.07204e-05, "loss": 0.253, "step": 273200 }, { "epoch": 0.3066, "grad_norm": 0.2487817406654358, "learning_rate": 9.068040000000001e-05, "loss": 0.2534, "step": 273300 }, { "epoch": 0.3068, "grad_norm": 0.19376716017723083, "learning_rate": 9.06404e-05, "loss": 0.2447, "step": 273400 }, { "epoch": 0.307, "grad_norm": 1.2538508176803589, "learning_rate": 9.060040000000001e-05, "loss": 0.2626, "step": 273500 }, { "epoch": 0.3072, "grad_norm": 0.18994612991809845, 
"learning_rate": 9.05604e-05, "loss": 0.269, "step": 273600 }, { "epoch": 0.3074, "grad_norm": 0.1983320116996765, "learning_rate": 9.052040000000001e-05, "loss": 0.2484, "step": 273700 }, { "epoch": 0.3076, "grad_norm": 0.17373573780059814, "learning_rate": 9.048040000000002e-05, "loss": 0.245, "step": 273800 }, { "epoch": 0.3078, "grad_norm": 0.2375088334083557, "learning_rate": 9.044040000000001e-05, "loss": 0.2467, "step": 273900 }, { "epoch": 0.308, "grad_norm": 0.16651317477226257, "learning_rate": 9.04004e-05, "loss": 0.2551, "step": 274000 }, { "epoch": 0.3082, "grad_norm": 0.27858221530914307, "learning_rate": 9.03604e-05, "loss": 0.2434, "step": 274100 }, { "epoch": 0.3084, "grad_norm": 0.18154625594615936, "learning_rate": 9.03204e-05, "loss": 0.2591, "step": 274200 }, { "epoch": 0.3086, "grad_norm": 0.2042841613292694, "learning_rate": 9.028040000000001e-05, "loss": 0.2435, "step": 274300 }, { "epoch": 0.3088, "grad_norm": 0.2147081345319748, "learning_rate": 9.02404e-05, "loss": 0.2484, "step": 274400 }, { "epoch": 0.309, "grad_norm": 0.1780577152967453, "learning_rate": 9.020040000000001e-05, "loss": 0.2476, "step": 274500 }, { "epoch": 0.3092, "grad_norm": 0.2056666761636734, "learning_rate": 9.01604e-05, "loss": 0.256, "step": 274600 }, { "epoch": 0.3094, "grad_norm": 0.27037736773490906, "learning_rate": 9.01204e-05, "loss": 0.2486, "step": 274700 }, { "epoch": 0.3096, "grad_norm": 0.17876963317394257, "learning_rate": 9.00804e-05, "loss": 0.253, "step": 274800 }, { "epoch": 0.3098, "grad_norm": 0.1852559596300125, "learning_rate": 9.00404e-05, "loss": 0.2535, "step": 274900 }, { "epoch": 0.31, "grad_norm": 0.2233329564332962, "learning_rate": 9.00004e-05, "loss": 0.2478, "step": 275000 }, { "epoch": 0.3102, "grad_norm": 0.2136661559343338, "learning_rate": 8.99604e-05, "loss": 0.2555, "step": 275100 }, { "epoch": 0.3104, "grad_norm": 0.2538791000843048, "learning_rate": 8.992040000000001e-05, "loss": 0.2453, "step": 275200 }, { "epoch": 0.3106, 
"grad_norm": 0.1559320092201233, "learning_rate": 8.988040000000001e-05, "loss": 0.2462, "step": 275300 }, { "epoch": 0.3108, "grad_norm": 0.21376411616802216, "learning_rate": 8.984040000000001e-05, "loss": 0.247, "step": 275400 }, { "epoch": 0.311, "grad_norm": 0.20133382081985474, "learning_rate": 8.98004e-05, "loss": 0.254, "step": 275500 }, { "epoch": 0.3112, "grad_norm": 0.17593859136104584, "learning_rate": 8.97604e-05, "loss": 0.2535, "step": 275600 }, { "epoch": 0.3114, "grad_norm": 0.20064404606819153, "learning_rate": 8.97204e-05, "loss": 0.2527, "step": 275700 }, { "epoch": 0.3116, "grad_norm": 0.17532220482826233, "learning_rate": 8.968040000000001e-05, "loss": 0.2533, "step": 275800 }, { "epoch": 0.3118, "grad_norm": 0.21646389365196228, "learning_rate": 8.96404e-05, "loss": 0.25, "step": 275900 }, { "epoch": 0.312, "grad_norm": 0.2135733962059021, "learning_rate": 8.960040000000001e-05, "loss": 0.2463, "step": 276000 }, { "epoch": 0.3122, "grad_norm": 0.24991312623023987, "learning_rate": 8.95604e-05, "loss": 0.2506, "step": 276100 }, { "epoch": 0.3124, "grad_norm": 0.20160618424415588, "learning_rate": 8.95204e-05, "loss": 0.2508, "step": 276200 }, { "epoch": 0.3126, "grad_norm": 0.3549211323261261, "learning_rate": 8.94804e-05, "loss": 0.2452, "step": 276300 }, { "epoch": 0.3128, "grad_norm": 0.18709681928157806, "learning_rate": 8.94404e-05, "loss": 0.2463, "step": 276400 }, { "epoch": 0.313, "grad_norm": 0.1972089558839798, "learning_rate": 8.94004e-05, "loss": 0.2434, "step": 276500 }, { "epoch": 0.3132, "grad_norm": 0.21796953678131104, "learning_rate": 8.93604e-05, "loss": 0.2577, "step": 276600 }, { "epoch": 0.3134, "grad_norm": 0.19307708740234375, "learning_rate": 8.93204e-05, "loss": 0.2485, "step": 276700 }, { "epoch": 0.3136, "grad_norm": 0.24761377274990082, "learning_rate": 8.928040000000001e-05, "loss": 0.2482, "step": 276800 }, { "epoch": 0.3138, "grad_norm": 0.1852557212114334, "learning_rate": 8.924040000000001e-05, "loss": 0.2549, 
"step": 276900 }, { "epoch": 0.314, "grad_norm": 0.18273688852787018, "learning_rate": 8.92004e-05, "loss": 0.2463, "step": 277000 }, { "epoch": 0.3142, "grad_norm": 0.25522246956825256, "learning_rate": 8.91604e-05, "loss": 0.2467, "step": 277100 }, { "epoch": 0.3144, "grad_norm": 0.1901063621044159, "learning_rate": 8.91204e-05, "loss": 0.2592, "step": 277200 }, { "epoch": 0.3146, "grad_norm": 0.21189343929290771, "learning_rate": 8.908040000000001e-05, "loss": 0.2549, "step": 277300 }, { "epoch": 0.3148, "grad_norm": 0.17540982365608215, "learning_rate": 8.90404e-05, "loss": 0.2534, "step": 277400 }, { "epoch": 0.315, "grad_norm": 0.1724601536989212, "learning_rate": 8.900040000000001e-05, "loss": 0.2529, "step": 277500 }, { "epoch": 0.3152, "grad_norm": 0.21971330046653748, "learning_rate": 8.89604e-05, "loss": 0.2542, "step": 277600 }, { "epoch": 0.3154, "grad_norm": 0.18612246215343475, "learning_rate": 8.892040000000001e-05, "loss": 0.249, "step": 277700 }, { "epoch": 0.3156, "grad_norm": 0.15609164535999298, "learning_rate": 8.88804e-05, "loss": 0.2486, "step": 277800 }, { "epoch": 0.3158, "grad_norm": 0.26058584451675415, "learning_rate": 8.88404e-05, "loss": 0.2489, "step": 277900 }, { "epoch": 0.316, "grad_norm": 0.2343665212392807, "learning_rate": 8.88004e-05, "loss": 0.2434, "step": 278000 }, { "epoch": 0.3162, "grad_norm": 0.26184457540512085, "learning_rate": 8.87604e-05, "loss": 0.2558, "step": 278100 }, { "epoch": 0.3164, "grad_norm": 0.23153603076934814, "learning_rate": 8.87204e-05, "loss": 0.2515, "step": 278200 }, { "epoch": 0.3166, "grad_norm": 0.16645345091819763, "learning_rate": 8.868040000000001e-05, "loss": 0.2506, "step": 278300 }, { "epoch": 0.3168, "grad_norm": 0.21093203127384186, "learning_rate": 8.86404e-05, "loss": 0.2501, "step": 278400 }, { "epoch": 0.317, "grad_norm": 0.2181226760149002, "learning_rate": 8.86004e-05, "loss": 0.2469, "step": 278500 }, { "epoch": 0.3172, "grad_norm": 0.27833208441734314, "learning_rate": 
8.85604e-05, "loss": 0.2493, "step": 278600 }, { "epoch": 0.3174, "grad_norm": 0.18451841175556183, "learning_rate": 8.85204e-05, "loss": 0.2476, "step": 278700 }, { "epoch": 0.3176, "grad_norm": 0.15855015814304352, "learning_rate": 8.848040000000001e-05, "loss": 0.2562, "step": 278800 }, { "epoch": 0.3178, "grad_norm": 0.19361542165279388, "learning_rate": 8.84404e-05, "loss": 0.2461, "step": 278900 }, { "epoch": 0.318, "grad_norm": 0.1782364845275879, "learning_rate": 8.840040000000001e-05, "loss": 0.2504, "step": 279000 }, { "epoch": 0.3182, "grad_norm": 0.1721198856830597, "learning_rate": 8.83604e-05, "loss": 0.2489, "step": 279100 }, { "epoch": 0.3184, "grad_norm": 0.17030316591262817, "learning_rate": 8.832040000000001e-05, "loss": 0.2485, "step": 279200 }, { "epoch": 0.3186, "grad_norm": 0.2248735874891281, "learning_rate": 8.82804e-05, "loss": 0.252, "step": 279300 }, { "epoch": 0.3188, "grad_norm": 0.18986405432224274, "learning_rate": 8.82404e-05, "loss": 0.2536, "step": 279400 }, { "epoch": 0.319, "grad_norm": 0.48810842633247375, "learning_rate": 8.82004e-05, "loss": 0.2584, "step": 279500 }, { "epoch": 0.3192, "grad_norm": 0.16459937393665314, "learning_rate": 8.816040000000001e-05, "loss": 0.2482, "step": 279600 }, { "epoch": 0.3194, "grad_norm": 0.17048123478889465, "learning_rate": 8.81204e-05, "loss": 0.2472, "step": 279700 }, { "epoch": 0.3196, "grad_norm": 0.20386305451393127, "learning_rate": 8.808040000000001e-05, "loss": 0.2453, "step": 279800 }, { "epoch": 0.3198, "grad_norm": 0.2072535902261734, "learning_rate": 8.80404e-05, "loss": 0.2536, "step": 279900 }, { "epoch": 0.32, "grad_norm": 0.21061697602272034, "learning_rate": 8.80004e-05, "loss": 0.25, "step": 280000 }, { "epoch": 0.3202, "grad_norm": 0.18452088534832, "learning_rate": 8.79604e-05, "loss": 0.251, "step": 280100 }, { "epoch": 0.3204, "grad_norm": 0.27870404720306396, "learning_rate": 8.79204e-05, "loss": 0.2523, "step": 280200 }, { "epoch": 0.3206, "grad_norm": 
0.20520621538162231, "learning_rate": 8.78804e-05, "loss": 0.2463, "step": 280300 }, { "epoch": 0.3208, "grad_norm": 0.23470567166805267, "learning_rate": 8.78404e-05, "loss": 0.2507, "step": 280400 }, { "epoch": 0.321, "grad_norm": 0.17464961111545563, "learning_rate": 8.780040000000001e-05, "loss": 0.2463, "step": 280500 }, { "epoch": 0.3212, "grad_norm": 0.2375008910894394, "learning_rate": 8.776040000000001e-05, "loss": 0.2457, "step": 280600 }, { "epoch": 0.3214, "grad_norm": 0.17004603147506714, "learning_rate": 8.772040000000001e-05, "loss": 0.2511, "step": 280700 }, { "epoch": 0.3216, "grad_norm": 0.3254378139972687, "learning_rate": 8.76804e-05, "loss": 0.2513, "step": 280800 }, { "epoch": 0.3218, "grad_norm": 0.1874452382326126, "learning_rate": 8.76404e-05, "loss": 0.2451, "step": 280900 }, { "epoch": 0.322, "grad_norm": 0.18145489692687988, "learning_rate": 8.76004e-05, "loss": 0.2469, "step": 281000 }, { "epoch": 0.3222, "grad_norm": 0.15016548335552216, "learning_rate": 8.756040000000001e-05, "loss": 0.2496, "step": 281100 }, { "epoch": 0.3224, "grad_norm": 0.3445073962211609, "learning_rate": 8.75204e-05, "loss": 0.2484, "step": 281200 }, { "epoch": 0.3226, "grad_norm": 0.17580726742744446, "learning_rate": 8.748040000000001e-05, "loss": 0.2482, "step": 281300 }, { "epoch": 0.3228, "grad_norm": 0.17913737893104553, "learning_rate": 8.74404e-05, "loss": 0.2457, "step": 281400 }, { "epoch": 0.323, "grad_norm": 0.20740285515785217, "learning_rate": 8.740040000000001e-05, "loss": 0.2519, "step": 281500 }, { "epoch": 0.3232, "grad_norm": 0.15794052183628082, "learning_rate": 8.73604e-05, "loss": 0.2478, "step": 281600 }, { "epoch": 0.3234, "grad_norm": 0.204098641872406, "learning_rate": 8.73204e-05, "loss": 0.2478, "step": 281700 }, { "epoch": 0.3236, "grad_norm": 0.17985448241233826, "learning_rate": 8.72804e-05, "loss": 0.2511, "step": 281800 }, { "epoch": 0.3238, "grad_norm": 0.20703667402267456, "learning_rate": 8.72404e-05, "loss": 0.249, "step": 
281900 }, { "epoch": 0.324, "grad_norm": 0.3906717002391815, "learning_rate": 8.72004e-05, "loss": 0.2498, "step": 282000 }, { "epoch": 0.3242, "grad_norm": 0.32392579317092896, "learning_rate": 8.716040000000001e-05, "loss": 0.2529, "step": 282100 }, { "epoch": 0.3244, "grad_norm": 0.2961254119873047, "learning_rate": 8.712040000000001e-05, "loss": 0.2528, "step": 282200 }, { "epoch": 0.3246, "grad_norm": 0.18365971744060516, "learning_rate": 8.70804e-05, "loss": 0.2508, "step": 282300 }, { "epoch": 0.3248, "grad_norm": 0.30931851267814636, "learning_rate": 8.70404e-05, "loss": 0.2557, "step": 282400 }, { "epoch": 0.325, "grad_norm": 0.18898595869541168, "learning_rate": 8.70004e-05, "loss": 0.2444, "step": 282500 }, { "epoch": 0.3252, "grad_norm": 0.27550897002220154, "learning_rate": 8.696040000000001e-05, "loss": 0.2427, "step": 282600 }, { "epoch": 0.3254, "grad_norm": 0.19707511365413666, "learning_rate": 8.69204e-05, "loss": 0.2517, "step": 282700 }, { "epoch": 0.3256, "grad_norm": 0.18959122896194458, "learning_rate": 8.688040000000001e-05, "loss": 0.2474, "step": 282800 }, { "epoch": 0.3258, "grad_norm": 0.263849675655365, "learning_rate": 8.68404e-05, "loss": 0.2497, "step": 282900 }, { "epoch": 0.326, "grad_norm": 0.17215219140052795, "learning_rate": 8.680040000000001e-05, "loss": 0.256, "step": 283000 }, { "epoch": 0.3262, "grad_norm": 0.1411217749118805, "learning_rate": 8.67604e-05, "loss": 0.2485, "step": 283100 }, { "epoch": 0.3264, "grad_norm": 0.19203592836856842, "learning_rate": 8.67204e-05, "loss": 0.2478, "step": 283200 }, { "epoch": 0.3266, "grad_norm": 0.18021759390830994, "learning_rate": 8.66804e-05, "loss": 0.2478, "step": 283300 }, { "epoch": 0.3268, "grad_norm": 0.15332086384296417, "learning_rate": 8.66404e-05, "loss": 0.2443, "step": 283400 }, { "epoch": 0.327, "grad_norm": 0.1966552585363388, "learning_rate": 8.66004e-05, "loss": 0.248, "step": 283500 }, { "epoch": 0.3272, "grad_norm": 0.21253438293933868, "learning_rate": 
8.656040000000001e-05, "loss": 0.2472, "step": 283600 }, { "epoch": 0.3274, "grad_norm": 0.253312349319458, "learning_rate": 8.65204e-05, "loss": 0.2522, "step": 283700 }, { "epoch": 0.3276, "grad_norm": 0.17193368077278137, "learning_rate": 8.64804e-05, "loss": 0.2564, "step": 283800 }, { "epoch": 0.3278, "grad_norm": 0.17592443525791168, "learning_rate": 8.644039999999999e-05, "loss": 0.2484, "step": 283900 }, { "epoch": 0.328, "grad_norm": 0.200908824801445, "learning_rate": 8.64004e-05, "loss": 0.2473, "step": 284000 }, { "epoch": 0.3282, "grad_norm": 0.21171993017196655, "learning_rate": 8.636040000000001e-05, "loss": 0.2482, "step": 284100 }, { "epoch": 0.3284, "grad_norm": 0.21706272661685944, "learning_rate": 8.63204e-05, "loss": 0.2537, "step": 284200 }, { "epoch": 0.3286, "grad_norm": 0.21344873309135437, "learning_rate": 8.628040000000001e-05, "loss": 0.2501, "step": 284300 }, { "epoch": 0.3288, "grad_norm": 0.19360865652561188, "learning_rate": 8.62404e-05, "loss": 0.2499, "step": 284400 }, { "epoch": 0.329, "grad_norm": 0.16247357428073883, "learning_rate": 8.620040000000001e-05, "loss": 0.246, "step": 284500 }, { "epoch": 0.3292, "grad_norm": 0.205808624625206, "learning_rate": 8.61604e-05, "loss": 0.2592, "step": 284600 }, { "epoch": 0.3294, "grad_norm": 0.21998731791973114, "learning_rate": 8.61204e-05, "loss": 0.2512, "step": 284700 }, { "epoch": 0.3296, "grad_norm": 0.17610786855220795, "learning_rate": 8.60804e-05, "loss": 0.2474, "step": 284800 }, { "epoch": 0.3298, "grad_norm": 0.1882948875427246, "learning_rate": 8.60404e-05, "loss": 0.2512, "step": 284900 }, { "epoch": 0.33, "grad_norm": 0.1443343162536621, "learning_rate": 8.60004e-05, "loss": 0.244, "step": 285000 }, { "epoch": 0.3302, "grad_norm": 0.22007732093334198, "learning_rate": 8.596040000000001e-05, "loss": 0.252, "step": 285100 }, { "epoch": 0.3304, "grad_norm": 0.1846906691789627, "learning_rate": 8.59204e-05, "loss": 0.2482, "step": 285200 }, { "epoch": 0.3306, "grad_norm": 
0.19831278920173645, "learning_rate": 8.58804e-05, "loss": 0.2463, "step": 285300 }, { "epoch": 0.3308, "grad_norm": 0.16694426536560059, "learning_rate": 8.584039999999999e-05, "loss": 0.2461, "step": 285400 }, { "epoch": 0.331, "grad_norm": 0.22869306802749634, "learning_rate": 8.58004e-05, "loss": 0.2461, "step": 285500 }, { "epoch": 0.3312, "grad_norm": 0.29163485765457153, "learning_rate": 8.57604e-05, "loss": 0.2514, "step": 285600 }, { "epoch": 0.3314, "grad_norm": 0.20715934038162231, "learning_rate": 8.57204e-05, "loss": 0.2494, "step": 285700 }, { "epoch": 0.3316, "grad_norm": 0.2089393138885498, "learning_rate": 8.568040000000001e-05, "loss": 0.255, "step": 285800 }, { "epoch": 0.3318, "grad_norm": 0.1605835258960724, "learning_rate": 8.564040000000001e-05, "loss": 0.2502, "step": 285900 }, { "epoch": 0.332, "grad_norm": 0.21266242861747742, "learning_rate": 8.560040000000001e-05, "loss": 0.2506, "step": 286000 }, { "epoch": 0.3322, "grad_norm": 0.18077294528484344, "learning_rate": 8.55604e-05, "loss": 0.2463, "step": 286100 }, { "epoch": 0.3324, "grad_norm": 0.2691899538040161, "learning_rate": 8.55204e-05, "loss": 0.2484, "step": 286200 }, { "epoch": 0.3326, "grad_norm": 0.20911413431167603, "learning_rate": 8.54804e-05, "loss": 0.2492, "step": 286300 }, { "epoch": 0.3328, "grad_norm": 0.20955608785152435, "learning_rate": 8.544040000000001e-05, "loss": 0.2501, "step": 286400 }, { "epoch": 0.333, "grad_norm": 0.16261516511440277, "learning_rate": 8.54004e-05, "loss": 0.2481, "step": 286500 }, { "epoch": 0.3332, "grad_norm": 0.3086774945259094, "learning_rate": 8.536040000000001e-05, "loss": 0.2505, "step": 286600 }, { "epoch": 0.3334, "grad_norm": 0.16996116936206818, "learning_rate": 8.53204e-05, "loss": 0.25, "step": 286700 }, { "epoch": 0.3336, "grad_norm": 0.2675207853317261, "learning_rate": 8.528040000000001e-05, "loss": 0.2459, "step": 286800 }, { "epoch": 0.3338, "grad_norm": 0.26065778732299805, "learning_rate": 8.52404e-05, "loss": 0.2468, 
"step": 286900 }, { "epoch": 0.334, "grad_norm": 0.15775251388549805, "learning_rate": 8.52004e-05, "loss": 0.248, "step": 287000 }, { "epoch": 0.3342, "grad_norm": 0.17829616367816925, "learning_rate": 8.51604e-05, "loss": 0.2522, "step": 287100 }, { "epoch": 0.3344, "grad_norm": 0.283382385969162, "learning_rate": 8.51204e-05, "loss": 0.2504, "step": 287200 }, { "epoch": 0.3346, "grad_norm": 0.21750444173812866, "learning_rate": 8.50804e-05, "loss": 0.2458, "step": 287300 }, { "epoch": 0.3348, "grad_norm": 0.17604978382587433, "learning_rate": 8.504040000000001e-05, "loss": 0.2543, "step": 287400 }, { "epoch": 0.335, "grad_norm": 0.17278654873371124, "learning_rate": 8.500040000000001e-05, "loss": 0.2437, "step": 287500 }, { "epoch": 0.3352, "grad_norm": 0.1866379976272583, "learning_rate": 8.49604e-05, "loss": 0.2473, "step": 287600 }, { "epoch": 0.3354, "grad_norm": 0.18013985455036163, "learning_rate": 8.49204e-05, "loss": 0.2504, "step": 287700 }, { "epoch": 0.3356, "grad_norm": 0.1813124716281891, "learning_rate": 8.48804e-05, "loss": 0.25, "step": 287800 }, { "epoch": 0.3358, "grad_norm": 0.1880805343389511, "learning_rate": 8.484040000000001e-05, "loss": 0.2506, "step": 287900 }, { "epoch": 0.336, "grad_norm": 0.20966345071792603, "learning_rate": 8.48004e-05, "loss": 0.2508, "step": 288000 }, { "epoch": 0.3362, "grad_norm": 0.23822133243083954, "learning_rate": 8.476040000000001e-05, "loss": 0.2549, "step": 288100 }, { "epoch": 0.3364, "grad_norm": 0.17772118747234344, "learning_rate": 8.47204e-05, "loss": 0.2441, "step": 288200 }, { "epoch": 0.3366, "grad_norm": 0.22590018808841705, "learning_rate": 8.468040000000001e-05, "loss": 0.2517, "step": 288300 }, { "epoch": 0.3368, "grad_norm": 0.19911415874958038, "learning_rate": 8.46404e-05, "loss": 0.2495, "step": 288400 }, { "epoch": 0.337, "grad_norm": 0.19281023740768433, "learning_rate": 8.46004e-05, "loss": 0.2535, "step": 288500 }, { "epoch": 0.3372, "grad_norm": 0.18460847437381744, "learning_rate": 
8.45604e-05, "loss": 0.2541, "step": 288600 }, { "epoch": 0.3374, "grad_norm": 0.2108038067817688, "learning_rate": 8.45204e-05, "loss": 0.252, "step": 288700 }, { "epoch": 0.3376, "grad_norm": 0.24235336482524872, "learning_rate": 8.44804e-05, "loss": 0.2482, "step": 288800 }, { "epoch": 0.3378, "grad_norm": 0.16646993160247803, "learning_rate": 8.444040000000001e-05, "loss": 0.254, "step": 288900 }, { "epoch": 0.338, "grad_norm": 0.2231166511774063, "learning_rate": 8.44004e-05, "loss": 0.2479, "step": 289000 }, { "epoch": 0.3382, "grad_norm": 0.24372661113739014, "learning_rate": 8.43604e-05, "loss": 0.2488, "step": 289100 }, { "epoch": 0.3384, "grad_norm": 0.1555611491203308, "learning_rate": 8.432039999999999e-05, "loss": 0.2532, "step": 289200 }, { "epoch": 0.3386, "grad_norm": 0.22455960512161255, "learning_rate": 8.42804e-05, "loss": 0.2474, "step": 289300 }, { "epoch": 0.3388, "grad_norm": 0.2038494348526001, "learning_rate": 8.424040000000001e-05, "loss": 0.2468, "step": 289400 }, { "epoch": 0.339, "grad_norm": 0.2508559226989746, "learning_rate": 8.42004e-05, "loss": 0.2489, "step": 289500 }, { "epoch": 0.3392, "grad_norm": 0.2195146679878235, "learning_rate": 8.416040000000001e-05, "loss": 0.2452, "step": 289600 }, { "epoch": 0.3394, "grad_norm": 0.19063441455364227, "learning_rate": 8.41204e-05, "loss": 0.2491, "step": 289700 }, { "epoch": 0.3396, "grad_norm": 0.2436915785074234, "learning_rate": 8.408040000000001e-05, "loss": 0.2508, "step": 289800 }, { "epoch": 0.3398, "grad_norm": 0.20626668632030487, "learning_rate": 8.40404e-05, "loss": 0.2497, "step": 289900 }, { "epoch": 0.34, "grad_norm": 0.1684507131576538, "learning_rate": 8.40004e-05, "loss": 0.2493, "step": 290000 }, { "epoch": 0.3402, "grad_norm": 0.16648483276367188, "learning_rate": 8.39604e-05, "loss": 0.2526, "step": 290100 }, { "epoch": 0.3404, "grad_norm": 0.1838679313659668, "learning_rate": 8.39204e-05, "loss": 0.2477, "step": 290200 }, { "epoch": 0.3406, "grad_norm": 
0.204168438911438, "learning_rate": 8.38804e-05, "loss": 0.247, "step": 290300 }, { "epoch": 0.3408, "grad_norm": 0.18529953062534332, "learning_rate": 8.384040000000001e-05, "loss": 0.25, "step": 290400 }, { "epoch": 0.341, "grad_norm": 0.17540723085403442, "learning_rate": 8.38004e-05, "loss": 0.2467, "step": 290500 }, { "epoch": 0.3412, "grad_norm": 0.15867659449577332, "learning_rate": 8.376040000000001e-05, "loss": 0.2442, "step": 290600 }, { "epoch": 0.3414, "grad_norm": 0.24484454095363617, "learning_rate": 8.372039999999999e-05, "loss": 0.2486, "step": 290700 }, { "epoch": 0.3416, "grad_norm": 0.21966548264026642, "learning_rate": 8.36804e-05, "loss": 0.2487, "step": 290800 }, { "epoch": 0.3418, "grad_norm": 0.16184256970882416, "learning_rate": 8.36404e-05, "loss": 0.2464, "step": 290900 }, { "epoch": 0.342, "grad_norm": 0.2605167329311371, "learning_rate": 8.36004e-05, "loss": 0.2456, "step": 291000 }, { "epoch": 0.3422, "grad_norm": 0.15184715390205383, "learning_rate": 8.356040000000001e-05, "loss": 0.2544, "step": 291100 }, { "epoch": 0.3424, "grad_norm": 0.18287336826324463, "learning_rate": 8.35204e-05, "loss": 0.2461, "step": 291200 }, { "epoch": 0.3426, "grad_norm": 0.1705942153930664, "learning_rate": 8.348040000000001e-05, "loss": 0.2468, "step": 291300 }, { "epoch": 0.3428, "grad_norm": 0.4577696919441223, "learning_rate": 8.34404e-05, "loss": 0.2488, "step": 291400 }, { "epoch": 0.343, "grad_norm": 0.21272404491901398, "learning_rate": 8.34004e-05, "loss": 0.2522, "step": 291500 }, { "epoch": 0.3432, "grad_norm": 0.24788586795330048, "learning_rate": 8.33604e-05, "loss": 0.2437, "step": 291600 }, { "epoch": 0.3434, "grad_norm": 0.16285791993141174, "learning_rate": 8.33204e-05, "loss": 0.2455, "step": 291700 }, { "epoch": 0.3436, "grad_norm": 0.1585230529308319, "learning_rate": 8.32804e-05, "loss": 0.2454, "step": 291800 }, { "epoch": 0.3438, "grad_norm": 0.1985248178243637, "learning_rate": 8.324040000000001e-05, "loss": 0.2475, "step": 
291900 }, { "epoch": 0.344, "grad_norm": 0.3097187280654907, "learning_rate": 8.32004e-05, "loss": 0.2479, "step": 292000 }, { "epoch": 0.3442, "grad_norm": 0.21880201995372772, "learning_rate": 8.316040000000001e-05, "loss": 0.2495, "step": 292100 }, { "epoch": 0.3444, "grad_norm": 0.1829034388065338, "learning_rate": 8.31204e-05, "loss": 0.2501, "step": 292200 }, { "epoch": 0.3446, "grad_norm": 0.1782861053943634, "learning_rate": 8.30804e-05, "loss": 0.2532, "step": 292300 }, { "epoch": 0.3448, "grad_norm": 0.22069762647151947, "learning_rate": 8.30404e-05, "loss": 0.2608, "step": 292400 }, { "epoch": 0.345, "grad_norm": 0.18594609200954437, "learning_rate": 8.30004e-05, "loss": 0.2552, "step": 292500 }, { "epoch": 0.3452, "grad_norm": 0.17892573773860931, "learning_rate": 8.29604e-05, "loss": 0.2488, "step": 292600 }, { "epoch": 0.3454, "grad_norm": 0.18807783722877502, "learning_rate": 8.292040000000001e-05, "loss": 0.2513, "step": 292700 }, { "epoch": 0.3456, "grad_norm": 0.19882932305335999, "learning_rate": 8.288040000000001e-05, "loss": 0.2489, "step": 292800 }, { "epoch": 0.3458, "grad_norm": 0.19057603180408478, "learning_rate": 8.28404e-05, "loss": 0.2479, "step": 292900 }, { "epoch": 0.346, "grad_norm": 0.2387564480304718, "learning_rate": 8.28004e-05, "loss": 0.2488, "step": 293000 }, { "epoch": 0.3462, "grad_norm": 0.22695794701576233, "learning_rate": 8.27604e-05, "loss": 0.2452, "step": 293100 }, { "epoch": 0.3464, "grad_norm": 0.15103566646575928, "learning_rate": 8.272040000000001e-05, "loss": 0.2488, "step": 293200 }, { "epoch": 0.3466, "grad_norm": 0.16485248506069183, "learning_rate": 8.26804e-05, "loss": 0.2457, "step": 293300 }, { "epoch": 0.3468, "grad_norm": 0.1788826584815979, "learning_rate": 8.264040000000001e-05, "loss": 0.2479, "step": 293400 }, { "epoch": 0.347, "grad_norm": 0.17442439496517181, "learning_rate": 8.26004e-05, "loss": 0.2471, "step": 293500 }, { "epoch": 0.3472, "grad_norm": 0.5051277875900269, "learning_rate": 
8.256040000000001e-05, "loss": 0.2474, "step": 293600 }, { "epoch": 0.3474, "grad_norm": 0.40286529064178467, "learning_rate": 8.25204e-05, "loss": 0.2482, "step": 293700 }, { "epoch": 0.3476, "grad_norm": 0.2842089831829071, "learning_rate": 8.24804e-05, "loss": 0.2494, "step": 293800 }, { "epoch": 0.3478, "grad_norm": 0.17578501999378204, "learning_rate": 8.24404e-05, "loss": 0.2527, "step": 293900 }, { "epoch": 0.348, "grad_norm": 0.17574481666088104, "learning_rate": 8.24004e-05, "loss": 0.2463, "step": 294000 }, { "epoch": 0.3482, "grad_norm": 0.1960979551076889, "learning_rate": 8.23604e-05, "loss": 0.2495, "step": 294100 }, { "epoch": 0.3484, "grad_norm": 0.1842304915189743, "learning_rate": 8.232040000000001e-05, "loss": 0.25, "step": 294200 }, { "epoch": 0.3486, "grad_norm": 0.17555032670497894, "learning_rate": 8.22804e-05, "loss": 0.2487, "step": 294300 }, { "epoch": 0.3488, "grad_norm": 0.18717283010482788, "learning_rate": 8.224040000000001e-05, "loss": 0.2462, "step": 294400 }, { "epoch": 0.349, "grad_norm": 0.18235217034816742, "learning_rate": 8.220039999999999e-05, "loss": 0.2472, "step": 294500 }, { "epoch": 0.3492, "grad_norm": 0.19905789196491241, "learning_rate": 8.21604e-05, "loss": 0.2485, "step": 294600 }, { "epoch": 0.3494, "grad_norm": 0.19601567089557648, "learning_rate": 8.212040000000001e-05, "loss": 0.2519, "step": 294700 }, { "epoch": 0.3496, "grad_norm": 0.16348402202129364, "learning_rate": 8.20804e-05, "loss": 0.2463, "step": 294800 }, { "epoch": 0.3498, "grad_norm": 0.16451190412044525, "learning_rate": 8.204040000000001e-05, "loss": 0.251, "step": 294900 }, { "epoch": 0.35, "grad_norm": 0.20977216958999634, "learning_rate": 8.20004e-05, "loss": 0.2465, "step": 295000 }, { "epoch": 0.3502, "grad_norm": 0.3382442891597748, "learning_rate": 8.196040000000001e-05, "loss": 0.2456, "step": 295100 }, { "epoch": 0.3504, "grad_norm": 0.22302016615867615, "learning_rate": 8.19204e-05, "loss": 0.2469, "step": 295200 }, { "epoch": 0.3506, 
"grad_norm": 0.1620716005563736, "learning_rate": 8.18804e-05, "loss": 0.2435, "step": 295300 }, { "epoch": 0.3508, "grad_norm": 0.2126971334218979, "learning_rate": 8.18404e-05, "loss": 0.2444, "step": 295400 }, { "epoch": 0.351, "grad_norm": 0.1620679646730423, "learning_rate": 8.18004e-05, "loss": 0.2402, "step": 295500 }, { "epoch": 0.3512, "grad_norm": 0.19541557133197784, "learning_rate": 8.17604e-05, "loss": 0.2426, "step": 295600 }, { "epoch": 0.3514, "grad_norm": 0.18142946064472198, "learning_rate": 8.172040000000001e-05, "loss": 0.2452, "step": 295700 }, { "epoch": 0.3516, "grad_norm": 0.18419720232486725, "learning_rate": 8.16804e-05, "loss": 0.2466, "step": 295800 }, { "epoch": 0.3518, "grad_norm": 0.19173642992973328, "learning_rate": 8.164040000000001e-05, "loss": 0.2487, "step": 295900 }, { "epoch": 0.352, "grad_norm": 0.16842658817768097, "learning_rate": 8.16004e-05, "loss": 0.2508, "step": 296000 }, { "epoch": 0.3522, "grad_norm": 0.23482634127140045, "learning_rate": 8.15604e-05, "loss": 0.2421, "step": 296100 }, { "epoch": 0.3524, "grad_norm": 0.17169761657714844, "learning_rate": 8.15204e-05, "loss": 0.2454, "step": 296200 }, { "epoch": 0.3526, "grad_norm": 0.1711745262145996, "learning_rate": 8.14804e-05, "loss": 0.2424, "step": 296300 }, { "epoch": 0.3528, "grad_norm": 0.18024346232414246, "learning_rate": 8.144040000000001e-05, "loss": 0.244, "step": 296400 }, { "epoch": 0.353, "grad_norm": 0.468720018863678, "learning_rate": 8.14004e-05, "loss": 0.2452, "step": 296500 }, { "epoch": 0.3532, "grad_norm": 0.16344809532165527, "learning_rate": 8.136040000000001e-05, "loss": 0.2426, "step": 296600 }, { "epoch": 0.3534, "grad_norm": 0.22236470878124237, "learning_rate": 8.13204e-05, "loss": 0.2466, "step": 296700 }, { "epoch": 0.3536, "grad_norm": 0.19982776045799255, "learning_rate": 8.12804e-05, "loss": 0.2506, "step": 296800 }, { "epoch": 0.3538, "grad_norm": 0.1859310418367386, "learning_rate": 8.12404e-05, "loss": 0.251, "step": 296900 }, { 
"epoch": 0.354, "grad_norm": 0.21040932834148407, "learning_rate": 8.12004e-05, "loss": 0.2457, "step": 297000 }, { "epoch": 0.3542, "grad_norm": 0.1694370061159134, "learning_rate": 8.11604e-05, "loss": 0.2444, "step": 297100 }, { "epoch": 0.3544, "grad_norm": 0.17156536877155304, "learning_rate": 8.112040000000001e-05, "loss": 0.2424, "step": 297200 }, { "epoch": 0.3546, "grad_norm": 0.17718739807605743, "learning_rate": 8.10804e-05, "loss": 0.2411, "step": 297300 }, { "epoch": 0.3548, "grad_norm": 0.23720037937164307, "learning_rate": 8.104040000000001e-05, "loss": 0.2469, "step": 297400 }, { "epoch": 0.355, "grad_norm": 0.2064405083656311, "learning_rate": 8.10004e-05, "loss": 0.2415, "step": 297500 }, { "epoch": 0.3552, "grad_norm": 0.1649145632982254, "learning_rate": 8.09604e-05, "loss": 0.2421, "step": 297600 }, { "epoch": 0.3554, "grad_norm": 0.1422872543334961, "learning_rate": 8.09204e-05, "loss": 0.2414, "step": 297700 }, { "epoch": 0.3556, "grad_norm": 0.42385339736938477, "learning_rate": 8.08804e-05, "loss": 0.2425, "step": 297800 }, { "epoch": 0.3558, "grad_norm": 0.1627955287694931, "learning_rate": 8.08404e-05, "loss": 0.2481, "step": 297900 }, { "epoch": 0.356, "grad_norm": 0.1613747775554657, "learning_rate": 8.08004e-05, "loss": 0.2478, "step": 298000 }, { "epoch": 0.3562, "grad_norm": 0.23526862263679504, "learning_rate": 8.076040000000001e-05, "loss": 0.248, "step": 298100 }, { "epoch": 0.3564, "grad_norm": 0.18639631569385529, "learning_rate": 8.07204e-05, "loss": 0.2567, "step": 298200 }, { "epoch": 0.3566, "grad_norm": 0.2338380515575409, "learning_rate": 8.06804e-05, "loss": 0.2494, "step": 298300 }, { "epoch": 0.3568, "grad_norm": 0.165755957365036, "learning_rate": 8.06404e-05, "loss": 0.2518, "step": 298400 }, { "epoch": 0.357, "grad_norm": 0.1789596974849701, "learning_rate": 8.060040000000001e-05, "loss": 0.2483, "step": 298500 }, { "epoch": 0.3572, "grad_norm": 0.174707293510437, "learning_rate": 8.05604e-05, "loss": 0.2445, "step": 
298600 }, { "epoch": 0.3574, "grad_norm": 0.15874457359313965, "learning_rate": 8.052040000000001e-05, "loss": 0.2526, "step": 298700 }, { "epoch": 0.3576, "grad_norm": 0.15661177039146423, "learning_rate": 8.04804e-05, "loss": 0.2447, "step": 298800 }, { "epoch": 0.3578, "grad_norm": 0.185741126537323, "learning_rate": 8.044040000000001e-05, "loss": 0.254, "step": 298900 }, { "epoch": 0.358, "grad_norm": 0.1975129246711731, "learning_rate": 8.04004e-05, "loss": 0.2453, "step": 299000 }, { "epoch": 0.3582, "grad_norm": 0.21340824663639069, "learning_rate": 8.03604e-05, "loss": 0.2436, "step": 299100 }, { "epoch": 0.3584, "grad_norm": 0.1729285567998886, "learning_rate": 8.03204e-05, "loss": 0.2495, "step": 299200 }, { "epoch": 0.3586, "grad_norm": 0.2982395589351654, "learning_rate": 8.02804e-05, "loss": 0.2443, "step": 299300 }, { "epoch": 0.3588, "grad_norm": 0.18718211352825165, "learning_rate": 8.02404e-05, "loss": 0.2421, "step": 299400 }, { "epoch": 0.359, "grad_norm": 0.18755905330181122, "learning_rate": 8.020040000000001e-05, "loss": 0.2517, "step": 299500 }, { "epoch": 0.3592, "grad_norm": 0.15775266289710999, "learning_rate": 8.01604e-05, "loss": 0.2471, "step": 299600 }, { "epoch": 0.3594, "grad_norm": 0.14446353912353516, "learning_rate": 8.012040000000001e-05, "loss": 0.2433, "step": 299700 }, { "epoch": 0.3596, "grad_norm": 0.17306815087795258, "learning_rate": 8.008039999999999e-05, "loss": 0.2464, "step": 299800 }, { "epoch": 0.3598, "grad_norm": 0.22161149978637695, "learning_rate": 8.00404e-05, "loss": 0.2467, "step": 299900 }, { "epoch": 0.36, "grad_norm": 0.15656442940235138, "learning_rate": 8.000040000000001e-05, "loss": 0.2445, "step": 300000 }, { "epoch": 0.3602, "grad_norm": 0.17966248095035553, "learning_rate": 7.99604e-05, "loss": 0.2451, "step": 300100 }, { "epoch": 0.3604, "grad_norm": 0.18749384582042694, "learning_rate": 7.992040000000001e-05, "loss": 0.242, "step": 300200 }, { "epoch": 0.3606, "grad_norm": 0.18940359354019165, 
"learning_rate": 7.98804e-05, "loss": 0.241, "step": 300300 }, { "epoch": 0.3608, "grad_norm": 0.1695726066827774, "learning_rate": 7.984040000000001e-05, "loss": 0.247, "step": 300400 }, { "epoch": 0.361, "grad_norm": 0.23196519911289215, "learning_rate": 7.98004e-05, "loss": 0.2518, "step": 300500 }, { "epoch": 0.3612, "grad_norm": 0.1548641175031662, "learning_rate": 7.97604e-05, "loss": 0.2424, "step": 300600 }, { "epoch": 0.3614, "grad_norm": 0.25332915782928467, "learning_rate": 7.97204e-05, "loss": 0.2428, "step": 300700 }, { "epoch": 0.3616, "grad_norm": 0.21986594796180725, "learning_rate": 7.96804e-05, "loss": 0.2447, "step": 300800 }, { "epoch": 0.3618, "grad_norm": 0.18515200912952423, "learning_rate": 7.96404e-05, "loss": 0.2424, "step": 300900 }, { "epoch": 0.362, "grad_norm": 0.23712636530399323, "learning_rate": 7.960040000000001e-05, "loss": 0.2513, "step": 301000 }, { "epoch": 0.3622, "grad_norm": 0.21687628328800201, "learning_rate": 7.95604e-05, "loss": 0.249, "step": 301100 }, { "epoch": 0.3624, "grad_norm": 0.22492177784442902, "learning_rate": 7.952040000000001e-05, "loss": 0.2463, "step": 301200 }, { "epoch": 0.3626, "grad_norm": 0.19002969563007355, "learning_rate": 7.94804e-05, "loss": 0.2455, "step": 301300 }, { "epoch": 0.3628, "grad_norm": 0.1988956481218338, "learning_rate": 7.94404e-05, "loss": 0.2405, "step": 301400 }, { "epoch": 0.363, "grad_norm": 0.1998603492975235, "learning_rate": 7.94004e-05, "loss": 0.2454, "step": 301500 }, { "epoch": 0.3632, "grad_norm": 0.23634324967861176, "learning_rate": 7.93604e-05, "loss": 0.2481, "step": 301600 }, { "epoch": 0.3634, "grad_norm": 0.17416614294052124, "learning_rate": 7.932040000000001e-05, "loss": 0.2418, "step": 301700 }, { "epoch": 0.3636, "grad_norm": 0.2007831186056137, "learning_rate": 7.92804e-05, "loss": 0.2538, "step": 301800 }, { "epoch": 0.3638, "grad_norm": 0.1813400238752365, "learning_rate": 7.924040000000001e-05, "loss": 0.2553, "step": 301900 }, { "epoch": 0.364, 
"grad_norm": 0.18407954275608063, "learning_rate": 7.92004e-05, "loss": 0.2429, "step": 302000 }, { "epoch": 0.3642, "grad_norm": 0.2030446082353592, "learning_rate": 7.91604e-05, "loss": 0.243, "step": 302100 }, { "epoch": 0.3644, "grad_norm": 0.18323349952697754, "learning_rate": 7.91204e-05, "loss": 0.2449, "step": 302200 }, { "epoch": 0.3646, "grad_norm": 0.1829667091369629, "learning_rate": 7.90804e-05, "loss": 0.2497, "step": 302300 }, { "epoch": 0.3648, "grad_norm": 0.21287867426872253, "learning_rate": 7.90404e-05, "loss": 0.2463, "step": 302400 }, { "epoch": 0.365, "grad_norm": 0.17376112937927246, "learning_rate": 7.900040000000001e-05, "loss": 0.241, "step": 302500 }, { "epoch": 0.3652, "grad_norm": 0.19770017266273499, "learning_rate": 7.89604e-05, "loss": 0.247, "step": 302600 }, { "epoch": 0.3654, "grad_norm": 0.2480800300836563, "learning_rate": 7.892040000000001e-05, "loss": 0.2408, "step": 302700 }, { "epoch": 0.3656, "grad_norm": 0.1723504662513733, "learning_rate": 7.88804e-05, "loss": 0.2475, "step": 302800 }, { "epoch": 0.3658, "grad_norm": 0.19746597111225128, "learning_rate": 7.88404e-05, "loss": 0.2387, "step": 302900 }, { "epoch": 0.366, "grad_norm": 0.2700865864753723, "learning_rate": 7.88004e-05, "loss": 0.2431, "step": 303000 }, { "epoch": 0.3662, "grad_norm": 0.18088355660438538, "learning_rate": 7.87604e-05, "loss": 0.2425, "step": 303100 }, { "epoch": 0.3664, "grad_norm": 0.24252882599830627, "learning_rate": 7.87204e-05, "loss": 0.2456, "step": 303200 }, { "epoch": 0.3666, "grad_norm": 0.18904949724674225, "learning_rate": 7.86804e-05, "loss": 0.2485, "step": 303300 }, { "epoch": 0.3668, "grad_norm": 0.2009989470243454, "learning_rate": 7.864040000000001e-05, "loss": 0.2457, "step": 303400 }, { "epoch": 0.367, "grad_norm": 0.21063704788684845, "learning_rate": 7.860040000000001e-05, "loss": 0.2437, "step": 303500 }, { "epoch": 0.3672, "grad_norm": 0.24128037691116333, "learning_rate": 7.85604e-05, "loss": 0.2519, "step": 303600 }, { 
"epoch": 0.3674, "grad_norm": 0.18868161737918854, "learning_rate": 7.85204e-05, "loss": 0.2456, "step": 303700 }, { "epoch": 0.3676, "grad_norm": 0.24962688982486725, "learning_rate": 7.84804e-05, "loss": 0.2483, "step": 303800 }, { "epoch": 0.3678, "grad_norm": 0.15526580810546875, "learning_rate": 7.84404e-05, "loss": 0.2507, "step": 303900 }, { "epoch": 0.368, "grad_norm": 0.21151374280452728, "learning_rate": 7.840040000000001e-05, "loss": 0.2587, "step": 304000 }, { "epoch": 0.3682, "grad_norm": 0.18684406578540802, "learning_rate": 7.83604e-05, "loss": 0.2472, "step": 304100 }, { "epoch": 0.3684, "grad_norm": 0.5539630651473999, "learning_rate": 7.832040000000001e-05, "loss": 0.2514, "step": 304200 }, { "epoch": 0.3686, "grad_norm": 0.31068259477615356, "learning_rate": 7.82804e-05, "loss": 0.2521, "step": 304300 }, { "epoch": 0.3688, "grad_norm": 0.15399712324142456, "learning_rate": 7.82404e-05, "loss": 0.245, "step": 304400 }, { "epoch": 0.369, "grad_norm": 0.16748294234275818, "learning_rate": 7.82004e-05, "loss": 0.2458, "step": 304500 }, { "epoch": 0.3692, "grad_norm": 0.35579004883766174, "learning_rate": 7.81604e-05, "loss": 0.2753, "step": 304600 }, { "epoch": 0.3694, "grad_norm": 0.28640273213386536, "learning_rate": 7.81204e-05, "loss": 0.3173, "step": 304700 }, { "epoch": 0.3696, "grad_norm": 0.5388140082359314, "learning_rate": 7.808040000000001e-05, "loss": 0.3084, "step": 304800 }, { "epoch": 0.3698, "grad_norm": 0.28529220819473267, "learning_rate": 7.80404e-05, "loss": 0.2922, "step": 304900 }, { "epoch": 0.37, "grad_norm": 0.465318888425827, "learning_rate": 7.800040000000001e-05, "loss": 0.3043, "step": 305000 }, { "epoch": 0.3702, "grad_norm": 0.25032395124435425, "learning_rate": 7.79604e-05, "loss": 0.3063, "step": 305100 }, { "epoch": 0.3704, "grad_norm": 0.4543182849884033, "learning_rate": 7.79204e-05, "loss": 0.2866, "step": 305200 }, { "epoch": 0.3706, "grad_norm": 0.305585652589798, "learning_rate": 7.788040000000001e-05, "loss": 
0.2914, "step": 305300 }, { "epoch": 0.3708, "grad_norm": 0.9315662980079651, "learning_rate": 7.78404e-05, "loss": 0.2972, "step": 305400 }, { "epoch": 0.371, "grad_norm": 0.32617905735969543, "learning_rate": 7.780040000000001e-05, "loss": 0.3116, "step": 305500 }, { "epoch": 0.3712, "grad_norm": 0.3560982346534729, "learning_rate": 7.77604e-05, "loss": 0.3046, "step": 305600 }, { "epoch": 0.3714, "grad_norm": 0.23608624935150146, "learning_rate": 7.772040000000001e-05, "loss": 0.2764, "step": 305700 }, { "epoch": 0.3716, "grad_norm": 0.429745614528656, "learning_rate": 7.76804e-05, "loss": 0.3084, "step": 305800 }, { "epoch": 0.3718, "grad_norm": 0.181761234998703, "learning_rate": 7.76404e-05, "loss": 0.2848, "step": 305900 }, { "epoch": 0.372, "grad_norm": 0.2306230366230011, "learning_rate": 7.76004e-05, "loss": 0.2956, "step": 306000 }, { "epoch": 0.3722, "grad_norm": 0.22008122503757477, "learning_rate": 7.75604e-05, "loss": 0.2669, "step": 306100 }, { "epoch": 0.3724, "grad_norm": 0.1795935481786728, "learning_rate": 7.75204e-05, "loss": 0.2864, "step": 306200 }, { "epoch": 0.3726, "grad_norm": 0.28549113869667053, "learning_rate": 7.748040000000001e-05, "loss": 0.2774, "step": 306300 }, { "epoch": 0.3728, "grad_norm": 0.2546445429325104, "learning_rate": 7.74404e-05, "loss": 0.2714, "step": 306400 }, { "epoch": 0.373, "grad_norm": 0.23946426808834076, "learning_rate": 7.740040000000001e-05, "loss": 0.271, "step": 306500 }, { "epoch": 0.3732, "grad_norm": 0.4045504927635193, "learning_rate": 7.73604e-05, "loss": 0.3021, "step": 306600 }, { "epoch": 0.3734, "grad_norm": 0.32764193415641785, "learning_rate": 7.73204e-05, "loss": 0.2769, "step": 306700 }, { "epoch": 0.3736, "grad_norm": 0.2573786973953247, "learning_rate": 7.72804e-05, "loss": 0.2901, "step": 306800 }, { "epoch": 0.3738, "grad_norm": 0.34804463386535645, "learning_rate": 7.72404e-05, "loss": 0.2925, "step": 306900 }, { "epoch": 0.374, "grad_norm": 0.3770942986011505, "learning_rate": 
7.720040000000001e-05, "loss": 0.2738, "step": 307000 }, { "epoch": 0.3742, "grad_norm": 0.27824512124061584, "learning_rate": 7.71604e-05, "loss": 0.28, "step": 307100 }, { "epoch": 0.3744, "grad_norm": 0.24019598960876465, "learning_rate": 7.712040000000001e-05, "loss": 0.2983, "step": 307200 }, { "epoch": 0.3746, "grad_norm": 0.283953458070755, "learning_rate": 7.70804e-05, "loss": 0.293, "step": 307300 }, { "epoch": 0.3748, "grad_norm": 0.2683544456958771, "learning_rate": 7.70404e-05, "loss": 0.2729, "step": 307400 }, { "epoch": 0.375, "grad_norm": 0.3089542090892792, "learning_rate": 7.70004e-05, "loss": 0.2916, "step": 307500 }, { "epoch": 0.3752, "grad_norm": 0.5444771647453308, "learning_rate": 7.69604e-05, "loss": 0.2795, "step": 307600 }, { "epoch": 0.3754, "grad_norm": 0.333789587020874, "learning_rate": 7.69204e-05, "loss": 0.2751, "step": 307700 }, { "epoch": 0.3756, "grad_norm": 0.7448511123657227, "learning_rate": 7.688040000000001e-05, "loss": 0.29, "step": 307800 }, { "epoch": 0.3758, "grad_norm": 0.17118731141090393, "learning_rate": 7.68404e-05, "loss": 0.2819, "step": 307900 }, { "epoch": 0.376, "grad_norm": 0.1759895384311676, "learning_rate": 7.680040000000001e-05, "loss": 0.2877, "step": 308000 }, { "epoch": 0.3762, "grad_norm": 0.26918289065361023, "learning_rate": 7.67604e-05, "loss": 0.2738, "step": 308100 }, { "epoch": 0.3764, "grad_norm": 0.2078232765197754, "learning_rate": 7.67204e-05, "loss": 0.2881, "step": 308200 }, { "epoch": 0.3766, "grad_norm": 0.6741946339607239, "learning_rate": 7.66804e-05, "loss": 0.2742, "step": 308300 }, { "epoch": 0.3768, "grad_norm": 0.2831641137599945, "learning_rate": 7.66404e-05, "loss": 0.3325, "step": 308400 }, { "epoch": 0.377, "grad_norm": 0.16423487663269043, "learning_rate": 7.66004e-05, "loss": 0.2887, "step": 308500 }, { "epoch": 0.3772, "grad_norm": 0.2626672089099884, "learning_rate": 7.65604e-05, "loss": 0.2761, "step": 308600 }, { "epoch": 0.3774, "grad_norm": 0.23534747958183289, 
"learning_rate": 7.652040000000001e-05, "loss": 0.2843, "step": 308700 }, { "epoch": 0.3776, "grad_norm": 0.4902610182762146, "learning_rate": 7.648040000000001e-05, "loss": 0.2959, "step": 308800 }, { "epoch": 0.3778, "grad_norm": 0.29607897996902466, "learning_rate": 7.644040000000001e-05, "loss": 0.2678, "step": 308900 }, { "epoch": 0.378, "grad_norm": 0.3284173011779785, "learning_rate": 7.64004e-05, "loss": 0.2764, "step": 309000 }, { "epoch": 0.3782, "grad_norm": 0.24199332296848297, "learning_rate": 7.63604e-05, "loss": 0.2766, "step": 309100 }, { "epoch": 0.3784, "grad_norm": 0.3074169158935547, "learning_rate": 7.63204e-05, "loss": 0.2901, "step": 309200 }, { "epoch": 0.3786, "grad_norm": 0.3491748869419098, "learning_rate": 7.628040000000001e-05, "loss": 0.2846, "step": 309300 }, { "epoch": 0.3788, "grad_norm": 0.46956080198287964, "learning_rate": 7.62404e-05, "loss": 0.2853, "step": 309400 }, { "epoch": 0.379, "grad_norm": 0.2703664004802704, "learning_rate": 7.620040000000001e-05, "loss": 0.3082, "step": 309500 }, { "epoch": 0.3792, "grad_norm": 0.25405412912368774, "learning_rate": 7.61604e-05, "loss": 0.2806, "step": 309600 }, { "epoch": 0.3794, "grad_norm": 0.39644813537597656, "learning_rate": 7.61204e-05, "loss": 0.2921, "step": 309700 }, { "epoch": 0.3796, "grad_norm": 0.16149291396141052, "learning_rate": 7.60804e-05, "loss": 0.2622, "step": 309800 }, { "epoch": 0.3798, "grad_norm": 0.314878910779953, "learning_rate": 7.60404e-05, "loss": 0.2703, "step": 309900 }, { "epoch": 0.38, "grad_norm": 0.2966255247592926, "learning_rate": 7.60004e-05, "loss": 0.2854, "step": 310000 }, { "epoch": 0.3802, "grad_norm": 0.20654204487800598, "learning_rate": 7.59604e-05, "loss": 0.29, "step": 310100 }, { "epoch": 0.3804, "grad_norm": 0.2614297568798065, "learning_rate": 7.59204e-05, "loss": 0.2918, "step": 310200 }, { "epoch": 0.3806, "grad_norm": 0.22196181118488312, "learning_rate": 7.588040000000001e-05, "loss": 0.2773, "step": 310300 }, { "epoch": 0.3808, 
"grad_norm": 0.19726967811584473, "learning_rate": 7.58404e-05, "loss": 0.2817, "step": 310400 }, { "epoch": 0.381, "grad_norm": 0.24383938312530518, "learning_rate": 7.58004e-05, "loss": 0.2837, "step": 310500 }, { "epoch": 0.3812, "grad_norm": 0.21394483745098114, "learning_rate": 7.576040000000001e-05, "loss": 0.2759, "step": 310600 }, { "epoch": 0.3814, "grad_norm": 0.2858792245388031, "learning_rate": 7.57204e-05, "loss": 0.291, "step": 310700 }, { "epoch": 0.3816, "grad_norm": 0.19695499539375305, "learning_rate": 7.568040000000001e-05, "loss": 0.2819, "step": 310800 }, { "epoch": 0.3818, "grad_norm": 0.21151651442050934, "learning_rate": 7.56404e-05, "loss": 0.2702, "step": 310900 }, { "epoch": 0.382, "grad_norm": 0.22047953307628632, "learning_rate": 7.560040000000001e-05, "loss": 0.2866, "step": 311000 }, { "epoch": 0.3822, "grad_norm": 0.5067945718765259, "learning_rate": 7.55604e-05, "loss": 0.2756, "step": 311100 }, { "epoch": 0.3824, "grad_norm": 0.43224334716796875, "learning_rate": 7.55204e-05, "loss": 0.288, "step": 311200 }, { "epoch": 0.3826, "grad_norm": 0.2836945652961731, "learning_rate": 7.54804e-05, "loss": 0.2803, "step": 311300 }, { "epoch": 0.3828, "grad_norm": 0.6460776925086975, "learning_rate": 7.54404e-05, "loss": 0.2686, "step": 311400 }, { "epoch": 0.383, "grad_norm": 0.3450317978858948, "learning_rate": 7.54004e-05, "loss": 0.2747, "step": 311500 }, { "epoch": 0.3832, "grad_norm": 0.2429281622171402, "learning_rate": 7.536040000000001e-05, "loss": 0.2859, "step": 311600 }, { "epoch": 0.3834, "grad_norm": 0.1564161479473114, "learning_rate": 7.53204e-05, "loss": 0.282, "step": 311700 }, { "epoch": 0.3836, "grad_norm": 0.242931067943573, "learning_rate": 7.528040000000001e-05, "loss": 0.2638, "step": 311800 }, { "epoch": 0.3838, "grad_norm": 0.22181962430477142, "learning_rate": 7.52404e-05, "loss": 0.2862, "step": 311900 }, { "epoch": 0.384, "grad_norm": 0.23662132024765015, "learning_rate": 7.52004e-05, "loss": 0.282, "step": 312000 
}, { "epoch": 0.3842, "grad_norm": 0.25929373502731323, "learning_rate": 7.51604e-05, "loss": 0.2827, "step": 312100 }, { "epoch": 0.3844, "grad_norm": 0.3989902436733246, "learning_rate": 7.51204e-05, "loss": 0.2733, "step": 312200 }, { "epoch": 0.3846, "grad_norm": 0.26393795013427734, "learning_rate": 7.508040000000001e-05, "loss": 0.2827, "step": 312300 }, { "epoch": 0.3848, "grad_norm": 0.6005264520645142, "learning_rate": 7.50404e-05, "loss": 0.2917, "step": 312400 }, { "epoch": 0.385, "grad_norm": 0.23418720066547394, "learning_rate": 7.500040000000001e-05, "loss": 0.2657, "step": 312500 }, { "epoch": 0.3852, "grad_norm": 0.1971062570810318, "learning_rate": 7.496040000000002e-05, "loss": 0.2741, "step": 312600 }, { "epoch": 0.3854, "grad_norm": 0.9465832710266113, "learning_rate": 7.49204e-05, "loss": 0.2761, "step": 312700 }, { "epoch": 0.3856, "grad_norm": 0.2490047663450241, "learning_rate": 7.48804e-05, "loss": 0.2843, "step": 312800 }, { "epoch": 0.3858, "grad_norm": 0.26003503799438477, "learning_rate": 7.48404e-05, "loss": 0.2663, "step": 312900 }, { "epoch": 0.386, "grad_norm": 1.0622795820236206, "learning_rate": 7.48004e-05, "loss": 0.2784, "step": 313000 }, { "epoch": 0.3862, "grad_norm": 0.25771743059158325, "learning_rate": 7.476040000000001e-05, "loss": 0.2726, "step": 313100 }, { "epoch": 0.3864, "grad_norm": 0.3834233283996582, "learning_rate": 7.47204e-05, "loss": 0.2598, "step": 313200 }, { "epoch": 0.3866, "grad_norm": 0.19467617571353912, "learning_rate": 7.468040000000001e-05, "loss": 0.2667, "step": 313300 }, { "epoch": 0.3868, "grad_norm": 0.23708763718605042, "learning_rate": 7.46404e-05, "loss": 0.2619, "step": 313400 }, { "epoch": 0.387, "grad_norm": 0.2725387215614319, "learning_rate": 7.46004e-05, "loss": 0.2723, "step": 313500 }, { "epoch": 0.3872, "grad_norm": 0.23830577731132507, "learning_rate": 7.45604e-05, "loss": 0.26, "step": 313600 }, { "epoch": 0.3874, "grad_norm": 0.20995588600635529, "learning_rate": 7.45204e-05, 
"loss": 0.2563, "step": 313700 }, { "epoch": 0.3876, "grad_norm": 0.20928217470645905, "learning_rate": 7.44804e-05, "loss": 0.2839, "step": 313800 }, { "epoch": 0.3878, "grad_norm": 0.21170674264431, "learning_rate": 7.44404e-05, "loss": 0.2693, "step": 313900 }, { "epoch": 0.388, "grad_norm": 0.16965176165103912, "learning_rate": 7.440040000000001e-05, "loss": 0.2739, "step": 314000 }, { "epoch": 0.3882, "grad_norm": 0.32784977555274963, "learning_rate": 7.436040000000001e-05, "loss": 0.2638, "step": 314100 }, { "epoch": 0.3884, "grad_norm": 0.45630523562431335, "learning_rate": 7.432040000000001e-05, "loss": 0.2681, "step": 314200 }, { "epoch": 0.3886, "grad_norm": 0.30572786927223206, "learning_rate": 7.42804e-05, "loss": 0.2632, "step": 314300 }, { "epoch": 0.3888, "grad_norm": 0.21931806206703186, "learning_rate": 7.42404e-05, "loss": 0.2717, "step": 314400 }, { "epoch": 0.389, "grad_norm": 0.28573718667030334, "learning_rate": 7.42004e-05, "loss": 0.2684, "step": 314500 }, { "epoch": 0.3892, "grad_norm": 2.636169672012329, "learning_rate": 7.416040000000001e-05, "loss": 0.2664, "step": 314600 }, { "epoch": 0.3894, "grad_norm": 0.19433589279651642, "learning_rate": 7.41204e-05, "loss": 0.264, "step": 314700 }, { "epoch": 0.3896, "grad_norm": 0.2967517673969269, "learning_rate": 7.408040000000001e-05, "loss": 0.2744, "step": 314800 }, { "epoch": 0.3898, "grad_norm": 0.29649585485458374, "learning_rate": 7.40404e-05, "loss": 0.2656, "step": 314900 }, { "epoch": 0.39, "grad_norm": 0.2209979146718979, "learning_rate": 7.40004e-05, "loss": 0.275, "step": 315000 }, { "epoch": 0.3902, "grad_norm": 0.1720481663942337, "learning_rate": 7.39604e-05, "loss": 0.2581, "step": 315100 }, { "epoch": 0.3904, "grad_norm": 0.2020689994096756, "learning_rate": 7.39204e-05, "loss": 0.2617, "step": 315200 }, { "epoch": 0.3906, "grad_norm": 0.22698251903057098, "learning_rate": 7.38804e-05, "loss": 0.269, "step": 315300 }, { "epoch": 0.3908, "grad_norm": 0.2073138952255249, 
"learning_rate": 7.38404e-05, "loss": 0.2801, "step": 315400 }, { "epoch": 0.391, "grad_norm": 0.4935136139392853, "learning_rate": 7.38004e-05, "loss": 0.2653, "step": 315500 }, { "epoch": 0.3912, "grad_norm": 0.1566108763217926, "learning_rate": 7.376040000000001e-05, "loss": 0.258, "step": 315600 }, { "epoch": 0.3914, "grad_norm": 0.24738989770412445, "learning_rate": 7.37204e-05, "loss": 0.2634, "step": 315700 }, { "epoch": 0.3916, "grad_norm": 0.7126500010490417, "learning_rate": 7.36804e-05, "loss": 0.2653, "step": 315800 }, { "epoch": 0.3918, "grad_norm": 0.23705486953258514, "learning_rate": 7.36404e-05, "loss": 0.2677, "step": 315900 }, { "epoch": 0.392, "grad_norm": 0.3229648470878601, "learning_rate": 7.36004e-05, "loss": 0.2816, "step": 316000 }, { "epoch": 0.3922, "grad_norm": 0.20382308959960938, "learning_rate": 7.356040000000001e-05, "loss": 0.2606, "step": 316100 }, { "epoch": 0.3924, "grad_norm": 0.19805587828159332, "learning_rate": 7.35204e-05, "loss": 0.2673, "step": 316200 }, { "epoch": 0.3926, "grad_norm": 0.24583861231803894, "learning_rate": 7.348040000000001e-05, "loss": 0.273, "step": 316300 }, { "epoch": 0.3928, "grad_norm": 0.2229291796684265, "learning_rate": 7.34404e-05, "loss": 0.2791, "step": 316400 }, { "epoch": 0.393, "grad_norm": 0.21293219923973083, "learning_rate": 7.34004e-05, "loss": 0.2672, "step": 316500 }, { "epoch": 0.3932, "grad_norm": 0.20457857847213745, "learning_rate": 7.33604e-05, "loss": 0.261, "step": 316600 }, { "epoch": 0.3934, "grad_norm": 0.21731828153133392, "learning_rate": 7.33204e-05, "loss": 0.2681, "step": 316700 }, { "epoch": 0.3936, "grad_norm": 0.26696541905403137, "learning_rate": 7.32804e-05, "loss": 0.2708, "step": 316800 }, { "epoch": 0.3938, "grad_norm": 0.21308811008930206, "learning_rate": 7.324040000000001e-05, "loss": 0.2642, "step": 316900 }, { "epoch": 0.394, "grad_norm": 0.2161969393491745, "learning_rate": 7.32004e-05, "loss": 0.2633, "step": 317000 }, { "epoch": 0.3942, "grad_norm": 
0.21060483157634735, "learning_rate": 7.316040000000001e-05, "loss": 0.2669, "step": 317100 }, { "epoch": 0.3944, "grad_norm": 0.2582820653915405, "learning_rate": 7.31204e-05, "loss": 0.2644, "step": 317200 }, { "epoch": 0.3946, "grad_norm": 0.24517767131328583, "learning_rate": 7.30804e-05, "loss": 0.2691, "step": 317300 }, { "epoch": 0.3948, "grad_norm": 0.2194310426712036, "learning_rate": 7.30404e-05, "loss": 0.2653, "step": 317400 }, { "epoch": 0.395, "grad_norm": 0.28793978691101074, "learning_rate": 7.30004e-05, "loss": 0.2643, "step": 317500 }, { "epoch": 0.3952, "grad_norm": 0.6977545022964478, "learning_rate": 7.296040000000001e-05, "loss": 0.2723, "step": 317600 }, { "epoch": 0.3954, "grad_norm": 0.2802855372428894, "learning_rate": 7.29204e-05, "loss": 0.3051, "step": 317700 }, { "epoch": 0.3956, "grad_norm": 0.2557845413684845, "learning_rate": 7.288040000000001e-05, "loss": 0.2639, "step": 317800 }, { "epoch": 0.3958, "grad_norm": 0.3409139811992645, "learning_rate": 7.284040000000001e-05, "loss": 0.279, "step": 317900 }, { "epoch": 0.396, "grad_norm": 0.18252715468406677, "learning_rate": 7.280040000000001e-05, "loss": 0.2664, "step": 318000 }, { "epoch": 0.3962, "grad_norm": 0.2673168182373047, "learning_rate": 7.27604e-05, "loss": 0.2685, "step": 318100 }, { "epoch": 0.3964, "grad_norm": 0.37415772676467896, "learning_rate": 7.27204e-05, "loss": 0.2564, "step": 318200 }, { "epoch": 0.3966, "grad_norm": 0.23668460547924042, "learning_rate": 7.26804e-05, "loss": 0.2792, "step": 318300 }, { "epoch": 0.3968, "grad_norm": 0.4940067231655121, "learning_rate": 7.264040000000001e-05, "loss": 0.2757, "step": 318400 }, { "epoch": 0.397, "grad_norm": 0.18225626647472382, "learning_rate": 7.26004e-05, "loss": 0.2789, "step": 318500 }, { "epoch": 0.3972, "grad_norm": 0.19703717529773712, "learning_rate": 7.256040000000001e-05, "loss": 0.2761, "step": 318600 }, { "epoch": 0.3974, "grad_norm": 0.21940729022026062, "learning_rate": 7.25204e-05, "loss": 0.291, 
"step": 318700 }, { "epoch": 0.3976, "grad_norm": 0.293050080537796, "learning_rate": 7.24804e-05, "loss": 0.2809, "step": 318800 }, { "epoch": 0.3978, "grad_norm": 0.45579320192337036, "learning_rate": 7.24404e-05, "loss": 0.2659, "step": 318900 }, { "epoch": 0.398, "grad_norm": 0.20021362602710724, "learning_rate": 7.24004e-05, "loss": 0.2687, "step": 319000 }, { "epoch": 0.3982, "grad_norm": 0.22239625453948975, "learning_rate": 7.23604e-05, "loss": 0.2789, "step": 319100 }, { "epoch": 0.3984, "grad_norm": 0.2077815979719162, "learning_rate": 7.23204e-05, "loss": 0.2563, "step": 319200 }, { "epoch": 0.3986, "grad_norm": 0.2690514028072357, "learning_rate": 7.228040000000001e-05, "loss": 0.2593, "step": 319300 }, { "epoch": 0.3988, "grad_norm": 0.18208642303943634, "learning_rate": 7.224040000000001e-05, "loss": 0.2709, "step": 319400 }, { "epoch": 0.399, "grad_norm": 0.16601015627384186, "learning_rate": 7.220040000000001e-05, "loss": 0.2738, "step": 319500 }, { "epoch": 0.3992, "grad_norm": 0.18831057846546173, "learning_rate": 7.21604e-05, "loss": 0.2703, "step": 319600 }, { "epoch": 0.3994, "grad_norm": 0.19717268645763397, "learning_rate": 7.21204e-05, "loss": 0.2563, "step": 319700 }, { "epoch": 0.3996, "grad_norm": 0.2667396366596222, "learning_rate": 7.20804e-05, "loss": 0.2578, "step": 319800 }, { "epoch": 0.3998, "grad_norm": 0.2868683934211731, "learning_rate": 7.204040000000001e-05, "loss": 0.2691, "step": 319900 }, { "epoch": 0.4, "grad_norm": 0.21218955516815186, "learning_rate": 7.20004e-05, "loss": 0.2637, "step": 320000 }, { "epoch": 0.4002, "grad_norm": 0.4244702160358429, "learning_rate": 7.196040000000001e-05, "loss": 0.2767, "step": 320100 }, { "epoch": 0.4004, "grad_norm": 0.23811425268650055, "learning_rate": 7.19204e-05, "loss": 0.2627, "step": 320200 }, { "epoch": 0.4006, "grad_norm": 0.20318666100502014, "learning_rate": 7.18804e-05, "loss": 0.2616, "step": 320300 }, { "epoch": 0.4008, "grad_norm": 0.3025484085083008, "learning_rate": 
7.18404e-05, "loss": 0.2685, "step": 320400 }, { "epoch": 0.401, "grad_norm": 0.17960281670093536, "learning_rate": 7.18004e-05, "loss": 0.2733, "step": 320500 }, { "epoch": 0.4012, "grad_norm": 0.25798577070236206, "learning_rate": 7.17604e-05, "loss": 0.264, "step": 320600 }, { "epoch": 0.4014, "grad_norm": 0.1824919730424881, "learning_rate": 7.17204e-05, "loss": 0.2684, "step": 320700 }, { "epoch": 0.4016, "grad_norm": 0.23640932142734528, "learning_rate": 7.16804e-05, "loss": 0.265, "step": 320800 }, { "epoch": 0.4018, "grad_norm": 0.22346696257591248, "learning_rate": 7.164040000000001e-05, "loss": 0.2574, "step": 320900 }, { "epoch": 0.402, "grad_norm": 0.33641451597213745, "learning_rate": 7.16004e-05, "loss": 0.2644, "step": 321000 }, { "epoch": 0.4022, "grad_norm": 0.24027025699615479, "learning_rate": 7.15604e-05, "loss": 0.2803, "step": 321100 }, { "epoch": 0.4024, "grad_norm": 0.2125771939754486, "learning_rate": 7.15204e-05, "loss": 0.2733, "step": 321200 }, { "epoch": 0.4026, "grad_norm": 0.2774077355861664, "learning_rate": 7.14804e-05, "loss": 0.2781, "step": 321300 }, { "epoch": 0.4028, "grad_norm": 0.1831909418106079, "learning_rate": 7.144040000000001e-05, "loss": 0.2661, "step": 321400 }, { "epoch": 0.403, "grad_norm": 0.2540408968925476, "learning_rate": 7.14004e-05, "loss": 0.2801, "step": 321500 }, { "epoch": 0.4032, "grad_norm": 0.2425178438425064, "learning_rate": 7.136040000000001e-05, "loss": 0.2584, "step": 321600 }, { "epoch": 0.4034, "grad_norm": 0.1953432857990265, "learning_rate": 7.13204e-05, "loss": 0.2674, "step": 321700 }, { "epoch": 0.4036, "grad_norm": 0.31084179878234863, "learning_rate": 7.12804e-05, "loss": 0.2599, "step": 321800 }, { "epoch": 0.4038, "grad_norm": 0.17004917562007904, "learning_rate": 7.12404e-05, "loss": 0.272, "step": 321900 }, { "epoch": 0.404, "grad_norm": 0.2186584621667862, "learning_rate": 7.12004e-05, "loss": 0.2615, "step": 322000 }, { "epoch": 0.4042, "grad_norm": 0.2131323516368866, 
"learning_rate": 7.11604e-05, "loss": 0.2734, "step": 322100 }, { "epoch": 0.4044, "grad_norm": 0.29444190859794617, "learning_rate": 7.11204e-05, "loss": 0.2577, "step": 322200 }, { "epoch": 0.4046, "grad_norm": 0.2686418890953064, "learning_rate": 7.10804e-05, "loss": 0.2636, "step": 322300 }, { "epoch": 0.4048, "grad_norm": 0.2958160936832428, "learning_rate": 7.104040000000001e-05, "loss": 0.2611, "step": 322400 }, { "epoch": 0.405, "grad_norm": 0.24784350395202637, "learning_rate": 7.10004e-05, "loss": 0.2658, "step": 322500 }, { "epoch": 0.4052, "grad_norm": 0.21654903888702393, "learning_rate": 7.09604e-05, "loss": 0.2625, "step": 322600 }, { "epoch": 0.4054, "grad_norm": 0.17862387001514435, "learning_rate": 7.092039999999999e-05, "loss": 0.2611, "step": 322700 }, { "epoch": 0.4056, "grad_norm": 0.2753390371799469, "learning_rate": 7.08804e-05, "loss": 0.2626, "step": 322800 }, { "epoch": 0.4058, "grad_norm": 0.41417863965034485, "learning_rate": 7.084040000000001e-05, "loss": 0.262, "step": 322900 }, { "epoch": 0.406, "grad_norm": 0.6197725534439087, "learning_rate": 7.08004e-05, "loss": 0.2647, "step": 323000 }, { "epoch": 0.4062, "grad_norm": 0.31697189807891846, "learning_rate": 7.076040000000001e-05, "loss": 0.2633, "step": 323100 }, { "epoch": 0.4064, "grad_norm": 0.7414258718490601, "learning_rate": 7.072040000000001e-05, "loss": 0.271, "step": 323200 }, { "epoch": 0.4066, "grad_norm": 0.22181887924671173, "learning_rate": 7.068040000000001e-05, "loss": 0.2637, "step": 323300 }, { "epoch": 0.4068, "grad_norm": 0.24176335334777832, "learning_rate": 7.06404e-05, "loss": 0.2641, "step": 323400 }, { "epoch": 0.407, "grad_norm": 0.2450632005929947, "learning_rate": 7.06004e-05, "loss": 0.2634, "step": 323500 }, { "epoch": 0.4072, "grad_norm": 0.1967063993215561, "learning_rate": 7.05604e-05, "loss": 0.273, "step": 323600 }, { "epoch": 0.4074, "grad_norm": 0.16844309866428375, "learning_rate": 7.052040000000001e-05, "loss": 0.2664, "step": 323700 }, { 
"epoch": 0.4076, "grad_norm": 0.24496600031852722, "learning_rate": 7.04804e-05, "loss": 0.2706, "step": 323800 }, { "epoch": 0.4078, "grad_norm": 0.24819956719875336, "learning_rate": 7.044040000000001e-05, "loss": 0.2573, "step": 323900 }, { "epoch": 0.408, "grad_norm": 0.31527596712112427, "learning_rate": 7.04004e-05, "loss": 0.2602, "step": 324000 }, { "epoch": 0.4082, "grad_norm": 0.22740913927555084, "learning_rate": 7.03604e-05, "loss": 0.2614, "step": 324100 }, { "epoch": 0.4084, "grad_norm": 0.2220146805047989, "learning_rate": 7.03204e-05, "loss": 0.2682, "step": 324200 }, { "epoch": 0.4086, "grad_norm": 0.22316837310791016, "learning_rate": 7.02804e-05, "loss": 0.2612, "step": 324300 }, { "epoch": 0.4088, "grad_norm": 0.2431160807609558, "learning_rate": 7.02404e-05, "loss": 0.2753, "step": 324400 }, { "epoch": 0.409, "grad_norm": 0.2228887677192688, "learning_rate": 7.02004e-05, "loss": 0.2612, "step": 324500 }, { "epoch": 0.4092, "grad_norm": 0.25735148787498474, "learning_rate": 7.01604e-05, "loss": 0.2639, "step": 324600 }, { "epoch": 0.4094, "grad_norm": 0.19601066410541534, "learning_rate": 7.012040000000001e-05, "loss": 0.2515, "step": 324700 }, { "epoch": 0.4096, "grad_norm": 0.2028639167547226, "learning_rate": 7.008040000000001e-05, "loss": 0.2704, "step": 324800 }, { "epoch": 0.4098, "grad_norm": 0.18071521818637848, "learning_rate": 7.00404e-05, "loss": 0.2657, "step": 324900 }, { "epoch": 0.41, "grad_norm": 0.22790144383907318, "learning_rate": 7.00004e-05, "loss": 0.2852, "step": 325000 }, { "epoch": 0.4102, "grad_norm": 0.2082364410161972, "learning_rate": 6.99604e-05, "loss": 0.2622, "step": 325100 }, { "epoch": 0.4104, "grad_norm": 0.1667306274175644, "learning_rate": 6.992040000000001e-05, "loss": 0.2555, "step": 325200 }, { "epoch": 0.4106, "grad_norm": 0.1831902712583542, "learning_rate": 6.98804e-05, "loss": 0.2698, "step": 325300 }, { "epoch": 0.4108, "grad_norm": 0.18110473453998566, "learning_rate": 6.984040000000001e-05, "loss": 
0.2496, "step": 325400 }, { "epoch": 0.411, "grad_norm": 0.19504287838935852, "learning_rate": 6.98004e-05, "loss": 0.2556, "step": 325500 }, { "epoch": 0.4112, "grad_norm": 0.16663411259651184, "learning_rate": 6.97604e-05, "loss": 0.2577, "step": 325600 }, { "epoch": 0.4114, "grad_norm": 0.43907126784324646, "learning_rate": 6.97204e-05, "loss": 0.2585, "step": 325700 }, { "epoch": 0.4116, "grad_norm": 0.20249086618423462, "learning_rate": 6.96804e-05, "loss": 0.25, "step": 325800 }, { "epoch": 0.4118, "grad_norm": 0.15998288989067078, "learning_rate": 6.96404e-05, "loss": 0.2482, "step": 325900 }, { "epoch": 0.412, "grad_norm": 0.19670312106609344, "learning_rate": 6.96004e-05, "loss": 0.2445, "step": 326000 }, { "epoch": 0.4122, "grad_norm": 0.18469685316085815, "learning_rate": 6.95604e-05, "loss": 0.2502, "step": 326100 }, { "epoch": 0.4124, "grad_norm": 0.18897442519664764, "learning_rate": 6.952040000000001e-05, "loss": 0.2474, "step": 326200 }, { "epoch": 0.4126, "grad_norm": 0.17963925004005432, "learning_rate": 6.94804e-05, "loss": 0.2457, "step": 326300 }, { "epoch": 0.4128, "grad_norm": 0.19767078757286072, "learning_rate": 6.94404e-05, "loss": 0.2515, "step": 326400 }, { "epoch": 0.413, "grad_norm": 0.20545931160449982, "learning_rate": 6.94004e-05, "loss": 0.2459, "step": 326500 }, { "epoch": 0.4132, "grad_norm": 0.17647592723369598, "learning_rate": 6.93604e-05, "loss": 0.2457, "step": 326600 }, { "epoch": 0.4134, "grad_norm": 0.19674290716648102, "learning_rate": 6.932040000000001e-05, "loss": 0.2469, "step": 326700 }, { "epoch": 0.4136, "grad_norm": 0.25124111771583557, "learning_rate": 6.92804e-05, "loss": 0.2434, "step": 326800 }, { "epoch": 0.4138, "grad_norm": 0.19026201963424683, "learning_rate": 6.924040000000001e-05, "loss": 0.2536, "step": 326900 }, { "epoch": 0.414, "grad_norm": 0.20437213778495789, "learning_rate": 6.92004e-05, "loss": 0.2459, "step": 327000 }, { "epoch": 0.4142, "grad_norm": 0.24830076098442078, "learning_rate": 
6.916040000000001e-05, "loss": 0.2454, "step": 327100 }, { "epoch": 0.4144, "grad_norm": 0.19002242386341095, "learning_rate": 6.91204e-05, "loss": 0.2472, "step": 327200 }, { "epoch": 0.4146, "grad_norm": 0.1964949518442154, "learning_rate": 6.90804e-05, "loss": 0.2435, "step": 327300 }, { "epoch": 0.4148, "grad_norm": 0.20290331542491913, "learning_rate": 6.90404e-05, "loss": 0.2452, "step": 327400 }, { "epoch": 0.415, "grad_norm": 0.17645521461963654, "learning_rate": 6.90004e-05, "loss": 0.2441, "step": 327500 }, { "epoch": 0.4152, "grad_norm": 0.19181664288043976, "learning_rate": 6.89604e-05, "loss": 0.2487, "step": 327600 }, { "epoch": 0.4154, "grad_norm": 0.18161311745643616, "learning_rate": 6.892040000000001e-05, "loss": 0.2446, "step": 327700 }, { "epoch": 0.4156, "grad_norm": 0.16001363098621368, "learning_rate": 6.88804e-05, "loss": 0.2443, "step": 327800 }, { "epoch": 0.4158, "grad_norm": 0.1850643903017044, "learning_rate": 6.88404e-05, "loss": 0.253, "step": 327900 }, { "epoch": 0.416, "grad_norm": 0.1478988528251648, "learning_rate": 6.880039999999999e-05, "loss": 0.2445, "step": 328000 }, { "epoch": 0.4162, "grad_norm": 0.18644775450229645, "learning_rate": 6.87604e-05, "loss": 0.243, "step": 328100 }, { "epoch": 0.4164, "grad_norm": 0.18201524019241333, "learning_rate": 6.872040000000001e-05, "loss": 0.2423, "step": 328200 }, { "epoch": 0.4166, "grad_norm": 0.25159624218940735, "learning_rate": 6.86804e-05, "loss": 0.2461, "step": 328300 }, { "epoch": 0.4168, "grad_norm": 0.21256084740161896, "learning_rate": 6.864040000000001e-05, "loss": 0.2443, "step": 328400 }, { "epoch": 0.417, "grad_norm": 0.27206477522850037, "learning_rate": 6.86004e-05, "loss": 0.2421, "step": 328500 }, { "epoch": 0.4172, "grad_norm": 0.1862238049507141, "learning_rate": 6.856040000000001e-05, "loss": 0.253, "step": 328600 }, { "epoch": 0.4174, "grad_norm": 0.1940353810787201, "learning_rate": 6.85204e-05, "loss": 0.2454, "step": 328700 }, { "epoch": 0.4176, "grad_norm": 
0.14966708421707153, "learning_rate": 6.84804e-05, "loss": 0.2471, "step": 328800 }, { "epoch": 0.4178, "grad_norm": 0.15949209034442902, "learning_rate": 6.84404e-05, "loss": 0.2497, "step": 328900 }, { "epoch": 0.418, "grad_norm": 0.198212668299675, "learning_rate": 6.84004e-05, "loss": 0.2496, "step": 329000 }, { "epoch": 0.4182, "grad_norm": 0.22681452333927155, "learning_rate": 6.83604e-05, "loss": 0.2464, "step": 329100 }, { "epoch": 0.4184, "grad_norm": 0.17633529007434845, "learning_rate": 6.832040000000001e-05, "loss": 0.2519, "step": 329200 }, { "epoch": 0.4186, "grad_norm": 0.2030135840177536, "learning_rate": 6.82804e-05, "loss": 0.2462, "step": 329300 }, { "epoch": 0.4188, "grad_norm": 0.16612933576107025, "learning_rate": 6.82404e-05, "loss": 0.2453, "step": 329400 }, { "epoch": 0.419, "grad_norm": 0.2159220576286316, "learning_rate": 6.82004e-05, "loss": 0.2478, "step": 329500 }, { "epoch": 0.4192, "grad_norm": 0.24220001697540283, "learning_rate": 6.81604e-05, "loss": 0.2477, "step": 329600 }, { "epoch": 0.4194, "grad_norm": 0.1778249442577362, "learning_rate": 6.81204e-05, "loss": 0.2516, "step": 329700 }, { "epoch": 0.4196, "grad_norm": 0.2725713551044464, "learning_rate": 6.80804e-05, "loss": 0.2514, "step": 329800 }, { "epoch": 0.4198, "grad_norm": 0.18258759379386902, "learning_rate": 6.80404e-05, "loss": 0.2479, "step": 329900 }, { "epoch": 0.42, "grad_norm": 0.24233517050743103, "learning_rate": 6.800040000000001e-05, "loss": 0.2501, "step": 330000 }, { "epoch": 0.4202, "grad_norm": 0.16992133855819702, "learning_rate": 6.796040000000001e-05, "loss": 0.2422, "step": 330100 }, { "epoch": 0.4204, "grad_norm": 0.19478961825370789, "learning_rate": 6.79204e-05, "loss": 0.2504, "step": 330200 }, { "epoch": 0.4206, "grad_norm": 0.3001626133918762, "learning_rate": 6.78804e-05, "loss": 0.2555, "step": 330300 }, { "epoch": 0.4208, "grad_norm": 0.15378372371196747, "learning_rate": 6.78404e-05, "loss": 0.2442, "step": 330400 }, { "epoch": 0.421, 
"grad_norm": 0.17064020037651062, "learning_rate": 6.780040000000001e-05, "loss": 0.2448, "step": 330500 }, { "epoch": 0.4212, "grad_norm": 0.15659099817276, "learning_rate": 6.77604e-05, "loss": 0.2453, "step": 330600 }, { "epoch": 0.4214, "grad_norm": 0.274606317281723, "learning_rate": 6.772040000000001e-05, "loss": 0.2458, "step": 330700 }, { "epoch": 0.4216, "grad_norm": 0.19441047310829163, "learning_rate": 6.76804e-05, "loss": 0.2455, "step": 330800 }, { "epoch": 0.4218, "grad_norm": 0.16700878739356995, "learning_rate": 6.76404e-05, "loss": 0.2433, "step": 330900 }, { "epoch": 0.422, "grad_norm": 0.15905675292015076, "learning_rate": 6.76004e-05, "loss": 0.2447, "step": 331000 }, { "epoch": 0.4222, "grad_norm": 0.2215052992105484, "learning_rate": 6.75604e-05, "loss": 0.2492, "step": 331100 }, { "epoch": 0.4224, "grad_norm": 0.17576748132705688, "learning_rate": 6.75204e-05, "loss": 0.2476, "step": 331200 }, { "epoch": 0.4226, "grad_norm": 0.17451365292072296, "learning_rate": 6.74804e-05, "loss": 0.2487, "step": 331300 }, { "epoch": 0.4228, "grad_norm": 0.5440964102745056, "learning_rate": 6.74404e-05, "loss": 0.2479, "step": 331400 }, { "epoch": 0.423, "grad_norm": 0.19704194366931915, "learning_rate": 6.740040000000001e-05, "loss": 0.247, "step": 331500 }, { "epoch": 0.4232, "grad_norm": 0.20427988469600677, "learning_rate": 6.73604e-05, "loss": 0.2481, "step": 331600 }, { "epoch": 0.4234, "grad_norm": 0.16605882346630096, "learning_rate": 6.73204e-05, "loss": 0.2466, "step": 331700 }, { "epoch": 0.4236, "grad_norm": 0.22239907085895538, "learning_rate": 6.72804e-05, "loss": 0.243, "step": 331800 }, { "epoch": 0.4238, "grad_norm": 0.17800769209861755, "learning_rate": 6.72404e-05, "loss": 0.2489, "step": 331900 }, { "epoch": 0.424, "grad_norm": 0.2153862714767456, "learning_rate": 6.720040000000001e-05, "loss": 0.2489, "step": 332000 }, { "epoch": 0.4242, "grad_norm": 0.3256576359272003, "learning_rate": 6.71604e-05, "loss": 0.242, "step": 332100 }, { 
"epoch": 0.4244, "grad_norm": 0.20492444932460785, "learning_rate": 6.712040000000001e-05, "loss": 0.242, "step": 332200 }, { "epoch": 0.4246, "grad_norm": 0.276932954788208, "learning_rate": 6.70804e-05, "loss": 0.2524, "step": 332300 }, { "epoch": 0.4248, "grad_norm": 0.17257800698280334, "learning_rate": 6.704040000000001e-05, "loss": 0.2448, "step": 332400 }, { "epoch": 0.425, "grad_norm": 0.1662578284740448, "learning_rate": 6.70004e-05, "loss": 0.2423, "step": 332500 }, { "epoch": 0.4252, "grad_norm": 0.17013122141361237, "learning_rate": 6.69604e-05, "loss": 0.2418, "step": 332600 }, { "epoch": 0.4254, "grad_norm": 0.18194365501403809, "learning_rate": 6.69204e-05, "loss": 0.2484, "step": 332700 }, { "epoch": 0.4256, "grad_norm": 0.17675453424453735, "learning_rate": 6.68804e-05, "loss": 0.2432, "step": 332800 }, { "epoch": 0.4258, "grad_norm": 0.16768479347229004, "learning_rate": 6.68404e-05, "loss": 0.2446, "step": 332900 }, { "epoch": 0.426, "grad_norm": 0.28635919094085693, "learning_rate": 6.680040000000001e-05, "loss": 0.2471, "step": 333000 }, { "epoch": 0.4262, "grad_norm": 0.26237931847572327, "learning_rate": 6.67604e-05, "loss": 0.2461, "step": 333100 }, { "epoch": 0.4264, "grad_norm": 0.15941229462623596, "learning_rate": 6.67204e-05, "loss": 0.2517, "step": 333200 }, { "epoch": 0.4266, "grad_norm": 0.20639023184776306, "learning_rate": 6.668039999999999e-05, "loss": 0.2439, "step": 333300 }, { "epoch": 0.4268, "grad_norm": 0.1907823532819748, "learning_rate": 6.66404e-05, "loss": 0.2441, "step": 333400 }, { "epoch": 0.427, "grad_norm": 0.4512256979942322, "learning_rate": 6.660040000000001e-05, "loss": 0.2555, "step": 333500 }, { "epoch": 0.4272, "grad_norm": 0.2021334320306778, "learning_rate": 6.65604e-05, "loss": 0.2445, "step": 333600 }, { "epoch": 0.4274, "grad_norm": 0.2123294472694397, "learning_rate": 6.652040000000001e-05, "loss": 0.2421, "step": 333700 }, { "epoch": 0.4276, "grad_norm": 0.2353043407201767, "learning_rate": 
6.64804e-05, "loss": 0.246, "step": 333800 }, { "epoch": 0.4278, "grad_norm": 0.18522989749908447, "learning_rate": 6.644040000000001e-05, "loss": 0.2466, "step": 333900 }, { "epoch": 0.428, "grad_norm": 0.29194357991218567, "learning_rate": 6.64004e-05, "loss": 0.2472, "step": 334000 }, { "epoch": 0.4282, "grad_norm": 0.21177667379379272, "learning_rate": 6.63604e-05, "loss": 0.2451, "step": 334100 }, { "epoch": 0.4284, "grad_norm": 0.2588765323162079, "learning_rate": 6.63204e-05, "loss": 0.2426, "step": 334200 }, { "epoch": 0.4286, "grad_norm": 0.16683948040008545, "learning_rate": 6.62804e-05, "loss": 0.2385, "step": 334300 }, { "epoch": 0.4288, "grad_norm": 0.2573770582675934, "learning_rate": 6.62404e-05, "loss": 0.2441, "step": 334400 }, { "epoch": 0.429, "grad_norm": 0.24478641152381897, "learning_rate": 6.620040000000001e-05, "loss": 0.2426, "step": 334500 }, { "epoch": 0.4292, "grad_norm": 0.2206483632326126, "learning_rate": 6.61604e-05, "loss": 0.2465, "step": 334600 }, { "epoch": 0.4294, "grad_norm": 0.16202546656131744, "learning_rate": 6.61204e-05, "loss": 0.2448, "step": 334700 }, { "epoch": 0.4296, "grad_norm": 0.29942259192466736, "learning_rate": 6.608039999999999e-05, "loss": 0.2507, "step": 334800 }, { "epoch": 0.4298, "grad_norm": 0.15078793466091156, "learning_rate": 6.60404e-05, "loss": 0.2448, "step": 334900 }, { "epoch": 0.43, "grad_norm": 0.19433638453483582, "learning_rate": 6.60004e-05, "loss": 0.2399, "step": 335000 }, { "epoch": 0.4302, "grad_norm": 0.19410523772239685, "learning_rate": 6.59604e-05, "loss": 0.2456, "step": 335100 }, { "epoch": 0.4304, "grad_norm": 0.2051495611667633, "learning_rate": 6.59204e-05, "loss": 0.245, "step": 335200 }, { "epoch": 0.4306, "grad_norm": 0.1647467315196991, "learning_rate": 6.58804e-05, "loss": 0.2444, "step": 335300 }, { "epoch": 0.4308, "grad_norm": 0.25535687804222107, "learning_rate": 6.584040000000001e-05, "loss": 0.2452, "step": 335400 }, { "epoch": 0.431, "grad_norm": 0.16099178791046143, 
"learning_rate": 6.58004e-05, "loss": 0.2415, "step": 335500 }, { "epoch": 0.4312, "grad_norm": 0.15489977598190308, "learning_rate": 6.57604e-05, "loss": 0.2452, "step": 335600 }, { "epoch": 0.4314, "grad_norm": 0.18465201556682587, "learning_rate": 6.57204e-05, "loss": 0.2441, "step": 335700 }, { "epoch": 0.4316, "grad_norm": 0.24972856044769287, "learning_rate": 6.568040000000001e-05, "loss": 0.2441, "step": 335800 }, { "epoch": 0.4318, "grad_norm": 0.1736900806427002, "learning_rate": 6.56404e-05, "loss": 0.2452, "step": 335900 }, { "epoch": 0.432, "grad_norm": 0.18592572212219238, "learning_rate": 6.560040000000001e-05, "loss": 0.2397, "step": 336000 }, { "epoch": 0.4322, "grad_norm": 0.1998361349105835, "learning_rate": 6.55604e-05, "loss": 0.2399, "step": 336100 }, { "epoch": 0.4324, "grad_norm": 0.25269144773483276, "learning_rate": 6.552040000000001e-05, "loss": 0.2429, "step": 336200 }, { "epoch": 0.4326, "grad_norm": 0.2118110954761505, "learning_rate": 6.54804e-05, "loss": 0.2397, "step": 336300 }, { "epoch": 0.4328, "grad_norm": 0.20308533310890198, "learning_rate": 6.54404e-05, "loss": 0.248, "step": 336400 }, { "epoch": 0.433, "grad_norm": 0.18508122861385345, "learning_rate": 6.54004e-05, "loss": 0.2446, "step": 336500 }, { "epoch": 0.4332, "grad_norm": 0.1768845021724701, "learning_rate": 6.53604e-05, "loss": 0.2472, "step": 336600 }, { "epoch": 0.4334, "grad_norm": 0.16436360776424408, "learning_rate": 6.53204e-05, "loss": 0.2479, "step": 336700 }, { "epoch": 0.4336, "grad_norm": 0.20555828511714935, "learning_rate": 6.528040000000001e-05, "loss": 0.2475, "step": 336800 }, { "epoch": 0.4338, "grad_norm": 0.2154914289712906, "learning_rate": 6.52404e-05, "loss": 0.2488, "step": 336900 }, { "epoch": 0.434, "grad_norm": 0.18624047935009003, "learning_rate": 6.52004e-05, "loss": 0.2405, "step": 337000 }, { "epoch": 0.4342, "grad_norm": 0.19469185173511505, "learning_rate": 6.51604e-05, "loss": 0.2421, "step": 337100 }, { "epoch": 0.4344, "grad_norm": 
0.22191110253334045, "learning_rate": 6.51204e-05, "loss": 0.2495, "step": 337200 }, { "epoch": 0.4346, "grad_norm": 0.15489134192466736, "learning_rate": 6.508040000000001e-05, "loss": 0.2424, "step": 337300 }, { "epoch": 0.4348, "grad_norm": 0.17435604333877563, "learning_rate": 6.50404e-05, "loss": 0.2435, "step": 337400 }, { "epoch": 0.435, "grad_norm": 0.2414373904466629, "learning_rate": 6.500040000000001e-05, "loss": 0.2406, "step": 337500 }, { "epoch": 0.4352, "grad_norm": 0.15996411442756653, "learning_rate": 6.49604e-05, "loss": 0.2411, "step": 337600 }, { "epoch": 0.4354, "grad_norm": 0.20958919823169708, "learning_rate": 6.492040000000001e-05, "loss": 0.2454, "step": 337700 }, { "epoch": 0.4356, "grad_norm": 0.20292189717292786, "learning_rate": 6.48804e-05, "loss": 0.2455, "step": 337800 }, { "epoch": 0.4358, "grad_norm": 0.15378180146217346, "learning_rate": 6.48404e-05, "loss": 0.2476, "step": 337900 }, { "epoch": 0.436, "grad_norm": 0.20952782034873962, "learning_rate": 6.48004e-05, "loss": 0.2502, "step": 338000 }, { "epoch": 0.4362, "grad_norm": 0.17655488848686218, "learning_rate": 6.47604e-05, "loss": 0.243, "step": 338100 }, { "epoch": 0.4364, "grad_norm": 0.18256178498268127, "learning_rate": 6.47204e-05, "loss": 0.2421, "step": 338200 }, { "epoch": 0.4366, "grad_norm": 0.23415111005306244, "learning_rate": 6.468040000000001e-05, "loss": 0.2425, "step": 338300 }, { "epoch": 0.4368, "grad_norm": 0.1419600397348404, "learning_rate": 6.46404e-05, "loss": 0.247, "step": 338400 }, { "epoch": 0.437, "grad_norm": 0.18709716200828552, "learning_rate": 6.46004e-05, "loss": 0.2456, "step": 338500 }, { "epoch": 0.4372, "grad_norm": 0.1568467766046524, "learning_rate": 6.456039999999999e-05, "loss": 0.2381, "step": 338600 }, { "epoch": 0.4374, "grad_norm": 0.17499247193336487, "learning_rate": 6.45204e-05, "loss": 0.2415, "step": 338700 }, { "epoch": 0.4376, "grad_norm": 0.23707623779773712, "learning_rate": 6.448040000000001e-05, "loss": 0.2548, "step": 
338800 }, { "epoch": 0.4378, "grad_norm": 0.3676884174346924, "learning_rate": 6.44404e-05, "loss": 0.2466, "step": 338900 }, { "epoch": 0.438, "grad_norm": 0.21501903235912323, "learning_rate": 6.440040000000001e-05, "loss": 0.2501, "step": 339000 }, { "epoch": 0.4382, "grad_norm": 0.19455668330192566, "learning_rate": 6.43604e-05, "loss": 0.2418, "step": 339100 }, { "epoch": 0.4384, "grad_norm": 0.24228647351264954, "learning_rate": 6.432040000000001e-05, "loss": 0.2399, "step": 339200 }, { "epoch": 0.4386, "grad_norm": 0.16758756339550018, "learning_rate": 6.42804e-05, "loss": 0.2457, "step": 339300 }, { "epoch": 0.4388, "grad_norm": 0.24889026582241058, "learning_rate": 6.42404e-05, "loss": 0.2414, "step": 339400 }, { "epoch": 0.439, "grad_norm": 0.2294233739376068, "learning_rate": 6.42004e-05, "loss": 0.2416, "step": 339500 }, { "epoch": 0.4392, "grad_norm": 0.24796201288700104, "learning_rate": 6.41604e-05, "loss": 0.2407, "step": 339600 }, { "epoch": 0.4394, "grad_norm": 0.16139519214630127, "learning_rate": 6.41204e-05, "loss": 0.2453, "step": 339700 }, { "epoch": 0.4396, "grad_norm": 0.1596374362707138, "learning_rate": 6.408040000000001e-05, "loss": 0.2394, "step": 339800 }, { "epoch": 0.4398, "grad_norm": 0.18892623484134674, "learning_rate": 6.40404e-05, "loss": 0.2465, "step": 339900 }, { "epoch": 0.44, "grad_norm": 0.16438473761081696, "learning_rate": 6.400040000000001e-05, "loss": 0.2437, "step": 340000 }, { "epoch": 0.4402, "grad_norm": 0.2160944789648056, "learning_rate": 6.396039999999999e-05, "loss": 0.2451, "step": 340100 }, { "epoch": 0.4404, "grad_norm": 0.1667250096797943, "learning_rate": 6.39204e-05, "loss": 0.245, "step": 340200 }, { "epoch": 0.4406, "grad_norm": 0.15988707542419434, "learning_rate": 6.38804e-05, "loss": 0.2431, "step": 340300 }, { "epoch": 0.4408, "grad_norm": 0.1956913322210312, "learning_rate": 6.38404e-05, "loss": 0.2461, "step": 340400 }, { "epoch": 0.441, "grad_norm": 0.3029384911060333, "learning_rate": 
6.38004e-05, "loss": 0.2427, "step": 340500 }, { "epoch": 0.4412, "grad_norm": 0.1784927248954773, "learning_rate": 6.37604e-05, "loss": 0.2403, "step": 340600 }, { "epoch": 0.4414, "grad_norm": 0.18993079662322998, "learning_rate": 6.372040000000001e-05, "loss": 0.2453, "step": 340700 }, { "epoch": 0.4416, "grad_norm": 0.17982354760169983, "learning_rate": 6.36804e-05, "loss": 0.2386, "step": 340800 }, { "epoch": 0.4418, "grad_norm": 0.15973594784736633, "learning_rate": 6.36404e-05, "loss": 0.2412, "step": 340900 }, { "epoch": 0.442, "grad_norm": 0.2465885430574417, "learning_rate": 6.36004e-05, "loss": 0.2431, "step": 341000 }, { "epoch": 0.4422, "grad_norm": 0.2246682345867157, "learning_rate": 6.35604e-05, "loss": 0.2437, "step": 341100 }, { "epoch": 0.4424, "grad_norm": 0.2625156342983246, "learning_rate": 6.35204e-05, "loss": 0.2448, "step": 341200 }, { "epoch": 0.4426, "grad_norm": 0.14324608445167542, "learning_rate": 6.348040000000001e-05, "loss": 0.2398, "step": 341300 }, { "epoch": 0.4428, "grad_norm": 0.1656043529510498, "learning_rate": 6.34404e-05, "loss": 0.2607, "step": 341400 }, { "epoch": 0.443, "grad_norm": 0.2322763055562973, "learning_rate": 6.340040000000001e-05, "loss": 0.2432, "step": 341500 }, { "epoch": 0.4432, "grad_norm": 0.2581785321235657, "learning_rate": 6.33604e-05, "loss": 0.2425, "step": 341600 }, { "epoch": 0.4434, "grad_norm": 0.18727286159992218, "learning_rate": 6.33204e-05, "loss": 0.2418, "step": 341700 }, { "epoch": 0.4436, "grad_norm": 0.2105150818824768, "learning_rate": 6.32804e-05, "loss": 0.2397, "step": 341800 }, { "epoch": 0.4438, "grad_norm": 0.17236988246440887, "learning_rate": 6.32404e-05, "loss": 0.2437, "step": 341900 }, { "epoch": 0.444, "grad_norm": 0.28422054648399353, "learning_rate": 6.32004e-05, "loss": 0.2487, "step": 342000 }, { "epoch": 0.4442, "grad_norm": 0.1856316179037094, "learning_rate": 6.316040000000001e-05, "loss": 0.2402, "step": 342100 }, { "epoch": 0.4444, "grad_norm": 0.17778094112873077, 
"learning_rate": 6.31204e-05, "loss": 0.2501, "step": 342200 }, { "epoch": 0.4446, "grad_norm": 0.2376558780670166, "learning_rate": 6.30804e-05, "loss": 0.2442, "step": 342300 }, { "epoch": 0.4448, "grad_norm": 0.20546750724315643, "learning_rate": 6.30404e-05, "loss": 0.2476, "step": 342400 }, { "epoch": 0.445, "grad_norm": 0.17325349152088165, "learning_rate": 6.30004e-05, "loss": 0.2413, "step": 342500 }, { "epoch": 0.4452, "grad_norm": 0.23911508917808533, "learning_rate": 6.296040000000001e-05, "loss": 0.2446, "step": 342600 }, { "epoch": 0.4454, "grad_norm": 0.22123664617538452, "learning_rate": 6.29204e-05, "loss": 0.2441, "step": 342700 }, { "epoch": 0.4456, "grad_norm": 0.21504764258861542, "learning_rate": 6.288040000000001e-05, "loss": 0.2502, "step": 342800 }, { "epoch": 0.4458, "grad_norm": 0.16267873346805573, "learning_rate": 6.28404e-05, "loss": 0.243, "step": 342900 }, { "epoch": 0.446, "grad_norm": 0.19336003065109253, "learning_rate": 6.280040000000001e-05, "loss": 0.2497, "step": 343000 }, { "epoch": 0.4462, "grad_norm": 0.18519088625907898, "learning_rate": 6.27604e-05, "loss": 0.2469, "step": 343100 }, { "epoch": 0.4464, "grad_norm": 0.18827581405639648, "learning_rate": 6.27204e-05, "loss": 0.2425, "step": 343200 }, { "epoch": 0.4466, "grad_norm": 0.24695651233196259, "learning_rate": 6.26804e-05, "loss": 0.2406, "step": 343300 }, { "epoch": 0.4468, "grad_norm": 0.22703491151332855, "learning_rate": 6.26404e-05, "loss": 0.2466, "step": 343400 }, { "epoch": 0.447, "grad_norm": 0.16678479313850403, "learning_rate": 6.26004e-05, "loss": 0.2431, "step": 343500 }, { "epoch": 0.4472, "grad_norm": 0.16149210929870605, "learning_rate": 6.256040000000001e-05, "loss": 0.2447, "step": 343600 }, { "epoch": 0.4474, "grad_norm": 0.21965859830379486, "learning_rate": 6.25204e-05, "loss": 0.243, "step": 343700 }, { "epoch": 0.4476, "grad_norm": 0.21483761072158813, "learning_rate": 6.24804e-05, "loss": 0.2437, "step": 343800 }, { "epoch": 0.4478, 
"grad_norm": 0.1887097805738449, "learning_rate": 6.244039999999999e-05, "loss": 0.2383, "step": 343900 }, { "epoch": 0.448, "grad_norm": 0.23063704371452332, "learning_rate": 6.24004e-05, "loss": 0.2469, "step": 344000 }, { "epoch": 0.4482, "grad_norm": 0.18795616924762726, "learning_rate": 6.236040000000001e-05, "loss": 0.2427, "step": 344100 }, { "epoch": 0.4484, "grad_norm": 0.26034119725227356, "learning_rate": 6.23204e-05, "loss": 0.2457, "step": 344200 }, { "epoch": 0.4486, "grad_norm": 0.16719482839107513, "learning_rate": 6.228040000000001e-05, "loss": 0.2446, "step": 344300 }, { "epoch": 0.4488, "grad_norm": 0.2299802452325821, "learning_rate": 6.22404e-05, "loss": 0.2423, "step": 344400 }, { "epoch": 0.449, "grad_norm": 0.15124650299549103, "learning_rate": 6.220040000000001e-05, "loss": 0.249, "step": 344500 }, { "epoch": 0.4492, "grad_norm": 0.27996626496315, "learning_rate": 6.21604e-05, "loss": 0.2646, "step": 344600 }, { "epoch": 0.4494, "grad_norm": 0.18617983162403107, "learning_rate": 6.21204e-05, "loss": 0.2458, "step": 344700 }, { "epoch": 0.4496, "grad_norm": 0.1682778298854828, "learning_rate": 6.20804e-05, "loss": 0.2454, "step": 344800 }, { "epoch": 0.4498, "grad_norm": 0.212728813290596, "learning_rate": 6.20404e-05, "loss": 0.2453, "step": 344900 }, { "epoch": 0.45, "grad_norm": 0.2137337177991867, "learning_rate": 6.20004e-05, "loss": 0.2421, "step": 345000 }, { "epoch": 0.4502, "grad_norm": 0.18748146295547485, "learning_rate": 6.196040000000001e-05, "loss": 0.2427, "step": 345100 }, { "epoch": 0.4504, "grad_norm": 0.19029875099658966, "learning_rate": 6.19204e-05, "loss": 0.2437, "step": 345200 }, { "epoch": 0.4506, "grad_norm": 0.380691260099411, "learning_rate": 6.188040000000001e-05, "loss": 0.2418, "step": 345300 }, { "epoch": 0.4508, "grad_norm": 0.1864905059337616, "learning_rate": 6.18404e-05, "loss": 0.2442, "step": 345400 }, { "epoch": 0.451, "grad_norm": 0.1725115031003952, "learning_rate": 6.18004e-05, "loss": 0.2473, 
"step": 345500 }, { "epoch": 0.4512, "grad_norm": 0.2126380056142807, "learning_rate": 6.17604e-05, "loss": 0.246, "step": 345600 }, { "epoch": 0.4514, "grad_norm": 0.16333705186843872, "learning_rate": 6.17204e-05, "loss": 0.2432, "step": 345700 }, { "epoch": 0.4516, "grad_norm": 0.20585864782333374, "learning_rate": 6.16804e-05, "loss": 0.2407, "step": 345800 }, { "epoch": 0.4518, "grad_norm": 0.1900789737701416, "learning_rate": 6.16404e-05, "loss": 0.2418, "step": 345900 }, { "epoch": 0.452, "grad_norm": 0.1498749703168869, "learning_rate": 6.160040000000001e-05, "loss": 0.2478, "step": 346000 }, { "epoch": 0.4522, "grad_norm": 0.15842682123184204, "learning_rate": 6.15604e-05, "loss": 0.2416, "step": 346100 }, { "epoch": 0.4524, "grad_norm": 0.24384468793869019, "learning_rate": 6.15204e-05, "loss": 0.2416, "step": 346200 }, { "epoch": 0.4526, "grad_norm": 0.16510146856307983, "learning_rate": 6.14804e-05, "loss": 0.2414, "step": 346300 }, { "epoch": 0.4528, "grad_norm": 0.21207810938358307, "learning_rate": 6.14404e-05, "loss": 0.2439, "step": 346400 }, { "epoch": 0.453, "grad_norm": 0.1559634655714035, "learning_rate": 6.14004e-05, "loss": 0.2393, "step": 346500 }, { "epoch": 0.4532, "grad_norm": 0.16685090959072113, "learning_rate": 6.136040000000001e-05, "loss": 0.2437, "step": 346600 }, { "epoch": 0.4534, "grad_norm": 0.1927775889635086, "learning_rate": 6.13204e-05, "loss": 0.2431, "step": 346700 }, { "epoch": 0.4536, "grad_norm": 0.15297745168209076, "learning_rate": 6.128040000000001e-05, "loss": 0.2469, "step": 346800 }, { "epoch": 0.4538, "grad_norm": 0.20579373836517334, "learning_rate": 6.12404e-05, "loss": 0.2411, "step": 346900 }, { "epoch": 0.454, "grad_norm": 0.1857280731201172, "learning_rate": 6.12004e-05, "loss": 0.2392, "step": 347000 }, { "epoch": 0.4542, "grad_norm": 0.3481803238391876, "learning_rate": 6.11604e-05, "loss": 0.2442, "step": 347100 }, { "epoch": 0.4544, "grad_norm": 0.18435798585414886, "learning_rate": 6.11204e-05, "loss": 
0.2433, "step": 347200 }, { "epoch": 0.4546, "grad_norm": 0.18237179517745972, "learning_rate": 6.10804e-05, "loss": 0.2436, "step": 347300 }, { "epoch": 0.4548, "grad_norm": 0.16900283098220825, "learning_rate": 6.10404e-05, "loss": 0.242, "step": 347400 }, { "epoch": 0.455, "grad_norm": 0.19602380692958832, "learning_rate": 6.10004e-05, "loss": 0.2421, "step": 347500 }, { "epoch": 0.4552, "grad_norm": 0.2622341513633728, "learning_rate": 6.096040000000001e-05, "loss": 0.2415, "step": 347600 }, { "epoch": 0.4554, "grad_norm": 0.3507973253726959, "learning_rate": 6.09204e-05, "loss": 0.2405, "step": 347700 }, { "epoch": 0.4556, "grad_norm": 0.2798367738723755, "learning_rate": 6.08804e-05, "loss": 0.2578, "step": 347800 }, { "epoch": 0.4558, "grad_norm": 0.1859496533870697, "learning_rate": 6.0840399999999994e-05, "loss": 0.2413, "step": 347900 }, { "epoch": 0.456, "grad_norm": 0.1584465652704239, "learning_rate": 6.08004e-05, "loss": 0.2389, "step": 348000 }, { "epoch": 0.4562, "grad_norm": 0.18819481134414673, "learning_rate": 6.076040000000001e-05, "loss": 0.2461, "step": 348100 }, { "epoch": 0.4564, "grad_norm": 0.16603697836399078, "learning_rate": 6.07204e-05, "loss": 0.2412, "step": 348200 }, { "epoch": 0.4566, "grad_norm": 0.17016661167144775, "learning_rate": 6.06804e-05, "loss": 0.2414, "step": 348300 }, { "epoch": 0.4568, "grad_norm": 0.21648702025413513, "learning_rate": 6.064040000000001e-05, "loss": 0.2391, "step": 348400 }, { "epoch": 0.457, "grad_norm": 0.1657271832227707, "learning_rate": 6.0600400000000003e-05, "loss": 0.2437, "step": 348500 }, { "epoch": 0.4572, "grad_norm": 0.19239091873168945, "learning_rate": 6.0560400000000004e-05, "loss": 0.2363, "step": 348600 }, { "epoch": 0.4574, "grad_norm": 0.2200000286102295, "learning_rate": 6.05204e-05, "loss": 0.2412, "step": 348700 }, { "epoch": 0.4576, "grad_norm": 0.18818236887454987, "learning_rate": 6.0480400000000005e-05, "loss": 0.2421, "step": 348800 }, { "epoch": 0.4578, "grad_norm": 
0.18705151975154877, "learning_rate": 6.0440400000000005e-05, "loss": 0.2396, "step": 348900 }, { "epoch": 0.458, "grad_norm": 0.19457854330539703, "learning_rate": 6.04004e-05, "loss": 0.2392, "step": 349000 }, { "epoch": 0.4582, "grad_norm": 0.1518254578113556, "learning_rate": 6.0360400000000006e-05, "loss": 0.2497, "step": 349100 }, { "epoch": 0.4584, "grad_norm": 0.21858882904052734, "learning_rate": 6.03204e-05, "loss": 0.2376, "step": 349200 }, { "epoch": 0.4586, "grad_norm": 0.24455934762954712, "learning_rate": 6.0280400000000006e-05, "loss": 0.2364, "step": 349300 }, { "epoch": 0.4588, "grad_norm": 0.26377052068710327, "learning_rate": 6.024040000000001e-05, "loss": 0.2425, "step": 349400 }, { "epoch": 0.459, "grad_norm": 0.2012515366077423, "learning_rate": 6.02004e-05, "loss": 0.2395, "step": 349500 }, { "epoch": 0.4592, "grad_norm": 0.1855122596025467, "learning_rate": 6.016040000000001e-05, "loss": 0.2405, "step": 349600 }, { "epoch": 0.4594, "grad_norm": 0.17877699434757233, "learning_rate": 6.01204e-05, "loss": 0.2417, "step": 349700 }, { "epoch": 0.4596, "grad_norm": 0.20735540986061096, "learning_rate": 6.00804e-05, "loss": 0.244, "step": 349800 }, { "epoch": 0.4598, "grad_norm": 0.15910696983337402, "learning_rate": 6.004040000000001e-05, "loss": 0.2403, "step": 349900 }, { "epoch": 0.46, "grad_norm": 0.16688168048858643, "learning_rate": 6.00004e-05, "loss": 0.241, "step": 350000 }, { "epoch": 0.0002, "grad_norm": 0.18989884853363037, "learning_rate": 5.99604e-05, "loss": 0.2474, "step": 350100 }, { "epoch": 0.0004, "grad_norm": 0.39203211665153503, "learning_rate": 5.9920399999999996e-05, "loss": 0.2589, "step": 350200 }, { "epoch": 0.0006, "grad_norm": 0.4958038628101349, "learning_rate": 5.9880400000000004e-05, "loss": 0.2596, "step": 350300 }, { "epoch": 0.0008, "grad_norm": 0.2854653596878052, "learning_rate": 5.984040000000001e-05, "loss": 0.2638, "step": 350400 }, { "epoch": 0.001, "grad_norm": 0.1860920786857605, "learning_rate": 
5.98004e-05, "loss": 0.2509, "step": 350500 }, { "epoch": 0.0012, "grad_norm": 0.41365325450897217, "learning_rate": 5.9760400000000005e-05, "loss": 0.2487, "step": 350600 }, { "epoch": 0.0014, "grad_norm": 0.35434776544570923, "learning_rate": 5.97204e-05, "loss": 0.2528, "step": 350700 }, { "epoch": 0.0016, "grad_norm": 0.3473292887210846, "learning_rate": 5.9680400000000005e-05, "loss": 0.2534, "step": 350800 }, { "epoch": 0.0018, "grad_norm": 0.3211774528026581, "learning_rate": 5.9640400000000006e-05, "loss": 0.2452, "step": 350900 }, { "epoch": 0.002, "grad_norm": 0.32706430554389954, "learning_rate": 5.96004e-05, "loss": 0.2444, "step": 351000 }, { "epoch": 0.0022, "grad_norm": 0.3087383210659027, "learning_rate": 5.9560400000000007e-05, "loss": 0.2609, "step": 351100 }, { "epoch": 0.0024, "grad_norm": 0.3712453246116638, "learning_rate": 5.95204e-05, "loss": 0.2478, "step": 351200 }, { "epoch": 0.0026, "grad_norm": 0.5335580706596375, "learning_rate": 5.94804e-05, "loss": 0.2533, "step": 351300 }, { "epoch": 0.0028, "grad_norm": 0.14397287368774414, "learning_rate": 5.944040000000001e-05, "loss": 0.2397, "step": 351400 }, { "epoch": 0.003, "grad_norm": 0.3205547332763672, "learning_rate": 5.94004e-05, "loss": 0.2501, "step": 351500 }, { "epoch": 0.0032, "grad_norm": 0.29998424649238586, "learning_rate": 5.93604e-05, "loss": 0.2614, "step": 351600 }, { "epoch": 0.0034, "grad_norm": 0.8002346754074097, "learning_rate": 5.9320399999999995e-05, "loss": 0.2461, "step": 351700 }, { "epoch": 0.0036, "grad_norm": 0.23687268793582916, "learning_rate": 5.92804e-05, "loss": 0.2502, "step": 351800 }, { "epoch": 0.0038, "grad_norm": 0.15750548243522644, "learning_rate": 5.924040000000001e-05, "loss": 0.2511, "step": 351900 }, { "epoch": 0.004, "grad_norm": 0.3123827576637268, "learning_rate": 5.92004e-05, "loss": 0.2481, "step": 352000 }, { "epoch": 0.0042, "grad_norm": 0.2805723249912262, "learning_rate": 5.9160400000000004e-05, "loss": 0.2508, "step": 352100 }, { 
"epoch": 0.0044, "grad_norm": 0.1956070512533188, "learning_rate": 5.91204e-05, "loss": 0.2677, "step": 352200 }, { "epoch": 0.0046, "grad_norm": 0.2959635853767395, "learning_rate": 5.9080400000000004e-05, "loss": 0.2544, "step": 352300 }, { "epoch": 0.0048, "grad_norm": 0.23278026282787323, "learning_rate": 5.9040400000000005e-05, "loss": 0.2426, "step": 352400 }, { "epoch": 0.005, "grad_norm": 0.25376245379447937, "learning_rate": 5.90004e-05, "loss": 0.2397, "step": 352500 }, { "epoch": 0.0052, "grad_norm": 0.2448563426733017, "learning_rate": 5.8960400000000006e-05, "loss": 0.2389, "step": 352600 }, { "epoch": 0.0054, "grad_norm": 0.3233645260334015, "learning_rate": 5.89204e-05, "loss": 0.2447, "step": 352700 }, { "epoch": 0.0056, "grad_norm": 0.2815536558628082, "learning_rate": 5.88804e-05, "loss": 0.2547, "step": 352800 }, { "epoch": 0.0058, "grad_norm": 0.30052757263183594, "learning_rate": 5.884040000000001e-05, "loss": 0.2399, "step": 352900 }, { "epoch": 0.006, "grad_norm": 0.26953259110450745, "learning_rate": 5.88004e-05, "loss": 0.292, "step": 353000 }, { "epoch": 0.0062, "grad_norm": 0.17700572311878204, "learning_rate": 5.876040000000001e-05, "loss": 0.2503, "step": 353100 }, { "epoch": 0.0064, "grad_norm": 0.15203849971294403, "learning_rate": 5.8720399999999994e-05, "loss": 0.2459, "step": 353200 }, { "epoch": 0.0066, "grad_norm": 0.6238852143287659, "learning_rate": 5.86804e-05, "loss": 0.2607, "step": 353300 }, { "epoch": 0.0068, "grad_norm": 0.18438929319381714, "learning_rate": 5.864040000000001e-05, "loss": 0.245, "step": 353400 }, { "epoch": 0.007, "grad_norm": 0.18198923766613007, "learning_rate": 5.86004e-05, "loss": 0.255, "step": 353500 }, { "epoch": 0.0072, "grad_norm": 0.1955738663673401, "learning_rate": 5.85604e-05, "loss": 0.2535, "step": 353600 }, { "epoch": 0.0074, "grad_norm": 0.22063769400119781, "learning_rate": 5.8520399999999996e-05, "loss": 0.2598, "step": 353700 }, { "epoch": 0.0076, "grad_norm": 0.15472210943698883, 
"learning_rate": 5.8480400000000003e-05, "loss": 0.2614, "step": 353800 }, { "epoch": 0.0078, "grad_norm": 0.19167518615722656, "learning_rate": 5.8440400000000004e-05, "loss": 0.2476, "step": 353900 }, { "epoch": 0.008, "grad_norm": 0.2191273272037506, "learning_rate": 5.84004e-05, "loss": 0.2533, "step": 354000 }, { "epoch": 0.0082, "grad_norm": 0.3382338285446167, "learning_rate": 5.8360400000000005e-05, "loss": 0.2611, "step": 354100 }, { "epoch": 0.0084, "grad_norm": 0.559860110282898, "learning_rate": 5.83204e-05, "loss": 0.2478, "step": 354200 }, { "epoch": 0.0086, "grad_norm": 0.1866038739681244, "learning_rate": 5.82804e-05, "loss": 0.2457, "step": 354300 }, { "epoch": 0.0088, "grad_norm": 0.21642564237117767, "learning_rate": 5.8240400000000006e-05, "loss": 0.246, "step": 354400 }, { "epoch": 0.009, "grad_norm": 0.23860223591327667, "learning_rate": 5.82004e-05, "loss": 0.2595, "step": 354500 }, { "epoch": 0.0092, "grad_norm": 0.21850599348545074, "learning_rate": 5.8160400000000006e-05, "loss": 0.2626, "step": 354600 }, { "epoch": 0.0094, "grad_norm": 0.2536315619945526, "learning_rate": 5.812040000000001e-05, "loss": 0.2551, "step": 354700 }, { "epoch": 0.0096, "grad_norm": 0.22218576073646545, "learning_rate": 5.80804e-05, "loss": 0.2647, "step": 354800 }, { "epoch": 0.0098, "grad_norm": 0.1981973648071289, "learning_rate": 5.804040000000001e-05, "loss": 0.2608, "step": 354900 }, { "epoch": 0.01, "grad_norm": 0.23694828152656555, "learning_rate": 5.80004e-05, "loss": 0.2562, "step": 355000 }, { "epoch": 0.0102, "grad_norm": 0.22222231328487396, "learning_rate": 5.79604e-05, "loss": 0.2607, "step": 355100 }, { "epoch": 0.0104, "grad_norm": 0.26125141978263855, "learning_rate": 5.792040000000001e-05, "loss": 0.2451, "step": 355200 }, { "epoch": 0.0106, "grad_norm": 0.2317095398902893, "learning_rate": 5.78804e-05, "loss": 0.2566, "step": 355300 }, { "epoch": 0.0108, "grad_norm": 0.24477675557136536, "learning_rate": 5.78404e-05, "loss": 0.2598, "step": 
355400 }, { "epoch": 0.011, "grad_norm": 0.2259080708026886, "learning_rate": 5.7800399999999996e-05, "loss": 0.2576, "step": 355500 }, { "epoch": 0.0112, "grad_norm": 0.20323753356933594, "learning_rate": 5.7760400000000004e-05, "loss": 0.2531, "step": 355600 }, { "epoch": 0.0114, "grad_norm": 0.2604706883430481, "learning_rate": 5.772040000000001e-05, "loss": 0.2474, "step": 355700 }, { "epoch": 0.0116, "grad_norm": 0.2472306191921234, "learning_rate": 5.7680400000000004e-05, "loss": 0.2574, "step": 355800 }, { "epoch": 0.0118, "grad_norm": 0.21734565496444702, "learning_rate": 5.7640400000000005e-05, "loss": 0.2589, "step": 355900 }, { "epoch": 0.012, "grad_norm": 0.1756725013256073, "learning_rate": 5.76004e-05, "loss": 0.25, "step": 356000 }, { "epoch": 0.0122, "grad_norm": 0.2940695881843567, "learning_rate": 5.7560400000000005e-05, "loss": 0.2797, "step": 356100 }, { "epoch": 0.0124, "grad_norm": 0.1878010779619217, "learning_rate": 5.7520400000000006e-05, "loss": 0.2494, "step": 356200 }, { "epoch": 0.0126, "grad_norm": 0.2582753598690033, "learning_rate": 5.74804e-05, "loss": 0.2584, "step": 356300 }, { "epoch": 0.0128, "grad_norm": 0.15801192820072174, "learning_rate": 5.7440400000000007e-05, "loss": 0.2465, "step": 356400 }, { "epoch": 0.013, "grad_norm": 0.26261720061302185, "learning_rate": 5.74004e-05, "loss": 0.2502, "step": 356500 }, { "epoch": 0.0132, "grad_norm": 0.22228971123695374, "learning_rate": 5.73604e-05, "loss": 0.2445, "step": 356600 }, { "epoch": 0.0134, "grad_norm": 0.2221522033214569, "learning_rate": 5.732040000000001e-05, "loss": 0.2695, "step": 356700 }, { "epoch": 0.0136, "grad_norm": 0.24543708562850952, "learning_rate": 5.72804e-05, "loss": 0.2442, "step": 356800 }, { "epoch": 0.0138, "grad_norm": 0.2565612196922302, "learning_rate": 5.724040000000001e-05, "loss": 0.2623, "step": 356900 }, { "epoch": 0.014, "grad_norm": 0.2401857078075409, "learning_rate": 5.7200399999999995e-05, "loss": 0.2575, "step": 357000 }, { "epoch": 
0.0142, "grad_norm": 0.18169806897640228, "learning_rate": 5.71604e-05, "loss": 0.2514, "step": 357100 }, { "epoch": 0.0144, "grad_norm": 0.2812741994857788, "learning_rate": 5.712040000000001e-05, "loss": 0.2513, "step": 357200 }, { "epoch": 0.0146, "grad_norm": 0.16843974590301514, "learning_rate": 5.70804e-05, "loss": 0.2564, "step": 357300 }, { "epoch": 0.0148, "grad_norm": 0.31371912360191345, "learning_rate": 5.7040400000000004e-05, "loss": 0.2476, "step": 357400 }, { "epoch": 0.015, "grad_norm": 0.23738493025302887, "learning_rate": 5.70004e-05, "loss": 0.2573, "step": 357500 }, { "epoch": 0.0152, "grad_norm": 0.19511915743350983, "learning_rate": 5.6960400000000004e-05, "loss": 0.2506, "step": 357600 }, { "epoch": 0.0154, "grad_norm": 0.16517697274684906, "learning_rate": 5.6920400000000005e-05, "loss": 0.2485, "step": 357700 }, { "epoch": 0.0156, "grad_norm": 0.19995634257793427, "learning_rate": 5.68804e-05, "loss": 0.2484, "step": 357800 }, { "epoch": 0.0158, "grad_norm": 0.21146655082702637, "learning_rate": 5.6840400000000006e-05, "loss": 0.2443, "step": 357900 }, { "epoch": 0.016, "grad_norm": 0.34297746419906616, "learning_rate": 5.68004e-05, "loss": 0.2451, "step": 358000 }, { "epoch": 0.0162, "grad_norm": 0.20331263542175293, "learning_rate": 5.67604e-05, "loss": 0.2586, "step": 358100 }, { "epoch": 0.0164, "grad_norm": 0.3667886257171631, "learning_rate": 5.672040000000001e-05, "loss": 0.249, "step": 358200 }, { "epoch": 0.0166, "grad_norm": 0.31036943197250366, "learning_rate": 5.66804e-05, "loss": 0.2457, "step": 358300 }, { "epoch": 0.0168, "grad_norm": 0.2320556640625, "learning_rate": 5.664040000000001e-05, "loss": 0.253, "step": 358400 }, { "epoch": 0.017, "grad_norm": 0.23382051289081573, "learning_rate": 5.66004e-05, "loss": 0.2578, "step": 358500 }, { "epoch": 0.0172, "grad_norm": 0.2036983221769333, "learning_rate": 5.65604e-05, "loss": 0.2636, "step": 358600 }, { "epoch": 0.0174, "grad_norm": 0.3016323745250702, "learning_rate": 
5.652040000000001e-05, "loss": 0.2532, "step": 358700 }, { "epoch": 0.0176, "grad_norm": 0.1959076225757599, "learning_rate": 5.64804e-05, "loss": 0.2521, "step": 358800 }, { "epoch": 0.0178, "grad_norm": 0.22912850975990295, "learning_rate": 5.64404e-05, "loss": 0.2401, "step": 358900 }, { "epoch": 0.018, "grad_norm": 0.16965632140636444, "learning_rate": 5.6400399999999996e-05, "loss": 0.2506, "step": 359000 }, { "epoch": 0.0182, "grad_norm": 0.17492711544036865, "learning_rate": 5.63604e-05, "loss": 0.2504, "step": 359100 }, { "epoch": 0.0184, "grad_norm": 0.1555829495191574, "learning_rate": 5.6320400000000004e-05, "loss": 0.254, "step": 359200 }, { "epoch": 0.0186, "grad_norm": 0.20478782057762146, "learning_rate": 5.62804e-05, "loss": 0.245, "step": 359300 }, { "epoch": 0.0188, "grad_norm": 0.21451802551746368, "learning_rate": 5.6240400000000005e-05, "loss": 0.2356, "step": 359400 }, { "epoch": 0.019, "grad_norm": 0.4382535517215729, "learning_rate": 5.62004e-05, "loss": 0.265, "step": 359500 }, { "epoch": 0.0192, "grad_norm": 0.20359216630458832, "learning_rate": 5.6160400000000005e-05, "loss": 0.2408, "step": 359600 }, { "epoch": 0.0194, "grad_norm": 0.17969875037670135, "learning_rate": 5.6120400000000006e-05, "loss": 0.2609, "step": 359700 }, { "epoch": 0.0196, "grad_norm": 0.20156468451023102, "learning_rate": 5.60804e-05, "loss": 0.2464, "step": 359800 }, { "epoch": 0.0198, "grad_norm": 0.518825888633728, "learning_rate": 5.6040400000000006e-05, "loss": 0.255, "step": 359900 }, { "epoch": 0.02, "grad_norm": 0.2389851212501526, "learning_rate": 5.60004e-05, "loss": 0.257, "step": 360000 }, { "epoch": 0.0202, "grad_norm": 0.15437252819538116, "learning_rate": 5.59604e-05, "loss": 0.2511, "step": 360100 }, { "epoch": 0.0204, "grad_norm": 0.687829852104187, "learning_rate": 5.592040000000001e-05, "loss": 0.2643, "step": 360200 }, { "epoch": 0.0206, "grad_norm": 0.18927748501300812, "learning_rate": 5.58804e-05, "loss": 0.2553, "step": 360300 }, { "epoch": 
0.0208, "grad_norm": 0.1879296749830246, "learning_rate": 5.58404e-05, "loss": 0.2584, "step": 360400 }, { "epoch": 0.021, "grad_norm": 0.2479875683784485, "learning_rate": 5.5800399999999995e-05, "loss": 0.2374, "step": 360500 }, { "epoch": 0.0212, "grad_norm": 0.20449163019657135, "learning_rate": 5.57604e-05, "loss": 0.2585, "step": 360600 }, { "epoch": 0.0214, "grad_norm": 0.18261049687862396, "learning_rate": 5.57204e-05, "loss": 0.2383, "step": 360700 }, { "epoch": 0.0216, "grad_norm": 0.2606537938117981, "learning_rate": 5.5680399999999996e-05, "loss": 0.2542, "step": 360800 }, { "epoch": 0.0218, "grad_norm": 0.18397171795368195, "learning_rate": 5.5640400000000003e-05, "loss": 0.2565, "step": 360900 }, { "epoch": 0.022, "grad_norm": 0.6222195029258728, "learning_rate": 5.560040000000001e-05, "loss": 0.2505, "step": 361000 }, { "epoch": 0.0222, "grad_norm": 0.26597991585731506, "learning_rate": 5.5560400000000004e-05, "loss": 0.2567, "step": 361100 }, { "epoch": 0.0224, "grad_norm": 0.26086699962615967, "learning_rate": 5.5520400000000005e-05, "loss": 0.2522, "step": 361200 }, { "epoch": 0.0226, "grad_norm": 0.3996891379356384, "learning_rate": 5.54804e-05, "loss": 0.2609, "step": 361300 }, { "epoch": 0.0228, "grad_norm": 0.4706418216228485, "learning_rate": 5.5440400000000005e-05, "loss": 0.2651, "step": 361400 }, { "epoch": 0.023, "grad_norm": 0.23967041075229645, "learning_rate": 5.5400400000000006e-05, "loss": 0.2546, "step": 361500 }, { "epoch": 0.0232, "grad_norm": 0.23435647785663605, "learning_rate": 5.53604e-05, "loss": 0.2494, "step": 361600 }, { "epoch": 0.0234, "grad_norm": 0.1719307005405426, "learning_rate": 5.5320400000000007e-05, "loss": 0.2454, "step": 361700 }, { "epoch": 0.0236, "grad_norm": 0.24978755414485931, "learning_rate": 5.52804e-05, "loss": 0.2624, "step": 361800 }, { "epoch": 0.0238, "grad_norm": 0.2097492516040802, "learning_rate": 5.52404e-05, "loss": 0.2945, "step": 361900 }, { "epoch": 0.024, "grad_norm": 0.23128892481327057, 
"learning_rate": 5.520040000000001e-05, "loss": 0.2447, "step": 362000 }, { "epoch": 0.0242, "grad_norm": 0.2238485962152481, "learning_rate": 5.51604e-05, "loss": 0.2516, "step": 362100 }, { "epoch": 0.0244, "grad_norm": 0.21893225610256195, "learning_rate": 5.512040000000001e-05, "loss": 0.2568, "step": 362200 }, { "epoch": 0.0246, "grad_norm": 0.18265652656555176, "learning_rate": 5.5080399999999995e-05, "loss": 0.2509, "step": 362300 }, { "epoch": 0.0248, "grad_norm": 0.22736512124538422, "learning_rate": 5.50404e-05, "loss": 0.2554, "step": 362400 }, { "epoch": 0.025, "grad_norm": 0.257276326417923, "learning_rate": 5.500040000000001e-05, "loss": 0.2661, "step": 362500 }, { "epoch": 0.0252, "grad_norm": 0.2855454087257385, "learning_rate": 5.49604e-05, "loss": 0.2708, "step": 362600 }, { "epoch": 0.0254, "grad_norm": 0.2235616147518158, "learning_rate": 5.4920400000000004e-05, "loss": 0.2706, "step": 362700 }, { "epoch": 0.0256, "grad_norm": 0.27502375841140747, "learning_rate": 5.48804e-05, "loss": 0.2533, "step": 362800 }, { "epoch": 0.0258, "grad_norm": 0.31357336044311523, "learning_rate": 5.4840400000000004e-05, "loss": 0.2586, "step": 362900 }, { "epoch": 0.026, "grad_norm": 0.22784167528152466, "learning_rate": 5.4800400000000005e-05, "loss": 0.2924, "step": 363000 }, { "epoch": 0.0262, "grad_norm": 0.16127784550189972, "learning_rate": 5.47604e-05, "loss": 0.2581, "step": 363100 }, { "epoch": 0.0264, "grad_norm": 0.23932571709156036, "learning_rate": 5.4720400000000006e-05, "loss": 0.2612, "step": 363200 }, { "epoch": 0.0266, "grad_norm": 0.22151145339012146, "learning_rate": 5.46804e-05, "loss": 0.2579, "step": 363300 }, { "epoch": 0.0268, "grad_norm": 0.2576836347579956, "learning_rate": 5.46404e-05, "loss": 0.2546, "step": 363400 }, { "epoch": 0.027, "grad_norm": 0.2526397407054901, "learning_rate": 5.460040000000001e-05, "loss": 0.2403, "step": 363500 }, { "epoch": 0.0272, "grad_norm": 0.29295867681503296, "learning_rate": 5.45604e-05, "loss": 
0.2351, "step": 363600 }, { "epoch": 0.0274, "grad_norm": 0.21946589648723602, "learning_rate": 5.452040000000001e-05, "loss": 0.2376, "step": 363700 }, { "epoch": 0.0276, "grad_norm": 0.34592190384864807, "learning_rate": 5.44804e-05, "loss": 0.2373, "step": 363800 }, { "epoch": 0.0278, "grad_norm": 0.16898639500141144, "learning_rate": 5.44404e-05, "loss": 0.2348, "step": 363900 }, { "epoch": 0.028, "grad_norm": 0.18720662593841553, "learning_rate": 5.440040000000001e-05, "loss": 0.2453, "step": 364000 }, { "epoch": 0.0282, "grad_norm": 0.18894708156585693, "learning_rate": 5.43604e-05, "loss": 0.2265, "step": 364100 }, { "epoch": 0.0284, "grad_norm": 0.24615564942359924, "learning_rate": 5.43204e-05, "loss": 0.233, "step": 364200 }, { "epoch": 0.0286, "grad_norm": 0.18543006479740143, "learning_rate": 5.4280399999999996e-05, "loss": 0.2448, "step": 364300 }, { "epoch": 0.0288, "grad_norm": 0.21889004111289978, "learning_rate": 5.42404e-05, "loss": 0.2341, "step": 364400 }, { "epoch": 0.029, "grad_norm": 0.40268123149871826, "learning_rate": 5.4200400000000004e-05, "loss": 0.2447, "step": 364500 }, { "epoch": 0.0292, "grad_norm": 0.5381551384925842, "learning_rate": 5.41604e-05, "loss": 0.2338, "step": 364600 }, { "epoch": 0.0294, "grad_norm": 0.39877715706825256, "learning_rate": 5.4120400000000004e-05, "loss": 0.2323, "step": 364700 }, { "epoch": 0.0296, "grad_norm": 0.17573441565036774, "learning_rate": 5.40804e-05, "loss": 0.2382, "step": 364800 }, { "epoch": 0.0298, "grad_norm": 0.14901190996170044, "learning_rate": 5.4040400000000005e-05, "loss": 0.2371, "step": 364900 }, { "epoch": 0.03, "grad_norm": 0.22328852117061615, "learning_rate": 5.4000400000000006e-05, "loss": 0.2417, "step": 365000 }, { "epoch": 0.0302, "grad_norm": 0.1952635943889618, "learning_rate": 5.39604e-05, "loss": 0.2407, "step": 365100 }, { "epoch": 0.0304, "grad_norm": 0.1605110615491867, "learning_rate": 5.3920400000000006e-05, "loss": 0.2352, "step": 365200 }, { "epoch": 0.0306, 
"grad_norm": 0.24076199531555176, "learning_rate": 5.38804e-05, "loss": 0.2355, "step": 365300 }, { "epoch": 0.0308, "grad_norm": 0.269298791885376, "learning_rate": 5.38404e-05, "loss": 0.2386, "step": 365400 }, { "epoch": 0.031, "grad_norm": 0.22119976580142975, "learning_rate": 5.380040000000001e-05, "loss": 0.2405, "step": 365500 }, { "epoch": 0.0312, "grad_norm": 0.21899624168872833, "learning_rate": 5.37604e-05, "loss": 0.2482, "step": 365600 }, { "epoch": 0.0314, "grad_norm": 0.48613792657852173, "learning_rate": 5.37204e-05, "loss": 0.2388, "step": 365700 }, { "epoch": 0.0316, "grad_norm": 0.22591345012187958, "learning_rate": 5.3680399999999995e-05, "loss": 0.2351, "step": 365800 }, { "epoch": 0.0318, "grad_norm": 0.4765823483467102, "learning_rate": 5.36404e-05, "loss": 0.2387, "step": 365900 }, { "epoch": 0.032, "grad_norm": 0.30145007371902466, "learning_rate": 5.360040000000001e-05, "loss": 0.2419, "step": 366000 }, { "epoch": 0.0322, "grad_norm": 0.2979516386985779, "learning_rate": 5.3560399999999996e-05, "loss": 0.244, "step": 366100 }, { "epoch": 0.0324, "grad_norm": 0.23975351452827454, "learning_rate": 5.3520400000000003e-05, "loss": 0.2393, "step": 366200 }, { "epoch": 0.0326, "grad_norm": 0.22233131527900696, "learning_rate": 5.34804e-05, "loss": 0.2422, "step": 366300 }, { "epoch": 0.0328, "grad_norm": 0.1476048082113266, "learning_rate": 5.3440400000000004e-05, "loss": 0.2493, "step": 366400 }, { "epoch": 0.033, "grad_norm": 0.20657120645046234, "learning_rate": 5.3400400000000005e-05, "loss": 0.2445, "step": 366500 }, { "epoch": 0.0332, "grad_norm": 0.5175828337669373, "learning_rate": 5.33604e-05, "loss": 0.2467, "step": 366600 }, { "epoch": 0.0334, "grad_norm": 0.2220238447189331, "learning_rate": 5.3320400000000005e-05, "loss": 0.319, "step": 366700 }, { "epoch": 0.0336, "grad_norm": 0.3628231883049011, "learning_rate": 5.3280400000000006e-05, "loss": 0.2405, "step": 366800 }, { "epoch": 0.0338, "grad_norm": 0.235669806599617, 
"learning_rate": 5.32404e-05, "loss": 0.245, "step": 366900 }, { "epoch": 0.034, "grad_norm": 0.1623888611793518, "learning_rate": 5.3200400000000007e-05, "loss": 0.2425, "step": 367000 }, { "epoch": 0.0342, "grad_norm": 0.28828221559524536, "learning_rate": 5.31604e-05, "loss": 0.2411, "step": 367100 }, { "epoch": 0.0344, "grad_norm": 0.7484803199768066, "learning_rate": 5.31204e-05, "loss": 0.2386, "step": 367200 }, { "epoch": 0.0346, "grad_norm": 0.39959147572517395, "learning_rate": 5.308040000000001e-05, "loss": 0.2359, "step": 367300 }, { "epoch": 0.0348, "grad_norm": 0.1961216777563095, "learning_rate": 5.30404e-05, "loss": 0.2398, "step": 367400 }, { "epoch": 0.035, "grad_norm": 0.17088356614112854, "learning_rate": 5.300040000000001e-05, "loss": 0.2363, "step": 367500 }, { "epoch": 0.0352, "grad_norm": 0.394522100687027, "learning_rate": 5.29604e-05, "loss": 0.2385, "step": 367600 }, { "epoch": 0.0354, "grad_norm": 0.26079925894737244, "learning_rate": 5.29204e-05, "loss": 0.2316, "step": 367700 }, { "epoch": 0.0356, "grad_norm": 0.3335099220275879, "learning_rate": 5.288040000000001e-05, "loss": 0.2549, "step": 367800 }, { "epoch": 0.0358, "grad_norm": 0.3195158541202545, "learning_rate": 5.28404e-05, "loss": 0.2463, "step": 367900 }, { "epoch": 0.036, "grad_norm": 0.18612483143806458, "learning_rate": 5.2800400000000004e-05, "loss": 0.2408, "step": 368000 }, { "epoch": 0.0362, "grad_norm": 0.4741574823856354, "learning_rate": 5.27604e-05, "loss": 0.245, "step": 368100 }, { "epoch": 0.0364, "grad_norm": 0.26677343249320984, "learning_rate": 5.2720400000000004e-05, "loss": 0.2402, "step": 368200 }, { "epoch": 0.0366, "grad_norm": 0.24383248388767242, "learning_rate": 5.2680400000000005e-05, "loss": 0.242, "step": 368300 }, { "epoch": 0.0368, "grad_norm": 0.18668265640735626, "learning_rate": 5.26404e-05, "loss": 0.2431, "step": 368400 }, { "epoch": 0.037, "grad_norm": 0.18299442529678345, "learning_rate": 5.2600400000000005e-05, "loss": 0.2464, "step": 
368500 }, { "epoch": 0.0372, "grad_norm": 0.22154571115970612, "learning_rate": 5.25604e-05, "loss": 0.2353, "step": 368600 }, { "epoch": 0.0374, "grad_norm": 1.3703587055206299, "learning_rate": 5.2520400000000006e-05, "loss": 0.2474, "step": 368700 }, { "epoch": 0.0376, "grad_norm": 0.19284115731716156, "learning_rate": 5.248040000000001e-05, "loss": 0.2467, "step": 368800 }, { "epoch": 0.0378, "grad_norm": 0.2304501235485077, "learning_rate": 5.24404e-05, "loss": 0.2373, "step": 368900 }, { "epoch": 0.038, "grad_norm": 0.1811191737651825, "learning_rate": 5.240040000000001e-05, "loss": 0.2381, "step": 369000 }, { "epoch": 0.0382, "grad_norm": 0.19747400283813477, "learning_rate": 5.23604e-05, "loss": 0.2437, "step": 369100 }, { "epoch": 0.0384, "grad_norm": 0.16430014371871948, "learning_rate": 5.23204e-05, "loss": 0.2458, "step": 369200 }, { "epoch": 0.0386, "grad_norm": 0.27376890182495117, "learning_rate": 5.228040000000001e-05, "loss": 0.2373, "step": 369300 }, { "epoch": 0.0388, "grad_norm": 0.21681749820709229, "learning_rate": 5.22404e-05, "loss": 0.2479, "step": 369400 }, { "epoch": 0.039, "grad_norm": 0.28773918747901917, "learning_rate": 5.22004e-05, "loss": 0.2521, "step": 369500 }, { "epoch": 0.0392, "grad_norm": 0.26816967129707336, "learning_rate": 5.2160399999999996e-05, "loss": 0.2464, "step": 369600 }, { "epoch": 0.0394, "grad_norm": 0.16062785685062408, "learning_rate": 5.21204e-05, "loss": 0.239, "step": 369700 }, { "epoch": 0.0396, "grad_norm": 0.23461496829986572, "learning_rate": 5.2080400000000004e-05, "loss": 0.2394, "step": 369800 }, { "epoch": 0.0398, "grad_norm": 0.19398300349712372, "learning_rate": 5.20404e-05, "loss": 0.2365, "step": 369900 }, { "epoch": 0.04, "grad_norm": 0.16843223571777344, "learning_rate": 5.2000400000000004e-05, "loss": 0.2425, "step": 370000 }, { "epoch": 0.0402, "grad_norm": 0.19908249378204346, "learning_rate": 5.19604e-05, "loss": 0.2339, "step": 370100 }, { "epoch": 0.0404, "grad_norm": 
0.22099874913692474, "learning_rate": 5.1920400000000005e-05, "loss": 0.2519, "step": 370200 }, { "epoch": 0.0406, "grad_norm": 0.2163575440645218, "learning_rate": 5.1880400000000006e-05, "loss": 0.2494, "step": 370300 }, { "epoch": 0.0408, "grad_norm": 0.2572519779205322, "learning_rate": 5.18404e-05, "loss": 0.2431, "step": 370400 }, { "epoch": 0.041, "grad_norm": 0.5304775834083557, "learning_rate": 5.1800400000000006e-05, "loss": 0.2427, "step": 370500 }, { "epoch": 0.0412, "grad_norm": 0.3667397201061249, "learning_rate": 5.17604e-05, "loss": 0.2403, "step": 370600 }, { "epoch": 0.0414, "grad_norm": 0.24394138157367706, "learning_rate": 5.17204e-05, "loss": 0.2401, "step": 370700 }, { "epoch": 0.0416, "grad_norm": 0.16485385596752167, "learning_rate": 5.168040000000001e-05, "loss": 0.2516, "step": 370800 }, { "epoch": 0.0418, "grad_norm": 0.19729548692703247, "learning_rate": 5.16404e-05, "loss": 0.2456, "step": 370900 }, { "epoch": 0.042, "grad_norm": 0.3520711064338684, "learning_rate": 5.16004e-05, "loss": 0.2456, "step": 371000 }, { "epoch": 0.0422, "grad_norm": 0.1835898905992508, "learning_rate": 5.1560399999999995e-05, "loss": 0.2417, "step": 371100 }, { "epoch": 0.0424, "grad_norm": 0.502991795539856, "learning_rate": 5.15204e-05, "loss": 0.2446, "step": 371200 }, { "epoch": 0.0426, "grad_norm": 0.2608524560928345, "learning_rate": 5.148040000000001e-05, "loss": 0.2473, "step": 371300 }, { "epoch": 0.0428, "grad_norm": 0.23622851073741913, "learning_rate": 5.14404e-05, "loss": 0.2451, "step": 371400 }, { "epoch": 0.043, "grad_norm": 0.3664970099925995, "learning_rate": 5.1400400000000003e-05, "loss": 0.2457, "step": 371500 }, { "epoch": 0.0432, "grad_norm": 0.16481013596057892, "learning_rate": 5.13604e-05, "loss": 0.2499, "step": 371600 }, { "epoch": 0.0434, "grad_norm": 0.23250015079975128, "learning_rate": 5.1320400000000004e-05, "loss": 0.2453, "step": 371700 }, { "epoch": 0.0436, "grad_norm": 0.262839674949646, "learning_rate": 
5.1280400000000005e-05, "loss": 0.2438, "step": 371800 }, { "epoch": 0.0438, "grad_norm": 0.14810775220394135, "learning_rate": 5.12404e-05, "loss": 0.2445, "step": 371900 }, { "epoch": 0.044, "grad_norm": 0.20043599605560303, "learning_rate": 5.1200400000000005e-05, "loss": 0.2465, "step": 372000 }, { "epoch": 0.0442, "grad_norm": 0.21188238263130188, "learning_rate": 5.11604e-05, "loss": 0.248, "step": 372100 }, { "epoch": 0.0444, "grad_norm": 0.19826635718345642, "learning_rate": 5.11204e-05, "loss": 0.2357, "step": 372200 }, { "epoch": 0.0446, "grad_norm": 0.26934173703193665, "learning_rate": 5.1080400000000006e-05, "loss": 0.2872, "step": 372300 }, { "epoch": 0.0448, "grad_norm": 0.21112340688705444, "learning_rate": 5.10404e-05, "loss": 0.2405, "step": 372400 }, { "epoch": 0.045, "grad_norm": 0.17311957478523254, "learning_rate": 5.10004e-05, "loss": 0.2367, "step": 372500 }, { "epoch": 0.0452, "grad_norm": 0.16153335571289062, "learning_rate": 5.0960399999999994e-05, "loss": 0.2371, "step": 372600 }, { "epoch": 0.0454, "grad_norm": 0.19075436890125275, "learning_rate": 5.09204e-05, "loss": 0.2342, "step": 372700 }, { "epoch": 0.0456, "grad_norm": 0.26471832394599915, "learning_rate": 5.088040000000001e-05, "loss": 0.2336, "step": 372800 }, { "epoch": 0.0458, "grad_norm": 0.2320035994052887, "learning_rate": 5.08404e-05, "loss": 0.238, "step": 372900 }, { "epoch": 0.046, "grad_norm": 0.19585338234901428, "learning_rate": 5.08004e-05, "loss": 0.2374, "step": 373000 }, { "epoch": 0.0462, "grad_norm": 0.21926333010196686, "learning_rate": 5.076040000000001e-05, "loss": 0.2452, "step": 373100 }, { "epoch": 0.0464, "grad_norm": 0.2396099716424942, "learning_rate": 5.07204e-05, "loss": 0.2591, "step": 373200 }, { "epoch": 0.0466, "grad_norm": 0.23458154499530792, "learning_rate": 5.0680400000000004e-05, "loss": 0.2393, "step": 373300 }, { "epoch": 0.0468, "grad_norm": 0.23646235466003418, "learning_rate": 5.06404e-05, "loss": 0.2455, "step": 373400 }, { "epoch": 
0.047, "grad_norm": 0.16524997353553772, "learning_rate": 5.0600400000000004e-05, "loss": 0.2423, "step": 373500 }, { "epoch": 0.0472, "grad_norm": 0.19128525257110596, "learning_rate": 5.0560400000000005e-05, "loss": 0.2396, "step": 373600 }, { "epoch": 0.0474, "grad_norm": 0.1869351863861084, "learning_rate": 5.05204e-05, "loss": 0.243, "step": 373700 }, { "epoch": 0.0476, "grad_norm": 0.22153295576572418, "learning_rate": 5.0480400000000005e-05, "loss": 0.2425, "step": 373800 }, { "epoch": 0.0478, "grad_norm": 0.20191314816474915, "learning_rate": 5.04404e-05, "loss": 0.247, "step": 373900 }, { "epoch": 0.048, "grad_norm": 0.20306941866874695, "learning_rate": 5.0400400000000006e-05, "loss": 0.2463, "step": 374000 }, { "epoch": 0.0482, "grad_norm": 0.29261788725852966, "learning_rate": 5.0360400000000007e-05, "loss": 0.2492, "step": 374100 }, { "epoch": 0.0484, "grad_norm": 0.3436906635761261, "learning_rate": 5.03204e-05, "loss": 0.2462, "step": 374200 }, { "epoch": 0.0486, "grad_norm": 0.19893062114715576, "learning_rate": 5.028040000000001e-05, "loss": 0.2487, "step": 374300 }, { "epoch": 0.0488, "grad_norm": 0.17730703949928284, "learning_rate": 5.02404e-05, "loss": 0.2388, "step": 374400 }, { "epoch": 0.049, "grad_norm": 0.2693403661251068, "learning_rate": 5.02004e-05, "loss": 0.2472, "step": 374500 }, { "epoch": 0.0492, "grad_norm": 0.20405225455760956, "learning_rate": 5.016040000000001e-05, "loss": 0.2382, "step": 374600 }, { "epoch": 0.0494, "grad_norm": 0.30344367027282715, "learning_rate": 5.01204e-05, "loss": 0.2484, "step": 374700 }, { "epoch": 0.0496, "grad_norm": 0.23984545469284058, "learning_rate": 5.00804e-05, "loss": 0.2453, "step": 374800 }, { "epoch": 0.0498, "grad_norm": 0.20003117620944977, "learning_rate": 5.0040399999999996e-05, "loss": 0.2434, "step": 374900 }, { "epoch": 0.05, "grad_norm": 0.19574657082557678, "learning_rate": 5.00004e-05, "loss": 0.2461, "step": 375000 }, { "epoch": 0.0502, "grad_norm": 0.1617669016122818, 
"learning_rate": 4.9960400000000004e-05, "loss": 0.2407, "step": 375100 }, { "epoch": 0.0504, "grad_norm": 0.37597665190696716, "learning_rate": 4.99204e-05, "loss": 0.2579, "step": 375200 }, { "epoch": 0.0506, "grad_norm": 0.21448801457881927, "learning_rate": 4.9880400000000004e-05, "loss": 0.2458, "step": 375300 }, { "epoch": 0.0508, "grad_norm": 0.2033076137304306, "learning_rate": 4.9840400000000005e-05, "loss": 0.246, "step": 375400 }, { "epoch": 0.051, "grad_norm": 0.15409351885318756, "learning_rate": 4.9800400000000005e-05, "loss": 0.2478, "step": 375500 }, { "epoch": 0.0512, "grad_norm": 0.2577267289161682, "learning_rate": 4.97604e-05, "loss": 0.2436, "step": 375600 }, { "epoch": 0.0514, "grad_norm": 0.1761707067489624, "learning_rate": 4.97204e-05, "loss": 0.2505, "step": 375700 }, { "epoch": 0.0516, "grad_norm": 0.2014208734035492, "learning_rate": 4.9680400000000006e-05, "loss": 0.2504, "step": 375800 }, { "epoch": 0.0518, "grad_norm": 0.21886250376701355, "learning_rate": 4.964040000000001e-05, "loss": 0.2437, "step": 375900 }, { "epoch": 0.052, "grad_norm": 0.3855169415473938, "learning_rate": 4.96004e-05, "loss": 0.2589, "step": 376000 }, { "epoch": 0.0522, "grad_norm": 0.21187375485897064, "learning_rate": 4.95604e-05, "loss": 0.2376, "step": 376100 }, { "epoch": 0.0524, "grad_norm": 0.16554217040538788, "learning_rate": 4.95204e-05, "loss": 0.2448, "step": 376200 }, { "epoch": 0.0526, "grad_norm": 0.3250042498111725, "learning_rate": 4.94804e-05, "loss": 0.2583, "step": 376300 }, { "epoch": 0.0528, "grad_norm": 0.21343812346458435, "learning_rate": 4.94404e-05, "loss": 0.2478, "step": 376400 }, { "epoch": 0.053, "grad_norm": 0.2855488359928131, "learning_rate": 4.94004e-05, "loss": 0.2418, "step": 376500 }, { "epoch": 0.0532, "grad_norm": 0.21316973865032196, "learning_rate": 4.93604e-05, "loss": 0.254, "step": 376600 }, { "epoch": 0.0534, "grad_norm": 0.16673021018505096, "learning_rate": 4.93204e-05, "loss": 0.2412, "step": 376700 }, { "epoch": 
0.0536, "grad_norm": 0.2050374299287796, "learning_rate": 4.9280400000000003e-05, "loss": 0.2442, "step": 376800 }, { "epoch": 0.0538, "grad_norm": 0.17526887357234955, "learning_rate": 4.9240400000000004e-05, "loss": 0.2456, "step": 376900 }, { "epoch": 0.054, "grad_norm": 0.17823076248168945, "learning_rate": 4.9200400000000004e-05, "loss": 0.2412, "step": 377000 }, { "epoch": 0.0542, "grad_norm": 0.19412919878959656, "learning_rate": 4.91604e-05, "loss": 0.2427, "step": 377100 }, { "epoch": 0.0544, "grad_norm": 0.2670155167579651, "learning_rate": 4.91204e-05, "loss": 0.2478, "step": 377200 }, { "epoch": 0.0546, "grad_norm": 0.3044726252555847, "learning_rate": 4.9080400000000005e-05, "loss": 0.2485, "step": 377300 }, { "epoch": 0.0548, "grad_norm": 0.27025651931762695, "learning_rate": 4.9040400000000006e-05, "loss": 0.2445, "step": 377400 }, { "epoch": 0.055, "grad_norm": 0.19618715345859528, "learning_rate": 4.90004e-05, "loss": 0.2397, "step": 377500 }, { "epoch": 0.0552, "grad_norm": 0.22691640257835388, "learning_rate": 4.89604e-05, "loss": 0.2494, "step": 377600 }, { "epoch": 0.0554, "grad_norm": 0.346432089805603, "learning_rate": 4.892040000000001e-05, "loss": 0.2626, "step": 377700 }, { "epoch": 0.0556, "grad_norm": 0.37104085087776184, "learning_rate": 4.888040000000001e-05, "loss": 0.2472, "step": 377800 }, { "epoch": 0.0558, "grad_norm": 0.25854432582855225, "learning_rate": 4.88404e-05, "loss": 0.2405, "step": 377900 }, { "epoch": 0.056, "grad_norm": 0.22568698227405548, "learning_rate": 4.88004e-05, "loss": 0.2577, "step": 378000 }, { "epoch": 0.0562, "grad_norm": 0.19181467592716217, "learning_rate": 4.87604e-05, "loss": 0.2493, "step": 378100 }, { "epoch": 0.0564, "grad_norm": 0.19588135182857513, "learning_rate": 4.87204e-05, "loss": 0.249, "step": 378200 }, { "epoch": 0.0566, "grad_norm": 0.3186732232570648, "learning_rate": 4.86804e-05, "loss": 0.2346, "step": 378300 }, { "epoch": 0.0568, "grad_norm": 0.21397516131401062, "learning_rate": 
4.86404e-05, "loss": 0.254, "step": 378400 }, { "epoch": 0.057, "grad_norm": 0.14002354443073273, "learning_rate": 4.86004e-05, "loss": 0.2503, "step": 378500 }, { "epoch": 0.0572, "grad_norm": 0.18620188534259796, "learning_rate": 4.8560400000000004e-05, "loss": 0.2477, "step": 378600 }, { "epoch": 0.0574, "grad_norm": 0.34367236495018005, "learning_rate": 4.8520400000000004e-05, "loss": 0.2515, "step": 378700 }, { "epoch": 0.0576, "grad_norm": 0.20147454738616943, "learning_rate": 4.8480400000000004e-05, "loss": 0.2509, "step": 378800 }, { "epoch": 0.0578, "grad_norm": 0.22999276220798492, "learning_rate": 4.8440400000000005e-05, "loss": 0.2595, "step": 378900 }, { "epoch": 0.058, "grad_norm": 0.1593378633260727, "learning_rate": 4.84004e-05, "loss": 0.2501, "step": 379000 }, { "epoch": 0.0582, "grad_norm": 0.2817172706127167, "learning_rate": 4.83604e-05, "loss": 0.2411, "step": 379100 }, { "epoch": 0.0584, "grad_norm": 0.16755926609039307, "learning_rate": 4.8320400000000006e-05, "loss": 0.2415, "step": 379200 }, { "epoch": 0.0586, "grad_norm": 0.19628214836120605, "learning_rate": 4.8280400000000006e-05, "loss": 0.2457, "step": 379300 }, { "epoch": 0.0588, "grad_norm": 0.16852396726608276, "learning_rate": 4.82404e-05, "loss": 0.2513, "step": 379400 }, { "epoch": 0.059, "grad_norm": 0.3213140666484833, "learning_rate": 4.82004e-05, "loss": 0.2663, "step": 379500 }, { "epoch": 0.0592, "grad_norm": 0.2476736158132553, "learning_rate": 4.81604e-05, "loss": 0.2671, "step": 379600 }, { "epoch": 0.0594, "grad_norm": 0.1939520239830017, "learning_rate": 4.812040000000001e-05, "loss": 0.2486, "step": 379700 }, { "epoch": 0.0596, "grad_norm": 0.15622283518314362, "learning_rate": 4.80804e-05, "loss": 0.2358, "step": 379800 }, { "epoch": 0.0598, "grad_norm": 0.18065965175628662, "learning_rate": 4.80404e-05, "loss": 0.2286, "step": 379900 }, { "epoch": 0.06, "grad_norm": 0.16677778959274292, "learning_rate": 4.80004e-05, "loss": 0.2283, "step": 380000 }, { "epoch": 
0.0602, "grad_norm": 0.2198459357023239, "learning_rate": 4.79604e-05, "loss": 0.2241, "step": 380100 }, { "epoch": 0.0604, "grad_norm": 0.18271780014038086, "learning_rate": 4.79204e-05, "loss": 0.2241, "step": 380200 }, { "epoch": 0.0606, "grad_norm": 0.16740696132183075, "learning_rate": 4.78804e-05, "loss": 0.2233, "step": 380300 }, { "epoch": 0.0608, "grad_norm": 0.1739855706691742, "learning_rate": 4.7840400000000004e-05, "loss": 0.2212, "step": 380400 }, { "epoch": 0.061, "grad_norm": 0.20133738219738007, "learning_rate": 4.7800400000000004e-05, "loss": 0.2228, "step": 380500 }, { "epoch": 0.0612, "grad_norm": 0.15135647356510162, "learning_rate": 4.77604e-05, "loss": 0.2212, "step": 380600 }, { "epoch": 0.0614, "grad_norm": 0.19083605706691742, "learning_rate": 4.7720400000000005e-05, "loss": 0.2218, "step": 380700 }, { "epoch": 0.0616, "grad_norm": 0.16143475472927094, "learning_rate": 4.7680400000000005e-05, "loss": 0.2249, "step": 380800 }, { "epoch": 0.0618, "grad_norm": 0.23794889450073242, "learning_rate": 4.76404e-05, "loss": 0.2298, "step": 380900 }, { "epoch": 0.062, "grad_norm": 0.19995640218257904, "learning_rate": 4.76004e-05, "loss": 0.2209, "step": 381000 }, { "epoch": 0.0622, "grad_norm": 0.1754998117685318, "learning_rate": 4.7560400000000006e-05, "loss": 0.2258, "step": 381100 }, { "epoch": 0.0624, "grad_norm": 0.18169523775577545, "learning_rate": 4.752040000000001e-05, "loss": 0.2254, "step": 381200 }, { "epoch": 0.0626, "grad_norm": 0.18677940964698792, "learning_rate": 4.74804e-05, "loss": 0.2254, "step": 381300 }, { "epoch": 0.0628, "grad_norm": 0.1730923056602478, "learning_rate": 4.74404e-05, "loss": 0.2246, "step": 381400 }, { "epoch": 0.063, "grad_norm": 0.20200613141059875, "learning_rate": 4.74004e-05, "loss": 0.2234, "step": 381500 }, { "epoch": 0.0632, "grad_norm": 0.33600297570228577, "learning_rate": 4.73604e-05, "loss": 0.2258, "step": 381600 }, { "epoch": 0.0634, "grad_norm": 0.16173823177814484, "learning_rate": 
4.73204e-05, "loss": 0.2248, "step": 381700 }, { "epoch": 0.0636, "grad_norm": 0.16113696992397308, "learning_rate": 4.72804e-05, "loss": 0.2205, "step": 381800 }, { "epoch": 0.0638, "grad_norm": 0.17868146300315857, "learning_rate": 4.72404e-05, "loss": 0.2205, "step": 381900 }, { "epoch": 0.064, "grad_norm": 0.17537720501422882, "learning_rate": 4.72004e-05, "loss": 0.2271, "step": 382000 }, { "epoch": 0.0642, "grad_norm": 0.17318007349967957, "learning_rate": 4.71604e-05, "loss": 0.2282, "step": 382100 }, { "epoch": 0.0644, "grad_norm": 0.14845995604991913, "learning_rate": 4.7120400000000004e-05, "loss": 0.2247, "step": 382200 }, { "epoch": 0.0646, "grad_norm": 0.17553631961345673, "learning_rate": 4.7080400000000004e-05, "loss": 0.2244, "step": 382300 }, { "epoch": 0.0648, "grad_norm": 0.20734809339046478, "learning_rate": 4.7040400000000005e-05, "loss": 0.2221, "step": 382400 }, { "epoch": 0.065, "grad_norm": 0.1688157171010971, "learning_rate": 4.70004e-05, "loss": 0.2225, "step": 382500 }, { "epoch": 0.0652, "grad_norm": 0.17154380679130554, "learning_rate": 4.6960400000000005e-05, "loss": 0.2267, "step": 382600 }, { "epoch": 0.0654, "grad_norm": 0.19217541813850403, "learning_rate": 4.6920400000000006e-05, "loss": 0.2263, "step": 382700 }, { "epoch": 0.0656, "grad_norm": 0.1954587697982788, "learning_rate": 4.68804e-05, "loss": 0.2242, "step": 382800 }, { "epoch": 0.0658, "grad_norm": 0.18474943935871124, "learning_rate": 4.68404e-05, "loss": 0.2257, "step": 382900 }, { "epoch": 0.066, "grad_norm": 0.1891472041606903, "learning_rate": 4.68004e-05, "loss": 0.2259, "step": 383000 }, { "epoch": 0.0662, "grad_norm": 0.16297364234924316, "learning_rate": 4.676040000000001e-05, "loss": 0.2239, "step": 383100 }, { "epoch": 0.0664, "grad_norm": 0.26741859316825867, "learning_rate": 4.67204e-05, "loss": 0.2265, "step": 383200 }, { "epoch": 0.0666, "grad_norm": 0.22171659767627716, "learning_rate": 4.66804e-05, "loss": 0.2272, "step": 383300 }, { "epoch": 0.0668, 
"grad_norm": 0.16574308276176453, "learning_rate": 4.66404e-05, "loss": 0.2272, "step": 383400 }, { "epoch": 0.067, "grad_norm": 0.15950843691825867, "learning_rate": 4.66004e-05, "loss": 0.2259, "step": 383500 }, { "epoch": 0.0672, "grad_norm": 0.1801784336566925, "learning_rate": 4.65604e-05, "loss": 0.2277, "step": 383600 }, { "epoch": 0.0674, "grad_norm": 0.14734512567520142, "learning_rate": 4.65204e-05, "loss": 0.224, "step": 383700 }, { "epoch": 0.0676, "grad_norm": 0.20324218273162842, "learning_rate": 4.64804e-05, "loss": 0.2257, "step": 383800 }, { "epoch": 0.0678, "grad_norm": 0.190351203083992, "learning_rate": 4.6440400000000003e-05, "loss": 0.2238, "step": 383900 }, { "epoch": 0.068, "grad_norm": 0.17872920632362366, "learning_rate": 4.6400400000000004e-05, "loss": 0.2253, "step": 384000 }, { "epoch": 0.0682, "grad_norm": 0.1632169634103775, "learning_rate": 4.6360400000000004e-05, "loss": 0.2296, "step": 384100 }, { "epoch": 0.0684, "grad_norm": 0.219464972615242, "learning_rate": 4.6320400000000005e-05, "loss": 0.2267, "step": 384200 }, { "epoch": 0.0686, "grad_norm": 0.19538037478923798, "learning_rate": 4.62804e-05, "loss": 0.2216, "step": 384300 }, { "epoch": 0.0688, "grad_norm": 0.16144002974033356, "learning_rate": 4.62404e-05, "loss": 0.2254, "step": 384400 }, { "epoch": 0.069, "grad_norm": 0.254450261592865, "learning_rate": 4.6200400000000006e-05, "loss": 0.2295, "step": 384500 }, { "epoch": 0.0692, "grad_norm": 0.17138288915157318, "learning_rate": 4.6160400000000006e-05, "loss": 0.2254, "step": 384600 }, { "epoch": 0.0694, "grad_norm": 0.22051529586315155, "learning_rate": 4.61204e-05, "loss": 0.2305, "step": 384700 }, { "epoch": 0.0696, "grad_norm": 0.17428357899188995, "learning_rate": 4.60804e-05, "loss": 0.2237, "step": 384800 }, { "epoch": 0.0698, "grad_norm": 0.16001330316066742, "learning_rate": 4.60404e-05, "loss": 0.2286, "step": 384900 }, { "epoch": 0.07, "grad_norm": 0.16556522250175476, "learning_rate": 4.600040000000001e-05, 
"loss": 0.2301, "step": 385000 }, { "epoch": 0.0702, "grad_norm": null, "learning_rate": 4.59604e-05, "loss": 0.2256, "step": 385100 }, { "epoch": 0.0704, "grad_norm": 0.272354394197464, "learning_rate": 4.59204e-05, "loss": 0.229, "step": 385200 }, { "epoch": 0.0706, "grad_norm": 0.28128334879875183, "learning_rate": 4.58804e-05, "loss": 0.2292, "step": 385300 }, { "epoch": 0.0708, "grad_norm": 0.1840551495552063, "learning_rate": 4.58404e-05, "loss": 0.2297, "step": 385400 }, { "epoch": 0.071, "grad_norm": 0.13684575259685516, "learning_rate": 4.58004e-05, "loss": 0.2253, "step": 385500 }, { "epoch": 0.0712, "grad_norm": 0.17495335638523102, "learning_rate": 4.57604e-05, "loss": 0.2241, "step": 385600 }, { "epoch": 0.0714, "grad_norm": 0.2242329716682434, "learning_rate": 4.5720400000000004e-05, "loss": 0.2238, "step": 385700 }, { "epoch": 0.0716, "grad_norm": 0.20640377700328827, "learning_rate": 4.5680400000000004e-05, "loss": 0.2274, "step": 385800 }, { "epoch": 0.0718, "grad_norm": 0.19061022996902466, "learning_rate": 4.56404e-05, "loss": 0.2252, "step": 385900 }, { "epoch": 0.072, "grad_norm": 0.3011106550693512, "learning_rate": 4.5600400000000005e-05, "loss": 0.2245, "step": 386000 }, { "epoch": 0.0722, "grad_norm": 0.19182322919368744, "learning_rate": 4.5560400000000005e-05, "loss": 0.2285, "step": 386100 }, { "epoch": 0.0724, "grad_norm": 0.15975511074066162, "learning_rate": 4.55204e-05, "loss": 0.2298, "step": 386200 }, { "epoch": 0.0726, "grad_norm": 0.16059580445289612, "learning_rate": 4.54804e-05, "loss": 0.2273, "step": 386300 }, { "epoch": 0.0728, "grad_norm": 0.1402633935213089, "learning_rate": 4.54404e-05, "loss": 0.2276, "step": 386400 }, { "epoch": 0.073, "grad_norm": 0.3752526044845581, "learning_rate": 4.540040000000001e-05, "loss": 0.2246, "step": 386500 }, { "epoch": 0.0732, "grad_norm": 0.15643823146820068, "learning_rate": 4.53604e-05, "loss": 0.2244, "step": 386600 }, { "epoch": 0.0734, "grad_norm": 0.16925235092639923, 
"learning_rate": 4.53204e-05, "loss": 0.2266, "step": 386700 }, { "epoch": 0.0736, "grad_norm": 0.27427610754966736, "learning_rate": 4.52804e-05, "loss": 0.226, "step": 386800 }, { "epoch": 0.0738, "grad_norm": 0.175080806016922, "learning_rate": 4.52404e-05, "loss": 0.2286, "step": 386900 }, { "epoch": 0.074, "grad_norm": 0.23322616517543793, "learning_rate": 4.52004e-05, "loss": 0.2275, "step": 387000 }, { "epoch": 0.0742, "grad_norm": 0.17752708494663239, "learning_rate": 4.51604e-05, "loss": 0.2277, "step": 387100 }, { "epoch": 0.0744, "grad_norm": 0.18653720617294312, "learning_rate": 4.51204e-05, "loss": 0.2278, "step": 387200 }, { "epoch": 0.0746, "grad_norm": 0.24310079216957092, "learning_rate": 4.50804e-05, "loss": 0.2267, "step": 387300 }, { "epoch": 0.0748, "grad_norm": 0.16927888989448547, "learning_rate": 4.50404e-05, "loss": 0.2263, "step": 387400 }, { "epoch": 0.075, "grad_norm": 0.21127398312091827, "learning_rate": 4.5000400000000004e-05, "loss": 0.2318, "step": 387500 }, { "epoch": 0.0752, "grad_norm": 0.1537827104330063, "learning_rate": 4.4960400000000004e-05, "loss": 0.228, "step": 387600 }, { "epoch": 0.0754, "grad_norm": 0.2062017023563385, "learning_rate": 4.4920400000000004e-05, "loss": 0.2266, "step": 387700 }, { "epoch": 0.0756, "grad_norm": 0.1648017019033432, "learning_rate": 4.48804e-05, "loss": 0.2262, "step": 387800 }, { "epoch": 0.0758, "grad_norm": 0.17811764776706696, "learning_rate": 4.4840400000000005e-05, "loss": 0.2265, "step": 387900 }, { "epoch": 0.076, "grad_norm": 0.25383955240249634, "learning_rate": 4.4800400000000006e-05, "loss": 0.2307, "step": 388000 }, { "epoch": 0.0762, "grad_norm": 0.16029523313045502, "learning_rate": 4.47604e-05, "loss": 0.2271, "step": 388100 }, { "epoch": 0.0764, "grad_norm": 0.3880062699317932, "learning_rate": 4.47204e-05, "loss": 0.2219, "step": 388200 }, { "epoch": 0.0766, "grad_norm": 0.14046251773834229, "learning_rate": 4.46804e-05, "loss": 0.2272, "step": 388300 }, { "epoch": 0.0768, 
"grad_norm": 0.20129117369651794, "learning_rate": 4.464040000000001e-05, "loss": 0.2276, "step": 388400 }, { "epoch": 0.077, "grad_norm": 0.1550118327140808, "learning_rate": 4.46004e-05, "loss": 0.2253, "step": 388500 }, { "epoch": 0.0772, "grad_norm": 0.196579709649086, "learning_rate": 4.45604e-05, "loss": 0.2239, "step": 388600 }, { "epoch": 0.0774, "grad_norm": 0.23339785635471344, "learning_rate": 4.45204e-05, "loss": 0.2279, "step": 388700 }, { "epoch": 0.0776, "grad_norm": 0.20383737981319427, "learning_rate": 4.44804e-05, "loss": 0.2268, "step": 388800 }, { "epoch": 0.0778, "grad_norm": 0.15796469151973724, "learning_rate": 4.44404e-05, "loss": 0.2258, "step": 388900 }, { "epoch": 0.078, "grad_norm": 0.1844768226146698, "learning_rate": 4.44004e-05, "loss": 0.2325, "step": 389000 }, { "epoch": 0.0782, "grad_norm": 0.15635459125041962, "learning_rate": 4.43604e-05, "loss": 0.2297, "step": 389100 }, { "epoch": 0.0784, "grad_norm": 0.1638452708721161, "learning_rate": 4.4320400000000003e-05, "loss": 0.2229, "step": 389200 }, { "epoch": 0.0786, "grad_norm": 0.2581493854522705, "learning_rate": 4.42804e-05, "loss": 0.2257, "step": 389300 }, { "epoch": 0.0788, "grad_norm": 0.2030574083328247, "learning_rate": 4.4240400000000004e-05, "loss": 0.2282, "step": 389400 }, { "epoch": 0.079, "grad_norm": 0.14893430471420288, "learning_rate": 4.4200400000000005e-05, "loss": 0.2285, "step": 389500 }, { "epoch": 0.0792, "grad_norm": 0.15095509588718414, "learning_rate": 4.4160400000000005e-05, "loss": 0.2303, "step": 389600 }, { "epoch": 0.0794, "grad_norm": 0.20052938163280487, "learning_rate": 4.41204e-05, "loss": 0.2293, "step": 389700 }, { "epoch": 0.0796, "grad_norm": 0.2110019326210022, "learning_rate": 4.40804e-05, "loss": 0.2284, "step": 389800 }, { "epoch": 0.0798, "grad_norm": 0.18157057464122772, "learning_rate": 4.4040400000000006e-05, "loss": 0.2306, "step": 389900 }, { "epoch": 0.08, "grad_norm": 0.21049971878528595, "learning_rate": 4.40004e-05, "loss": 
0.2289, "step": 390000 }, { "epoch": 0.0802, "grad_norm": 0.18213237822055817, "learning_rate": 4.39604e-05, "loss": 0.2295, "step": 390100 }, { "epoch": 0.0804, "grad_norm": 0.18442107737064362, "learning_rate": 4.39204e-05, "loss": 0.2284, "step": 390200 }, { "epoch": 0.0806, "grad_norm": 0.18602213263511658, "learning_rate": 4.388040000000001e-05, "loss": 0.2363, "step": 390300 }, { "epoch": 0.0808, "grad_norm": 0.17077329754829407, "learning_rate": 4.38404e-05, "loss": 0.2298, "step": 390400 }, { "epoch": 0.081, "grad_norm": 0.20275497436523438, "learning_rate": 4.38004e-05, "loss": 0.2323, "step": 390500 }, { "epoch": 0.0812, "grad_norm": 0.17397406697273254, "learning_rate": 4.37604e-05, "loss": 0.2261, "step": 390600 }, { "epoch": 0.0814, "grad_norm": 0.14256907999515533, "learning_rate": 4.37204e-05, "loss": 0.2278, "step": 390700 }, { "epoch": 0.0816, "grad_norm": 0.1629251092672348, "learning_rate": 4.36804e-05, "loss": 0.2259, "step": 390800 }, { "epoch": 0.0818, "grad_norm": 0.2366534024477005, "learning_rate": 4.36404e-05, "loss": 0.2315, "step": 390900 }, { "epoch": 0.082, "grad_norm": 0.16596932709217072, "learning_rate": 4.3600400000000004e-05, "loss": 0.2259, "step": 391000 }, { "epoch": 0.0822, "grad_norm": 0.1851380616426468, "learning_rate": 4.3560400000000004e-05, "loss": 0.2209, "step": 391100 }, { "epoch": 0.0824, "grad_norm": 0.1714228391647339, "learning_rate": 4.35204e-05, "loss": 0.2256, "step": 391200 }, { "epoch": 0.0826, "grad_norm": 0.14281560480594635, "learning_rate": 4.3480400000000005e-05, "loss": 0.2262, "step": 391300 }, { "epoch": 0.0828, "grad_norm": 0.17006812989711761, "learning_rate": 4.3440400000000005e-05, "loss": 0.2278, "step": 391400 }, { "epoch": 0.083, "grad_norm": 0.2721594572067261, "learning_rate": 4.3400400000000005e-05, "loss": 0.2244, "step": 391500 }, { "epoch": 0.0832, "grad_norm": 0.15255306661128998, "learning_rate": 4.33604e-05, "loss": 0.2265, "step": 391600 }, { "epoch": 0.0834, "grad_norm": 
0.1884760856628418, "learning_rate": 4.33204e-05, "loss": 0.2287, "step": 391700 }, { "epoch": 0.0836, "grad_norm": 0.3142029941082001, "learning_rate": 4.328040000000001e-05, "loss": 0.227, "step": 391800 }, { "epoch": 0.0838, "grad_norm": 0.19981245696544647, "learning_rate": 4.32404e-05, "loss": 0.2277, "step": 391900 }, { "epoch": 0.084, "grad_norm": 0.206933856010437, "learning_rate": 4.32004e-05, "loss": 0.2298, "step": 392000 }, { "epoch": 0.0842, "grad_norm": 0.17843426764011383, "learning_rate": 4.31604e-05, "loss": 0.2258, "step": 392100 }, { "epoch": 0.0844, "grad_norm": 0.18036629259586334, "learning_rate": 4.31204e-05, "loss": 0.2289, "step": 392200 }, { "epoch": 0.0846, "grad_norm": 0.17636817693710327, "learning_rate": 4.30804e-05, "loss": 0.226, "step": 392300 }, { "epoch": 0.0848, "grad_norm": 0.1490882784128189, "learning_rate": 4.30404e-05, "loss": 0.2267, "step": 392400 }, { "epoch": 0.085, "grad_norm": 0.16680997610092163, "learning_rate": 4.30004e-05, "loss": 0.226, "step": 392500 }, { "epoch": 0.0852, "grad_norm": 0.1657077521085739, "learning_rate": 4.29604e-05, "loss": 0.2267, "step": 392600 }, { "epoch": 0.0854, "grad_norm": 0.2529284656047821, "learning_rate": 4.2920399999999997e-05, "loss": 0.2281, "step": 392700 }, { "epoch": 0.0856, "grad_norm": 0.16982971131801605, "learning_rate": 4.2880400000000004e-05, "loss": 0.2256, "step": 392800 }, { "epoch": 0.0858, "grad_norm": 0.14875228703022003, "learning_rate": 4.2840400000000004e-05, "loss": 0.2272, "step": 392900 }, { "epoch": 0.086, "grad_norm": 0.2149345576763153, "learning_rate": 4.2800400000000004e-05, "loss": 0.2302, "step": 393000 }, { "epoch": 0.0862, "grad_norm": 0.17671360075473785, "learning_rate": 4.27604e-05, "loss": 0.2254, "step": 393100 }, { "epoch": 0.0864, "grad_norm": 0.17670981585979462, "learning_rate": 4.27204e-05, "loss": 0.2294, "step": 393200 }, { "epoch": 0.0866, "grad_norm": 0.16914287209510803, "learning_rate": 4.2680400000000006e-05, "loss": 0.2436, "step": 
393300 }, { "epoch": 0.0868, "grad_norm": 0.17037755250930786, "learning_rate": 4.26404e-05, "loss": 0.2279, "step": 393400 }, { "epoch": 0.087, "grad_norm": 0.1500585824251175, "learning_rate": 4.26004e-05, "loss": 0.2252, "step": 393500 }, { "epoch": 0.0872, "grad_norm": 0.1869862973690033, "learning_rate": 4.25604e-05, "loss": 0.2271, "step": 393600 }, { "epoch": 0.0874, "grad_norm": 0.16152675449848175, "learning_rate": 4.252040000000001e-05, "loss": 0.229, "step": 393700 }, { "epoch": 0.0876, "grad_norm": 0.19350165128707886, "learning_rate": 4.24804e-05, "loss": 0.2271, "step": 393800 }, { "epoch": 0.0878, "grad_norm": 0.16843397915363312, "learning_rate": 4.24404e-05, "loss": 0.2272, "step": 393900 }, { "epoch": 0.088, "grad_norm": 0.17295992374420166, "learning_rate": 4.24004e-05, "loss": 0.2252, "step": 394000 }, { "epoch": 0.0882, "grad_norm": 0.2227911800146103, "learning_rate": 4.23604e-05, "loss": 0.2327, "step": 394100 }, { "epoch": 0.0884, "grad_norm": 0.22953267395496368, "learning_rate": 4.23204e-05, "loss": 0.2279, "step": 394200 }, { "epoch": 0.0886, "grad_norm": 0.22172044217586517, "learning_rate": 4.22804e-05, "loss": 0.2307, "step": 394300 }, { "epoch": 0.0888, "grad_norm": 0.1599377691745758, "learning_rate": 4.22404e-05, "loss": 0.2273, "step": 394400 }, { "epoch": 0.089, "grad_norm": 0.1549975574016571, "learning_rate": 4.2200400000000003e-05, "loss": 0.2267, "step": 394500 }, { "epoch": 0.0892, "grad_norm": 0.1666010618209839, "learning_rate": 4.21604e-05, "loss": 0.226, "step": 394600 }, { "epoch": 0.0894, "grad_norm": 0.1673179715871811, "learning_rate": 4.2120400000000004e-05, "loss": 0.2289, "step": 394700 }, { "epoch": 0.0896, "grad_norm": 0.16998769342899323, "learning_rate": 4.2080400000000005e-05, "loss": 0.2295, "step": 394800 }, { "epoch": 0.0898, "grad_norm": 0.14935901761054993, "learning_rate": 4.2040400000000005e-05, "loss": 0.2252, "step": 394900 }, { "epoch": 0.09, "grad_norm": 0.14684082567691803, "learning_rate": 
4.20004e-05, "loss": 0.2277, "step": 395000 }, { "epoch": 0.0902, "grad_norm": 0.2135782241821289, "learning_rate": 4.19604e-05, "loss": 0.2292, "step": 395100 }, { "epoch": 0.0904, "grad_norm": 0.19524498283863068, "learning_rate": 4.1920400000000006e-05, "loss": 0.2283, "step": 395200 }, { "epoch": 0.0906, "grad_norm": 0.19192557036876678, "learning_rate": 4.18804e-05, "loss": 0.2271, "step": 395300 }, { "epoch": 0.0908, "grad_norm": 0.1547107994556427, "learning_rate": 4.18404e-05, "loss": 0.2292, "step": 395400 }, { "epoch": 0.091, "grad_norm": 0.15521804988384247, "learning_rate": 4.18004e-05, "loss": 0.2262, "step": 395500 }, { "epoch": 0.0912, "grad_norm": 0.17244058847427368, "learning_rate": 4.17604e-05, "loss": 0.226, "step": 395600 }, { "epoch": 0.0914, "grad_norm": 0.17803426086902618, "learning_rate": 4.17204e-05, "loss": 0.2264, "step": 395700 }, { "epoch": 0.0916, "grad_norm": 0.17432081699371338, "learning_rate": 4.16804e-05, "loss": 0.2299, "step": 395800 }, { "epoch": 0.0918, "grad_norm": 0.23937202990055084, "learning_rate": 4.16404e-05, "loss": 0.2252, "step": 395900 }, { "epoch": 0.092, "grad_norm": 0.17643794417381287, "learning_rate": 4.16004e-05, "loss": 0.2253, "step": 396000 }, { "epoch": 0.0922, "grad_norm": 0.1799926906824112, "learning_rate": 4.1560399999999996e-05, "loss": 0.2311, "step": 396100 }, { "epoch": 0.0924, "grad_norm": 0.14338967204093933, "learning_rate": 4.15204e-05, "loss": 0.2262, "step": 396200 }, { "epoch": 0.0926, "grad_norm": 0.21923920512199402, "learning_rate": 4.1480400000000004e-05, "loss": 0.2252, "step": 396300 }, { "epoch": 0.0928, "grad_norm": 0.1880873441696167, "learning_rate": 4.1440400000000004e-05, "loss": 0.2258, "step": 396400 }, { "epoch": 0.093, "grad_norm": 0.15461334586143494, "learning_rate": 4.14004e-05, "loss": 0.2256, "step": 396500 }, { "epoch": 0.0932, "grad_norm": 0.15690234303474426, "learning_rate": 4.1360400000000005e-05, "loss": 0.2257, "step": 396600 }, { "epoch": 0.0934, "grad_norm": 
0.15058964490890503, "learning_rate": 4.1320400000000005e-05, "loss": 0.2286, "step": 396700 }, { "epoch": 0.0936, "grad_norm": 0.17528659105300903, "learning_rate": 4.1280400000000005e-05, "loss": 0.2278, "step": 396800 }, { "epoch": 0.0938, "grad_norm": 0.30279967188835144, "learning_rate": 4.12404e-05, "loss": 0.2242, "step": 396900 }, { "epoch": 0.094, "grad_norm": 0.19836954772472382, "learning_rate": 4.12004e-05, "loss": 0.2264, "step": 397000 }, { "epoch": 0.0942, "grad_norm": 0.1697409451007843, "learning_rate": 4.1160400000000007e-05, "loss": 0.2273, "step": 397100 }, { "epoch": 0.0944, "grad_norm": 0.1903415024280548, "learning_rate": 4.11204e-05, "loss": 0.2318, "step": 397200 }, { "epoch": 0.0946, "grad_norm": 0.16050055623054504, "learning_rate": 4.10804e-05, "loss": 0.229, "step": 397300 }, { "epoch": 0.0948, "grad_norm": 0.1549975872039795, "learning_rate": 4.10404e-05, "loss": 0.2235, "step": 397400 }, { "epoch": 0.095, "grad_norm": 0.22184526920318604, "learning_rate": 4.10004e-05, "loss": 0.2278, "step": 397500 }, { "epoch": 0.0952, "grad_norm": 0.16259317100048065, "learning_rate": 4.09604e-05, "loss": 0.2276, "step": 397600 }, { "epoch": 0.0954, "grad_norm": 0.20985530316829681, "learning_rate": 4.09204e-05, "loss": 0.227, "step": 397700 }, { "epoch": 0.0956, "grad_norm": 0.1478949785232544, "learning_rate": 4.08804e-05, "loss": 0.231, "step": 397800 }, { "epoch": 0.0958, "grad_norm": 0.20481759309768677, "learning_rate": 4.08404e-05, "loss": 0.2252, "step": 397900 }, { "epoch": 0.096, "grad_norm": 0.16993680596351624, "learning_rate": 4.0800399999999996e-05, "loss": 0.2241, "step": 398000 }, { "epoch": 0.0962, "grad_norm": 0.2794969379901886, "learning_rate": 4.0760400000000004e-05, "loss": 0.2303, "step": 398100 }, { "epoch": 0.0964, "grad_norm": 0.1929108202457428, "learning_rate": 4.0720400000000004e-05, "loss": 0.2284, "step": 398200 }, { "epoch": 0.0966, "grad_norm": 0.16150234639644623, "learning_rate": 4.0680400000000004e-05, "loss": 
0.2217, "step": 398300 }, { "epoch": 0.0968, "grad_norm": 0.15625301003456116, "learning_rate": 4.06404e-05, "loss": 0.2255, "step": 398400 }, { "epoch": 0.097, "grad_norm": 0.16894198954105377, "learning_rate": 4.06004e-05, "loss": 0.2228, "step": 398500 }, { "epoch": 0.0972, "grad_norm": 0.17462077736854553, "learning_rate": 4.0560400000000006e-05, "loss": 0.2255, "step": 398600 }, { "epoch": 0.0974, "grad_norm": 0.2863595187664032, "learning_rate": 4.0520400000000006e-05, "loss": 0.2272, "step": 398700 }, { "epoch": 0.0976, "grad_norm": 0.16492559015750885, "learning_rate": 4.04804e-05, "loss": 0.2275, "step": 398800 }, { "epoch": 0.0978, "grad_norm": 0.14487357437610626, "learning_rate": 4.04404e-05, "loss": 0.2248, "step": 398900 }, { "epoch": 0.098, "grad_norm": 0.22204236686229706, "learning_rate": 4.04004e-05, "loss": 0.2293, "step": 399000 }, { "epoch": 0.0982, "grad_norm": 0.16108718514442444, "learning_rate": 4.03604e-05, "loss": 0.2224, "step": 399100 }, { "epoch": 0.0984, "grad_norm": 0.1740008294582367, "learning_rate": 4.03204e-05, "loss": 0.2246, "step": 399200 }, { "epoch": 0.0986, "grad_norm": 0.1709638088941574, "learning_rate": 4.02804e-05, "loss": 0.2249, "step": 399300 }, { "epoch": 0.0988, "grad_norm": 0.2050946056842804, "learning_rate": 4.02404e-05, "loss": 0.2158, "step": 399400 }, { "epoch": 0.099, "grad_norm": 0.1883661299943924, "learning_rate": 4.02004e-05, "loss": 0.226, "step": 399500 }, { "epoch": 0.0992, "grad_norm": 0.17286445200443268, "learning_rate": 4.01604e-05, "loss": 0.224, "step": 399600 }, { "epoch": 0.0994, "grad_norm": 0.13540473580360413, "learning_rate": 4.01204e-05, "loss": 0.2284, "step": 399700 }, { "epoch": 0.0996, "grad_norm": 0.1601712554693222, "learning_rate": 4.00804e-05, "loss": 0.2252, "step": 399800 }, { "epoch": 0.0998, "grad_norm": 0.2265101969242096, "learning_rate": 4.00404e-05, "loss": 0.2211, "step": 399900 }, { "epoch": 0.1, "grad_norm": 0.18272989988327026, "learning_rate": 4.0000400000000004e-05, 
"loss": 0.2229, "step": 400000 }, { "epoch": 0.1002, "grad_norm": 0.18946868181228638, "learning_rate": 3.9960400000000005e-05, "loss": 0.2258, "step": 400100 }, { "epoch": 0.1004, "grad_norm": 0.1850133091211319, "learning_rate": 3.9920400000000005e-05, "loss": 0.2277, "step": 400200 }, { "epoch": 0.1006, "grad_norm": 0.18626666069030762, "learning_rate": 3.98804e-05, "loss": 0.2251, "step": 400300 }, { "epoch": 0.1008, "grad_norm": 0.18756605684757233, "learning_rate": 3.98404e-05, "loss": 0.2252, "step": 400400 }, { "epoch": 0.101, "grad_norm": 0.17669208347797394, "learning_rate": 3.9800400000000006e-05, "loss": 0.2292, "step": 400500 }, { "epoch": 0.1012, "grad_norm": 0.1939968466758728, "learning_rate": 3.9760400000000006e-05, "loss": 0.2247, "step": 400600 }, { "epoch": 0.1014, "grad_norm": 0.18509630858898163, "learning_rate": 3.97204e-05, "loss": 0.226, "step": 400700 }, { "epoch": 0.1016, "grad_norm": 0.20480674505233765, "learning_rate": 3.96804e-05, "loss": 0.2238, "step": 400800 }, { "epoch": 0.1018, "grad_norm": 0.21943143010139465, "learning_rate": 3.96404e-05, "loss": 0.2237, "step": 400900 }, { "epoch": 0.102, "grad_norm": 0.15782029926776886, "learning_rate": 3.96004e-05, "loss": 0.2272, "step": 401000 }, { "epoch": 0.1022, "grad_norm": 0.19730308651924133, "learning_rate": 3.95604e-05, "loss": 0.2259, "step": 401100 }, { "epoch": 0.1024, "grad_norm": 0.18070414662361145, "learning_rate": 3.95204e-05, "loss": 0.2232, "step": 401200 }, { "epoch": 0.1026, "grad_norm": 0.14294645190238953, "learning_rate": 3.94804e-05, "loss": 0.2255, "step": 401300 }, { "epoch": 0.1028, "grad_norm": 0.19923175871372223, "learning_rate": 3.94404e-05, "loss": 0.2247, "step": 401400 }, { "epoch": 0.103, "grad_norm": 0.15311995148658752, "learning_rate": 3.94004e-05, "loss": 0.2227, "step": 401500 }, { "epoch": 0.1032, "grad_norm": 0.181657612323761, "learning_rate": 3.9360400000000004e-05, "loss": 0.2258, "step": 401600 }, { "epoch": 0.1034, "grad_norm": 
0.2274169921875, "learning_rate": 3.9320400000000004e-05, "loss": 0.2317, "step": 401700 }, { "epoch": 0.1036, "grad_norm": 0.2559068202972412, "learning_rate": 3.92804e-05, "loss": 0.2246, "step": 401800 }, { "epoch": 0.1038, "grad_norm": 0.1480039358139038, "learning_rate": 3.92404e-05, "loss": 0.2339, "step": 401900 }, { "epoch": 0.104, "grad_norm": 0.1887112855911255, "learning_rate": 3.9200400000000005e-05, "loss": 0.2268, "step": 402000 }, { "epoch": 0.1042, "grad_norm": 0.1991635113954544, "learning_rate": 3.9160400000000005e-05, "loss": 0.24, "step": 402100 }, { "epoch": 0.1044, "grad_norm": 0.19259975850582123, "learning_rate": 3.91204e-05, "loss": 0.2319, "step": 402200 }, { "epoch": 0.1046, "grad_norm": 0.16919143497943878, "learning_rate": 3.90804e-05, "loss": 0.2268, "step": 402300 }, { "epoch": 0.1048, "grad_norm": 0.24544304609298706, "learning_rate": 3.90404e-05, "loss": 0.2299, "step": 402400 }, { "epoch": 0.105, "grad_norm": 0.21261242032051086, "learning_rate": 3.900040000000001e-05, "loss": 0.2239, "step": 402500 }, { "epoch": 0.1052, "grad_norm": 0.19854722917079926, "learning_rate": 3.89604e-05, "loss": 0.2317, "step": 402600 }, { "epoch": 0.1054, "grad_norm": 0.29832783341407776, "learning_rate": 3.89204e-05, "loss": 0.2272, "step": 402700 }, { "epoch": 0.1056, "grad_norm": 0.18028579652309418, "learning_rate": 3.88804e-05, "loss": 0.2314, "step": 402800 }, { "epoch": 0.1058, "grad_norm": 0.18919360637664795, "learning_rate": 3.88404e-05, "loss": 0.224, "step": 402900 }, { "epoch": 0.106, "grad_norm": 0.21304070949554443, "learning_rate": 3.88004e-05, "loss": 0.2299, "step": 403000 }, { "epoch": 0.1062, "grad_norm": 0.18480710685253143, "learning_rate": 3.87604e-05, "loss": 0.2302, "step": 403100 }, { "epoch": 0.1064, "grad_norm": 0.38507312536239624, "learning_rate": 3.87204e-05, "loss": 0.2338, "step": 403200 }, { "epoch": 0.1066, "grad_norm": 0.15473273396492004, "learning_rate": 3.86804e-05, "loss": 0.226, "step": 403300 }, { "epoch": 
0.1068, "grad_norm": 0.22138014435768127, "learning_rate": 3.8640400000000004e-05, "loss": 0.2265, "step": 403400 }, { "epoch": 0.107, "grad_norm": 0.22224438190460205, "learning_rate": 3.8600400000000004e-05, "loss": 0.2282, "step": 403500 }, { "epoch": 0.1072, "grad_norm": 0.16137254238128662, "learning_rate": 3.8560400000000004e-05, "loss": 0.2282, "step": 403600 }, { "epoch": 0.1074, "grad_norm": 0.18113075196743011, "learning_rate": 3.85204e-05, "loss": 0.2261, "step": 403700 }, { "epoch": 0.1076, "grad_norm": 0.17297004163265228, "learning_rate": 3.84804e-05, "loss": 0.2268, "step": 403800 }, { "epoch": 0.1078, "grad_norm": 0.20817789435386658, "learning_rate": 3.8440400000000006e-05, "loss": 0.2258, "step": 403900 }, { "epoch": 0.108, "grad_norm": 0.20474325120449066, "learning_rate": 3.8400400000000006e-05, "loss": 0.2257, "step": 404000 }, { "epoch": 0.1082, "grad_norm": 0.15835532546043396, "learning_rate": 3.83604e-05, "loss": 0.227, "step": 404100 }, { "epoch": 0.1084, "grad_norm": 0.17614057660102844, "learning_rate": 3.83204e-05, "loss": 0.226, "step": 404200 }, { "epoch": 0.1086, "grad_norm": 0.17774057388305664, "learning_rate": 3.82804e-05, "loss": 0.225, "step": 404300 }, { "epoch": 0.1088, "grad_norm": 0.1751686930656433, "learning_rate": 3.82404e-05, "loss": 0.2274, "step": 404400 }, { "epoch": 0.109, "grad_norm": 0.20452751219272614, "learning_rate": 3.82004e-05, "loss": 0.2278, "step": 404500 }, { "epoch": 0.1092, "grad_norm": 0.1584174484014511, "learning_rate": 3.81604e-05, "loss": 0.2296, "step": 404600 }, { "epoch": 0.1094, "grad_norm": 0.23247212171554565, "learning_rate": 3.81204e-05, "loss": 0.2265, "step": 404700 }, { "epoch": 0.1096, "grad_norm": 0.19274774193763733, "learning_rate": 3.80804e-05, "loss": 0.2277, "step": 404800 }, { "epoch": 0.1098, "grad_norm": 0.19633884727954865, "learning_rate": 3.80404e-05, "loss": 0.227, "step": 404900 }, { "epoch": 0.11, "grad_norm": 0.18933461606502533, "learning_rate": 3.80004e-05, "loss": 
0.2295, "step": 405000 }, { "epoch": 0.1102, "grad_norm": 0.20844264328479767, "learning_rate": 3.79604e-05, "loss": 0.2282, "step": 405100 }, { "epoch": 0.1104, "grad_norm": 0.2164926677942276, "learning_rate": 3.7920400000000004e-05, "loss": 0.2304, "step": 405200 }, { "epoch": 0.1106, "grad_norm": 0.16100728511810303, "learning_rate": 3.78804e-05, "loss": 0.2245, "step": 405300 }, { "epoch": 0.1108, "grad_norm": 0.18901631236076355, "learning_rate": 3.7840400000000005e-05, "loss": 0.2268, "step": 405400 }, { "epoch": 0.111, "grad_norm": 0.18958155810832977, "learning_rate": 3.7800400000000005e-05, "loss": 0.2286, "step": 405500 }, { "epoch": 0.1112, "grad_norm": 0.1504233032464981, "learning_rate": 3.77604e-05, "loss": 0.2325, "step": 405600 }, { "epoch": 0.1114, "grad_norm": 0.18149876594543457, "learning_rate": 3.77204e-05, "loss": 0.2237, "step": 405700 }, { "epoch": 0.1116, "grad_norm": 0.16026699542999268, "learning_rate": 3.76804e-05, "loss": 0.2266, "step": 405800 }, { "epoch": 0.1118, "grad_norm": 0.17877326905727386, "learning_rate": 3.7640400000000006e-05, "loss": 0.2239, "step": 405900 }, { "epoch": 0.112, "grad_norm": 0.20097322762012482, "learning_rate": 3.76004e-05, "loss": 0.2278, "step": 406000 }, { "epoch": 0.1122, "grad_norm": 0.21661312878131866, "learning_rate": 3.75604e-05, "loss": 0.2278, "step": 406100 }, { "epoch": 0.1124, "grad_norm": 0.1761494129896164, "learning_rate": 3.75204e-05, "loss": 0.2249, "step": 406200 }, { "epoch": 0.1126, "grad_norm": 0.6409944295883179, "learning_rate": 3.74804e-05, "loss": 0.2274, "step": 406300 }, { "epoch": 0.1128, "grad_norm": 0.14468325674533844, "learning_rate": 3.74404e-05, "loss": 0.2252, "step": 406400 }, { "epoch": 0.113, "grad_norm": 0.1829795092344284, "learning_rate": 3.74004e-05, "loss": 0.2213, "step": 406500 }, { "epoch": 0.1132, "grad_norm": 0.18367330729961395, "learning_rate": 3.73604e-05, "loss": 0.2284, "step": 406600 }, { "epoch": 0.1134, "grad_norm": 0.16368336975574493, 
"learning_rate": 3.73204e-05, "loss": 0.2256, "step": 406700 }, { "epoch": 0.1136, "grad_norm": 0.15827390551567078, "learning_rate": 3.72804e-05, "loss": 0.2226, "step": 406800 }, { "epoch": 0.1138, "grad_norm": 0.23074659705162048, "learning_rate": 3.7240400000000003e-05, "loss": 0.2257, "step": 406900 }, { "epoch": 0.114, "grad_norm": 0.15967126190662384, "learning_rate": 3.7200400000000004e-05, "loss": 0.2229, "step": 407000 }, { "epoch": 0.1142, "grad_norm": 0.20047785341739655, "learning_rate": 3.71604e-05, "loss": 0.2253, "step": 407100 }, { "epoch": 0.1144, "grad_norm": 0.2328701764345169, "learning_rate": 3.71204e-05, "loss": 0.2216, "step": 407200 }, { "epoch": 0.1146, "grad_norm": 0.2147849202156067, "learning_rate": 3.7080400000000005e-05, "loss": 0.2309, "step": 407300 }, { "epoch": 0.1148, "grad_norm": 0.2192113846540451, "learning_rate": 3.7040400000000005e-05, "loss": 0.2279, "step": 407400 }, { "epoch": 0.115, "grad_norm": 0.1873077005147934, "learning_rate": 3.70004e-05, "loss": 0.2245, "step": 407500 }, { "epoch": 0.1152, "grad_norm": 0.16018076241016388, "learning_rate": 3.69604e-05, "loss": 0.2375, "step": 407600 }, { "epoch": 0.1154, "grad_norm": 0.3920210897922516, "learning_rate": 3.69204e-05, "loss": 0.224, "step": 407700 }, { "epoch": 0.1156, "grad_norm": 0.19828300178050995, "learning_rate": 3.688040000000001e-05, "loss": 0.232, "step": 407800 }, { "epoch": 0.1158, "grad_norm": 0.17930054664611816, "learning_rate": 3.68404e-05, "loss": 0.2226, "step": 407900 }, { "epoch": 0.116, "grad_norm": 0.16561190783977509, "learning_rate": 3.68004e-05, "loss": 0.2236, "step": 408000 }, { "epoch": 0.1162, "grad_norm": 0.20231448113918304, "learning_rate": 3.67604e-05, "loss": 0.2238, "step": 408100 }, { "epoch": 0.1164, "grad_norm": 0.1568211019039154, "learning_rate": 3.67204e-05, "loss": 0.2274, "step": 408200 }, { "epoch": 0.1166, "grad_norm": 0.2083185911178589, "learning_rate": 3.66804e-05, "loss": 0.2217, "step": 408300 }, { "epoch": 0.1168, 
"grad_norm": 0.20765464007854462, "learning_rate": 3.66404e-05, "loss": 0.2261, "step": 408400 }, { "epoch": 0.117, "grad_norm": 0.15366986393928528, "learning_rate": 3.66004e-05, "loss": 0.2243, "step": 408500 }, { "epoch": 0.1172, "grad_norm": 0.1394629180431366, "learning_rate": 3.65604e-05, "loss": 0.2254, "step": 408600 }, { "epoch": 0.1174, "grad_norm": 0.19688405096530914, "learning_rate": 3.65204e-05, "loss": 0.2244, "step": 408700 }, { "epoch": 0.1176, "grad_norm": 0.2001643031835556, "learning_rate": 3.6480400000000004e-05, "loss": 0.2238, "step": 408800 }, { "epoch": 0.1178, "grad_norm": 0.17548991739749908, "learning_rate": 3.6440400000000004e-05, "loss": 0.2296, "step": 408900 }, { "epoch": 0.118, "grad_norm": 0.1478511244058609, "learning_rate": 3.64004e-05, "loss": 0.2272, "step": 409000 }, { "epoch": 0.1182, "grad_norm": 0.17043019831180573, "learning_rate": 3.63604e-05, "loss": 0.2227, "step": 409100 }, { "epoch": 0.1184, "grad_norm": 0.16290804743766785, "learning_rate": 3.6320400000000006e-05, "loss": 0.2258, "step": 409200 }, { "epoch": 0.1186, "grad_norm": 0.17892128229141235, "learning_rate": 3.6280400000000006e-05, "loss": 0.2222, "step": 409300 }, { "epoch": 0.1188, "grad_norm": 0.14661169052124023, "learning_rate": 3.62404e-05, "loss": 0.2259, "step": 409400 }, { "epoch": 0.119, "grad_norm": 0.21006256341934204, "learning_rate": 3.62004e-05, "loss": 0.2244, "step": 409500 }, { "epoch": 0.1192, "grad_norm": 0.2831527590751648, "learning_rate": 3.61604e-05, "loss": 0.2251, "step": 409600 }, { "epoch": 0.1194, "grad_norm": 0.1787988245487213, "learning_rate": 3.612040000000001e-05, "loss": 0.2231, "step": 409700 }, { "epoch": 0.1196, "grad_norm": 0.17835716903209686, "learning_rate": 3.60804e-05, "loss": 0.2278, "step": 409800 }, { "epoch": 0.1198, "grad_norm": 0.1802866905927658, "learning_rate": 3.60404e-05, "loss": 0.225, "step": 409900 }, { "epoch": 0.12, "grad_norm": 0.15969882905483246, "learning_rate": 3.60004e-05, "loss": 0.2233, 
"step": 410000 }, { "epoch": 0.1202, "grad_norm": 0.24545249342918396, "learning_rate": 3.59604e-05, "loss": 0.2259, "step": 410100 }, { "epoch": 0.1204, "grad_norm": 0.163286954164505, "learning_rate": 3.59204e-05, "loss": 0.2271, "step": 410200 }, { "epoch": 0.1206, "grad_norm": 0.16609875857830048, "learning_rate": 3.58804e-05, "loss": 0.2328, "step": 410300 }, { "epoch": 0.1208, "grad_norm": 0.1794077455997467, "learning_rate": 3.58404e-05, "loss": 0.2259, "step": 410400 }, { "epoch": 0.121, "grad_norm": 0.15229471027851105, "learning_rate": 3.5800400000000004e-05, "loss": 0.2235, "step": 410500 }, { "epoch": 0.1212, "grad_norm": 0.2652372419834137, "learning_rate": 3.57604e-05, "loss": 0.224, "step": 410600 }, { "epoch": 0.1214, "grad_norm": 0.1757022589445114, "learning_rate": 3.5720400000000004e-05, "loss": 0.2271, "step": 410700 }, { "epoch": 0.1216, "grad_norm": 0.1457105576992035, "learning_rate": 3.5680400000000005e-05, "loss": 0.2265, "step": 410800 }, { "epoch": 0.1218, "grad_norm": 0.15136027336120605, "learning_rate": 3.56404e-05, "loss": 0.2276, "step": 410900 }, { "epoch": 0.122, "grad_norm": 0.1632482409477234, "learning_rate": 3.56004e-05, "loss": 0.2273, "step": 411000 }, { "epoch": 0.1222, "grad_norm": 0.16236582398414612, "learning_rate": 3.55604e-05, "loss": 0.2215, "step": 411100 }, { "epoch": 0.1224, "grad_norm": 0.16312521696090698, "learning_rate": 3.5520400000000006e-05, "loss": 0.2289, "step": 411200 }, { "epoch": 0.1226, "grad_norm": 0.1699019968509674, "learning_rate": 3.54804e-05, "loss": 0.2303, "step": 411300 }, { "epoch": 0.1228, "grad_norm": 0.17259664833545685, "learning_rate": 3.54404e-05, "loss": 0.2262, "step": 411400 }, { "epoch": 0.123, "grad_norm": 0.18414495885372162, "learning_rate": 3.54004e-05, "loss": 0.2232, "step": 411500 }, { "epoch": 0.1232, "grad_norm": 0.17581482231616974, "learning_rate": 3.53604e-05, "loss": 0.2287, "step": 411600 }, { "epoch": 0.1234, "grad_norm": 0.18369857966899872, "learning_rate": 
3.53204e-05, "loss": 0.2326, "step": 411700 }, { "epoch": 0.1236, "grad_norm": 0.22637276351451874, "learning_rate": 3.52804e-05, "loss": 0.2278, "step": 411800 }, { "epoch": 0.1238, "grad_norm": 0.14915701746940613, "learning_rate": 3.52404e-05, "loss": 0.2222, "step": 411900 }, { "epoch": 0.124, "grad_norm": 0.16789127886295319, "learning_rate": 3.52004e-05, "loss": 0.225, "step": 412000 }, { "epoch": 0.1242, "grad_norm": 0.16109804809093475, "learning_rate": 3.51604e-05, "loss": 0.2284, "step": 412100 }, { "epoch": 0.1244, "grad_norm": 0.19975192844867706, "learning_rate": 3.5120400000000003e-05, "loss": 0.2221, "step": 412200 }, { "epoch": 0.1246, "grad_norm": 0.18988633155822754, "learning_rate": 3.5080400000000004e-05, "loss": 0.227, "step": 412300 }, { "epoch": 0.1248, "grad_norm": 0.14888371527194977, "learning_rate": 3.5040400000000004e-05, "loss": 0.225, "step": 412400 }, { "epoch": 0.125, "grad_norm": 0.20460769534111023, "learning_rate": 3.50004e-05, "loss": 0.2202, "step": 412500 }, { "epoch": 0.1252, "grad_norm": 0.18129372596740723, "learning_rate": 3.4960400000000005e-05, "loss": 0.2261, "step": 412600 }, { "epoch": 0.1254, "grad_norm": 0.25600898265838623, "learning_rate": 3.4920400000000005e-05, "loss": 0.2264, "step": 412700 }, { "epoch": 0.1256, "grad_norm": 0.16887041926383972, "learning_rate": 3.48804e-05, "loss": 0.2247, "step": 412800 }, { "epoch": 0.1258, "grad_norm": 0.1853538155555725, "learning_rate": 3.48404e-05, "loss": 0.2283, "step": 412900 }, { "epoch": 0.126, "grad_norm": 0.17123030126094818, "learning_rate": 3.48004e-05, "loss": 0.2257, "step": 413000 }, { "epoch": 0.1262, "grad_norm": 0.1886633038520813, "learning_rate": 3.476040000000001e-05, "loss": 0.2263, "step": 413100 }, { "epoch": 0.1264, "grad_norm": 0.16311076283454895, "learning_rate": 3.47204e-05, "loss": 0.2247, "step": 413200 }, { "epoch": 0.1266, "grad_norm": 0.18546321988105774, "learning_rate": 3.46804e-05, "loss": 0.2287, "step": 413300 }, { "epoch": 0.1268, 
"grad_norm": 0.20453518629074097, "learning_rate": 3.46404e-05, "loss": 0.2248, "step": 413400 }, { "epoch": 0.127, "grad_norm": 0.16402287781238556, "learning_rate": 3.46004e-05, "loss": 0.2272, "step": 413500 }, { "epoch": 0.1272, "grad_norm": 0.31062790751457214, "learning_rate": 3.45604e-05, "loss": 0.2248, "step": 413600 }, { "epoch": 0.1274, "grad_norm": 0.1984662562608719, "learning_rate": 3.45204e-05, "loss": 0.2262, "step": 413700 }, { "epoch": 0.1276, "grad_norm": 0.3165430426597595, "learning_rate": 3.44804e-05, "loss": 0.2261, "step": 413800 }, { "epoch": 0.1278, "grad_norm": 0.1811901032924652, "learning_rate": 3.44404e-05, "loss": 0.227, "step": 413900 }, { "epoch": 0.128, "grad_norm": 0.15821123123168945, "learning_rate": 3.44004e-05, "loss": 0.2252, "step": 414000 }, { "epoch": 0.1282, "grad_norm": 0.15300792455673218, "learning_rate": 3.4360400000000004e-05, "loss": 0.2242, "step": 414100 }, { "epoch": 0.1284, "grad_norm": 0.2539973556995392, "learning_rate": 3.4320400000000004e-05, "loss": 0.2287, "step": 414200 }, { "epoch": 0.1286, "grad_norm": 0.1613328754901886, "learning_rate": 3.4280400000000005e-05, "loss": 0.2258, "step": 414300 }, { "epoch": 0.1288, "grad_norm": 0.27399247884750366, "learning_rate": 3.42404e-05, "loss": 0.2233, "step": 414400 }, { "epoch": 0.129, "grad_norm": 0.17201277613639832, "learning_rate": 3.42004e-05, "loss": 0.2243, "step": 414500 }, { "epoch": 0.1292, "grad_norm": 0.21430495381355286, "learning_rate": 3.4160400000000006e-05, "loss": 0.2295, "step": 414600 }, { "epoch": 0.1294, "grad_norm": 0.1703038066625595, "learning_rate": 3.41204e-05, "loss": 0.2274, "step": 414700 }, { "epoch": 0.1296, "grad_norm": 0.21057896316051483, "learning_rate": 3.40804e-05, "loss": 0.2275, "step": 414800 }, { "epoch": 0.1298, "grad_norm": 0.17118048667907715, "learning_rate": 3.40404e-05, "loss": 0.2261, "step": 414900 }, { "epoch": 0.13, "grad_norm": 0.15217669308185577, "learning_rate": 3.40004e-05, "loss": 0.2286, "step": 415000 
}, { "epoch": 0.1302, "grad_norm": 0.1708402931690216, "learning_rate": 3.39604e-05, "loss": 0.2276, "step": 415100 }, { "epoch": 0.1304, "grad_norm": 0.15635427832603455, "learning_rate": 3.39204e-05, "loss": 0.2271, "step": 415200 }, { "epoch": 0.1306, "grad_norm": 0.17114631831645966, "learning_rate": 3.38804e-05, "loss": 0.2289, "step": 415300 }, { "epoch": 0.1308, "grad_norm": 0.1455591470003128, "learning_rate": 3.38404e-05, "loss": 0.2314, "step": 415400 }, { "epoch": 0.131, "grad_norm": 0.19031578302383423, "learning_rate": 3.38004e-05, "loss": 0.2252, "step": 415500 }, { "epoch": 0.1312, "grad_norm": 0.19295114278793335, "learning_rate": 3.37604e-05, "loss": 0.2259, "step": 415600 }, { "epoch": 0.1314, "grad_norm": 0.18876852095127106, "learning_rate": 3.37204e-05, "loss": 0.2278, "step": 415700 }, { "epoch": 0.1316, "grad_norm": 0.20941956341266632, "learning_rate": 3.3680400000000004e-05, "loss": 0.2277, "step": 415800 }, { "epoch": 0.1318, "grad_norm": 0.16733889281749725, "learning_rate": 3.36404e-05, "loss": 0.2257, "step": 415900 }, { "epoch": 0.132, "grad_norm": 0.15858490765094757, "learning_rate": 3.3600400000000004e-05, "loss": 0.2228, "step": 416000 }, { "epoch": 0.1322, "grad_norm": 0.18554091453552246, "learning_rate": 3.3560400000000005e-05, "loss": 0.2359, "step": 416100 }, { "epoch": 0.1324, "grad_norm": 0.15781132876873016, "learning_rate": 3.35204e-05, "loss": 0.2273, "step": 416200 }, { "epoch": 0.1326, "grad_norm": 0.18581081926822662, "learning_rate": 3.34804e-05, "loss": 0.2252, "step": 416300 }, { "epoch": 0.1328, "grad_norm": 0.19088365137577057, "learning_rate": 3.34404e-05, "loss": 0.2281, "step": 416400 }, { "epoch": 0.133, "grad_norm": 0.1562078595161438, "learning_rate": 3.3400400000000006e-05, "loss": 0.2262, "step": 416500 }, { "epoch": 0.1332, "grad_norm": 0.18324877321720123, "learning_rate": 3.33604e-05, "loss": 0.2264, "step": 416600 }, { "epoch": 0.1334, "grad_norm": 0.1622045785188675, "learning_rate": 3.33204e-05, 
"loss": 0.2271, "step": 416700 }, { "epoch": 0.1336, "grad_norm": 0.16861198842525482, "learning_rate": 3.32804e-05, "loss": 0.2245, "step": 416800 }, { "epoch": 0.1338, "grad_norm": 0.20437948405742645, "learning_rate": 3.32404e-05, "loss": 0.2264, "step": 416900 }, { "epoch": 0.134, "grad_norm": 0.2063809335231781, "learning_rate": 3.32004e-05, "loss": 0.2249, "step": 417000 }, { "epoch": 0.1342, "grad_norm": 0.18307951092720032, "learning_rate": 3.31604e-05, "loss": 0.2324, "step": 417100 }, { "epoch": 0.1344, "grad_norm": 0.1892336905002594, "learning_rate": 3.31204e-05, "loss": 0.2248, "step": 417200 }, { "epoch": 0.1346, "grad_norm": 0.2223246991634369, "learning_rate": 3.30804e-05, "loss": 0.225, "step": 417300 }, { "epoch": 0.1348, "grad_norm": 0.15627120435237885, "learning_rate": 3.3040399999999996e-05, "loss": 0.2228, "step": 417400 }, { "epoch": 0.135, "grad_norm": 0.16463282704353333, "learning_rate": 3.3000400000000003e-05, "loss": 0.2257, "step": 417500 }, { "epoch": 0.1352, "grad_norm": 0.17951765656471252, "learning_rate": 3.2960400000000004e-05, "loss": 0.2234, "step": 417600 }, { "epoch": 0.1354, "grad_norm": 0.17068003118038177, "learning_rate": 3.2920400000000004e-05, "loss": 0.2244, "step": 417700 }, { "epoch": 0.1356, "grad_norm": 0.17062413692474365, "learning_rate": 3.28804e-05, "loss": 0.2224, "step": 417800 }, { "epoch": 0.1358, "grad_norm": 0.27390560507774353, "learning_rate": 3.28404e-05, "loss": 0.2274, "step": 417900 }, { "epoch": 0.136, "grad_norm": 0.15134724974632263, "learning_rate": 3.2800400000000005e-05, "loss": 0.2296, "step": 418000 }, { "epoch": 0.1362, "grad_norm": 0.161467045545578, "learning_rate": 3.27604e-05, "loss": 0.2216, "step": 418100 }, { "epoch": 0.1364, "grad_norm": 0.3517807126045227, "learning_rate": 3.27204e-05, "loss": 0.2276, "step": 418200 }, { "epoch": 0.1366, "grad_norm": 0.19215285778045654, "learning_rate": 3.26804e-05, "loss": 0.2307, "step": 418300 }, { "epoch": 0.1368, "grad_norm": 
0.18861427903175354, "learning_rate": 3.264040000000001e-05, "loss": 0.2275, "step": 418400 }, { "epoch": 0.137, "grad_norm": 0.2437637597322464, "learning_rate": 3.26004e-05, "loss": 0.2244, "step": 418500 }, { "epoch": 0.1372, "grad_norm": 0.156779944896698, "learning_rate": 3.25604e-05, "loss": 0.227, "step": 418600 }, { "epoch": 0.1374, "grad_norm": 0.14431990683078766, "learning_rate": 3.25204e-05, "loss": 0.2256, "step": 418700 }, { "epoch": 0.1376, "grad_norm": 0.2503659427165985, "learning_rate": 3.24804e-05, "loss": 0.2234, "step": 418800 }, { "epoch": 0.1378, "grad_norm": 0.20400291681289673, "learning_rate": 3.24404e-05, "loss": 0.2221, "step": 418900 }, { "epoch": 0.138, "grad_norm": 0.16242775321006775, "learning_rate": 3.24004e-05, "loss": 0.2234, "step": 419000 }, { "epoch": 0.1382, "grad_norm": 0.18688417971134186, "learning_rate": 3.23604e-05, "loss": 0.2302, "step": 419100 }, { "epoch": 0.1384, "grad_norm": 0.18453970551490784, "learning_rate": 3.23204e-05, "loss": 0.2234, "step": 419200 }, { "epoch": 0.1386, "grad_norm": 0.1676274985074997, "learning_rate": 3.22804e-05, "loss": 0.2288, "step": 419300 }, { "epoch": 0.1388, "grad_norm": 0.21078824996948242, "learning_rate": 3.2240400000000004e-05, "loss": 0.2244, "step": 419400 }, { "epoch": 0.139, "grad_norm": 0.24954631924629211, "learning_rate": 3.2200400000000004e-05, "loss": 0.2242, "step": 419500 }, { "epoch": 0.1392, "grad_norm": 0.2090296894311905, "learning_rate": 3.2160400000000005e-05, "loss": 0.2219, "step": 419600 }, { "epoch": 0.1394, "grad_norm": 0.15903155505657196, "learning_rate": 3.21204e-05, "loss": 0.2287, "step": 419700 }, { "epoch": 0.1396, "grad_norm": 0.1801076978445053, "learning_rate": 3.20804e-05, "loss": 0.224, "step": 419800 }, { "epoch": 0.1398, "grad_norm": 0.14464084804058075, "learning_rate": 3.2040400000000006e-05, "loss": 0.2256, "step": 419900 }, { "epoch": 0.14, "grad_norm": 0.1716356873512268, "learning_rate": 3.20004e-05, "loss": 0.2269, "step": 420000 }, { 
"epoch": 0.1402, "grad_norm": 0.2161165177822113, "learning_rate": 3.19604e-05, "loss": 0.2243, "step": 420100 }, { "epoch": 0.1404, "grad_norm": 0.2271643877029419, "learning_rate": 3.19204e-05, "loss": 0.2227, "step": 420200 }, { "epoch": 0.1406, "grad_norm": 0.18350888788700104, "learning_rate": 3.18804e-05, "loss": 0.2281, "step": 420300 }, { "epoch": 0.1408, "grad_norm": 0.17086632549762726, "learning_rate": 3.18404e-05, "loss": 0.2256, "step": 420400 }, { "epoch": 0.141, "grad_norm": 0.20264512300491333, "learning_rate": 3.18004e-05, "loss": 0.226, "step": 420500 }, { "epoch": 0.1412, "grad_norm": 0.1515614241361618, "learning_rate": 3.17604e-05, "loss": 0.2271, "step": 420600 }, { "epoch": 0.1414, "grad_norm": 0.19251267611980438, "learning_rate": 3.17204e-05, "loss": 0.2291, "step": 420700 }, { "epoch": 0.1416, "grad_norm": 0.1888919323682785, "learning_rate": 3.1680399999999996e-05, "loss": 0.2233, "step": 420800 }, { "epoch": 0.1418, "grad_norm": 0.18011757731437683, "learning_rate": 3.16404e-05, "loss": 0.2205, "step": 420900 }, { "epoch": 0.142, "grad_norm": 0.1989680528640747, "learning_rate": 3.16004e-05, "loss": 0.2256, "step": 421000 }, { "epoch": 0.1422, "grad_norm": 0.19918207824230194, "learning_rate": 3.1560400000000004e-05, "loss": 0.2251, "step": 421100 }, { "epoch": 0.1424, "grad_norm": 0.1635255366563797, "learning_rate": 3.15204e-05, "loss": 0.227, "step": 421200 }, { "epoch": 0.1426, "grad_norm": 0.1558133214712143, "learning_rate": 3.14804e-05, "loss": 0.2225, "step": 421300 }, { "epoch": 0.1428, "grad_norm": 0.16875283420085907, "learning_rate": 3.1440400000000005e-05, "loss": 0.2254, "step": 421400 }, { "epoch": 0.143, "grad_norm": 0.16321547329425812, "learning_rate": 3.1400400000000005e-05, "loss": 0.2272, "step": 421500 }, { "epoch": 0.1432, "grad_norm": 0.1466277837753296, "learning_rate": 3.13604e-05, "loss": 0.2234, "step": 421600 }, { "epoch": 0.1434, "grad_norm": 0.1759670078754425, "learning_rate": 3.13204e-05, "loss": 0.2229, 
"step": 421700 }, { "epoch": 0.1436, "grad_norm": 0.190858855843544, "learning_rate": 3.1280400000000006e-05, "loss": 0.2303, "step": 421800 }, { "epoch": 0.1438, "grad_norm": 0.2101619392633438, "learning_rate": 3.12404e-05, "loss": 0.223, "step": 421900 }, { "epoch": 0.144, "grad_norm": 0.19018958508968353, "learning_rate": 3.12004e-05, "loss": 0.2224, "step": 422000 }, { "epoch": 0.1442, "grad_norm": 0.14888699352741241, "learning_rate": 3.11604e-05, "loss": 0.2252, "step": 422100 }, { "epoch": 0.1444, "grad_norm": 0.20979218184947968, "learning_rate": 3.11204e-05, "loss": 0.2288, "step": 422200 }, { "epoch": 0.1446, "grad_norm": 0.1412057876586914, "learning_rate": 3.10804e-05, "loss": 0.2256, "step": 422300 }, { "epoch": 0.1448, "grad_norm": 0.19975054264068604, "learning_rate": 3.10404e-05, "loss": 0.2279, "step": 422400 }, { "epoch": 0.145, "grad_norm": 0.15945744514465332, "learning_rate": 3.10004e-05, "loss": 0.2254, "step": 422500 }, { "epoch": 0.1452, "grad_norm": 0.19987906515598297, "learning_rate": 3.09604e-05, "loss": 0.2219, "step": 422600 }, { "epoch": 0.1454, "grad_norm": 0.19047895073890686, "learning_rate": 3.0920399999999996e-05, "loss": 0.2206, "step": 422700 }, { "epoch": 0.1456, "grad_norm": 0.17094969749450684, "learning_rate": 3.08804e-05, "loss": 0.2283, "step": 422800 }, { "epoch": 0.1458, "grad_norm": 0.18042507767677307, "learning_rate": 3.0840400000000004e-05, "loss": 0.2217, "step": 422900 }, { "epoch": 0.146, "grad_norm": 0.17472079396247864, "learning_rate": 3.0800400000000004e-05, "loss": 0.2275, "step": 423000 }, { "epoch": 0.1462, "grad_norm": 0.18705900013446808, "learning_rate": 3.07604e-05, "loss": 0.2217, "step": 423100 }, { "epoch": 0.1464, "grad_norm": 0.19462667405605316, "learning_rate": 3.07204e-05, "loss": 0.2257, "step": 423200 }, { "epoch": 0.1466, "grad_norm": 0.18578322231769562, "learning_rate": 3.0680400000000005e-05, "loss": 0.2241, "step": 423300 }, { "epoch": 0.1468, "grad_norm": 0.2105281949043274, 
"learning_rate": 3.0640400000000006e-05, "loss": 0.234, "step": 423400 }, { "epoch": 0.147, "grad_norm": 0.1453637033700943, "learning_rate": 3.06004e-05, "loss": 0.2235, "step": 423500 }, { "epoch": 0.1472, "grad_norm": 0.17777366936206818, "learning_rate": 3.05604e-05, "loss": 0.2241, "step": 423600 }, { "epoch": 0.1474, "grad_norm": 0.16608352959156036, "learning_rate": 3.05204e-05, "loss": 0.2234, "step": 423700 }, { "epoch": 0.1476, "grad_norm": 0.16040940582752228, "learning_rate": 3.0480400000000004e-05, "loss": 0.2228, "step": 423800 }, { "epoch": 0.1478, "grad_norm": 0.15881648659706116, "learning_rate": 3.04404e-05, "loss": 0.2241, "step": 423900 }, { "epoch": 0.148, "grad_norm": 0.18990524113178253, "learning_rate": 3.04004e-05, "loss": 0.2263, "step": 424000 }, { "epoch": 0.1482, "grad_norm": 0.16839918494224548, "learning_rate": 3.03604e-05, "loss": 0.2238, "step": 424100 }, { "epoch": 0.1484, "grad_norm": 0.14922964572906494, "learning_rate": 3.03204e-05, "loss": 0.2238, "step": 424200 }, { "epoch": 0.1486, "grad_norm": 0.14740827679634094, "learning_rate": 3.0280400000000002e-05, "loss": 0.2266, "step": 424300 }, { "epoch": 0.1488, "grad_norm": 0.18283480405807495, "learning_rate": 3.0240400000000003e-05, "loss": 0.2267, "step": 424400 }, { "epoch": 0.149, "grad_norm": 0.22374579310417175, "learning_rate": 3.02004e-05, "loss": 0.2242, "step": 424500 }, { "epoch": 0.1492, "grad_norm": 0.21242839097976685, "learning_rate": 3.01604e-05, "loss": 0.2261, "step": 424600 }, { "epoch": 0.1494, "grad_norm": 0.17830756306648254, "learning_rate": 3.0120400000000004e-05, "loss": 0.227, "step": 424700 }, { "epoch": 0.1496, "grad_norm": 0.2056087851524353, "learning_rate": 3.0080400000000004e-05, "loss": 0.2271, "step": 424800 }, { "epoch": 0.1498, "grad_norm": 0.15750348567962646, "learning_rate": 3.00404e-05, "loss": 0.2258, "step": 424900 }, { "epoch": 0.15, "grad_norm": 0.1605195552110672, "learning_rate": 3.00004e-05, "loss": 0.2269, "step": 425000 }, { 
"epoch": 0.1502, "grad_norm": 0.17202390730381012, "learning_rate": 2.99604e-05, "loss": 0.2278, "step": 425100 }, { "epoch": 0.1504, "grad_norm": 0.18993693590164185, "learning_rate": 2.9920400000000002e-05, "loss": 0.2296, "step": 425200 }, { "epoch": 0.1506, "grad_norm": 0.16901157796382904, "learning_rate": 2.9880400000000003e-05, "loss": 0.2262, "step": 425300 }, { "epoch": 0.1508, "grad_norm": 0.18246543407440186, "learning_rate": 2.9840400000000003e-05, "loss": 0.2246, "step": 425400 }, { "epoch": 0.151, "grad_norm": 0.17869111895561218, "learning_rate": 2.98004e-05, "loss": 0.2241, "step": 425500 }, { "epoch": 0.1512, "grad_norm": 0.1899707317352295, "learning_rate": 2.97604e-05, "loss": 0.2259, "step": 425600 }, { "epoch": 0.1514, "grad_norm": 0.18976940214633942, "learning_rate": 2.9720400000000004e-05, "loss": 0.2248, "step": 425700 }, { "epoch": 0.1516, "grad_norm": 0.19041858613491058, "learning_rate": 2.96804e-05, "loss": 0.2265, "step": 425800 }, { "epoch": 0.1518, "grad_norm": 0.19394929707050323, "learning_rate": 2.9640400000000002e-05, "loss": 0.2302, "step": 425900 }, { "epoch": 0.152, "grad_norm": 0.16645781695842743, "learning_rate": 2.96004e-05, "loss": 0.227, "step": 426000 }, { "epoch": 0.1522, "grad_norm": 0.15843971073627472, "learning_rate": 2.95604e-05, "loss": 0.2269, "step": 426100 }, { "epoch": 0.1524, "grad_norm": 0.18866930902004242, "learning_rate": 2.9520400000000003e-05, "loss": 0.2269, "step": 426200 }, { "epoch": 0.1526, "grad_norm": 0.18919654190540314, "learning_rate": 2.9480400000000003e-05, "loss": 0.2248, "step": 426300 }, { "epoch": 0.1528, "grad_norm": 0.22273516654968262, "learning_rate": 2.94404e-05, "loss": 0.2214, "step": 426400 }, { "epoch": 0.153, "grad_norm": 0.20623090863227844, "learning_rate": 2.94004e-05, "loss": 0.2245, "step": 426500 }, { "epoch": 0.1532, "grad_norm": 0.16490277647972107, "learning_rate": 2.9360399999999998e-05, "loss": 0.2234, "step": 426600 }, { "epoch": 0.1534, "grad_norm": 
0.30756646394729614, "learning_rate": 2.9320400000000005e-05, "loss": 0.2289, "step": 426700 }, { "epoch": 0.1536, "grad_norm": 0.18680694699287415, "learning_rate": 2.9280400000000002e-05, "loss": 0.2269, "step": 426800 }, { "epoch": 0.1538, "grad_norm": 0.16538353264331818, "learning_rate": 2.9240400000000002e-05, "loss": 0.2249, "step": 426900 }, { "epoch": 0.154, "grad_norm": 0.21780794858932495, "learning_rate": 2.92004e-05, "loss": 0.2263, "step": 427000 }, { "epoch": 0.1542, "grad_norm": 0.1775357723236084, "learning_rate": 2.91604e-05, "loss": 0.2231, "step": 427100 }, { "epoch": 0.1544, "grad_norm": 0.17265263199806213, "learning_rate": 2.9120400000000003e-05, "loss": 0.2299, "step": 427200 }, { "epoch": 0.1546, "grad_norm": 0.19637222588062286, "learning_rate": 2.9080400000000004e-05, "loss": 0.223, "step": 427300 }, { "epoch": 0.1548, "grad_norm": 0.2579210102558136, "learning_rate": 2.90404e-05, "loss": 0.2263, "step": 427400 }, { "epoch": 0.155, "grad_norm": 0.17191685736179352, "learning_rate": 2.90004e-05, "loss": 0.2241, "step": 427500 }, { "epoch": 0.1552, "grad_norm": 0.19325803220272064, "learning_rate": 2.8960399999999998e-05, "loss": 0.2216, "step": 427600 }, { "epoch": 0.1554, "grad_norm": 0.16005942225456238, "learning_rate": 2.8920400000000002e-05, "loss": 0.2276, "step": 427700 }, { "epoch": 0.1556, "grad_norm": 0.17553672194480896, "learning_rate": 2.8880400000000002e-05, "loss": 0.227, "step": 427800 }, { "epoch": 0.1558, "grad_norm": 0.24208134412765503, "learning_rate": 2.88404e-05, "loss": 0.2234, "step": 427900 }, { "epoch": 0.156, "grad_norm": 0.25327280163764954, "learning_rate": 2.88004e-05, "loss": 0.2258, "step": 428000 }, { "epoch": 0.1562, "grad_norm": 0.19016684591770172, "learning_rate": 2.8760400000000003e-05, "loss": 0.2251, "step": 428100 }, { "epoch": 0.1564, "grad_norm": 0.16044831275939941, "learning_rate": 2.8720400000000004e-05, "loss": 0.2227, "step": 428200 }, { "epoch": 0.1566, "grad_norm": 0.2034226357936859, 
"learning_rate": 2.86804e-05, "loss": 0.2269, "step": 428300 }, { "epoch": 0.1568, "grad_norm": 0.16030117869377136, "learning_rate": 2.86404e-05, "loss": 0.2299, "step": 428400 }, { "epoch": 0.157, "grad_norm": 0.1629854291677475, "learning_rate": 2.8600399999999998e-05, "loss": 0.2237, "step": 428500 }, { "epoch": 0.1572, "grad_norm": 0.1795048862695694, "learning_rate": 2.8560400000000005e-05, "loss": 0.2246, "step": 428600 }, { "epoch": 0.1574, "grad_norm": 0.19448979198932648, "learning_rate": 2.8520400000000002e-05, "loss": 0.2243, "step": 428700 }, { "epoch": 0.1576, "grad_norm": 0.16780062019824982, "learning_rate": 2.8480400000000003e-05, "loss": 0.2251, "step": 428800 }, { "epoch": 0.1578, "grad_norm": 0.18648077547550201, "learning_rate": 2.84404e-05, "loss": 0.2269, "step": 428900 }, { "epoch": 0.158, "grad_norm": 0.15443378686904907, "learning_rate": 2.84004e-05, "loss": 0.2243, "step": 429000 }, { "epoch": 0.1582, "grad_norm": 0.18094438314437866, "learning_rate": 2.8360400000000004e-05, "loss": 0.2293, "step": 429100 }, { "epoch": 0.1584, "grad_norm": 0.15523366630077362, "learning_rate": 2.8320400000000004e-05, "loss": 0.2278, "step": 429200 }, { "epoch": 0.1586, "grad_norm": 0.18622247874736786, "learning_rate": 2.82804e-05, "loss": 0.2242, "step": 429300 }, { "epoch": 0.1588, "grad_norm": 0.20111188292503357, "learning_rate": 2.82404e-05, "loss": 0.2248, "step": 429400 }, { "epoch": 0.159, "grad_norm": 0.23439711332321167, "learning_rate": 2.82004e-05, "loss": 0.2294, "step": 429500 }, { "epoch": 0.1592, "grad_norm": 0.20886704325675964, "learning_rate": 2.8160400000000002e-05, "loss": 0.2288, "step": 429600 }, { "epoch": 0.1594, "grad_norm": 0.1680990606546402, "learning_rate": 2.8120400000000003e-05, "loss": 0.2275, "step": 429700 }, { "epoch": 0.1596, "grad_norm": 0.1492314487695694, "learning_rate": 2.80804e-05, "loss": 0.2248, "step": 429800 }, { "epoch": 0.1598, "grad_norm": 0.2123488038778305, "learning_rate": 2.80404e-05, "loss": 0.2235, 
"step": 429900 }, { "epoch": 0.16, "grad_norm": 0.17558607459068298, "learning_rate": 2.80004e-05, "loss": 0.2247, "step": 430000 }, { "epoch": 0.1602, "grad_norm": 0.26399195194244385, "learning_rate": 2.7960400000000004e-05, "loss": 0.2248, "step": 430100 }, { "epoch": 0.1604, "grad_norm": 0.16792334616184235, "learning_rate": 2.79204e-05, "loss": 0.2251, "step": 430200 }, { "epoch": 0.1606, "grad_norm": 0.16660743951797485, "learning_rate": 2.78804e-05, "loss": 0.2265, "step": 430300 }, { "epoch": 0.1608, "grad_norm": 0.2021878957748413, "learning_rate": 2.78404e-05, "loss": 0.2231, "step": 430400 }, { "epoch": 0.161, "grad_norm": 0.16516359150409698, "learning_rate": 2.78004e-05, "loss": 0.2317, "step": 430500 }, { "epoch": 0.1612, "grad_norm": 0.16823023557662964, "learning_rate": 2.7760400000000003e-05, "loss": 0.2274, "step": 430600 }, { "epoch": 0.1614, "grad_norm": 0.15308980643749237, "learning_rate": 2.7720400000000003e-05, "loss": 0.2278, "step": 430700 }, { "epoch": 0.1616, "grad_norm": 0.17121495306491852, "learning_rate": 2.76804e-05, "loss": 0.2291, "step": 430800 }, { "epoch": 0.1618, "grad_norm": 0.17018477618694305, "learning_rate": 2.76404e-05, "loss": 0.2214, "step": 430900 }, { "epoch": 0.162, "grad_norm": 0.18164673447608948, "learning_rate": 2.7600400000000004e-05, "loss": 0.2264, "step": 431000 }, { "epoch": 0.1622, "grad_norm": 0.2391088306903839, "learning_rate": 2.75604e-05, "loss": 0.2256, "step": 431100 }, { "epoch": 0.1624, "grad_norm": 0.1486031413078308, "learning_rate": 2.75204e-05, "loss": 0.2305, "step": 431200 }, { "epoch": 0.1626, "grad_norm": 0.3389119803905487, "learning_rate": 2.7480400000000002e-05, "loss": 0.2268, "step": 431300 }, { "epoch": 0.1628, "grad_norm": 0.203375905752182, "learning_rate": 2.74404e-05, "loss": 0.2235, "step": 431400 }, { "epoch": 0.163, "grad_norm": 0.16444042325019836, "learning_rate": 2.7400400000000003e-05, "loss": 0.2258, "step": 431500 }, { "epoch": 0.1632, "grad_norm": 0.26179274916648865, 
"learning_rate": 2.7360400000000003e-05, "loss": 0.2255, "step": 431600 }, { "epoch": 0.1634, "grad_norm": 0.20581679046154022, "learning_rate": 2.73204e-05, "loss": 0.2228, "step": 431700 }, { "epoch": 0.1636, "grad_norm": 0.19416281580924988, "learning_rate": 2.72804e-05, "loss": 0.2289, "step": 431800 }, { "epoch": 0.1638, "grad_norm": 0.17209891974925995, "learning_rate": 2.7240399999999998e-05, "loss": 0.2248, "step": 431900 }, { "epoch": 0.164, "grad_norm": 0.16744202375411987, "learning_rate": 2.7200400000000005e-05, "loss": 0.2291, "step": 432000 }, { "epoch": 0.1642, "grad_norm": 0.18142908811569214, "learning_rate": 2.7160400000000002e-05, "loss": 0.2228, "step": 432100 }, { "epoch": 0.1644, "grad_norm": 0.2096833437681198, "learning_rate": 2.7120400000000002e-05, "loss": 0.2295, "step": 432200 }, { "epoch": 0.1646, "grad_norm": 0.15951569378376007, "learning_rate": 2.70804e-05, "loss": 0.225, "step": 432300 }, { "epoch": 0.1648, "grad_norm": 0.16083496809005737, "learning_rate": 2.70404e-05, "loss": 0.226, "step": 432400 }, { "epoch": 0.165, "grad_norm": 0.1795596182346344, "learning_rate": 2.7000400000000003e-05, "loss": 0.2243, "step": 432500 }, { "epoch": 0.1652, "grad_norm": 0.18583637475967407, "learning_rate": 2.6960400000000004e-05, "loss": 0.2226, "step": 432600 }, { "epoch": 0.1654, "grad_norm": 0.146294966340065, "learning_rate": 2.69204e-05, "loss": 0.224, "step": 432700 }, { "epoch": 0.1656, "grad_norm": 0.17165237665176392, "learning_rate": 2.68804e-05, "loss": 0.2225, "step": 432800 }, { "epoch": 0.1658, "grad_norm": 0.17422151565551758, "learning_rate": 2.6840399999999998e-05, "loss": 0.2269, "step": 432900 }, { "epoch": 0.166, "grad_norm": 0.17443683743476868, "learning_rate": 2.6800400000000002e-05, "loss": 0.2253, "step": 433000 }, { "epoch": 0.1662, "grad_norm": 0.18008099496364594, "learning_rate": 2.6760400000000002e-05, "loss": 0.2232, "step": 433100 }, { "epoch": 0.1664, "grad_norm": 0.17084477841854095, "learning_rate": 
2.6720400000000003e-05, "loss": 0.2333, "step": 433200 }, { "epoch": 0.1666, "grad_norm": 0.17184089124202728, "learning_rate": 2.66804e-05, "loss": 0.2236, "step": 433300 }, { "epoch": 0.1668, "grad_norm": 0.17407119274139404, "learning_rate": 2.66404e-05, "loss": 0.2247, "step": 433400 }, { "epoch": 0.167, "grad_norm": 0.155440554022789, "learning_rate": 2.6600400000000004e-05, "loss": 0.2283, "step": 433500 }, { "epoch": 0.1672, "grad_norm": 0.15436717867851257, "learning_rate": 2.65604e-05, "loss": 0.2259, "step": 433600 }, { "epoch": 0.1674, "grad_norm": 0.20875059068202972, "learning_rate": 2.65204e-05, "loss": 0.2306, "step": 433700 }, { "epoch": 0.1676, "grad_norm": 0.38795992732048035, "learning_rate": 2.6480399999999998e-05, "loss": 0.23, "step": 433800 }, { "epoch": 0.1678, "grad_norm": 0.29372575879096985, "learning_rate": 2.64404e-05, "loss": 0.2278, "step": 433900 }, { "epoch": 0.168, "grad_norm": 0.1667247861623764, "learning_rate": 2.6400400000000002e-05, "loss": 0.2275, "step": 434000 }, { "epoch": 0.1682, "grad_norm": 0.1645084172487259, "learning_rate": 2.6360400000000003e-05, "loss": 0.2242, "step": 434100 }, { "epoch": 0.1684, "grad_norm": 0.20222832262516022, "learning_rate": 2.63204e-05, "loss": 0.2271, "step": 434200 }, { "epoch": 0.1686, "grad_norm": 0.3428006172180176, "learning_rate": 2.62804e-05, "loss": 0.2247, "step": 434300 }, { "epoch": 0.1688, "grad_norm": 0.1995532214641571, "learning_rate": 2.6240400000000004e-05, "loss": 0.2257, "step": 434400 }, { "epoch": 0.169, "grad_norm": 0.18206648528575897, "learning_rate": 2.6200400000000004e-05, "loss": 0.2242, "step": 434500 }, { "epoch": 0.1692, "grad_norm": 0.15445922315120697, "learning_rate": 2.61604e-05, "loss": 0.2251, "step": 434600 }, { "epoch": 0.1694, "grad_norm": 0.1566971242427826, "learning_rate": 2.61204e-05, "loss": 0.2229, "step": 434700 }, { "epoch": 0.1696, "grad_norm": 0.16852755844593048, "learning_rate": 2.60804e-05, "loss": 0.2287, "step": 434800 }, { "epoch": 
0.1698, "grad_norm": 0.17215055227279663, "learning_rate": 2.6040400000000002e-05, "loss": 0.2224, "step": 434900 }, { "epoch": 0.17, "grad_norm": 0.22290776669979095, "learning_rate": 2.6000400000000003e-05, "loss": 0.2338, "step": 435000 }, { "epoch": 0.1702, "grad_norm": 0.16391290724277496, "learning_rate": 2.5960400000000003e-05, "loss": 0.2263, "step": 435100 }, { "epoch": 0.1704, "grad_norm": 0.17789016664028168, "learning_rate": 2.59204e-05, "loss": 0.2276, "step": 435200 }, { "epoch": 0.1706, "grad_norm": 0.17922519147396088, "learning_rate": 2.58804e-05, "loss": 0.2208, "step": 435300 }, { "epoch": 0.1708, "grad_norm": 0.16985663771629333, "learning_rate": 2.5840400000000004e-05, "loss": 0.2267, "step": 435400 }, { "epoch": 0.171, "grad_norm": 0.19357039034366608, "learning_rate": 2.58004e-05, "loss": 0.2289, "step": 435500 }, { "epoch": 0.1712, "grad_norm": 0.1737697869539261, "learning_rate": 2.57604e-05, "loss": 0.2238, "step": 435600 }, { "epoch": 0.1714, "grad_norm": 0.17874495685100555, "learning_rate": 2.57204e-05, "loss": 0.2233, "step": 435700 }, { "epoch": 0.1716, "grad_norm": 0.17967748641967773, "learning_rate": 2.56804e-05, "loss": 0.223, "step": 435800 }, { "epoch": 0.1718, "grad_norm": 0.1678665578365326, "learning_rate": 2.5640400000000003e-05, "loss": 0.2282, "step": 435900 }, { "epoch": 0.172, "grad_norm": 0.1909477561712265, "learning_rate": 2.5600400000000003e-05, "loss": 0.2272, "step": 436000 }, { "epoch": 0.1722, "grad_norm": 0.22589877247810364, "learning_rate": 2.55604e-05, "loss": 0.231, "step": 436100 }, { "epoch": 0.1724, "grad_norm": 0.16562041640281677, "learning_rate": 2.55204e-05, "loss": 0.2381, "step": 436200 }, { "epoch": 0.1726, "grad_norm": 0.19467677175998688, "learning_rate": 2.5480399999999997e-05, "loss": 0.2286, "step": 436300 }, { "epoch": 0.1728, "grad_norm": 0.7384561896324158, "learning_rate": 2.5440400000000005e-05, "loss": 0.2238, "step": 436400 }, { "epoch": 0.173, "grad_norm": 0.1401069164276123, 
"learning_rate": 2.54004e-05, "loss": 0.2256, "step": 436500 }, { "epoch": 0.1732, "grad_norm": 0.20685148239135742, "learning_rate": 2.5360400000000002e-05, "loss": 0.2265, "step": 436600 }, { "epoch": 0.1734, "grad_norm": 0.19295437633991241, "learning_rate": 2.53204e-05, "loss": 0.2262, "step": 436700 }, { "epoch": 0.1736, "grad_norm": 0.23843425512313843, "learning_rate": 2.52804e-05, "loss": 0.2305, "step": 436800 }, { "epoch": 0.1738, "grad_norm": 0.16619522869586945, "learning_rate": 2.5240400000000003e-05, "loss": 0.2272, "step": 436900 }, { "epoch": 0.174, "grad_norm": 0.17892450094223022, "learning_rate": 2.52004e-05, "loss": 0.2283, "step": 437000 }, { "epoch": 0.1742, "grad_norm": 0.15790720283985138, "learning_rate": 2.51604e-05, "loss": 0.2273, "step": 437100 }, { "epoch": 0.1744, "grad_norm": 0.1899292767047882, "learning_rate": 2.51204e-05, "loss": 0.2229, "step": 437200 }, { "epoch": 0.1746, "grad_norm": 0.14178262650966644, "learning_rate": 2.5080400000000005e-05, "loss": 0.2284, "step": 437300 }, { "epoch": 0.1748, "grad_norm": 0.16377106308937073, "learning_rate": 2.50404e-05, "loss": 0.2259, "step": 437400 }, { "epoch": 0.175, "grad_norm": 0.21757832169532776, "learning_rate": 2.5000400000000002e-05, "loss": 0.2281, "step": 437500 }, { "epoch": 0.1752, "grad_norm": 0.16363129019737244, "learning_rate": 2.49604e-05, "loss": 0.2266, "step": 437600 }, { "epoch": 0.1754, "grad_norm": 0.1644134521484375, "learning_rate": 2.4920400000000003e-05, "loss": 0.2226, "step": 437700 }, { "epoch": 0.1756, "grad_norm": 0.14680400490760803, "learning_rate": 2.48804e-05, "loss": 0.2245, "step": 437800 }, { "epoch": 0.1758, "grad_norm": 0.14819519221782684, "learning_rate": 2.4840400000000004e-05, "loss": 0.2251, "step": 437900 }, { "epoch": 0.176, "grad_norm": 0.18625690042972565, "learning_rate": 2.48004e-05, "loss": 0.2286, "step": 438000 }, { "epoch": 0.1762, "grad_norm": 0.15373022854328156, "learning_rate": 2.47604e-05, "loss": 0.2275, "step": 438100 }, { 
"epoch": 0.1764, "grad_norm": 0.1993611454963684, "learning_rate": 2.47204e-05, "loss": 0.2278, "step": 438200 }, { "epoch": 0.1766, "grad_norm": 0.24995622038841248, "learning_rate": 2.4680400000000002e-05, "loss": 0.2302, "step": 438300 }, { "epoch": 0.1768, "grad_norm": 0.17456863820552826, "learning_rate": 2.4640400000000002e-05, "loss": 0.2309, "step": 438400 }, { "epoch": 0.177, "grad_norm": 0.16433963179588318, "learning_rate": 2.4600400000000003e-05, "loss": 0.2263, "step": 438500 }, { "epoch": 0.1772, "grad_norm": 0.16630324721336365, "learning_rate": 2.45604e-05, "loss": 0.2269, "step": 438600 }, { "epoch": 0.1774, "grad_norm": 0.19276133179664612, "learning_rate": 2.4520400000000003e-05, "loss": 0.2263, "step": 438700 }, { "epoch": 0.1776, "grad_norm": 0.19641803205013275, "learning_rate": 2.44804e-05, "loss": 0.2256, "step": 438800 }, { "epoch": 0.1778, "grad_norm": 0.164507657289505, "learning_rate": 2.44404e-05, "loss": 0.224, "step": 438900 }, { "epoch": 0.178, "grad_norm": 0.15548419952392578, "learning_rate": 2.44004e-05, "loss": 0.2268, "step": 439000 }, { "epoch": 0.1782, "grad_norm": 0.15844307839870453, "learning_rate": 2.43604e-05, "loss": 0.2254, "step": 439100 }, { "epoch": 0.1784, "grad_norm": 0.18595917522907257, "learning_rate": 2.4320400000000002e-05, "loss": 0.2252, "step": 439200 }, { "epoch": 0.1786, "grad_norm": 0.27682188153266907, "learning_rate": 2.42804e-05, "loss": 0.2253, "step": 439300 }, { "epoch": 0.1788, "grad_norm": 0.1652635633945465, "learning_rate": 2.4240400000000003e-05, "loss": 0.2291, "step": 439400 }, { "epoch": 0.179, "grad_norm": 0.3612424433231354, "learning_rate": 2.42004e-05, "loss": 0.2258, "step": 439500 }, { "epoch": 0.1792, "grad_norm": 0.17087934911251068, "learning_rate": 2.4160400000000003e-05, "loss": 0.2233, "step": 439600 }, { "epoch": 0.1794, "grad_norm": 0.15505070984363556, "learning_rate": 2.41204e-05, "loss": 0.2257, "step": 439700 }, { "epoch": 0.1796, "grad_norm": 0.1622672826051712, 
"learning_rate": 2.40804e-05, "loss": 0.2252, "step": 439800 }, { "epoch": 0.1798, "grad_norm": 0.19575311243534088, "learning_rate": 2.40404e-05, "loss": 0.2238, "step": 439900 }, { "epoch": 0.18, "grad_norm": 0.1737244427204132, "learning_rate": 2.40004e-05, "loss": 0.2249, "step": 440000 }, { "epoch": 0.1802, "grad_norm": 0.24295039474964142, "learning_rate": 2.3960400000000002e-05, "loss": 0.2219, "step": 440100 }, { "epoch": 0.1804, "grad_norm": 0.17787252366542816, "learning_rate": 2.3920400000000002e-05, "loss": 0.2262, "step": 440200 }, { "epoch": 0.1806, "grad_norm": 0.15430960059165955, "learning_rate": 2.38804e-05, "loss": 0.2237, "step": 440300 }, { "epoch": 0.1808, "grad_norm": 0.17729860544204712, "learning_rate": 2.3840400000000003e-05, "loss": 0.2292, "step": 440400 }, { "epoch": 0.181, "grad_norm": 0.19913239777088165, "learning_rate": 2.38004e-05, "loss": 0.2273, "step": 440500 }, { "epoch": 0.1812, "grad_norm": 0.15231093764305115, "learning_rate": 2.3760400000000004e-05, "loss": 0.2206, "step": 440600 }, { "epoch": 0.1814, "grad_norm": 0.18984483182430267, "learning_rate": 2.37204e-05, "loss": 0.2152, "step": 440700 }, { "epoch": 0.1816, "grad_norm": 0.16935545206069946, "learning_rate": 2.36804e-05, "loss": 0.2228, "step": 440800 }, { "epoch": 0.1818, "grad_norm": 0.20050957798957825, "learning_rate": 2.36404e-05, "loss": 0.2235, "step": 440900 }, { "epoch": 0.182, "grad_norm": 0.19681406021118164, "learning_rate": 2.3600400000000002e-05, "loss": 0.2278, "step": 441000 }, { "epoch": 0.1822, "grad_norm": 0.14469000697135925, "learning_rate": 2.3560400000000002e-05, "loss": 0.2282, "step": 441100 }, { "epoch": 0.1824, "grad_norm": 0.22131530940532684, "learning_rate": 2.35204e-05, "loss": 0.2255, "step": 441200 }, { "epoch": 0.1826, "grad_norm": 0.23927749693393707, "learning_rate": 2.3480400000000003e-05, "loss": 0.2306, "step": 441300 }, { "epoch": 0.1828, "grad_norm": 0.22400574386119843, "learning_rate": 2.34404e-05, "loss": 0.2295, "step": 
441400 }, { "epoch": 0.183, "grad_norm": 0.21024823188781738, "learning_rate": 2.34004e-05, "loss": 0.2315, "step": 441500 }, { "epoch": 0.1832, "grad_norm": 0.293183296918869, "learning_rate": 2.33604e-05, "loss": 0.2217, "step": 441600 }, { "epoch": 0.1834, "grad_norm": 0.17141690850257874, "learning_rate": 2.33204e-05, "loss": 0.2252, "step": 441700 }, { "epoch": 0.1836, "grad_norm": 0.19596771895885468, "learning_rate": 2.32804e-05, "loss": 0.2261, "step": 441800 }, { "epoch": 0.1838, "grad_norm": 0.1949038952589035, "learning_rate": 2.3240400000000002e-05, "loss": 0.2252, "step": 441900 }, { "epoch": 0.184, "grad_norm": 0.22144322097301483, "learning_rate": 2.32004e-05, "loss": 0.2218, "step": 442000 }, { "epoch": 0.1842, "grad_norm": 0.17387978732585907, "learning_rate": 2.3160400000000003e-05, "loss": 0.2226, "step": 442100 }, { "epoch": 0.1844, "grad_norm": 0.1720968782901764, "learning_rate": 2.31204e-05, "loss": 0.2263, "step": 442200 }, { "epoch": 0.1846, "grad_norm": 0.2119131088256836, "learning_rate": 2.3080400000000004e-05, "loss": 0.2296, "step": 442300 }, { "epoch": 0.1848, "grad_norm": 0.14969196915626526, "learning_rate": 2.30404e-05, "loss": 0.2229, "step": 442400 }, { "epoch": 0.185, "grad_norm": 0.18958836793899536, "learning_rate": 2.30004e-05, "loss": 0.222, "step": 442500 }, { "epoch": 0.1852, "grad_norm": 0.21705825626850128, "learning_rate": 2.29604e-05, "loss": 0.2251, "step": 442600 }, { "epoch": 0.1854, "grad_norm": 0.19531495869159698, "learning_rate": 2.29204e-05, "loss": 0.2252, "step": 442700 }, { "epoch": 0.1856, "grad_norm": 0.1760839819908142, "learning_rate": 2.2880400000000002e-05, "loss": 0.2248, "step": 442800 }, { "epoch": 0.1858, "grad_norm": 0.18036484718322754, "learning_rate": 2.28404e-05, "loss": 0.222, "step": 442900 }, { "epoch": 0.186, "grad_norm": 0.200086772441864, "learning_rate": 2.2800400000000003e-05, "loss": 0.2259, "step": 443000 }, { "epoch": 0.1862, "grad_norm": 0.5199419856071472, "learning_rate": 
2.27604e-05, "loss": 0.2236, "step": 443100 }, { "epoch": 0.1864, "grad_norm": 0.1505269855260849, "learning_rate": 2.27204e-05, "loss": 0.2243, "step": 443200 }, { "epoch": 0.1866, "grad_norm": 0.21658334136009216, "learning_rate": 2.26804e-05, "loss": 0.2277, "step": 443300 }, { "epoch": 0.1868, "grad_norm": 0.1422983855009079, "learning_rate": 2.26404e-05, "loss": 0.2214, "step": 443400 }, { "epoch": 0.187, "grad_norm": 0.18516691029071808, "learning_rate": 2.26004e-05, "loss": 0.2241, "step": 443500 }, { "epoch": 0.1872, "grad_norm": 0.13827116787433624, "learning_rate": 2.2560400000000002e-05, "loss": 0.2277, "step": 443600 }, { "epoch": 0.1874, "grad_norm": 0.33428114652633667, "learning_rate": 2.25204e-05, "loss": 0.2243, "step": 443700 }, { "epoch": 0.1876, "grad_norm": 0.2030119001865387, "learning_rate": 2.2480400000000002e-05, "loss": 0.2277, "step": 443800 }, { "epoch": 0.1878, "grad_norm": 0.16090461611747742, "learning_rate": 2.24404e-05, "loss": 0.2297, "step": 443900 }, { "epoch": 0.188, "grad_norm": 0.2067965716123581, "learning_rate": 2.2400400000000003e-05, "loss": 0.2228, "step": 444000 }, { "epoch": 0.1882, "grad_norm": 0.15703140199184418, "learning_rate": 2.23604e-05, "loss": 0.2282, "step": 444100 }, { "epoch": 0.1884, "grad_norm": 0.1881639063358307, "learning_rate": 2.2320400000000004e-05, "loss": 0.2245, "step": 444200 }, { "epoch": 0.1886, "grad_norm": 0.23924614489078522, "learning_rate": 2.22804e-05, "loss": 0.2267, "step": 444300 }, { "epoch": 0.1888, "grad_norm": 0.17333418130874634, "learning_rate": 2.22404e-05, "loss": 0.2229, "step": 444400 }, { "epoch": 0.189, "grad_norm": 0.17022064328193665, "learning_rate": 2.2200400000000002e-05, "loss": 0.225, "step": 444500 }, { "epoch": 0.1892, "grad_norm": 0.17431145906448364, "learning_rate": 2.2160400000000002e-05, "loss": 0.226, "step": 444600 }, { "epoch": 0.1894, "grad_norm": 0.20178723335266113, "learning_rate": 2.2120400000000003e-05, "loss": 0.226, "step": 444700 }, { "epoch": 
0.1896, "grad_norm": 0.18924662470817566, "learning_rate": 2.20804e-05, "loss": 0.2254, "step": 444800 }, { "epoch": 0.1898, "grad_norm": 0.23246605694293976, "learning_rate": 2.20404e-05, "loss": 0.2246, "step": 444900 }, { "epoch": 0.19, "grad_norm": 0.15815980732440948, "learning_rate": 2.20004e-05, "loss": 0.2249, "step": 445000 }, { "epoch": 0.1902, "grad_norm": 0.19790607690811157, "learning_rate": 2.19604e-05, "loss": 0.2302, "step": 445100 }, { "epoch": 0.1904, "grad_norm": 0.19628171622753143, "learning_rate": 2.19204e-05, "loss": 0.2264, "step": 445200 }, { "epoch": 0.1906, "grad_norm": 0.16025413572788239, "learning_rate": 2.18804e-05, "loss": 0.2202, "step": 445300 }, { "epoch": 0.1908, "grad_norm": 0.2109999656677246, "learning_rate": 2.18404e-05, "loss": 0.224, "step": 445400 }, { "epoch": 0.191, "grad_norm": 0.17746329307556152, "learning_rate": 2.1800400000000002e-05, "loss": 0.2253, "step": 445500 }, { "epoch": 0.1912, "grad_norm": 0.18444675207138062, "learning_rate": 2.17604e-05, "loss": 0.2252, "step": 445600 }, { "epoch": 0.1914, "grad_norm": 0.21549223363399506, "learning_rate": 2.1720400000000003e-05, "loss": 0.2265, "step": 445700 }, { "epoch": 0.1916, "grad_norm": 0.16874556243419647, "learning_rate": 2.16804e-05, "loss": 0.2263, "step": 445800 }, { "epoch": 0.1918, "grad_norm": 0.14884252846240997, "learning_rate": 2.1640400000000004e-05, "loss": 0.2252, "step": 445900 }, { "epoch": 0.192, "grad_norm": 0.2405053973197937, "learning_rate": 2.16004e-05, "loss": 0.2267, "step": 446000 }, { "epoch": 0.1922, "grad_norm": 0.1512579768896103, "learning_rate": 2.15604e-05, "loss": 0.2243, "step": 446100 }, { "epoch": 0.1924, "grad_norm": 0.15043461322784424, "learning_rate": 2.15204e-05, "loss": 0.226, "step": 446200 }, { "epoch": 0.1926, "grad_norm": 0.14837458729743958, "learning_rate": 2.1480400000000002e-05, "loss": 0.2251, "step": 446300 }, { "epoch": 0.1928, "grad_norm": 0.1467418372631073, "learning_rate": 2.1440400000000002e-05, "loss": 
0.2244, "step": 446400 }, { "epoch": 0.193, "grad_norm": 0.16356542706489563, "learning_rate": 2.1400400000000003e-05, "loss": 0.2255, "step": 446500 }, { "epoch": 0.1932, "grad_norm": 0.19749680161476135, "learning_rate": 2.13604e-05, "loss": 0.2255, "step": 446600 }, { "epoch": 0.1934, "grad_norm": 0.20441758632659912, "learning_rate": 2.13204e-05, "loss": 0.2232, "step": 446700 }, { "epoch": 0.1936, "grad_norm": 0.1879335343837738, "learning_rate": 2.12804e-05, "loss": 0.2306, "step": 446800 }, { "epoch": 0.1938, "grad_norm": 0.1568703055381775, "learning_rate": 2.12404e-05, "loss": 0.2233, "step": 446900 }, { "epoch": 0.194, "grad_norm": 0.1516915112733841, "learning_rate": 2.12004e-05, "loss": 0.228, "step": 447000 }, { "epoch": 0.1942, "grad_norm": 0.1923481822013855, "learning_rate": 2.1160399999999998e-05, "loss": 0.2304, "step": 447100 }, { "epoch": 0.1944, "grad_norm": 0.19211551547050476, "learning_rate": 2.1120400000000002e-05, "loss": 0.2253, "step": 447200 }, { "epoch": 0.1946, "grad_norm": 0.2306138575077057, "learning_rate": 2.10804e-05, "loss": 0.2228, "step": 447300 }, { "epoch": 0.1948, "grad_norm": 0.18642979860305786, "learning_rate": 2.1040400000000003e-05, "loss": 0.2235, "step": 447400 }, { "epoch": 0.195, "grad_norm": 0.1702134609222412, "learning_rate": 2.10004e-05, "loss": 0.2297, "step": 447500 }, { "epoch": 0.1952, "grad_norm": 0.187478706240654, "learning_rate": 2.0960400000000003e-05, "loss": 0.2291, "step": 447600 }, { "epoch": 0.1954, "grad_norm": 0.1675885021686554, "learning_rate": 2.09204e-05, "loss": 0.2257, "step": 447700 }, { "epoch": 0.1956, "grad_norm": 0.20350529253482819, "learning_rate": 2.08804e-05, "loss": 0.225, "step": 447800 }, { "epoch": 0.1958, "grad_norm": 0.1661386489868164, "learning_rate": 2.08404e-05, "loss": 0.2274, "step": 447900 }, { "epoch": 0.196, "grad_norm": 0.15800422430038452, "learning_rate": 2.08004e-05, "loss": 0.2311, "step": 448000 }, { "epoch": 0.1962, "grad_norm": 0.14770348370075226, 
"learning_rate": 2.0760400000000002e-05, "loss": 0.2255, "step": 448100 }, { "epoch": 0.1964, "grad_norm": 0.22858108580112457, "learning_rate": 2.0720400000000002e-05, "loss": 0.225, "step": 448200 }, { "epoch": 0.1966, "grad_norm": 0.1924201101064682, "learning_rate": 2.06804e-05, "loss": 0.222, "step": 448300 }, { "epoch": 0.1968, "grad_norm": 0.2220558375120163, "learning_rate": 2.06404e-05, "loss": 0.2284, "step": 448400 }, { "epoch": 0.197, "grad_norm": 0.17530234158039093, "learning_rate": 2.06004e-05, "loss": 0.2256, "step": 448500 }, { "epoch": 0.1972, "grad_norm": 0.19386076927185059, "learning_rate": 2.05604e-05, "loss": 0.2267, "step": 448600 }, { "epoch": 0.1974, "grad_norm": 0.38340792059898376, "learning_rate": 2.05204e-05, "loss": 0.2245, "step": 448700 }, { "epoch": 0.1976, "grad_norm": 0.2019384652376175, "learning_rate": 2.04804e-05, "loss": 0.225, "step": 448800 }, { "epoch": 0.1978, "grad_norm": 0.16011062264442444, "learning_rate": 2.04404e-05, "loss": 0.228, "step": 448900 }, { "epoch": 0.198, "grad_norm": 0.2042662352323532, "learning_rate": 2.04004e-05, "loss": 0.2257, "step": 449000 }, { "epoch": 0.1982, "grad_norm": 0.15106824040412903, "learning_rate": 2.0360400000000002e-05, "loss": 0.2236, "step": 449100 }, { "epoch": 0.1984, "grad_norm": 0.15230531990528107, "learning_rate": 2.03204e-05, "loss": 0.2222, "step": 449200 }, { "epoch": 0.1986, "grad_norm": 0.1945761889219284, "learning_rate": 2.0280400000000003e-05, "loss": 0.2256, "step": 449300 }, { "epoch": 0.1988, "grad_norm": 0.2893814444541931, "learning_rate": 2.02404e-05, "loss": 0.2289, "step": 449400 }, { "epoch": 0.199, "grad_norm": 0.16829931735992432, "learning_rate": 2.02004e-05, "loss": 0.2241, "step": 449500 }, { "epoch": 0.1992, "grad_norm": 0.29933756589889526, "learning_rate": 2.01604e-05, "loss": 0.2259, "step": 449600 }, { "epoch": 0.1994, "grad_norm": 0.151699960231781, "learning_rate": 2.01204e-05, "loss": 0.2214, "step": 449700 }, { "epoch": 0.1996, "grad_norm": 
0.2555668354034424, "learning_rate": 2.0080400000000002e-05, "loss": 0.2268, "step": 449800 }, { "epoch": 0.1998, "grad_norm": 0.18724453449249268, "learning_rate": 2.0040400000000002e-05, "loss": 0.2262, "step": 449900 }, { "epoch": 0.2, "grad_norm": 0.21397201716899872, "learning_rate": 2.00004e-05, "loss": 0.2222, "step": 450000 }, { "epoch": 0.2002, "grad_norm": 0.31661924719810486, "learning_rate": 1.9960400000000003e-05, "loss": 0.2524, "step": 450100 }, { "epoch": 0.2004, "grad_norm": 0.28846636414527893, "learning_rate": 1.99204e-05, "loss": 0.2639, "step": 450200 }, { "epoch": 0.2006, "grad_norm": 0.2062097191810608, "learning_rate": 1.98804e-05, "loss": 0.2605, "step": 450300 }, { "epoch": 0.2008, "grad_norm": 0.37499383091926575, "learning_rate": 1.98404e-05, "loss": 0.2526, "step": 450400 }, { "epoch": 0.201, "grad_norm": 0.2817152142524719, "learning_rate": 1.98004e-05, "loss": 0.255, "step": 450500 }, { "epoch": 0.2012, "grad_norm": 0.2787519097328186, "learning_rate": 1.97604e-05, "loss": 0.2779, "step": 450600 }, { "epoch": 0.2014, "grad_norm": 0.3081032931804657, "learning_rate": 1.97204e-05, "loss": 0.2528, "step": 450700 }, { "epoch": 0.2016, "grad_norm": 0.45167797803878784, "learning_rate": 1.9680400000000002e-05, "loss": 0.2531, "step": 450800 }, { "epoch": 0.2018, "grad_norm": 0.2237127274274826, "learning_rate": 1.96404e-05, "loss": 0.2742, "step": 450900 }, { "epoch": 0.202, "grad_norm": 0.18586333096027374, "learning_rate": 1.9600400000000003e-05, "loss": 0.2499, "step": 451000 }, { "epoch": 0.2022, "grad_norm": 0.2039756029844284, "learning_rate": 1.95604e-05, "loss": 0.2538, "step": 451100 }, { "epoch": 0.2024, "grad_norm": 0.2960463762283325, "learning_rate": 1.95204e-05, "loss": 0.2555, "step": 451200 }, { "epoch": 0.2026, "grad_norm": 0.5285388827323914, "learning_rate": 1.94804e-05, "loss": 0.2882, "step": 451300 }, { "epoch": 0.2028, "grad_norm": 0.20539702475070953, "learning_rate": 1.94404e-05, "loss": 0.2681, "step": 451400 }, { 
"epoch": 0.203, "grad_norm": 0.40104198455810547, "learning_rate": 1.94004e-05, "loss": 0.2579, "step": 451500 }, { "epoch": 0.2032, "grad_norm": 0.24579091370105743, "learning_rate": 1.9360400000000002e-05, "loss": 0.2561, "step": 451600 }, { "epoch": 0.2034, "grad_norm": 0.24217070639133453, "learning_rate": 1.93204e-05, "loss": 0.251, "step": 451700 }, { "epoch": 0.2036, "grad_norm": 0.18442483246326447, "learning_rate": 1.9280400000000003e-05, "loss": 0.2504, "step": 451800 }, { "epoch": 0.2038, "grad_norm": 0.24910864233970642, "learning_rate": 1.92404e-05, "loss": 0.3131, "step": 451900 }, { "epoch": 0.204, "grad_norm": 0.2014876753091812, "learning_rate": 1.9200400000000003e-05, "loss": 0.2991, "step": 452000 }, { "epoch": 0.2042, "grad_norm": 0.1724986732006073, "learning_rate": 1.91604e-05, "loss": 0.2537, "step": 452100 }, { "epoch": 0.2044, "grad_norm": 0.16175585985183716, "learning_rate": 1.91204e-05, "loss": 0.2581, "step": 452200 }, { "epoch": 0.2046, "grad_norm": 0.29984617233276367, "learning_rate": 1.90804e-05, "loss": 0.2608, "step": 452300 }, { "epoch": 0.2048, "grad_norm": 0.32500120997428894, "learning_rate": 1.90404e-05, "loss": 0.2811, "step": 452400 }, { "epoch": 0.205, "grad_norm": 0.14806152880191803, "learning_rate": 1.9000400000000002e-05, "loss": 0.2499, "step": 452500 }, { "epoch": 0.2052, "grad_norm": 0.1928393542766571, "learning_rate": 1.89604e-05, "loss": 0.2671, "step": 452600 }, { "epoch": 0.2054, "grad_norm": 0.626834511756897, "learning_rate": 1.8920400000000003e-05, "loss": 0.2681, "step": 452700 }, { "epoch": 0.2056, "grad_norm": 0.2420172393321991, "learning_rate": 1.88804e-05, "loss": 0.2626, "step": 452800 }, { "epoch": 0.2058, "grad_norm": 0.5636458396911621, "learning_rate": 1.88404e-05, "loss": 0.2625, "step": 452900 }, { "epoch": 0.206, "grad_norm": 1.392844319343567, "learning_rate": 1.88004e-05, "loss": 0.2567, "step": 453000 }, { "epoch": 0.2062, "grad_norm": 0.22444498538970947, "learning_rate": 1.87604e-05, 
"loss": 0.2405, "step": 453100 }, { "epoch": 0.2064, "grad_norm": 0.3851085603237152, "learning_rate": 1.87204e-05, "loss": 0.2609, "step": 453200 }, { "epoch": 0.2066, "grad_norm": 0.2355654239654541, "learning_rate": 1.86804e-05, "loss": 0.2557, "step": 453300 }, { "epoch": 0.2068, "grad_norm": 0.20136545598506927, "learning_rate": 1.86404e-05, "loss": 0.2569, "step": 453400 }, { "epoch": 0.207, "grad_norm": 0.2145121991634369, "learning_rate": 1.8600400000000002e-05, "loss": 0.2602, "step": 453500 }, { "epoch": 0.2072, "grad_norm": 0.20145122706890106, "learning_rate": 1.85604e-05, "loss": 0.2648, "step": 453600 }, { "epoch": 0.2074, "grad_norm": 0.2964533269405365, "learning_rate": 1.8520400000000003e-05, "loss": 0.2509, "step": 453700 }, { "epoch": 0.2076, "grad_norm": 0.22049467265605927, "learning_rate": 1.84804e-05, "loss": 0.3241, "step": 453800 }, { "epoch": 0.2078, "grad_norm": 0.2652212679386139, "learning_rate": 1.84404e-05, "loss": 0.2527, "step": 453900 }, { "epoch": 0.208, "grad_norm": 0.17262853682041168, "learning_rate": 1.84004e-05, "loss": 0.2512, "step": 454000 }, { "epoch": 0.2082, "grad_norm": 0.4729574918746948, "learning_rate": 1.83604e-05, "loss": 0.257, "step": 454100 }, { "epoch": 0.2084, "grad_norm": 0.20896553993225098, "learning_rate": 1.83204e-05, "loss": 0.257, "step": 454200 }, { "epoch": 0.2086, "grad_norm": 0.32045525312423706, "learning_rate": 1.82804e-05, "loss": 0.2587, "step": 454300 }, { "epoch": 0.2088, "grad_norm": 0.27513304352760315, "learning_rate": 1.8240400000000002e-05, "loss": 0.2748, "step": 454400 }, { "epoch": 0.209, "grad_norm": 0.3870587944984436, "learning_rate": 1.82004e-05, "loss": 0.2626, "step": 454500 }, { "epoch": 0.2092, "grad_norm": 0.18776573240756989, "learning_rate": 1.81604e-05, "loss": 0.2574, "step": 454600 }, { "epoch": 0.2094, "grad_norm": 0.2663273513317108, "learning_rate": 1.81204e-05, "loss": 0.2613, "step": 454700 }, { "epoch": 0.2096, "grad_norm": 0.20890645682811737, "learning_rate": 
1.80804e-05, "loss": 0.2612, "step": 454800 }, { "epoch": 0.2098, "grad_norm": 0.19033282995224, "learning_rate": 1.80404e-05, "loss": 0.2611, "step": 454900 }, { "epoch": 0.21, "grad_norm": 0.1746765375137329, "learning_rate": 1.80004e-05, "loss": 0.2456, "step": 455000 } ], "logging_steps": 100, "max_steps": 500000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.8875100438528e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }