{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.41, "eval_steps": 500, "global_step": 325000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002, "grad_norm": 1.603668212890625, "learning_rate": 0.0001999604, "loss": 2.0926, "step": 100 }, { "epoch": 0.0004, "grad_norm": 1.1335899829864502, "learning_rate": 0.0001999204, "loss": 1.5915, "step": 200 }, { "epoch": 0.0006, "grad_norm": 0.9146980047225952, "learning_rate": 0.00019988040000000002, "loss": 1.413, "step": 300 }, { "epoch": 0.0008, "grad_norm": 1.1228628158569336, "learning_rate": 0.0001998404, "loss": 1.3208, "step": 400 }, { "epoch": 0.001, "grad_norm": 0.623985230922699, "learning_rate": 0.0001998004, "loss": 1.2557, "step": 500 }, { "epoch": 0.0012, "grad_norm": 0.7038842439651489, "learning_rate": 0.0001997604, "loss": 1.1387, "step": 600 }, { "epoch": 0.0014, "grad_norm": 0.820369303226471, "learning_rate": 0.00019972040000000002, "loss": 1.1653, "step": 700 }, { "epoch": 0.0016, "grad_norm": 0.6667912006378174, "learning_rate": 0.0001996804, "loss": 1.1342, "step": 800 }, { "epoch": 0.0018, "grad_norm": 0.616651713848114, "learning_rate": 0.0001996404, "loss": 1.1548, "step": 900 }, { "epoch": 0.002, "grad_norm": 0.5994811058044434, "learning_rate": 0.0001996004, "loss": 1.0464, "step": 1000 }, { "epoch": 0.0022, "grad_norm": 0.5694602131843567, "learning_rate": 0.0001995604, "loss": 1.1394, "step": 1100 }, { "epoch": 0.0024, "grad_norm": 0.5835832357406616, "learning_rate": 0.00019952040000000002, "loss": 1.0209, "step": 1200 }, { "epoch": 0.0026, "grad_norm": 1.4923994541168213, "learning_rate": 0.0001994804, "loss": 1.0708, "step": 1300 }, { "epoch": 0.0028, "grad_norm": 0.669029712677002, "learning_rate": 0.00019944040000000003, "loss": 0.9359, "step": 1400 }, { "epoch": 0.003, "grad_norm": 1.1236016750335693, "learning_rate": 0.00019940040000000002, "loss": 0.9963, "step": 1500 }, { "epoch": 0.0032, "grad_norm": 0.7666842341423035, "learning_rate": 0.0001993604, "loss": 1.0306, "step": 1600 }, { "epoch": 0.0034, "grad_norm": 1.5545086860656738, "learning_rate": 0.0001993204, "loss": 0.9544, "step": 1700 }, { "epoch": 0.0036, "grad_norm": 0.7571123242378235, "learning_rate": 0.0001992804, "loss": 0.9473, "step": 1800 }, { "epoch": 0.0038, "grad_norm": 0.5189706683158875, "learning_rate": 0.00019924040000000002, "loss": 0.9743, "step": 1900 }, { "epoch": 0.004, "grad_norm": 0.9225242137908936, "learning_rate": 0.00019920040000000002, "loss": 0.9029, "step": 2000 }, { "epoch": 0.0042, "grad_norm": 0.6427626609802246, "learning_rate": 0.0001991604, "loss": 0.9111, "step": 2100 }, { "epoch": 0.0044, "grad_norm": 0.5414828658103943, "learning_rate": 0.0001991204, "loss": 0.9591, "step": 2200 }, { "epoch": 0.0046, "grad_norm": 0.5731277465820312, "learning_rate": 0.0001990804, "loss": 0.9327, "step": 2300 }, { "epoch": 0.0048, "grad_norm": 0.6296040415763855, "learning_rate": 0.00019904040000000002, "loss": 0.8148, "step": 2400 }, { "epoch": 0.005, "grad_norm": 0.5396831631660461, "learning_rate": 0.0001990004, "loss": 0.8056, "step": 2500 }, { "epoch": 0.0052, "grad_norm": 0.5632336735725403, "learning_rate": 0.0001989604, "loss": 0.8224, "step": 2600 }, { "epoch": 0.0054, "grad_norm": 0.7668570280075073, "learning_rate": 0.0001989204, "loss": 0.8137, "step": 2700 }, { "epoch": 0.0056, "grad_norm": 0.5585981607437134, "learning_rate": 0.00019888040000000002, "loss": 0.8658, "step": 2800 }, { "epoch": 0.0058, "grad_norm": 0.5207762122154236, "learning_rate": 0.00019884040000000001, "loss": 0.796, "step": 2900 }, { "epoch": 0.006, "grad_norm": 0.6384051442146301, "learning_rate": 0.0001988004, "loss": 0.979, "step": 3000 }, { "epoch": 0.0062, "grad_norm": 0.4824763834476471, "learning_rate": 0.0001987604, "loss": 0.7749, "step": 3100 }, { "epoch": 0.0064, "grad_norm": 0.5262443423271179, "learning_rate": 0.0001987204, "loss": 0.7486, "step": 3200 }, { "epoch": 0.0066, "grad_norm": 3.371579170227051, "learning_rate": 0.00019868040000000002, "loss": 0.8155, "step": 3300 }, { "epoch": 0.0068, "grad_norm": 0.46106863021850586, "learning_rate": 0.0001986404, "loss": 0.7683, "step": 3400 }, { "epoch": 0.007, "grad_norm": 0.5194998979568481, "learning_rate": 0.00019860040000000003, "loss": 0.7919, "step": 3500 }, { "epoch": 0.0072, "grad_norm": 0.5358483195304871, "learning_rate": 0.0001985604, "loss": 0.7655, "step": 3600 }, { "epoch": 0.0074, "grad_norm": 0.4279918372631073, "learning_rate": 0.00019852040000000002, "loss": 0.7611, "step": 3700 }, { "epoch": 0.0076, "grad_norm": 0.3322402834892273, "learning_rate": 0.0001984804, "loss": 0.8131, "step": 3800 }, { "epoch": 0.0078, "grad_norm": 0.5196714401245117, "learning_rate": 0.0001984404, "loss": 0.7348, "step": 3900 }, { "epoch": 0.008, "grad_norm": 0.6269809603691101, "learning_rate": 0.00019840040000000003, "loss": 0.7274, "step": 4000 }, { "epoch": 0.0082, "grad_norm": 0.7291073203086853, "learning_rate": 0.0001983604, "loss": 0.8001, "step": 4100 }, { "epoch": 0.0084, "grad_norm": 1.0613459348678589, "learning_rate": 0.0001983204, "loss": 0.7169, "step": 4200 }, { "epoch": 0.0086, "grad_norm": 0.46023017168045044, "learning_rate": 0.0001982804, "loss": 0.7456, "step": 4300 }, { "epoch": 0.0088, "grad_norm": 0.3527376651763916, "learning_rate": 0.00019824040000000003, "loss": 0.699, "step": 4400 }, { "epoch": 0.009, "grad_norm": 0.5758853554725647, "learning_rate": 0.00019820040000000002, "loss": 0.7226, "step": 4500 }, { "epoch": 0.0092, "grad_norm": 0.500385582447052, "learning_rate": 0.00019816040000000001, "loss": 0.753, "step": 4600 }, { "epoch": 0.0094, "grad_norm": 0.8873578906059265, "learning_rate": 0.0001981204, "loss": 0.7047, "step": 4700 }, { "epoch": 0.0096, "grad_norm": 0.6691761612892151, "learning_rate": 0.0001980804, "loss": 0.7139, "step": 4800 }, { "epoch": 0.0098, "grad_norm": 0.593289852142334, "learning_rate": 0.00019804040000000002, "loss": 0.7075, "step": 4900 }, { "epoch": 0.01, "grad_norm": 0.7012218236923218, "learning_rate": 0.00019800040000000002, "loss": 0.6878, "step": 5000 }, { "epoch": 0.0102, "grad_norm": 0.5074142217636108, "learning_rate": 0.0001979604, "loss": 0.7574, "step": 5100 }, { "epoch": 0.0104, "grad_norm": 0.9699320197105408, "learning_rate": 0.0001979204, "loss": 0.6619, "step": 5200 }, { "epoch": 0.0106, "grad_norm": 0.44478604197502136, "learning_rate": 0.0001978804, "loss": 0.7029, "step": 5300 }, { "epoch": 0.0108, "grad_norm": 0.43934938311576843, "learning_rate": 0.00019784040000000002, "loss": 0.6991, "step": 5400 }, { "epoch": 0.011, "grad_norm": 0.4505891501903534, "learning_rate": 0.0001978004, "loss": 0.7019, "step": 5500 }, { "epoch": 0.0112, "grad_norm": 0.40732213854789734, "learning_rate": 0.0001977604, "loss": 0.7085, "step": 5600 }, { "epoch": 0.0114, "grad_norm": 0.5004173517227173, "learning_rate": 0.0001977204, "loss": 0.6757, "step": 5700 }, { "epoch": 0.0116, "grad_norm": 0.5011471509933472, "learning_rate": 0.00019768040000000002, "loss": 0.6752, "step": 5800 }, { "epoch": 0.0118, "grad_norm": 0.3992891311645508, "learning_rate": 0.0001976404, "loss": 0.6733, "step": 5900 }, { "epoch": 0.012, "grad_norm": 0.3974108099937439, "learning_rate": 0.0001976004, "loss": 0.6737, "step": 6000 }, { "epoch": 0.0122, "grad_norm": 0.4213123917579651, "learning_rate": 0.0001975604, "loss": 0.7769, "step": 6100 }, { "epoch": 0.0124, "grad_norm": 0.4016417860984802, "learning_rate": 0.0001975204, "loss": 0.6731, "step": 6200 }, { "epoch": 0.0126, "grad_norm": 0.45475897192955017, "learning_rate": 0.00019748040000000001, "loss": 0.6566, "step": 6300 }, { "epoch": 0.0128, "grad_norm": 0.298532098531723, "learning_rate": 0.0001974404, "loss": 0.6221, "step": 6400 }, { "epoch": 0.013, "grad_norm": 0.36682987213134766, "learning_rate": 0.00019740040000000003, "loss": 0.6402, "step": 6500 }, { "epoch": 0.0132, "grad_norm": 0.35242322087287903, "learning_rate": 0.0001973604, "loss": 0.6069, "step": 6600 }, { "epoch": 0.0134, "grad_norm": 0.49721652269363403, "learning_rate": 0.00019732040000000002, "loss": 0.6847, "step": 6700 }, { "epoch": 0.0136, "grad_norm": 0.371697336435318, "learning_rate": 0.0001972804, "loss": 0.617, "step": 6800 }, { "epoch": 0.0138, "grad_norm": 0.498140424489975, "learning_rate": 0.0001972404, "loss": 0.7371, "step": 6900 }, { "epoch": 0.014, "grad_norm": 0.4353884160518646, "learning_rate": 0.00019720040000000002, "loss": 0.6746, "step": 7000 }, { "epoch": 0.0142, "grad_norm": 0.47410452365875244, "learning_rate": 0.0001971604, "loss": 0.6462, "step": 7100 }, { "epoch": 0.0144, "grad_norm": 0.558781087398529, "learning_rate": 0.0001971204, "loss": 0.6459, "step": 7200 }, { "epoch": 0.0146, "grad_norm": 0.41540494561195374, "learning_rate": 0.0001970804, "loss": 0.6269, "step": 7300 }, { "epoch": 0.0148, "grad_norm": 0.5316945910453796, "learning_rate": 0.00019704040000000003, "loss": 0.6097, "step": 7400 }, { "epoch": 0.015, "grad_norm": 0.5045424103736877, "learning_rate": 0.00019700040000000002, "loss": 0.6496, "step": 7500 }, { "epoch": 0.0152, "grad_norm": 0.3779171407222748, "learning_rate": 0.0001969604, "loss": 0.632, "step": 7600 }, { "epoch": 0.0154, "grad_norm": 0.31508123874664307, "learning_rate": 0.0001969204, "loss": 0.595, "step": 7700 }, { "epoch": 0.0156, "grad_norm": 0.5666154623031616, "learning_rate": 0.0001968804, "loss": 0.6193, "step": 7800 }, { "epoch": 0.0158, "grad_norm": 0.36079517006874084, "learning_rate": 0.00019684040000000002, "loss": 0.6154, "step": 7900 }, { "epoch": 0.016, "grad_norm": 0.716064989566803, "learning_rate": 0.00019680040000000001, "loss": 0.6018, "step": 8000 }, { "epoch": 0.0162, "grad_norm": 0.3534316122531891, "learning_rate": 0.0001967604, "loss": 0.6298, "step": 8100 }, { "epoch": 0.0164, "grad_norm": 0.31048017740249634, "learning_rate": 0.0001967204, "loss": 0.6118, "step": 8200 }, { "epoch": 0.0166, "grad_norm": 0.7001683115959167, "learning_rate": 0.00019668040000000002, "loss": 0.6154, "step": 8300 }, { "epoch": 0.0168, "grad_norm": 0.577167809009552, "learning_rate": 0.00019664040000000002, "loss": 0.6081, "step": 8400 }, { "epoch": 0.017, "grad_norm": 0.48630014061927795, "learning_rate": 0.0001966004, "loss": 0.6225, "step": 8500 }, { "epoch": 0.0172, "grad_norm": 0.3509364426136017, "learning_rate": 0.0001965604, "loss": 0.6283, "step": 8600 }, { "epoch": 0.0174, "grad_norm": 0.5807685256004333, "learning_rate": 0.0001965204, "loss": 0.5904, "step": 8700 }, { "epoch": 0.0176, "grad_norm": 0.40310025215148926, "learning_rate": 0.00019648040000000002, "loss": 0.6043, "step": 8800 }, { "epoch": 0.0178, "grad_norm": 0.4283163845539093, "learning_rate": 0.0001964404, "loss": 0.5648, "step": 8900 }, { "epoch": 0.018, "grad_norm": 0.33734825253486633, "learning_rate": 0.0001964004, "loss": 0.5728, "step": 9000 }, { "epoch": 0.0182, "grad_norm": 0.2931211292743683, "learning_rate": 0.0001963604, "loss": 0.5937, "step": 9100 }, { "epoch": 0.0184, "grad_norm": 0.4612201154232025, "learning_rate": 0.0001963204, "loss": 0.5934, "step": 9200 }, { "epoch": 0.0186, "grad_norm": 0.3644779622554779, "learning_rate": 0.0001962804, "loss": 0.5766, "step": 9300 }, { "epoch": 0.0188, "grad_norm": 0.30560925602912903, "learning_rate": 0.0001962404, "loss": 0.5464, "step": 9400 }, { "epoch": 0.019, "grad_norm": 0.6216826438903809, "learning_rate": 0.00019620040000000003, "loss": 0.6206, "step": 9500 }, { "epoch": 0.0192, "grad_norm": 0.3877085745334625, "learning_rate": 0.0001961604, "loss": 0.5825, "step": 9600 }, { "epoch": 0.0194, "grad_norm": 0.4004894495010376, "learning_rate": 0.00019612040000000001, "loss": 0.5918, "step": 9700 }, { "epoch": 0.0196, "grad_norm": 0.49157753586769104, "learning_rate": 0.0001960804, "loss": 0.5891, "step": 9800 }, { "epoch": 0.0198, "grad_norm": 1.031232237815857, "learning_rate": 0.0001960404, "loss": 0.5748, "step": 9900 }, { "epoch": 0.02, "grad_norm": 0.36149775981903076, "learning_rate": 0.00019600040000000002, "loss": 0.5844, "step": 10000 }, { "epoch": 0.0202, "grad_norm": 0.29936593770980835, "learning_rate": 0.0001959604, "loss": 0.5687, "step": 10100 }, { "epoch": 0.0204, "grad_norm": 4.51532506942749, "learning_rate": 0.0001959204, "loss": 0.6077, "step": 10200 }, { "epoch": 0.0206, "grad_norm": 0.44001829624176025, "learning_rate": 0.0001958804, "loss": 0.6138, "step": 10300 }, { "epoch": 0.0208, "grad_norm": 0.26877814531326294, "learning_rate": 0.00019584040000000002, "loss": 0.5787, "step": 10400 }, { "epoch": 0.021, "grad_norm": 0.441687673330307, "learning_rate": 0.00019580040000000002, "loss": 0.5353, "step": 10500 }, { "epoch": 0.0212, "grad_norm": 0.4635355472564697, "learning_rate": 0.0001957604, "loss": 0.5686, "step": 10600 }, { "epoch": 0.0214, "grad_norm": 0.3567267060279846, "learning_rate": 0.0001957204, "loss": 0.5363, "step": 10700 }, { "epoch": 0.0216, "grad_norm": 0.5282173156738281, "learning_rate": 0.0001956804, "loss": 0.582, "step": 10800 }, { "epoch": 0.0218, "grad_norm": 0.32945743203163147, "learning_rate": 0.00019564040000000002, "loss": 0.5695, "step": 10900 }, { "epoch": 0.022, "grad_norm": 1.3247750997543335, "learning_rate": 0.0001956004, "loss": 0.5665, "step": 11000 }, { "epoch": 0.0222, "grad_norm": 0.3688965439796448, "learning_rate": 0.0001955604, "loss": 0.5835, "step": 11100 }, { "epoch": 0.0224, "grad_norm": 0.4591335356235504, "learning_rate": 0.0001955204, "loss": 0.5638, "step": 11200 }, { "epoch": 0.0226, "grad_norm": 0.4654744267463684, "learning_rate": 0.00019548040000000002, "loss": 0.5821, "step": 11300 }, { "epoch": 0.0228, "grad_norm": 1.729272484779358, "learning_rate": 0.0001954404, "loss": 0.6013, "step": 11400 }, { "epoch": 0.023, "grad_norm": 0.3169005513191223, "learning_rate": 0.0001954004, "loss": 0.5678, "step": 11500 }, { "epoch": 0.0232, "grad_norm": 0.4935179650783539, "learning_rate": 0.0001953604, "loss": 0.5517, "step": 11600 }, { "epoch": 0.0234, "grad_norm": 0.3395957350730896, "learning_rate": 0.0001953204, "loss": 0.5387, "step": 11700 }, { "epoch": 0.0236, "grad_norm": 0.4479553997516632, "learning_rate": 0.00019528040000000001, "loss": 0.5748, "step": 11800 }, { "epoch": 0.0238, "grad_norm": 0.36801132559776306, "learning_rate": 0.0001952404, "loss": 0.6047, "step": 11900 }, { "epoch": 0.024, "grad_norm": 0.5720965266227722, "learning_rate": 0.00019520040000000003, "loss": 0.5458, "step": 12000 }, { "epoch": 0.0242, "grad_norm": 0.37826624512672424, "learning_rate": 0.0001951604, "loss": 0.5512, "step": 12100 }, { "epoch": 0.0244, "grad_norm": 0.39435476064682007, "learning_rate": 0.00019512040000000002, "loss": 0.5388, "step": 12200 }, { "epoch": 0.0246, "grad_norm": 0.4711388349533081, "learning_rate": 0.0001950804, "loss": 0.535, "step": 12300 }, { "epoch": 0.0248, "grad_norm": 0.38097721338272095, "learning_rate": 0.0001950404, "loss": 0.5675, "step": 12400 }, { "epoch": 0.025, "grad_norm": 0.3961537778377533, "learning_rate": 0.00019500040000000002, "loss": 0.5727, "step": 12500 }, { "epoch": 0.0252, "grad_norm": 0.5444319248199463, "learning_rate": 0.0001949604, "loss": 0.5879, "step": 12600 }, { "epoch": 0.0254, "grad_norm": 0.5849190354347229, "learning_rate": 0.0001949204, "loss": 0.5635, "step": 12700 }, { "epoch": 0.0256, "grad_norm": 0.6113494038581848, "learning_rate": 0.0001948804, "loss": 0.5467, "step": 12800 }, { "epoch": 0.0258, "grad_norm": 0.4276942312717438, "learning_rate": 0.0001948404, "loss": 0.5554, "step": 12900 }, { "epoch": 0.026, "grad_norm": 0.48870015144348145, "learning_rate": 0.00019480040000000002, "loss": 0.5986, "step": 13000 }, { "epoch": 0.0262, "grad_norm": 0.27192604541778564, "learning_rate": 0.00019476039999999999, "loss": 0.5551, "step": 13100 }, { "epoch": 0.0264, "grad_norm": 0.6044579148292542, "learning_rate": 0.0001947204, "loss": 0.5477, "step": 13200 }, { "epoch": 0.0266, "grad_norm": 0.4254133105278015, "learning_rate": 0.0001946804, "loss": 0.547, "step": 13300 }, { "epoch": 0.0268, "grad_norm": 1.369423747062683, "learning_rate": 0.00019464040000000002, "loss": 0.5492, "step": 13400 }, { "epoch": 0.027, "grad_norm": 0.38343751430511475, "learning_rate": 0.00019460040000000001, "loss": 0.5275, "step": 13500 }, { "epoch": 0.0272, "grad_norm": 0.45244088768959045, "learning_rate": 0.0001945604, "loss": 0.4827, "step": 13600 }, { "epoch": 0.0274, "grad_norm": 0.3227088451385498, "learning_rate": 0.0001945204, "loss": 0.4863, "step": 13700 }, { "epoch": 0.0276, "grad_norm": 0.45713531970977783, "learning_rate": 0.0001944804, "loss": 0.4933, "step": 13800 }, { "epoch": 0.0278, "grad_norm": 0.42994385957717896, "learning_rate": 0.00019444040000000002, "loss": 0.484, "step": 13900 }, { "epoch": 0.028, "grad_norm": 0.3284667134284973, "learning_rate": 0.0001944004, "loss": 0.4956, "step": 14000 }, { "epoch": 0.0282, "grad_norm": 0.44729650020599365, "learning_rate": 0.00019436040000000003, "loss": 0.4595, "step": 14100 }, { "epoch": 0.0284, "grad_norm": 0.31246569752693176, "learning_rate": 0.0001943204, "loss": 0.4796, "step": 14200 }, { "epoch": 0.0286, "grad_norm": 0.46601220965385437, "learning_rate": 0.00019428040000000002, "loss": 0.4956, "step": 14300 }, { "epoch": 0.0288, "grad_norm": 0.33735036849975586, "learning_rate": 0.0001942404, "loss": 0.4689, "step": 14400 }, { "epoch": 0.029, "grad_norm": 0.44884735345840454, "learning_rate": 0.0001942004, "loss": 0.4913, "step": 14500 }, { "epoch": 0.0292, "grad_norm": 1.72175133228302, "learning_rate": 0.00019416040000000003, "loss": 0.4932, "step": 14600 }, { "epoch": 0.0294, "grad_norm": 0.8482571244239807, "learning_rate": 0.0001941204, "loss": 0.4834, "step": 14700 }, { "epoch": 0.0296, "grad_norm": 0.35360008478164673, "learning_rate": 0.0001940804, "loss": 0.498, "step": 14800 }, { "epoch": 0.0298, "grad_norm": 0.32927367091178894, "learning_rate": 0.0001940404, "loss": 0.4824, "step": 14900 }, { "epoch": 0.03, "grad_norm": 0.4251364469528198, "learning_rate": 0.00019400040000000003, "loss": 0.4941, "step": 15000 }, { "epoch": 0.0302, "grad_norm": 0.3267940878868103, "learning_rate": 0.00019396040000000002, "loss": 0.4943, "step": 15100 }, { "epoch": 0.0304, "grad_norm": 0.29757174849510193, "learning_rate": 0.00019392040000000001, "loss": 0.4856, "step": 15200 }, { "epoch": 0.0306, "grad_norm": 0.4625142216682434, "learning_rate": 0.0001938804, "loss": 0.4759, "step": 15300 }, { "epoch": 0.0308, "grad_norm": 0.3954981565475464, "learning_rate": 0.0001938404, "loss": 0.4731, "step": 15400 }, { "epoch": 0.031, "grad_norm": 0.42979204654693604, "learning_rate": 0.00019380040000000002, "loss": 0.518, "step": 15500 }, { "epoch": 0.0312, "grad_norm": 0.5260553956031799, "learning_rate": 0.00019376040000000002, "loss": 0.4934, "step": 15600 }, { "epoch": 0.0314, "grad_norm": 0.6792057156562805, "learning_rate": 0.0001937204, "loss": 0.478, "step": 15700 }, { "epoch": 0.0316, "grad_norm": 0.3626260459423065, "learning_rate": 0.0001936804, "loss": 0.4859, "step": 15800 }, { "epoch": 0.0318, "grad_norm": 0.6549792885780334, "learning_rate": 0.00019364040000000002, "loss": 0.4843, "step": 15900 }, { "epoch": 0.032, "grad_norm": 0.40819457173347473, "learning_rate": 0.00019360040000000002, "loss": 0.4876, "step": 16000 }, { "epoch": 0.0322, "grad_norm": 0.46695244312286377, "learning_rate": 0.0001935604, "loss": 0.493, "step": 16100 }, { "epoch": 0.0324, "grad_norm": 0.29340672492980957, "learning_rate": 0.0001935204, "loss": 0.4822, "step": 16200 }, { "epoch": 0.0326, "grad_norm": 0.4895103871822357, "learning_rate": 0.0001934804, "loss": 0.4775, "step": 16300 }, { "epoch": 0.0328, "grad_norm": 0.38141900300979614, "learning_rate": 0.00019344040000000002, "loss": 0.5175, "step": 16400 }, { "epoch": 0.033, "grad_norm": 0.2875644862651825, "learning_rate": 0.0001934004, "loss": 0.5004, "step": 16500 }, { "epoch": 0.0332, "grad_norm": 1.0335603952407837, "learning_rate": 0.0001933604, "loss": 0.4917, "step": 16600 }, { "epoch": 0.0334, "grad_norm": 0.3046559691429138, "learning_rate": 0.0001933204, "loss": 0.5966, "step": 16700 }, { "epoch": 0.0336, "grad_norm": 0.3726640045642853, "learning_rate": 0.0001932804, "loss": 0.4882, "step": 16800 }, { "epoch": 0.0338, "grad_norm": 0.3167787194252014, "learning_rate": 0.00019324040000000001, "loss": 0.4979, "step": 16900 }, { "epoch": 0.034, "grad_norm": 0.23184147477149963, "learning_rate": 0.0001932004, "loss": 0.4873, "step": 17000 }, { "epoch": 0.0342, "grad_norm": 0.27494415640830994, "learning_rate": 0.00019316040000000003, "loss": 0.4735, "step": 17100 }, { "epoch": 0.0344, "grad_norm": 2.1850268840789795, "learning_rate": 0.0001931204, "loss": 0.4706, "step": 17200 }, { "epoch": 0.0346, "grad_norm": 0.6089339852333069, "learning_rate": 0.00019308040000000002, "loss": 0.4736, "step": 17300 }, { "epoch": 0.0348, "grad_norm": 0.33707159757614136, "learning_rate": 0.0001930404, "loss": 0.4639, "step": 17400 }, { "epoch": 0.035, "grad_norm": 0.3079136610031128, "learning_rate": 0.0001930004, "loss": 0.4576, "step": 17500 }, { "epoch": 0.0352, "grad_norm": 0.9146600961685181, "learning_rate": 0.00019296040000000002, "loss": 0.4713, "step": 17600 }, { "epoch": 0.0354, "grad_norm": 0.3352242708206177, "learning_rate": 0.0001929204, "loss": 0.4628, "step": 17700 }, { "epoch": 0.0356, "grad_norm": 0.3531840443611145, "learning_rate": 0.0001928804, "loss": 0.4814, "step": 17800 }, { "epoch": 0.0358, "grad_norm": 0.3614370822906494, "learning_rate": 0.0001928404, "loss": 0.4786, "step": 17900 }, { "epoch": 0.036, "grad_norm": 0.3174838721752167, "learning_rate": 0.00019280040000000003, "loss": 0.4618, "step": 18000 }, { "epoch": 0.0362, "grad_norm": 0.5685095191001892, "learning_rate": 0.00019276040000000002, "loss": 0.4678, "step": 18100 }, { "epoch": 0.0364, "grad_norm": 0.35674697160720825, "learning_rate": 0.0001927204, "loss": 0.4709, "step": 18200 }, { "epoch": 0.0366, "grad_norm": 0.3839879631996155, "learning_rate": 0.0001926804, "loss": 0.4709, "step": 18300 }, { "epoch": 0.0368, "grad_norm": 0.401214599609375, "learning_rate": 0.0001926404, "loss": 0.4716, "step": 18400 }, { "epoch": 0.037, "grad_norm": 0.4122023582458496, "learning_rate": 0.00019260040000000002, "loss": 0.481, "step": 18500 }, { "epoch": 0.0372, "grad_norm": 0.42760711908340454, "learning_rate": 0.00019256040000000001, "loss": 0.4534, "step": 18600 }, { "epoch": 0.0374, "grad_norm": 4.317193508148193, "learning_rate": 0.0001925204, "loss": 0.4838, "step": 18700 }, { "epoch": 0.0376, "grad_norm": 0.3325408697128296, "learning_rate": 0.0001924804, "loss": 0.4756, "step": 18800 }, { "epoch": 0.0378, "grad_norm": 0.4391893744468689, "learning_rate": 0.00019244040000000002, "loss": 0.4625, "step": 18900 }, { "epoch": 0.038, "grad_norm": 0.3271416127681732, "learning_rate": 0.00019240040000000002, "loss": 0.452, "step": 19000 }, { "epoch": 0.0382, "grad_norm": 0.2682945132255554, "learning_rate": 0.0001923604, "loss": 0.4677, "step": 19100 }, { "epoch": 0.0384, "grad_norm": 0.2958075702190399, "learning_rate": 0.0001923204, "loss": 0.4715, "step": 19200 }, { "epoch": 0.0386, "grad_norm": 0.6007779240608215, "learning_rate": 0.0001922804, "loss": 0.4581, "step": 19300 }, { "epoch": 0.0388, "grad_norm": 0.40477871894836426, "learning_rate": 0.00019224040000000002, "loss": 0.4675, "step": 19400 }, { "epoch": 0.039, "grad_norm": 0.44704627990722656, "learning_rate": 0.0001922004, "loss": 0.4846, "step": 19500 }, { "epoch": 0.0392, "grad_norm": 0.4317639172077179, "learning_rate": 0.00019216040000000003, "loss": 0.4681, "step": 19600 }, { "epoch": 0.0394, "grad_norm": 0.274308443069458, "learning_rate": 0.0001921204, "loss": 0.4577, "step": 19700 }, { "epoch": 0.0396, "grad_norm": 0.3873860239982605, "learning_rate": 0.0001920804, "loss": 0.4505, "step": 19800 }, { "epoch": 0.0398, "grad_norm": 0.4344724714756012, "learning_rate": 0.0001920404, "loss": 0.4531, "step": 19900 }, { "epoch": 0.04, "grad_norm": 0.3240549564361572, "learning_rate": 0.0001920004, "loss": 0.4616, "step": 20000 }, { "epoch": 0.0402, "grad_norm": 0.3235743045806885, "learning_rate": 0.00019196040000000003, "loss": 0.4435, "step": 20100 }, { "epoch": 0.0404, "grad_norm": 0.6763646602630615, "learning_rate": 0.0001919204, "loss": 0.4806, "step": 20200 }, { "epoch": 0.0406, "grad_norm": 0.3265356123447418, "learning_rate": 0.00019188040000000001, "loss": 0.4674, "step": 20300 }, { "epoch": 0.0408, "grad_norm": 0.41400811076164246, "learning_rate": 0.0001918404, "loss": 0.4687, "step": 20400 }, { "epoch": 0.041, "grad_norm": 1.1848185062408447, "learning_rate": 0.0001918004, "loss": 0.4602, "step": 20500 }, { "epoch": 0.0412, "grad_norm": 0.4341808259487152, "learning_rate": 0.00019176040000000002, "loss": 0.4559, "step": 20600 }, { "epoch": 0.0414, "grad_norm": 0.40686967968940735, "learning_rate": 0.0001917204, "loss": 0.4408, "step": 20700 }, { "epoch": 0.0416, "grad_norm": 0.28525295853614807, "learning_rate": 0.0001916804, "loss": 0.4841, "step": 20800 }, { "epoch": 0.0418, "grad_norm": 0.29934221506118774, "learning_rate": 0.0001916404, "loss": 0.4562, "step": 20900 }, { "epoch": 0.042, "grad_norm": 0.6304417848587036, "learning_rate": 0.00019160040000000002, "loss": 0.4609, "step": 21000 }, { "epoch": 0.0422, "grad_norm": 0.2794235348701477, "learning_rate": 0.00019156040000000002, "loss": 0.4578, "step": 21100 }, { "epoch": 0.0424, "grad_norm": 0.9360523223876953, "learning_rate": 0.0001915204, "loss": 0.4563, "step": 21200 }, { "epoch": 0.0426, "grad_norm": 0.3164953291416168, "learning_rate": 0.0001914804, "loss": 0.4563, "step": 21300 }, { "epoch": 0.0428, "grad_norm": 0.2829827070236206, "learning_rate": 0.0001914404, "loss": 0.4509, "step": 21400 }, { "epoch": 0.043, "grad_norm": 0.3804665803909302, "learning_rate": 0.00019140040000000002, "loss": 0.4523, "step": 21500 }, { "epoch": 0.0432, "grad_norm": 0.2858908176422119, "learning_rate": 0.0001913604, "loss": 0.4669, "step": 21600 }, { "epoch": 0.0434, "grad_norm": 0.29328569769859314, "learning_rate": 0.0001913204, "loss": 0.4571, "step": 21700 }, { "epoch": 0.0436, "grad_norm": 0.3529110252857208, "learning_rate": 0.0001912804, "loss": 0.4546, "step": 21800 }, { "epoch": 0.0438, "grad_norm": 0.2541522681713104, "learning_rate": 0.00019124040000000002, "loss": 0.4618, "step": 21900 }, { "epoch": 0.044, "grad_norm": 0.3136732280254364, "learning_rate": 0.0001912004, "loss": 0.4579, "step": 22000 }, { "epoch": 0.0442, "grad_norm": 0.35282132029533386, "learning_rate": 0.0001911604, "loss": 0.4801, "step": 22100 }, { "epoch": 0.0444, "grad_norm": 0.32009491324424744, "learning_rate": 0.0001911204, "loss": 0.4422, "step": 22200 }, { "epoch": 0.0446, "grad_norm": 0.25222790241241455, "learning_rate": 0.0001910804, "loss": 0.5024, "step": 22300 }, { "epoch": 0.0448, "grad_norm": 0.24487198889255524, "learning_rate": 0.00019104040000000001, "loss": 0.44, "step": 22400 }, { "epoch": 0.045, "grad_norm": 0.25609472393989563, "learning_rate": 0.0001910004, "loss": 0.4403, "step": 22500 }, { "epoch": 0.0452, "grad_norm": 0.24061603844165802, "learning_rate": 0.00019096040000000003, "loss": 0.434, "step": 22600 }, { "epoch": 0.0454, "grad_norm": 0.24821767210960388, "learning_rate": 0.0001909204, "loss": 0.4383, "step": 22700 }, { "epoch": 0.0456, "grad_norm": 0.3346046507358551, "learning_rate": 0.00019088040000000002, "loss": 0.4337, "step": 22800 }, { "epoch": 0.0458, "grad_norm": 0.4589807093143463, "learning_rate": 0.0001908404, "loss": 0.436, "step": 22900 }, { "epoch": 0.046, "grad_norm": 0.24371834099292755, "learning_rate": 0.0001908004, "loss": 0.4374, "step": 23000 }, { "epoch": 0.0462, "grad_norm": 0.30695632100105286, "learning_rate": 0.00019076040000000002, "loss": 0.4507, "step": 23100 }, { "epoch": 0.0464, "grad_norm": 0.3627791702747345, "learning_rate": 0.0001907204, "loss": 0.4853, "step": 23200 }, { "epoch": 0.0466, "grad_norm": 0.4218233525753021, "learning_rate": 0.0001906804, "loss": 0.4422, "step": 23300 }, { "epoch": 0.0468, "grad_norm": 0.3387954831123352, "learning_rate": 0.0001906404, "loss": 0.4443, "step": 23400 }, { "epoch": 0.047, "grad_norm": 0.3261500597000122, "learning_rate": 0.0001906004, "loss": 0.4438, "step": 23500 }, { "epoch": 0.0472, "grad_norm": 0.27388980984687805, "learning_rate": 0.00019056040000000002, "loss": 0.4383, "step": 23600 }, { "epoch": 0.0474, "grad_norm": 0.2767557203769684, "learning_rate": 0.00019052039999999999, "loss": 0.444, "step": 23700 }, { "epoch": 0.0476, "grad_norm": 0.37184062600135803, "learning_rate": 0.0001904804, "loss": 0.443, "step": 23800 }, { "epoch": 0.0478, "grad_norm": 0.33180898427963257, "learning_rate": 0.0001904404, "loss": 0.4601, "step": 23900 }, { "epoch": 0.048, "grad_norm": 0.3857232630252838, "learning_rate": 0.00019040040000000002, "loss": 0.4645, "step": 24000 }, { "epoch": 0.0482, "grad_norm": 0.4319915175437927, "learning_rate": 0.00019036040000000001, "loss": 0.4464, "step": 24100 }, { "epoch": 0.0484, "grad_norm": 1.3287200927734375, "learning_rate": 0.0001903204, "loss": 0.4419, "step": 24200 }, { "epoch": 0.0486, "grad_norm": 0.2736344337463379, "learning_rate": 0.0001902804, "loss": 0.4541, "step": 24300 }, { "epoch": 0.0488, "grad_norm": 0.2820674777030945, "learning_rate": 0.0001902404, "loss": 0.4284, "step": 24400 }, { "epoch": 0.049, "grad_norm": 0.4350157678127289, "learning_rate": 0.00019020040000000002, "loss": 0.4487, "step": 24500 }, { "epoch": 0.0492, "grad_norm": 0.29447850584983826, "learning_rate": 0.0001901604, "loss": 0.4321, "step": 24600 }, { "epoch": 0.0494, "grad_norm": 0.5137728452682495, "learning_rate": 0.0001901204, "loss": 0.4482, "step": 24700 }, { "epoch": 0.0496, "grad_norm": 0.33691874146461487, "learning_rate": 0.0001900804, "loss": 0.4471, "step": 24800 }, { "epoch": 0.0498, "grad_norm": 0.4733399748802185, "learning_rate": 0.00019004040000000002, "loss": 0.4448, "step": 24900 }, { "epoch": 0.05, "grad_norm": 0.2643273174762726, "learning_rate": 0.0001900004, "loss": 0.4413, "step": 25000 }, { "epoch": 0.0502, "grad_norm": 0.33353281021118164, "learning_rate": 0.0001899604, "loss": 0.4519, "step": 25100 }, { "epoch": 0.0504, "grad_norm": 0.7628878355026245, "learning_rate": 0.0001899204, "loss": 0.4843, "step": 25200 }, { "epoch": 0.0506, "grad_norm": 0.33647382259368896, "learning_rate": 0.0001898804, "loss": 0.4402, "step": 25300 }, { "epoch": 0.0508, "grad_norm": 0.2926424741744995, "learning_rate": 0.0001898404, "loss": 0.4441, "step": 25400 }, { "epoch": 0.051, "grad_norm": 0.24321198463439941, "learning_rate": 0.0001898004, "loss": 0.4485, "step": 25500 }, { "epoch": 0.0512, "grad_norm": 0.2866199314594269, "learning_rate": 0.00018976040000000003, "loss": 0.4311, "step": 25600 }, { "epoch": 0.0514, "grad_norm": 0.27528929710388184, "learning_rate": 0.0001897204, "loss": 0.4478, "step": 25700 }, { "epoch": 0.0516, "grad_norm": 0.27291688323020935, "learning_rate": 0.00018968040000000001, "loss": 0.4504, "step": 25800 }, { "epoch": 0.0518, "grad_norm": 0.36198413372039795, "learning_rate": 0.0001896404, "loss": 0.4304, "step": 25900 }, { "epoch": 0.052, "grad_norm": 1.4908273220062256, "learning_rate": 0.0001896004, "loss": 0.4516, "step": 26000 }, { "epoch": 0.0522, "grad_norm": 0.36965224146842957, "learning_rate": 0.00018956040000000002, "loss": 0.4227, "step": 26100 }, { "epoch": 0.0524, "grad_norm": 0.3467487394809723, "learning_rate": 0.00018952040000000002, "loss": 0.4352, "step": 26200 }, { "epoch": 0.0526, "grad_norm": 0.5308062434196472, "learning_rate": 0.0001894804, "loss": 0.4636, "step": 26300 }, { "epoch": 0.0528, "grad_norm": 0.30632877349853516, "learning_rate": 0.0001894404, "loss": 0.4416, "step": 26400 }, { "epoch": 0.053, "grad_norm": 0.9236398339271545, "learning_rate": 0.00018940040000000002, "loss": 0.4302, "step": 26500 }, { "epoch": 0.0532, "grad_norm": 0.37798625230789185, "learning_rate": 0.00018936040000000002, "loss": 0.4479, "step": 26600 }, { "epoch": 0.0534, "grad_norm": 0.20882748067378998, "learning_rate": 0.0001893204, "loss": 0.4256, "step": 26700 }, { "epoch": 0.0536, "grad_norm": 0.3316884934902191, "learning_rate": 0.0001892804, "loss": 0.4281, "step": 26800 }, { "epoch": 0.0538, "grad_norm": 0.2426268756389618, "learning_rate": 0.0001892404, "loss": 0.4328, "step": 26900 }, { "epoch": 0.054, "grad_norm": 0.263194739818573, "learning_rate": 0.00018920040000000002, "loss": 0.4321, "step": 27000 }, { "epoch": 0.0542, "grad_norm": 0.2998006343841553, "learning_rate": 0.0001891604, "loss": 0.4322, "step": 27100 }, { "epoch": 0.0544, "grad_norm": 0.32721447944641113, "learning_rate": 0.00018912040000000003, "loss": 0.4483, "step": 27200 }, { "epoch": 0.0546, "grad_norm": 0.33025509119033813, "learning_rate": 0.0001890804, "loss": 0.4429, "step": 27300 }, { "epoch": 0.0548, "grad_norm": 0.2779247462749481, "learning_rate": 0.0001890404, "loss": 0.4243, "step": 27400 }, { "epoch": 0.055, "grad_norm": 0.4281350374221802, "learning_rate": 0.00018900040000000001, "loss": 0.43, "step": 27500 }, { "epoch": 0.0552, "grad_norm": 0.3283655345439911, "learning_rate": 0.0001889604, "loss": 0.4358, "step": 27600 }, { "epoch": 0.0554, "grad_norm": 0.4739823341369629, "learning_rate": 0.00018892040000000003, "loss": 0.4729, "step": 27700 }, { "epoch": 0.0556, "grad_norm": 0.5889720916748047, "learning_rate": 0.0001888804, "loss": 0.4436, "step": 27800 }, { "epoch": 0.0558, "grad_norm": 0.6123796105384827, "learning_rate": 0.00018884040000000002, "loss": 0.4222, "step": 27900 }, { "epoch": 0.056, "grad_norm": 0.24797767400741577, "learning_rate": 0.0001888004, "loss": 0.4527, "step": 28000 }, { "epoch": 0.0562, "grad_norm": 0.2619722783565521, "learning_rate": 0.0001887604, "loss": 0.4307, "step": 28100 }, { "epoch": 0.0564, "grad_norm": 0.309741348028183, "learning_rate": 0.00018872040000000002, "loss": 0.4343, "step": 28200 }, { "epoch": 0.0566, "grad_norm": 0.6912897825241089, "learning_rate": 0.0001886804, "loss": 0.4137, "step": 28300 }, { "epoch": 0.0568, "grad_norm": 0.40055686235427856, "learning_rate": 0.0001886404, "loss": 0.4395, "step": 28400 }, { "epoch": 0.057, "grad_norm": 0.19984565675258636, "learning_rate": 0.0001886004, "loss": 0.4418, "step": 28500 }, { "epoch": 0.0572, "grad_norm": 0.4430752694606781, "learning_rate": 0.00018856040000000003, "loss": 0.4269, "step": 28600 }, { "epoch": 0.0574, "grad_norm": 0.37985754013061523, "learning_rate": 0.00018852040000000002, "loss": 0.4349, "step": 28700 }, { "epoch": 0.0576, "grad_norm": 0.29130980372428894, "learning_rate": 0.0001884804, "loss": 0.4394, "step": 28800 }, { "epoch": 0.0578, "grad_norm": 0.5696423053741455, "learning_rate": 0.0001884404, "loss": 0.442, "step": 28900 }, { "epoch": 0.058, "grad_norm": 0.25018632411956787, "learning_rate": 0.0001884004, "loss": 0.4433, "step": 29000 }, { "epoch": 0.0582, "grad_norm": 0.2951875627040863, "learning_rate": 0.00018836040000000002, "loss": 0.4188, "step": 29100 }, { "epoch": 0.0584, "grad_norm": 0.23352210223674774, "learning_rate": 0.00018832040000000001, "loss": 0.4222, "step": 29200 }, { "epoch": 0.0586, "grad_norm": 0.3203904330730438, "learning_rate": 0.0001882804, "loss": 0.4261, "step": 29300 }, { "epoch": 0.0588, "grad_norm": 0.34108293056488037, "learning_rate": 0.0001882404, "loss": 0.4341, "step": 29400 }, { "epoch": 0.059, "grad_norm": 0.45176732540130615, "learning_rate": 0.00018820040000000002, "loss": 0.4667, "step": 29500 }, { "epoch": 0.0592, "grad_norm": 0.4928261637687683, "learning_rate": 0.00018816040000000002, "loss": 0.4546, "step": 29600 }, { "epoch": 0.0594, "grad_norm": 0.31960615515708923, "learning_rate": 0.0001881204, "loss": 0.4346, "step": 29700 }, { "epoch": 0.0596, "grad_norm": 0.2550900876522064, "learning_rate": 0.0001880804, "loss": 0.4181, "step": 29800 }, { "epoch": 0.0598, "grad_norm": 0.21990257501602173, "learning_rate": 0.0001880404, "loss": 0.4103, "step": 29900 }, { "epoch": 0.06, "grad_norm": 0.2592441439628601, "learning_rate": 0.00018800040000000002, "loss": 0.4092, "step": 30000 }, { "epoch": 0.0602, "grad_norm": 0.2601667046546936, "learning_rate": 0.0001879604, "loss": 0.3927, "step": 30100 }, { "epoch": 0.0604, "grad_norm": 0.24636943638324738, "learning_rate": 0.00018792040000000003, "loss": 0.3929, "step": 30200 }, { "epoch": 0.0606, "grad_norm": 0.3238808810710907, "learning_rate": 0.0001878804, "loss": 0.3894, "step": 30300 }, { "epoch": 0.0608, "grad_norm": 0.2241670787334442, "learning_rate": 0.00018784040000000002, "loss": 0.3862, "step": 30400 }, { "epoch": 0.061, "grad_norm": 0.26162776350975037, "learning_rate": 0.0001878004, "loss": 0.3836, "step": 30500 }, { "epoch": 0.0612, "grad_norm": 0.21369214355945587, "learning_rate": 0.0001877604, "loss": 0.3829, "step": 30600 }, { "epoch": 0.0614, "grad_norm": 0.311394602060318, "learning_rate": 0.00018772040000000003, "loss": 0.384, "step": 30700 }, { "epoch": 0.0616, "grad_norm": 0.2316521257162094, "learning_rate": 0.0001876804, "loss": 0.3934, "step": 30800 }, { "epoch": 0.0618, "grad_norm": 0.4280303120613098, "learning_rate": 0.00018764040000000001, "loss": 0.3983, "step": 30900 }, { "epoch": 0.062, "grad_norm": 0.26460182666778564, "learning_rate": 0.0001876004, "loss": 0.3815, "step": 31000 }, { "epoch": 0.0622, "grad_norm": 0.2175382673740387, "learning_rate": 0.0001875604, "loss": 0.393, "step": 31100 }, { "epoch": 0.0624, "grad_norm": 0.2641454041004181, "learning_rate": 0.00018752040000000002, "loss": 0.3982, "step": 31200 }, { "epoch": 0.0626, "grad_norm": 0.28961893916130066, "learning_rate": 0.0001874804, "loss": 0.393, "step": 31300 }, { "epoch": 0.0628, "grad_norm": 0.2512940764427185, "learning_rate": 0.0001874404, "loss": 0.3896, "step": 31400 }, { "epoch": 0.063, "grad_norm": 0.26094546914100647, "learning_rate": 0.0001874004, "loss": 0.3861, "step": 31500 }, { "epoch": 0.0632, "grad_norm": 1.4088988304138184, "learning_rate": 0.00018736040000000002, "loss": 0.3894, "step": 31600 }, { "epoch": 0.0634, "grad_norm": 0.21915282309055328, "learning_rate": 0.00018732040000000002, "loss": 0.3895, "step": 31700 }, { "epoch": 0.0636, "grad_norm": 0.282810240983963, "learning_rate": 0.0001872804, "loss": 0.3814, "step": 31800 }, { "epoch": 0.0638, "grad_norm": 0.24024060368537903, "learning_rate": 0.0001872404, "loss": 0.3802, "step": 31900 }, { "epoch": 0.064, "grad_norm": 0.254407674074173, "learning_rate": 0.0001872004, "loss": 0.3916, "step": 32000 }, { "epoch": 0.0642, "grad_norm": 0.22265967726707458, "learning_rate": 0.00018716040000000002, "loss": 0.3941, "step": 32100 }, { "epoch": 0.0644, "grad_norm": 0.2248506098985672, "learning_rate": 0.0001871204, "loss": 0.3878, "step": 32200 }, { "epoch": 0.0646, "grad_norm": 0.30514708161354065, "learning_rate": 0.0001870804, "loss": 0.3861, "step": 32300 }, { "epoch": 0.0648, "grad_norm": 0.24802802503108978, "learning_rate": 0.0001870404, "loss": 0.3802, "step": 32400 }, { "epoch": 0.065, "grad_norm": 0.25993114709854126, "learning_rate": 0.00018700040000000002, "loss": 0.3832, "step": 32500 }, { "epoch": 0.0652, "grad_norm": 0.2758074104785919, "learning_rate": 0.0001869604, "loss": 0.3898, "step": 32600 }, { "epoch": 0.0654, "grad_norm": 0.2821817696094513, "learning_rate": 0.0001869204, "loss": 0.3901, "step": 32700 }, { "epoch": 0.0656, "grad_norm": 0.3593108057975769, "learning_rate": 0.0001868804, "loss": 0.3835, "step": 32800 }, { "epoch": 0.0658, "grad_norm": 0.21605613827705383, "learning_rate": 0.0001868404, "loss": 0.3862, "step": 32900 }, { "epoch": 0.066, "grad_norm": 0.34763646125793457, "learning_rate": 0.00018680040000000001, "loss": 0.3861, "step": 33000 }, { "epoch": 0.0662, "grad_norm": 0.2341729700565338, "learning_rate": 0.0001867604, "loss": 0.3876, "step": 33100 }, { "epoch": 0.0664, "grad_norm": 0.6066027879714966, "learning_rate": 0.00018672040000000003, "loss": 0.3904, "step": 33200 }, { "epoch": 0.0666, "grad_norm": 0.2845059037208557, "learning_rate": 0.0001866804, "loss": 0.387, "step": 33300 }, { "epoch": 0.0668, "grad_norm": 0.23616880178451538, "learning_rate": 0.00018664040000000002, "loss": 0.3862, "step": 33400 }, { "epoch": 0.067, "grad_norm": 0.21980926394462585, "learning_rate": 0.0001866004, "loss": 0.3888, "step": 33500 }, { "epoch": 0.0672, "grad_norm": 0.2976222336292267, "learning_rate": 0.0001865604, "loss": 0.3873, "step": 33600 }, { "epoch": 0.0674, "grad_norm": 0.23099921643733978, "learning_rate": 0.00018652040000000002, "loss": 0.3816, "step": 33700 }, { "epoch": 0.0676, "grad_norm": 0.2811383306980133, "learning_rate": 0.0001864804, "loss": 0.3849, "step": 33800 }, { "epoch": 0.0678, "grad_norm": 0.2094060480594635, "learning_rate": 0.0001864404, "loss": 0.3761, "step": 33900 }, { "epoch": 0.068, "grad_norm": 0.2863057553768158, "learning_rate": 0.0001864004, "loss": 0.384, "step": 34000 }, { "epoch": 0.0682, "grad_norm": 0.23131683468818665, "learning_rate": 0.00018636040000000003, "loss": 0.3898, "step": 34100 }, { "epoch": 0.0684, "grad_norm": 0.2947816848754883, "learning_rate": 0.00018632040000000002, "loss": 0.3853, "step": 34200 }, { "epoch": 0.0686, "grad_norm": 0.2566851079463959, "learning_rate": 0.00018628039999999999, "loss": 0.3743, "step": 34300 }, { "epoch": 0.0688, "grad_norm": 0.23540231585502625, "learning_rate": 0.0001862404, "loss": 0.3777, "step": 34400 }, { "epoch": 0.069, "grad_norm": 0.3022463917732239, "learning_rate": 0.0001862004, "loss": 0.3865, "step": 34500 }, { "epoch": 0.0692, "grad_norm": 0.2179107815027237, "learning_rate": 0.00018616040000000002, "loss": 0.3802, "step": 34600 }, { "epoch": 0.0694, "grad_norm": 0.3824068009853363, "learning_rate": 0.00018612040000000001, "loss": 0.385, "step": 34700 }, { "epoch": 0.0696, "grad_norm": 0.2288985550403595, "learning_rate": 0.0001860804, "loss": 0.3767, "step": 34800 }, { "epoch": 0.0698, "grad_norm": 0.228486567735672, "learning_rate": 0.0001860404, "loss": 0.3827, "step": 34900 }, { "epoch": 0.07, "grad_norm": 0.2173689603805542, "learning_rate": 0.0001860004, "loss": 0.3865, "step": 35000 }, { "epoch": 0.0702, "grad_norm": 0.27074554562568665, "learning_rate": 0.00018596040000000002, "loss": 0.3756, "step": 35100 }, { "epoch": 0.0704, "grad_norm": 0.3166576623916626, "learning_rate": 0.0001859204, "loss": 0.3889, "step": 35200 }, { "epoch": 0.0706, "grad_norm": 0.3322249948978424, "learning_rate": 0.0001858804, "loss": 0.3793, "step": 35300 }, { "epoch": 0.0708, "grad_norm": 0.2725512981414795, "learning_rate": 0.0001858404, "loss": 0.3828, "step": 35400 }, { "epoch": 0.071, "grad_norm": 0.20953910052776337, "learning_rate": 0.00018580040000000002, "loss": 0.3771, "step": 35500 }, { "epoch": 0.0712, "grad_norm": 0.2445133924484253, "learning_rate": 0.0001857604, "loss": 0.3737, "step": 35600 }, { "epoch": 0.0714, "grad_norm": 0.5710493922233582, "learning_rate": 0.0001857204, "loss": 0.3712, "step": 35700 }, { "epoch": 0.0716, "grad_norm": 0.3139127790927887, "learning_rate": 0.0001856804, "loss": 0.3787, "step": 35800 }, { "epoch": 0.0718, "grad_norm": 0.23770715296268463, "learning_rate": 0.0001856404, "loss": 0.3814, "step": 35900 }, { "epoch": 0.072, "grad_norm": 0.3806588351726532, "learning_rate": 0.0001856004, "loss": 0.373, "step": 36000 }, { "epoch": 0.0722, "grad_norm": 0.25435832142829895, "learning_rate": 0.0001855604, "loss": 0.3789, "step": 36100 }, { "epoch": 0.0724, "grad_norm": 0.26886075735092163, "learning_rate": 0.00018552040000000003, "loss": 0.3837, "step": 36200 }, { "epoch": 0.0726, "grad_norm": 0.22618788480758667, "learning_rate": 0.0001854804, "loss": 0.3775, "step": 36300 }, { "epoch": 0.0728, "grad_norm": 0.1857277899980545, "learning_rate": 0.00018544040000000001, "loss": 0.3759, "step": 36400 }, { "epoch": 0.073, "grad_norm": 0.34357950091362, "learning_rate": 0.0001854004, "loss": 0.3698, "step": 36500 }, { "epoch": 0.0732, "grad_norm": 0.26498422026634216, "learning_rate": 0.0001853604, "loss": 0.3702, "step": 36600 }, { "epoch": 0.0734, "grad_norm": 0.21762557327747345, "learning_rate": 0.00018532040000000002, "loss": 0.3756, "step": 36700 }, { "epoch": 0.0736, "grad_norm": 0.2890067398548126, "learning_rate": 0.0001852804, "loss": 0.3707, "step": 36800 }, { "epoch": 0.0738, "grad_norm": 0.24609336256980896, "learning_rate": 0.0001852404, "loss": 0.3766, "step": 36900 }, { "epoch": 0.074, "grad_norm": 0.262658953666687, "learning_rate": 0.0001852004, "loss": 0.3736, "step": 37000 }, { "epoch": 0.0742, "grad_norm": 0.20918047428131104, "learning_rate": 0.00018516040000000002, "loss": 0.3756, "step": 37100 }, { "epoch": 0.0744, "grad_norm": 0.324136883020401, "learning_rate": 0.00018512040000000002, "loss": 0.3737, "step": 37200 }, { "epoch": 0.0746, "grad_norm": 0.3206271827220917, "learning_rate": 0.0001850804, "loss": 0.3725, "step": 37300 }, { "epoch": 0.0748, "grad_norm": 0.24515919387340546, "learning_rate": 0.0001850404, "loss": 0.369, "step": 37400 }, { "epoch": 0.075, "grad_norm": 0.441785991191864, "learning_rate": 0.0001850004, "loss": 0.3843, "step": 37500 }, { "epoch": 0.0752, "grad_norm": 0.20712384581565857, "learning_rate": 0.00018496040000000002, "loss": 0.3758, "step": 37600 }, { "epoch": 0.0754, "grad_norm": 0.2728515565395355, "learning_rate": 0.0001849204, "loss": 0.3674, "step": 37700 }, { "epoch": 0.0756, "grad_norm": 0.34958529472351074, "learning_rate": 0.0001848804, "loss": 0.3713, "step": 37800 }, { "epoch": 0.0758, "grad_norm": 0.2223168909549713, "learning_rate": 0.0001848404, "loss": 0.3724, "step": 37900 }, { "epoch": 0.076, "grad_norm": 0.22961725294589996, "learning_rate": 0.0001848004, "loss": 0.3768, "step": 38000 }, { "epoch": 0.0762, "grad_norm": 0.23851755261421204, "learning_rate": 0.00018476040000000001, "loss": 0.3715, "step": 38100 }, { "epoch": 0.0764, "grad_norm": 0.3652637004852295, "learning_rate": 0.0001847204, "loss": 0.3585, "step": 38200 }, { "epoch": 0.0766, "grad_norm": 0.2556198835372925, "learning_rate": 0.0001846804, "loss": 0.3682, "step": 38300 }, { "epoch": 0.0768, "grad_norm": 0.2592809498310089, "learning_rate": 0.0001846404, "loss": 0.3697, "step": 38400 }, { "epoch": 0.077, "grad_norm": 0.217251256108284, "learning_rate": 0.00018460040000000002, "loss": 0.3657, "step": 38500 }, { "epoch": 0.0772, "grad_norm": 0.22854512929916382, "learning_rate": 0.0001845604, "loss": 0.3669, "step": 38600 }, { "epoch": 0.0774, "grad_norm": 0.4855661690235138, "learning_rate": 0.0001845204, "loss": 0.3672, "step": 38700 }, { "epoch": 0.0776, "grad_norm": 0.26462578773498535, "learning_rate": 0.00018448040000000002, "loss": 0.3642, "step": 38800 }, { "epoch": 0.0778, "grad_norm": 0.24063749611377716, "learning_rate": 0.0001844404, "loss": 0.3682, "step": 38900 }, { "epoch": 0.078, "grad_norm": 0.21736368536949158, "learning_rate": 0.0001844004, "loss": 0.3813, "step": 39000 }, { "epoch": 0.0782, "grad_norm": 0.2315133512020111, "learning_rate": 0.0001843604, "loss": 0.3735, "step": 39100 }, { "epoch": 0.0784, "grad_norm": 0.23136012256145477, "learning_rate": 0.00018432040000000003, "loss": 0.3603, "step": 39200 }, { "epoch": 0.0786, "grad_norm": 0.2924899756908417, "learning_rate": 0.00018428040000000002, "loss": 0.3632, "step": 39300 }, { "epoch": 0.0788, "grad_norm": 0.307910293340683, "learning_rate": 0.0001842404, "loss": 0.3671, "step": 39400 }, { "epoch": 0.079, "grad_norm": 0.22759227454662323, "learning_rate": 0.0001842004, "loss": 0.3707, "step": 39500 }, { "epoch": 0.0792, "grad_norm": 0.21660542488098145, "learning_rate": 0.0001841604, "loss": 0.3728, "step": 39600 }, { "epoch": 0.0794, "grad_norm": 0.2659781575202942, "learning_rate": 0.00018412040000000002, "loss": 0.3729, "step": 39700 }, { "epoch": 0.0796, "grad_norm": 0.25910884141921997, "learning_rate": 0.00018408040000000001, "loss": 0.3681, "step": 39800 }, { "epoch": 0.0798, "grad_norm": 0.2730049788951874, "learning_rate": 0.0001840404, "loss": 0.3716, "step": 39900 }, { "epoch": 0.08, "grad_norm": 0.23555901646614075, "learning_rate": 0.0001840004, "loss": 0.3683, "step": 40000 }, { "epoch": 0.0802, "grad_norm": 0.22227054834365845, "learning_rate": 0.00018396040000000002, "loss": 0.371, "step": 40100 }, { "epoch": 0.0804, "grad_norm": 0.21306991577148438, "learning_rate": 0.00018392040000000002, "loss": 0.3666, "step": 40200 }, { "epoch": 0.0806, "grad_norm": 0.37939247488975525, "learning_rate": 0.0001838804, "loss": 0.3782, "step": 40300 }, { "epoch": 0.0808, "grad_norm": 0.26984167098999023, "learning_rate": 0.0001838404, "loss": 0.3717, "step": 40400 }, { "epoch": 0.081, "grad_norm": 0.22309809923171997, "learning_rate": 0.0001838004, "loss": 0.3761, "step": 40500 }, { "epoch": 0.0812, "grad_norm": 0.22144988179206848, "learning_rate": 0.00018376040000000002, "loss": 0.3609, "step": 40600 }, { "epoch": 0.0814, "grad_norm": 0.23661202192306519, "learning_rate": 0.0001837204, "loss": 0.364, "step": 40700 }, { "epoch": 0.0816, "grad_norm": 0.23225858807563782, "learning_rate": 0.00018368040000000003, "loss": 0.3608, "step": 40800 }, { "epoch": 0.0818, "grad_norm": 0.21633361279964447, "learning_rate": 0.0001836404, "loss": 0.3677, "step": 40900 }, { "epoch": 0.082, "grad_norm": 0.20861107110977173, "learning_rate": 0.00018360040000000002, "loss": 0.3586, "step": 41000 }, { "epoch": 0.0822, "grad_norm": 0.22681254148483276, "learning_rate": 0.0001835604, "loss": 0.3503, "step": 41100 }, { "epoch": 0.0824, "grad_norm": 0.29530957341194153, "learning_rate": 0.0001835204, "loss": 0.3546, "step": 41200 }, { "epoch": 0.0826, "grad_norm": 0.22965680062770844, "learning_rate": 0.00018348040000000003, "loss": 0.3581, "step": 41300 }, { "epoch": 0.0828, "grad_norm": 0.24958209693431854, "learning_rate": 0.0001834404, "loss": 0.3601, "step": 41400 }, { "epoch": 0.083, "grad_norm": 0.34926220774650574, "learning_rate": 0.0001834004, "loss": 0.3546, "step": 41500 }, { "epoch": 0.0832, "grad_norm": 0.23350180685520172, "learning_rate": 0.0001833604, "loss": 0.3591, "step": 41600 }, { "epoch": 0.0834, "grad_norm": 0.28490149974823, "learning_rate": 0.00018332040000000003, "loss": 0.3595, "step": 41700 }, { "epoch": 0.0836, "grad_norm": 0.37000980973243713, "learning_rate": 0.00018328040000000002, "loss": 0.357, "step": 41800 }, { "epoch": 0.0838, "grad_norm": 0.25736093521118164, "learning_rate": 0.0001832404, "loss": 0.3584, "step": 41900 }, { "epoch": 0.084, "grad_norm": 0.3347998559474945, "learning_rate": 0.0001832004, "loss": 0.3668, "step": 42000 }, { "epoch": 0.0842, "grad_norm": 0.259397029876709, "learning_rate": 0.0001831604, "loss": 0.358, "step": 42100 }, { "epoch": 0.0844, "grad_norm": 0.26411810517311096, "learning_rate": 0.00018312040000000002, "loss": 0.3678, "step": 42200 }, { "epoch": 0.0846, "grad_norm": 0.24818368256092072, "learning_rate": 0.00018308040000000002, "loss": 0.3563, "step": 42300 }, { "epoch": 0.0848, "grad_norm": 0.27950188517570496, "learning_rate": 0.0001830404, "loss": 0.3561, "step": 42400 }, { "epoch": 0.085, "grad_norm": 0.21346434950828552, "learning_rate": 0.0001830004, "loss": 0.3608, "step": 42500 }, { "epoch": 0.0852, "grad_norm": 0.20952267944812775, "learning_rate": 0.0001829604, "loss": 0.3565, "step": 42600 }, { "epoch": 0.0854, "grad_norm": 0.3930506706237793, "learning_rate": 0.00018292040000000002, "loss": 0.3591, "step": 42700 }, { "epoch": 0.0856, "grad_norm": 0.19581203162670135, "learning_rate": 0.0001828804, "loss": 0.3575, "step": 42800 }, { "epoch": 0.0858, "grad_norm": 0.20917409658432007, "learning_rate": 0.0001828404, "loss": 0.3609, "step": 42900 }, { "epoch": 0.086, "grad_norm": 0.47267964482307434, "learning_rate": 0.0001828004, "loss": 0.3629, "step": 43000 }, { "epoch": 0.0862, "grad_norm": 0.3048167824745178, "learning_rate": 0.00018276040000000002, "loss": 0.3535, "step": 43100 }, { "epoch": 0.0864, "grad_norm": 0.23811852931976318, "learning_rate": 0.0001827204, "loss": 0.362, "step": 43200 }, { "epoch": 0.0866, "grad_norm": 0.21190153062343597, "learning_rate": 0.0001826804, "loss": 0.3856, "step": 43300 }, { "epoch": 0.0868, "grad_norm": 0.2275034487247467, "learning_rate": 0.0001826404, "loss": 0.3586, "step": 43400 }, { "epoch": 0.087, "grad_norm": 0.19929227232933044, "learning_rate": 0.0001826004, "loss": 0.3518, "step": 43500 }, { "epoch": 0.0872, "grad_norm": 0.25546959042549133, "learning_rate": 0.00018256040000000001, "loss": 0.3543, "step": 43600 }, { "epoch": 0.0874, "grad_norm": 0.20379209518432617, "learning_rate": 0.0001825204, "loss": 0.3599, "step": 43700 }, { "epoch": 0.0876, "grad_norm": 0.27397245168685913, "learning_rate": 0.00018248040000000003, "loss": 0.3573, "step": 43800 }, { "epoch": 0.0878, "grad_norm": 0.23391596972942352, "learning_rate": 0.0001824404, "loss": 0.3538, "step": 43900 }, { "epoch": 0.088, "grad_norm": 0.23923568427562714, "learning_rate": 0.00018240040000000002, "loss": 0.3529, "step": 44000 }, { "epoch": 0.0882, "grad_norm": 0.2521684467792511, "learning_rate": 0.0001823604, "loss": 0.3691, "step": 44100 }, { "epoch": 0.0884, "grad_norm": 0.2728547155857086, "learning_rate": 0.0001823204, "loss": 0.3596, "step": 44200 }, { "epoch": 0.0886, "grad_norm": 0.28586891293525696, "learning_rate": 0.00018228040000000002, "loss": 0.3634, "step": 44300 }, { "epoch": 0.0888, "grad_norm": 0.2821336090564728, "learning_rate": 0.0001822404, "loss": 0.3554, "step": 44400 }, { "epoch": 0.089, "grad_norm": 0.38594385981559753, "learning_rate": 0.0001822004, "loss": 0.3559, "step": 44500 }, { "epoch": 0.0892, "grad_norm": 0.235568568110466, "learning_rate": 0.0001821604, "loss": 0.3514, "step": 44600 }, { "epoch": 0.0894, "grad_norm": 0.2375953644514084, "learning_rate": 0.00018212040000000003, "loss": 0.3576, "step": 44700 }, { "epoch": 0.0896, "grad_norm": 0.20530065894126892, "learning_rate": 0.00018208040000000002, "loss": 0.3574, "step": 44800 }, { "epoch": 0.0898, "grad_norm": 0.1840105950832367, "learning_rate": 0.0001820404, "loss": 0.351, "step": 44900 }, { "epoch": 0.09, "grad_norm": 0.25125375390052795, "learning_rate": 0.0001820004, "loss": 0.3533, "step": 45000 }, { "epoch": 0.0902, "grad_norm": 0.2656700313091278, "learning_rate": 0.0001819604, "loss": 0.3604, "step": 45100 }, { "epoch": 0.0904, "grad_norm": 0.27208247780799866, "learning_rate": 0.00018192040000000002, "loss": 0.3552, "step": 45200 }, { "epoch": 0.0906, "grad_norm": 0.2230006903409958, "learning_rate": 0.00018188040000000001, "loss": 0.3515, "step": 45300 }, { "epoch": 0.0908, "grad_norm": 0.28219157457351685, "learning_rate": 0.0001818404, "loss": 0.357, "step": 45400 }, { "epoch": 0.091, "grad_norm": 0.21550622582435608, "learning_rate": 0.0001818004, "loss": 0.3512, "step": 45500 }, { "epoch": 0.0912, "grad_norm": 0.23631860315799713, "learning_rate": 0.0001817604, "loss": 0.3471, "step": 45600 }, { "epoch": 0.0914, "grad_norm": 0.1854236125946045, "learning_rate": 0.00018172040000000002, "loss": 0.3509, "step": 45700 }, { "epoch": 0.0916, "grad_norm": 0.2366824895143509, "learning_rate": 0.0001816804, "loss": 0.3583, "step": 45800 }, { "epoch": 0.0918, "grad_norm": 0.37892627716064453, "learning_rate": 0.0001816404, "loss": 0.3479, "step": 45900 }, { "epoch": 0.092, "grad_norm": 0.2608516216278076, "learning_rate": 0.0001816004, "loss": 0.3478, "step": 46000 }, { "epoch": 0.0922, "grad_norm": 0.23194310069084167, "learning_rate": 0.00018156040000000002, "loss": 0.3654, "step": 46100 }, { "epoch": 0.0924, "grad_norm": 0.2503475248813629, "learning_rate": 0.0001815204, "loss": 0.3508, "step": 46200 }, { "epoch": 0.0926, "grad_norm": 0.25393983721733093, "learning_rate": 0.0001814804, "loss": 0.3512, "step": 46300 }, { "epoch": 0.0928, "grad_norm": 0.19703878462314606, "learning_rate": 0.0001814404, "loss": 0.3468, "step": 46400 }, { "epoch": 0.093, "grad_norm": 0.18077781796455383, "learning_rate": 0.0001814004, "loss": 0.3529, "step": 46500 }, { "epoch": 0.0932, "grad_norm": 0.21559618413448334, "learning_rate": 0.0001813604, "loss": 0.3454, "step": 46600 }, { "epoch": 0.0934, "grad_norm": 0.223658949136734, "learning_rate": 0.0001813204, "loss": 0.3545, "step": 46700 }, { "epoch": 0.0936, "grad_norm": 0.2453765571117401, "learning_rate": 0.00018128040000000003, "loss": 0.3493, "step": 46800 }, { "epoch": 0.0938, "grad_norm": 0.2784949541091919, "learning_rate": 0.0001812404, "loss": 0.3432, "step": 46900 }, { "epoch": 0.094, "grad_norm": 0.21154820919036865, "learning_rate": 0.00018120040000000001, "loss": 0.347, "step": 47000 }, { "epoch": 0.0942, "grad_norm": 0.2772074341773987, "learning_rate": 0.0001811604, "loss": 0.3495, "step": 47100 }, { "epoch": 0.0944, "grad_norm": 0.20805895328521729, "learning_rate": 0.0001811204, "loss": 0.361, "step": 47200 }, { "epoch": 0.0946, "grad_norm": 0.1839285045862198, "learning_rate": 0.00018108040000000002, "loss": 0.3557, "step": 47300 }, { "epoch": 0.0948, "grad_norm": 0.23907071352005005, "learning_rate": 0.0001810404, "loss": 0.3416, "step": 47400 }, { "epoch": 0.095, "grad_norm": 0.3368641138076782, "learning_rate": 0.0001810004, "loss": 0.3483, "step": 47500 }, { "epoch": 0.0952, "grad_norm": 0.24331678450107574, "learning_rate": 0.0001809604, "loss": 0.3494, "step": 47600 }, { "epoch": 0.0954, "grad_norm": 0.20715878903865814, "learning_rate": 0.00018092040000000002, "loss": 0.3457, "step": 47700 }, { "epoch": 0.0956, "grad_norm": 0.1883363276720047, "learning_rate": 0.00018088040000000002, "loss": 0.3556, "step": 47800 }, { "epoch": 0.0958, "grad_norm": 0.24465380609035492, "learning_rate": 0.0001808404, "loss": 0.3447, "step": 47900 }, { "epoch": 0.096, "grad_norm": 0.24823075532913208, "learning_rate": 0.0001808004, "loss": 0.3431, "step": 48000 }, { "epoch": 0.0962, "grad_norm": 0.3323017358779907, "learning_rate": 0.0001807604, "loss": 0.3515, "step": 48100 }, { "epoch": 0.0964, "grad_norm": 0.2804560363292694, "learning_rate": 0.00018072040000000002, "loss": 0.351, "step": 48200 }, { "epoch": 0.0966, "grad_norm": 0.26708027720451355, "learning_rate": 0.0001806804, "loss": 0.338, "step": 48300 }, { "epoch": 0.0968, "grad_norm": 0.20081064105033875, "learning_rate": 0.0001806404, "loss": 0.3424, "step": 48400 }, { "epoch": 0.097, "grad_norm": 0.19008444249629974, "learning_rate": 0.0001806004, "loss": 0.3449, "step": 48500 }, { "epoch": 0.0972, "grad_norm": 0.2081468105316162, "learning_rate": 0.00018056040000000002, "loss": 0.3419, "step": 48600 }, { "epoch": 0.0974, "grad_norm": 0.41435888409614563, "learning_rate": 0.00018052040000000001, "loss": 0.345, "step": 48700 }, { "epoch": 0.0976, "grad_norm": 0.30877628922462463, "learning_rate": 0.0001804804, "loss": 0.3457, "step": 48800 }, { "epoch": 0.0978, "grad_norm": 0.25084757804870605, "learning_rate": 0.0001804404, "loss": 0.3436, "step": 48900 }, { "epoch": 0.098, "grad_norm": 0.26010265946388245, "learning_rate": 0.0001804004, "loss": 0.3522, "step": 49000 }, { "epoch": 0.0982, "grad_norm": 0.45441487431526184, "learning_rate": 0.00018036040000000002, "loss": 0.3366, "step": 49100 }, { "epoch": 0.0984, "grad_norm": 0.2062908411026001, "learning_rate": 0.0001803204, "loss": 0.3415, "step": 49200 }, { "epoch": 0.0986, "grad_norm": 0.24297113716602325, "learning_rate": 0.0001802804, "loss": 0.3403, "step": 49300 }, { "epoch": 0.0988, "grad_norm": 0.2683410942554474, "learning_rate": 0.0001802404, "loss": 0.3234, "step": 49400 }, { "epoch": 0.099, "grad_norm": 0.2585706114768982, "learning_rate": 0.0001802004, "loss": 0.3402, "step": 49500 }, { "epoch": 0.0992, "grad_norm": 0.2423756718635559, "learning_rate": 0.0001801604, "loss": 0.3394, "step": 49600 }, { "epoch": 0.0994, "grad_norm": 0.20874212682247162, "learning_rate": 0.0001801204, "loss": 0.3469, "step": 49700 }, { "epoch": 0.0996, "grad_norm": 0.26174086332321167, "learning_rate": 0.00018008040000000003, "loss": 0.3424, "step": 49800 }, { "epoch": 0.0998, "grad_norm": 0.258007675409317, "learning_rate": 0.0001800404, "loss": 0.333, "step": 49900 }, { "epoch": 0.1, "grad_norm": 0.26558035612106323, "learning_rate": 0.0001800004, "loss": 0.3371, "step": 50000 }, { "epoch": 0.1002, "grad_norm": 0.2108493149280548, "learning_rate": 0.0001799604, "loss": 0.3439, "step": 50100 }, { "epoch": 0.1004, "grad_norm": 0.2356238216161728, "learning_rate": 0.0001799204, "loss": 0.3479, "step": 50200 }, { "epoch": 0.1006, "grad_norm": 0.22705116868019104, "learning_rate": 0.00017988040000000002, "loss": 0.3384, "step": 50300 }, { "epoch": 0.1008, "grad_norm": 0.22554484009742737, "learning_rate": 0.0001798404, "loss": 0.3398, "step": 50400 }, { "epoch": 0.101, "grad_norm": 0.23426580429077148, "learning_rate": 0.0001798004, "loss": 0.3463, "step": 50500 }, { "epoch": 0.1012, "grad_norm": 0.22517123818397522, "learning_rate": 0.0001797604, "loss": 0.3386, "step": 50600 }, { "epoch": 0.1014, "grad_norm": 0.24184340238571167, "learning_rate": 0.00017972040000000002, "loss": 0.3437, "step": 50700 }, { "epoch": 0.1016, "grad_norm": 0.3470735251903534, "learning_rate": 0.00017968040000000002, "loss": 0.3407, "step": 50800 }, { "epoch": 0.1018, "grad_norm": 0.35720571875572205, "learning_rate": 0.0001796404, "loss": 0.3379, "step": 50900 }, { "epoch": 0.102, "grad_norm": 0.25200462341308594, "learning_rate": 0.0001796004, "loss": 0.3454, "step": 51000 }, { "epoch": 0.1022, "grad_norm": 0.19085712730884552, "learning_rate": 0.0001795604, "loss": 0.3406, "step": 51100 }, { "epoch": 0.1024, "grad_norm": 0.21733929216861725, "learning_rate": 0.00017952040000000002, "loss": 0.3361, "step": 51200 }, { "epoch": 0.1026, "grad_norm": 0.2199040800333023, "learning_rate": 0.0001794804, "loss": 0.3381, "step": 51300 }, { "epoch": 0.1028, "grad_norm": 0.21806757152080536, "learning_rate": 0.00017944040000000003, "loss": 0.3391, "step": 51400 }, { "epoch": 0.103, "grad_norm": 0.27425751090049744, "learning_rate": 0.0001794004, "loss": 0.3336, "step": 51500 }, { "epoch": 0.1032, "grad_norm": 0.24505355954170227, "learning_rate": 0.00017936040000000002, "loss": 0.3384, "step": 51600 }, { "epoch": 0.1034, "grad_norm": 0.19927924871444702, "learning_rate": 0.0001793204, "loss": 0.3559, "step": 51700 }, { "epoch": 0.1036, "grad_norm": 0.31233856081962585, "learning_rate": 0.0001792804, "loss": 0.3369, "step": 51800 }, { "epoch": 0.1038, "grad_norm": 0.30651381611824036, "learning_rate": 0.00017924040000000003, "loss": 0.3579, "step": 51900 }, { "epoch": 0.104, "grad_norm": 0.18664880096912384, "learning_rate": 0.0001792004, "loss": 0.343, "step": 52000 }, { "epoch": 0.1042, "grad_norm": 0.48104581236839294, "learning_rate": 0.0001791604, "loss": 0.3705, "step": 52100 }, { "epoch": 0.1044, "grad_norm": 0.24804629385471344, "learning_rate": 0.0001791204, "loss": 0.3563, "step": 52200 }, { "epoch": 0.1046, "grad_norm": 0.2167651504278183, "learning_rate": 0.00017908040000000003, "loss": 0.3396, "step": 52300 }, { "epoch": 0.1048, "grad_norm": 0.5007449984550476, "learning_rate": 0.00017904040000000002, "loss": 0.3436, "step": 52400 }, { "epoch": 0.105, "grad_norm": 0.2310677170753479, "learning_rate": 0.0001790004, "loss": 0.3355, "step": 52500 }, { "epoch": 0.1052, "grad_norm": 0.26683372259140015, "learning_rate": 0.0001789604, "loss": 0.3472, "step": 52600 }, { "epoch": 0.1054, "grad_norm": 0.22053886950016022, "learning_rate": 0.0001789204, "loss": 0.3393, "step": 52700 }, { "epoch": 0.1056, "grad_norm": 0.24362322688102722, "learning_rate": 0.00017888040000000002, "loss": 0.3478, "step": 52800 }, { "epoch": 0.1058, "grad_norm": 0.28843867778778076, "learning_rate": 0.00017884040000000002, "loss": 0.3347, "step": 52900 }, { "epoch": 0.106, "grad_norm": 0.24268370866775513, "learning_rate": 0.0001788004, "loss": 0.3479, "step": 53000 }, { "epoch": 0.1062, "grad_norm": 0.25353243947029114, "learning_rate": 0.0001787604, "loss": 0.3503, "step": 53100 }, { "epoch": 0.1064, "grad_norm": 0.34019261598587036, "learning_rate": 0.0001787204, "loss": 0.3564, "step": 53200 }, { "epoch": 0.1066, "grad_norm": 0.28113237023353577, "learning_rate": 0.00017868040000000002, "loss": 0.3409, "step": 53300 }, { "epoch": 0.1068, "grad_norm": 0.33453458547592163, "learning_rate": 0.0001786404, "loss": 0.3393, "step": 53400 }, { "epoch": 0.107, "grad_norm": 0.25770464539527893, "learning_rate": 0.0001786004, "loss": 0.3448, "step": 53500 }, { "epoch": 0.1072, "grad_norm": 0.2797645628452301, "learning_rate": 0.0001785604, "loss": 0.3463, "step": 53600 }, { "epoch": 0.1074, "grad_norm": 0.26500073075294495, "learning_rate": 0.00017852040000000002, "loss": 0.3391, "step": 53700 }, { "epoch": 0.1076, "grad_norm": 0.23151659965515137, "learning_rate": 0.0001784804, "loss": 0.3369, "step": 53800 }, { "epoch": 0.1078, "grad_norm": 0.27178940176963806, "learning_rate": 0.0001784404, "loss": 0.3364, "step": 53900 }, { "epoch": 0.108, "grad_norm": 0.2767798900604248, "learning_rate": 0.0001784004, "loss": 0.3379, "step": 54000 }, { "epoch": 0.1082, "grad_norm": 0.21509261429309845, "learning_rate": 0.0001783604, "loss": 0.3382, "step": 54100 }, { "epoch": 0.1084, "grad_norm": 0.24864695966243744, "learning_rate": 0.00017832040000000001, "loss": 0.3369, "step": 54200 }, { "epoch": 0.1086, "grad_norm": 0.2107497751712799, "learning_rate": 0.0001782804, "loss": 0.333, "step": 54300 }, { "epoch": 0.1088, "grad_norm": 0.20573535561561584, "learning_rate": 0.00017824040000000003, "loss": 0.3397, "step": 54400 }, { "epoch": 0.109, "grad_norm": 0.25302717089653015, "learning_rate": 0.0001782004, "loss": 0.3384, "step": 54500 }, { "epoch": 0.1092, "grad_norm": 0.22738224267959595, "learning_rate": 0.00017816040000000002, "loss": 0.3398, "step": 54600 }, { "epoch": 0.1094, "grad_norm": 0.28848886489868164, "learning_rate": 0.0001781204, "loss": 0.3361, "step": 54700 }, { "epoch": 0.1096, "grad_norm": 0.23477931320667267, "learning_rate": 0.0001780804, "loss": 0.3414, "step": 54800 }, { "epoch": 0.1098, "grad_norm": 0.24742473661899567, "learning_rate": 0.00017804040000000002, "loss": 0.3359, "step": 54900 }, { "epoch": 0.11, "grad_norm": 0.24106016755104065, "learning_rate": 0.0001780004, "loss": 0.3428, "step": 55000 }, { "epoch": 0.1102, "grad_norm": 0.22427873313426971, "learning_rate": 0.0001779604, "loss": 0.346, "step": 55100 }, { "epoch": 0.1104, "grad_norm": 0.31361493468284607, "learning_rate": 0.0001779204, "loss": 0.3454, "step": 55200 }, { "epoch": 0.1106, "grad_norm": 0.2146797627210617, "learning_rate": 0.00017788040000000003, "loss": 0.3368, "step": 55300 }, { "epoch": 0.1108, "grad_norm": 0.2894279658794403, "learning_rate": 0.00017784040000000002, "loss": 0.339, "step": 55400 }, { "epoch": 0.111, "grad_norm": 0.3044085204601288, "learning_rate": 0.0001778004, "loss": 0.3411, "step": 55500 }, { "epoch": 0.1112, "grad_norm": 0.18939179182052612, "learning_rate": 0.0001777604, "loss": 0.3471, "step": 55600 }, { "epoch": 0.1114, "grad_norm": 0.24099594354629517, "learning_rate": 0.0001777204, "loss": 0.3296, "step": 55700 }, { "epoch": 0.1116, "grad_norm": 0.2313549667596817, "learning_rate": 0.00017768040000000002, "loss": 0.3384, "step": 55800 }, { "epoch": 0.1118, "grad_norm": 0.20545341074466705, "learning_rate": 0.00017764040000000001, "loss": 0.331, "step": 55900 }, { "epoch": 0.112, "grad_norm": 0.22012929618358612, "learning_rate": 0.0001776004, "loss": 0.3383, "step": 56000 }, { "epoch": 0.1122, "grad_norm": 0.2095547914505005, "learning_rate": 0.0001775604, "loss": 0.3355, "step": 56100 }, { "epoch": 0.1124, "grad_norm": 0.2438439428806305, "learning_rate": 0.00017752040000000002, "loss": 0.3305, "step": 56200 }, { "epoch": 0.1126, "grad_norm": 0.5170906186103821, "learning_rate": 0.00017748040000000002, "loss": 0.3385, "step": 56300 }, { "epoch": 0.1128, "grad_norm": 0.1860545575618744, "learning_rate": 0.0001774404, "loss": 0.3336, "step": 56400 }, { "epoch": 0.113, "grad_norm": 0.24801748991012573, "learning_rate": 0.0001774004, "loss": 0.3286, "step": 56500 }, { "epoch": 0.1132, "grad_norm": 0.25407829880714417, "learning_rate": 0.0001773604, "loss": 0.3402, "step": 56600 }, { "epoch": 0.1134, "grad_norm": 0.20441265404224396, "learning_rate": 0.00017732040000000002, "loss": 0.3336, "step": 56700 }, { "epoch": 0.1136, "grad_norm": 0.2255195528268814, "learning_rate": 0.0001772804, "loss": 0.3283, "step": 56800 }, { "epoch": 0.1138, "grad_norm": 0.3453531265258789, "learning_rate": 0.0001772404, "loss": 0.3325, "step": 56900 }, { "epoch": 0.114, "grad_norm": 0.1892947405576706, "learning_rate": 0.0001772004, "loss": 0.3298, "step": 57000 }, { "epoch": 0.1142, "grad_norm": 0.3136577308177948, "learning_rate": 0.0001771604, "loss": 0.3324, "step": 57100 }, { "epoch": 0.1144, "grad_norm": 0.2848574221134186, "learning_rate": 0.0001771204, "loss": 0.3266, "step": 57200 }, { "epoch": 0.1146, "grad_norm": 0.3240659832954407, "learning_rate": 0.0001770804, "loss": 0.3413, "step": 57300 }, { "epoch": 0.1148, "grad_norm": 0.2988292872905731, "learning_rate": 0.00017704040000000003, "loss": 0.3335, "step": 57400 }, { "epoch": 0.115, "grad_norm": 0.31577932834625244, "learning_rate": 0.0001770004, "loss": 0.3294, "step": 57500 }, { "epoch": 0.1152, "grad_norm": 0.32162484526634216, "learning_rate": 0.00017696040000000001, "loss": 0.3521, "step": 57600 }, { "epoch": 0.1154, "grad_norm": 0.661695122718811, "learning_rate": 0.0001769204, "loss": 0.3324, "step": 57700 }, { "epoch": 0.1156, "grad_norm": 0.25184112787246704, "learning_rate": 0.0001768804, "loss": 0.3438, "step": 57800 }, { "epoch": 0.1158, "grad_norm": 0.24458423256874084, "learning_rate": 0.00017684040000000002, "loss": 0.3297, "step": 57900 }, { "epoch": 0.116, "grad_norm": 0.23055191338062286, "learning_rate": 0.0001768004, "loss": 0.328, "step": 58000 }, { "epoch": 0.1162, "grad_norm": 0.22799059748649597, "learning_rate": 0.0001767604, "loss": 0.3305, "step": 58100 }, { "epoch": 0.1164, "grad_norm": 0.17628814280033112, "learning_rate": 0.0001767204, "loss": 0.3351, "step": 58200 }, { "epoch": 0.1166, "grad_norm": 0.29074835777282715, "learning_rate": 0.00017668040000000002, "loss": 0.3267, "step": 58300 }, { "epoch": 0.1168, "grad_norm": 0.26170602440834045, "learning_rate": 0.00017664040000000002, "loss": 0.332, "step": 58400 }, { "epoch": 0.117, "grad_norm": 0.18410225212574005, "learning_rate": 0.0001766004, "loss": 0.3291, "step": 58500 }, { "epoch": 0.1172, "grad_norm": 0.3183979392051697, "learning_rate": 0.0001765604, "loss": 0.3311, "step": 58600 }, { "epoch": 0.1174, "grad_norm": 0.21326270699501038, "learning_rate": 0.0001765204, "loss": 0.3279, "step": 58700 }, { "epoch": 0.1176, "grad_norm": 0.2310231775045395, "learning_rate": 0.00017648040000000002, "loss": 0.3267, "step": 58800 }, { "epoch": 0.1178, "grad_norm": 0.30372264981269836, "learning_rate": 0.0001764404, "loss": 0.3411, "step": 58900 }, { "epoch": 0.118, "grad_norm": 0.22354651987552643, "learning_rate": 0.0001764004, "loss": 0.3323, "step": 59000 }, { "epoch": 0.1182, "grad_norm": 0.22137747704982758, "learning_rate": 0.0001763604, "loss": 0.3261, "step": 59100 }, { "epoch": 0.1184, "grad_norm": 0.20340581238269806, "learning_rate": 0.00017632040000000002, "loss": 0.3291, "step": 59200 }, { "epoch": 0.1186, "grad_norm": 0.21525894105434418, "learning_rate": 0.00017628040000000001, "loss": 0.324, "step": 59300 }, { "epoch": 0.1188, "grad_norm": 0.21832208335399628, "learning_rate": 0.0001762404, "loss": 0.3319, "step": 59400 }, { "epoch": 0.119, "grad_norm": 0.2369861602783203, "learning_rate": 0.0001762004, "loss": 0.3298, "step": 59500 }, { "epoch": 0.1192, "grad_norm": 0.4107494652271271, "learning_rate": 0.0001761604, "loss": 0.3306, "step": 59600 }, { "epoch": 0.1194, "grad_norm": 0.23435452580451965, "learning_rate": 0.00017612040000000002, "loss": 0.3239, "step": 59700 }, { "epoch": 0.1196, "grad_norm": 0.20295970141887665, "learning_rate": 0.0001760804, "loss": 0.335, "step": 59800 }, { "epoch": 0.1198, "grad_norm": 0.29473334550857544, "learning_rate": 0.00017604040000000003, "loss": 0.332, "step": 59900 }, { "epoch": 0.12, "grad_norm": 0.22703693807125092, "learning_rate": 0.0001760004, "loss": 0.3291, "step": 60000 }, { "epoch": 0.1202, "grad_norm": 0.2531866729259491, "learning_rate": 0.0001759604, "loss": 0.3326, "step": 60100 }, { "epoch": 0.1204, "grad_norm": 0.20017871260643005, "learning_rate": 0.0001759204, "loss": 0.3304, "step": 60200 }, { "epoch": 0.1206, "grad_norm": 0.22206462919712067, "learning_rate": 0.0001758804, "loss": 0.3442, "step": 60300 }, { "epoch": 0.1208, "grad_norm": 0.24844832718372345, "learning_rate": 0.00017584040000000002, "loss": 0.3301, "step": 60400 }, { "epoch": 0.121, "grad_norm": 0.19882357120513916, "learning_rate": 0.0001758004, "loss": 0.3296, "step": 60500 }, { "epoch": 0.1212, "grad_norm": 0.2944953441619873, "learning_rate": 0.0001757604, "loss": 0.3283, "step": 60600 }, { "epoch": 0.1214, "grad_norm": 0.21407614648342133, "learning_rate": 0.0001757204, "loss": 0.328, "step": 60700 }, { "epoch": 0.1216, "grad_norm": 0.19046856462955475, "learning_rate": 0.0001756804, "loss": 0.3292, "step": 60800 }, { "epoch": 0.1218, "grad_norm": 0.22615882754325867, "learning_rate": 0.00017564040000000002, "loss": 0.3316, "step": 60900 }, { "epoch": 0.122, "grad_norm": 0.1709730625152588, "learning_rate": 0.0001756004, "loss": 0.3318, "step": 61000 }, { "epoch": 0.1222, "grad_norm": 0.1718514859676361, "learning_rate": 0.0001755604, "loss": 0.3201, "step": 61100 }, { "epoch": 0.1224, "grad_norm": 0.23949329555034637, "learning_rate": 0.0001755204, "loss": 0.3329, "step": 61200 }, { "epoch": 0.1226, "grad_norm": 0.2090582698583603, "learning_rate": 0.00017548040000000002, "loss": 0.3354, "step": 61300 }, { "epoch": 0.1228, "grad_norm": 0.2083326280117035, "learning_rate": 0.00017544040000000002, "loss": 0.3293, "step": 61400 }, { "epoch": 0.123, "grad_norm": 0.21610133349895477, "learning_rate": 0.0001754004, "loss": 0.3252, "step": 61500 }, { "epoch": 0.1232, "grad_norm": 0.2022552639245987, "learning_rate": 0.0001753604, "loss": 0.3322, "step": 61600 }, { "epoch": 0.1234, "grad_norm": 0.24811524152755737, "learning_rate": 0.0001753204, "loss": 0.3405, "step": 61700 }, { "epoch": 0.1236, "grad_norm": 0.25753867626190186, "learning_rate": 0.00017528040000000002, "loss": 0.3359, "step": 61800 }, { "epoch": 0.1238, "grad_norm": 0.32304272055625916, "learning_rate": 0.0001752404, "loss": 0.3254, "step": 61900 }, { "epoch": 0.124, "grad_norm": 0.18287380039691925, "learning_rate": 0.0001752004, "loss": 0.3298, "step": 62000 }, { "epoch": 0.1242, "grad_norm": 0.19973132014274597, "learning_rate": 0.0001751604, "loss": 0.3363, "step": 62100 }, { "epoch": 0.1244, "grad_norm": 0.23819290101528168, "learning_rate": 0.00017512040000000002, "loss": 0.3238, "step": 62200 }, { "epoch": 0.1246, "grad_norm": 0.27806952595710754, "learning_rate": 0.0001750804, "loss": 0.3286, "step": 62300 }, { "epoch": 0.1248, "grad_norm": 0.17436453700065613, "learning_rate": 0.0001750404, "loss": 0.3266, "step": 62400 }, { "epoch": 0.125, "grad_norm": 0.2046598643064499, "learning_rate": 0.0001750004, "loss": 0.32, "step": 62500 }, { "epoch": 0.1252, "grad_norm": 0.23056712746620178, "learning_rate": 0.0001749604, "loss": 0.325, "step": 62600 }, { "epoch": 0.1254, "grad_norm": 0.32476136088371277, "learning_rate": 0.0001749204, "loss": 0.3301, "step": 62700 }, { "epoch": 0.1256, "grad_norm": 0.24279266595840454, "learning_rate": 0.0001748804, "loss": 0.3253, "step": 62800 }, { "epoch": 0.1258, "grad_norm": 0.1884712129831314, "learning_rate": 0.00017484040000000003, "loss": 0.3309, "step": 62900 }, { "epoch": 0.126, "grad_norm": 0.21700552105903625, "learning_rate": 0.0001748004, "loss": 0.3299, "step": 63000 }, { "epoch": 0.1262, "grad_norm": 0.22113928198814392, "learning_rate": 0.00017476040000000001, "loss": 0.3283, "step": 63100 }, { "epoch": 0.1264, "grad_norm": 0.2137170284986496, "learning_rate": 0.0001747204, "loss": 0.3243, "step": 63200 }, { "epoch": 0.1266, "grad_norm": 0.273173063993454, "learning_rate": 0.0001746804, "loss": 0.3337, "step": 63300 }, { "epoch": 0.1268, "grad_norm": 0.2576025724411011, "learning_rate": 0.00017464040000000002, "loss": 0.329, "step": 63400 }, { "epoch": 0.127, "grad_norm": 0.3533582389354706, "learning_rate": 0.0001746004, "loss": 0.3322, "step": 63500 }, { "epoch": 0.1272, "grad_norm": 0.373040109872818, "learning_rate": 0.0001745604, "loss": 0.3262, "step": 63600 }, { "epoch": 0.1274, "grad_norm": 0.2242802381515503, "learning_rate": 0.0001745204, "loss": 0.328, "step": 63700 }, { "epoch": 0.1276, "grad_norm": 0.4904221296310425, "learning_rate": 0.0001744804, "loss": 0.3253, "step": 63800 }, { "epoch": 0.1278, "grad_norm": 0.20114104449748993, "learning_rate": 0.00017444040000000002, "loss": 0.3272, "step": 63900 }, { "epoch": 0.128, "grad_norm": 0.18482904136180878, "learning_rate": 0.0001744004, "loss": 0.3254, "step": 64000 }, { "epoch": 0.1282, "grad_norm": 0.19531016051769257, "learning_rate": 0.0001743604, "loss": 0.3206, "step": 64100 }, { "epoch": 0.1284, "grad_norm": 0.2820129990577698, "learning_rate": 0.0001743204, "loss": 0.3324, "step": 64200 }, { "epoch": 0.1286, "grad_norm": 0.23063960671424866, "learning_rate": 0.00017428040000000002, "loss": 0.3262, "step": 64300 }, { "epoch": 0.1288, "grad_norm": 0.2425381988286972, "learning_rate": 0.0001742404, "loss": 0.3221, "step": 64400 }, { "epoch": 0.129, "grad_norm": 0.1685548573732376, "learning_rate": 0.0001742004, "loss": 0.3225, "step": 64500 }, { "epoch": 0.1292, "grad_norm": 0.23022134602069855, "learning_rate": 0.0001741604, "loss": 0.3297, "step": 64600 }, { "epoch": 0.1294, "grad_norm": 0.2650277018547058, "learning_rate": 0.0001741204, "loss": 0.3293, "step": 64700 }, { "epoch": 0.1296, "grad_norm": 0.2417951077222824, "learning_rate": 0.00017408040000000001, "loss": 0.328, "step": 64800 }, { "epoch": 0.1298, "grad_norm": 0.22391964495182037, "learning_rate": 0.0001740404, "loss": 0.3266, "step": 64900 }, { "epoch": 0.13, "grad_norm": 0.2012777179479599, "learning_rate": 0.00017400040000000003, "loss": 0.3278, "step": 65000 }, { "epoch": 0.1302, "grad_norm": 0.18540987372398376, "learning_rate": 0.0001739604, "loss": 0.337, "step": 65100 }, { "epoch": 0.1304, "grad_norm": 0.19150865077972412, "learning_rate": 0.00017392040000000002, "loss": 0.3319, "step": 65200 }, { "epoch": 0.1306, "grad_norm": 0.24447111785411835, "learning_rate": 0.0001738804, "loss": 0.3312, "step": 65300 }, { "epoch": 0.1308, "grad_norm": 0.20017050206661224, "learning_rate": 0.0001738404, "loss": 0.3375, "step": 65400 }, { "epoch": 0.131, "grad_norm": 0.2571297585964203, "learning_rate": 0.00017380040000000002, "loss": 0.3268, "step": 65500 }, { "epoch": 0.1312, "grad_norm": 0.18792451918125153, "learning_rate": 0.0001737604, "loss": 0.3261, "step": 65600 }, { "epoch": 0.1314, "grad_norm": 0.4205167889595032, "learning_rate": 0.0001737204, "loss": 0.33, "step": 65700 }, { "epoch": 0.1316, "grad_norm": 0.2724183201789856, "learning_rate": 0.0001736804, "loss": 0.3273, "step": 65800 }, { "epoch": 0.1318, "grad_norm": 0.21143561601638794, "learning_rate": 0.00017364040000000003, "loss": 0.3281, "step": 65900 }, { "epoch": 0.132, "grad_norm": 0.1903173327445984, "learning_rate": 0.00017360040000000002, "loss": 0.3185, "step": 66000 }, { "epoch": 0.1322, "grad_norm": 0.3626430928707123, "learning_rate": 0.0001735604, "loss": 0.3411, "step": 66100 }, { "epoch": 0.1324, "grad_norm": 0.1814277172088623, "learning_rate": 0.0001735204, "loss": 0.3301, "step": 66200 }, { "epoch": 0.1326, "grad_norm": 0.22784367203712463, "learning_rate": 0.0001734804, "loss": 0.3244, "step": 66300 }, { "epoch": 0.1328, "grad_norm": 0.2449122965335846, "learning_rate": 0.00017344040000000002, "loss": 0.3281, "step": 66400 }, { "epoch": 0.133, "grad_norm": 0.17180882394313812, "learning_rate": 0.00017340040000000001, "loss": 0.3237, "step": 66500 }, { "epoch": 0.1332, "grad_norm": 0.24725696444511414, "learning_rate": 0.0001733604, "loss": 0.3281, "step": 66600 }, { "epoch": 0.1334, "grad_norm": 0.31062814593315125, "learning_rate": 0.0001733204, "loss": 0.3312, "step": 66700 }, { "epoch": 0.1336, "grad_norm": 0.20241935551166534, "learning_rate": 0.00017328040000000002, "loss": 0.3214, "step": 66800 }, { "epoch": 0.1338, "grad_norm": 0.319897323846817, "learning_rate": 0.00017324040000000002, "loss": 0.3262, "step": 66900 }, { "epoch": 0.134, "grad_norm": 0.36917874217033386, "learning_rate": 0.0001732004, "loss": 0.3232, "step": 67000 }, { "epoch": 0.1342, "grad_norm": 0.24121050536632538, "learning_rate": 0.0001731604, "loss": 0.3333, "step": 67100 }, { "epoch": 0.1344, "grad_norm": 0.21649840474128723, "learning_rate": 0.0001731204, "loss": 0.3246, "step": 67200 }, { "epoch": 0.1346, "grad_norm": 0.21219566464424133, "learning_rate": 0.00017308040000000002, "loss": 0.3263, "step": 67300 }, { "epoch": 0.1348, "grad_norm": 0.20997853577136993, "learning_rate": 0.0001730404, "loss": 0.321, "step": 67400 }, { "epoch": 0.135, "grad_norm": 0.20444467663764954, "learning_rate": 0.0001730004, "loss": 0.3242, "step": 67500 }, { "epoch": 0.1352, "grad_norm": 0.28162842988967896, "learning_rate": 0.0001729604, "loss": 0.3191, "step": 67600 }, { "epoch": 0.1354, "grad_norm": 0.266450971364975, "learning_rate": 0.0001729204, "loss": 0.3201, "step": 67700 }, { "epoch": 0.1356, "grad_norm": 0.22291947901248932, "learning_rate": 0.0001728804, "loss": 0.3171, "step": 67800 }, { "epoch": 0.1358, "grad_norm": 0.46510088443756104, "learning_rate": 0.0001728404, "loss": 0.3276, "step": 67900 }, { "epoch": 0.136, "grad_norm": 0.2612142562866211, "learning_rate": 0.00017280040000000003, "loss": 0.3313, "step": 68000 }, { "epoch": 0.1362, "grad_norm": 0.19849452376365662, "learning_rate": 0.0001727604, "loss": 0.3155, "step": 68100 }, { "epoch": 0.1364, "grad_norm": 0.32336533069610596, "learning_rate": 0.00017272040000000001, "loss": 0.3225, "step": 68200 }, { "epoch": 0.1366, "grad_norm": 0.26454541087150574, "learning_rate": 0.0001726804, "loss": 0.333, "step": 68300 }, { "epoch": 0.1368, "grad_norm": 0.2315870225429535, "learning_rate": 0.0001726404, "loss": 0.3297, "step": 68400 }, { "epoch": 0.137, "grad_norm": 0.3298985958099365, "learning_rate": 0.00017260040000000002, "loss": 0.3261, "step": 68500 }, { "epoch": 0.1372, "grad_norm": 0.2369878590106964, "learning_rate": 0.0001725604, "loss": 0.3287, "step": 68600 }, { "epoch": 0.1374, "grad_norm": 0.18581421673297882, "learning_rate": 0.0001725204, "loss": 0.3221, "step": 68700 }, { "epoch": 0.1376, "grad_norm": 0.3875245153903961, "learning_rate": 0.0001724804, "loss": 0.3181, "step": 68800 }, { "epoch": 0.1378, "grad_norm": 0.216262087225914, "learning_rate": 0.00017244040000000002, "loss": 0.3169, "step": 68900 }, { "epoch": 0.138, "grad_norm": 0.21017438173294067, "learning_rate": 0.00017240040000000002, "loss": 0.3184, "step": 69000 }, { "epoch": 0.1382, "grad_norm": 0.20590710639953613, "learning_rate": 0.0001723604, "loss": 0.3287, "step": 69100 }, { "epoch": 0.1384, "grad_norm": 0.2776513695716858, "learning_rate": 0.0001723204, "loss": 0.3195, "step": 69200 }, { "epoch": 0.1386, "grad_norm": 0.21307912468910217, "learning_rate": 0.0001722804, "loss": 0.3277, "step": 69300 }, { "epoch": 0.1388, "grad_norm": 0.2287779301404953, "learning_rate": 0.00017224040000000002, "loss": 0.3199, "step": 69400 }, { "epoch": 0.139, "grad_norm": 0.33188334107398987, "learning_rate": 0.0001722004, "loss": 0.3167, "step": 69500 }, { "epoch": 0.1392, "grad_norm": 0.2860109508037567, "learning_rate": 0.0001721604, "loss": 0.3151, "step": 69600 }, { "epoch": 0.1394, "grad_norm": 0.19062092900276184, "learning_rate": 0.0001721204, "loss": 0.3237, "step": 69700 }, { "epoch": 0.1396, "grad_norm": 0.21721702814102173, "learning_rate": 0.00017208040000000002, "loss": 0.3173, "step": 69800 }, { "epoch": 0.1398, "grad_norm": 0.20423859357833862, "learning_rate": 0.00017204040000000001, "loss": 0.3197, "step": 69900 }, { "epoch": 0.14, "grad_norm": 0.21749666333198547, "learning_rate": 0.0001720004, "loss": 0.3274, "step": 70000 }, { "epoch": 0.1402, "grad_norm": 0.2770256996154785, "learning_rate": 0.0001719604, "loss": 0.3189, "step": 70100 }, { "epoch": 0.1404, "grad_norm": 0.300563782453537, "learning_rate": 0.0001719204, "loss": 0.3158, "step": 70200 }, { "epoch": 0.1406, "grad_norm": 0.21529285609722137, "learning_rate": 0.00017188040000000002, "loss": 0.3234, "step": 70300 }, { "epoch": 0.1408, "grad_norm": 0.30113551020622253, "learning_rate": 0.0001718404, "loss": 0.3188, "step": 70400 }, { "epoch": 0.141, "grad_norm": 0.22355583310127258, "learning_rate": 0.00017180040000000003, "loss": 0.321, "step": 70500 }, { "epoch": 0.1412, "grad_norm": 0.23251134157180786, "learning_rate": 0.0001717604, "loss": 0.3215, "step": 70600 }, { "epoch": 0.1414, "grad_norm": 0.2621996998786926, "learning_rate": 0.00017172040000000002, "loss": 0.328, "step": 70700 }, { "epoch": 0.1416, "grad_norm": 0.24400697648525238, "learning_rate": 0.0001716804, "loss": 0.3182, "step": 70800 }, { "epoch": 0.1418, "grad_norm": 0.182724729180336, "learning_rate": 0.0001716404, "loss": 0.3104, "step": 70900 }, { "epoch": 0.142, "grad_norm": 0.2504395544528961, "learning_rate": 0.00017160040000000002, "loss": 0.3186, "step": 71000 }, { "epoch": 0.1422, "grad_norm": 0.3078869581222534, "learning_rate": 0.0001715604, "loss": 0.3175, "step": 71100 }, { "epoch": 0.1424, "grad_norm": 0.2186155915260315, "learning_rate": 0.0001715204, "loss": 0.32, "step": 71200 }, { "epoch": 0.1426, "grad_norm": 0.18938790261745453, "learning_rate": 0.0001714804, "loss": 0.3138, "step": 71300 }, { "epoch": 0.1428, "grad_norm": 0.21720623970031738, "learning_rate": 0.0001714404, "loss": 0.3192, "step": 71400 }, { "epoch": 0.143, "grad_norm": 0.18728028237819672, "learning_rate": 0.00017140040000000002, "loss": 0.3261, "step": 71500 }, { "epoch": 0.1432, "grad_norm": 0.23028717935085297, "learning_rate": 0.0001713604, "loss": 0.3167, "step": 71600 }, { "epoch": 0.1434, "grad_norm": 0.22258538007736206, "learning_rate": 0.0001713204, "loss": 0.3138, "step": 71700 }, { "epoch": 0.1436, "grad_norm": 0.3108178675174713, "learning_rate": 0.0001712804, "loss": 0.3261, "step": 71800 }, { "epoch": 0.1438, "grad_norm": 0.2817421853542328, "learning_rate": 0.00017124040000000002, "loss": 0.3153, "step": 71900 }, { "epoch": 0.144, "grad_norm": 0.2490064799785614, "learning_rate": 0.00017120040000000002, "loss": 0.314, "step": 72000 }, { "epoch": 0.1442, "grad_norm": 0.18532483279705048, "learning_rate": 0.0001711604, "loss": 0.3162, "step": 72100 }, { "epoch": 0.1444, "grad_norm": 0.23727016150951385, "learning_rate": 0.0001711204, "loss": 0.3278, "step": 72200 }, { "epoch": 0.1446, "grad_norm": 0.18576467037200928, "learning_rate": 0.0001710804, "loss": 0.3186, "step": 72300 }, { "epoch": 0.1448, "grad_norm": 0.21316267549991608, "learning_rate": 0.00017104040000000002, "loss": 0.3198, "step": 72400 }, { "epoch": 0.145, "grad_norm": 0.21670052409172058, "learning_rate": 0.0001710004, "loss": 0.3187, "step": 72500 }, { "epoch": 0.1452, "grad_norm": 0.260593056678772, "learning_rate": 0.0001709604, "loss": 0.3119, "step": 72600 }, { "epoch": 0.1454, "grad_norm": 0.24345095455646515, "learning_rate": 0.0001709204, "loss": 0.3081, "step": 72700 }, { "epoch": 0.1456, "grad_norm": 0.21282540261745453, "learning_rate": 0.00017088040000000002, "loss": 0.3222, "step": 72800 }, { "epoch": 0.1458, "grad_norm": 0.22302696108818054, "learning_rate": 0.0001708404, "loss": 0.3133, "step": 72900 }, { "epoch": 0.146, "grad_norm": 0.24244046211242676, "learning_rate": 0.0001708004, "loss": 0.3215, "step": 73000 }, { "epoch": 0.1462, "grad_norm": 0.651138961315155, "learning_rate": 0.0001707604, "loss": 0.3123, "step": 73100 }, { "epoch": 0.1464, "grad_norm": 0.27869105339050293, "learning_rate": 0.0001707204, "loss": 0.3184, "step": 73200 }, { "epoch": 0.1466, "grad_norm": 0.2049490511417389, "learning_rate": 0.0001706804, "loss": 0.3157, "step": 73300 }, { "epoch": 0.1468, "grad_norm": 0.2697393298149109, "learning_rate": 0.0001706404, "loss": 0.334, "step": 73400 }, { "epoch": 0.147, "grad_norm": 0.17334036529064178, "learning_rate": 0.00017060040000000003, "loss": 0.3145, "step": 73500 }, { "epoch": 0.1472, "grad_norm": 0.2551487386226654, "learning_rate": 0.0001705604, "loss": 0.3138, "step": 73600 }, { "epoch": 0.1474, "grad_norm": 0.23012271523475647, "learning_rate": 0.00017052040000000001, "loss": 0.3133, "step": 73700 }, { "epoch": 0.1476, "grad_norm": 0.21023279428482056, "learning_rate": 0.0001704804, "loss": 0.3096, "step": 73800 }, { "epoch": 0.1478, "grad_norm": 0.24528542160987854, "learning_rate": 0.0001704404, "loss": 0.313, "step": 73900 }, { "epoch": 0.148, "grad_norm": 0.24039429426193237, "learning_rate": 0.00017040040000000002, "loss": 0.3155, "step": 74000 }, { "epoch": 0.1482, "grad_norm": 0.20780912041664124, "learning_rate": 0.0001703604, "loss": 0.3131, "step": 74100 }, { "epoch": 0.1484, "grad_norm": 0.18914741277694702, "learning_rate": 0.0001703204, "loss": 0.3125, "step": 74200 }, { "epoch": 0.1486, "grad_norm": 0.18693341314792633, "learning_rate": 0.0001702804, "loss": 0.3185, "step": 74300 }, { "epoch": 0.1488, "grad_norm": 0.2793961465358734, "learning_rate": 0.00017024040000000002, "loss": 0.3209, "step": 74400 }, { "epoch": 0.149, "grad_norm": 0.3118375539779663, "learning_rate": 0.00017020040000000002, "loss": 0.3173, "step": 74500 }, { "epoch": 0.1492, "grad_norm": 0.23574410378932953, "learning_rate": 0.00017016039999999998, "loss": 0.3167, "step": 74600 }, { "epoch": 0.1494, "grad_norm": 0.33424112200737, "learning_rate": 0.0001701204, "loss": 0.3196, "step": 74700 }, { "epoch": 0.1496, "grad_norm": 0.24209052324295044, "learning_rate": 0.0001700804, "loss": 0.3201, "step": 74800 }, { "epoch": 0.1498, "grad_norm": 0.21331624686717987, "learning_rate": 0.00017004040000000002, "loss": 0.3163, "step": 74900 }, { "epoch": 0.15, "grad_norm": 0.20186734199523926, "learning_rate": 0.0001700004, "loss": 0.3174, "step": 75000 }, { "epoch": 0.1502, "grad_norm": 0.17939358949661255, "learning_rate": 0.0001699604, "loss": 0.3185, "step": 75100 }, { "epoch": 0.1504, "grad_norm": 0.23708830773830414, "learning_rate": 0.0001699204, "loss": 0.3239, "step": 75200 }, { "epoch": 0.1506, "grad_norm": 0.23293326795101166, "learning_rate": 0.0001698804, "loss": 0.3211, "step": 75300 }, { "epoch": 0.1508, "grad_norm": 0.18440894782543182, "learning_rate": 0.00016984040000000001, "loss": 0.3118, "step": 75400 }, { "epoch": 0.151, "grad_norm": 0.2222815454006195, "learning_rate": 0.0001698004, "loss": 0.3108, "step": 75500 }, { "epoch": 0.1512, "grad_norm": 0.24183017015457153, "learning_rate": 0.0001697604, "loss": 0.3145, "step": 75600 }, { "epoch": 0.1514, "grad_norm": 0.2108313888311386, "learning_rate": 0.0001697204, "loss": 0.3128, "step": 75700 }, { "epoch": 0.1516, "grad_norm": 0.25066909193992615, "learning_rate": 0.00016968040000000002, "loss": 0.3223, "step": 75800 }, { "epoch": 0.1518, "grad_norm": 0.2033839225769043, "learning_rate": 0.0001696404, "loss": 0.3253, "step": 75900 }, { "epoch": 0.152, "grad_norm": 0.21795319020748138, "learning_rate": 0.0001696004, "loss": 0.3186, "step": 76000 }, { "epoch": 0.1522, "grad_norm": 0.2567816972732544, "learning_rate": 0.0001695604, "loss": 0.3184, "step": 76100 }, { "epoch": 0.1524, "grad_norm": 0.2667364478111267, "learning_rate": 0.0001695204, "loss": 0.3174, "step": 76200 }, { "epoch": 0.1526, "grad_norm": 0.24945732951164246, "learning_rate": 0.0001694804, "loss": 0.3166, "step": 76300 }, { "epoch": 0.1528, "grad_norm": 0.3093933165073395, "learning_rate": 0.0001694404, "loss": 0.3089, "step": 76400 }, { "epoch": 0.153, "grad_norm": 0.2426934689283371, "learning_rate": 0.00016940040000000003, "loss": 0.3128, "step": 76500 }, { "epoch": 0.1532, "grad_norm": 0.20812727510929108, "learning_rate": 0.00016936040000000002, "loss": 0.3093, "step": 76600 }, { "epoch": 0.1534, "grad_norm": 0.4342077672481537, "learning_rate": 0.0001693204, "loss": 0.3295, "step": 76700 }, { "epoch": 0.1536, "grad_norm": 0.2487853765487671, "learning_rate": 0.0001692804, "loss": 0.3202, "step": 76800 }, { "epoch": 0.1538, "grad_norm": 0.21004439890384674, "learning_rate": 0.0001692404, "loss": 0.3136, "step": 76900 }, { "epoch": 0.154, "grad_norm": 0.20426279306411743, "learning_rate": 0.00016920040000000002, "loss": 0.3154, "step": 77000 }, { "epoch": 0.1542, "grad_norm": 0.2324894517660141, "learning_rate": 0.00016916040000000001, "loss": 0.3124, "step": 77100 }, { "epoch": 0.1544, "grad_norm": 0.24114584922790527, "learning_rate": 0.0001691204, "loss": 0.3218, "step": 77200 }, { "epoch": 0.1546, "grad_norm": 0.21619604527950287, "learning_rate": 0.0001690804, "loss": 0.3081, "step": 77300 }, { "epoch": 0.1548, "grad_norm": 0.3266576826572418, "learning_rate": 0.00016904040000000002, "loss": 0.3154, "step": 77400 }, { "epoch": 0.155, "grad_norm": 0.19163742661476135, "learning_rate": 0.00016900040000000002, "loss": 0.3112, "step": 77500 }, { "epoch": 0.1552, "grad_norm": 0.24338412284851074, "learning_rate": 0.0001689604, "loss": 0.309, "step": 77600 }, { "epoch": 0.1554, "grad_norm": 0.2039482295513153, "learning_rate": 0.0001689204, "loss": 0.3196, "step": 77700 }, { "epoch": 0.1556, "grad_norm": 0.2261604517698288, "learning_rate": 0.0001688804, "loss": 0.3157, "step": 77800 }, { "epoch": 0.1558, "grad_norm": 0.23354065418243408, "learning_rate": 0.00016884040000000002, "loss": 0.3091, "step": 77900 }, { "epoch": 0.156, "grad_norm": 0.3552130162715912, "learning_rate": 0.0001688004, "loss": 0.3138, "step": 78000 }, { "epoch": 0.1562, "grad_norm": 0.3127024471759796, "learning_rate": 0.00016876040000000003, "loss": 0.3134, "step": 78100 }, { "epoch": 0.1564, "grad_norm": 0.2056431621313095, "learning_rate": 0.0001687204, "loss": 0.3082, "step": 78200 }, { "epoch": 0.1566, "grad_norm": 0.22769121825695038, "learning_rate": 0.0001686804, "loss": 0.318, "step": 78300 }, { "epoch": 0.1568, "grad_norm": 0.1975649893283844, "learning_rate": 0.0001686404, "loss": 0.3284, "step": 78400 }, { "epoch": 0.157, "grad_norm": 0.25407645106315613, "learning_rate": 0.0001686004, "loss": 0.3124, "step": 78500 }, { "epoch": 0.1572, "grad_norm": 0.18915730714797974, "learning_rate": 0.00016856040000000003, "loss": 0.3125, "step": 78600 }, { "epoch": 0.1574, "grad_norm": 0.20961833000183105, "learning_rate": 0.0001685204, "loss": 0.3113, "step": 78700 }, { "epoch": 0.1576, "grad_norm": 0.20440556108951569, "learning_rate": 0.00016848040000000001, "loss": 0.3133, "step": 78800 }, { "epoch": 0.1578, "grad_norm": 0.20298384130001068, "learning_rate": 0.0001684404, "loss": 0.3195, "step": 78900 }, { "epoch": 0.158, "grad_norm": 0.2182042896747589, "learning_rate": 0.0001684004, "loss": 0.3091, "step": 79000 }, { "epoch": 0.1582, "grad_norm": 0.22923989593982697, "learning_rate": 0.00016836040000000002, "loss": 0.3181, "step": 79100 }, { "epoch": 0.1584, "grad_norm": 0.3075358271598816, "learning_rate": 0.0001683204, "loss": 0.3143, "step": 79200 }, { "epoch": 0.1586, "grad_norm": 0.2053437978029251, "learning_rate": 0.0001682804, "loss": 0.3086, "step": 79300 }, { "epoch": 0.1588, "grad_norm": 0.7424286007881165, "learning_rate": 0.0001682404, "loss": 0.3103, "step": 79400 }, { "epoch": 0.159, "grad_norm": 0.6188491582870483, "learning_rate": 0.00016820040000000002, "loss": 0.3183, "step": 79500 }, { "epoch": 0.1592, "grad_norm": 0.276163786649704, "learning_rate": 0.00016816040000000002, "loss": 0.3191, "step": 79600 }, { "epoch": 0.1594, "grad_norm": 0.18703459203243256, "learning_rate": 0.0001681204, "loss": 0.3139, "step": 79700 }, { "epoch": 0.1596, "grad_norm": 0.17597968876361847, "learning_rate": 0.0001680804, "loss": 0.3108, "step": 79800 }, { "epoch": 0.1598, "grad_norm": 0.24276955425739288, "learning_rate": 0.0001680404, "loss": 0.3068, "step": 79900 }, { "epoch": 0.16, "grad_norm": 0.21838606894016266, "learning_rate": 0.00016800040000000002, "loss": 0.3132, "step": 80000 }, { "epoch": 0.1602, "grad_norm": 0.2652193009853363, "learning_rate": 0.0001679604, "loss": 0.3107, "step": 80100 }, { "epoch": 0.1604, "grad_norm": 0.22899410128593445, "learning_rate": 0.0001679204, "loss": 0.3124, "step": 80200 }, { "epoch": 0.1606, "grad_norm": 0.19513286650180817, "learning_rate": 0.0001678804, "loss": 0.3125, "step": 80300 }, { "epoch": 0.1608, "grad_norm": 0.25317785143852234, "learning_rate": 0.00016784040000000002, "loss": 0.3075, "step": 80400 }, { "epoch": 0.161, "grad_norm": 0.21707969903945923, "learning_rate": 0.00016780040000000001, "loss": 0.3203, "step": 80500 }, { "epoch": 0.1612, "grad_norm": 0.22994515299797058, "learning_rate": 0.0001677604, "loss": 0.3136, "step": 80600 }, { "epoch": 0.1614, "grad_norm": 0.29428768157958984, "learning_rate": 0.0001677204, "loss": 0.3201, "step": 80700 }, { "epoch": 0.1616, "grad_norm": 0.24848338961601257, "learning_rate": 0.0001676804, "loss": 0.3169, "step": 80800 }, { "epoch": 0.1618, "grad_norm": 0.2289322018623352, "learning_rate": 0.00016764040000000002, "loss": 0.3055, "step": 80900 }, { "epoch": 0.162, "grad_norm": 0.20457616448402405, "learning_rate": 0.0001676004, "loss": 0.3115, "step": 81000 }, { "epoch": 0.1622, "grad_norm": 0.21032114326953888, "learning_rate": 0.00016756040000000003, "loss": 0.3116, "step": 81100 }, { "epoch": 0.1624, "grad_norm": 0.20674587786197662, "learning_rate": 0.0001675204, "loss": 0.3184, "step": 81200 }, { "epoch": 0.1626, "grad_norm": 0.4695895314216614, "learning_rate": 0.00016748040000000002, "loss": 0.3165, "step": 81300 }, { "epoch": 0.1628, "grad_norm": 0.26248136162757874, "learning_rate": 0.0001674404, "loss": 0.3092, "step": 81400 }, { "epoch": 0.163, "grad_norm": 0.20731079578399658, "learning_rate": 0.0001674004, "loss": 0.3141, "step": 81500 }, { "epoch": 0.1632, "grad_norm": 0.2757965922355652, "learning_rate": 0.00016736040000000002, "loss": 0.3124, "step": 81600 }, { "epoch": 0.1634, "grad_norm": 0.23029279708862305, "learning_rate": 0.0001673204, "loss": 0.3079, "step": 81700 }, { "epoch": 0.1636, "grad_norm": 0.22975629568099976, "learning_rate": 0.0001672804, "loss": 0.3157, "step": 81800 }, { "epoch": 0.1638, "grad_norm": 0.27681633830070496, "learning_rate": 0.0001672404, "loss": 0.31, "step": 81900 }, { "epoch": 0.164, "grad_norm": 0.20509104430675507, "learning_rate": 0.0001672004, "loss": 0.3141, "step": 82000 }, { "epoch": 0.1642, "grad_norm": 0.22510622441768646, "learning_rate": 0.00016716040000000002, "loss": 0.3046, "step": 82100 }, { "epoch": 0.1644, "grad_norm": 0.24418208003044128, "learning_rate": 0.0001671204, "loss": 0.324, "step": 82200 }, { "epoch": 0.1646, "grad_norm": 0.2196940779685974, "learning_rate": 0.0001670804, "loss": 0.3079, "step": 82300 }, { "epoch": 0.1648, "grad_norm": 0.19491960108280182, "learning_rate": 0.0001670404, "loss": 0.3114, "step": 82400 }, { "epoch": 0.165, "grad_norm": 0.18604259192943573, "learning_rate": 0.00016700040000000002, "loss": 0.3087, "step": 82500 }, { "epoch": 0.1652, "grad_norm": 0.22354719042778015, "learning_rate": 0.00016696040000000002, "loss": 0.3059, "step": 82600 }, { "epoch": 0.1654, "grad_norm": 0.19884774088859558, "learning_rate": 0.0001669204, "loss": 0.3094, "step": 82700 }, { "epoch": 0.1656, "grad_norm": 0.17430974543094635, "learning_rate": 0.0001668804, "loss": 0.3062, "step": 82800 }, { "epoch": 0.1658, "grad_norm": 0.32356834411621094, "learning_rate": 0.0001668404, "loss": 0.3127, "step": 82900 }, { "epoch": 0.166, "grad_norm": 0.19034580886363983, "learning_rate": 0.00016680040000000002, "loss": 0.308, "step": 83000 }, { "epoch": 0.1662, "grad_norm": 0.2106025516986847, "learning_rate": 0.0001667604, "loss": 0.3065, "step": 83100 }, { "epoch": 0.1664, "grad_norm": 0.22621573507785797, "learning_rate": 0.0001667204, "loss": 0.327, "step": 83200 }, { "epoch": 0.1666, "grad_norm": 0.19813230633735657, "learning_rate": 0.0001666804, "loss": 0.3081, "step": 83300 }, { "epoch": 0.1668, "grad_norm": 0.20063921809196472, "learning_rate": 0.00016664040000000002, "loss": 0.3069, "step": 83400 }, { "epoch": 0.167, "grad_norm": 0.1986655443906784, "learning_rate": 0.0001666004, "loss": 0.316, "step": 83500 }, { "epoch": 0.1672, "grad_norm": 0.22707831859588623, "learning_rate": 0.0001665604, "loss": 0.3103, "step": 83600 }, { "epoch": 0.1674, "grad_norm": 0.233692929148674, "learning_rate": 0.0001665204, "loss": 0.3196, "step": 83700 }, { "epoch": 0.1676, "grad_norm": 0.35872554779052734, "learning_rate": 0.0001664804, "loss": 0.3179, "step": 83800 }, { "epoch": 0.1678, "grad_norm": 0.3032999336719513, "learning_rate": 0.0001664404, "loss": 0.3123, "step": 83900 }, { "epoch": 0.168, "grad_norm": 0.27240657806396484, "learning_rate": 0.0001664004, "loss": 0.312, "step": 84000 }, { "epoch": 0.1682, "grad_norm": 0.20436140894889832, "learning_rate": 0.00016636040000000003, "loss": 0.3079, "step": 84100 }, { "epoch": 0.1684, "grad_norm": 0.26683861017227173, "learning_rate": 0.0001663204, "loss": 0.3107, "step": 84200 }, { "epoch": 0.1686, "grad_norm": 0.3275362253189087, "learning_rate": 0.00016628040000000001, "loss": 0.3066, "step": 84300 }, { "epoch": 0.1688, "grad_norm": 0.24865290522575378, "learning_rate": 0.0001662404, "loss": 0.3103, "step": 84400 }, { "epoch": 0.169, "grad_norm": 0.24508464336395264, "learning_rate": 0.0001662004, "loss": 0.3084, "step": 84500 }, { "epoch": 0.1692, "grad_norm": 0.19160589575767517, "learning_rate": 0.00016616040000000002, "loss": 0.3086, "step": 84600 }, { "epoch": 0.1694, "grad_norm": 0.21846391260623932, "learning_rate": 0.0001661204, "loss": 0.3038, "step": 84700 }, { "epoch": 0.1696, "grad_norm": 0.3486625850200653, "learning_rate": 0.0001660804, "loss": 0.3157, "step": 84800 }, { "epoch": 0.1698, "grad_norm": 0.281613826751709, "learning_rate": 0.0001660404, "loss": 0.3051, "step": 84900 }, { "epoch": 0.17, "grad_norm": 0.2179608792066574, "learning_rate": 0.00016600040000000002, "loss": 0.3258, "step": 85000 }, { "epoch": 0.1702, "grad_norm": 0.24830220639705658, "learning_rate": 0.00016596040000000002, "loss": 0.312, "step": 85100 }, { "epoch": 0.1704, "grad_norm": 0.2119312733411789, "learning_rate": 0.0001659204, "loss": 0.3119, "step": 85200 }, { "epoch": 0.1706, "grad_norm": 0.34956997632980347, "learning_rate": 0.0001658804, "loss": 0.3004, "step": 85300 }, { "epoch": 0.1708, "grad_norm": 0.2335246205329895, "learning_rate": 0.0001658404, "loss": 0.3102, "step": 85400 }, { "epoch": 0.171, "grad_norm": 0.2509993612766266, "learning_rate": 0.00016580040000000002, "loss": 0.3118, "step": 85500 }, { "epoch": 0.1712, "grad_norm": 0.23735851049423218, "learning_rate": 0.0001657604, "loss": 0.3073, "step": 85600 }, { "epoch": 0.1714, "grad_norm": 0.1960500031709671, "learning_rate": 0.0001657204, "loss": 0.3061, "step": 85700 }, { "epoch": 0.1716, "grad_norm": 0.17898212373256683, "learning_rate": 0.0001656804, "loss": 0.3056, "step": 85800 }, { "epoch": 0.1718, "grad_norm": 0.207002654671669, "learning_rate": 0.0001656404, "loss": 0.3114, "step": 85900 }, { "epoch": 0.172, "grad_norm": 0.19042788445949554, "learning_rate": 0.00016560040000000001, "loss": 0.3109, "step": 86000 }, { "epoch": 0.1722, "grad_norm": 0.2865552604198456, "learning_rate": 0.0001655604, "loss": 0.3185, "step": 86100 }, { "epoch": 0.1724, "grad_norm": 0.19299517571926117, "learning_rate": 0.0001655204, "loss": 0.3329, "step": 86200 }, { "epoch": 0.1726, "grad_norm": 0.22495882213115692, "learning_rate": 0.0001654804, "loss": 0.3175, "step": 86300 }, { "epoch": 0.1728, "grad_norm": 0.23224885761737823, "learning_rate": 0.00016544040000000002, "loss": 0.3076, "step": 86400 }, { "epoch": 0.173, "grad_norm": 0.24318882822990417, "learning_rate": 0.0001654004, "loss": 0.3123, "step": 86500 }, { "epoch": 0.1732, "grad_norm": 0.24434731900691986, "learning_rate": 0.0001653604, "loss": 0.3092, "step": 86600 }, { "epoch": 0.1734, "grad_norm": 0.23535099625587463, "learning_rate": 0.0001653204, "loss": 0.3091, "step": 86700 }, { "epoch": 0.1736, "grad_norm": 0.3537319600582123, "learning_rate": 0.0001652804, "loss": 0.3187, "step": 86800 }, { "epoch": 0.1738, "grad_norm": 0.2087029367685318, "learning_rate": 0.0001652404, "loss": 0.3096, "step": 86900 }, { "epoch": 0.174, "grad_norm": 0.20568256080150604, "learning_rate": 0.0001652004, "loss": 0.3114, "step": 87000 }, { "epoch": 0.1742, "grad_norm": 0.19157211482524872, "learning_rate": 0.00016516040000000003, "loss": 0.3124, "step": 87100 }, { "epoch": 0.1744, "grad_norm": 0.20994393527507782, "learning_rate": 0.0001651204, "loss": 0.3047, "step": 87200 }, { "epoch": 0.1746, "grad_norm": 0.4066809117794037, "learning_rate": 0.0001650804, "loss": 0.3105, "step": 87300 }, { "epoch": 0.1748, "grad_norm": 0.21257740259170532, "learning_rate": 0.0001650404, "loss": 0.3084, "step": 87400 }, { "epoch": 0.175, "grad_norm": 0.21162879467010498, "learning_rate": 0.0001650004, "loss": 0.3105, "step": 87500 }, { "epoch": 0.1752, "grad_norm": 0.21106883883476257, "learning_rate": 0.00016496040000000002, "loss": 0.3098, "step": 87600 }, { "epoch": 0.1754, "grad_norm": 0.21054783463478088, "learning_rate": 0.0001649204, "loss": 0.3018, "step": 87700 }, { "epoch": 0.1756, "grad_norm": 0.1708284318447113, "learning_rate": 0.0001648804, "loss": 0.3049, "step": 87800 }, { "epoch": 0.1758, "grad_norm": 0.20066872239112854, "learning_rate": 0.0001648404, "loss": 0.3055, "step": 87900 }, { "epoch": 0.176, "grad_norm": 0.3257821798324585, "learning_rate": 0.00016480040000000002, "loss": 0.3102, "step": 88000 }, { "epoch": 0.1762, "grad_norm": 0.1875178962945938, "learning_rate": 0.00016476040000000002, "loss": 0.311, "step": 88100 }, { "epoch": 0.1764, "grad_norm": 0.3387661874294281, "learning_rate": 0.0001647204, "loss": 0.3101, "step": 88200 }, { "epoch": 0.1766, "grad_norm": 0.3336031436920166, "learning_rate": 0.0001646804, "loss": 0.3154, "step": 88300 }, { "epoch": 0.1768, "grad_norm": 0.21378177404403687, "learning_rate": 0.0001646404, "loss": 0.3195, "step": 88400 }, { "epoch": 0.177, "grad_norm": 0.2274891436100006, "learning_rate": 0.00016460040000000002, "loss": 0.3096, "step": 88500 }, { "epoch": 0.1772, "grad_norm": 0.23640379309654236, "learning_rate": 0.0001645604, "loss": 0.3089, "step": 88600 }, { "epoch": 0.1774, "grad_norm": 0.19889909029006958, "learning_rate": 0.00016452040000000003, "loss": 0.3058, "step": 88700 }, { "epoch": 0.1776, "grad_norm": 0.20005521178245544, "learning_rate": 0.0001644804, "loss": 0.3059, "step": 88800 }, { "epoch": 0.1778, "grad_norm": 0.18955092132091522, "learning_rate": 0.00016444040000000002, "loss": 0.3026, "step": 88900 }, { "epoch": 0.178, "grad_norm": 0.23005805909633636, "learning_rate": 0.0001644004, "loss": 0.3067, "step": 89000 }, { "epoch": 0.1782, "grad_norm": 0.1776682585477829, "learning_rate": 0.0001643604, "loss": 0.3029, "step": 89100 }, { "epoch": 0.1784, "grad_norm": 0.29056572914123535, "learning_rate": 0.00016432040000000003, "loss": 0.3025, "step": 89200 }, { "epoch": 0.1786, "grad_norm": 0.2611645460128784, "learning_rate": 0.0001642804, "loss": 0.3022, "step": 89300 }, { "epoch": 0.1788, "grad_norm": 0.22479821741580963, "learning_rate": 0.00016424040000000001, "loss": 0.3102, "step": 89400 }, { "epoch": 0.179, "grad_norm": 0.3284640908241272, "learning_rate": 0.0001642004, "loss": 0.3078, "step": 89500 }, { "epoch": 0.1792, "grad_norm": 0.19706085324287415, "learning_rate": 0.0001641604, "loss": 0.3005, "step": 89600 }, { "epoch": 0.1794, "grad_norm": 0.1871166229248047, "learning_rate": 0.00016412040000000002, "loss": 0.3067, "step": 89700 }, { "epoch": 0.1796, "grad_norm": 0.18414820730686188, "learning_rate": 0.0001640804, "loss": 0.3065, "step": 89800 }, { "epoch": 0.1798, "grad_norm": 0.22224344313144684, "learning_rate": 0.0001640404, "loss": 0.3023, "step": 89900 }, { "epoch": 0.18, "grad_norm": 0.1839105784893036, "learning_rate": 0.0001640004, "loss": 0.3044, "step": 90000 }, { "epoch": 0.1802, "grad_norm": 0.3029592037200928, "learning_rate": 0.00016396040000000002, "loss": 0.3002, "step": 90100 }, { "epoch": 0.1804, "grad_norm": 0.21777479350566864, "learning_rate": 0.00016392040000000002, "loss": 0.3055, "step": 90200 }, { "epoch": 0.1806, "grad_norm": 0.1881372481584549, "learning_rate": 0.0001638804, "loss": 0.3015, "step": 90300 }, { "epoch": 0.1808, "grad_norm": 0.20605388283729553, "learning_rate": 0.0001638404, "loss": 0.3086, "step": 90400 }, { "epoch": 0.181, "grad_norm": 0.21422967314720154, "learning_rate": 0.0001638004, "loss": 0.3107, "step": 90500 }, { "epoch": 0.1812, "grad_norm": 0.18776142597198486, "learning_rate": 0.00016376040000000002, "loss": 0.297, "step": 90600 }, { "epoch": 0.1814, "grad_norm": 0.22885388135910034, "learning_rate": 0.0001637204, "loss": 0.2897, "step": 90700 }, { "epoch": 0.1816, "grad_norm": 0.16700296103954315, "learning_rate": 0.0001636804, "loss": 0.298, "step": 90800 }, { "epoch": 0.1818, "grad_norm": 0.2172263115644455, "learning_rate": 0.0001636404, "loss": 0.2995, "step": 90900 }, { "epoch": 0.182, "grad_norm": 0.25213050842285156, "learning_rate": 0.00016360040000000002, "loss": 0.3077, "step": 91000 }, { "epoch": 0.1822, "grad_norm": 0.22334252297878265, "learning_rate": 0.00016356040000000001, "loss": 0.3077, "step": 91100 }, { "epoch": 0.1824, "grad_norm": 0.25537583231925964, "learning_rate": 0.0001635204, "loss": 0.3074, "step": 91200 }, { "epoch": 0.1826, "grad_norm": 0.24679423868656158, "learning_rate": 0.0001634804, "loss": 0.3139, "step": 91300 }, { "epoch": 0.1828, "grad_norm": 0.26473841071128845, "learning_rate": 0.0001634404, "loss": 0.3155, "step": 91400 }, { "epoch": 0.183, "grad_norm": 0.23032037913799286, "learning_rate": 0.00016340040000000002, "loss": 0.3191, "step": 91500 }, { "epoch": 0.1832, "grad_norm": 0.4667571485042572, "learning_rate": 0.0001633604, "loss": 0.3006, "step": 91600 }, { "epoch": 0.1834, "grad_norm": 0.1938803791999817, "learning_rate": 0.00016332040000000003, "loss": 0.3034, "step": 91700 }, { "epoch": 0.1836, "grad_norm": 0.1822306215763092, "learning_rate": 0.0001632804, "loss": 0.306, "step": 91800 }, { "epoch": 0.1838, "grad_norm": 0.29107385873794556, "learning_rate": 0.00016324040000000002, "loss": 0.304, "step": 91900 }, { "epoch": 0.184, "grad_norm": 0.3339613676071167, "learning_rate": 0.0001632004, "loss": 0.2963, "step": 92000 }, { "epoch": 0.1842, "grad_norm": 0.2274635285139084, "learning_rate": 0.0001631604, "loss": 0.2997, "step": 92100 }, { "epoch": 0.1844, "grad_norm": 0.19457891583442688, "learning_rate": 0.00016312040000000002, "loss": 0.306, "step": 92200 }, { "epoch": 0.1846, "grad_norm": 0.2524142265319824, "learning_rate": 0.0001630804, "loss": 0.3096, "step": 92300 }, { "epoch": 0.1848, "grad_norm": 0.25092560052871704, "learning_rate": 0.0001630404, "loss": 0.2999, "step": 92400 }, { "epoch": 0.185, "grad_norm": 0.323590487241745, "learning_rate": 0.0001630004, "loss": 0.2996, "step": 92500 }, { "epoch": 0.1852, "grad_norm": 0.4235788881778717, "learning_rate": 0.00016296040000000003, "loss": 0.3031, "step": 92600 }, { "epoch": 0.1854, "grad_norm": 0.19289268553256989, "learning_rate": 0.00016292040000000002, "loss": 0.3026, "step": 92700 }, { "epoch": 0.1856, "grad_norm": 0.1985141932964325, "learning_rate": 0.00016288039999999999, "loss": 0.302, "step": 92800 }, { "epoch": 0.1858, "grad_norm": 0.3089956045150757, "learning_rate": 0.0001628404, "loss": 0.3002, "step": 92900 }, { "epoch": 0.186, "grad_norm": 0.2129218727350235, "learning_rate": 0.0001628004, "loss": 0.3036, "step": 93000 }, { "epoch": 0.1862, "grad_norm": 0.3660809397697449, "learning_rate": 0.00016276040000000002, "loss": 0.3017, "step": 93100 }, { "epoch": 0.1864, "grad_norm": 0.22182439267635345, "learning_rate": 0.00016272040000000002, "loss": 0.3012, "step": 93200 }, { "epoch": 0.1866, "grad_norm": 0.22655236721038818, "learning_rate": 0.0001626804, "loss": 0.3054, "step": 93300 }, { "epoch": 0.1868, "grad_norm": 0.19816626608371735, "learning_rate": 0.0001626404, "loss": 0.2978, "step": 93400 }, { "epoch": 0.187, "grad_norm": 0.19407741725444794, "learning_rate": 0.0001626004, "loss": 0.3013, "step": 93500 }, { "epoch": 0.1872, "grad_norm": 0.15464486181735992, "learning_rate": 0.00016256040000000002, "loss": 0.3073, "step": 93600 }, { "epoch": 0.1874, "grad_norm": 0.34153929352760315, "learning_rate": 0.0001625204, "loss": 0.3023, "step": 93700 }, { "epoch": 0.1876, "grad_norm": 0.35526177287101746, "learning_rate": 0.0001624804, "loss": 0.311, "step": 93800 }, { "epoch": 0.1878, "grad_norm": 0.2251167744398117, "learning_rate": 0.0001624404, "loss": 0.3127, "step": 93900 }, { "epoch": 0.188, "grad_norm": 0.25468969345092773, "learning_rate": 0.00016240040000000002, "loss": 0.3004, "step": 94000 }, { "epoch": 0.1882, "grad_norm": 0.17790032923221588, "learning_rate": 0.0001623604, "loss": 0.3096, "step": 94100 }, { "epoch": 0.1884, "grad_norm": 0.23429210484027863, "learning_rate": 0.0001623204, "loss": 0.3023, "step": 94200 }, { "epoch": 0.1886, "grad_norm": 0.3114053010940552, "learning_rate": 0.0001622804, "loss": 0.3043, "step": 94300 }, { "epoch": 0.1888, "grad_norm": 0.19942690432071686, "learning_rate": 0.0001622404, "loss": 0.2985, "step": 94400 }, { "epoch": 0.189, "grad_norm": 0.22419995069503784, "learning_rate": 0.0001622004, "loss": 0.3014, "step": 94500 }, { "epoch": 0.1892, "grad_norm": 0.19408832490444183, "learning_rate": 0.0001621604, "loss": 0.3031, "step": 94600 }, { "epoch": 0.1894, "grad_norm": 0.24643263220787048, "learning_rate": 0.00016212040000000003, "loss": 0.3042, "step": 94700 }, { "epoch": 0.1896, "grad_norm": 0.22279608249664307, "learning_rate": 0.0001620804, "loss": 0.3022, "step": 94800 }, { "epoch": 0.1898, "grad_norm": 0.340925931930542, "learning_rate": 0.00016204040000000001, "loss": 0.2986, "step": 94900 }, { "epoch": 0.19, "grad_norm": 0.1957314908504486, "learning_rate": 0.0001620004, "loss": 0.3014, "step": 95000 }, { "epoch": 0.1902, "grad_norm": 0.23643195629119873, "learning_rate": 0.0001619604, "loss": 0.3103, "step": 95100 }, { "epoch": 0.1904, "grad_norm": 0.2049199789762497, "learning_rate": 0.00016192040000000002, "loss": 0.3039, "step": 95200 }, { "epoch": 0.1906, "grad_norm": 0.19225084781646729, "learning_rate": 0.0001618804, "loss": 0.2963, "step": 95300 }, { "epoch": 0.1908, "grad_norm": 0.2920241355895996, "learning_rate": 0.0001618404, "loss": 0.2996, "step": 95400 }, { "epoch": 0.191, "grad_norm": 0.21930663287639618, "learning_rate": 0.0001618004, "loss": 0.3014, "step": 95500 }, { "epoch": 0.1912, "grad_norm": 0.2423180788755417, "learning_rate": 0.00016176040000000002, "loss": 0.3032, "step": 95600 }, { "epoch": 0.1914, "grad_norm": 0.37270796298980713, "learning_rate": 0.00016172040000000002, "loss": 0.3054, "step": 95700 }, { "epoch": 0.1916, "grad_norm": 0.26363104581832886, "learning_rate": 0.0001616804, "loss": 0.3049, "step": 95800 }, { "epoch": 0.1918, "grad_norm": 0.2359706610441208, "learning_rate": 0.0001616404, "loss": 0.3016, "step": 95900 }, { "epoch": 0.192, "grad_norm": 0.2633230984210968, "learning_rate": 0.0001616004, "loss": 0.3027, "step": 96000 }, { "epoch": 0.1922, "grad_norm": 0.16779562830924988, "learning_rate": 0.00016156040000000002, "loss": 0.3006, "step": 96100 }, { "epoch": 0.1924, "grad_norm": 0.17921751737594604, "learning_rate": 0.0001615204, "loss": 0.2997, "step": 96200 }, { "epoch": 0.1926, "grad_norm": 0.1813753992319107, "learning_rate": 0.0001614804, "loss": 0.2989, "step": 96300 }, { "epoch": 0.1928, "grad_norm": 0.2046331912279129, "learning_rate": 0.0001614404, "loss": 0.2992, "step": 96400 }, { "epoch": 0.193, "grad_norm": 0.1848832368850708, "learning_rate": 0.0001614004, "loss": 0.301, "step": 96500 }, { "epoch": 0.1932, "grad_norm": 0.27407926321029663, "learning_rate": 0.00016136040000000001, "loss": 0.3084, "step": 96600 }, { "epoch": 0.1934, "grad_norm": 0.17408868670463562, "learning_rate": 0.0001613204, "loss": 0.2984, "step": 96700 }, { "epoch": 0.1936, "grad_norm": 0.3170103132724762, "learning_rate": 0.0001612804, "loss": 0.3098, "step": 96800 }, { "epoch": 0.1938, "grad_norm": 0.1906721144914627, "learning_rate": 0.0001612404, "loss": 0.3002, "step": 96900 }, { "epoch": 0.194, "grad_norm": 0.22694748640060425, "learning_rate": 0.00016120040000000002, "loss": 0.3056, "step": 97000 }, { "epoch": 0.1942, "grad_norm": 0.2381366342306137, "learning_rate": 0.0001611604, "loss": 0.3112, "step": 97100 }, { "epoch": 0.1944, "grad_norm": 0.2619655132293701, "learning_rate": 0.0001611204, "loss": 0.3044, "step": 97200 }, { "epoch": 0.1946, "grad_norm": 0.22410011291503906, "learning_rate": 0.0001610804, "loss": 0.2976, "step": 97300 }, { "epoch": 0.1948, "grad_norm": 0.17326219379901886, "learning_rate": 0.0001610404, "loss": 0.2977, "step": 97400 }, { "epoch": 0.195, "grad_norm": 0.16517135500907898, "learning_rate": 0.0001610004, "loss": 0.3074, "step": 97500 }, { "epoch": 0.1952, "grad_norm": 0.2561172544956207, "learning_rate": 0.0001609604, "loss": 0.3074, "step": 97600 }, { "epoch": 0.1954, "grad_norm": 0.19643421471118927, "learning_rate": 0.00016092040000000003, "loss": 0.3031, "step": 97700 }, { "epoch": 0.1956, "grad_norm": 0.26884031295776367, "learning_rate": 0.0001608804, "loss": 0.298, "step": 97800 }, { "epoch": 0.1958, "grad_norm": 0.1689986288547516, "learning_rate": 0.0001608404, "loss": 0.303, "step": 97900 }, { "epoch": 0.196, "grad_norm": 0.22177661955356598, "learning_rate": 0.0001608004, "loss": 0.3113, "step": 98000 }, { "epoch": 0.1962, "grad_norm": 0.23559048771858215, "learning_rate": 0.0001607604, "loss": 0.3019, "step": 98100 }, { "epoch": 0.1964, "grad_norm": 0.2539811432361603, "learning_rate": 0.00016072040000000002, "loss": 0.3017, "step": 98200 }, { "epoch": 0.1966, "grad_norm": 0.2040734440088272, "learning_rate": 0.0001606804, "loss": 0.294, "step": 98300 }, { "epoch": 0.1968, "grad_norm": 0.18892860412597656, "learning_rate": 0.0001606404, "loss": 0.3082, "step": 98400 }, { "epoch": 0.197, "grad_norm": 0.23716984689235687, "learning_rate": 0.0001606004, "loss": 0.3005, "step": 98500 }, { "epoch": 0.1972, "grad_norm": 0.28783920407295227, "learning_rate": 0.00016056040000000002, "loss": 0.303, "step": 98600 }, { "epoch": 0.1974, "grad_norm": 0.31491243839263916, "learning_rate": 0.00016052040000000002, "loss": 0.2989, "step": 98700 }, { "epoch": 0.1976, "grad_norm": 0.2184073030948639, "learning_rate": 0.0001604804, "loss": 0.2992, "step": 98800 }, { "epoch": 0.1978, "grad_norm": 0.19414541125297546, "learning_rate": 0.0001604404, "loss": 0.3067, "step": 98900 }, { "epoch": 0.198, "grad_norm": 0.18975822627544403, "learning_rate": 0.0001604004, "loss": 0.2994, "step": 99000 }, { "epoch": 0.1982, "grad_norm": 0.24121256172657013, "learning_rate": 0.00016036040000000002, "loss": 0.2967, "step": 99100 }, { "epoch": 0.1984, "grad_norm": 0.19773977994918823, "learning_rate": 0.0001603204, "loss": 0.2955, "step": 99200 }, { "epoch": 0.1986, "grad_norm": 0.22694402933120728, "learning_rate": 0.0001602804, "loss": 0.2977, "step": 99300 }, { "epoch": 0.1988, "grad_norm": 0.3497297465801239, "learning_rate": 0.0001602404, "loss": 0.3044, "step": 99400 }, { "epoch": 0.199, "grad_norm": 0.18374694883823395, "learning_rate": 0.00016020040000000002, "loss": 0.2965, "step": 99500 }, { "epoch": 0.1992, "grad_norm": 0.33236002922058105, "learning_rate": 0.0001601604, "loss": 0.3007, "step": 99600 }, { "epoch": 0.1994, "grad_norm": 0.17745955288410187, "learning_rate": 0.0001601204, "loss": 0.2924, "step": 99700 }, { "epoch": 0.1996, "grad_norm": 0.3132670223712921, "learning_rate": 0.0001600804, "loss": 0.3021, "step": 99800 }, { "epoch": 0.1998, "grad_norm": 0.2375771552324295, "learning_rate": 0.0001600404, "loss": 0.3016, "step": 99900 }, { "epoch": 0.2, "grad_norm": 0.31381866335868835, "learning_rate": 0.00016000040000000001, "loss": 0.2965, "step": 100000 }, { "epoch": 0.0002, "grad_norm": 0.24663914740085602, "learning_rate": 0.0001599604, "loss": 0.3381, "step": 100100 }, { "epoch": 0.0004, "grad_norm": 0.6509382128715515, "learning_rate": 0.0001599204, "loss": 0.3687, "step": 100200 }, { "epoch": 0.0006, "grad_norm": 0.8887121081352234, "learning_rate": 0.0001598804, "loss": 0.3573, "step": 100300 }, { "epoch": 0.0008, "grad_norm": 0.366519033908844, "learning_rate": 0.0001598404, "loss": 0.3624, "step": 100400 }, { "epoch": 0.001, "grad_norm": 0.2282172590494156, "learning_rate": 0.0001598004, "loss": 0.3336, "step": 100500 }, { "epoch": 0.0012, "grad_norm": 0.46760937571525574, "learning_rate": 0.0001597604, "loss": 0.3279, "step": 100600 }, { "epoch": 0.0014, "grad_norm": 0.38649675250053406, "learning_rate": 0.00015972040000000002, "loss": 0.3385, "step": 100700 }, { "epoch": 0.0016, "grad_norm": 0.4251730740070343, "learning_rate": 0.0001596804, "loss": 0.335, "step": 100800 }, { "epoch": 0.0018, "grad_norm": 0.3479570150375366, "learning_rate": 0.0001596404, "loss": 0.3238, "step": 100900 }, { "epoch": 0.002, "grad_norm": 0.3096480667591095, "learning_rate": 0.0001596004, "loss": 0.3159, "step": 101000 }, { "epoch": 0.0022, "grad_norm": 0.36655187606811523, "learning_rate": 0.0001595604, "loss": 0.3457, "step": 101100 }, { "epoch": 0.0024, "grad_norm": 0.6139711141586304, "learning_rate": 0.00015952040000000002, "loss": 0.3249, "step": 101200 }, { "epoch": 0.0026, "grad_norm": 0.6844519376754761, "learning_rate": 0.0001594804, "loss": 0.3306, "step": 101300 }, { "epoch": 0.0028, "grad_norm": 0.16951784491539001, "learning_rate": 0.0001594404, "loss": 0.3109, "step": 101400 }, { "epoch": 0.003, "grad_norm": 0.4543248414993286, "learning_rate": 0.0001594004, "loss": 0.3241, "step": 101500 }, { "epoch": 0.0032, "grad_norm": 0.6316892504692078, "learning_rate": 0.00015936040000000002, "loss": 0.3479, "step": 101600 }, { "epoch": 0.0034, "grad_norm": 0.8181749582290649, "learning_rate": 0.00015932040000000001, "loss": 0.3215, "step": 101700 }, { "epoch": 0.0036, "grad_norm": 0.33392763137817383, "learning_rate": 0.0001592804, "loss": 0.3253, "step": 101800 }, { "epoch": 0.0038, "grad_norm": 0.27364447712898254, "learning_rate": 0.0001592404, "loss": 0.3252, "step": 101900 }, { "epoch": 0.004, "grad_norm": 0.443889319896698, "learning_rate": 0.0001592004, "loss": 0.3214, "step": 102000 }, { "epoch": 0.0042, "grad_norm": 0.23544713854789734, "learning_rate": 0.00015916040000000002, "loss": 0.3227, "step": 102100 }, { "epoch": 0.0044, "grad_norm": 0.24548670649528503, "learning_rate": 0.0001591204, "loss": 0.3491, "step": 102200 }, { "epoch": 0.0046, "grad_norm": 0.4170477092266083, "learning_rate": 0.00015908040000000003, "loss": 0.3318, "step": 102300 }, { "epoch": 0.0048, "grad_norm": 0.24510768055915833, "learning_rate": 0.0001590404, "loss": 0.3116, "step": 102400 }, { "epoch": 0.005, "grad_norm": 0.2921444773674011, "learning_rate": 0.00015900040000000002, "loss": 0.3049, "step": 102500 }, { "epoch": 0.0052, "grad_norm": 0.2453673630952835, "learning_rate": 0.0001589604, "loss": 0.3052, "step": 102600 }, { "epoch": 0.0054, "grad_norm": 0.3682481348514557, "learning_rate": 0.0001589204, "loss": 0.3123, "step": 102700 }, { "epoch": 0.0056, "grad_norm": 0.6065927743911743, "learning_rate": 0.00015888040000000002, "loss": 0.3278, "step": 102800 }, { "epoch": 0.0058, "grad_norm": 0.27878880500793457, "learning_rate": 0.0001588404, "loss": 0.3095, "step": 102900 }, { "epoch": 0.006, "grad_norm": 0.3071708083152771, "learning_rate": 0.0001588004, "loss": 0.3729, "step": 103000 }, { "epoch": 0.0062, "grad_norm": 0.27451980113983154, "learning_rate": 0.0001587604, "loss": 0.3234, "step": 103100 }, { "epoch": 0.0064, "grad_norm": 0.1992095708847046, "learning_rate": 0.00015872040000000003, "loss": 0.3142, "step": 103200 }, { "epoch": 0.0066, "grad_norm": 1.1765793561935425, "learning_rate": 0.00015868040000000002, "loss": 0.3365, "step": 103300 }, { "epoch": 0.0068, "grad_norm": 0.22701187431812286, "learning_rate": 0.0001586404, "loss": 0.3165, "step": 103400 }, { "epoch": 0.007, "grad_norm": 0.30635154247283936, "learning_rate": 0.0001586004, "loss": 0.3271, "step": 103500 }, { "epoch": 0.0072, "grad_norm": 0.3161128759384155, "learning_rate": 0.0001585604, "loss": 0.3246, "step": 103600 }, { "epoch": 0.0074, "grad_norm": 0.2176142781972885, "learning_rate": 0.00015852040000000002, "loss": 0.3342, "step": 103700 }, { "epoch": 0.0076, "grad_norm": 0.15987540781497955, "learning_rate": 0.00015848040000000001, "loss": 0.3345, "step": 103800 }, { "epoch": 0.0078, "grad_norm": 0.2533618211746216, "learning_rate": 0.0001584404, "loss": 0.3176, "step": 103900 }, { "epoch": 0.008, "grad_norm": 0.28857430815696716, "learning_rate": 0.0001584004, "loss": 0.3287, "step": 104000 }, { "epoch": 0.0082, "grad_norm": 0.4161205291748047, "learning_rate": 0.0001583604, "loss": 0.3342, "step": 104100 }, { "epoch": 0.0084, "grad_norm": 0.851681649684906, "learning_rate": 0.00015832040000000002, "loss": 0.318, "step": 104200 }, { "epoch": 0.0086, "grad_norm": 0.21251384913921356, "learning_rate": 0.0001582804, "loss": 0.3125, "step": 104300 }, { "epoch": 0.0088, "grad_norm": 0.19260574877262115, "learning_rate": 0.0001582404, "loss": 0.3137, "step": 104400 }, { "epoch": 0.009, "grad_norm": 0.3808644413948059, "learning_rate": 0.0001582004, "loss": 0.33, "step": 104500 }, { "epoch": 0.0092, "grad_norm": 0.23479130864143372, "learning_rate": 0.00015816040000000002, "loss": 0.3368, "step": 104600 }, { "epoch": 0.0094, "grad_norm": 0.3280342221260071, "learning_rate": 0.0001581204, "loss": 0.3242, "step": 104700 }, { "epoch": 0.0096, "grad_norm": 0.3940763473510742, "learning_rate": 0.0001580804, "loss": 0.3362, "step": 104800 }, { "epoch": 0.0098, "grad_norm": 0.23537495732307434, "learning_rate": 0.0001580404, "loss": 0.3342, "step": 104900 }, { "epoch": 0.01, "grad_norm": 0.3302271366119385, "learning_rate": 0.0001580004, "loss": 0.3262, "step": 105000 }, { "epoch": 0.0102, "grad_norm": 0.3009653389453888, "learning_rate": 0.0001579604, "loss": 0.3353, "step": 105100 }, { "epoch": 0.0104, "grad_norm": 0.28532806038856506, "learning_rate": 0.0001579204, "loss": 0.3156, "step": 105200 }, { "epoch": 0.0106, "grad_norm": 0.24950698018074036, "learning_rate": 0.00015788040000000003, "loss": 0.3294, "step": 105300 }, { "epoch": 0.0108, "grad_norm": 0.26212894916534424, "learning_rate": 0.0001578404, "loss": 0.3329, "step": 105400 }, { "epoch": 0.011, "grad_norm": 0.22611194849014282, "learning_rate": 0.00015780040000000001, "loss": 0.3311, "step": 105500 }, { "epoch": 0.0112, "grad_norm": 0.2197837382555008, "learning_rate": 0.0001577604, "loss": 0.327, "step": 105600 }, { "epoch": 0.0114, "grad_norm": 0.31932878494262695, "learning_rate": 0.0001577204, "loss": 0.316, "step": 105700 }, { "epoch": 0.0116, "grad_norm": 0.2394053041934967, "learning_rate": 0.00015768040000000002, "loss": 0.331, "step": 105800 }, { "epoch": 0.0118, "grad_norm": 0.20572052896022797, "learning_rate": 0.0001576404, "loss": 0.3324, "step": 105900 }, { "epoch": 0.012, "grad_norm": 0.19541539251804352, "learning_rate": 0.0001576004, "loss": 0.3201, "step": 106000 }, { "epoch": 0.0122, "grad_norm": 0.2593480348587036, "learning_rate": 0.0001575604, "loss": 0.3545, "step": 106100 }, { "epoch": 0.0124, "grad_norm": 0.2632744014263153, "learning_rate": 0.00015752040000000002, "loss": 0.3217, "step": 106200 }, { "epoch": 0.0126, "grad_norm": 0.2666124701499939, "learning_rate": 0.00015748040000000002, "loss": 0.3302, "step": 106300 }, { "epoch": 0.0128, "grad_norm": 0.17219951748847961, "learning_rate": 0.0001574404, "loss": 0.3136, "step": 106400 }, { "epoch": 0.013, "grad_norm": 0.22360770404338837, "learning_rate": 0.0001574004, "loss": 0.3171, "step": 106500 }, { "epoch": 0.0132, "grad_norm": 0.24799229204654694, "learning_rate": 0.0001573604, "loss": 0.3081, "step": 106600 }, { "epoch": 0.0134, "grad_norm": 0.20889222621917725, "learning_rate": 0.00015732040000000002, "loss": 0.3466, "step": 106700 }, { "epoch": 0.0136, "grad_norm": 0.2627999186515808, "learning_rate": 0.0001572804, "loss": 0.3139, "step": 106800 }, { "epoch": 0.0138, "grad_norm": 0.3904314339160919, "learning_rate": 0.0001572404, "loss": 0.3438, "step": 106900 }, { "epoch": 0.014, "grad_norm": 0.2342156618833542, "learning_rate": 0.0001572004, "loss": 0.335, "step": 107000 }, { "epoch": 0.0142, "grad_norm": 0.17536994814872742, "learning_rate": 0.00015716040000000002, "loss": 0.3221, "step": 107100 }, { "epoch": 0.0144, "grad_norm": 0.3388653099536896, "learning_rate": 0.00015712040000000001, "loss": 0.3233, "step": 107200 }, { "epoch": 0.0146, "grad_norm": 0.1892201155424118, "learning_rate": 0.0001570804, "loss": 0.3233, "step": 107300 }, { "epoch": 0.0148, "grad_norm": 0.4015616178512573, "learning_rate": 0.0001570404, "loss": 0.3135, "step": 107400 }, { "epoch": 0.015, "grad_norm": 0.2883925139904022, "learning_rate": 0.0001570004, "loss": 0.3252, "step": 107500 }, { "epoch": 0.0152, "grad_norm": 0.27161914110183716, "learning_rate": 0.00015696040000000002, "loss": 0.3202, "step": 107600 }, { "epoch": 0.0154, "grad_norm": 0.1823039948940277, "learning_rate": 0.0001569204, "loss": 0.3132, "step": 107700 }, { "epoch": 0.0156, "grad_norm": 0.23866082727909088, "learning_rate": 0.0001568804, "loss": 0.3158, "step": 107800 }, { "epoch": 0.0158, "grad_norm": 0.3076813519001007, "learning_rate": 0.0001568404, "loss": 0.3097, "step": 107900 }, { "epoch": 0.016, "grad_norm": 0.3990881145000458, "learning_rate": 0.0001568004, "loss": 0.3117, "step": 108000 }, { "epoch": 0.0162, "grad_norm": 0.2077111005783081, "learning_rate": 0.0001567604, "loss": 0.3304, "step": 108100 }, { "epoch": 0.0164, "grad_norm": 0.20977669954299927, "learning_rate": 0.0001567204, "loss": 0.3177, "step": 108200 }, { "epoch": 0.0166, "grad_norm": 0.33695122599601746, "learning_rate": 0.00015668040000000003, "loss": 0.3127, "step": 108300 }, { "epoch": 0.0168, "grad_norm": 0.3933875262737274, "learning_rate": 0.0001566404, "loss": 0.3187, "step": 108400 }, { "epoch": 0.017, "grad_norm": 0.24768926203250885, "learning_rate": 0.0001566004, "loss": 0.3279, "step": 108500 }, { "epoch": 0.0172, "grad_norm": 0.23914366960525513, "learning_rate": 0.0001565604, "loss": 0.3389, "step": 108600 }, { "epoch": 0.0174, "grad_norm": 0.2776049077510834, "learning_rate": 0.0001565204, "loss": 0.3202, "step": 108700 }, { "epoch": 0.0176, "grad_norm": 0.2476639598608017, "learning_rate": 0.00015648040000000002, "loss": 0.3203, "step": 108800 }, { "epoch": 0.0178, "grad_norm": 0.25279584527015686, "learning_rate": 0.0001564404, "loss": 0.3031, "step": 108900 }, { "epoch": 0.018, "grad_norm": 0.19141827523708344, "learning_rate": 0.0001564004, "loss": 0.3146, "step": 109000 }, { "epoch": 0.0182, "grad_norm": 0.1972874402999878, "learning_rate": 0.0001563604, "loss": 0.3188, "step": 109100 }, { "epoch": 0.0184, "grad_norm": 0.21893855929374695, "learning_rate": 0.00015632040000000002, "loss": 0.3208, "step": 109200 }, { "epoch": 0.0186, "grad_norm": 0.2527053654193878, "learning_rate": 0.00015628040000000002, "loss": 0.3135, "step": 109300 }, { "epoch": 0.0188, "grad_norm": 0.21851500868797302, "learning_rate": 0.0001562404, "loss": 0.2979, "step": 109400 }, { "epoch": 0.019, "grad_norm": 0.41574743390083313, "learning_rate": 0.0001562004, "loss": 0.3395, "step": 109500 }, { "epoch": 0.0192, "grad_norm": 0.23641952872276306, "learning_rate": 0.0001561604, "loss": 0.3052, "step": 109600 }, { "epoch": 0.0194, "grad_norm": 0.2123720645904541, "learning_rate": 0.00015612040000000002, "loss": 0.3294, "step": 109700 }, { "epoch": 0.0196, "grad_norm": 0.2654692530632019, "learning_rate": 0.0001560804, "loss": 0.3133, "step": 109800 }, { "epoch": 0.0198, "grad_norm": 0.7307865023612976, "learning_rate": 0.0001560404, "loss": 0.3231, "step": 109900 }, { "epoch": 0.02, "grad_norm": 0.23842667043209076, "learning_rate": 0.0001560004, "loss": 0.3278, "step": 110000 }, { "epoch": 0.0202, "grad_norm": 0.16572842001914978, "learning_rate": 0.00015596040000000002, "loss": 0.3166, "step": 110100 }, { "epoch": 0.0204, "grad_norm": 1.4259867668151855, "learning_rate": 0.0001559204, "loss": 0.3491, "step": 110200 }, { "epoch": 0.0206, "grad_norm": 0.2205478399991989, "learning_rate": 0.0001558804, "loss": 0.3302, "step": 110300 }, { "epoch": 0.0208, "grad_norm": 0.20075049996376038, "learning_rate": 0.0001558404, "loss": 0.3316, "step": 110400 }, { "epoch": 0.021, "grad_norm": 0.2577056884765625, "learning_rate": 0.0001558004, "loss": 0.2999, "step": 110500 }, { "epoch": 0.0212, "grad_norm": 0.2297847867012024, "learning_rate": 0.00015576040000000001, "loss": 0.3269, "step": 110600 }, { "epoch": 0.0214, "grad_norm": 0.21519100666046143, "learning_rate": 0.0001557204, "loss": 0.3017, "step": 110700 }, { "epoch": 0.0216, "grad_norm": 0.3626148998737335, "learning_rate": 0.00015568040000000003, "loss": 0.3238, "step": 110800 }, { "epoch": 0.0218, "grad_norm": 0.20477981865406036, "learning_rate": 0.0001556404, "loss": 0.3233, "step": 110900 }, { "epoch": 0.022, "grad_norm": 0.593943178653717, "learning_rate": 0.0001556004, "loss": 0.3187, "step": 111000 }, { "epoch": 0.0222, "grad_norm": 0.22717879712581635, "learning_rate": 0.0001555604, "loss": 0.329, "step": 111100 }, { "epoch": 0.0224, "grad_norm": 0.24597914516925812, "learning_rate": 0.0001555204, "loss": 0.3204, "step": 111200 }, { "epoch": 0.0226, "grad_norm": 0.6169671416282654, "learning_rate": 0.00015548040000000002, "loss": 0.3318, "step": 111300 }, { "epoch": 0.0228, "grad_norm": 0.512597918510437, "learning_rate": 0.0001554404, "loss": 0.3495, "step": 111400 }, { "epoch": 0.023, "grad_norm": 0.1982726901769638, "learning_rate": 0.0001554004, "loss": 0.3256, "step": 111500 }, { "epoch": 0.0232, "grad_norm": 0.24446536600589752, "learning_rate": 0.0001553604, "loss": 0.3182, "step": 111600 }, { "epoch": 0.0234, "grad_norm": 0.1990106999874115, "learning_rate": 0.0001553204, "loss": 0.3103, "step": 111700 }, { "epoch": 0.0236, "grad_norm": 0.32062387466430664, "learning_rate": 0.00015528040000000002, "loss": 0.3336, "step": 111800 }, { "epoch": 0.0238, "grad_norm": 0.24834854900836945, "learning_rate": 0.00015524039999999999, "loss": 0.3718, "step": 111900 }, { "epoch": 0.024, "grad_norm": 0.431271493434906, "learning_rate": 0.0001552004, "loss": 0.3148, "step": 112000 }, { "epoch": 0.0242, "grad_norm": 0.24593783915042877, "learning_rate": 0.0001551604, "loss": 0.3206, "step": 112100 }, { "epoch": 0.0244, "grad_norm": 0.22774197161197662, "learning_rate": 0.00015512040000000002, "loss": 0.323, "step": 112200 }, { "epoch": 0.0246, "grad_norm": 0.28193041682243347, "learning_rate": 0.00015508040000000001, "loss": 0.321, "step": 112300 }, { "epoch": 0.0248, "grad_norm": 0.27978020906448364, "learning_rate": 0.0001550404, "loss": 0.3293, "step": 112400 }, { "epoch": 0.025, "grad_norm": 0.33567145466804504, "learning_rate": 0.0001550004, "loss": 0.3398, "step": 112500 }, { "epoch": 0.0252, "grad_norm": 0.31685078144073486, "learning_rate": 0.0001549604, "loss": 0.3468, "step": 112600 }, { "epoch": 0.0254, "grad_norm": 0.23281008005142212, "learning_rate": 0.00015492040000000002, "loss": 0.3415, "step": 112700 }, { "epoch": 0.0256, "grad_norm": 0.29617008566856384, "learning_rate": 0.0001548804, "loss": 0.3215, "step": 112800 }, { "epoch": 0.0258, "grad_norm": 0.28596845269203186, "learning_rate": 0.0001548404, "loss": 0.3273, "step": 112900 }, { "epoch": 0.026, "grad_norm": 0.2520228922367096, "learning_rate": 0.0001548004, "loss": 0.3656, "step": 113000 }, { "epoch": 0.0262, "grad_norm": 0.1868925839662552, "learning_rate": 0.00015476040000000002, "loss": 0.3305, "step": 113100 }, { "epoch": 0.0264, "grad_norm": 0.2512012720108032, "learning_rate": 0.0001547204, "loss": 0.3331, "step": 113200 }, { "epoch": 0.0266, "grad_norm": 0.24037250876426697, "learning_rate": 0.0001546804, "loss": 0.3265, "step": 113300 }, { "epoch": 0.0268, "grad_norm": 0.44156473875045776, "learning_rate": 0.0001546404, "loss": 0.325, "step": 113400 }, { "epoch": 0.027, "grad_norm": 0.212955042719841, "learning_rate": 0.0001546004, "loss": 0.3103, "step": 113500 }, { "epoch": 0.0272, "grad_norm": 0.3389400839805603, "learning_rate": 0.0001545604, "loss": 0.2959, "step": 113600 }, { "epoch": 0.0274, "grad_norm": 0.21144139766693115, "learning_rate": 0.0001545204, "loss": 0.298, "step": 113700 }, { "epoch": 0.0276, "grad_norm": 0.2982296049594879, "learning_rate": 0.00015448040000000003, "loss": 0.2967, "step": 113800 }, { "epoch": 0.0278, "grad_norm": 0.23678374290466309, "learning_rate": 0.00015444040000000002, "loss": 0.2932, "step": 113900 }, { "epoch": 0.028, "grad_norm": 0.18785041570663452, "learning_rate": 0.0001544004, "loss": 0.3059, "step": 114000 }, { "epoch": 0.0282, "grad_norm": 0.27205973863601685, "learning_rate": 0.0001543604, "loss": 0.2817, "step": 114100 }, { "epoch": 0.0284, "grad_norm": 0.18749140202999115, "learning_rate": 0.0001543204, "loss": 0.2911, "step": 114200 }, { "epoch": 0.0286, "grad_norm": 0.21401409804821014, "learning_rate": 0.00015428040000000002, "loss": 0.3041, "step": 114300 }, { "epoch": 0.0288, "grad_norm": 0.20857536792755127, "learning_rate": 0.00015424040000000001, "loss": 0.2904, "step": 114400 }, { "epoch": 0.029, "grad_norm": 0.3695431053638458, "learning_rate": 0.0001542004, "loss": 0.3039, "step": 114500 }, { "epoch": 0.0292, "grad_norm": 0.42280298471450806, "learning_rate": 0.0001541604, "loss": 0.2926, "step": 114600 }, { "epoch": 0.0294, "grad_norm": 0.3620811998844147, "learning_rate": 0.0001541204, "loss": 0.29, "step": 114700 }, { "epoch": 0.0296, "grad_norm": 0.23109310865402222, "learning_rate": 0.00015408040000000002, "loss": 0.3008, "step": 114800 }, { "epoch": 0.0298, "grad_norm": 0.17469018697738647, "learning_rate": 0.0001540404, "loss": 0.2955, "step": 114900 }, { "epoch": 0.03, "grad_norm": 0.25855448842048645, "learning_rate": 0.0001540004, "loss": 0.3033, "step": 115000 }, { "epoch": 0.0302, "grad_norm": 0.2488468438386917, "learning_rate": 0.0001539604, "loss": 0.3022, "step": 115100 }, { "epoch": 0.0304, "grad_norm": 0.19968360662460327, "learning_rate": 0.00015392040000000002, "loss": 0.2934, "step": 115200 }, { "epoch": 0.0306, "grad_norm": 0.22593122720718384, "learning_rate": 0.0001538804, "loss": 0.292, "step": 115300 }, { "epoch": 0.0308, "grad_norm": 0.27579984068870544, "learning_rate": 0.0001538404, "loss": 0.295, "step": 115400 }, { "epoch": 0.031, "grad_norm": 0.25232911109924316, "learning_rate": 0.0001538004, "loss": 0.3041, "step": 115500 }, { "epoch": 0.0312, "grad_norm": 0.295006662607193, "learning_rate": 0.0001537604, "loss": 0.3088, "step": 115600 }, { "epoch": 0.0314, "grad_norm": 0.520390510559082, "learning_rate": 0.0001537204, "loss": 0.2972, "step": 115700 }, { "epoch": 0.0316, "grad_norm": 0.24330058693885803, "learning_rate": 0.0001536804, "loss": 0.2937, "step": 115800 }, { "epoch": 0.0318, "grad_norm": 0.4119510054588318, "learning_rate": 0.00015364040000000003, "loss": 0.298, "step": 115900 }, { "epoch": 0.032, "grad_norm": 0.309059202671051, "learning_rate": 0.0001536004, "loss": 0.3025, "step": 116000 }, { "epoch": 0.0322, "grad_norm": 0.30650094151496887, "learning_rate": 0.00015356040000000001, "loss": 0.3046, "step": 116100 }, { "epoch": 0.0324, "grad_norm": 0.24224722385406494, "learning_rate": 0.0001535204, "loss": 0.299, "step": 116200 }, { "epoch": 0.0326, "grad_norm": 0.32729941606521606, "learning_rate": 0.0001534804, "loss": 0.3008, "step": 116300 }, { "epoch": 0.0328, "grad_norm": 0.1765100657939911, "learning_rate": 0.00015344040000000002, "loss": 0.3152, "step": 116400 }, { "epoch": 0.033, "grad_norm": 0.25575095415115356, "learning_rate": 0.0001534004, "loss": 0.306, "step": 116500 }, { "epoch": 0.0332, "grad_norm": 0.6274484992027283, "learning_rate": 0.0001533604, "loss": 0.3092, "step": 116600 }, { "epoch": 0.0334, "grad_norm": 0.21123872697353363, "learning_rate": 0.0001533204, "loss": 0.3978, "step": 116700 }, { "epoch": 0.0336, "grad_norm": 0.29489830136299133, "learning_rate": 0.00015328040000000002, "loss": 0.3038, "step": 116800 }, { "epoch": 0.0338, "grad_norm": 0.2510158121585846, "learning_rate": 0.00015324040000000002, "loss": 0.3114, "step": 116900 }, { "epoch": 0.034, "grad_norm": 0.23414947092533112, "learning_rate": 0.0001532004, "loss": 0.3052, "step": 117000 }, { "epoch": 0.0342, "grad_norm": 0.19055521488189697, "learning_rate": 0.0001531604, "loss": 0.3013, "step": 117100 }, { "epoch": 0.0344, "grad_norm": 0.8105829954147339, "learning_rate": 0.0001531204, "loss": 0.2967, "step": 117200 }, { "epoch": 0.0346, "grad_norm": 0.4188934564590454, "learning_rate": 0.00015308040000000002, "loss": 0.2951, "step": 117300 }, { "epoch": 0.0348, "grad_norm": 0.21309562027454376, "learning_rate": 0.0001530404, "loss": 0.2981, "step": 117400 }, { "epoch": 0.035, "grad_norm": 0.18121813237667084, "learning_rate": 0.0001530004, "loss": 0.2931, "step": 117500 }, { "epoch": 0.0352, "grad_norm": 0.632888674736023, "learning_rate": 0.0001529604, "loss": 0.2981, "step": 117600 }, { "epoch": 0.0354, "grad_norm": 0.3604109287261963, "learning_rate": 0.00015292040000000002, "loss": 0.2895, "step": 117700 }, { "epoch": 0.0356, "grad_norm": 0.2515547573566437, "learning_rate": 0.00015288040000000001, "loss": 0.3134, "step": 117800 }, { "epoch": 0.0358, "grad_norm": 0.26086676120758057, "learning_rate": 0.0001528404, "loss": 0.3068, "step": 117900 }, { "epoch": 0.036, "grad_norm": 0.20093345642089844, "learning_rate": 0.0001528004, "loss": 0.2985, "step": 118000 }, { "epoch": 0.0362, "grad_norm": 0.43283843994140625, "learning_rate": 0.0001527604, "loss": 0.304, "step": 118100 }, { "epoch": 0.0364, "grad_norm": 0.2671961784362793, "learning_rate": 0.00015272040000000002, "loss": 0.2987, "step": 118200 }, { "epoch": 0.0366, "grad_norm": 0.2851164638996124, "learning_rate": 0.0001526804, "loss": 0.3012, "step": 118300 }, { "epoch": 0.0368, "grad_norm": 0.27135607600212097, "learning_rate": 0.00015264040000000003, "loss": 0.3035, "step": 118400 }, { "epoch": 0.037, "grad_norm": 0.2493288666009903, "learning_rate": 0.0001526004, "loss": 0.3087, "step": 118500 }, { "epoch": 0.0372, "grad_norm": 0.2523597478866577, "learning_rate": 0.0001525604, "loss": 0.2925, "step": 118600 }, { "epoch": 0.0374, "grad_norm": 2.79110050201416, "learning_rate": 0.0001525204, "loss": 0.3105, "step": 118700 }, { "epoch": 0.0376, "grad_norm": 0.20949222147464752, "learning_rate": 0.0001524804, "loss": 0.3069, "step": 118800 }, { "epoch": 0.0378, "grad_norm": 0.2129492163658142, "learning_rate": 0.00015244040000000003, "loss": 0.2957, "step": 118900 }, { "epoch": 0.038, "grad_norm": 0.2193364053964615, "learning_rate": 0.0001524004, "loss": 0.2939, "step": 119000 }, { "epoch": 0.0382, "grad_norm": 0.21624450385570526, "learning_rate": 0.0001523604, "loss": 0.3033, "step": 119100 }, { "epoch": 0.0384, "grad_norm": 0.16359418630599976, "learning_rate": 0.0001523204, "loss": 0.3051, "step": 119200 }, { "epoch": 0.0386, "grad_norm": 0.38990601897239685, "learning_rate": 0.0001522804, "loss": 0.2948, "step": 119300 }, { "epoch": 0.0388, "grad_norm": 0.25620323419570923, "learning_rate": 0.00015224040000000002, "loss": 0.3069, "step": 119400 }, { "epoch": 0.039, "grad_norm": 0.3383728563785553, "learning_rate": 0.0001522004, "loss": 0.3168, "step": 119500 }, { "epoch": 0.0392, "grad_norm": 0.296837717294693, "learning_rate": 0.0001521604, "loss": 0.3068, "step": 119600 }, { "epoch": 0.0394, "grad_norm": 0.17246387898921967, "learning_rate": 0.0001521204, "loss": 0.2959, "step": 119700 }, { "epoch": 0.0396, "grad_norm": 0.27589595317840576, "learning_rate": 0.00015208040000000002, "loss": 0.2961, "step": 119800 }, { "epoch": 0.0398, "grad_norm": 0.2941451668739319, "learning_rate": 0.00015204040000000002, "loss": 0.294, "step": 119900 }, { "epoch": 0.04, "grad_norm": 0.18005476891994476, "learning_rate": 0.0001520004, "loss": 0.3013, "step": 120000 }, { "epoch": 0.0002, "grad_norm": 0.21327625215053558, "learning_rate": 0.0001519604, "loss": 0.2806, "step": 120100 }, { "epoch": 0.0004, "grad_norm": 0.4669700562953949, "learning_rate": 0.0001519204, "loss": 0.2849, "step": 120200 }, { "epoch": 0.0006, "grad_norm": 0.3588303029537201, "learning_rate": 0.00015188040000000002, "loss": 0.2861, "step": 120300 }, { "epoch": 0.0008, "grad_norm": 0.28102320432662964, "learning_rate": 0.0001518404, "loss": 0.2878, "step": 120400 }, { "epoch": 0.001, "grad_norm": 0.1880110502243042, "learning_rate": 0.0001518004, "loss": 0.2811, "step": 120500 }, { "epoch": 0.0012, "grad_norm": 0.35984283685684204, "learning_rate": 0.0001517604, "loss": 0.277, "step": 120600 }, { "epoch": 0.0014, "grad_norm": 0.46658045053482056, "learning_rate": 0.00015172040000000002, "loss": 0.2813, "step": 120700 }, { "epoch": 0.0016, "grad_norm": 0.3781064748764038, "learning_rate": 0.0001516804, "loss": 0.2833, "step": 120800 }, { "epoch": 0.0018, "grad_norm": 0.30914661288261414, "learning_rate": 0.0001516404, "loss": 0.2806, "step": 120900 }, { "epoch": 0.002, "grad_norm": 0.32875189185142517, "learning_rate": 0.0001516004, "loss": 0.2778, "step": 121000 }, { "epoch": 0.0022, "grad_norm": 0.21213066577911377, "learning_rate": 0.0001515604, "loss": 0.2951, "step": 121100 }, { "epoch": 0.0024, "grad_norm": 0.31731417775154114, "learning_rate": 0.00015152040000000001, "loss": 0.2819, "step": 121200 }, { "epoch": 0.0026, "grad_norm": 0.5732041597366333, "learning_rate": 0.0001514804, "loss": 0.2908, "step": 121300 }, { "epoch": 0.0028, "grad_norm": 0.1619558483362198, "learning_rate": 0.00015144040000000003, "loss": 0.2796, "step": 121400 }, { "epoch": 0.003, "grad_norm": 0.4144911766052246, "learning_rate": 0.0001514004, "loss": 0.2892, "step": 121500 }, { "epoch": 0.0032, "grad_norm": 0.2565133273601532, "learning_rate": 0.00015136040000000002, "loss": 0.3034, "step": 121600 }, { "epoch": 0.0034, "grad_norm": 0.5545548796653748, "learning_rate": 0.0001513204, "loss": 0.2899, "step": 121700 }, { "epoch": 0.0036, "grad_norm": 0.21982091665267944, "learning_rate": 0.0001512804, "loss": 0.2901, "step": 121800 }, { "epoch": 0.0038, "grad_norm": 0.20543520152568817, "learning_rate": 0.00015124040000000002, "loss": 0.2925, "step": 121900 }, { "epoch": 0.004, "grad_norm": 0.27426043152809143, "learning_rate": 0.0001512004, "loss": 0.2911, "step": 122000 }, { "epoch": 0.0042, "grad_norm": 0.20343656837940216, "learning_rate": 0.0001511604, "loss": 0.2918, "step": 122100 }, { "epoch": 0.0044, "grad_norm": 0.21032807230949402, "learning_rate": 0.0001511204, "loss": 0.3072, "step": 122200 }, { "epoch": 0.0046, "grad_norm": 0.3014376163482666, "learning_rate": 0.0001510804, "loss": 0.2942, "step": 122300 }, { "epoch": 0.0048, "grad_norm": 0.2240341305732727, "learning_rate": 0.00015104040000000002, "loss": 0.2799, "step": 122400 }, { "epoch": 0.005, "grad_norm": 0.2607118785381317, "learning_rate": 0.00015100039999999998, "loss": 0.2782, "step": 122500 }, { "epoch": 0.0052, "grad_norm": 0.2203282117843628, "learning_rate": 0.0001509604, "loss": 0.28, "step": 122600 }, { "epoch": 0.0054, "grad_norm": 0.275749534368515, "learning_rate": 0.0001509204, "loss": 0.2859, "step": 122700 }, { "epoch": 0.0056, "grad_norm": 0.2172224372625351, "learning_rate": 0.00015088040000000002, "loss": 0.2916, "step": 122800 }, { "epoch": 0.0058, "grad_norm": 0.24563243985176086, "learning_rate": 0.00015084040000000001, "loss": 0.2784, "step": 122900 }, { "epoch": 0.006, "grad_norm": 0.25981712341308594, "learning_rate": 0.0001508004, "loss": 0.3223, "step": 123000 }, { "epoch": 0.0062, "grad_norm": 0.22978562116622925, "learning_rate": 0.0001507604, "loss": 0.2904, "step": 123100 }, { "epoch": 0.0064, "grad_norm": 0.18407663702964783, "learning_rate": 0.0001507204, "loss": 0.2847, "step": 123200 }, { "epoch": 0.0066, "grad_norm": 0.697337806224823, "learning_rate": 0.00015068040000000002, "loss": 0.2996, "step": 123300 }, { "epoch": 0.0068, "grad_norm": 0.17664720118045807, "learning_rate": 0.0001506404, "loss": 0.2878, "step": 123400 }, { "epoch": 0.007, "grad_norm": 0.24168753623962402, "learning_rate": 0.0001506004, "loss": 0.2988, "step": 123500 }, { "epoch": 0.0072, "grad_norm": 0.20818035304546356, "learning_rate": 0.0001505604, "loss": 0.2945, "step": 123600 }, { "epoch": 0.0074, "grad_norm": 0.22152996063232422, "learning_rate": 0.00015052040000000002, "loss": 0.3007, "step": 123700 }, { "epoch": 0.0076, "grad_norm": 0.15268588066101074, "learning_rate": 0.0001504804, "loss": 0.3035, "step": 123800 }, { "epoch": 0.0078, "grad_norm": 0.19523349404335022, "learning_rate": 0.0001504404, "loss": 0.2886, "step": 123900 }, { "epoch": 0.008, "grad_norm": 0.25065121054649353, "learning_rate": 0.0001504004, "loss": 0.2931, "step": 124000 }, { "epoch": 0.0082, "grad_norm": 0.2577344477176666, "learning_rate": 0.0001503604, "loss": 0.3029, "step": 124100 }, { "epoch": 0.0084, "grad_norm": 0.49803975224494934, "learning_rate": 0.0001503204, "loss": 0.2879, "step": 124200 }, { "epoch": 0.0086, "grad_norm": 0.2067602127790451, "learning_rate": 0.0001502804, "loss": 0.2863, "step": 124300 }, { "epoch": 0.0088, "grad_norm": 0.14126543700695038, "learning_rate": 0.00015024040000000003, "loss": 0.2845, "step": 124400 }, { "epoch": 0.009, "grad_norm": 0.24404196441173553, "learning_rate": 0.0001502004, "loss": 0.2985, "step": 124500 }, { "epoch": 0.0092, "grad_norm": 0.2484026700258255, "learning_rate": 0.0001501604, "loss": 0.3004, "step": 124600 }, { "epoch": 0.0094, "grad_norm": 0.3001592457294464, "learning_rate": 0.0001501204, "loss": 0.2953, "step": 124700 }, { "epoch": 0.0096, "grad_norm": 0.2282063364982605, "learning_rate": 0.0001500804, "loss": 0.3028, "step": 124800 }, { "epoch": 0.0098, "grad_norm": 0.20628437399864197, "learning_rate": 0.00015004040000000002, "loss": 0.2982, "step": 124900 }, { "epoch": 0.01, "grad_norm": 0.25722581148147583, "learning_rate": 0.0001500004, "loss": 0.2941, "step": 125000 }, { "epoch": 0.0102, "grad_norm": 0.23121199011802673, "learning_rate": 0.0001499604, "loss": 0.3042, "step": 125100 }, { "epoch": 0.0104, "grad_norm": 0.26357603073120117, "learning_rate": 0.0001499204, "loss": 0.2879, "step": 125200 }, { "epoch": 0.0106, "grad_norm": 0.24245673418045044, "learning_rate": 0.00014988040000000002, "loss": 0.2984, "step": 125300 }, { "epoch": 0.0108, "grad_norm": 0.23373626172542572, "learning_rate": 0.00014984040000000002, "loss": 0.3001, "step": 125400 }, { "epoch": 0.011, "grad_norm": 0.22621707618236542, "learning_rate": 0.00014980039999999998, "loss": 0.2982, "step": 125500 }, { "epoch": 0.0112, "grad_norm": 0.1624283492565155, "learning_rate": 0.0001497604, "loss": 0.2955, "step": 125600 }, { "epoch": 0.0114, "grad_norm": 0.2670327126979828, "learning_rate": 0.0001497204, "loss": 0.2883, "step": 125700 }, { "epoch": 0.0116, "grad_norm": 0.23716334998607635, "learning_rate": 0.00014968040000000002, "loss": 0.3013, "step": 125800 }, { "epoch": 0.0118, "grad_norm": 0.19362571835517883, "learning_rate": 0.0001496404, "loss": 0.3015, "step": 125900 }, { "epoch": 0.012, "grad_norm": 0.16306428611278534, "learning_rate": 0.0001496004, "loss": 0.2924, "step": 126000 }, { "epoch": 0.0122, "grad_norm": 0.25153741240501404, "learning_rate": 0.0001495604, "loss": 0.3193, "step": 126100 }, { "epoch": 0.0124, "grad_norm": 0.2298291176557541, "learning_rate": 0.0001495204, "loss": 0.2925, "step": 126200 }, { "epoch": 0.0126, "grad_norm": 0.2596379220485687, "learning_rate": 0.0001494804, "loss": 0.2992, "step": 126300 }, { "epoch": 0.0128, "grad_norm": 0.18484342098236084, "learning_rate": 0.0001494404, "loss": 0.2869, "step": 126400 }, { "epoch": 0.013, "grad_norm": 0.21150384843349457, "learning_rate": 0.00014940040000000003, "loss": 0.2917, "step": 126500 }, { "epoch": 0.0132, "grad_norm": 0.210966095328331, "learning_rate": 0.0001493604, "loss": 0.2858, "step": 126600 }, { "epoch": 0.0134, "grad_norm": 0.19671154022216797, "learning_rate": 0.00014932040000000001, "loss": 0.3121, "step": 126700 }, { "epoch": 0.0136, "grad_norm": 0.24427378177642822, "learning_rate": 0.0001492804, "loss": 0.2878, "step": 126800 }, { "epoch": 0.0138, "grad_norm": 0.278323233127594, "learning_rate": 0.0001492404, "loss": 0.3073, "step": 126900 }, { "epoch": 0.014, "grad_norm": 0.2476424276828766, "learning_rate": 0.00014920040000000002, "loss": 0.3057, "step": 127000 }, { "epoch": 0.0142, "grad_norm": 0.18065540492534637, "learning_rate": 0.0001491604, "loss": 0.2981, "step": 127100 }, { "epoch": 0.0144, "grad_norm": 0.28167441487312317, "learning_rate": 0.0001491204, "loss": 0.2962, "step": 127200 }, { "epoch": 0.0146, "grad_norm": 0.19970493018627167, "learning_rate": 0.0001490804, "loss": 0.2969, "step": 127300 }, { "epoch": 0.0148, "grad_norm": 0.40125706791877747, "learning_rate": 0.00014904040000000002, "loss": 0.2901, "step": 127400 }, { "epoch": 0.015, "grad_norm": 0.24552318453788757, "learning_rate": 0.00014900040000000002, "loss": 0.3009, "step": 127500 }, { "epoch": 0.0152, "grad_norm": 0.22551432251930237, "learning_rate": 0.0001489604, "loss": 0.2938, "step": 127600 }, { "epoch": 0.0154, "grad_norm": 0.21164937317371368, "learning_rate": 0.0001489204, "loss": 0.2914, "step": 127700 }, { "epoch": 0.0156, "grad_norm": 0.22755509614944458, "learning_rate": 0.0001488804, "loss": 0.2918, "step": 127800 }, { "epoch": 0.0158, "grad_norm": 0.28624242544174194, "learning_rate": 0.00014884040000000002, "loss": 0.2889, "step": 127900 }, { "epoch": 0.016, "grad_norm": 0.42901697754859924, "learning_rate": 0.0001488004, "loss": 0.2893, "step": 128000 }, { "epoch": 0.0162, "grad_norm": 0.2009563446044922, "learning_rate": 0.0001487604, "loss": 0.3049, "step": 128100 }, { "epoch": 0.0164, "grad_norm": 0.2092672884464264, "learning_rate": 0.0001487204, "loss": 0.2929, "step": 128200 }, { "epoch": 0.0166, "grad_norm": 0.2817251682281494, "learning_rate": 0.00014868040000000002, "loss": 0.2916, "step": 128300 }, { "epoch": 0.0168, "grad_norm": 0.3064669668674469, "learning_rate": 0.00014864040000000001, "loss": 0.2961, "step": 128400 }, { "epoch": 0.017, "grad_norm": 0.23615750670433044, "learning_rate": 0.0001486004, "loss": 0.3013, "step": 128500 }, { "epoch": 0.0172, "grad_norm": 0.2097761034965515, "learning_rate": 0.0001485604, "loss": 0.3128, "step": 128600 }, { "epoch": 0.0174, "grad_norm": 0.20526331663131714, "learning_rate": 0.0001485204, "loss": 0.2986, "step": 128700 }, { "epoch": 0.0176, "grad_norm": 0.24395328760147095, "learning_rate": 0.00014848040000000002, "loss": 0.2977, "step": 128800 }, { "epoch": 0.0178, "grad_norm": 0.31493526697158813, "learning_rate": 0.0001484404, "loss": 0.2832, "step": 128900 }, { "epoch": 0.018, "grad_norm": 0.19086699187755585, "learning_rate": 0.00014840040000000003, "loss": 0.2905, "step": 129000 }, { "epoch": 0.0182, "grad_norm": 0.16701176762580872, "learning_rate": 0.0001483604, "loss": 0.2968, "step": 129100 }, { "epoch": 0.0184, "grad_norm": 0.18984141945838928, "learning_rate": 0.0001483204, "loss": 0.2944, "step": 129200 }, { "epoch": 0.0186, "grad_norm": 0.21061408519744873, "learning_rate": 0.0001482804, "loss": 0.2907, "step": 129300 }, { "epoch": 0.0188, "grad_norm": 0.18577519059181213, "learning_rate": 0.0001482404, "loss": 0.2783, "step": 129400 }, { "epoch": 0.019, "grad_norm": 0.36443671584129333, "learning_rate": 0.00014820040000000003, "loss": 0.3125, "step": 129500 }, { "epoch": 0.0192, "grad_norm": 0.25582483410835266, "learning_rate": 0.0001481604, "loss": 0.2863, "step": 129600 }, { "epoch": 0.0194, "grad_norm": 0.16937601566314697, "learning_rate": 0.0001481204, "loss": 0.3062, "step": 129700 }, { "epoch": 0.0196, "grad_norm": 0.2256769835948944, "learning_rate": 0.0001480804, "loss": 0.2894, "step": 129800 }, { "epoch": 0.0198, "grad_norm": 0.5192826390266418, "learning_rate": 0.0001480404, "loss": 0.2953, "step": 129900 }, { "epoch": 0.02, "grad_norm": 0.21238556504249573, "learning_rate": 0.00014800040000000002, "loss": 0.3024, "step": 130000 }, { "epoch": 0.0202, "grad_norm": 0.15953408181667328, "learning_rate": 0.0001479604, "loss": 0.2966, "step": 130100 }, { "epoch": 0.0204, "grad_norm": 0.6439893841743469, "learning_rate": 0.0001479204, "loss": 0.3231, "step": 130200 }, { "epoch": 0.0206, "grad_norm": 0.2107216715812683, "learning_rate": 0.0001478804, "loss": 0.3037, "step": 130300 }, { "epoch": 0.0208, "grad_norm": 0.2086646556854248, "learning_rate": 0.00014784040000000002, "loss": 0.3024, "step": 130400 }, { "epoch": 0.021, "grad_norm": 0.23454782366752625, "learning_rate": 0.00014780040000000002, "loss": 0.2821, "step": 130500 }, { "epoch": 0.0212, "grad_norm": 0.195747971534729, "learning_rate": 0.0001477604, "loss": 0.2976, "step": 130600 }, { "epoch": 0.0214, "grad_norm": 0.22037339210510254, "learning_rate": 0.0001477204, "loss": 0.2796, "step": 130700 }, { "epoch": 0.0216, "grad_norm": 0.28433117270469666, "learning_rate": 0.0001476804, "loss": 0.3009, "step": 130800 }, { "epoch": 0.0218, "grad_norm": 0.5101615786552429, "learning_rate": 0.00014764040000000002, "loss": 0.3018, "step": 130900 }, { "epoch": 0.022, "grad_norm": 0.4728710353374481, "learning_rate": 0.0001476004, "loss": 0.2966, "step": 131000 }, { "epoch": 0.0222, "grad_norm": 0.2941736876964569, "learning_rate": 0.0001475604, "loss": 0.3063, "step": 131100 }, { "epoch": 0.0224, "grad_norm": 0.26847612857818604, "learning_rate": 0.0001475204, "loss": 0.3034, "step": 131200 }, { "epoch": 0.0226, "grad_norm": 0.30397793650627136, "learning_rate": 0.00014748040000000002, "loss": 0.3112, "step": 131300 }, { "epoch": 0.0228, "grad_norm": 0.48798418045043945, "learning_rate": 0.0001474404, "loss": 0.3263, "step": 131400 }, { "epoch": 0.023, "grad_norm": 0.20954224467277527, "learning_rate": 0.0001474004, "loss": 0.3032, "step": 131500 }, { "epoch": 0.0232, "grad_norm": 0.26696014404296875, "learning_rate": 0.0001473604, "loss": 0.2978, "step": 131600 }, { "epoch": 0.0234, "grad_norm": 0.1869426965713501, "learning_rate": 0.0001473204, "loss": 0.2897, "step": 131700 }, { "epoch": 0.0236, "grad_norm": 0.2813512980937958, "learning_rate": 0.00014728040000000001, "loss": 0.3065, "step": 131800 }, { "epoch": 0.0238, "grad_norm": 0.2745366096496582, "learning_rate": 0.0001472404, "loss": 0.3405, "step": 131900 }, { "epoch": 0.024, "grad_norm": 0.3381493091583252, "learning_rate": 0.00014720040000000003, "loss": 0.2944, "step": 132000 }, { "epoch": 0.0242, "grad_norm": 0.20233681797981262, "learning_rate": 0.0001471604, "loss": 0.3009, "step": 132100 }, { "epoch": 0.0244, "grad_norm": 0.24026724696159363, "learning_rate": 0.00014712040000000002, "loss": 0.2996, "step": 132200 }, { "epoch": 0.0246, "grad_norm": 0.24572235345840454, "learning_rate": 0.0001470804, "loss": 0.2985, "step": 132300 }, { "epoch": 0.0248, "grad_norm": 0.26701977849006653, "learning_rate": 0.0001470404, "loss": 0.3037, "step": 132400 }, { "epoch": 0.025, "grad_norm": 0.24025647342205048, "learning_rate": 0.00014700040000000002, "loss": 0.3154, "step": 132500 }, { "epoch": 0.0252, "grad_norm": 0.3687751889228821, "learning_rate": 0.0001469604, "loss": 0.3188, "step": 132600 }, { "epoch": 0.0254, "grad_norm": 0.215996652841568, "learning_rate": 0.0001469204, "loss": 0.3143, "step": 132700 }, { "epoch": 0.0256, "grad_norm": 0.2471071183681488, "learning_rate": 0.0001468804, "loss": 0.3019, "step": 132800 }, { "epoch": 0.0258, "grad_norm": 0.21399539709091187, "learning_rate": 0.00014684040000000002, "loss": 0.3064, "step": 132900 }, { "epoch": 0.026, "grad_norm": 0.2229408621788025, "learning_rate": 0.00014680040000000002, "loss": 0.3383, "step": 133000 }, { "epoch": 0.0262, "grad_norm": 0.1846669614315033, "learning_rate": 0.00014676039999999998, "loss": 0.3073, "step": 133100 }, { "epoch": 0.0264, "grad_norm": 0.2286502569913864, "learning_rate": 0.0001467204, "loss": 0.3073, "step": 133200 }, { "epoch": 0.0266, "grad_norm": 0.21924042701721191, "learning_rate": 0.0001466804, "loss": 0.3055, "step": 133300 }, { "epoch": 0.0268, "grad_norm": 0.3561740517616272, "learning_rate": 0.00014664040000000002, "loss": 0.3023, "step": 133400 }, { "epoch": 0.027, "grad_norm": 0.2278672158718109, "learning_rate": 0.0001466004, "loss": 0.2862, "step": 133500 }, { "epoch": 0.0272, "grad_norm": 0.28851574659347534, "learning_rate": 0.0001465604, "loss": 0.2724, "step": 133600 }, { "epoch": 0.0274, "grad_norm": 0.20886564254760742, "learning_rate": 0.0001465204, "loss": 0.2766, "step": 133700 }, { "epoch": 0.0276, "grad_norm": 0.2499471753835678, "learning_rate": 0.0001464804, "loss": 0.2761, "step": 133800 }, { "epoch": 0.0278, "grad_norm": 0.1883542537689209, "learning_rate": 0.00014644040000000002, "loss": 0.2739, "step": 133900 }, { "epoch": 0.028, "grad_norm": 0.16889332234859467, "learning_rate": 0.0001464004, "loss": 0.284, "step": 134000 }, { "epoch": 0.0282, "grad_norm": 0.23524640500545502, "learning_rate": 0.0001463604, "loss": 0.265, "step": 134100 }, { "epoch": 0.0284, "grad_norm": 0.18847911059856415, "learning_rate": 0.0001463204, "loss": 0.2718, "step": 134200 }, { "epoch": 0.0286, "grad_norm": 0.1972835212945938, "learning_rate": 0.00014628040000000002, "loss": 0.2858, "step": 134300 }, { "epoch": 0.0288, "grad_norm": 0.2064981758594513, "learning_rate": 0.0001462404, "loss": 0.2715, "step": 134400 }, { "epoch": 0.029, "grad_norm": 0.39654669165611267, "learning_rate": 0.0001462004, "loss": 0.2831, "step": 134500 }, { "epoch": 0.0292, "grad_norm": 0.7286176085472107, "learning_rate": 0.0001461604, "loss": 0.2753, "step": 134600 }, { "epoch": 0.0294, "grad_norm": 0.2998080849647522, "learning_rate": 0.0001461204, "loss": 0.2737, "step": 134700 }, { "epoch": 0.0296, "grad_norm": 0.22182480990886688, "learning_rate": 0.0001460804, "loss": 0.2809, "step": 134800 }, { "epoch": 0.0298, "grad_norm": 0.3183176517486572, "learning_rate": 0.0001460404, "loss": 0.278, "step": 134900 }, { "epoch": 0.03, "grad_norm": 0.24670830368995667, "learning_rate": 0.00014600040000000003, "loss": 0.284, "step": 135000 }, { "epoch": 0.0302, "grad_norm": 0.22563083469867706, "learning_rate": 0.0001459604, "loss": 0.2843, "step": 135100 }, { "epoch": 0.0304, "grad_norm": 0.2058597058057785, "learning_rate": 0.0001459204, "loss": 0.2767, "step": 135200 }, { "epoch": 0.0306, "grad_norm": 0.21196039021015167, "learning_rate": 0.0001458804, "loss": 0.2765, "step": 135300 }, { "epoch": 0.0308, "grad_norm": 0.24366965889930725, "learning_rate": 0.0001458404, "loss": 0.2783, "step": 135400 }, { "epoch": 0.031, "grad_norm": 0.24236196279525757, "learning_rate": 0.00014580040000000002, "loss": 0.2873, "step": 135500 }, { "epoch": 0.0312, "grad_norm": 0.2742379903793335, "learning_rate": 0.0001457604, "loss": 0.2895, "step": 135600 }, { "epoch": 0.0314, "grad_norm": 0.40432196855545044, "learning_rate": 0.0001457204, "loss": 0.2802, "step": 135700 }, { "epoch": 0.0316, "grad_norm": 0.21296831965446472, "learning_rate": 0.0001456804, "loss": 0.2787, "step": 135800 }, { "epoch": 0.0318, "grad_norm": 0.38033196330070496, "learning_rate": 0.00014564040000000002, "loss": 0.2832, "step": 135900 }, { "epoch": 0.032, "grad_norm": 0.28289541602134705, "learning_rate": 0.00014560040000000002, "loss": 0.2853, "step": 136000 }, { "epoch": 0.0322, "grad_norm": 0.21910668909549713, "learning_rate": 0.0001455604, "loss": 0.2872, "step": 136100 }, { "epoch": 0.0324, "grad_norm": 0.23765510320663452, "learning_rate": 0.0001455204, "loss": 0.2826, "step": 136200 }, { "epoch": 0.0326, "grad_norm": 0.27492231130599976, "learning_rate": 0.0001454804, "loss": 0.2825, "step": 136300 }, { "epoch": 0.0328, "grad_norm": 0.16266077756881714, "learning_rate": 0.00014544040000000002, "loss": 0.2949, "step": 136400 }, { "epoch": 0.033, "grad_norm": 0.21813906729221344, "learning_rate": 0.0001454004, "loss": 0.2898, "step": 136500 }, { "epoch": 0.0332, "grad_norm": 0.6559190154075623, "learning_rate": 0.0001453604, "loss": 0.2917, "step": 136600 }, { "epoch": 0.0334, "grad_norm": 0.21168918907642365, "learning_rate": 0.0001453204, "loss": 0.3655, "step": 136700 }, { "epoch": 0.0336, "grad_norm": 0.2966395616531372, "learning_rate": 0.0001452804, "loss": 0.2859, "step": 136800 }, { "epoch": 0.0338, "grad_norm": 0.20525045692920685, "learning_rate": 0.0001452404, "loss": 0.293, "step": 136900 }, { "epoch": 0.034, "grad_norm": 0.17695209383964539, "learning_rate": 0.0001452004, "loss": 0.2869, "step": 137000 }, { "epoch": 0.0342, "grad_norm": 0.182274729013443, "learning_rate": 0.0001451604, "loss": 0.2843, "step": 137100 }, { "epoch": 0.0344, "grad_norm": 0.8087812662124634, "learning_rate": 0.0001451204, "loss": 0.28, "step": 137200 }, { "epoch": 0.0346, "grad_norm": 0.33212119340896606, "learning_rate": 0.00014508040000000001, "loss": 0.2787, "step": 137300 }, { "epoch": 0.0348, "grad_norm": 0.18305575847625732, "learning_rate": 0.0001450404, "loss": 0.2812, "step": 137400 }, { "epoch": 0.035, "grad_norm": 0.17764481902122498, "learning_rate": 0.0001450004, "loss": 0.2773, "step": 137500 }, { "epoch": 0.0352, "grad_norm": 0.33276671171188354, "learning_rate": 0.0001449604, "loss": 0.2829, "step": 137600 }, { "epoch": 0.0354, "grad_norm": 0.2309039831161499, "learning_rate": 0.0001449204, "loss": 0.2744, "step": 137700 }, { "epoch": 0.0356, "grad_norm": 0.21504448354244232, "learning_rate": 0.0001448804, "loss": 0.2975, "step": 137800 }, { "epoch": 0.0358, "grad_norm": 0.23908798396587372, "learning_rate": 0.0001448404, "loss": 0.2891, "step": 137900 }, { "epoch": 0.036, "grad_norm": 0.18185609579086304, "learning_rate": 0.00014480040000000002, "loss": 0.2821, "step": 138000 }, { "epoch": 0.0362, "grad_norm": 0.3852616548538208, "learning_rate": 0.0001447604, "loss": 0.2854, "step": 138100 }, { "epoch": 0.0364, "grad_norm": 0.2340957075357437, "learning_rate": 0.0001447204, "loss": 0.2819, "step": 138200 }, { "epoch": 0.0366, "grad_norm": 0.23798440396785736, "learning_rate": 0.0001446804, "loss": 0.2838, "step": 138300 }, { "epoch": 0.0368, "grad_norm": 0.2655554413795471, "learning_rate": 0.0001446404, "loss": 0.2847, "step": 138400 }, { "epoch": 0.037, "grad_norm": 0.20889754593372345, "learning_rate": 0.00014460040000000002, "loss": 0.2906, "step": 138500 }, { "epoch": 0.0372, "grad_norm": 0.2111879587173462, "learning_rate": 0.00014456039999999999, "loss": 0.2766, "step": 138600 }, { "epoch": 0.0374, "grad_norm": 1.0969158411026, "learning_rate": 0.0001445204, "loss": 0.2879, "step": 138700 }, { "epoch": 0.0376, "grad_norm": 0.20623628795146942, "learning_rate": 0.0001444804, "loss": 0.288, "step": 138800 }, { "epoch": 0.0378, "grad_norm": 0.23224857449531555, "learning_rate": 0.00014444040000000002, "loss": 0.2794, "step": 138900 }, { "epoch": 0.038, "grad_norm": 0.20052120089530945, "learning_rate": 0.00014440040000000001, "loss": 0.2779, "step": 139000 }, { "epoch": 0.0382, "grad_norm": 0.21525472402572632, "learning_rate": 0.0001443604, "loss": 0.2861, "step": 139100 }, { "epoch": 0.0384, "grad_norm": 0.155807763338089, "learning_rate": 0.0001443204, "loss": 0.2868, "step": 139200 }, { "epoch": 0.0386, "grad_norm": 0.3168600797653198, "learning_rate": 0.0001442804, "loss": 0.2789, "step": 139300 }, { "epoch": 0.0388, "grad_norm": 0.23451729118824005, "learning_rate": 0.00014424040000000002, "loss": 0.2891, "step": 139400 }, { "epoch": 0.039, "grad_norm": 0.29926997423171997, "learning_rate": 0.0001442004, "loss": 0.2977, "step": 139500 }, { "epoch": 0.0392, "grad_norm": 0.27708712220191956, "learning_rate": 0.00014416040000000003, "loss": 0.2897, "step": 139600 }, { "epoch": 0.0394, "grad_norm": 0.17544563114643097, "learning_rate": 0.0001441204, "loss": 0.282, "step": 139700 }, { "epoch": 0.0396, "grad_norm": 0.24782776832580566, "learning_rate": 0.00014408040000000002, "loss": 0.2789, "step": 139800 }, { "epoch": 0.0398, "grad_norm": 0.23590870201587677, "learning_rate": 0.0001440404, "loss": 0.2794, "step": 139900 }, { "epoch": 0.04, "grad_norm": 0.17239037156105042, "learning_rate": 0.0001440004, "loss": 0.2843, "step": 140000 }, { "epoch": 0.0402, "grad_norm": 0.22361674904823303, "learning_rate": 0.00014396040000000003, "loss": 0.2794, "step": 140100 }, { "epoch": 0.0404, "grad_norm": 0.3022303581237793, "learning_rate": 0.0001439204, "loss": 0.3019, "step": 140200 }, { "epoch": 0.0406, "grad_norm": 0.2557013928890228, "learning_rate": 0.0001438804, "loss": 0.2996, "step": 140300 }, { "epoch": 0.0408, "grad_norm": 0.278455913066864, "learning_rate": 0.0001438404, "loss": 0.2935, "step": 140400 }, { "epoch": 0.041, "grad_norm": 0.37940022349357605, "learning_rate": 0.0001438004, "loss": 0.2886, "step": 140500 }, { "epoch": 0.0412, "grad_norm": 0.34311047196388245, "learning_rate": 0.00014376040000000002, "loss": 0.2892, "step": 140600 }, { "epoch": 0.0414, "grad_norm": 0.3010990619659424, "learning_rate": 0.0001437204, "loss": 0.2841, "step": 140700 }, { "epoch": 0.0416, "grad_norm": 0.19272881746292114, "learning_rate": 0.0001436804, "loss": 0.3048, "step": 140800 }, { "epoch": 0.0418, "grad_norm": 0.20627625286579132, "learning_rate": 0.0001436404, "loss": 0.295, "step": 140900 }, { "epoch": 0.042, "grad_norm": 0.4096256494522095, "learning_rate": 0.00014360040000000002, "loss": 0.296, "step": 141000 }, { "epoch": 0.0422, "grad_norm": 0.20318618416786194, "learning_rate": 0.00014356040000000002, "loss": 0.2917, "step": 141100 }, { "epoch": 0.0424, "grad_norm": 0.5557262897491455, "learning_rate": 0.0001435204, "loss": 0.2922, "step": 141200 }, { "epoch": 0.0426, "grad_norm": 0.2203870415687561, "learning_rate": 0.0001434804, "loss": 0.2947, "step": 141300 }, { "epoch": 0.0428, "grad_norm": 0.23622627556324005, "learning_rate": 0.0001434404, "loss": 0.2935, "step": 141400 }, { "epoch": 0.043, "grad_norm": 0.24292029440402985, "learning_rate": 0.00014340040000000002, "loss": 0.2956, "step": 141500 }, { "epoch": 0.0432, "grad_norm": 0.18479755520820618, "learning_rate": 0.0001433604, "loss": 0.2997, "step": 141600 }, { "epoch": 0.0434, "grad_norm": 0.2119598090648651, "learning_rate": 0.0001433204, "loss": 0.2957, "step": 141700 }, { "epoch": 0.0436, "grad_norm": 0.2696900963783264, "learning_rate": 0.0001432804, "loss": 0.293, "step": 141800 }, { "epoch": 0.0438, "grad_norm": 0.17380273342132568, "learning_rate": 0.00014324040000000002, "loss": 0.2979, "step": 141900 }, { "epoch": 0.044, "grad_norm": 0.20531192421913147, "learning_rate": 0.0001432004, "loss": 0.2969, "step": 142000 }, { "epoch": 0.0442, "grad_norm": 0.2422923445701599, "learning_rate": 0.0001431604, "loss": 0.3016, "step": 142100 }, { "epoch": 0.0444, "grad_norm": 0.16883300244808197, "learning_rate": 0.0001431204, "loss": 0.2839, "step": 142200 }, { "epoch": 0.0446, "grad_norm": 0.1872025579214096, "learning_rate": 0.0001430804, "loss": 0.3367, "step": 142300 }, { "epoch": 0.0448, "grad_norm": 0.18559499084949493, "learning_rate": 0.00014304040000000001, "loss": 0.2872, "step": 142400 }, { "epoch": 0.045, "grad_norm": 0.22933410108089447, "learning_rate": 0.0001430004, "loss": 0.2856, "step": 142500 }, { "epoch": 0.0452, "grad_norm": 0.1616288274526596, "learning_rate": 0.00014296040000000003, "loss": 0.2832, "step": 142600 }, { "epoch": 0.0454, "grad_norm": 0.190198615193367, "learning_rate": 0.0001429204, "loss": 0.2811, "step": 142700 }, { "epoch": 0.0456, "grad_norm": 0.2478422075510025, "learning_rate": 0.00014288040000000002, "loss": 0.2808, "step": 142800 }, { "epoch": 0.0458, "grad_norm": 0.3362259566783905, "learning_rate": 0.0001428404, "loss": 0.285, "step": 142900 }, { "epoch": 0.046, "grad_norm": 0.1876300573348999, "learning_rate": 0.0001428004, "loss": 0.2844, "step": 143000 }, { "epoch": 0.0462, "grad_norm": 0.267817884683609, "learning_rate": 0.00014276040000000002, "loss": 0.2967, "step": 143100 }, { "epoch": 0.0464, "grad_norm": 0.2802794277667999, "learning_rate": 0.0001427204, "loss": 0.3118, "step": 143200 }, { "epoch": 0.0466, "grad_norm": 0.27438464760780334, "learning_rate": 0.0001426804, "loss": 0.2868, "step": 143300 }, { "epoch": 0.0468, "grad_norm": 0.22925788164138794, "learning_rate": 0.0001426404, "loss": 0.2945, "step": 143400 }, { "epoch": 0.047, "grad_norm": 0.17650224268436432, "learning_rate": 0.00014260040000000002, "loss": 0.2908, "step": 143500 }, { "epoch": 0.0472, "grad_norm": 0.1927453577518463, "learning_rate": 0.00014256040000000002, "loss": 0.2882, "step": 143600 }, { "epoch": 0.0474, "grad_norm": 0.2470424622297287, "learning_rate": 0.00014252039999999998, "loss": 0.2918, "step": 143700 }, { "epoch": 0.0476, "grad_norm": 0.2224511355161667, "learning_rate": 0.0001424804, "loss": 0.2924, "step": 143800 }, { "epoch": 0.0478, "grad_norm": 0.25327613949775696, "learning_rate": 0.0001424404, "loss": 0.2966, "step": 143900 }, { "epoch": 0.048, "grad_norm": 0.2762177288532257, "learning_rate": 0.00014240040000000002, "loss": 0.2976, "step": 144000 }, { "epoch": 0.0482, "grad_norm": 0.32673609256744385, "learning_rate": 0.0001423604, "loss": 0.2974, "step": 144100 }, { "epoch": 0.0484, "grad_norm": 0.4225119352340698, "learning_rate": 0.0001423204, "loss": 0.2931, "step": 144200 }, { "epoch": 0.0486, "grad_norm": 0.20474287867546082, "learning_rate": 0.0001422804, "loss": 0.2992, "step": 144300 }, { "epoch": 0.0488, "grad_norm": 0.210563525557518, "learning_rate": 0.0001422404, "loss": 0.2848, "step": 144400 }, { "epoch": 0.049, "grad_norm": 0.21854105591773987, "learning_rate": 0.00014220040000000001, "loss": 0.2965, "step": 144500 }, { "epoch": 0.0492, "grad_norm": 0.2694493532180786, "learning_rate": 0.0001421604, "loss": 0.2847, "step": 144600 }, { "epoch": 0.0494, "grad_norm": 0.33580654859542847, "learning_rate": 0.0001421204, "loss": 0.2971, "step": 144700 }, { "epoch": 0.0496, "grad_norm": 0.22318777441978455, "learning_rate": 0.0001420804, "loss": 0.2951, "step": 144800 }, { "epoch": 0.0498, "grad_norm": 0.21913912892341614, "learning_rate": 0.00014204040000000002, "loss": 0.2943, "step": 144900 }, { "epoch": 0.05, "grad_norm": 0.1860969066619873, "learning_rate": 0.0001420004, "loss": 0.2937, "step": 145000 }, { "epoch": 0.0502, "grad_norm": 0.18611373007297516, "learning_rate": 0.0001419604, "loss": 0.2915, "step": 145100 }, { "epoch": 0.0504, "grad_norm": 0.3601659834384918, "learning_rate": 0.0001419204, "loss": 0.3132, "step": 145200 }, { "epoch": 0.0506, "grad_norm": 0.22689419984817505, "learning_rate": 0.0001418804, "loss": 0.2991, "step": 145300 }, { "epoch": 0.0508, "grad_norm": 0.22691553831100464, "learning_rate": 0.0001418404, "loss": 0.2964, "step": 145400 }, { "epoch": 0.051, "grad_norm": 0.18553423881530762, "learning_rate": 0.0001418004, "loss": 0.2975, "step": 145500 }, { "epoch": 0.0512, "grad_norm": 0.21025227010250092, "learning_rate": 0.00014176040000000003, "loss": 0.2923, "step": 145600 }, { "epoch": 0.0514, "grad_norm": 0.17990227043628693, "learning_rate": 0.0001417204, "loss": 0.3035, "step": 145700 }, { "epoch": 0.0516, "grad_norm": 0.18237918615341187, "learning_rate": 0.0001416804, "loss": 0.3022, "step": 145800 }, { "epoch": 0.0518, "grad_norm": 0.20058906078338623, "learning_rate": 0.0001416404, "loss": 0.2902, "step": 145900 }, { "epoch": 0.052, "grad_norm": 0.40441182255744934, "learning_rate": 0.0001416004, "loss": 0.3083, "step": 146000 }, { "epoch": 0.0522, "grad_norm": 0.2583122253417969, "learning_rate": 0.00014156040000000002, "loss": 0.285, "step": 146100 }, { "epoch": 0.0524, "grad_norm": 0.18074502050876617, "learning_rate": 0.0001415204, "loss": 0.2932, "step": 146200 }, { "epoch": 0.0526, "grad_norm": 0.3700363337993622, "learning_rate": 0.0001414804, "loss": 0.314, "step": 146300 }, { "epoch": 0.0528, "grad_norm": 0.20429983735084534, "learning_rate": 0.0001414404, "loss": 0.2994, "step": 146400 }, { "epoch": 0.053, "grad_norm": 0.4449596405029297, "learning_rate": 0.00014140040000000002, "loss": 0.2922, "step": 146500 }, { "epoch": 0.0532, "grad_norm": 0.27035078406333923, "learning_rate": 0.00014136040000000002, "loss": 0.3025, "step": 146600 }, { "epoch": 0.0534, "grad_norm": 0.1620853841304779, "learning_rate": 0.0001413204, "loss": 0.2883, "step": 146700 }, { "epoch": 0.0536, "grad_norm": 0.20872916281223297, "learning_rate": 0.0001412804, "loss": 0.2899, "step": 146800 }, { "epoch": 0.0538, "grad_norm": 0.171489417552948, "learning_rate": 0.0001412404, "loss": 0.293, "step": 146900 }, { "epoch": 0.054, "grad_norm": 0.19745928049087524, "learning_rate": 0.00014120040000000002, "loss": 0.2898, "step": 147000 }, { "epoch": 0.0542, "grad_norm": 0.18130256235599518, "learning_rate": 0.0001411604, "loss": 0.2923, "step": 147100 }, { "epoch": 0.0544, "grad_norm": 0.29075926542282104, "learning_rate": 0.0001411204, "loss": 0.3032, "step": 147200 }, { "epoch": 0.0546, "grad_norm": 0.21553675830364227, "learning_rate": 0.0001410804, "loss": 0.3012, "step": 147300 }, { "epoch": 0.0548, "grad_norm": 0.23423010110855103, "learning_rate": 0.00014104040000000002, "loss": 0.2911, "step": 147400 }, { "epoch": 0.055, "grad_norm": 0.2579517364501953, "learning_rate": 0.0001410004, "loss": 0.292, "step": 147500 }, { "epoch": 0.0552, "grad_norm": 0.23888039588928223, "learning_rate": 0.0001409604, "loss": 0.3001, "step": 147600 }, { "epoch": 0.0554, "grad_norm": 0.32866355776786804, "learning_rate": 0.0001409204, "loss": 0.318, "step": 147700 }, { "epoch": 0.0556, "grad_norm": 0.30911415815353394, "learning_rate": 0.0001408804, "loss": 0.3005, "step": 147800 }, { "epoch": 0.0558, "grad_norm": 0.2983124852180481, "learning_rate": 0.00014084040000000001, "loss": 0.2895, "step": 147900 }, { "epoch": 0.056, "grad_norm": 0.18074147403240204, "learning_rate": 0.0001408004, "loss": 0.3108, "step": 148000 }, { "epoch": 0.0562, "grad_norm": 0.17594757676124573, "learning_rate": 0.0001407604, "loss": 0.2967, "step": 148100 }, { "epoch": 0.0564, "grad_norm": 0.21774758398532867, "learning_rate": 0.0001407204, "loss": 0.299, "step": 148200 }, { "epoch": 0.0566, "grad_norm": 0.3890017569065094, "learning_rate": 0.0001406804, "loss": 0.2822, "step": 148300 }, { "epoch": 0.0568, "grad_norm": 0.23971693217754364, "learning_rate": 0.0001406404, "loss": 0.3042, "step": 148400 }, { "epoch": 0.057, "grad_norm": 0.14672812819480896, "learning_rate": 0.0001406004, "loss": 0.3023, "step": 148500 }, { "epoch": 0.0572, "grad_norm": 0.22197787463665009, "learning_rate": 0.00014056040000000002, "loss": 0.2946, "step": 148600 }, { "epoch": 0.0574, "grad_norm": 0.34892770648002625, "learning_rate": 0.0001405204, "loss": 0.3027, "step": 148700 }, { "epoch": 0.0576, "grad_norm": 0.22468386590480804, "learning_rate": 0.0001404804, "loss": 0.3036, "step": 148800 }, { "epoch": 0.0578, "grad_norm": 0.3529873192310333, "learning_rate": 0.0001404404, "loss": 0.3103, "step": 148900 }, { "epoch": 0.058, "grad_norm": 0.22541065514087677, "learning_rate": 0.0001404004, "loss": 0.3031, "step": 149000 }, { "epoch": 0.0582, "grad_norm": 0.19883927702903748, "learning_rate": 0.00014036040000000002, "loss": 0.2875, "step": 149100 }, { "epoch": 0.0584, "grad_norm": 0.16821545362472534, "learning_rate": 0.00014032039999999999, "loss": 0.2906, "step": 149200 }, { "epoch": 0.0586, "grad_norm": 0.2122853845357895, "learning_rate": 0.0001402804, "loss": 0.294, "step": 149300 }, { "epoch": 0.0588, "grad_norm": 0.21868352591991425, "learning_rate": 0.0001402404, "loss": 0.3014, "step": 149400 }, { "epoch": 0.059, "grad_norm": 0.26323187351226807, "learning_rate": 0.00014020040000000002, "loss": 0.3239, "step": 149500 }, { "epoch": 0.0592, "grad_norm": 0.28496459126472473, "learning_rate": 0.00014016040000000001, "loss": 0.3198, "step": 149600 }, { "epoch": 0.0594, "grad_norm": 0.21418221294879913, "learning_rate": 0.0001401204, "loss": 0.3037, "step": 149700 }, { "epoch": 0.0596, "grad_norm": 0.18050341308116913, "learning_rate": 0.0001400804, "loss": 0.2864, "step": 149800 }, { "epoch": 0.0598, "grad_norm": 0.18909691274166107, "learning_rate": 0.0001400404, "loss": 0.2765, "step": 149900 }, { "epoch": 0.06, "grad_norm": 0.1939447522163391, "learning_rate": 0.00014000040000000002, "loss": 0.2759, "step": 150000 }, { "epoch": 0.0602, "grad_norm": 0.2437305450439453, "learning_rate": 0.0001399604, "loss": 0.2684, "step": 150100 }, { "epoch": 0.0604, "grad_norm": 0.18825781345367432, "learning_rate": 0.0001399204, "loss": 0.2675, "step": 150200 }, { "epoch": 0.0606, "grad_norm": 0.17045961320400238, "learning_rate": 0.0001398804, "loss": 0.2668, "step": 150300 }, { "epoch": 0.0608, "grad_norm": 0.16906027495861053, "learning_rate": 0.00013984040000000002, "loss": 0.2641, "step": 150400 }, { "epoch": 0.061, "grad_norm": 0.19279079139232635, "learning_rate": 0.0001398004, "loss": 0.2656, "step": 150500 }, { "epoch": 0.0612, "grad_norm": 0.16379393637180328, "learning_rate": 0.0001397604, "loss": 0.2643, "step": 150600 }, { "epoch": 0.0614, "grad_norm": 0.20136988162994385, "learning_rate": 0.0001397204, "loss": 0.2644, "step": 150700 }, { "epoch": 0.0616, "grad_norm": 0.16851016879081726, "learning_rate": 0.0001396804, "loss": 0.2688, "step": 150800 }, { "epoch": 0.0618, "grad_norm": 0.3253178596496582, "learning_rate": 0.0001396404, "loss": 0.2745, "step": 150900 }, { "epoch": 0.062, "grad_norm": 0.22693248093128204, "learning_rate": 0.0001396004, "loss": 0.264, "step": 151000 }, { "epoch": 0.0622, "grad_norm": 0.16674602031707764, "learning_rate": 0.00013956040000000003, "loss": 0.2702, "step": 151100 }, { "epoch": 0.0624, "grad_norm": 0.17631779611110687, "learning_rate": 0.00013952040000000002, "loss": 0.2707, "step": 151200 }, { "epoch": 0.0626, "grad_norm": 0.18675994873046875, "learning_rate": 0.0001394804, "loss": 0.2697, "step": 151300 }, { "epoch": 0.0628, "grad_norm": 0.19959695637226105, "learning_rate": 0.0001394404, "loss": 0.2674, "step": 151400 }, { "epoch": 0.063, "grad_norm": 0.1849915087223053, "learning_rate": 0.0001394004, "loss": 0.2662, "step": 151500 }, { "epoch": 0.0632, "grad_norm": 0.9266983866691589, "learning_rate": 0.00013936040000000002, "loss": 0.2693, "step": 151600 }, { "epoch": 0.0634, "grad_norm": 0.1627340465784073, "learning_rate": 0.00013932040000000002, "loss": 0.2692, "step": 151700 }, { "epoch": 0.0636, "grad_norm": 0.20373468101024628, "learning_rate": 0.0001392804, "loss": 0.263, "step": 151800 }, { "epoch": 0.0638, "grad_norm": 0.18756113946437836, "learning_rate": 0.0001392404, "loss": 0.2629, "step": 151900 }, { "epoch": 0.064, "grad_norm": 0.21463698148727417, "learning_rate": 0.0001392004, "loss": 0.2727, "step": 152000 }, { "epoch": 0.0642, "grad_norm": 0.18175888061523438, "learning_rate": 0.00013916040000000002, "loss": 0.2732, "step": 152100 }, { "epoch": 0.0644, "grad_norm": 0.19156472384929657, "learning_rate": 0.0001391204, "loss": 0.2681, "step": 152200 }, { "epoch": 0.0646, "grad_norm": 0.1933964043855667, "learning_rate": 0.0001390804, "loss": 0.2669, "step": 152300 }, { "epoch": 0.0648, "grad_norm": 0.16370782256126404, "learning_rate": 0.0001390404, "loss": 0.2637, "step": 152400 }, { "epoch": 0.065, "grad_norm": 0.1704418957233429, "learning_rate": 0.00013900040000000002, "loss": 0.2651, "step": 152500 }, { "epoch": 0.0652, "grad_norm": 0.16683322191238403, "learning_rate": 0.0001389604, "loss": 0.2704, "step": 152600 }, { "epoch": 0.0654, "grad_norm": 0.2169724553823471, "learning_rate": 0.0001389204, "loss": 0.2701, "step": 152700 }, { "epoch": 0.0656, "grad_norm": 0.16984739899635315, "learning_rate": 0.0001388804, "loss": 0.267, "step": 152800 }, { "epoch": 0.0658, "grad_norm": 0.1774764508008957, "learning_rate": 0.0001388404, "loss": 0.2694, "step": 152900 }, { "epoch": 0.066, "grad_norm": 0.25803592801094055, "learning_rate": 0.00013880040000000001, "loss": 0.2692, "step": 153000 }, { "epoch": 0.0662, "grad_norm": 0.16305796802043915, "learning_rate": 0.0001387604, "loss": 0.2666, "step": 153100 }, { "epoch": 0.0664, "grad_norm": 0.28455424308776855, "learning_rate": 0.00013872040000000003, "loss": 0.2692, "step": 153200 }, { "epoch": 0.0666, "grad_norm": 0.527635931968689, "learning_rate": 0.0001386804, "loss": 0.2709, "step": 153300 }, { "epoch": 0.0668, "grad_norm": 0.15370622277259827, "learning_rate": 0.00013864040000000002, "loss": 0.2709, "step": 153400 }, { "epoch": 0.067, "grad_norm": 0.17640765011310577, "learning_rate": 0.0001386004, "loss": 0.271, "step": 153500 }, { "epoch": 0.0672, "grad_norm": 0.19771219789981842, "learning_rate": 0.0001385604, "loss": 0.2709, "step": 153600 }, { "epoch": 0.0674, "grad_norm": 0.1551404446363449, "learning_rate": 0.00013852040000000002, "loss": 0.2661, "step": 153700 }, { "epoch": 0.0676, "grad_norm": 0.17845940589904785, "learning_rate": 0.0001384804, "loss": 0.2696, "step": 153800 }, { "epoch": 0.0678, "grad_norm": 0.1720665991306305, "learning_rate": 0.0001384404, "loss": 0.2648, "step": 153900 }, { "epoch": 0.068, "grad_norm": 0.1719100922346115, "learning_rate": 0.0001384004, "loss": 0.2688, "step": 154000 }, { "epoch": 0.0682, "grad_norm": 0.16675078868865967, "learning_rate": 0.00013836040000000002, "loss": 0.274, "step": 154100 }, { "epoch": 0.0684, "grad_norm": 0.21262894570827484, "learning_rate": 0.00013832040000000002, "loss": 0.2699, "step": 154200 }, { "epoch": 0.0686, "grad_norm": 0.21415004134178162, "learning_rate": 0.0001382804, "loss": 0.2634, "step": 154300 }, { "epoch": 0.0688, "grad_norm": 0.16432279348373413, "learning_rate": 0.0001382404, "loss": 0.2671, "step": 154400 }, { "epoch": 0.069, "grad_norm": 0.2584374248981476, "learning_rate": 0.0001382004, "loss": 0.2731, "step": 154500 }, { "epoch": 0.0692, "grad_norm": 0.1635134071111679, "learning_rate": 0.00013816040000000002, "loss": 0.2686, "step": 154600 }, { "epoch": 0.0694, "grad_norm": 0.23434379696846008, "learning_rate": 0.0001381204, "loss": 0.2749, "step": 154700 }, { "epoch": 0.0696, "grad_norm": 0.16734665632247925, "learning_rate": 0.0001380804, "loss": 0.2663, "step": 154800 }, { "epoch": 0.0698, "grad_norm": 0.17818975448608398, "learning_rate": 0.0001380404, "loss": 0.2702, "step": 154900 }, { "epoch": 0.07, "grad_norm": 0.17465141415596008, "learning_rate": 0.0001380004, "loss": 0.2728, "step": 155000 }, { "epoch": 0.0702, "grad_norm": 0.24960975348949432, "learning_rate": 0.00013796040000000001, "loss": 0.2663, "step": 155100 }, { "epoch": 0.0704, "grad_norm": 0.22631630301475525, "learning_rate": 0.0001379204, "loss": 0.2728, "step": 155200 }, { "epoch": 0.0706, "grad_norm": 0.35504263639450073, "learning_rate": 0.0001378804, "loss": 0.2704, "step": 155300 }, { "epoch": 0.0708, "grad_norm": 0.19621668756008148, "learning_rate": 0.0001378404, "loss": 0.2723, "step": 155400 }, { "epoch": 0.071, "grad_norm": 0.14076755940914154, "learning_rate": 0.00013780040000000002, "loss": 0.2676, "step": 155500 }, { "epoch": 0.0712, "grad_norm": 0.19740091264247894, "learning_rate": 0.0001377604, "loss": 0.2654, "step": 155600 }, { "epoch": 0.0714, "grad_norm": 0.21776890754699707, "learning_rate": 0.0001377204, "loss": 0.2648, "step": 155700 }, { "epoch": 0.0716, "grad_norm": 0.19757398962974548, "learning_rate": 0.0001376804, "loss": 0.2706, "step": 155800 }, { "epoch": 0.0718, "grad_norm": 0.17804448306560516, "learning_rate": 0.0001376404, "loss": 0.2692, "step": 155900 }, { "epoch": 0.072, "grad_norm": 0.28381094336509705, "learning_rate": 0.0001376004, "loss": 0.2658, "step": 156000 }, { "epoch": 0.0722, "grad_norm": 0.19585195183753967, "learning_rate": 0.0001375604, "loss": 0.27, "step": 156100 }, { "epoch": 0.0724, "grad_norm": 0.1824631541967392, "learning_rate": 0.00013752040000000003, "loss": 0.2726, "step": 156200 }, { "epoch": 0.0726, "grad_norm": 0.16430000960826874, "learning_rate": 0.0001374804, "loss": 0.2702, "step": 156300 }, { "epoch": 0.0728, "grad_norm": 0.14714659750461578, "learning_rate": 0.0001374404, "loss": 0.2704, "step": 156400 }, { "epoch": 0.073, "grad_norm": 0.2480650097131729, "learning_rate": 0.0001374004, "loss": 0.2651, "step": 156500 }, { "epoch": 0.0732, "grad_norm": 0.1619512140750885, "learning_rate": 0.0001373604, "loss": 0.2661, "step": 156600 }, { "epoch": 0.0734, "grad_norm": 0.16349893808364868, "learning_rate": 0.00013732040000000002, "loss": 0.2684, "step": 156700 }, { "epoch": 0.0736, "grad_norm": 0.31547239422798157, "learning_rate": 0.0001372804, "loss": 0.267, "step": 156800 }, { "epoch": 0.0738, "grad_norm": 0.17574888467788696, "learning_rate": 0.0001372404, "loss": 0.2712, "step": 156900 }, { "epoch": 0.074, "grad_norm": 0.21195268630981445, "learning_rate": 0.0001372004, "loss": 0.2689, "step": 157000 }, { "epoch": 0.0742, "grad_norm": 0.16163906455039978, "learning_rate": 0.00013716040000000002, "loss": 0.2696, "step": 157100 }, { "epoch": 0.0744, "grad_norm": 0.22510552406311035, "learning_rate": 0.00013712040000000002, "loss": 0.2691, "step": 157200 }, { "epoch": 0.0746, "grad_norm": 0.27251046895980835, "learning_rate": 0.0001370804, "loss": 0.2678, "step": 157300 }, { "epoch": 0.0748, "grad_norm": 0.24684669077396393, "learning_rate": 0.0001370404, "loss": 0.267, "step": 157400 }, { "epoch": 0.075, "grad_norm": 0.2159154713153839, "learning_rate": 0.0001370004, "loss": 0.2792, "step": 157500 }, { "epoch": 0.0752, "grad_norm": 0.16219884157180786, "learning_rate": 0.00013696040000000002, "loss": 0.2714, "step": 157600 }, { "epoch": 0.0754, "grad_norm": 0.28837162256240845, "learning_rate": 0.0001369204, "loss": 0.2668, "step": 157700 }, { "epoch": 0.0756, "grad_norm": 0.23692043125629425, "learning_rate": 0.0001368804, "loss": 0.2673, "step": 157800 }, { "epoch": 0.0758, "grad_norm": 0.1836722046136856, "learning_rate": 0.0001368404, "loss": 0.2685, "step": 157900 }, { "epoch": 0.076, "grad_norm": 0.18108327686786652, "learning_rate": 0.00013680040000000002, "loss": 0.2725, "step": 158000 }, { "epoch": 0.0762, "grad_norm": 0.14798210561275482, "learning_rate": 0.0001367604, "loss": 0.2682, "step": 158100 }, { "epoch": 0.0764, "grad_norm": 0.3181903064250946, "learning_rate": 0.0001367204, "loss": 0.2608, "step": 158200 }, { "epoch": 0.0766, "grad_norm": 0.15039204061031342, "learning_rate": 0.0001366804, "loss": 0.2676, "step": 158300 }, { "epoch": 0.0768, "grad_norm": 0.18739774823188782, "learning_rate": 0.0001366404, "loss": 0.269, "step": 158400 }, { "epoch": 0.077, "grad_norm": 0.28927820920944214, "learning_rate": 0.00013660040000000001, "loss": 0.2656, "step": 158500 }, { "epoch": 0.0772, "grad_norm": 0.174117773771286, "learning_rate": 0.0001365604, "loss": 0.2639, "step": 158600 }, { "epoch": 0.0774, "grad_norm": 0.20544083416461945, "learning_rate": 0.0001365204, "loss": 0.2686, "step": 158700 }, { "epoch": 0.0776, "grad_norm": 0.219607412815094, "learning_rate": 0.0001364804, "loss": 0.2665, "step": 158800 }, { "epoch": 0.0778, "grad_norm": 0.16388162970542908, "learning_rate": 0.0001364404, "loss": 0.2676, "step": 158900 }, { "epoch": 0.078, "grad_norm": 0.2054087072610855, "learning_rate": 0.0001364004, "loss": 0.2767, "step": 159000 }, { "epoch": 0.0782, "grad_norm": 0.15745876729488373, "learning_rate": 0.0001363604, "loss": 0.2713, "step": 159100 }, { "epoch": 0.0784, "grad_norm": 0.17080196738243103, "learning_rate": 0.00013632040000000002, "loss": 0.2622, "step": 159200 }, { "epoch": 0.0786, "grad_norm": 0.2554245591163635, "learning_rate": 0.0001362804, "loss": 0.2658, "step": 159300 }, { "epoch": 0.0788, "grad_norm": 0.25696536898612976, "learning_rate": 0.0001362404, "loss": 0.2684, "step": 159400 }, { "epoch": 0.079, "grad_norm": 0.15896977484226227, "learning_rate": 0.0001362004, "loss": 0.2698, "step": 159500 }, { "epoch": 0.0792, "grad_norm": 0.17716248333454132, "learning_rate": 0.0001361604, "loss": 0.2713, "step": 159600 }, { "epoch": 0.0794, "grad_norm": 0.2042681723833084, "learning_rate": 0.00013612040000000002, "loss": 0.272, "step": 159700 }, { "epoch": 0.0796, "grad_norm": 0.21027176082134247, "learning_rate": 0.00013608039999999999, "loss": 0.2701, "step": 159800 }, { "epoch": 0.0798, "grad_norm": 0.17798981070518494, "learning_rate": 0.0001360404, "loss": 0.2722, "step": 159900 }, { "epoch": 0.08, "grad_norm": 0.21452707052230835, "learning_rate": 0.0001360004, "loss": 0.2704, "step": 160000 }, { "epoch": 0.0802, "grad_norm": 0.19280506670475006, "learning_rate": 0.00013596040000000002, "loss": 0.2714, "step": 160100 }, { "epoch": 0.0804, "grad_norm": 0.1743934005498886, "learning_rate": 0.00013592040000000001, "loss": 0.2681, "step": 160200 }, { "epoch": 0.0806, "grad_norm": 0.20632795989513397, "learning_rate": 0.0001358804, "loss": 0.2767, "step": 160300 }, { "epoch": 0.0808, "grad_norm": 0.18461468815803528, "learning_rate": 0.0001358404, "loss": 0.2716, "step": 160400 }, { "epoch": 0.081, "grad_norm": 0.18317373096942902, "learning_rate": 0.0001358004, "loss": 0.2742, "step": 160500 }, { "epoch": 0.0812, "grad_norm": 0.18154622614383698, "learning_rate": 0.00013576040000000002, "loss": 0.2647, "step": 160600 }, { "epoch": 0.0814, "grad_norm": 0.20829536020755768, "learning_rate": 0.0001357204, "loss": 0.2679, "step": 160700 }, { "epoch": 0.0816, "grad_norm": 0.19264093041419983, "learning_rate": 0.0001356804, "loss": 0.2655, "step": 160800 }, { "epoch": 0.0818, "grad_norm": 0.22138293087482452, "learning_rate": 0.0001356404, "loss": 0.272, "step": 160900 }, { "epoch": 0.082, "grad_norm": 0.18807452917099, "learning_rate": 0.00013560040000000002, "loss": 0.265, "step": 161000 }, { "epoch": 0.0822, "grad_norm": 0.1932212859392166, "learning_rate": 0.0001355604, "loss": 0.259, "step": 161100 }, { "epoch": 0.0824, "grad_norm": 0.1871035099029541, "learning_rate": 0.0001355204, "loss": 0.2631, "step": 161200 }, { "epoch": 0.0826, "grad_norm": 0.16003726422786713, "learning_rate": 0.0001354804, "loss": 0.2654, "step": 161300 }, { "epoch": 0.0828, "grad_norm": 0.1761871874332428, "learning_rate": 0.0001354404, "loss": 0.2669, "step": 161400 }, { "epoch": 0.083, "grad_norm": 0.27383115887641907, "learning_rate": 0.0001354004, "loss": 0.2621, "step": 161500 }, { "epoch": 0.0832, "grad_norm": 0.1668674498796463, "learning_rate": 0.0001353604, "loss": 0.2665, "step": 161600 }, { "epoch": 0.0834, "grad_norm": 0.19219066202640533, "learning_rate": 0.00013532040000000003, "loss": 0.2672, "step": 161700 }, { "epoch": 0.0836, "grad_norm": 0.2823545038700104, "learning_rate": 0.0001352804, "loss": 0.2655, "step": 161800 }, { "epoch": 0.0838, "grad_norm": 0.18881294131278992, "learning_rate": 0.00013524040000000001, "loss": 0.266, "step": 161900 }, { "epoch": 0.084, "grad_norm": 0.1976308971643448, "learning_rate": 0.0001352004, "loss": 0.2691, "step": 162000 }, { "epoch": 0.0842, "grad_norm": 0.22563625872135162, "learning_rate": 0.0001351604, "loss": 0.2655, "step": 162100 }, { "epoch": 0.0844, "grad_norm": 0.18993031978607178, "learning_rate": 0.00013512040000000002, "loss": 0.269, "step": 162200 }, { "epoch": 0.0846, "grad_norm": 0.2097342163324356, "learning_rate": 0.0001350804, "loss": 0.2652, "step": 162300 }, { "epoch": 0.0848, "grad_norm": 0.1664929836988449, "learning_rate": 0.0001350404, "loss": 0.2654, "step": 162400 }, { "epoch": 0.085, "grad_norm": 0.1798952966928482, "learning_rate": 0.0001350004, "loss": 0.2661, "step": 162500 }, { "epoch": 0.0852, "grad_norm": 0.17582497000694275, "learning_rate": 0.0001349604, "loss": 0.2651, "step": 162600 }, { "epoch": 0.0854, "grad_norm": 0.28709277510643005, "learning_rate": 0.00013492040000000002, "loss": 0.2669, "step": 162700 }, { "epoch": 0.0856, "grad_norm": 0.1520322561264038, "learning_rate": 0.00013488039999999998, "loss": 0.2652, "step": 162800 }, { "epoch": 0.0858, "grad_norm": 0.1492086499929428, "learning_rate": 0.0001348404, "loss": 0.2667, "step": 162900 }, { "epoch": 0.086, "grad_norm": 0.25710374116897583, "learning_rate": 0.0001348004, "loss": 0.2697, "step": 163000 }, { "epoch": 0.0862, "grad_norm": 0.20499125123023987, "learning_rate": 0.00013476040000000002, "loss": 0.264, "step": 163100 }, { "epoch": 0.0864, "grad_norm": 0.19236227869987488, "learning_rate": 0.0001347204, "loss": 0.2692, "step": 163200 }, { "epoch": 0.0866, "grad_norm": 0.1704840511083603, "learning_rate": 0.0001346804, "loss": 0.2857, "step": 163300 }, { "epoch": 0.0868, "grad_norm": 0.3140028417110443, "learning_rate": 0.0001346404, "loss": 0.2689, "step": 163400 }, { "epoch": 0.087, "grad_norm": 0.16201360523700714, "learning_rate": 0.0001346004, "loss": 0.2646, "step": 163500 }, { "epoch": 0.0872, "grad_norm": 0.22757022082805634, "learning_rate": 0.00013456040000000001, "loss": 0.266, "step": 163600 }, { "epoch": 0.0874, "grad_norm": 0.15154384076595306, "learning_rate": 0.0001345204, "loss": 0.2688, "step": 163700 }, { "epoch": 0.0876, "grad_norm": 0.2439052164554596, "learning_rate": 0.00013448040000000003, "loss": 0.2656, "step": 163800 }, { "epoch": 0.0878, "grad_norm": 0.1832723617553711, "learning_rate": 0.0001344404, "loss": 0.2657, "step": 163900 }, { "epoch": 0.088, "grad_norm": 0.2097846120595932, "learning_rate": 0.00013440040000000002, "loss": 0.2633, "step": 164000 }, { "epoch": 0.0882, "grad_norm": 0.2136882096529007, "learning_rate": 0.0001343604, "loss": 0.2756, "step": 164100 }, { "epoch": 0.0884, "grad_norm": 0.20696696639060974, "learning_rate": 0.0001343204, "loss": 0.2675, "step": 164200 }, { "epoch": 0.0886, "grad_norm": 0.2125898003578186, "learning_rate": 0.00013428040000000002, "loss": 0.2728, "step": 164300 }, { "epoch": 0.0888, "grad_norm": 0.16671323776245117, "learning_rate": 0.0001342404, "loss": 0.2667, "step": 164400 }, { "epoch": 0.089, "grad_norm": 0.18992967903614044, "learning_rate": 0.0001342004, "loss": 0.2666, "step": 164500 }, { "epoch": 0.0892, "grad_norm": 0.167401522397995, "learning_rate": 0.0001341604, "loss": 0.2649, "step": 164600 }, { "epoch": 0.0894, "grad_norm": 0.18029069900512695, "learning_rate": 0.00013412040000000002, "loss": 0.2683, "step": 164700 }, { "epoch": 0.0896, "grad_norm": 0.17287403345108032, "learning_rate": 0.00013408040000000002, "loss": 0.2697, "step": 164800 }, { "epoch": 0.0898, "grad_norm": 0.15097787976264954, "learning_rate": 0.0001340404, "loss": 0.265, "step": 164900 }, { "epoch": 0.09, "grad_norm": 0.1807047575712204, "learning_rate": 0.0001340004, "loss": 0.2669, "step": 165000 }, { "epoch": 0.0902, "grad_norm": 0.20332248508930206, "learning_rate": 0.0001339604, "loss": 0.2699, "step": 165100 }, { "epoch": 0.0904, "grad_norm": 0.24715851247310638, "learning_rate": 0.00013392040000000002, "loss": 0.2661, "step": 165200 }, { "epoch": 0.0906, "grad_norm": 0.1838693916797638, "learning_rate": 0.0001338804, "loss": 0.2645, "step": 165300 }, { "epoch": 0.0908, "grad_norm": 0.29177427291870117, "learning_rate": 0.0001338404, "loss": 0.2686, "step": 165400 }, { "epoch": 0.091, "grad_norm": 0.16363517940044403, "learning_rate": 0.0001338004, "loss": 0.2663, "step": 165500 }, { "epoch": 0.0912, "grad_norm": 0.17923811078071594, "learning_rate": 0.00013376040000000002, "loss": 0.2637, "step": 165600 }, { "epoch": 0.0914, "grad_norm": 0.14745573699474335, "learning_rate": 0.00013372040000000001, "loss": 0.2644, "step": 165700 }, { "epoch": 0.0916, "grad_norm": 0.17744286358356476, "learning_rate": 0.0001336804, "loss": 0.2697, "step": 165800 }, { "epoch": 0.0918, "grad_norm": 0.2632181942462921, "learning_rate": 0.0001336404, "loss": 0.2622, "step": 165900 }, { "epoch": 0.092, "grad_norm": 0.22580741345882416, "learning_rate": 0.0001336004, "loss": 0.2623, "step": 166000 }, { "epoch": 0.0922, "grad_norm": 0.2017957717180252, "learning_rate": 0.00013356040000000002, "loss": 0.2723, "step": 166100 }, { "epoch": 0.0924, "grad_norm": 0.20650072395801544, "learning_rate": 0.0001335204, "loss": 0.2646, "step": 166200 }, { "epoch": 0.0926, "grad_norm": 0.19651727378368378, "learning_rate": 0.0001334804, "loss": 0.2638, "step": 166300 }, { "epoch": 0.0928, "grad_norm": 0.16937686502933502, "learning_rate": 0.0001334404, "loss": 0.2627, "step": 166400 }, { "epoch": 0.093, "grad_norm": 0.16137665510177612, "learning_rate": 0.0001334004, "loss": 0.2643, "step": 166500 }, { "epoch": 0.0932, "grad_norm": 0.18096907436847687, "learning_rate": 0.0001333604, "loss": 0.2623, "step": 166600 }, { "epoch": 0.0934, "grad_norm": 0.1687513142824173, "learning_rate": 0.0001333204, "loss": 0.2679, "step": 166700 }, { "epoch": 0.0936, "grad_norm": 0.18491438031196594, "learning_rate": 0.00013328040000000003, "loss": 0.2663, "step": 166800 }, { "epoch": 0.0938, "grad_norm": 0.21767160296440125, "learning_rate": 0.0001332404, "loss": 0.2614, "step": 166900 }, { "epoch": 0.094, "grad_norm": 0.18175816535949707, "learning_rate": 0.0001332004, "loss": 0.2638, "step": 167000 }, { "epoch": 0.0942, "grad_norm": 0.17034652829170227, "learning_rate": 0.0001331604, "loss": 0.2656, "step": 167100 }, { "epoch": 0.0944, "grad_norm": 0.1829775869846344, "learning_rate": 0.0001331204, "loss": 0.2723, "step": 167200 }, { "epoch": 0.0946, "grad_norm": 0.16462214291095734, "learning_rate": 0.00013308040000000002, "loss": 0.2678, "step": 167300 }, { "epoch": 0.0948, "grad_norm": 0.18933749198913574, "learning_rate": 0.0001330404, "loss": 0.2598, "step": 167400 }, { "epoch": 0.095, "grad_norm": 0.2435947209596634, "learning_rate": 0.0001330004, "loss": 0.2648, "step": 167500 }, { "epoch": 0.0952, "grad_norm": 0.20233914256095886, "learning_rate": 0.0001329604, "loss": 0.2659, "step": 167600 }, { "epoch": 0.0954, "grad_norm": 0.17714782059192657, "learning_rate": 0.00013292040000000002, "loss": 0.2644, "step": 167700 }, { "epoch": 0.0956, "grad_norm": 0.14922687411308289, "learning_rate": 0.00013288040000000002, "loss": 0.2695, "step": 167800 }, { "epoch": 0.0958, "grad_norm": 0.1976463347673416, "learning_rate": 0.0001328404, "loss": 0.2628, "step": 167900 }, { "epoch": 0.096, "grad_norm": 0.18765133619308472, "learning_rate": 0.0001328004, "loss": 0.2612, "step": 168000 }, { "epoch": 0.0962, "grad_norm": 0.2899118959903717, "learning_rate": 0.0001327604, "loss": 0.2677, "step": 168100 }, { "epoch": 0.0964, "grad_norm": 0.20916017889976501, "learning_rate": 0.00013272040000000002, "loss": 0.2664, "step": 168200 }, { "epoch": 0.0966, "grad_norm": 0.20737506449222565, "learning_rate": 0.0001326804, "loss": 0.2585, "step": 168300 }, { "epoch": 0.0968, "grad_norm": 0.16495130956172943, "learning_rate": 0.0001326404, "loss": 0.2618, "step": 168400 }, { "epoch": 0.097, "grad_norm": 0.3457525074481964, "learning_rate": 0.0001326004, "loss": 0.2603, "step": 168500 }, { "epoch": 0.0972, "grad_norm": 0.16713711619377136, "learning_rate": 0.00013256040000000002, "loss": 0.2631, "step": 168600 }, { "epoch": 0.0974, "grad_norm": 0.4487501382827759, "learning_rate": 0.0001325204, "loss": 0.2655, "step": 168700 }, { "epoch": 0.0976, "grad_norm": 0.16808322072029114, "learning_rate": 0.0001324804, "loss": 0.265, "step": 168800 }, { "epoch": 0.0978, "grad_norm": 0.1778227984905243, "learning_rate": 0.0001324404, "loss": 0.2629, "step": 168900 }, { "epoch": 0.098, "grad_norm": 0.30036282539367676, "learning_rate": 0.0001324004, "loss": 0.269, "step": 169000 }, { "epoch": 0.0982, "grad_norm": 0.5122041702270508, "learning_rate": 0.00013236040000000001, "loss": 0.2575, "step": 169100 }, { "epoch": 0.0984, "grad_norm": 0.14579719305038452, "learning_rate": 0.0001323204, "loss": 0.2615, "step": 169200 }, { "epoch": 0.0986, "grad_norm": 0.2066614180803299, "learning_rate": 0.00013228040000000003, "loss": 0.2618, "step": 169300 }, { "epoch": 0.0988, "grad_norm": 0.20652034878730774, "learning_rate": 0.0001322404, "loss": 0.2491, "step": 169400 }, { "epoch": 0.099, "grad_norm": 0.23203743994235992, "learning_rate": 0.0001322004, "loss": 0.2618, "step": 169500 }, { "epoch": 0.0992, "grad_norm": 0.2068849503993988, "learning_rate": 0.0001321604, "loss": 0.2606, "step": 169600 }, { "epoch": 0.0994, "grad_norm": 0.19553183019161224, "learning_rate": 0.0001321204, "loss": 0.2661, "step": 169700 }, { "epoch": 0.0996, "grad_norm": 0.17411251366138458, "learning_rate": 0.00013208040000000002, "loss": 0.2633, "step": 169800 }, { "epoch": 0.0998, "grad_norm": 0.23143243789672852, "learning_rate": 0.0001320404, "loss": 0.2562, "step": 169900 }, { "epoch": 0.1, "grad_norm": 0.18598131835460663, "learning_rate": 0.0001320004, "loss": 0.2585, "step": 170000 }, { "epoch": 0.1002, "grad_norm": 0.2250274121761322, "learning_rate": 0.0001319604, "loss": 0.2631, "step": 170100 }, { "epoch": 0.1004, "grad_norm": 0.2246299684047699, "learning_rate": 0.0001319204, "loss": 0.2663, "step": 170200 }, { "epoch": 0.1006, "grad_norm": 0.18941205739974976, "learning_rate": 0.00013188040000000002, "loss": 0.2609, "step": 170300 }, { "epoch": 0.1008, "grad_norm": 0.16915076971054077, "learning_rate": 0.00013184039999999999, "loss": 0.2615, "step": 170400 }, { "epoch": 0.101, "grad_norm": 0.18507012724876404, "learning_rate": 0.0001318004, "loss": 0.2668, "step": 170500 }, { "epoch": 0.1012, "grad_norm": 0.19404035806655884, "learning_rate": 0.0001317604, "loss": 0.2604, "step": 170600 }, { "epoch": 0.1014, "grad_norm": 0.20818071067333221, "learning_rate": 0.00013172040000000002, "loss": 0.2635, "step": 170700 }, { "epoch": 0.1016, "grad_norm": 0.20802633464336395, "learning_rate": 0.00013168040000000001, "loss": 0.2608, "step": 170800 }, { "epoch": 0.1018, "grad_norm": 0.25072330236434937, "learning_rate": 0.0001316404, "loss": 0.2589, "step": 170900 }, { "epoch": 0.102, "grad_norm": 0.1490134745836258, "learning_rate": 0.0001316004, "loss": 0.2642, "step": 171000 }, { "epoch": 0.1022, "grad_norm": 0.15050648152828217, "learning_rate": 0.0001315604, "loss": 0.2625, "step": 171100 }, { "epoch": 0.1024, "grad_norm": 0.16798128187656403, "learning_rate": 0.00013152040000000002, "loss": 0.2601, "step": 171200 }, { "epoch": 0.1026, "grad_norm": 0.1618351936340332, "learning_rate": 0.0001314804, "loss": 0.2618, "step": 171300 }, { "epoch": 0.1028, "grad_norm": 0.19324688613414764, "learning_rate": 0.0001314404, "loss": 0.2617, "step": 171400 }, { "epoch": 0.103, "grad_norm": 0.16977021098136902, "learning_rate": 0.0001314004, "loss": 0.2583, "step": 171500 }, { "epoch": 0.1032, "grad_norm": 0.1766977161169052, "learning_rate": 0.00013136040000000002, "loss": 0.2617, "step": 171600 }, { "epoch": 0.1034, "grad_norm": 0.16178765892982483, "learning_rate": 0.0001313204, "loss": 0.2735, "step": 171700 }, { "epoch": 0.1036, "grad_norm": 0.2680456340312958, "learning_rate": 0.0001312804, "loss": 0.261, "step": 171800 }, { "epoch": 0.1038, "grad_norm": 0.1756458580493927, "learning_rate": 0.0001312404, "loss": 0.2736, "step": 171900 }, { "epoch": 0.104, "grad_norm": 0.14756278693675995, "learning_rate": 0.0001312004, "loss": 0.2646, "step": 172000 }, { "epoch": 0.1042, "grad_norm": 0.4272075593471527, "learning_rate": 0.0001311604, "loss": 0.2837, "step": 172100 }, { "epoch": 0.1044, "grad_norm": 0.2615519165992737, "learning_rate": 0.0001311204, "loss": 0.2742, "step": 172200 }, { "epoch": 0.1046, "grad_norm": 0.15839534997940063, "learning_rate": 0.00013108040000000003, "loss": 0.2642, "step": 172300 }, { "epoch": 0.1048, "grad_norm": 0.20636843144893646, "learning_rate": 0.0001310404, "loss": 0.2663, "step": 172400 }, { "epoch": 0.105, "grad_norm": 0.16299419105052948, "learning_rate": 0.00013100040000000001, "loss": 0.2599, "step": 172500 }, { "epoch": 0.1052, "grad_norm": 0.19725507497787476, "learning_rate": 0.0001309604, "loss": 0.2705, "step": 172600 }, { "epoch": 0.1054, "grad_norm": 0.20907683670520782, "learning_rate": 0.0001309204, "loss": 0.2637, "step": 172700 }, { "epoch": 0.1056, "grad_norm": 0.1777603030204773, "learning_rate": 0.00013088040000000002, "loss": 0.2696, "step": 172800 }, { "epoch": 0.1058, "grad_norm": 0.2649913430213928, "learning_rate": 0.0001308404, "loss": 0.2602, "step": 172900 }, { "epoch": 0.106, "grad_norm": 0.21683943271636963, "learning_rate": 0.0001308004, "loss": 0.268, "step": 173000 }, { "epoch": 0.1062, "grad_norm": 0.1863240897655487, "learning_rate": 0.0001307604, "loss": 0.2699, "step": 173100 }, { "epoch": 0.1064, "grad_norm": 0.38173478841781616, "learning_rate": 0.0001307204, "loss": 0.2741, "step": 173200 }, { "epoch": 0.1066, "grad_norm": 0.19350466132164001, "learning_rate": 0.00013068040000000002, "loss": 0.264, "step": 173300 }, { "epoch": 0.1068, "grad_norm": 0.21752768754959106, "learning_rate": 0.00013064039999999998, "loss": 0.2638, "step": 173400 }, { "epoch": 0.107, "grad_norm": 0.22547826170921326, "learning_rate": 0.0001306004, "loss": 0.2669, "step": 173500 }, { "epoch": 0.1072, "grad_norm": 0.17793551087379456, "learning_rate": 0.0001305604, "loss": 0.2668, "step": 173600 }, { "epoch": 0.1074, "grad_norm": 0.18250034749507904, "learning_rate": 0.00013052040000000002, "loss": 0.2634, "step": 173700 }, { "epoch": 0.1076, "grad_norm": 0.17843498289585114, "learning_rate": 0.0001304804, "loss": 0.2632, "step": 173800 }, { "epoch": 0.1078, "grad_norm": 0.2799086570739746, "learning_rate": 0.0001304404, "loss": 0.2619, "step": 173900 }, { "epoch": 0.108, "grad_norm": 0.2233753800392151, "learning_rate": 0.0001304004, "loss": 0.2625, "step": 174000 }, { "epoch": 0.1082, "grad_norm": 0.18286100029945374, "learning_rate": 0.0001303604, "loss": 0.2634, "step": 174100 }, { "epoch": 0.1084, "grad_norm": 0.1985454261302948, "learning_rate": 0.0001303204, "loss": 0.2626, "step": 174200 }, { "epoch": 0.1086, "grad_norm": 0.191930890083313, "learning_rate": 0.0001302804, "loss": 0.261, "step": 174300 }, { "epoch": 0.1088, "grad_norm": 0.17980682849884033, "learning_rate": 0.0001302404, "loss": 0.264, "step": 174400 }, { "epoch": 0.109, "grad_norm": 0.16460399329662323, "learning_rate": 0.0001302004, "loss": 0.2657, "step": 174500 }, { "epoch": 0.1092, "grad_norm": 0.22608372569084167, "learning_rate": 0.00013016040000000002, "loss": 0.2671, "step": 174600 }, { "epoch": 0.1094, "grad_norm": 0.22313453257083893, "learning_rate": 0.0001301204, "loss": 0.2633, "step": 174700 }, { "epoch": 0.1096, "grad_norm": 0.20686650276184082, "learning_rate": 0.0001300804, "loss": 0.2655, "step": 174800 }, { "epoch": 0.1098, "grad_norm": 0.23883718252182007, "learning_rate": 0.0001300404, "loss": 0.2633, "step": 174900 }, { "epoch": 0.11, "grad_norm": 0.19800545275211334, "learning_rate": 0.0001300004, "loss": 0.268, "step": 175000 }, { "epoch": 0.1102, "grad_norm": 0.18685345351696014, "learning_rate": 0.0001299604, "loss": 0.2678, "step": 175100 }, { "epoch": 0.1104, "grad_norm": 0.22192412614822388, "learning_rate": 0.0001299204, "loss": 0.2689, "step": 175200 }, { "epoch": 0.1106, "grad_norm": 0.20726250112056732, "learning_rate": 0.00012988040000000002, "loss": 0.2613, "step": 175300 }, { "epoch": 0.1108, "grad_norm": 0.2257356494665146, "learning_rate": 0.0001298404, "loss": 0.264, "step": 175400 }, { "epoch": 0.111, "grad_norm": 0.21950094401836395, "learning_rate": 0.0001298004, "loss": 0.2664, "step": 175500 }, { "epoch": 0.1112, "grad_norm": 0.18882116675376892, "learning_rate": 0.0001297604, "loss": 0.2719, "step": 175600 }, { "epoch": 0.1114, "grad_norm": 0.2492358237504959, "learning_rate": 0.0001297204, "loss": 0.2588, "step": 175700 }, { "epoch": 0.1116, "grad_norm": 0.15519103407859802, "learning_rate": 0.00012968040000000002, "loss": 0.2629, "step": 175800 }, { "epoch": 0.1118, "grad_norm": 0.16526469588279724, "learning_rate": 0.00012964039999999999, "loss": 0.2594, "step": 175900 }, { "epoch": 0.112, "grad_norm": 0.1781369298696518, "learning_rate": 0.0001296004, "loss": 0.2638, "step": 176000 }, { "epoch": 0.1122, "grad_norm": 0.19480067491531372, "learning_rate": 0.0001295604, "loss": 0.263, "step": 176100 }, { "epoch": 0.1124, "grad_norm": 0.20331743359565735, "learning_rate": 0.00012952040000000002, "loss": 0.2595, "step": 176200 }, { "epoch": 0.1126, "grad_norm": 0.404156357049942, "learning_rate": 0.00012948040000000001, "loss": 0.264, "step": 176300 }, { "epoch": 0.1128, "grad_norm": 0.17432494461536407, "learning_rate": 0.0001294404, "loss": 0.2616, "step": 176400 }, { "epoch": 0.113, "grad_norm": 0.17324478924274445, "learning_rate": 0.0001294004, "loss": 0.2566, "step": 176500 }, { "epoch": 0.1132, "grad_norm": 0.20663540065288544, "learning_rate": 0.0001293604, "loss": 0.2659, "step": 176600 }, { "epoch": 0.1134, "grad_norm": 0.16922618448734283, "learning_rate": 0.00012932040000000002, "loss": 0.2621, "step": 176700 }, { "epoch": 0.1136, "grad_norm": 0.1731468141078949, "learning_rate": 0.0001292804, "loss": 0.2583, "step": 176800 }, { "epoch": 0.1138, "grad_norm": 0.3375247120857239, "learning_rate": 0.0001292404, "loss": 0.2624, "step": 176900 }, { "epoch": 0.114, "grad_norm": 0.14527542889118195, "learning_rate": 0.0001292004, "loss": 0.2598, "step": 177000 }, { "epoch": 0.1142, "grad_norm": 0.18705546855926514, "learning_rate": 0.0001291604, "loss": 0.2602, "step": 177100 }, { "epoch": 0.1144, "grad_norm": 0.23862403631210327, "learning_rate": 0.0001291204, "loss": 0.2568, "step": 177200 }, { "epoch": 0.1146, "grad_norm": 0.21297907829284668, "learning_rate": 0.0001290804, "loss": 0.2687, "step": 177300 }, { "epoch": 0.1148, "grad_norm": 0.24559397995471954, "learning_rate": 0.00012904040000000003, "loss": 0.2638, "step": 177400 }, { "epoch": 0.115, "grad_norm": 0.1739484816789627, "learning_rate": 0.0001290004, "loss": 0.26, "step": 177500 }, { "epoch": 0.1152, "grad_norm": 0.1745755672454834, "learning_rate": 0.0001289604, "loss": 0.2781, "step": 177600 }, { "epoch": 0.1154, "grad_norm": 0.4448339343070984, "learning_rate": 0.0001289204, "loss": 0.2606, "step": 177700 }, { "epoch": 0.1156, "grad_norm": 0.18532177805900574, "learning_rate": 0.0001288804, "loss": 0.2695, "step": 177800 }, { "epoch": 0.1158, "grad_norm": 0.20550072193145752, "learning_rate": 0.00012884040000000002, "loss": 0.2586, "step": 177900 }, { "epoch": 0.116, "grad_norm": 0.18768703937530518, "learning_rate": 0.0001288004, "loss": 0.2581, "step": 178000 }, { "epoch": 0.1162, "grad_norm": 0.21779786050319672, "learning_rate": 0.0001287604, "loss": 0.2598, "step": 178100 }, { "epoch": 0.1164, "grad_norm": 0.16268861293792725, "learning_rate": 0.0001287204, "loss": 0.2639, "step": 178200 }, { "epoch": 0.1166, "grad_norm": 0.18373139202594757, "learning_rate": 0.00012868040000000002, "loss": 0.2573, "step": 178300 }, { "epoch": 0.1168, "grad_norm": 0.17486025393009186, "learning_rate": 0.00012864040000000002, "loss": 0.2628, "step": 178400 }, { "epoch": 0.117, "grad_norm": 0.17271597683429718, "learning_rate": 0.0001286004, "loss": 0.2603, "step": 178500 }, { "epoch": 0.1172, "grad_norm": 0.15884092450141907, "learning_rate": 0.0001285604, "loss": 0.2616, "step": 178600 }, { "epoch": 0.1174, "grad_norm": 0.22521570324897766, "learning_rate": 0.0001285204, "loss": 0.2596, "step": 178700 }, { "epoch": 0.1176, "grad_norm": 0.1907728910446167, "learning_rate": 0.00012848040000000002, "loss": 0.2585, "step": 178800 }, { "epoch": 0.1178, "grad_norm": 0.24702465534210205, "learning_rate": 0.0001284404, "loss": 0.268, "step": 178900 }, { "epoch": 0.118, "grad_norm": 0.15964145958423615, "learning_rate": 0.0001284004, "loss": 0.2633, "step": 179000 }, { "epoch": 0.1182, "grad_norm": 0.17996646463871002, "learning_rate": 0.0001283604, "loss": 0.2581, "step": 179100 }, { "epoch": 0.1184, "grad_norm": 0.1755184382200241, "learning_rate": 0.00012832040000000002, "loss": 0.26, "step": 179200 }, { "epoch": 0.1186, "grad_norm": 0.16567598283290863, "learning_rate": 0.0001282804, "loss": 0.2568, "step": 179300 }, { "epoch": 0.1188, "grad_norm": 0.16007539629936218, "learning_rate": 0.0001282404, "loss": 0.262, "step": 179400 }, { "epoch": 0.119, "grad_norm": 0.28034693002700806, "learning_rate": 0.0001282004, "loss": 0.2603, "step": 179500 }, { "epoch": 0.1192, "grad_norm": 0.28950920701026917, "learning_rate": 0.0001281604, "loss": 0.2611, "step": 179600 }, { "epoch": 0.1194, "grad_norm": 0.18529054522514343, "learning_rate": 0.00012812040000000001, "loss": 0.2574, "step": 179700 }, { "epoch": 0.1196, "grad_norm": 0.18269339203834534, "learning_rate": 0.0001280804, "loss": 0.2644, "step": 179800 }, { "epoch": 0.1198, "grad_norm": 0.27880364656448364, "learning_rate": 0.00012804040000000003, "loss": 0.2618, "step": 179900 }, { "epoch": 0.12, "grad_norm": 0.1725231260061264, "learning_rate": 0.0001280004, "loss": 0.2599, "step": 180000 }, { "epoch": 0.1202, "grad_norm": 0.2305331975221634, "learning_rate": 0.00012796040000000002, "loss": 0.2625, "step": 180100 }, { "epoch": 0.1204, "grad_norm": 0.14379462599754333, "learning_rate": 0.0001279204, "loss": 0.2627, "step": 180200 }, { "epoch": 0.1206, "grad_norm": 0.16537553071975708, "learning_rate": 0.0001278804, "loss": 0.2721, "step": 180300 }, { "epoch": 0.1208, "grad_norm": 0.21422922611236572, "learning_rate": 0.00012784040000000002, "loss": 0.2627, "step": 180400 }, { "epoch": 0.121, "grad_norm": 0.1767064481973648, "learning_rate": 0.0001278004, "loss": 0.2594, "step": 180500 }, { "epoch": 0.1212, "grad_norm": 0.23920567333698273, "learning_rate": 0.0001277604, "loss": 0.2595, "step": 180600 }, { "epoch": 0.1214, "grad_norm": 0.16521887481212616, "learning_rate": 0.0001277204, "loss": 0.2612, "step": 180700 }, { "epoch": 0.1216, "grad_norm": 0.1697589010000229, "learning_rate": 0.0001276804, "loss": 0.2619, "step": 180800 }, { "epoch": 0.1218, "grad_norm": 0.20626257359981537, "learning_rate": 0.00012764040000000002, "loss": 0.2642, "step": 180900 }, { "epoch": 0.122, "grad_norm": 0.16152922809123993, "learning_rate": 0.00012760039999999999, "loss": 0.2637, "step": 181000 }, { "epoch": 0.1222, "grad_norm": 0.15324938297271729, "learning_rate": 0.0001275604, "loss": 0.2557, "step": 181100 }, { "epoch": 0.1224, "grad_norm": 0.18400631844997406, "learning_rate": 0.0001275204, "loss": 0.2643, "step": 181200 }, { "epoch": 0.1226, "grad_norm": 0.1637372076511383, "learning_rate": 0.00012748040000000002, "loss": 0.2673, "step": 181300 }, { "epoch": 0.1228, "grad_norm": 0.23258568346500397, "learning_rate": 0.00012744040000000001, "loss": 0.2619, "step": 181400 }, { "epoch": 0.123, "grad_norm": 0.16462406516075134, "learning_rate": 0.0001274004, "loss": 0.2589, "step": 181500 }, { "epoch": 0.1232, "grad_norm": 0.19165945053100586, "learning_rate": 0.0001273604, "loss": 0.2649, "step": 181600 }, { "epoch": 0.1234, "grad_norm": 0.23923306167125702, "learning_rate": 0.0001273204, "loss": 0.2711, "step": 181700 }, { "epoch": 0.1236, "grad_norm": 0.23697294294834137, "learning_rate": 0.00012728040000000002, "loss": 0.2658, "step": 181800 }, { "epoch": 0.1238, "grad_norm": 0.24161309003829956, "learning_rate": 0.0001272404, "loss": 0.258, "step": 181900 }, { "epoch": 0.124, "grad_norm": 0.1704692542552948, "learning_rate": 0.0001272004, "loss": 0.2615, "step": 182000 }, { "epoch": 0.1242, "grad_norm": 0.16872115433216095, "learning_rate": 0.0001271604, "loss": 0.2674, "step": 182100 }, { "epoch": 0.1244, "grad_norm": 0.20729516446590424, "learning_rate": 0.00012712040000000002, "loss": 0.2585, "step": 182200 }, { "epoch": 0.1246, "grad_norm": 0.20898325741291046, "learning_rate": 0.0001270804, "loss": 0.2627, "step": 182300 }, { "epoch": 0.1248, "grad_norm": 0.1441413015127182, "learning_rate": 0.0001270404, "loss": 0.2603, "step": 182400 }, { "epoch": 0.125, "grad_norm": 0.18905945122241974, "learning_rate": 0.0001270004, "loss": 0.254, "step": 182500 }, { "epoch": 0.1252, "grad_norm": 0.17578963935375214, "learning_rate": 0.0001269604, "loss": 0.2608, "step": 182600 }, { "epoch": 0.1254, "grad_norm": 0.23273304104804993, "learning_rate": 0.0001269204, "loss": 0.2625, "step": 182700 }, { "epoch": 0.1256, "grad_norm": 0.228327676653862, "learning_rate": 0.0001268804, "loss": 0.2598, "step": 182800 }, { "epoch": 0.1258, "grad_norm": 0.1630987524986267, "learning_rate": 0.00012684040000000003, "loss": 0.2645, "step": 182900 }, { "epoch": 0.126, "grad_norm": 0.18932506442070007, "learning_rate": 0.0001268004, "loss": 0.2633, "step": 183000 }, { "epoch": 0.1262, "grad_norm": 0.18664585053920746, "learning_rate": 0.00012676040000000001, "loss": 0.2627, "step": 183100 }, { "epoch": 0.1264, "grad_norm": 0.2066875547170639, "learning_rate": 0.0001267204, "loss": 0.2598, "step": 183200 }, { "epoch": 0.1266, "grad_norm": 0.20927661657333374, "learning_rate": 0.0001266804, "loss": 0.2659, "step": 183300 }, { "epoch": 0.1268, "grad_norm": 0.204673171043396, "learning_rate": 0.00012664040000000002, "loss": 0.2605, "step": 183400 }, { "epoch": 0.127, "grad_norm": 0.23662003874778748, "learning_rate": 0.0001266004, "loss": 0.2638, "step": 183500 }, { "epoch": 0.1272, "grad_norm": 0.30359625816345215, "learning_rate": 0.0001265604, "loss": 0.2597, "step": 183600 }, { "epoch": 0.1274, "grad_norm": 0.2385796308517456, "learning_rate": 0.0001265204, "loss": 0.2633, "step": 183700 }, { "epoch": 0.1276, "grad_norm": 0.3185318112373352, "learning_rate": 0.00012648040000000002, "loss": 0.2609, "step": 183800 }, { "epoch": 0.1278, "grad_norm": 0.17162935435771942, "learning_rate": 0.00012644040000000002, "loss": 0.2624, "step": 183900 }, { "epoch": 0.128, "grad_norm": 0.16286444664001465, "learning_rate": 0.00012640039999999998, "loss": 0.2602, "step": 184000 }, { "epoch": 0.1282, "grad_norm": 0.15972454845905304, "learning_rate": 0.0001263604, "loss": 0.2578, "step": 184100 }, { "epoch": 0.1284, "grad_norm": 0.1734430193901062, "learning_rate": 0.0001263204, "loss": 0.2639, "step": 184200 }, { "epoch": 0.1286, "grad_norm": 0.16511160135269165, "learning_rate": 0.00012628040000000002, "loss": 0.2606, "step": 184300 }, { "epoch": 0.1288, "grad_norm": 0.22905713319778442, "learning_rate": 0.0001262404, "loss": 0.258, "step": 184400 }, { "epoch": 0.129, "grad_norm": 0.16669723391532898, "learning_rate": 0.0001262004, "loss": 0.2582, "step": 184500 }, { "epoch": 0.1292, "grad_norm": 0.18425410985946655, "learning_rate": 0.0001261604, "loss": 0.2649, "step": 184600 }, { "epoch": 0.1294, "grad_norm": 0.18333709239959717, "learning_rate": 0.0001261204, "loss": 0.2636, "step": 184700 }, { "epoch": 0.1296, "grad_norm": 0.19844196736812592, "learning_rate": 0.0001260804, "loss": 0.263, "step": 184800 }, { "epoch": 0.1298, "grad_norm": 0.21080031991004944, "learning_rate": 0.0001260404, "loss": 0.2617, "step": 184900 }, { "epoch": 0.13, "grad_norm": 0.16929513216018677, "learning_rate": 0.0001260004, "loss": 0.2637, "step": 185000 }, { "epoch": 0.1302, "grad_norm": 0.17420923709869385, "learning_rate": 0.0001259604, "loss": 0.2655, "step": 185100 }, { "epoch": 0.1304, "grad_norm": 0.15296240150928497, "learning_rate": 0.00012592040000000001, "loss": 0.2654, "step": 185200 }, { "epoch": 0.1306, "grad_norm": 0.18833494186401367, "learning_rate": 0.0001258804, "loss": 0.2657, "step": 185300 }, { "epoch": 0.1308, "grad_norm": 0.1517093926668167, "learning_rate": 0.0001258404, "loss": 0.2694, "step": 185400 }, { "epoch": 0.131, "grad_norm": 0.22326596081256866, "learning_rate": 0.0001258004, "loss": 0.2616, "step": 185500 }, { "epoch": 0.1312, "grad_norm": 0.17264465987682343, "learning_rate": 0.0001257604, "loss": 0.2618, "step": 185600 }, { "epoch": 0.1314, "grad_norm": 0.28218379616737366, "learning_rate": 0.0001257204, "loss": 0.2641, "step": 185700 }, { "epoch": 0.1316, "grad_norm": 0.19472543895244598, "learning_rate": 0.0001256804, "loss": 0.2638, "step": 185800 }, { "epoch": 0.1318, "grad_norm": 0.17265281081199646, "learning_rate": 0.00012564040000000002, "loss": 0.2624, "step": 185900 }, { "epoch": 0.132, "grad_norm": 0.16616639494895935, "learning_rate": 0.0001256004, "loss": 0.2569, "step": 186000 }, { "epoch": 0.1322, "grad_norm": 0.2015598714351654, "learning_rate": 0.0001255604, "loss": 0.2739, "step": 186100 }, { "epoch": 0.1324, "grad_norm": 0.16873787343502045, "learning_rate": 0.0001255204, "loss": 0.2643, "step": 186200 }, { "epoch": 0.1326, "grad_norm": 0.3382190763950348, "learning_rate": 0.0001254804, "loss": 0.2612, "step": 186300 }, { "epoch": 0.1328, "grad_norm": 0.1990320235490799, "learning_rate": 0.00012544040000000002, "loss": 0.2641, "step": 186400 }, { "epoch": 0.133, "grad_norm": 0.17715241014957428, "learning_rate": 0.00012540039999999999, "loss": 0.2619, "step": 186500 }, { "epoch": 0.1332, "grad_norm": 0.18182724714279175, "learning_rate": 0.0001253604, "loss": 0.2632, "step": 186600 }, { "epoch": 0.1334, "grad_norm": 0.24784061312675476, "learning_rate": 0.0001253204, "loss": 0.2643, "step": 186700 }, { "epoch": 0.1336, "grad_norm": 0.1730160117149353, "learning_rate": 0.00012528040000000002, "loss": 0.2599, "step": 186800 }, { "epoch": 0.1338, "grad_norm": 0.25901031494140625, "learning_rate": 0.00012524040000000001, "loss": 0.2627, "step": 186900 }, { "epoch": 0.134, "grad_norm": 0.2513553500175476, "learning_rate": 0.0001252004, "loss": 0.2601, "step": 187000 }, { "epoch": 0.1342, "grad_norm": 0.18127425014972687, "learning_rate": 0.0001251604, "loss": 0.2688, "step": 187100 }, { "epoch": 0.1344, "grad_norm": 0.16403953731060028, "learning_rate": 0.0001251204, "loss": 0.2604, "step": 187200 }, { "epoch": 0.1346, "grad_norm": 0.19153177738189697, "learning_rate": 0.00012508040000000002, "loss": 0.2607, "step": 187300 }, { "epoch": 0.1348, "grad_norm": 0.17384052276611328, "learning_rate": 0.0001250404, "loss": 0.2579, "step": 187400 }, { "epoch": 0.135, "grad_norm": 0.17870062589645386, "learning_rate": 0.0001250004, "loss": 0.2605, "step": 187500 }, { "epoch": 0.1352, "grad_norm": 0.16759884357452393, "learning_rate": 0.0001249604, "loss": 0.2579, "step": 187600 }, { "epoch": 0.1354, "grad_norm": 0.19198445975780487, "learning_rate": 0.0001249204, "loss": 0.2583, "step": 187700 }, { "epoch": 0.1356, "grad_norm": 0.17366041243076324, "learning_rate": 0.0001248804, "loss": 0.2566, "step": 187800 }, { "epoch": 0.1358, "grad_norm": 0.3349681794643402, "learning_rate": 0.0001248404, "loss": 0.264, "step": 187900 }, { "epoch": 0.136, "grad_norm": 0.21217289566993713, "learning_rate": 0.0001248004, "loss": 0.2657, "step": 188000 }, { "epoch": 0.1362, "grad_norm": 0.17328070104122162, "learning_rate": 0.0001247604, "loss": 0.2552, "step": 188100 }, { "epoch": 0.1364, "grad_norm": 0.3321928381919861, "learning_rate": 0.0001247204, "loss": 0.2617, "step": 188200 }, { "epoch": 0.1366, "grad_norm": 0.16838262975215912, "learning_rate": 0.0001246804, "loss": 0.2674, "step": 188300 }, { "epoch": 0.1368, "grad_norm": 0.19490031898021698, "learning_rate": 0.0001246404, "loss": 0.2651, "step": 188400 }, { "epoch": 0.137, "grad_norm": 0.2795969247817993, "learning_rate": 0.0001246004, "loss": 0.2598, "step": 188500 }, { "epoch": 0.1372, "grad_norm": 0.18094158172607422, "learning_rate": 0.0001245604, "loss": 0.2631, "step": 188600 }, { "epoch": 0.1374, "grad_norm": 0.17936332523822784, "learning_rate": 0.0001245204, "loss": 0.2599, "step": 188700 }, { "epoch": 0.1376, "grad_norm": 0.4011989235877991, "learning_rate": 0.0001244804, "loss": 0.2577, "step": 188800 }, { "epoch": 0.1378, "grad_norm": 0.1783379763364792, "learning_rate": 0.00012444040000000002, "loss": 0.2562, "step": 188900 }, { "epoch": 0.138, "grad_norm": 0.18040119111537933, "learning_rate": 0.00012440040000000002, "loss": 0.2579, "step": 189000 }, { "epoch": 0.1382, "grad_norm": 0.181350976228714, "learning_rate": 0.0001243604, "loss": 0.2662, "step": 189100 }, { "epoch": 0.1384, "grad_norm": 0.18030929565429688, "learning_rate": 0.0001243204, "loss": 0.2588, "step": 189200 }, { "epoch": 0.1386, "grad_norm": 0.19084039330482483, "learning_rate": 0.0001242804, "loss": 0.2654, "step": 189300 }, { "epoch": 0.1388, "grad_norm": 0.23569531738758087, "learning_rate": 0.00012424040000000002, "loss": 0.259, "step": 189400 }, { "epoch": 0.139, "grad_norm": 0.2429647147655487, "learning_rate": 0.0001242004, "loss": 0.2581, "step": 189500 }, { "epoch": 0.1392, "grad_norm": 0.20208866894245148, "learning_rate": 0.0001241604, "loss": 0.2563, "step": 189600 }, { "epoch": 0.1394, "grad_norm": 0.1706075370311737, "learning_rate": 0.0001241204, "loss": 0.2634, "step": 189700 }, { "epoch": 0.1396, "grad_norm": 0.1640571802854538, "learning_rate": 0.00012408040000000002, "loss": 0.2582, "step": 189800 }, { "epoch": 0.1398, "grad_norm": 0.22723062336444855, "learning_rate": 0.0001240404, "loss": 0.2603, "step": 189900 }, { "epoch": 0.14, "grad_norm": 0.16876673698425293, "learning_rate": 0.0001240004, "loss": 0.2636, "step": 190000 }, { "epoch": 0.1402, "grad_norm": 0.2153484970331192, "learning_rate": 0.0001239604, "loss": 0.2586, "step": 190100 }, { "epoch": 0.1404, "grad_norm": 0.21755315363407135, "learning_rate": 0.0001239204, "loss": 0.2565, "step": 190200 }, { "epoch": 0.1406, "grad_norm": 0.18556717038154602, "learning_rate": 0.00012388040000000001, "loss": 0.2631, "step": 190300 }, { "epoch": 0.1408, "grad_norm": 0.19718804955482483, "learning_rate": 0.0001238404, "loss": 0.2593, "step": 190400 }, { "epoch": 0.141, "grad_norm": 0.2352330982685089, "learning_rate": 0.00012380040000000003, "loss": 0.2606, "step": 190500 }, { "epoch": 0.1412, "grad_norm": 0.22349053621292114, "learning_rate": 0.0001237604, "loss": 0.2614, "step": 190600 }, { "epoch": 0.1414, "grad_norm": 0.27996236085891724, "learning_rate": 0.00012372040000000002, "loss": 0.2651, "step": 190700 }, { "epoch": 0.1416, "grad_norm": 0.21903091669082642, "learning_rate": 0.0001236804, "loss": 0.2577, "step": 190800 }, { "epoch": 0.1418, "grad_norm": 0.1741315722465515, "learning_rate": 0.0001236404, "loss": 0.2526, "step": 190900 }, { "epoch": 0.142, "grad_norm": 0.2520033121109009, "learning_rate": 0.00012360040000000002, "loss": 0.259, "step": 191000 }, { "epoch": 0.1422, "grad_norm": 0.23828014731407166, "learning_rate": 0.0001235604, "loss": 0.2594, "step": 191100 }, { "epoch": 0.1424, "grad_norm": 0.16912546753883362, "learning_rate": 0.0001235204, "loss": 0.2616, "step": 191200 }, { "epoch": 0.1426, "grad_norm": 0.17198620736598969, "learning_rate": 0.0001234804, "loss": 0.256, "step": 191300 }, { "epoch": 0.1428, "grad_norm": 0.17525877058506012, "learning_rate": 0.0001234404, "loss": 0.2601, "step": 191400 }, { "epoch": 0.143, "grad_norm": 0.16216328740119934, "learning_rate": 0.00012340040000000002, "loss": 0.2641, "step": 191500 }, { "epoch": 0.1432, "grad_norm": 0.1828595995903015, "learning_rate": 0.00012336039999999999, "loss": 0.2582, "step": 191600 }, { "epoch": 0.1434, "grad_norm": 0.16869176924228668, "learning_rate": 0.0001233204, "loss": 0.2554, "step": 191700 }, { "epoch": 0.1436, "grad_norm": 0.19655516743659973, "learning_rate": 0.0001232804, "loss": 0.2663, "step": 191800 }, { "epoch": 0.1438, "grad_norm": 0.20977061986923218, "learning_rate": 0.00012324040000000002, "loss": 0.2565, "step": 191900 }, { "epoch": 0.144, "grad_norm": 0.22906339168548584, "learning_rate": 0.00012320040000000001, "loss": 0.2553, "step": 192000 }, { "epoch": 0.1442, "grad_norm": 0.1683686375617981, "learning_rate": 0.0001231604, "loss": 0.2581, "step": 192100 }, { "epoch": 0.1444, "grad_norm": 0.20973190665245056, "learning_rate": 0.0001231204, "loss": 0.2642, "step": 192200 }, { "epoch": 0.1446, "grad_norm": 0.14577773213386536, "learning_rate": 0.0001230804, "loss": 0.26, "step": 192300 }, { "epoch": 0.1448, "grad_norm": 0.19426365196704865, "learning_rate": 0.00012304040000000002, "loss": 0.2622, "step": 192400 }, { "epoch": 0.145, "grad_norm": 0.17555978894233704, "learning_rate": 0.0001230004, "loss": 0.2598, "step": 192500 }, { "epoch": 0.1452, "grad_norm": 0.2241448312997818, "learning_rate": 0.0001229604, "loss": 0.2549, "step": 192600 }, { "epoch": 0.1454, "grad_norm": 0.24879616498947144, "learning_rate": 0.0001229204, "loss": 0.2525, "step": 192700 }, { "epoch": 0.1456, "grad_norm": 0.18659988045692444, "learning_rate": 0.00012288040000000002, "loss": 0.2635, "step": 192800 }, { "epoch": 0.1458, "grad_norm": 0.19412420690059662, "learning_rate": 0.0001228404, "loss": 0.2553, "step": 192900 }, { "epoch": 0.146, "grad_norm": 0.16501417756080627, "learning_rate": 0.0001228004, "loss": 0.2621, "step": 193000 }, { "epoch": 0.1462, "grad_norm": 0.18333154916763306, "learning_rate": 0.0001227604, "loss": 0.2545, "step": 193100 }, { "epoch": 0.1464, "grad_norm": 0.25169798731803894, "learning_rate": 0.0001227204, "loss": 0.26, "step": 193200 }, { "epoch": 0.1466, "grad_norm": 0.18788355588912964, "learning_rate": 0.0001226804, "loss": 0.2575, "step": 193300 }, { "epoch": 0.1468, "grad_norm": 0.23078390955924988, "learning_rate": 0.0001226404, "loss": 0.2717, "step": 193400 }, { "epoch": 0.147, "grad_norm": 0.18893170356750488, "learning_rate": 0.00012260040000000003, "loss": 0.2572, "step": 193500 }, { "epoch": 0.1472, "grad_norm": 0.20808455348014832, "learning_rate": 0.0001225604, "loss": 0.2576, "step": 193600 }, { "epoch": 0.1474, "grad_norm": 0.1902913898229599, "learning_rate": 0.00012252040000000001, "loss": 0.2568, "step": 193700 }, { "epoch": 0.1476, "grad_norm": 0.17236173152923584, "learning_rate": 0.0001224804, "loss": 0.2541, "step": 193800 }, { "epoch": 0.1478, "grad_norm": 0.22261063754558563, "learning_rate": 0.0001224404, "loss": 0.2572, "step": 193900 }, { "epoch": 0.148, "grad_norm": 0.21105404198169708, "learning_rate": 0.00012240040000000002, "loss": 0.2595, "step": 194000 }, { "epoch": 0.1482, "grad_norm": 0.1775384098291397, "learning_rate": 0.0001223604, "loss": 0.2566, "step": 194100 }, { "epoch": 0.1484, "grad_norm": 0.16679984331130981, "learning_rate": 0.0001223204, "loss": 0.2567, "step": 194200 }, { "epoch": 0.1486, "grad_norm": 0.15811361372470856, "learning_rate": 0.0001222804, "loss": 0.2627, "step": 194300 }, { "epoch": 0.1488, "grad_norm": 0.1975756138563156, "learning_rate": 0.00012224040000000002, "loss": 0.2629, "step": 194400 }, { "epoch": 0.149, "grad_norm": 0.2405877411365509, "learning_rate": 0.00012220040000000002, "loss": 0.2591, "step": 194500 }, { "epoch": 0.1492, "grad_norm": 0.1887446641921997, "learning_rate": 0.0001221604, "loss": 0.26, "step": 194600 }, { "epoch": 0.1494, "grad_norm": 0.23812128603458405, "learning_rate": 0.0001221204, "loss": 0.2602, "step": 194700 }, { "epoch": 0.1496, "grad_norm": 0.2019837647676468, "learning_rate": 0.0001220804, "loss": 0.2614, "step": 194800 }, { "epoch": 0.1498, "grad_norm": 0.18969768285751343, "learning_rate": 0.0001220404, "loss": 0.2597, "step": 194900 }, { "epoch": 0.15, "grad_norm": 0.2558167278766632, "learning_rate": 0.00012200040000000001, "loss": 0.2601, "step": 195000 }, { "epoch": 0.1502, "grad_norm": 0.16301903128623962, "learning_rate": 0.00012196039999999999, "loss": 0.2613, "step": 195100 }, { "epoch": 0.1504, "grad_norm": 0.19650568068027496, "learning_rate": 0.0001219204, "loss": 0.2643, "step": 195200 }, { "epoch": 0.1506, "grad_norm": 0.19182758033275604, "learning_rate": 0.0001218804, "loss": 0.2622, "step": 195300 }, { "epoch": 0.1508, "grad_norm": 0.16109509766101837, "learning_rate": 0.00012184040000000001, "loss": 0.2567, "step": 195400 }, { "epoch": 0.151, "grad_norm": 0.20243091881275177, "learning_rate": 0.00012180040000000002, "loss": 0.2561, "step": 195500 }, { "epoch": 0.1512, "grad_norm": 0.18910594284534454, "learning_rate": 0.0001217604, "loss": 0.2589, "step": 195600 }, { "epoch": 0.1514, "grad_norm": 0.17565961182117462, "learning_rate": 0.00012172040000000001, "loss": 0.2575, "step": 195700 }, { "epoch": 0.1516, "grad_norm": 0.18943890929222107, "learning_rate": 0.0001216804, "loss": 0.262, "step": 195800 }, { "epoch": 0.1518, "grad_norm": 0.1772325187921524, "learning_rate": 0.00012164040000000001, "loss": 0.2659, "step": 195900 }, { "epoch": 0.152, "grad_norm": 0.17579589784145355, "learning_rate": 0.00012160040000000002, "loss": 0.2611, "step": 196000 }, { "epoch": 0.1522, "grad_norm": 0.1820620447397232, "learning_rate": 0.0001215604, "loss": 0.2621, "step": 196100 }, { "epoch": 0.1524, "grad_norm": 0.20846767723560333, "learning_rate": 0.0001215204, "loss": 0.2607, "step": 196200 }, { "epoch": 0.1526, "grad_norm": 0.20666386187076569, "learning_rate": 0.00012148040000000001, "loss": 0.2587, "step": 196300 }, { "epoch": 0.1528, "grad_norm": 0.3281834125518799, "learning_rate": 0.0001214404, "loss": 0.2539, "step": 196400 }, { "epoch": 0.153, "grad_norm": 0.20208649337291718, "learning_rate": 0.00012140040000000001, "loss": 0.2578, "step": 196500 }, { "epoch": 0.1532, "grad_norm": 0.28220322728157043, "learning_rate": 0.00012136039999999999, "loss": 0.2558, "step": 196600 }, { "epoch": 0.1534, "grad_norm": 0.33225101232528687, "learning_rate": 0.0001213204, "loss": 0.2661, "step": 196700 }, { "epoch": 0.1536, "grad_norm": 0.24750415980815887, "learning_rate": 0.0001212804, "loss": 0.2624, "step": 196800 }, { "epoch": 0.1538, "grad_norm": 0.1799037754535675, "learning_rate": 0.00012124040000000001, "loss": 0.2584, "step": 196900 }, { "epoch": 0.154, "grad_norm": 0.20016971230506897, "learning_rate": 0.00012120040000000002, "loss": 0.2599, "step": 197000 }, { "epoch": 0.1542, "grad_norm": 0.18166713416576385, "learning_rate": 0.0001211604, "loss": 0.2567, "step": 197100 }, { "epoch": 0.1544, "grad_norm": 0.1939893662929535, "learning_rate": 0.0001211204, "loss": 0.2646, "step": 197200 }, { "epoch": 0.1546, "grad_norm": 0.18318818509578705, "learning_rate": 0.0001210804, "loss": 0.2552, "step": 197300 }, { "epoch": 0.1548, "grad_norm": 0.27068910002708435, "learning_rate": 0.00012104040000000001, "loss": 0.2594, "step": 197400 }, { "epoch": 0.155, "grad_norm": 0.20049890875816345, "learning_rate": 0.00012100040000000001, "loss": 0.2569, "step": 197500 }, { "epoch": 0.1552, "grad_norm": 0.1976662427186966, "learning_rate": 0.0001209604, "loss": 0.2544, "step": 197600 }, { "epoch": 0.1554, "grad_norm": 0.17978644371032715, "learning_rate": 0.0001209204, "loss": 0.2626, "step": 197700 }, { "epoch": 0.1556, "grad_norm": 0.18449051678180695, "learning_rate": 0.00012088040000000001, "loss": 0.2609, "step": 197800 }, { "epoch": 0.1558, "grad_norm": 0.21777702867984772, "learning_rate": 0.0001208404, "loss": 0.2557, "step": 197900 }, { "epoch": 0.156, "grad_norm": 0.34685173630714417, "learning_rate": 0.00012080040000000001, "loss": 0.2598, "step": 198000 }, { "epoch": 0.1562, "grad_norm": 0.21102797985076904, "learning_rate": 0.00012076039999999999, "loss": 0.258, "step": 198100 }, { "epoch": 0.1564, "grad_norm": 0.1801084280014038, "learning_rate": 0.0001207204, "loss": 0.2545, "step": 198200 }, { "epoch": 0.1566, "grad_norm": 0.24270586669445038, "learning_rate": 0.0001206804, "loss": 0.2594, "step": 198300 }, { "epoch": 0.1568, "grad_norm": 0.1699536144733429, "learning_rate": 0.00012064040000000001, "loss": 0.2665, "step": 198400 }, { "epoch": 0.157, "grad_norm": 0.15559914708137512, "learning_rate": 0.00012060040000000002, "loss": 0.2577, "step": 198500 }, { "epoch": 0.1572, "grad_norm": 0.1730017513036728, "learning_rate": 0.0001205604, "loss": 0.2578, "step": 198600 }, { "epoch": 0.1574, "grad_norm": 0.18403738737106323, "learning_rate": 0.0001205204, "loss": 0.257, "step": 198700 }, { "epoch": 0.1576, "grad_norm": 0.17950795590877533, "learning_rate": 0.0001204804, "loss": 0.2581, "step": 198800 }, { "epoch": 0.1578, "grad_norm": 0.18229971826076508, "learning_rate": 0.0001204404, "loss": 0.2611, "step": 198900 }, { "epoch": 0.158, "grad_norm": 0.1665981113910675, "learning_rate": 0.00012040040000000001, "loss": 0.2562, "step": 199000 }, { "epoch": 0.1582, "grad_norm": 0.1657390296459198, "learning_rate": 0.0001203604, "loss": 0.2629, "step": 199100 }, { "epoch": 0.1584, "grad_norm": 0.18978402018547058, "learning_rate": 0.0001203204, "loss": 0.2613, "step": 199200 }, { "epoch": 0.1586, "grad_norm": 0.18768629431724548, "learning_rate": 0.00012028040000000001, "loss": 0.2564, "step": 199300 }, { "epoch": 0.1588, "grad_norm": 0.3750452697277069, "learning_rate": 0.00012024040000000002, "loss": 0.2577, "step": 199400 }, { "epoch": 0.159, "grad_norm": 0.25771307945251465, "learning_rate": 0.00012020040000000001, "loss": 0.2642, "step": 199500 }, { "epoch": 0.1592, "grad_norm": 0.27435046434402466, "learning_rate": 0.00012016039999999999, "loss": 0.263, "step": 199600 }, { "epoch": 0.1594, "grad_norm": 0.1829531192779541, "learning_rate": 0.0001201204, "loss": 0.2606, "step": 199700 }, { "epoch": 0.1596, "grad_norm": 0.14673271775245667, "learning_rate": 0.0001200804, "loss": 0.2575, "step": 199800 }, { "epoch": 0.1598, "grad_norm": 0.19495797157287598, "learning_rate": 0.00012004040000000001, "loss": 0.2545, "step": 199900 }, { "epoch": 0.16, "grad_norm": 0.20886088907718658, "learning_rate": 0.00012000040000000002, "loss": 0.2584, "step": 200000 }, { "epoch": 0.1602, "grad_norm": 0.18301694095134735, "learning_rate": 0.0001199604, "loss": 0.2581, "step": 200100 }, { "epoch": 0.1604, "grad_norm": 0.1781015843153, "learning_rate": 0.0001199204, "loss": 0.2583, "step": 200200 }, { "epoch": 0.1606, "grad_norm": 0.15711505711078644, "learning_rate": 0.0001198804, "loss": 0.2594, "step": 200300 }, { "epoch": 0.1608, "grad_norm": 0.21567769348621368, "learning_rate": 0.0001198404, "loss": 0.2553, "step": 200400 }, { "epoch": 0.161, "grad_norm": 0.18019573390483856, "learning_rate": 0.00011980040000000001, "loss": 0.2653, "step": 200500 }, { "epoch": 0.1612, "grad_norm": 0.1789156198501587, "learning_rate": 0.00011976039999999999, "loss": 0.2604, "step": 200600 }, { "epoch": 0.1614, "grad_norm": 0.26856061816215515, "learning_rate": 0.0001197204, "loss": 0.2625, "step": 200700 }, { "epoch": 0.1616, "grad_norm": 0.16721145808696747, "learning_rate": 0.00011968040000000001, "loss": 0.2628, "step": 200800 }, { "epoch": 0.1618, "grad_norm": 0.1759210228919983, "learning_rate": 0.00011964040000000001, "loss": 0.2534, "step": 200900 }, { "epoch": 0.162, "grad_norm": 0.15787266194820404, "learning_rate": 0.00011960040000000001, "loss": 0.2586, "step": 201000 }, { "epoch": 0.1622, "grad_norm": 0.1680240035057068, "learning_rate": 0.0001195604, "loss": 0.2583, "step": 201100 }, { "epoch": 0.1624, "grad_norm": 0.19923582673072815, "learning_rate": 0.0001195204, "loss": 0.2637, "step": 201200 }, { "epoch": 0.1626, "grad_norm": 0.33889421820640564, "learning_rate": 0.0001194804, "loss": 0.261, "step": 201300 }, { "epoch": 0.1628, "grad_norm": 0.22548066079616547, "learning_rate": 0.00011944040000000001, "loss": 0.2557, "step": 201400 }, { "epoch": 0.163, "grad_norm": 0.2182902842760086, "learning_rate": 0.00011940040000000002, "loss": 0.2593, "step": 201500 }, { "epoch": 0.1632, "grad_norm": 0.27033495903015137, "learning_rate": 0.00011936040000000001, "loss": 0.2588, "step": 201600 }, { "epoch": 0.1634, "grad_norm": 0.3093793988227844, "learning_rate": 0.0001193204, "loss": 0.2561, "step": 201700 }, { "epoch": 0.1636, "grad_norm": 0.20741653442382812, "learning_rate": 0.0001192804, "loss": 0.2623, "step": 201800 }, { "epoch": 0.1638, "grad_norm": 0.2333420366048813, "learning_rate": 0.0001192404, "loss": 0.2574, "step": 201900 }, { "epoch": 0.164, "grad_norm": 0.20007722079753876, "learning_rate": 0.00011920040000000001, "loss": 0.2618, "step": 202000 }, { "epoch": 0.1642, "grad_norm": 0.18958450853824615, "learning_rate": 0.00011916040000000002, "loss": 0.2545, "step": 202100 }, { "epoch": 0.1644, "grad_norm": 0.22179192304611206, "learning_rate": 0.0001191204, "loss": 0.2637, "step": 202200 }, { "epoch": 0.1646, "grad_norm": 0.1823706179857254, "learning_rate": 0.0001190804, "loss": 0.2567, "step": 202300 }, { "epoch": 0.1648, "grad_norm": 0.19839805364608765, "learning_rate": 0.00011904040000000001, "loss": 0.2582, "step": 202400 }, { "epoch": 0.165, "grad_norm": 0.1794578582048416, "learning_rate": 0.0001190004, "loss": 0.2563, "step": 202500 }, { "epoch": 0.1652, "grad_norm": 0.1632799506187439, "learning_rate": 0.00011896040000000001, "loss": 0.2544, "step": 202600 }, { "epoch": 0.1654, "grad_norm": 0.14698299765586853, "learning_rate": 0.0001189204, "loss": 0.2583, "step": 202700 }, { "epoch": 0.1656, "grad_norm": 0.1564948856830597, "learning_rate": 0.0001188804, "loss": 0.2544, "step": 202800 }, { "epoch": 0.1658, "grad_norm": 0.27698883414268494, "learning_rate": 0.00011884040000000001, "loss": 0.2603, "step": 202900 }, { "epoch": 0.166, "grad_norm": 0.17376329004764557, "learning_rate": 0.00011880040000000002, "loss": 0.2569, "step": 203000 }, { "epoch": 0.1662, "grad_norm": 0.21339082717895508, "learning_rate": 0.00011876040000000002, "loss": 0.2554, "step": 203100 }, { "epoch": 0.1664, "grad_norm": 0.19938743114471436, "learning_rate": 0.0001187204, "loss": 0.2703, "step": 203200 }, { "epoch": 0.1666, "grad_norm": 0.1865181177854538, "learning_rate": 0.0001186804, "loss": 0.2559, "step": 203300 }, { "epoch": 0.1668, "grad_norm": 0.18764150142669678, "learning_rate": 0.0001186404, "loss": 0.2565, "step": 203400 }, { "epoch": 0.167, "grad_norm": 0.16691163182258606, "learning_rate": 0.00011860040000000001, "loss": 0.2616, "step": 203500 }, { "epoch": 0.1672, "grad_norm": 0.20289692282676697, "learning_rate": 0.00011856040000000002, "loss": 0.2585, "step": 203600 }, { "epoch": 0.1674, "grad_norm": 0.2008928805589676, "learning_rate": 0.0001185204, "loss": 0.2654, "step": 203700 }, { "epoch": 0.1676, "grad_norm": 0.29621559381484985, "learning_rate": 0.0001184804, "loss": 0.2649, "step": 203800 }, { "epoch": 0.1678, "grad_norm": 0.23792728781700134, "learning_rate": 0.00011844040000000001, "loss": 0.2606, "step": 203900 }, { "epoch": 0.168, "grad_norm": 0.1951097548007965, "learning_rate": 0.0001184004, "loss": 0.2603, "step": 204000 }, { "epoch": 0.1682, "grad_norm": 0.18603862822055817, "learning_rate": 0.00011836040000000001, "loss": 0.2567, "step": 204100 }, { "epoch": 0.1684, "grad_norm": 0.248612180352211, "learning_rate": 0.00011832039999999999, "loss": 0.2588, "step": 204200 }, { "epoch": 0.1686, "grad_norm": 0.3616507351398468, "learning_rate": 0.0001182804, "loss": 0.2568, "step": 204300 }, { "epoch": 0.1688, "grad_norm": 0.22224590182304382, "learning_rate": 0.00011824040000000001, "loss": 0.2589, "step": 204400 }, { "epoch": 0.169, "grad_norm": 0.22203755378723145, "learning_rate": 0.00011820040000000001, "loss": 0.2572, "step": 204500 }, { "epoch": 0.1692, "grad_norm": 0.16411060094833374, "learning_rate": 0.00011816040000000002, "loss": 0.2578, "step": 204600 }, { "epoch": 0.1694, "grad_norm": 0.17382003366947174, "learning_rate": 0.0001181204, "loss": 0.2546, "step": 204700 }, { "epoch": 0.1696, "grad_norm": 0.21096429228782654, "learning_rate": 0.00011808040000000001, "loss": 0.2619, "step": 204800 }, { "epoch": 0.1698, "grad_norm": 0.2046499401330948, "learning_rate": 0.0001180404, "loss": 0.2538, "step": 204900 }, { "epoch": 0.17, "grad_norm": 0.1903340220451355, "learning_rate": 0.00011800040000000001, "loss": 0.2695, "step": 205000 }, { "epoch": 0.1702, "grad_norm": 0.19105155766010284, "learning_rate": 0.00011796040000000002, "loss": 0.2592, "step": 205100 }, { "epoch": 0.1704, "grad_norm": 0.18848542869091034, "learning_rate": 0.0001179204, "loss": 0.2607, "step": 205200 }, { "epoch": 0.1706, "grad_norm": 0.19775767624378204, "learning_rate": 0.0001178804, "loss": 0.2514, "step": 205300 }, { "epoch": 0.1708, "grad_norm": 0.2048797607421875, "learning_rate": 0.00011784040000000001, "loss": 0.2584, "step": 205400 }, { "epoch": 0.171, "grad_norm": 0.17378486692905426, "learning_rate": 0.0001178004, "loss": 0.2611, "step": 205500 }, { "epoch": 0.1712, "grad_norm": 0.19337409734725952, "learning_rate": 0.00011776040000000001, "loss": 0.2552, "step": 205600 }, { "epoch": 0.1714, "grad_norm": 0.1871696412563324, "learning_rate": 0.00011772039999999999, "loss": 0.2555, "step": 205700 }, { "epoch": 0.1716, "grad_norm": 0.15516479313373566, "learning_rate": 0.0001176804, "loss": 0.255, "step": 205800 }, { "epoch": 0.1718, "grad_norm": 0.18520411849021912, "learning_rate": 0.0001176404, "loss": 0.2607, "step": 205900 }, { "epoch": 0.172, "grad_norm": 0.1725122481584549, "learning_rate": 0.00011760040000000001, "loss": 0.2599, "step": 206000 }, { "epoch": 0.1722, "grad_norm": 0.24798348546028137, "learning_rate": 0.00011756040000000002, "loss": 0.2654, "step": 206100 }, { "epoch": 0.1724, "grad_norm": 0.15769024193286896, "learning_rate": 0.0001175204, "loss": 0.2753, "step": 206200 }, { "epoch": 0.1726, "grad_norm": 0.2662038207054138, "learning_rate": 0.00011748040000000001, "loss": 0.2642, "step": 206300 }, { "epoch": 0.1728, "grad_norm": 0.22056128084659576, "learning_rate": 0.0001174404, "loss": 0.2573, "step": 206400 }, { "epoch": 0.173, "grad_norm": 0.24194368720054626, "learning_rate": 0.00011740040000000001, "loss": 0.2605, "step": 206500 }, { "epoch": 0.1732, "grad_norm": 0.2201591581106186, "learning_rate": 0.00011736040000000002, "loss": 0.26, "step": 206600 }, { "epoch": 0.1734, "grad_norm": 0.17984557151794434, "learning_rate": 0.0001173204, "loss": 0.2588, "step": 206700 }, { "epoch": 0.1736, "grad_norm": 0.2862537205219269, "learning_rate": 0.0001172804, "loss": 0.2656, "step": 206800 }, { "epoch": 0.1738, "grad_norm": 0.20301680266857147, "learning_rate": 0.00011724040000000001, "loss": 0.2596, "step": 206900 }, { "epoch": 0.174, "grad_norm": 0.16194941103458405, "learning_rate": 0.0001172004, "loss": 0.2604, "step": 207000 }, { "epoch": 0.1742, "grad_norm": 0.20305371284484863, "learning_rate": 0.00011716040000000001, "loss": 0.2605, "step": 207100 }, { "epoch": 0.1744, "grad_norm": 0.17961016297340393, "learning_rate": 0.00011712039999999999, "loss": 0.2553, "step": 207200 }, { "epoch": 0.1746, "grad_norm": 0.15147832036018372, "learning_rate": 0.0001170804, "loss": 0.2608, "step": 207300 }, { "epoch": 0.1748, "grad_norm": 0.17695245146751404, "learning_rate": 0.0001170404, "loss": 0.2577, "step": 207400 }, { "epoch": 0.175, "grad_norm": 0.16684728860855103, "learning_rate": 0.00011700040000000001, "loss": 0.2602, "step": 207500 }, { "epoch": 0.1752, "grad_norm": 0.1773560494184494, "learning_rate": 0.00011696040000000002, "loss": 0.2592, "step": 207600 }, { "epoch": 0.1754, "grad_norm": 0.1829720139503479, "learning_rate": 0.0001169204, "loss": 0.2533, "step": 207700 }, { "epoch": 0.1756, "grad_norm": 0.14604096114635468, "learning_rate": 0.0001168804, "loss": 0.2559, "step": 207800 }, { "epoch": 0.1758, "grad_norm": 0.16805389523506165, "learning_rate": 0.0001168404, "loss": 0.2565, "step": 207900 }, { "epoch": 0.176, "grad_norm": 0.2917552888393402, "learning_rate": 0.00011680040000000001, "loss": 0.2596, "step": 208000 }, { "epoch": 0.1762, "grad_norm": 0.1552281528711319, "learning_rate": 0.00011676040000000001, "loss": 0.2595, "step": 208100 }, { "epoch": 0.1764, "grad_norm": 0.21594230830669403, "learning_rate": 0.0001167204, "loss": 0.26, "step": 208200 }, { "epoch": 0.1766, "grad_norm": 0.2598268389701843, "learning_rate": 0.0001166804, "loss": 0.2634, "step": 208300 }, { "epoch": 0.1768, "grad_norm": 0.17285478115081787, "learning_rate": 0.00011664040000000001, "loss": 0.2663, "step": 208400 }, { "epoch": 0.177, "grad_norm": 0.2128245234489441, "learning_rate": 0.00011660040000000002, "loss": 0.2596, "step": 208500 }, { "epoch": 0.1772, "grad_norm": 0.17592047154903412, "learning_rate": 0.00011656040000000001, "loss": 0.259, "step": 208600 }, { "epoch": 0.1774, "grad_norm": 0.1863638013601303, "learning_rate": 0.00011652039999999999, "loss": 0.2574, "step": 208700 }, { "epoch": 0.1776, "grad_norm": 0.1951245218515396, "learning_rate": 0.0001164804, "loss": 0.2569, "step": 208800 }, { "epoch": 0.1778, "grad_norm": 0.20469717681407928, "learning_rate": 0.0001164404, "loss": 0.2548, "step": 208900 }, { "epoch": 0.178, "grad_norm": 0.18521174788475037, "learning_rate": 0.00011640040000000001, "loss": 0.2581, "step": 209000 }, { "epoch": 0.1782, "grad_norm": 0.16419340670108795, "learning_rate": 0.00011636040000000002, "loss": 0.2557, "step": 209100 }, { "epoch": 0.1784, "grad_norm": 0.2025509625673294, "learning_rate": 0.0001163204, "loss": 0.2553, "step": 209200 }, { "epoch": 0.1786, "grad_norm": 0.2792588174343109, "learning_rate": 0.0001162804, "loss": 0.2554, "step": 209300 }, { "epoch": 0.1788, "grad_norm": 0.18840503692626953, "learning_rate": 0.0001162404, "loss": 0.2612, "step": 209400 }, { "epoch": 0.179, "grad_norm": 0.3353501558303833, "learning_rate": 0.0001162004, "loss": 0.2575, "step": 209500 }, { "epoch": 0.1792, "grad_norm": 0.17007525265216827, "learning_rate": 0.00011616040000000001, "loss": 0.2534, "step": 209600 }, { "epoch": 0.1794, "grad_norm": 0.1712578982114792, "learning_rate": 0.0001161204, "loss": 0.2579, "step": 209700 }, { "epoch": 0.1796, "grad_norm": 0.16284547746181488, "learning_rate": 0.0001160804, "loss": 0.2578, "step": 209800 }, { "epoch": 0.1798, "grad_norm": 0.22904619574546814, "learning_rate": 0.00011604040000000001, "loss": 0.2554, "step": 209900 }, { "epoch": 0.18, "grad_norm": 0.1613014191389084, "learning_rate": 0.00011600040000000002, "loss": 0.2569, "step": 210000 }, { "epoch": 0.1802, "grad_norm": 0.24920882284641266, "learning_rate": 0.00011596040000000001, "loss": 0.2523, "step": 210100 }, { "epoch": 0.1804, "grad_norm": 0.16219836473464966, "learning_rate": 0.0001159204, "loss": 0.257, "step": 210200 }, { "epoch": 0.1806, "grad_norm": 0.14738017320632935, "learning_rate": 0.0001158804, "loss": 0.2534, "step": 210300 }, { "epoch": 0.1808, "grad_norm": 0.16476276516914368, "learning_rate": 0.0001158404, "loss": 0.261, "step": 210400 }, { "epoch": 0.181, "grad_norm": 0.20898793637752533, "learning_rate": 0.00011580040000000001, "loss": 0.261, "step": 210500 }, { "epoch": 0.1812, "grad_norm": 0.16828548908233643, "learning_rate": 0.00011576040000000002, "loss": 0.2502, "step": 210600 }, { "epoch": 0.1814, "grad_norm": 0.17808620631694794, "learning_rate": 0.0001157204, "loss": 0.2445, "step": 210700 }, { "epoch": 0.1816, "grad_norm": 0.16437947750091553, "learning_rate": 0.0001156804, "loss": 0.2521, "step": 210800 }, { "epoch": 0.1818, "grad_norm": 0.2212170660495758, "learning_rate": 0.0001156404, "loss": 0.2532, "step": 210900 }, { "epoch": 0.182, "grad_norm": 0.22296178340911865, "learning_rate": 0.0001156004, "loss": 0.2594, "step": 211000 }, { "epoch": 0.1822, "grad_norm": 0.19012941420078278, "learning_rate": 0.00011556040000000001, "loss": 0.2601, "step": 211100 }, { "epoch": 0.1824, "grad_norm": 0.2223205268383026, "learning_rate": 0.00011552039999999999, "loss": 0.2571, "step": 211200 }, { "epoch": 0.1826, "grad_norm": 0.193226620554924, "learning_rate": 0.0001154804, "loss": 0.2635, "step": 211300 }, { "epoch": 0.1828, "grad_norm": 0.19742873311042786, "learning_rate": 0.00011544040000000001, "loss": 0.2635, "step": 211400 }, { "epoch": 0.183, "grad_norm": 0.2180163711309433, "learning_rate": 0.00011540040000000001, "loss": 0.2669, "step": 211500 }, { "epoch": 0.1832, "grad_norm": 0.20754875242710114, "learning_rate": 0.00011536040000000001, "loss": 0.252, "step": 211600 }, { "epoch": 0.1834, "grad_norm": 0.16631507873535156, "learning_rate": 0.0001153204, "loss": 0.256, "step": 211700 }, { "epoch": 0.1836, "grad_norm": 0.1716802716255188, "learning_rate": 0.0001152804, "loss": 0.2578, "step": 211800 }, { "epoch": 0.1838, "grad_norm": 0.22143660485744476, "learning_rate": 0.0001152404, "loss": 0.2566, "step": 211900 }, { "epoch": 0.184, "grad_norm": 0.2684420347213745, "learning_rate": 0.00011520040000000001, "loss": 0.2515, "step": 212000 }, { "epoch": 0.1842, "grad_norm": 0.22609145939350128, "learning_rate": 0.00011516040000000002, "loss": 0.2532, "step": 212100 }, { "epoch": 0.1844, "grad_norm": 0.1780252754688263, "learning_rate": 0.0001151204, "loss": 0.2579, "step": 212200 }, { "epoch": 0.1846, "grad_norm": 0.256816029548645, "learning_rate": 0.0001150804, "loss": 0.2628, "step": 212300 }, { "epoch": 0.1848, "grad_norm": 0.22887232899665833, "learning_rate": 0.0001150404, "loss": 0.254, "step": 212400 }, { "epoch": 0.185, "grad_norm": 0.17870065569877625, "learning_rate": 0.0001150004, "loss": 0.2525, "step": 212500 }, { "epoch": 0.1852, "grad_norm": 0.4473213255405426, "learning_rate": 0.00011496040000000001, "loss": 0.2565, "step": 212600 }, { "epoch": 0.1854, "grad_norm": 0.1810619980096817, "learning_rate": 0.00011492039999999999, "loss": 0.2565, "step": 212700 }, { "epoch": 0.1856, "grad_norm": 0.1680411398410797, "learning_rate": 0.0001148804, "loss": 0.2555, "step": 212800 }, { "epoch": 0.1858, "grad_norm": 0.22519907355308533, "learning_rate": 0.0001148404, "loss": 0.252, "step": 212900 }, { "epoch": 0.186, "grad_norm": 0.18328481912612915, "learning_rate": 0.00011480040000000001, "loss": 0.2567, "step": 213000 }, { "epoch": 0.1862, "grad_norm": 0.4796239733695984, "learning_rate": 0.0001147604, "loss": 0.2552, "step": 213100 }, { "epoch": 0.1864, "grad_norm": 0.17960181832313538, "learning_rate": 0.0001147204, "loss": 0.2552, "step": 213200 }, { "epoch": 0.1866, "grad_norm": 0.21092750132083893, "learning_rate": 0.0001146804, "loss": 0.2591, "step": 213300 }, { "epoch": 0.1868, "grad_norm": 0.1919945329427719, "learning_rate": 0.0001146404, "loss": 0.2524, "step": 213400 }, { "epoch": 0.187, "grad_norm": 0.1630232334136963, "learning_rate": 0.00011460040000000001, "loss": 0.2548, "step": 213500 }, { "epoch": 0.1872, "grad_norm": 0.15087801218032837, "learning_rate": 0.00011456040000000002, "loss": 0.2595, "step": 213600 }, { "epoch": 0.1874, "grad_norm": 0.31527626514434814, "learning_rate": 0.00011452040000000002, "loss": 0.2553, "step": 213700 }, { "epoch": 0.1876, "grad_norm": 0.27339211106300354, "learning_rate": 0.0001144804, "loss": 0.2617, "step": 213800 }, { "epoch": 0.1878, "grad_norm": 0.19266119599342346, "learning_rate": 0.00011444040000000001, "loss": 0.2628, "step": 213900 }, { "epoch": 0.188, "grad_norm": 0.26474958658218384, "learning_rate": 0.0001144004, "loss": 0.2533, "step": 214000 }, { "epoch": 0.1882, "grad_norm": 0.16048365831375122, "learning_rate": 0.00011436040000000001, "loss": 0.261, "step": 214100 }, { "epoch": 0.1884, "grad_norm": 0.20829501748085022, "learning_rate": 0.00011432040000000002, "loss": 0.2562, "step": 214200 }, { "epoch": 0.1886, "grad_norm": 0.2848805785179138, "learning_rate": 0.0001142804, "loss": 0.2586, "step": 214300 }, { "epoch": 0.1888, "grad_norm": 0.2071709930896759, "learning_rate": 0.0001142404, "loss": 0.2534, "step": 214400 }, { "epoch": 0.189, "grad_norm": 0.20445095002651215, "learning_rate": 0.00011420040000000001, "loss": 0.2553, "step": 214500 }, { "epoch": 0.1892, "grad_norm": 0.1799619346857071, "learning_rate": 0.0001141604, "loss": 0.2571, "step": 214600 }, { "epoch": 0.1894, "grad_norm": 0.23064669966697693, "learning_rate": 0.00011412040000000001, "loss": 0.2581, "step": 214700 }, { "epoch": 0.1896, "grad_norm": 0.20549724996089935, "learning_rate": 0.00011408039999999999, "loss": 0.2563, "step": 214800 }, { "epoch": 0.1898, "grad_norm": 0.3230568766593933, "learning_rate": 0.0001140404, "loss": 0.2549, "step": 214900 }, { "epoch": 0.19, "grad_norm": 0.15154241025447845, "learning_rate": 0.00011400040000000001, "loss": 0.256, "step": 215000 }, { "epoch": 0.1902, "grad_norm": 0.18913942575454712, "learning_rate": 0.00011396040000000001, "loss": 0.2632, "step": 215100 }, { "epoch": 0.1904, "grad_norm": 0.19355298578739166, "learning_rate": 0.00011392040000000002, "loss": 0.2577, "step": 215200 }, { "epoch": 0.1906, "grad_norm": 0.16174174845218658, "learning_rate": 0.0001138804, "loss": 0.2504, "step": 215300 }, { "epoch": 0.1908, "grad_norm": 0.24760814011096954, "learning_rate": 0.00011384040000000001, "loss": 0.2541, "step": 215400 }, { "epoch": 0.191, "grad_norm": 0.19982276856899261, "learning_rate": 0.0001138004, "loss": 0.2553, "step": 215500 }, { "epoch": 0.1912, "grad_norm": 0.22562120854854584, "learning_rate": 0.00011376040000000001, "loss": 0.2564, "step": 215600 }, { "epoch": 0.1914, "grad_norm": 0.2680492401123047, "learning_rate": 0.00011372040000000002, "loss": 0.2582, "step": 215700 }, { "epoch": 0.1916, "grad_norm": 0.22115637362003326, "learning_rate": 0.0001136804, "loss": 0.2576, "step": 215800 }, { "epoch": 0.1918, "grad_norm": 0.19587989151477814, "learning_rate": 0.0001136404, "loss": 0.2566, "step": 215900 }, { "epoch": 0.192, "grad_norm": 0.25761643052101135, "learning_rate": 0.00011360040000000001, "loss": 0.2572, "step": 216000 }, { "epoch": 0.1922, "grad_norm": 0.1627892553806305, "learning_rate": 0.0001135604, "loss": 0.255, "step": 216100 }, { "epoch": 0.1924, "grad_norm": 0.18419532477855682, "learning_rate": 0.00011352040000000001, "loss": 0.2558, "step": 216200 }, { "epoch": 0.1926, "grad_norm": 0.18171828985214233, "learning_rate": 0.00011348039999999999, "loss": 0.2548, "step": 216300 }, { "epoch": 0.1928, "grad_norm": 0.16602705419063568, "learning_rate": 0.0001134404, "loss": 0.2542, "step": 216400 }, { "epoch": 0.193, "grad_norm": 0.17187577486038208, "learning_rate": 0.0001134004, "loss": 0.2559, "step": 216500 }, { "epoch": 0.1932, "grad_norm": 0.2111077755689621, "learning_rate": 0.00011336040000000001, "loss": 0.2578, "step": 216600 }, { "epoch": 0.1934, "grad_norm": 0.1766299456357956, "learning_rate": 0.00011332040000000002, "loss": 0.254, "step": 216700 }, { "epoch": 0.1936, "grad_norm": 0.2486806958913803, "learning_rate": 0.0001132804, "loss": 0.2642, "step": 216800 }, { "epoch": 0.1938, "grad_norm": 0.1899750679731369, "learning_rate": 0.00011324040000000001, "loss": 0.2542, "step": 216900 }, { "epoch": 0.194, "grad_norm": 0.18645033240318298, "learning_rate": 0.0001132004, "loss": 0.2589, "step": 217000 }, { "epoch": 0.1942, "grad_norm": 0.1879652589559555, "learning_rate": 0.00011316040000000001, "loss": 0.2623, "step": 217100 }, { "epoch": 0.1944, "grad_norm": 0.21320343017578125, "learning_rate": 0.00011312040000000002, "loss": 0.2572, "step": 217200 }, { "epoch": 0.1946, "grad_norm": 0.19344399869441986, "learning_rate": 0.0001130804, "loss": 0.2525, "step": 217300 }, { "epoch": 0.1948, "grad_norm": 0.16314131021499634, "learning_rate": 0.0001130404, "loss": 0.2532, "step": 217400 }, { "epoch": 0.195, "grad_norm": 0.153633713722229, "learning_rate": 0.00011300040000000001, "loss": 0.2613, "step": 217500 }, { "epoch": 0.1952, "grad_norm": 0.231405109167099, "learning_rate": 0.00011296040000000002, "loss": 0.2613, "step": 217600 }, { "epoch": 0.1954, "grad_norm": 0.17982491850852966, "learning_rate": 0.00011292040000000001, "loss": 0.2568, "step": 217700 }, { "epoch": 0.1956, "grad_norm": 0.21746788918972015, "learning_rate": 0.00011288039999999999, "loss": 0.2552, "step": 217800 }, { "epoch": 0.1958, "grad_norm": 0.1724511682987213, "learning_rate": 0.0001128404, "loss": 0.2593, "step": 217900 }, { "epoch": 0.196, "grad_norm": 0.1525942087173462, "learning_rate": 0.0001128004, "loss": 0.2647, "step": 218000 }, { "epoch": 0.1962, "grad_norm": 0.20681273937225342, "learning_rate": 0.00011276040000000001, "loss": 0.2569, "step": 218100 }, { "epoch": 0.1964, "grad_norm": 0.25222960114479065, "learning_rate": 0.00011272040000000002, "loss": 0.256, "step": 218200 }, { "epoch": 0.1966, "grad_norm": 0.19064846634864807, "learning_rate": 0.0001126804, "loss": 0.2507, "step": 218300 }, { "epoch": 0.1968, "grad_norm": 0.1997394561767578, "learning_rate": 0.0001126404, "loss": 0.2612, "step": 218400 }, { "epoch": 0.197, "grad_norm": 0.1820058673620224, "learning_rate": 0.0001126004, "loss": 0.2559, "step": 218500 }, { "epoch": 0.1972, "grad_norm": 0.23284319043159485, "learning_rate": 0.00011256040000000001, "loss": 0.2578, "step": 218600 }, { "epoch": 0.1974, "grad_norm": 0.3271949291229248, "learning_rate": 0.00011252040000000001, "loss": 0.2555, "step": 218700 }, { "epoch": 0.1976, "grad_norm": 0.18260078132152557, "learning_rate": 0.0001124804, "loss": 0.255, "step": 218800 }, { "epoch": 0.1978, "grad_norm": 0.1611817330121994, "learning_rate": 0.0001124404, "loss": 0.2601, "step": 218900 }, { "epoch": 0.198, "grad_norm": 0.22105377912521362, "learning_rate": 0.00011240040000000001, "loss": 0.2551, "step": 219000 }, { "epoch": 0.1982, "grad_norm": 0.22783438861370087, "learning_rate": 0.00011236040000000002, "loss": 0.2536, "step": 219100 }, { "epoch": 0.1984, "grad_norm": 0.14097750186920166, "learning_rate": 0.00011232040000000001, "loss": 0.251, "step": 219200 }, { "epoch": 0.1986, "grad_norm": 0.16217005252838135, "learning_rate": 0.0001122804, "loss": 0.2546, "step": 219300 }, { "epoch": 0.1988, "grad_norm": 0.28094884753227234, "learning_rate": 0.0001122404, "loss": 0.2597, "step": 219400 }, { "epoch": 0.199, "grad_norm": 0.1714087426662445, "learning_rate": 0.0001122004, "loss": 0.2532, "step": 219500 }, { "epoch": 0.1992, "grad_norm": 0.2341964989900589, "learning_rate": 0.00011216040000000001, "loss": 0.2566, "step": 219600 }, { "epoch": 0.1994, "grad_norm": 0.16600032150745392, "learning_rate": 0.00011212040000000002, "loss": 0.2502, "step": 219700 }, { "epoch": 0.1996, "grad_norm": 0.3427354693412781, "learning_rate": 0.0001120804, "loss": 0.2568, "step": 219800 }, { "epoch": 0.1998, "grad_norm": 0.2476433664560318, "learning_rate": 0.0001120404, "loss": 0.2579, "step": 219900 }, { "epoch": 0.2, "grad_norm": 0.24420465528964996, "learning_rate": 0.0001120004, "loss": 0.2519, "step": 220000 }, { "epoch": 0.2002, "grad_norm": 0.4817318618297577, "learning_rate": 0.0001119604, "loss": 0.3021, "step": 220100 }, { "epoch": 0.2004, "grad_norm": 0.42361992597579956, "learning_rate": 0.00011192040000000001, "loss": 0.3307, "step": 220200 }, { "epoch": 0.2006, "grad_norm": 0.20908264815807343, "learning_rate": 0.0001118804, "loss": 0.316, "step": 220300 }, { "epoch": 0.2008, "grad_norm": 0.4234218895435333, "learning_rate": 0.0001118404, "loss": 0.3001, "step": 220400 }, { "epoch": 0.201, "grad_norm": 0.5726023316383362, "learning_rate": 0.00011180040000000001, "loss": 0.3016, "step": 220500 }, { "epoch": 0.2012, "grad_norm": 0.28417375683784485, "learning_rate": 0.00011176040000000002, "loss": 0.3389, "step": 220600 }, { "epoch": 0.2014, "grad_norm": 0.3290591537952423, "learning_rate": 0.00011172040000000001, "loss": 0.3019, "step": 220700 }, { "epoch": 0.2016, "grad_norm": 0.5273597836494446, "learning_rate": 0.0001116804, "loss": 0.3008, "step": 220800 }, { "epoch": 0.2018, "grad_norm": 0.19379092752933502, "learning_rate": 0.0001116404, "loss": 0.3282, "step": 220900 }, { "epoch": 0.202, "grad_norm": 0.19746637344360352, "learning_rate": 0.0001116004, "loss": 0.2928, "step": 221000 }, { "epoch": 0.2022, "grad_norm": 0.2760812044143677, "learning_rate": 0.00011156040000000001, "loss": 0.2961, "step": 221100 }, { "epoch": 0.2024, "grad_norm": 0.21008889377117157, "learning_rate": 0.00011152040000000002, "loss": 0.2986, "step": 221200 }, { "epoch": 0.2026, "grad_norm": 0.4908127784729004, "learning_rate": 0.0001114804, "loss": 0.3456, "step": 221300 }, { "epoch": 0.2028, "grad_norm": 0.1982090175151825, "learning_rate": 0.0001114404, "loss": 0.3186, "step": 221400 }, { "epoch": 0.203, "grad_norm": 0.37534865736961365, "learning_rate": 0.0001114004, "loss": 0.2995, "step": 221500 }, { "epoch": 0.2032, "grad_norm": 0.24973396956920624, "learning_rate": 0.0001113604, "loss": 0.2996, "step": 221600 }, { "epoch": 0.2034, "grad_norm": 0.21024543046951294, "learning_rate": 0.00011132040000000001, "loss": 0.2907, "step": 221700 }, { "epoch": 0.2036, "grad_norm": 0.3225703537464142, "learning_rate": 0.00011128039999999999, "loss": 0.2911, "step": 221800 }, { "epoch": 0.2038, "grad_norm": 0.4581950306892395, "learning_rate": 0.0001112404, "loss": 0.3637, "step": 221900 }, { "epoch": 0.204, "grad_norm": 0.5433754920959473, "learning_rate": 0.00011120040000000001, "loss": 0.3487, "step": 222000 }, { "epoch": 0.2042, "grad_norm": 0.22788307070732117, "learning_rate": 0.00011116040000000001, "loss": 0.3004, "step": 222100 }, { "epoch": 0.2044, "grad_norm": 0.16002947092056274, "learning_rate": 0.00011112040000000001, "loss": 0.2993, "step": 222200 }, { "epoch": 0.2046, "grad_norm": 0.28923583030700684, "learning_rate": 0.0001110804, "loss": 0.3036, "step": 222300 }, { "epoch": 0.2048, "grad_norm": 0.3555202782154083, "learning_rate": 0.0001110404, "loss": 0.338, "step": 222400 }, { "epoch": 0.205, "grad_norm": 0.1331017166376114, "learning_rate": 0.0001110004, "loss": 0.2904, "step": 222500 }, { "epoch": 0.2052, "grad_norm": 0.269748717546463, "learning_rate": 0.00011096040000000001, "loss": 0.3133, "step": 222600 }, { "epoch": 0.2054, "grad_norm": 0.919607937335968, "learning_rate": 0.00011092040000000002, "loss": 0.3133, "step": 222700 }, { "epoch": 0.2056, "grad_norm": 0.27156761288642883, "learning_rate": 0.0001108804, "loss": 0.3065, "step": 222800 }, { "epoch": 0.2058, "grad_norm": 0.48692193627357483, "learning_rate": 0.0001108404, "loss": 0.3025, "step": 222900 }, { "epoch": 0.206, "grad_norm": 0.8623552322387695, "learning_rate": 0.00011080040000000001, "loss": 0.296, "step": 223000 }, { "epoch": 0.2062, "grad_norm": 0.18269303441047668, "learning_rate": 0.0001107604, "loss": 0.277, "step": 223100 }, { "epoch": 0.2064, "grad_norm": 0.37067535519599915, "learning_rate": 0.00011072040000000001, "loss": 0.3043, "step": 223200 }, { "epoch": 0.2066, "grad_norm": 0.25582921504974365, "learning_rate": 0.00011068039999999999, "loss": 0.2949, "step": 223300 }, { "epoch": 0.2068, "grad_norm": 0.19148407876491547, "learning_rate": 0.0001106404, "loss": 0.2955, "step": 223400 }, { "epoch": 0.207, "grad_norm": 0.24919286370277405, "learning_rate": 0.0001106004, "loss": 0.3014, "step": 223500 }, { "epoch": 0.2072, "grad_norm": 0.24463938176631927, "learning_rate": 0.00011056040000000001, "loss": 0.3057, "step": 223600 }, { "epoch": 0.2074, "grad_norm": 0.35586199164390564, "learning_rate": 0.0001105204, "loss": 0.29, "step": 223700 }, { "epoch": 0.2076, "grad_norm": 0.319210946559906, "learning_rate": 0.0001104804, "loss": 0.368, "step": 223800 }, { "epoch": 0.2078, "grad_norm": 0.282400906085968, "learning_rate": 0.0001104404, "loss": 0.292, "step": 223900 }, { "epoch": 0.208, "grad_norm": 0.17433995008468628, "learning_rate": 0.0001104004, "loss": 0.2892, "step": 224000 }, { "epoch": 0.2082, "grad_norm": 0.5151282548904419, "learning_rate": 0.00011036040000000001, "loss": 0.2962, "step": 224100 }, { "epoch": 0.2084, "grad_norm": 0.2073827087879181, "learning_rate": 0.00011032040000000002, "loss": 0.2928, "step": 224200 }, { "epoch": 0.2086, "grad_norm": 0.40052372217178345, "learning_rate": 0.0001102804, "loss": 0.297, "step": 224300 }, { "epoch": 0.2088, "grad_norm": 0.26308512687683105, "learning_rate": 0.0001102404, "loss": 0.3246, "step": 224400 }, { "epoch": 0.209, "grad_norm": 0.35187768936157227, "learning_rate": 0.00011020040000000001, "loss": 0.3031, "step": 224500 }, { "epoch": 0.2092, "grad_norm": 0.1798812747001648, "learning_rate": 0.0001101604, "loss": 0.2947, "step": 224600 }, { "epoch": 0.2094, "grad_norm": 0.2565075159072876, "learning_rate": 0.00011012040000000001, "loss": 0.3018, "step": 224700 }, { "epoch": 0.2096, "grad_norm": 0.2061193436384201, "learning_rate": 0.00011008039999999999, "loss": 0.2986, "step": 224800 }, { "epoch": 0.2098, "grad_norm": 0.15753225982189178, "learning_rate": 0.0001100404, "loss": 0.2993, "step": 224900 }, { "epoch": 0.21, "grad_norm": 0.18000277876853943, "learning_rate": 0.0001100004, "loss": 0.2809, "step": 225000 }, { "epoch": 0.2102, "grad_norm": 0.17245367169380188, "learning_rate": 0.00010996040000000001, "loss": 0.3003, "step": 225100 }, { "epoch": 0.2104, "grad_norm": 0.27620208263397217, "learning_rate": 0.0001099204, "loss": 0.2863, "step": 225200 }, { "epoch": 0.2106, "grad_norm": 0.1691516935825348, "learning_rate": 0.0001098804, "loss": 0.3085, "step": 225300 }, { "epoch": 0.2108, "grad_norm": 0.25270703434944153, "learning_rate": 0.00010984039999999999, "loss": 0.3018, "step": 225400 }, { "epoch": 0.211, "grad_norm": 0.26797574758529663, "learning_rate": 0.0001098004, "loss": 0.2919, "step": 225500 }, { "epoch": 0.2112, "grad_norm": 0.1746404618024826, "learning_rate": 0.00010976040000000001, "loss": 0.282, "step": 225600 }, { "epoch": 0.2114, "grad_norm": 0.2220151722431183, "learning_rate": 0.00010972040000000001, "loss": 0.3083, "step": 225700 }, { "epoch": 0.2116, "grad_norm": 0.3341788947582245, "learning_rate": 0.0001096804, "loss": 0.2846, "step": 225800 }, { "epoch": 0.2118, "grad_norm": 0.2519269585609436, "learning_rate": 0.0001096404, "loss": 0.3022, "step": 225900 }, { "epoch": 0.212, "grad_norm": 0.4205581545829773, "learning_rate": 0.00010960040000000001, "loss": 0.3163, "step": 226000 }, { "epoch": 0.2122, "grad_norm": 0.2838189899921417, "learning_rate": 0.0001095604, "loss": 0.2892, "step": 226100 }, { "epoch": 0.2124, "grad_norm": 0.25370433926582336, "learning_rate": 0.00010952040000000001, "loss": 0.282, "step": 226200 }, { "epoch": 0.2126, "grad_norm": 0.26368477940559387, "learning_rate": 0.00010948040000000002, "loss": 0.2784, "step": 226300 }, { "epoch": 0.2128, "grad_norm": 0.46951645612716675, "learning_rate": 0.0001094404, "loss": 0.3009, "step": 226400 }, { "epoch": 0.213, "grad_norm": 0.4232272207736969, "learning_rate": 0.0001094004, "loss": 0.2887, "step": 226500 }, { "epoch": 0.2132, "grad_norm": 0.19953791797161102, "learning_rate": 0.00010936040000000001, "loss": 0.2851, "step": 226600 }, { "epoch": 0.2134, "grad_norm": 0.28893914818763733, "learning_rate": 0.00010932040000000002, "loss": 0.2904, "step": 226700 }, { "epoch": 0.2136, "grad_norm": 0.18542243540287018, "learning_rate": 0.00010928040000000001, "loss": 0.2925, "step": 226800 }, { "epoch": 0.2138, "grad_norm": 0.2543767988681793, "learning_rate": 0.00010924039999999999, "loss": 0.2859, "step": 226900 }, { "epoch": 0.214, "grad_norm": 0.380341500043869, "learning_rate": 0.0001092004, "loss": 0.2948, "step": 227000 }, { "epoch": 0.2142, "grad_norm": 0.6200868487358093, "learning_rate": 0.0001091604, "loss": 0.3034, "step": 227100 }, { "epoch": 0.2144, "grad_norm": 0.40341097116470337, "learning_rate": 0.00010912040000000001, "loss": 0.2803, "step": 227200 }, { "epoch": 0.2146, "grad_norm": 0.27767351269721985, "learning_rate": 0.00010908040000000002, "loss": 0.2922, "step": 227300 }, { "epoch": 0.2148, "grad_norm": 0.18957166373729706, "learning_rate": 0.0001090404, "loss": 0.3052, "step": 227400 }, { "epoch": 0.215, "grad_norm": 0.21950003504753113, "learning_rate": 0.00010900040000000001, "loss": 0.3, "step": 227500 }, { "epoch": 0.2152, "grad_norm": 0.36292436718940735, "learning_rate": 0.0001089604, "loss": 0.291, "step": 227600 }, { "epoch": 0.2154, "grad_norm": 0.4383956789970398, "learning_rate": 0.00010892040000000001, "loss": 0.302, "step": 227700 }, { "epoch": 0.2156, "grad_norm": 0.18310613930225372, "learning_rate": 0.00010888040000000002, "loss": 0.3081, "step": 227800 }, { "epoch": 0.2158, "grad_norm": 0.21227945387363434, "learning_rate": 0.0001088404, "loss": 0.2791, "step": 227900 }, { "epoch": 0.216, "grad_norm": 0.25162750482559204, "learning_rate": 0.0001088004, "loss": 0.2922, "step": 228000 }, { "epoch": 0.2162, "grad_norm": 0.4016803205013275, "learning_rate": 0.00010876040000000001, "loss": 0.325, "step": 228100 }, { "epoch": 0.2164, "grad_norm": 0.24426665902137756, "learning_rate": 0.00010872040000000002, "loss": 0.2946, "step": 228200 }, { "epoch": 0.2166, "grad_norm": 0.35735762119293213, "learning_rate": 0.00010868040000000001, "loss": 0.2953, "step": 228300 }, { "epoch": 0.2168, "grad_norm": 0.17666518688201904, "learning_rate": 0.0001086404, "loss": 0.2933, "step": 228400 }, { "epoch": 0.217, "grad_norm": 0.4070577323436737, "learning_rate": 0.0001086004, "loss": 0.2955, "step": 228500 }, { "epoch": 0.2172, "grad_norm": 0.24427427351474762, "learning_rate": 0.0001085604, "loss": 0.2858, "step": 228600 }, { "epoch": 0.2174, "grad_norm": 0.21364831924438477, "learning_rate": 0.00010852040000000001, "loss": 0.2919, "step": 228700 }, { "epoch": 0.2176, "grad_norm": 0.20407788455486298, "learning_rate": 0.00010848040000000002, "loss": 0.2898, "step": 228800 }, { "epoch": 0.2178, "grad_norm": 0.31540021300315857, "learning_rate": 0.0001084404, "loss": 0.2987, "step": 228900 }, { "epoch": 0.218, "grad_norm": 0.15270252525806427, "learning_rate": 0.0001084004, "loss": 0.2872, "step": 229000 }, { "epoch": 0.2182, "grad_norm": 0.23393623530864716, "learning_rate": 0.0001083604, "loss": 0.3058, "step": 229100 }, { "epoch": 0.2184, "grad_norm": 0.5334755778312683, "learning_rate": 0.00010832040000000001, "loss": 0.29, "step": 229200 }, { "epoch": 0.2186, "grad_norm": 0.1923484355211258, "learning_rate": 0.00010828040000000001, "loss": 0.2873, "step": 229300 }, { "epoch": 0.2188, "grad_norm": 0.23377379775047302, "learning_rate": 0.0001082404, "loss": 0.2823, "step": 229400 }, { "epoch": 0.219, "grad_norm": 0.22919674217700958, "learning_rate": 0.0001082004, "loss": 0.2836, "step": 229500 }, { "epoch": 0.2192, "grad_norm": 0.20516985654830933, "learning_rate": 0.00010816040000000001, "loss": 0.29, "step": 229600 }, { "epoch": 0.2194, "grad_norm": 0.19901040196418762, "learning_rate": 0.00010812040000000002, "loss": 0.2963, "step": 229700 }, { "epoch": 0.2196, "grad_norm": 0.24688464403152466, "learning_rate": 0.00010808040000000001, "loss": 0.2855, "step": 229800 }, { "epoch": 0.2198, "grad_norm": 0.23144333064556122, "learning_rate": 0.0001080404, "loss": 0.2964, "step": 229900 }, { "epoch": 0.22, "grad_norm": 0.2202957421541214, "learning_rate": 0.0001080004, "loss": 0.2925, "step": 230000 }, { "epoch": 0.2202, "grad_norm": 0.23589707911014557, "learning_rate": 0.0001079604, "loss": 0.2759, "step": 230100 }, { "epoch": 0.2204, "grad_norm": 0.31147778034210205, "learning_rate": 0.00010792040000000001, "loss": 0.2668, "step": 230200 }, { "epoch": 0.2206, "grad_norm": 0.36505284905433655, "learning_rate": 0.00010788040000000002, "loss": 0.2807, "step": 230300 }, { "epoch": 0.2208, "grad_norm": 0.24104100465774536, "learning_rate": 0.0001078404, "loss": 0.2802, "step": 230400 }, { "epoch": 0.221, "grad_norm": 0.2541787922382355, "learning_rate": 0.0001078004, "loss": 0.2738, "step": 230500 }, { "epoch": 0.2212, "grad_norm": 0.18488843739032745, "learning_rate": 0.0001077604, "loss": 0.2781, "step": 230600 }, { "epoch": 0.2214, "grad_norm": 0.30733954906463623, "learning_rate": 0.0001077204, "loss": 0.2689, "step": 230700 }, { "epoch": 0.2216, "grad_norm": 0.4184741675853729, "learning_rate": 0.00010768040000000001, "loss": 0.2821, "step": 230800 }, { "epoch": 0.2218, "grad_norm": 0.234775573015213, "learning_rate": 0.0001076404, "loss": 0.2788, "step": 230900 }, { "epoch": 0.222, "grad_norm": 0.1866404116153717, "learning_rate": 0.0001076004, "loss": 0.2805, "step": 231000 }, { "epoch": 0.2222, "grad_norm": 0.20000393688678741, "learning_rate": 0.00010756040000000001, "loss": 0.2786, "step": 231100 }, { "epoch": 0.2224, "grad_norm": 0.2550050914287567, "learning_rate": 0.00010752040000000001, "loss": 0.2808, "step": 231200 }, { "epoch": 0.2226, "grad_norm": 0.19352710247039795, "learning_rate": 0.00010748040000000001, "loss": 0.2794, "step": 231300 }, { "epoch": 0.2228, "grad_norm": 0.33794355392456055, "learning_rate": 0.0001074404, "loss": 0.2654, "step": 231400 }, { "epoch": 0.223, "grad_norm": 0.1755092740058899, "learning_rate": 0.0001074004, "loss": 0.2777, "step": 231500 }, { "epoch": 0.2232, "grad_norm": 0.2609795331954956, "learning_rate": 0.0001073604, "loss": 0.2749, "step": 231600 }, { "epoch": 0.2234, "grad_norm": 0.23666484653949738, "learning_rate": 0.00010732040000000001, "loss": 0.2837, "step": 231700 }, { "epoch": 0.2236, "grad_norm": 0.28469526767730713, "learning_rate": 0.00010728040000000002, "loss": 0.283, "step": 231800 }, { "epoch": 0.2238, "grad_norm": 0.21141879260540009, "learning_rate": 0.0001072404, "loss": 0.2922, "step": 231900 }, { "epoch": 0.224, "grad_norm": 0.19850678741931915, "learning_rate": 0.0001072004, "loss": 0.2771, "step": 232000 }, { "epoch": 0.2242, "grad_norm": 0.24743816256523132, "learning_rate": 0.00010716040000000001, "loss": 0.2749, "step": 232100 }, { "epoch": 0.2244, "grad_norm": 0.3401179015636444, "learning_rate": 0.0001071204, "loss": 0.2781, "step": 232200 }, { "epoch": 0.2246, "grad_norm": 0.2130950391292572, "learning_rate": 0.00010708040000000001, "loss": 0.27, "step": 232300 }, { "epoch": 0.2248, "grad_norm": 0.37853744626045227, "learning_rate": 0.00010704039999999999, "loss": 0.2838, "step": 232400 }, { "epoch": 0.225, "grad_norm": 0.20562714338302612, "learning_rate": 0.0001070004, "loss": 0.2653, "step": 232500 }, { "epoch": 0.2252, "grad_norm": 0.17086434364318848, "learning_rate": 0.00010696040000000001, "loss": 0.2765, "step": 232600 }, { "epoch": 0.2254, "grad_norm": 0.3876626193523407, "learning_rate": 0.00010692040000000001, "loss": 0.2758, "step": 232700 }, { "epoch": 0.2256, "grad_norm": 0.29403626918792725, "learning_rate": 0.00010688040000000001, "loss": 0.2783, "step": 232800 }, { "epoch": 0.2258, "grad_norm": 0.3147401809692383, "learning_rate": 0.0001068404, "loss": 0.2843, "step": 232900 }, { "epoch": 0.226, "grad_norm": 0.38840705156326294, "learning_rate": 0.0001068004, "loss": 0.3095, "step": 233000 }, { "epoch": 0.2262, "grad_norm": 0.2079588621854782, "learning_rate": 0.0001067604, "loss": 0.2971, "step": 233100 }, { "epoch": 0.2264, "grad_norm": 0.293052077293396, "learning_rate": 0.00010672040000000001, "loss": 0.2787, "step": 233200 }, { "epoch": 0.2266, "grad_norm": 0.1692897379398346, "learning_rate": 0.00010668040000000002, "loss": 0.2939, "step": 233300 }, { "epoch": 0.2268, "grad_norm": 0.19356147944927216, "learning_rate": 0.0001066404, "loss": 0.2901, "step": 233400 }, { "epoch": 0.227, "grad_norm": 0.2273617833852768, "learning_rate": 0.0001066004, "loss": 0.2794, "step": 233500 }, { "epoch": 0.2272, "grad_norm": 0.21963585913181305, "learning_rate": 0.00010656040000000001, "loss": 0.2882, "step": 233600 }, { "epoch": 0.2274, "grad_norm": 0.3202444911003113, "learning_rate": 0.0001065204, "loss": 0.2735, "step": 233700 }, { "epoch": 0.2276, "grad_norm": 0.2695014774799347, "learning_rate": 0.00010648040000000001, "loss": 0.288, "step": 233800 }, { "epoch": 0.2278, "grad_norm": 0.1968889981508255, "learning_rate": 0.00010644039999999999, "loss": 0.2818, "step": 233900 }, { "epoch": 0.228, "grad_norm": 0.2076377421617508, "learning_rate": 0.0001064004, "loss": 0.2806, "step": 234000 }, { "epoch": 0.2282, "grad_norm": 0.17942437529563904, "learning_rate": 0.0001063604, "loss": 0.2874, "step": 234100 }, { "epoch": 0.2284, "grad_norm": 0.20164412260055542, "learning_rate": 0.00010632040000000001, "loss": 0.271, "step": 234200 }, { "epoch": 0.2286, "grad_norm": 0.3018455505371094, "learning_rate": 0.0001062804, "loss": 0.2787, "step": 234300 }, { "epoch": 0.2288, "grad_norm": 0.18581052124500275, "learning_rate": 0.0001062404, "loss": 0.271, "step": 234400 }, { "epoch": 0.229, "grad_norm": 0.36110278964042664, "learning_rate": 0.0001062004, "loss": 0.2856, "step": 234500 }, { "epoch": 0.2292, "grad_norm": 0.21756871044635773, "learning_rate": 0.0001061604, "loss": 0.2872, "step": 234600 }, { "epoch": 0.2294, "grad_norm": 0.1998421549797058, "learning_rate": 0.00010612040000000001, "loss": 0.28, "step": 234700 }, { "epoch": 0.2296, "grad_norm": 0.17958033084869385, "learning_rate": 0.00010608040000000002, "loss": 0.2811, "step": 234800 }, { "epoch": 0.2298, "grad_norm": 0.6013987064361572, "learning_rate": 0.0001060404, "loss": 0.2732, "step": 234900 }, { "epoch": 0.23, "grad_norm": 0.20178425312042236, "learning_rate": 0.0001060004, "loss": 0.2713, "step": 235000 }, { "epoch": 0.2302, "grad_norm": 0.2956780791282654, "learning_rate": 0.00010596040000000001, "loss": 0.2775, "step": 235100 }, { "epoch": 0.2304, "grad_norm": 0.2311795949935913, "learning_rate": 0.0001059204, "loss": 0.2807, "step": 235200 }, { "epoch": 0.2306, "grad_norm": 0.24748362600803375, "learning_rate": 0.00010588040000000001, "loss": 0.2657, "step": 235300 }, { "epoch": 0.2308, "grad_norm": 0.38484346866607666, "learning_rate": 0.00010584039999999999, "loss": 0.2714, "step": 235400 }, { "epoch": 0.231, "grad_norm": 0.20243670046329498, "learning_rate": 0.0001058004, "loss": 0.2783, "step": 235500 }, { "epoch": 0.2312, "grad_norm": 0.5278353095054626, "learning_rate": 0.0001057604, "loss": 0.2779, "step": 235600 }, { "epoch": 0.2314, "grad_norm": 0.27309587597846985, "learning_rate": 0.00010572040000000001, "loss": 0.2695, "step": 235700 }, { "epoch": 0.2316, "grad_norm": 0.16521087288856506, "learning_rate": 0.00010568040000000002, "loss": 0.2732, "step": 235800 }, { "epoch": 0.2318, "grad_norm": 0.5062386393547058, "learning_rate": 0.0001056404, "loss": 0.2878, "step": 235900 }, { "epoch": 0.232, "grad_norm": 0.19394774734973907, "learning_rate": 0.00010560039999999999, "loss": 0.2851, "step": 236000 }, { "epoch": 0.2322, "grad_norm": 0.171609029173851, "learning_rate": 0.0001055604, "loss": 0.2809, "step": 236100 }, { "epoch": 0.2324, "grad_norm": 0.30568891763687134, "learning_rate": 0.00010552040000000001, "loss": 0.2952, "step": 236200 }, { "epoch": 0.2326, "grad_norm": 0.32132649421691895, "learning_rate": 0.00010548040000000001, "loss": 0.2751, "step": 236300 }, { "epoch": 0.2328, "grad_norm": 0.30104103684425354, "learning_rate": 0.0001054404, "loss": 0.285, "step": 236400 }, { "epoch": 0.233, "grad_norm": 0.19023063778877258, "learning_rate": 0.0001054004, "loss": 0.3591, "step": 236500 }, { "epoch": 0.2332, "grad_norm": 0.1921398788690567, "learning_rate": 0.00010536040000000001, "loss": 0.2813, "step": 236600 }, { "epoch": 0.2334, "grad_norm": 0.15040914714336395, "learning_rate": 0.0001053204, "loss": 0.2873, "step": 236700 }, { "epoch": 0.2336, "grad_norm": 0.18588508665561676, "learning_rate": 0.00010528040000000001, "loss": 0.2722, "step": 236800 }, { "epoch": 0.2338, "grad_norm": 0.3410860002040863, "learning_rate": 0.00010524039999999999, "loss": 0.2811, "step": 236900 }, { "epoch": 0.234, "grad_norm": 0.21273022890090942, "learning_rate": 0.0001052004, "loss": 0.2892, "step": 237000 }, { "epoch": 0.2342, "grad_norm": 0.20495010912418365, "learning_rate": 0.0001051604, "loss": 0.2826, "step": 237100 }, { "epoch": 0.2344, "grad_norm": 0.3774225413799286, "learning_rate": 0.00010512040000000001, "loss": 0.2704, "step": 237200 }, { "epoch": 0.2346, "grad_norm": 0.1908106654882431, "learning_rate": 0.00010508040000000002, "loss": 0.2854, "step": 237300 }, { "epoch": 0.2348, "grad_norm": 0.333554208278656, "learning_rate": 0.0001050404, "loss": 0.276, "step": 237400 }, { "epoch": 0.235, "grad_norm": 0.2886081337928772, "learning_rate": 0.0001050004, "loss": 0.2737, "step": 237500 }, { "epoch": 0.2352, "grad_norm": 0.2619887888431549, "learning_rate": 0.0001049604, "loss": 0.2718, "step": 237600 }, { "epoch": 0.2354, "grad_norm": 0.2604043483734131, "learning_rate": 0.0001049204, "loss": 0.2757, "step": 237700 }, { "epoch": 0.2356, "grad_norm": 0.20619331300258636, "learning_rate": 0.00010488040000000001, "loss": 0.2722, "step": 237800 }, { "epoch": 0.2358, "grad_norm": 0.1808670163154602, "learning_rate": 0.00010484039999999999, "loss": 0.2715, "step": 237900 }, { "epoch": 0.236, "grad_norm": 0.39049023389816284, "learning_rate": 0.0001048004, "loss": 0.271, "step": 238000 }, { "epoch": 0.2362, "grad_norm": 0.2106139063835144, "learning_rate": 0.00010476040000000001, "loss": 0.2775, "step": 238100 }, { "epoch": 0.2364, "grad_norm": 0.36072278022766113, "learning_rate": 0.0001047204, "loss": 0.2719, "step": 238200 }, { "epoch": 0.2366, "grad_norm": 0.1794702261686325, "learning_rate": 0.00010468040000000001, "loss": 0.2865, "step": 238300 }, { "epoch": 0.2368, "grad_norm": 0.3740937113761902, "learning_rate": 0.00010464039999999999, "loss": 0.2725, "step": 238400 }, { "epoch": 0.237, "grad_norm": 0.2138763815164566, "learning_rate": 0.0001046004, "loss": 0.2776, "step": 238500 }, { "epoch": 0.2372, "grad_norm": 0.21753883361816406, "learning_rate": 0.0001045604, "loss": 0.2854, "step": 238600 }, { "epoch": 0.2374, "grad_norm": 0.19092372059822083, "learning_rate": 0.00010452040000000001, "loss": 0.2785, "step": 238700 }, { "epoch": 0.2376, "grad_norm": 0.18898741900920868, "learning_rate": 0.00010448040000000002, "loss": 0.2747, "step": 238800 }, { "epoch": 0.2378, "grad_norm": 0.3324822187423706, "learning_rate": 0.00010444040000000001, "loss": 0.2685, "step": 238900 }, { "epoch": 0.238, "grad_norm": 0.24389317631721497, "learning_rate": 0.0001044004, "loss": 0.2748, "step": 239000 }, { "epoch": 0.2382, "grad_norm": 0.23654966056346893, "learning_rate": 0.0001043604, "loss": 0.2845, "step": 239100 }, { "epoch": 0.2384, "grad_norm": 0.3116486072540283, "learning_rate": 0.0001043204, "loss": 0.2732, "step": 239200 }, { "epoch": 0.2386, "grad_norm": 0.33123424649238586, "learning_rate": 0.00010428040000000001, "loss": 0.2974, "step": 239300 }, { "epoch": 0.2388, "grad_norm": 0.2512767016887665, "learning_rate": 0.00010424040000000002, "loss": 0.2799, "step": 239400 }, { "epoch": 0.239, "grad_norm": 0.1622258871793747, "learning_rate": 0.0001042004, "loss": 0.2889, "step": 239500 }, { "epoch": 0.2392, "grad_norm": 0.33309224247932434, "learning_rate": 0.0001041604, "loss": 0.2861, "step": 239600 }, { "epoch": 0.2394, "grad_norm": 0.657027542591095, "learning_rate": 0.0001041204, "loss": 0.2772, "step": 239700 }, { "epoch": 0.2396, "grad_norm": 0.33285826444625854, "learning_rate": 0.00010408040000000001, "loss": 0.2832, "step": 239800 }, { "epoch": 0.2398, "grad_norm": 0.3333481550216675, "learning_rate": 0.00010404040000000001, "loss": 0.2798, "step": 239900 }, { "epoch": 0.24, "grad_norm": 0.17029236257076263, "learning_rate": 0.0001040004, "loss": 0.2781, "step": 240000 }, { "epoch": 0.2402, "grad_norm": 0.2667362689971924, "learning_rate": 0.0001039604, "loss": 0.2745, "step": 240100 }, { "epoch": 0.2404, "grad_norm": 0.2005651295185089, "learning_rate": 0.00010392040000000001, "loss": 0.2708, "step": 240200 }, { "epoch": 0.2406, "grad_norm": 0.22746022045612335, "learning_rate": 0.00010388040000000002, "loss": 0.2826, "step": 240300 }, { "epoch": 0.2408, "grad_norm": 0.4452439546585083, "learning_rate": 0.00010384040000000001, "loss": 0.2786, "step": 240400 }, { "epoch": 0.241, "grad_norm": 0.2846798002719879, "learning_rate": 0.0001038004, "loss": 0.2759, "step": 240500 }, { "epoch": 0.2412, "grad_norm": 0.5280228853225708, "learning_rate": 0.0001037604, "loss": 0.2831, "step": 240600 }, { "epoch": 0.2414, "grad_norm": 0.18407422304153442, "learning_rate": 0.0001037204, "loss": 0.2861, "step": 240700 }, { "epoch": 0.2416, "grad_norm": 0.27900245785713196, "learning_rate": 0.00010368040000000001, "loss": 0.2761, "step": 240800 }, { "epoch": 0.2418, "grad_norm": 0.4371165931224823, "learning_rate": 0.00010364040000000002, "loss": 0.2741, "step": 240900 }, { "epoch": 0.242, "grad_norm": 0.1729346215724945, "learning_rate": 0.0001036004, "loss": 0.2749, "step": 241000 }, { "epoch": 0.2422, "grad_norm": 0.3014446794986725, "learning_rate": 0.0001035604, "loss": 0.2757, "step": 241100 }, { "epoch": 0.2424, "grad_norm": 0.2890118956565857, "learning_rate": 0.00010352040000000001, "loss": 0.2748, "step": 241200 }, { "epoch": 0.2426, "grad_norm": 0.18363769352436066, "learning_rate": 0.0001034804, "loss": 0.2786, "step": 241300 }, { "epoch": 0.2428, "grad_norm": 0.23119623959064484, "learning_rate": 0.00010344040000000001, "loss": 0.2763, "step": 241400 }, { "epoch": 0.243, "grad_norm": 0.23859907686710358, "learning_rate": 0.0001034004, "loss": 0.2766, "step": 241500 }, { "epoch": 0.2432, "grad_norm": 0.2567295432090759, "learning_rate": 0.0001033604, "loss": 0.2733, "step": 241600 }, { "epoch": 0.2434, "grad_norm": 0.3161749243736267, "learning_rate": 0.00010332040000000001, "loss": 0.2697, "step": 241700 }, { "epoch": 0.2436, "grad_norm": 0.31291186809539795, "learning_rate": 0.00010328040000000001, "loss": 0.2612, "step": 241800 }, { "epoch": 0.2438, "grad_norm": 0.23509889841079712, "learning_rate": 0.00010324040000000001, "loss": 0.2825, "step": 241900 }, { "epoch": 0.244, "grad_norm": 0.21681292355060577, "learning_rate": 0.0001032004, "loss": 0.2797, "step": 242000 }, { "epoch": 0.2442, "grad_norm": 0.2644917070865631, "learning_rate": 0.0001031604, "loss": 0.271, "step": 242100 }, { "epoch": 0.2444, "grad_norm": 0.536569356918335, "learning_rate": 0.0001031204, "loss": 0.2731, "step": 242200 }, { "epoch": 0.2446, "grad_norm": 0.44594717025756836, "learning_rate": 0.00010308040000000001, "loss": 0.2774, "step": 242300 }, { "epoch": 0.2448, "grad_norm": 0.309821754693985, "learning_rate": 0.00010304040000000002, "loss": 0.2899, "step": 242400 }, { "epoch": 0.245, "grad_norm": 0.28232014179229736, "learning_rate": 0.0001030004, "loss": 0.2944, "step": 242500 }, { "epoch": 0.2452, "grad_norm": 0.29917338490486145, "learning_rate": 0.0001029604, "loss": 0.2795, "step": 242600 }, { "epoch": 0.2454, "grad_norm": 0.23704950511455536, "learning_rate": 0.00010292040000000001, "loss": 0.2796, "step": 242700 }, { "epoch": 0.2456, "grad_norm": 0.21638484299182892, "learning_rate": 0.0001028804, "loss": 0.2636, "step": 242800 }, { "epoch": 0.2458, "grad_norm": 0.18547122180461884, "learning_rate": 0.00010284040000000001, "loss": 0.2669, "step": 242900 }, { "epoch": 0.246, "grad_norm": 0.17270751297473907, "learning_rate": 0.00010280039999999999, "loss": 0.2593, "step": 243000 }, { "epoch": 0.2462, "grad_norm": 0.16788358986377716, "learning_rate": 0.0001027604, "loss": 0.2592, "step": 243100 }, { "epoch": 0.2464, "grad_norm": 0.1689780205488205, "learning_rate": 0.0001027204, "loss": 0.2647, "step": 243200 }, { "epoch": 0.2466, "grad_norm": 0.28541648387908936, "learning_rate": 0.00010268040000000001, "loss": 0.2598, "step": 243300 }, { "epoch": 0.2468, "grad_norm": 0.24988971650600433, "learning_rate": 0.00010264040000000002, "loss": 0.2606, "step": 243400 }, { "epoch": 0.247, "grad_norm": 0.17981068789958954, "learning_rate": 0.0001026004, "loss": 0.2581, "step": 243500 }, { "epoch": 0.2472, "grad_norm": 0.19057908654212952, "learning_rate": 0.0001025604, "loss": 0.2645, "step": 243600 }, { "epoch": 0.2474, "grad_norm": 0.1446358561515808, "learning_rate": 0.0001025204, "loss": 0.2513, "step": 243700 }, { "epoch": 0.2476, "grad_norm": 0.18903489410877228, "learning_rate": 0.00010248040000000001, "loss": 0.2577, "step": 243800 }, { "epoch": 0.2478, "grad_norm": 0.18427778780460358, "learning_rate": 0.00010244040000000002, "loss": 0.2549, "step": 243900 }, { "epoch": 0.248, "grad_norm": 0.17634162306785583, "learning_rate": 0.0001024004, "loss": 0.2544, "step": 244000 }, { "epoch": 0.2482, "grad_norm": 0.15997561812400818, "learning_rate": 0.0001023604, "loss": 0.257, "step": 244100 }, { "epoch": 0.2484, "grad_norm": 0.13847199082374573, "learning_rate": 0.00010232040000000001, "loss": 0.2518, "step": 244200 }, { "epoch": 0.2486, "grad_norm": 0.19378510117530823, "learning_rate": 0.0001022804, "loss": 0.2608, "step": 244300 }, { "epoch": 0.2488, "grad_norm": 0.2129637449979782, "learning_rate": 0.00010224040000000001, "loss": 0.2586, "step": 244400 }, { "epoch": 0.249, "grad_norm": 0.28681081533432007, "learning_rate": 0.00010220039999999999, "loss": 0.259, "step": 244500 }, { "epoch": 0.2492, "grad_norm": 0.20721031725406647, "learning_rate": 0.0001021604, "loss": 0.2585, "step": 244600 }, { "epoch": 0.2494, "grad_norm": 0.2532545328140259, "learning_rate": 0.0001021204, "loss": 0.2532, "step": 244700 }, { "epoch": 0.2496, "grad_norm": 0.28852516412734985, "learning_rate": 0.00010208040000000001, "loss": 0.255, "step": 244800 }, { "epoch": 0.2498, "grad_norm": 0.1656378209590912, "learning_rate": 0.00010204040000000002, "loss": 0.2537, "step": 244900 }, { "epoch": 0.25, "grad_norm": 0.2153751105070114, "learning_rate": 0.0001020004, "loss": 0.259, "step": 245000 }, { "epoch": 0.2502, "grad_norm": 0.19507664442062378, "learning_rate": 0.0001019604, "loss": 0.2587, "step": 245100 }, { "epoch": 0.2504, "grad_norm": 0.1736658811569214, "learning_rate": 0.0001019204, "loss": 0.2612, "step": 245200 }, { "epoch": 0.2506, "grad_norm": 0.18013814091682434, "learning_rate": 0.00010188040000000001, "loss": 0.2554, "step": 245300 }, { "epoch": 0.2508, "grad_norm": 0.2120230495929718, "learning_rate": 0.00010184040000000002, "loss": 0.2519, "step": 245400 }, { "epoch": 0.251, "grad_norm": 0.16198742389678955, "learning_rate": 0.0001018004, "loss": 0.254, "step": 245500 }, { "epoch": 0.2512, "grad_norm": 0.16579264402389526, "learning_rate": 0.0001017604, "loss": 0.258, "step": 245600 }, { "epoch": 0.2514, "grad_norm": 0.23009906709194183, "learning_rate": 0.00010172040000000001, "loss": 0.254, "step": 245700 }, { "epoch": 0.2516, "grad_norm": 0.15772415697574615, "learning_rate": 0.0001016804, "loss": 0.2532, "step": 245800 }, { "epoch": 0.2518, "grad_norm": 0.20444506406784058, "learning_rate": 0.00010164040000000001, "loss": 0.2524, "step": 245900 }, { "epoch": 0.252, "grad_norm": 0.19540543854236603, "learning_rate": 0.00010160039999999999, "loss": 0.2538, "step": 246000 }, { "epoch": 0.2522, "grad_norm": 0.17972715198993683, "learning_rate": 0.0001015604, "loss": 0.259, "step": 246100 }, { "epoch": 0.2524, "grad_norm": 0.17533235251903534, "learning_rate": 0.0001015204, "loss": 0.2564, "step": 246200 }, { "epoch": 0.2526, "grad_norm": 0.1778615266084671, "learning_rate": 0.00010148040000000001, "loss": 0.2579, "step": 246300 }, { "epoch": 0.2528, "grad_norm": 0.27196893095970154, "learning_rate": 0.00010144040000000002, "loss": 0.2631, "step": 246400 }, { "epoch": 0.253, "grad_norm": 0.20036780834197998, "learning_rate": 0.0001014004, "loss": 0.2619, "step": 246500 }, { "epoch": 0.2532, "grad_norm": 0.18973121047019958, "learning_rate": 0.0001013604, "loss": 0.2637, "step": 246600 }, { "epoch": 0.2534, "grad_norm": 0.17328648269176483, "learning_rate": 0.0001013204, "loss": 0.2516, "step": 246700 }, { "epoch": 0.2536, "grad_norm": 0.1752815842628479, "learning_rate": 0.00010128040000000001, "loss": 0.2572, "step": 246800 }, { "epoch": 0.2538, "grad_norm": 0.24637199938297272, "learning_rate": 0.00010124040000000001, "loss": 0.2543, "step": 246900 }, { "epoch": 0.254, "grad_norm": 0.19035212695598602, "learning_rate": 0.0001012004, "loss": 0.2513, "step": 247000 }, { "epoch": 0.2542, "grad_norm": 0.14904499053955078, "learning_rate": 0.0001011604, "loss": 0.2579, "step": 247100 }, { "epoch": 0.2544, "grad_norm": 0.6997934579849243, "learning_rate": 0.00010112040000000001, "loss": 0.2545, "step": 247200 }, { "epoch": 0.2546, "grad_norm": 0.1693519502878189, "learning_rate": 0.0001010804, "loss": 0.2575, "step": 247300 }, { "epoch": 0.2548, "grad_norm": 0.1666778028011322, "learning_rate": 0.00010104040000000001, "loss": 0.2645, "step": 247400 }, { "epoch": 0.255, "grad_norm": 0.24935069680213928, "learning_rate": 0.00010100039999999999, "loss": 0.2571, "step": 247500 }, { "epoch": 0.2552, "grad_norm": 0.1595294177532196, "learning_rate": 0.0001009604, "loss": 0.2556, "step": 247600 }, { "epoch": 0.2554, "grad_norm": 0.1594139188528061, "learning_rate": 0.0001009204, "loss": 0.2612, "step": 247700 }, { "epoch": 0.2556, "grad_norm": 0.2499750703573227, "learning_rate": 0.00010088040000000001, "loss": 0.2679, "step": 247800 }, { "epoch": 0.2558, "grad_norm": 0.1567823588848114, "learning_rate": 0.00010084040000000002, "loss": 0.2573, "step": 247900 }, { "epoch": 0.256, "grad_norm": 0.17192710936069489, "learning_rate": 0.0001008004, "loss": 0.2543, "step": 248000 }, { "epoch": 0.2562, "grad_norm": 0.18040655553340912, "learning_rate": 0.0001007604, "loss": 0.2632, "step": 248100 }, { "epoch": 0.2564, "grad_norm": 0.29884371161460876, "learning_rate": 0.0001007204, "loss": 0.2545, "step": 248200 }, { "epoch": 0.2566, "grad_norm": 0.2246190458536148, "learning_rate": 0.0001006804, "loss": 0.2507, "step": 248300 }, { "epoch": 0.2568, "grad_norm": 0.21513308584690094, "learning_rate": 0.00010064040000000001, "loss": 0.2517, "step": 248400 }, { "epoch": 0.257, "grad_norm": 0.17849339544773102, "learning_rate": 0.00010060039999999999, "loss": 0.253, "step": 248500 }, { "epoch": 0.2572, "grad_norm": 0.2598421573638916, "learning_rate": 0.0001005604, "loss": 0.2546, "step": 248600 }, { "epoch": 0.2574, "grad_norm": 0.21260765194892883, "learning_rate": 0.00010052040000000001, "loss": 0.2523, "step": 248700 }, { "epoch": 0.2576, "grad_norm": 0.9292067289352417, "learning_rate": 0.00010048040000000001, "loss": 0.2514, "step": 248800 }, { "epoch": 0.2578, "grad_norm": 0.1865740567445755, "learning_rate": 0.00010044040000000001, "loss": 0.2583, "step": 248900 }, { "epoch": 0.258, "grad_norm": 0.22997444868087769, "learning_rate": 0.00010040039999999999, "loss": 0.2542, "step": 249000 }, { "epoch": 0.2582, "grad_norm": 0.23303471505641937, "learning_rate": 0.0001003604, "loss": 0.2538, "step": 249100 }, { "epoch": 0.2584, "grad_norm": 0.26385053992271423, "learning_rate": 0.0001003204, "loss": 0.254, "step": 249200 }, { "epoch": 0.2586, "grad_norm": 0.36450210213661194, "learning_rate": 0.00010028040000000001, "loss": 0.261, "step": 249300 }, { "epoch": 0.2588, "grad_norm": 0.20195017755031586, "learning_rate": 0.00010024040000000002, "loss": 0.2497, "step": 249400 }, { "epoch": 0.259, "grad_norm": 0.24559420347213745, "learning_rate": 0.0001002004, "loss": 0.2556, "step": 249500 }, { "epoch": 0.2592, "grad_norm": 0.42150649428367615, "learning_rate": 0.0001001604, "loss": 0.257, "step": 249600 }, { "epoch": 0.2594, "grad_norm": 0.19844648241996765, "learning_rate": 0.0001001204, "loss": 0.2574, "step": 249700 }, { "epoch": 0.2596, "grad_norm": 0.18794132769107819, "learning_rate": 0.0001000804, "loss": 0.2507, "step": 249800 }, { "epoch": 0.2598, "grad_norm": 0.17367449402809143, "learning_rate": 0.00010004040000000001, "loss": 0.2526, "step": 249900 }, { "epoch": 0.26, "grad_norm": 0.23622073233127594, "learning_rate": 0.00010000039999999999, "loss": 0.2523, "step": 250000 }, { "epoch": 0.2602, "grad_norm": 0.1616039127111435, "learning_rate": 9.996040000000001e-05, "loss": 0.2555, "step": 250100 }, { "epoch": 0.2604, "grad_norm": 0.1847952902317047, "learning_rate": 9.99204e-05, "loss": 0.2545, "step": 250200 }, { "epoch": 0.2606, "grad_norm": 0.2138368785381317, "learning_rate": 9.988040000000001e-05, "loss": 0.2485, "step": 250300 }, { "epoch": 0.2608, "grad_norm": 0.17640897631645203, "learning_rate": 9.984040000000001e-05, "loss": 0.2585, "step": 250400 }, { "epoch": 0.261, "grad_norm": 0.20163944363594055, "learning_rate": 9.98004e-05, "loss": 0.2486, "step": 250500 }, { "epoch": 0.2612, "grad_norm": 0.23935359716415405, "learning_rate": 9.976040000000001e-05, "loss": 0.2603, "step": 250600 }, { "epoch": 0.2614, "grad_norm": 0.14974956214427948, "learning_rate": 9.97204e-05, "loss": 0.2557, "step": 250700 }, { "epoch": 0.2616, "grad_norm": 0.17297634482383728, "learning_rate": 9.968040000000001e-05, "loss": 0.2562, "step": 250800 }, { "epoch": 0.2618, "grad_norm": 0.31046923995018005, "learning_rate": 9.96404e-05, "loss": 0.2569, "step": 250900 }, { "epoch": 0.262, "grad_norm": 0.1529819816350937, "learning_rate": 9.960040000000001e-05, "loss": 0.2559, "step": 251000 }, { "epoch": 0.2622, "grad_norm": 0.22081358730793, "learning_rate": 9.95604e-05, "loss": 0.2595, "step": 251100 }, { "epoch": 0.2624, "grad_norm": 0.2596571445465088, "learning_rate": 9.95204e-05, "loss": 0.2621, "step": 251200 }, { "epoch": 0.2626, "grad_norm": 0.20315712690353394, "learning_rate": 9.94804e-05, "loss": 0.2556, "step": 251300 }, { "epoch": 0.2628, "grad_norm": 0.16273678839206696, "learning_rate": 9.94404e-05, "loss": 0.2538, "step": 251400 }, { "epoch": 0.263, "grad_norm": 0.3179258108139038, "learning_rate": 9.94004e-05, "loss": 0.2547, "step": 251500 }, { "epoch": 0.2632, "grad_norm": 0.18335247039794922, "learning_rate": 9.936040000000001e-05, "loss": 0.275, "step": 251600 }, { "epoch": 0.2634, "grad_norm": 0.19399204850196838, "learning_rate": 9.93204e-05, "loss": 0.253, "step": 251700 }, { "epoch": 0.2636, "grad_norm": 0.16826485097408295, "learning_rate": 9.928040000000001e-05, "loss": 0.2511, "step": 251800 }, { "epoch": 0.2638, "grad_norm": 0.16970914602279663, "learning_rate": 9.92404e-05, "loss": 0.2485, "step": 251900 }, { "epoch": 0.264, "grad_norm": 0.19796758890151978, "learning_rate": 9.92004e-05, "loss": 0.25, "step": 252000 }, { "epoch": 0.2642, "grad_norm": 0.23124122619628906, "learning_rate": 9.916040000000001e-05, "loss": 0.2517, "step": 252100 }, { "epoch": 0.2644, "grad_norm": 0.18028624355793, "learning_rate": 9.91204e-05, "loss": 0.2549, "step": 252200 }, { "epoch": 0.2646, "grad_norm": 0.24230654537677765, "learning_rate": 9.908040000000001e-05, "loss": 0.2511, "step": 252300 }, { "epoch": 0.2648, "grad_norm": 0.16083365678787231, "learning_rate": 9.90404e-05, "loss": 0.2491, "step": 252400 }, { "epoch": 0.265, "grad_norm": 0.3353089392185211, "learning_rate": 9.900040000000001e-05, "loss": 0.2586, "step": 252500 }, { "epoch": 0.2652, "grad_norm": 0.29795363545417786, "learning_rate": 9.89604e-05, "loss": 0.2581, "step": 252600 }, { "epoch": 0.2654, "grad_norm": 0.18846677243709564, "learning_rate": 9.89204e-05, "loss": 0.2596, "step": 252700 }, { "epoch": 0.2656, "grad_norm": 0.16122601926326752, "learning_rate": 9.88804e-05, "loss": 0.2517, "step": 252800 }, { "epoch": 0.2658, "grad_norm": 0.19372184574604034, "learning_rate": 9.88404e-05, "loss": 0.2474, "step": 252900 }, { "epoch": 0.266, "grad_norm": 0.1936195194721222, "learning_rate": 9.88004e-05, "loss": 0.2546, "step": 253000 }, { "epoch": 0.2662, "grad_norm": 0.2815191149711609, "learning_rate": 9.876040000000001e-05, "loss": 0.2507, "step": 253100 }, { "epoch": 0.2664, "grad_norm": 0.16391977667808533, "learning_rate": 9.87204e-05, "loss": 0.2538, "step": 253200 }, { "epoch": 0.2666, "grad_norm": 0.17948247492313385, "learning_rate": 9.868040000000001e-05, "loss": 0.253, "step": 253300 }, { "epoch": 0.2668, "grad_norm": 0.18989241123199463, "learning_rate": 9.86404e-05, "loss": 0.2547, "step": 253400 }, { "epoch": 0.267, "grad_norm": 0.20617620646953583, "learning_rate": 9.86004e-05, "loss": 0.2549, "step": 253500 }, { "epoch": 0.2672, "grad_norm": 0.2014150768518448, "learning_rate": 9.85604e-05, "loss": 0.2588, "step": 253600 }, { "epoch": 0.2674, "grad_norm": 0.22781629860401154, "learning_rate": 9.85204e-05, "loss": 0.2625, "step": 253700 }, { "epoch": 0.2676, "grad_norm": 0.179660364985466, "learning_rate": 9.84804e-05, "loss": 0.2547, "step": 253800 }, { "epoch": 0.2678, "grad_norm": 0.18104757368564606, "learning_rate": 9.84404e-05, "loss": 0.2569, "step": 253900 }, { "epoch": 0.268, "grad_norm": 0.42554691433906555, "learning_rate": 9.840040000000001e-05, "loss": 0.2606, "step": 254000 }, { "epoch": 0.2682, "grad_norm": 0.17360180616378784, "learning_rate": 9.836040000000001e-05, "loss": 0.2583, "step": 254100 }, { "epoch": 0.2684, "grad_norm": 0.21442458033561707, "learning_rate": 9.83204e-05, "loss": 0.2525, "step": 254200 }, { "epoch": 0.2686, "grad_norm": 0.17544832825660706, "learning_rate": 9.82804e-05, "loss": 0.2512, "step": 254300 }, { "epoch": 0.2688, "grad_norm": 0.18496903777122498, "learning_rate": 9.824040000000001e-05, "loss": 0.2501, "step": 254400 }, { "epoch": 0.269, "grad_norm": 0.20425285398960114, "learning_rate": 9.82004e-05, "loss": 0.2529, "step": 254500 }, { "epoch": 0.2692, "grad_norm": 0.16543030738830566, "learning_rate": 9.816040000000001e-05, "loss": 0.2572, "step": 254600 }, { "epoch": 0.2694, "grad_norm": 0.18236897885799408, "learning_rate": 9.81204e-05, "loss": 0.2546, "step": 254700 }, { "epoch": 0.2696, "grad_norm": 0.35450029373168945, "learning_rate": 9.808040000000001e-05, "loss": 0.2519, "step": 254800 }, { "epoch": 0.2698, "grad_norm": 0.1963033676147461, "learning_rate": 9.80404e-05, "loss": 0.274, "step": 254900 }, { "epoch": 0.27, "grad_norm": 0.24699929356575012, "learning_rate": 9.80004e-05, "loss": 0.2546, "step": 255000 }, { "epoch": 0.2702, "grad_norm": 0.2196625918149948, "learning_rate": 9.79604e-05, "loss": 0.2565, "step": 255100 }, { "epoch": 0.2704, "grad_norm": 0.2058703750371933, "learning_rate": 9.79204e-05, "loss": 0.2576, "step": 255200 }, { "epoch": 0.2706, "grad_norm": 0.16371659934520721, "learning_rate": 9.78804e-05, "loss": 0.2534, "step": 255300 }, { "epoch": 0.2708, "grad_norm": 0.249703511595726, "learning_rate": 9.784040000000001e-05, "loss": 0.251, "step": 255400 }, { "epoch": 0.271, "grad_norm": 0.18516069650650024, "learning_rate": 9.78004e-05, "loss": 0.2562, "step": 255500 }, { "epoch": 0.2712, "grad_norm": 0.17773735523223877, "learning_rate": 9.776040000000001e-05, "loss": 0.2583, "step": 255600 }, { "epoch": 0.2714, "grad_norm": 0.18748611211776733, "learning_rate": 9.772040000000001e-05, "loss": 0.2588, "step": 255700 }, { "epoch": 0.2716, "grad_norm": 0.16686369478702545, "learning_rate": 9.76804e-05, "loss": 0.2603, "step": 255800 }, { "epoch": 0.2718, "grad_norm": 0.17523817718029022, "learning_rate": 9.764040000000001e-05, "loss": 0.2466, "step": 255900 }, { "epoch": 0.272, "grad_norm": 0.2433302402496338, "learning_rate": 9.76004e-05, "loss": 0.2528, "step": 256000 }, { "epoch": 0.2722, "grad_norm": 0.14348742365837097, "learning_rate": 9.756040000000001e-05, "loss": 0.2494, "step": 256100 }, { "epoch": 0.2724, "grad_norm": 0.2618894577026367, "learning_rate": 9.75204e-05, "loss": 0.2606, "step": 256200 }, { "epoch": 0.2726, "grad_norm": 0.2586542069911957, "learning_rate": 9.748040000000001e-05, "loss": 0.2476, "step": 256300 }, { "epoch": 0.2728, "grad_norm": 0.17183682322502136, "learning_rate": 9.74404e-05, "loss": 0.253, "step": 256400 }, { "epoch": 0.273, "grad_norm": 0.15170541405677795, "learning_rate": 9.74004e-05, "loss": 0.253, "step": 256500 }, { "epoch": 0.2732, "grad_norm": 0.20044559240341187, "learning_rate": 9.73604e-05, "loss": 0.2594, "step": 256600 }, { "epoch": 0.2734, "grad_norm": 0.1726810783147812, "learning_rate": 9.73204e-05, "loss": 0.2544, "step": 256700 }, { "epoch": 0.2736, "grad_norm": 0.17345348000526428, "learning_rate": 9.72804e-05, "loss": 0.2543, "step": 256800 }, { "epoch": 0.2738, "grad_norm": 0.1733126938343048, "learning_rate": 9.724040000000001e-05, "loss": 0.2516, "step": 256900 }, { "epoch": 0.274, "grad_norm": 0.2917121946811676, "learning_rate": 9.72004e-05, "loss": 0.2605, "step": 257000 }, { "epoch": 0.2742, "grad_norm": 0.1882348358631134, "learning_rate": 9.716040000000001e-05, "loss": 0.2584, "step": 257100 }, { "epoch": 0.2744, "grad_norm": 0.1900627166032791, "learning_rate": 9.71204e-05, "loss": 0.2614, "step": 257200 }, { "epoch": 0.2746, "grad_norm": 0.21451936662197113, "learning_rate": 9.70804e-05, "loss": 0.255, "step": 257300 }, { "epoch": 0.2748, "grad_norm": 0.24926543235778809, "learning_rate": 9.704040000000001e-05, "loss": 0.2502, "step": 257400 }, { "epoch": 0.275, "grad_norm": 0.19679979979991913, "learning_rate": 9.70004e-05, "loss": 0.2504, "step": 257500 }, { "epoch": 0.2752, "grad_norm": 0.27472445368766785, "learning_rate": 9.696040000000001e-05, "loss": 0.2499, "step": 257600 }, { "epoch": 0.2754, "grad_norm": 0.1651807725429535, "learning_rate": 9.69204e-05, "loss": 0.2574, "step": 257700 }, { "epoch": 0.2756, "grad_norm": 0.2149466574192047, "learning_rate": 9.688040000000001e-05, "loss": 0.2466, "step": 257800 }, { "epoch": 0.2758, "grad_norm": 0.21232870221138, "learning_rate": 9.684040000000002e-05, "loss": 0.2539, "step": 257900 }, { "epoch": 0.276, "grad_norm": 0.20928317308425903, "learning_rate": 9.68004e-05, "loss": 0.2496, "step": 258000 }, { "epoch": 0.2762, "grad_norm": 0.2095847725868225, "learning_rate": 9.67604e-05, "loss": 0.2552, "step": 258100 }, { "epoch": 0.2764, "grad_norm": 0.1795085370540619, "learning_rate": 9.67204e-05, "loss": 0.2495, "step": 258200 }, { "epoch": 0.2766, "grad_norm": 0.17618554830551147, "learning_rate": 9.66804e-05, "loss": 0.252, "step": 258300 }, { "epoch": 0.2768, "grad_norm": 0.16858777403831482, "learning_rate": 9.664040000000001e-05, "loss": 0.2585, "step": 258400 }, { "epoch": 0.277, "grad_norm": 0.21656472980976105, "learning_rate": 9.66004e-05, "loss": 0.2498, "step": 258500 }, { "epoch": 0.2772, "grad_norm": 0.18165180087089539, "learning_rate": 9.656040000000001e-05, "loss": 0.2529, "step": 258600 }, { "epoch": 0.2774, "grad_norm": 0.2290857881307602, "learning_rate": 9.65204e-05, "loss": 0.25, "step": 258700 }, { "epoch": 0.2776, "grad_norm": 0.1842830330133438, "learning_rate": 9.64804e-05, "loss": 0.2526, "step": 258800 }, { "epoch": 0.2778, "grad_norm": 0.16289664804935455, "learning_rate": 9.64404e-05, "loss": 0.2546, "step": 258900 }, { "epoch": 0.278, "grad_norm": 0.22200186550617218, "learning_rate": 9.64004e-05, "loss": 0.2487, "step": 259000 }, { "epoch": 0.2782, "grad_norm": 0.15263180434703827, "learning_rate": 9.63604e-05, "loss": 0.2532, "step": 259100 }, { "epoch": 0.2784, "grad_norm": 0.19079795479774475, "learning_rate": 9.63204e-05, "loss": 0.2463, "step": 259200 }, { "epoch": 0.2786, "grad_norm": 0.21255415678024292, "learning_rate": 9.628040000000001e-05, "loss": 0.2485, "step": 259300 }, { "epoch": 0.2788, "grad_norm": 0.15365895628929138, "learning_rate": 9.624040000000001e-05, "loss": 0.2472, "step": 259400 }, { "epoch": 0.279, "grad_norm": 0.1517648547887802, "learning_rate": 9.620040000000001e-05, "loss": 0.2538, "step": 259500 }, { "epoch": 0.2792, "grad_norm": 0.2428562343120575, "learning_rate": 9.61604e-05, "loss": 0.2455, "step": 259600 }, { "epoch": 0.2794, "grad_norm": 0.15415999293327332, "learning_rate": 9.61204e-05, "loss": 0.2472, "step": 259700 }, { "epoch": 0.2796, "grad_norm": 0.19108308851718903, "learning_rate": 9.60804e-05, "loss": 0.2482, "step": 259800 }, { "epoch": 0.2798, "grad_norm": 0.1723984181880951, "learning_rate": 9.604040000000001e-05, "loss": 0.2502, "step": 259900 }, { "epoch": 0.28, "grad_norm": 0.24995626509189606, "learning_rate": 9.60004e-05, "loss": 0.2578, "step": 260000 }, { "epoch": 0.2802, "grad_norm": 0.17275862395763397, "learning_rate": 9.596040000000001e-05, "loss": 0.2583, "step": 260100 }, { "epoch": 0.2804, "grad_norm": 0.20496828854084015, "learning_rate": 9.59204e-05, "loss": 0.2502, "step": 260200 }, { "epoch": 0.2806, "grad_norm": 0.15254418551921844, "learning_rate": 9.58804e-05, "loss": 0.2453, "step": 260300 }, { "epoch": 0.2808, "grad_norm": 0.22347435355186462, "learning_rate": 9.58404e-05, "loss": 0.2655, "step": 260400 }, { "epoch": 0.281, "grad_norm": 0.27912843227386475, "learning_rate": 9.58004e-05, "loss": 0.2595, "step": 260500 }, { "epoch": 0.2812, "grad_norm": 0.17391923069953918, "learning_rate": 9.57604e-05, "loss": 0.2441, "step": 260600 }, { "epoch": 0.2814, "grad_norm": 0.20242680609226227, "learning_rate": 9.572040000000001e-05, "loss": 0.2513, "step": 260700 }, { "epoch": 0.2816, "grad_norm": 0.19099730253219604, "learning_rate": 9.56804e-05, "loss": 0.2474, "step": 260800 }, { "epoch": 0.2818, "grad_norm": 0.17173656821250916, "learning_rate": 9.564040000000001e-05, "loss": 0.2468, "step": 260900 }, { "epoch": 0.282, "grad_norm": 0.2755275368690491, "learning_rate": 9.560040000000001e-05, "loss": 0.2579, "step": 261000 }, { "epoch": 0.2822, "grad_norm": 0.1447450965642929, "learning_rate": 9.55604e-05, "loss": 0.2549, "step": 261100 }, { "epoch": 0.2824, "grad_norm": 0.17367517948150635, "learning_rate": 9.552040000000001e-05, "loss": 0.2493, "step": 261200 }, { "epoch": 0.2826, "grad_norm": 0.19174928963184357, "learning_rate": 9.54804e-05, "loss": 0.2518, "step": 261300 }, { "epoch": 0.2828, "grad_norm": 0.20758305490016937, "learning_rate": 9.544040000000001e-05, "loss": 0.2522, "step": 261400 }, { "epoch": 0.283, "grad_norm": 0.172040194272995, "learning_rate": 9.54004e-05, "loss": 0.2527, "step": 261500 }, { "epoch": 0.2832, "grad_norm": 0.19083595275878906, "learning_rate": 9.536040000000001e-05, "loss": 0.2439, "step": 261600 }, { "epoch": 0.2834, "grad_norm": 0.161021426320076, "learning_rate": 9.53204e-05, "loss": 0.2529, "step": 261700 }, { "epoch": 0.2836, "grad_norm": 0.1945277452468872, "learning_rate": 9.52804e-05, "loss": 0.2508, "step": 261800 }, { "epoch": 0.2838, "grad_norm": 0.18637241423130035, "learning_rate": 9.52404e-05, "loss": 0.2504, "step": 261900 }, { "epoch": 0.284, "grad_norm": 0.6245195865631104, "learning_rate": 9.52004e-05, "loss": 0.2512, "step": 262000 }, { "epoch": 0.2842, "grad_norm": 0.26768702268600464, "learning_rate": 9.51604e-05, "loss": 0.2504, "step": 262100 }, { "epoch": 0.2844, "grad_norm": 0.19558322429656982, "learning_rate": 9.512040000000001e-05, "loss": 0.2543, "step": 262200 }, { "epoch": 0.2846, "grad_norm": 0.19235068559646606, "learning_rate": 9.50804e-05, "loss": 0.2491, "step": 262300 }, { "epoch": 0.2848, "grad_norm": 0.19615627825260162, "learning_rate": 9.504040000000001e-05, "loss": 0.2486, "step": 262400 }, { "epoch": 0.285, "grad_norm": 0.4205188751220703, "learning_rate": 9.50004e-05, "loss": 0.2507, "step": 262500 }, { "epoch": 0.2852, "grad_norm": 0.2430608868598938, "learning_rate": 9.49604e-05, "loss": 0.2509, "step": 262600 }, { "epoch": 0.2854, "grad_norm": 0.17347165942192078, "learning_rate": 9.492040000000001e-05, "loss": 0.2495, "step": 262700 }, { "epoch": 0.2856, "grad_norm": 0.256759375333786, "learning_rate": 9.48804e-05, "loss": 0.2503, "step": 262800 }, { "epoch": 0.2858, "grad_norm": 0.16245199739933014, "learning_rate": 9.484040000000001e-05, "loss": 0.2465, "step": 262900 }, { "epoch": 0.286, "grad_norm": 0.17991042137145996, "learning_rate": 9.48004e-05, "loss": 0.2503, "step": 263000 }, { "epoch": 0.2862, "grad_norm": 0.1637287586927414, "learning_rate": 9.476040000000001e-05, "loss": 0.2472, "step": 263100 }, { "epoch": 0.2864, "grad_norm": 0.2055928260087967, "learning_rate": 9.472040000000002e-05, "loss": 0.2495, "step": 263200 }, { "epoch": 0.2866, "grad_norm": 0.19248582422733307, "learning_rate": 9.468040000000001e-05, "loss": 0.2465, "step": 263300 }, { "epoch": 0.2868, "grad_norm": 0.19688032567501068, "learning_rate": 9.46404e-05, "loss": 0.2486, "step": 263400 }, { "epoch": 0.287, "grad_norm": 0.1787942796945572, "learning_rate": 9.46004e-05, "loss": 0.2483, "step": 263500 }, { "epoch": 0.2872, "grad_norm": 0.18415433168411255, "learning_rate": 9.45604e-05, "loss": 0.2566, "step": 263600 }, { "epoch": 0.2874, "grad_norm": 0.19202789664268494, "learning_rate": 9.452040000000001e-05, "loss": 0.2498, "step": 263700 }, { "epoch": 0.2876, "grad_norm": 0.27513137459754944, "learning_rate": 9.44804e-05, "loss": 0.2508, "step": 263800 }, { "epoch": 0.2878, "grad_norm": 0.17416657507419586, "learning_rate": 9.444040000000001e-05, "loss": 0.2501, "step": 263900 }, { "epoch": 0.288, "grad_norm": 0.15521082282066345, "learning_rate": 9.44004e-05, "loss": 0.2462, "step": 264000 }, { "epoch": 0.2882, "grad_norm": 0.17850640416145325, "learning_rate": 9.43604e-05, "loss": 0.2577, "step": 264100 }, { "epoch": 0.2884, "grad_norm": 0.28766483068466187, "learning_rate": 9.43204e-05, "loss": 0.2539, "step": 264200 }, { "epoch": 0.2886, "grad_norm": 0.1784902662038803, "learning_rate": 9.42804e-05, "loss": 0.2531, "step": 264300 }, { "epoch": 0.2888, "grad_norm": 0.19980043172836304, "learning_rate": 9.42404e-05, "loss": 0.2467, "step": 264400 }, { "epoch": 0.289, "grad_norm": 1.2848162651062012, "learning_rate": 9.42004e-05, "loss": 0.2493, "step": 264500 }, { "epoch": 0.2892, "grad_norm": 0.20343329012393951, "learning_rate": 9.416040000000001e-05, "loss": 0.248, "step": 264600 }, { "epoch": 0.2894, "grad_norm": 0.22436699271202087, "learning_rate": 9.412040000000001e-05, "loss": 0.2492, "step": 264700 }, { "epoch": 0.2896, "grad_norm": 0.19231826066970825, "learning_rate": 9.408040000000001e-05, "loss": 0.2457, "step": 264800 }, { "epoch": 0.2898, "grad_norm": 0.20982171595096588, "learning_rate": 9.40404e-05, "loss": 0.2461, "step": 264900 }, { "epoch": 0.29, "grad_norm": 0.25590869784355164, "learning_rate": 9.40004e-05, "loss": 0.2511, "step": 265000 }, { "epoch": 0.2902, "grad_norm": 0.19312362372875214, "learning_rate": 9.39604e-05, "loss": 0.2466, "step": 265100 }, { "epoch": 0.2904, "grad_norm": 0.23638297617435455, "learning_rate": 9.392040000000001e-05, "loss": 0.2466, "step": 265200 }, { "epoch": 0.2906, "grad_norm": 0.16785204410552979, "learning_rate": 9.38804e-05, "loss": 0.2469, "step": 265300 }, { "epoch": 0.2908, "grad_norm": 0.2154064029455185, "learning_rate": 9.384040000000001e-05, "loss": 0.2706, "step": 265400 }, { "epoch": 0.291, "grad_norm": 0.18391206860542297, "learning_rate": 9.38004e-05, "loss": 0.243, "step": 265500 }, { "epoch": 0.2912, "grad_norm": 0.20186170935630798, "learning_rate": 9.37604e-05, "loss": 0.2555, "step": 265600 }, { "epoch": 0.2914, "grad_norm": 0.19941025972366333, "learning_rate": 9.37204e-05, "loss": 0.248, "step": 265700 }, { "epoch": 0.2916, "grad_norm": 0.18308115005493164, "learning_rate": 9.36804e-05, "loss": 0.2464, "step": 265800 }, { "epoch": 0.2918, "grad_norm": 0.24809174239635468, "learning_rate": 9.36404e-05, "loss": 0.2536, "step": 265900 }, { "epoch": 0.292, "grad_norm": 0.2352696657180786, "learning_rate": 9.36004e-05, "loss": 0.2515, "step": 266000 }, { "epoch": 0.2922, "grad_norm": 0.20043088495731354, "learning_rate": 9.35604e-05, "loss": 0.249, "step": 266100 }, { "epoch": 0.2924, "grad_norm": 0.33282342553138733, "learning_rate": 9.352040000000001e-05, "loss": 0.2588, "step": 266200 }, { "epoch": 0.2926, "grad_norm": 0.1646975576877594, "learning_rate": 9.348040000000001e-05, "loss": 0.2571, "step": 266300 }, { "epoch": 0.2928, "grad_norm": 0.2019079327583313, "learning_rate": 9.34404e-05, "loss": 0.2501, "step": 266400 }, { "epoch": 0.293, "grad_norm": 0.3303513526916504, "learning_rate": 9.34004e-05, "loss": 0.2547, "step": 266500 }, { "epoch": 0.2932, "grad_norm": 0.1550179123878479, "learning_rate": 9.33604e-05, "loss": 0.2554, "step": 266600 }, { "epoch": 0.2934, "grad_norm": 0.3752208948135376, "learning_rate": 9.332040000000001e-05, "loss": 0.2493, "step": 266700 }, { "epoch": 0.2936, "grad_norm": 0.18411359190940857, "learning_rate": 9.32804e-05, "loss": 0.2569, "step": 266800 }, { "epoch": 0.2938, "grad_norm": 0.17852115631103516, "learning_rate": 9.324040000000001e-05, "loss": 0.2479, "step": 266900 }, { "epoch": 0.294, "grad_norm": 0.17118707299232483, "learning_rate": 9.320040000000002e-05, "loss": 0.254, "step": 267000 }, { "epoch": 0.2942, "grad_norm": 0.35995370149612427, "learning_rate": 9.31604e-05, "loss": 0.2512, "step": 267100 }, { "epoch": 0.2944, "grad_norm": 0.27355167269706726, "learning_rate": 9.31204e-05, "loss": 0.2549, "step": 267200 }, { "epoch": 0.2946, "grad_norm": 0.1623479276895523, "learning_rate": 9.30804e-05, "loss": 0.2522, "step": 267300 }, { "epoch": 0.2948, "grad_norm": 0.26281481981277466, "learning_rate": 9.30404e-05, "loss": 0.2505, "step": 267400 }, { "epoch": 0.295, "grad_norm": 0.17927655577659607, "learning_rate": 9.300040000000001e-05, "loss": 0.2497, "step": 267500 }, { "epoch": 0.2952, "grad_norm": 0.1781659871339798, "learning_rate": 9.29604e-05, "loss": 0.2497, "step": 267600 }, { "epoch": 0.2954, "grad_norm": 0.16235920786857605, "learning_rate": 9.292040000000001e-05, "loss": 0.2567, "step": 267700 }, { "epoch": 0.2956, "grad_norm": 0.1638641655445099, "learning_rate": 9.28804e-05, "loss": 0.2469, "step": 267800 }, { "epoch": 0.2958, "grad_norm": 0.2075788974761963, "learning_rate": 9.28404e-05, "loss": 0.2482, "step": 267900 }, { "epoch": 0.296, "grad_norm": 0.1806693822145462, "learning_rate": 9.280040000000001e-05, "loss": 0.2516, "step": 268000 }, { "epoch": 0.2962, "grad_norm": 0.22953234612941742, "learning_rate": 9.27604e-05, "loss": 0.2631, "step": 268100 }, { "epoch": 0.2964, "grad_norm": 0.22682808339595795, "learning_rate": 9.272040000000001e-05, "loss": 0.2532, "step": 268200 }, { "epoch": 0.2966, "grad_norm": 0.3644724488258362, "learning_rate": 9.26804e-05, "loss": 0.2521, "step": 268300 }, { "epoch": 0.2968, "grad_norm": 0.18433807790279388, "learning_rate": 9.264040000000001e-05, "loss": 0.2578, "step": 268400 }, { "epoch": 0.297, "grad_norm": 0.1955367773771286, "learning_rate": 9.260040000000002e-05, "loss": 0.2493, "step": 268500 }, { "epoch": 0.2972, "grad_norm": 0.2631000578403473, "learning_rate": 9.256040000000001e-05, "loss": 0.2487, "step": 268600 }, { "epoch": 0.2974, "grad_norm": 0.24312226474285126, "learning_rate": 9.25204e-05, "loss": 0.2501, "step": 268700 }, { "epoch": 0.2976, "grad_norm": 0.19950313866138458, "learning_rate": 9.24804e-05, "loss": 0.2504, "step": 268800 }, { "epoch": 0.2978, "grad_norm": 0.21298249065876007, "learning_rate": 9.24404e-05, "loss": 0.2492, "step": 268900 }, { "epoch": 0.298, "grad_norm": 0.27059680223464966, "learning_rate": 9.240040000000001e-05, "loss": 0.2585, "step": 269000 }, { "epoch": 0.2982, "grad_norm": 0.17913617193698883, "learning_rate": 9.23604e-05, "loss": 0.2426, "step": 269100 }, { "epoch": 0.2984, "grad_norm": 0.19647064805030823, "learning_rate": 9.232040000000001e-05, "loss": 0.2485, "step": 269200 }, { "epoch": 0.2986, "grad_norm": 0.14924323558807373, "learning_rate": 9.22804e-05, "loss": 0.2514, "step": 269300 }, { "epoch": 0.2988, "grad_norm": 0.18103660643100739, "learning_rate": 9.22404e-05, "loss": 0.253, "step": 269400 }, { "epoch": 0.299, "grad_norm": 0.16981446743011475, "learning_rate": 9.22004e-05, "loss": 0.2506, "step": 269500 }, { "epoch": 0.2992, "grad_norm": 0.17313528060913086, "learning_rate": 9.21604e-05, "loss": 0.255, "step": 269600 }, { "epoch": 0.2994, "grad_norm": 0.19472962617874146, "learning_rate": 9.21204e-05, "loss": 0.2462, "step": 269700 }, { "epoch": 0.2996, "grad_norm": 0.20072679221630096, "learning_rate": 9.20804e-05, "loss": 0.2499, "step": 269800 }, { "epoch": 0.2998, "grad_norm": 0.20446142554283142, "learning_rate": 9.204040000000001e-05, "loss": 0.2484, "step": 269900 }, { "epoch": 0.3, "grad_norm": 0.23795530200004578, "learning_rate": 9.200040000000001e-05, "loss": 0.2488, "step": 270000 }, { "epoch": 0.3002, "grad_norm": 0.5230748057365417, "learning_rate": 9.196040000000001e-05, "loss": 0.2517, "step": 270100 }, { "epoch": 0.3004, "grad_norm": 0.22013844549655914, "learning_rate": 9.19204e-05, "loss": 0.2482, "step": 270200 }, { "epoch": 0.3006, "grad_norm": 0.20569878816604614, "learning_rate": 9.18804e-05, "loss": 0.2592, "step": 270300 }, { "epoch": 0.3008, "grad_norm": 0.16592691838741302, "learning_rate": 9.18404e-05, "loss": 0.2539, "step": 270400 }, { "epoch": 0.301, "grad_norm": 0.20824261009693146, "learning_rate": 9.180040000000001e-05, "loss": 0.2523, "step": 270500 }, { "epoch": 0.3012, "grad_norm": 0.21853414177894592, "learning_rate": 9.17604e-05, "loss": 0.2476, "step": 270600 }, { "epoch": 0.3014, "grad_norm": 0.15546807646751404, "learning_rate": 9.172040000000001e-05, "loss": 0.2489, "step": 270700 }, { "epoch": 0.3016, "grad_norm": 0.19030147790908813, "learning_rate": 9.16804e-05, "loss": 0.2444, "step": 270800 }, { "epoch": 0.3018, "grad_norm": 0.20716990530490875, "learning_rate": 9.16404e-05, "loss": 0.2479, "step": 270900 }, { "epoch": 0.302, "grad_norm": 0.1584189087152481, "learning_rate": 9.16004e-05, "loss": 0.2461, "step": 271000 }, { "epoch": 0.3022, "grad_norm": 0.15803031623363495, "learning_rate": 9.15604e-05, "loss": 0.2492, "step": 271100 }, { "epoch": 0.3024, "grad_norm": 0.18134865164756775, "learning_rate": 9.15204e-05, "loss": 0.2458, "step": 271200 }, { "epoch": 0.3026, "grad_norm": 0.17175163328647614, "learning_rate": 9.14804e-05, "loss": 0.2507, "step": 271300 }, { "epoch": 0.3028, "grad_norm": 0.13778148591518402, "learning_rate": 9.14404e-05, "loss": 0.2517, "step": 271400 }, { "epoch": 0.303, "grad_norm": 0.18867234885692596, "learning_rate": 9.140040000000001e-05, "loss": 0.2477, "step": 271500 }, { "epoch": 0.3032, "grad_norm": 0.18188384175300598, "learning_rate": 9.136040000000001e-05, "loss": 0.2564, "step": 271600 }, { "epoch": 0.3034, "grad_norm": 0.18826958537101746, "learning_rate": 9.13204e-05, "loss": 0.2468, "step": 271700 }, { "epoch": 0.3036, "grad_norm": 0.18992917239665985, "learning_rate": 9.12804e-05, "loss": 0.2521, "step": 271800 }, { "epoch": 0.3038, "grad_norm": 0.2327507585287094, "learning_rate": 9.12404e-05, "loss": 0.2514, "step": 271900 }, { "epoch": 0.304, "grad_norm": 0.24102598428726196, "learning_rate": 9.120040000000001e-05, "loss": 0.2539, "step": 272000 }, { "epoch": 0.3042, "grad_norm": 0.16849716007709503, "learning_rate": 9.11604e-05, "loss": 0.2486, "step": 272100 }, { "epoch": 0.3044, "grad_norm": 0.34158793091773987, "learning_rate": 9.112040000000001e-05, "loss": 0.2524, "step": 272200 }, { "epoch": 0.3046, "grad_norm": 0.29653236269950867, "learning_rate": 9.10804e-05, "loss": 0.259, "step": 272300 }, { "epoch": 0.3048, "grad_norm": 0.19480286538600922, "learning_rate": 9.104040000000001e-05, "loss": 0.2505, "step": 272400 }, { "epoch": 0.305, "grad_norm": 0.19145742058753967, "learning_rate": 9.10004e-05, "loss": 0.2539, "step": 272500 }, { "epoch": 0.3052, "grad_norm": 0.5713980793952942, "learning_rate": 9.09604e-05, "loss": 0.2558, "step": 272600 }, { "epoch": 0.3054, "grad_norm": 0.2311251163482666, "learning_rate": 9.09204e-05, "loss": 0.2525, "step": 272700 }, { "epoch": 0.3056, "grad_norm": 0.17040377855300903, "learning_rate": 9.08804e-05, "loss": 0.2611, "step": 272800 }, { "epoch": 0.3058, "grad_norm": 0.16640836000442505, "learning_rate": 9.08404e-05, "loss": 0.2485, "step": 272900 }, { "epoch": 0.306, "grad_norm": 0.18391956388950348, "learning_rate": 9.080040000000001e-05, "loss": 0.2515, "step": 273000 }, { "epoch": 0.3062, "grad_norm": 0.20270375907421112, "learning_rate": 9.07604e-05, "loss": 0.2495, "step": 273100 }, { "epoch": 0.3064, "grad_norm": 0.1664918065071106, "learning_rate": 9.07204e-05, "loss": 0.253, "step": 273200 }, { "epoch": 0.3066, "grad_norm": 0.2487817406654358, "learning_rate": 9.068040000000001e-05, "loss": 0.2534, "step": 273300 }, { "epoch": 0.3068, "grad_norm": 0.19376716017723083, "learning_rate": 9.06404e-05, "loss": 0.2447, "step": 273400 }, { "epoch": 0.307, "grad_norm": 1.2538508176803589, "learning_rate": 9.060040000000001e-05, "loss": 0.2626, "step": 273500 }, { "epoch": 0.3072, "grad_norm": 0.18994612991809845, "learning_rate": 9.05604e-05, "loss": 0.269, "step": 273600 }, { "epoch": 0.3074, "grad_norm": 0.1983320116996765, "learning_rate": 9.052040000000001e-05, "loss": 0.2484, "step": 273700 }, { "epoch": 0.3076, "grad_norm": 0.17373573780059814, "learning_rate": 9.048040000000002e-05, "loss": 0.245, "step": 273800 }, { "epoch": 0.3078, "grad_norm": 0.2375088334083557, "learning_rate": 9.044040000000001e-05, "loss": 0.2467, "step": 273900 }, { "epoch": 0.308, "grad_norm": 0.16651317477226257, "learning_rate": 9.04004e-05, "loss": 0.2551, "step": 274000 }, { "epoch": 0.3082, "grad_norm": 0.27858221530914307, "learning_rate": 9.03604e-05, "loss": 0.2434, "step": 274100 }, { "epoch": 0.3084, "grad_norm": 0.18154625594615936, "learning_rate": 9.03204e-05, "loss": 0.2591, "step": 274200 }, { "epoch": 0.3086, "grad_norm": 0.2042841613292694, "learning_rate": 9.028040000000001e-05, "loss": 0.2435, "step": 274300 }, { "epoch": 0.3088, "grad_norm": 0.2147081345319748, "learning_rate": 9.02404e-05, "loss": 0.2484, "step": 274400 }, { "epoch": 0.309, "grad_norm": 0.1780577152967453, "learning_rate": 9.020040000000001e-05, "loss": 0.2476, "step": 274500 }, { "epoch": 0.3092, "grad_norm": 0.2056666761636734, "learning_rate": 9.01604e-05, "loss": 0.256, "step": 274600 }, { "epoch": 0.3094, "grad_norm": 0.27037736773490906, "learning_rate": 9.01204e-05, "loss": 0.2486, "step": 274700 }, { "epoch": 0.3096, "grad_norm": 0.17876963317394257, "learning_rate": 9.00804e-05, "loss": 0.253, "step": 274800 }, { "epoch": 0.3098, "grad_norm": 0.1852559596300125, "learning_rate": 9.00404e-05, "loss": 0.2535, "step": 274900 }, { "epoch": 0.31, "grad_norm": 0.2233329564332962, "learning_rate": 9.00004e-05, "loss": 0.2478, "step": 275000 }, { "epoch": 0.3102, "grad_norm": 0.2136661559343338, "learning_rate": 8.99604e-05, "loss": 0.2555, "step": 275100 }, { "epoch": 0.3104, "grad_norm": 0.2538791000843048, "learning_rate": 8.992040000000001e-05, "loss": 0.2453, "step": 275200 }, { "epoch": 0.3106, "grad_norm": 0.1559320092201233, "learning_rate": 8.988040000000001e-05, "loss": 0.2462, "step": 275300 }, { "epoch": 0.3108, "grad_norm": 0.21376411616802216, "learning_rate": 8.984040000000001e-05, "loss": 0.247, "step": 275400 }, { "epoch": 0.311, "grad_norm": 0.20133382081985474, "learning_rate": 8.98004e-05, "loss": 0.254, "step": 275500 }, { "epoch": 0.3112, "grad_norm": 0.17593859136104584, "learning_rate": 8.97604e-05, "loss": 0.2535, "step": 275600 }, { "epoch": 0.3114, "grad_norm": 0.20064404606819153, "learning_rate": 8.97204e-05, "loss": 0.2527, "step": 275700 }, { "epoch": 0.3116, "grad_norm": 0.17532220482826233, "learning_rate": 8.968040000000001e-05, "loss": 0.2533, "step": 275800 }, { "epoch": 0.3118, "grad_norm": 0.21646389365196228, "learning_rate": 8.96404e-05, "loss": 0.25, "step": 275900 }, { "epoch": 0.312, "grad_norm": 0.2135733962059021, "learning_rate": 8.960040000000001e-05, "loss": 0.2463, "step": 276000 }, { "epoch": 0.3122, "grad_norm": 0.24991312623023987, "learning_rate": 8.95604e-05, "loss": 0.2506, "step": 276100 }, { "epoch": 0.3124, "grad_norm": 0.20160618424415588, "learning_rate": 8.95204e-05, "loss": 0.2508, "step": 276200 }, { "epoch": 0.3126, "grad_norm": 0.3549211323261261, "learning_rate": 8.94804e-05, "loss": 0.2452, "step": 276300 }, { "epoch": 0.3128, "grad_norm": 0.18709681928157806, "learning_rate": 8.94404e-05, "loss": 0.2463, "step": 276400 }, { "epoch": 0.313, "grad_norm": 0.1972089558839798, "learning_rate": 8.94004e-05, "loss": 0.2434, "step": 276500 }, { "epoch": 0.3132, "grad_norm": 0.21796953678131104, "learning_rate": 8.93604e-05, "loss": 0.2577, "step": 276600 }, { "epoch": 0.3134, "grad_norm": 0.19307708740234375, "learning_rate": 8.93204e-05, "loss": 0.2485, "step": 276700 }, { "epoch": 0.3136, "grad_norm": 0.24761377274990082, "learning_rate": 8.928040000000001e-05, "loss": 0.2482, "step": 276800 }, { "epoch": 0.3138, "grad_norm": 0.1852557212114334, "learning_rate": 8.924040000000001e-05, "loss": 0.2549, "step": 276900 }, { "epoch": 0.314, "grad_norm": 0.18273688852787018, "learning_rate": 8.92004e-05, "loss": 0.2463, "step": 277000 }, { "epoch": 0.3142, "grad_norm": 0.25522246956825256, "learning_rate": 8.91604e-05, "loss": 0.2467, "step": 277100 }, { "epoch": 0.3144, "grad_norm": 0.1901063621044159, "learning_rate": 8.91204e-05, "loss": 0.2592, "step": 277200 }, { "epoch": 0.3146, "grad_norm": 0.21189343929290771, "learning_rate": 8.908040000000001e-05, "loss": 0.2549, "step": 277300 }, { "epoch": 0.3148, "grad_norm": 0.17540982365608215, "learning_rate": 8.90404e-05, "loss": 0.2534, "step": 277400 }, { "epoch": 0.315, "grad_norm": 0.1724601536989212, "learning_rate": 8.900040000000001e-05, "loss": 0.2529, "step": 277500 }, { "epoch": 0.3152, "grad_norm": 0.21971330046653748, "learning_rate": 8.89604e-05, "loss": 0.2542, "step": 277600 }, { "epoch": 0.3154, "grad_norm": 0.18612246215343475, "learning_rate": 8.892040000000001e-05, "loss": 0.249, "step": 277700 }, { "epoch": 0.3156, "grad_norm": 0.15609164535999298, "learning_rate": 8.88804e-05, "loss": 0.2486, "step": 277800 }, { "epoch": 0.3158, "grad_norm": 0.26058584451675415, "learning_rate": 8.88404e-05, "loss": 0.2489, "step": 277900 }, { "epoch": 0.316, "grad_norm": 0.2343665212392807, "learning_rate": 8.88004e-05, "loss": 0.2434, "step": 278000 }, { "epoch": 0.3162, "grad_norm": 0.26184457540512085, "learning_rate": 8.87604e-05, "loss": 0.2558, "step": 278100 }, { "epoch": 0.3164, "grad_norm": 0.23153603076934814, "learning_rate": 8.87204e-05, "loss": 0.2515, "step": 278200 }, { "epoch": 0.3166, "grad_norm": 0.16645345091819763, "learning_rate": 8.868040000000001e-05, "loss": 0.2506, "step": 278300 }, { "epoch": 0.3168, "grad_norm": 0.21093203127384186, "learning_rate": 8.86404e-05, "loss": 0.2501, "step": 278400 }, { "epoch": 0.317, "grad_norm": 0.2181226760149002, "learning_rate": 8.86004e-05, "loss": 0.2469, "step": 278500 }, { "epoch": 0.3172, "grad_norm": 0.27833208441734314, "learning_rate": 8.85604e-05, "loss": 0.2493, "step": 278600 }, { "epoch": 0.3174, "grad_norm": 0.18451841175556183, "learning_rate": 8.85204e-05, "loss": 0.2476, "step": 278700 }, { "epoch": 0.3176, "grad_norm": 0.15855015814304352, "learning_rate": 8.848040000000001e-05, "loss": 0.2562, "step": 278800 }, { "epoch": 0.3178, "grad_norm": 0.19361542165279388, "learning_rate": 8.84404e-05, "loss": 0.2461, "step": 278900 }, { "epoch": 0.318, "grad_norm": 0.1782364845275879, "learning_rate": 8.840040000000001e-05, "loss": 0.2504, "step": 279000 }, { "epoch": 0.3182, "grad_norm": 0.1721198856830597, "learning_rate": 8.83604e-05, "loss": 0.2489, "step": 279100 }, { "epoch": 0.3184, "grad_norm": 0.17030316591262817, "learning_rate": 8.832040000000001e-05, "loss": 0.2485, "step": 279200 }, { "epoch": 0.3186, "grad_norm": 0.2248735874891281, "learning_rate": 8.82804e-05, "loss": 0.252, "step": 279300 }, { "epoch": 0.3188, "grad_norm": 0.18986405432224274, "learning_rate": 8.82404e-05, "loss": 0.2536, "step": 279400 }, { "epoch": 0.319, "grad_norm": 0.48810842633247375, "learning_rate": 8.82004e-05, "loss": 0.2584, "step": 279500 }, { "epoch": 0.3192, "grad_norm": 0.16459937393665314, "learning_rate": 8.816040000000001e-05, "loss": 0.2482, "step": 279600 }, { "epoch": 0.3194, "grad_norm": 0.17048123478889465, "learning_rate": 8.81204e-05, "loss": 0.2472, "step": 279700 }, { "epoch": 0.3196, "grad_norm": 0.20386305451393127, "learning_rate": 8.808040000000001e-05, "loss": 0.2453, "step": 279800 }, { "epoch": 0.3198, "grad_norm": 0.2072535902261734, "learning_rate": 8.80404e-05, "loss": 0.2536, "step": 279900 }, { "epoch": 0.32, "grad_norm": 0.21061697602272034, "learning_rate": 8.80004e-05, "loss": 0.25, "step": 280000 }, { "epoch": 0.3202, "grad_norm": 0.18452088534832, "learning_rate": 8.79604e-05, "loss": 0.251, "step": 280100 }, { "epoch": 0.3204, "grad_norm": 0.27870404720306396, "learning_rate": 8.79204e-05, "loss": 0.2523, "step": 280200 }, { "epoch": 0.3206, "grad_norm": 0.20520621538162231, "learning_rate": 8.78804e-05, "loss": 0.2463, "step": 280300 }, { "epoch": 0.3208, "grad_norm": 0.23470567166805267, "learning_rate": 8.78404e-05, "loss": 0.2507, "step": 280400 }, { "epoch": 0.321, "grad_norm": 0.17464961111545563, "learning_rate": 8.780040000000001e-05, "loss": 0.2463, "step": 280500 }, { "epoch": 0.3212, "grad_norm": 0.2375008910894394, "learning_rate": 8.776040000000001e-05, "loss": 0.2457, "step": 280600 }, { "epoch": 0.3214, "grad_norm": 0.17004603147506714, "learning_rate": 8.772040000000001e-05, "loss": 0.2511, "step": 280700 }, { "epoch": 0.3216, "grad_norm": 0.3254378139972687, "learning_rate": 8.76804e-05, "loss": 0.2513, "step": 280800 }, { "epoch": 0.3218, "grad_norm": 0.1874452382326126, "learning_rate": 8.76404e-05, "loss": 0.2451, "step": 280900 }, { "epoch": 0.322, "grad_norm": 0.18145489692687988, "learning_rate": 8.76004e-05, "loss": 0.2469, "step": 281000 }, { "epoch": 0.3222, "grad_norm": 0.15016548335552216, "learning_rate": 8.756040000000001e-05, "loss": 0.2496, "step": 281100 }, { "epoch": 0.3224, "grad_norm": 0.3445073962211609, "learning_rate": 8.75204e-05, "loss": 0.2484, "step": 281200 }, { "epoch": 0.3226, "grad_norm": 0.17580726742744446, "learning_rate": 8.748040000000001e-05, "loss": 0.2482, "step": 281300 }, { "epoch": 0.3228, "grad_norm": 0.17913737893104553, "learning_rate": 8.74404e-05, "loss": 0.2457, "step": 281400 }, { "epoch": 0.323, "grad_norm": 0.20740285515785217, "learning_rate": 8.740040000000001e-05, "loss": 0.2519, "step": 281500 }, { "epoch": 0.3232, "grad_norm": 0.15794052183628082, "learning_rate": 8.73604e-05, "loss": 0.2478, "step": 281600 }, { "epoch": 0.3234, "grad_norm": 0.204098641872406, "learning_rate": 8.73204e-05, "loss": 0.2478, "step": 281700 }, { "epoch": 0.3236, "grad_norm": 0.17985448241233826, "learning_rate": 8.72804e-05, "loss": 0.2511, "step": 281800 }, { "epoch": 0.3238, "grad_norm": 0.20703667402267456, "learning_rate": 8.72404e-05, "loss": 0.249, "step": 281900 }, { "epoch": 0.324, "grad_norm": 0.3906717002391815, "learning_rate": 8.72004e-05, "loss": 0.2498, "step": 282000 }, { "epoch": 0.3242, "grad_norm": 0.32392579317092896, "learning_rate": 8.716040000000001e-05, "loss": 0.2529, "step": 282100 }, { "epoch": 0.3244, "grad_norm": 0.2961254119873047, "learning_rate": 8.712040000000001e-05, "loss": 0.2528, "step": 282200 }, { "epoch": 0.3246, "grad_norm": 0.18365971744060516, "learning_rate": 8.70804e-05, "loss": 0.2508, "step": 282300 }, { "epoch": 0.3248, "grad_norm": 0.30931851267814636, "learning_rate": 8.70404e-05, "loss": 0.2557, "step": 282400 }, { "epoch": 0.325, "grad_norm": 0.18898595869541168, "learning_rate": 8.70004e-05, "loss": 0.2444, "step": 282500 }, { "epoch": 0.3252, "grad_norm": 0.27550897002220154, "learning_rate": 8.696040000000001e-05, "loss": 0.2427, "step": 282600 }, { "epoch": 0.3254, "grad_norm": 0.19707511365413666, "learning_rate": 8.69204e-05, "loss": 0.2517, "step": 282700 }, { "epoch": 0.3256, "grad_norm": 0.18959122896194458, "learning_rate": 8.688040000000001e-05, "loss": 0.2474, "step": 282800 }, { "epoch": 0.3258, "grad_norm": 0.263849675655365, "learning_rate": 8.68404e-05, "loss": 0.2497, "step": 282900 }, { "epoch": 0.326, "grad_norm": 0.17215219140052795, "learning_rate": 8.680040000000001e-05, "loss": 0.256, "step": 283000 }, { "epoch": 0.3262, "grad_norm": 0.1411217749118805, "learning_rate": 8.67604e-05, "loss": 0.2485, "step": 283100 }, { "epoch": 0.3264, "grad_norm": 0.19203592836856842, "learning_rate": 8.67204e-05, "loss": 0.2478, "step": 283200 }, { "epoch": 0.3266, "grad_norm": 0.18021759390830994, "learning_rate": 8.66804e-05, "loss": 0.2478, "step": 283300 }, { "epoch": 0.3268, "grad_norm": 0.15332086384296417, "learning_rate": 8.66404e-05, "loss": 0.2443, "step": 283400 }, { "epoch": 0.327, "grad_norm": 0.1966552585363388, "learning_rate": 8.66004e-05, "loss": 0.248, "step": 283500 }, { "epoch": 0.3272, "grad_norm": 0.21253438293933868, "learning_rate": 8.656040000000001e-05, "loss": 0.2472, "step": 283600 }, { "epoch": 0.3274, "grad_norm": 0.253312349319458, "learning_rate": 8.65204e-05, "loss": 0.2522, "step": 283700 }, { "epoch": 0.3276, "grad_norm": 0.17193368077278137, "learning_rate": 8.64804e-05, "loss": 0.2564, "step": 283800 }, { "epoch": 0.3278, "grad_norm": 0.17592443525791168, "learning_rate": 8.644039999999999e-05, "loss": 0.2484, "step": 283900 }, { "epoch": 0.328, "grad_norm": 0.200908824801445, "learning_rate": 8.64004e-05, "loss": 0.2473, "step": 284000 }, { "epoch": 0.3282, "grad_norm": 0.21171993017196655, "learning_rate": 8.636040000000001e-05, "loss": 0.2482, "step": 284100 }, { "epoch": 0.3284, "grad_norm": 0.21706272661685944, "learning_rate": 8.63204e-05, "loss": 0.2537, "step": 284200 }, { "epoch": 0.3286, "grad_norm": 0.21344873309135437, "learning_rate": 8.628040000000001e-05, "loss": 0.2501, "step": 284300 }, { "epoch": 0.3288, "grad_norm": 0.19360865652561188, "learning_rate": 8.62404e-05, "loss": 0.2499, "step": 284400 }, { "epoch": 0.329, "grad_norm": 0.16247357428073883, "learning_rate": 8.620040000000001e-05, "loss": 0.246, "step": 284500 }, { "epoch": 0.3292, "grad_norm": 0.205808624625206, "learning_rate": 8.61604e-05, "loss": 0.2592, "step": 284600 }, { "epoch": 0.3294, "grad_norm": 0.21998731791973114, "learning_rate": 8.61204e-05, "loss": 0.2512, "step": 284700 }, { "epoch": 0.3296, "grad_norm": 0.17610786855220795, "learning_rate": 8.60804e-05, "loss": 0.2474, "step": 284800 }, { "epoch": 0.3298, "grad_norm": 0.1882948875427246, "learning_rate": 8.60404e-05, "loss": 0.2512, "step": 284900 }, { "epoch": 0.33, "grad_norm": 0.1443343162536621, "learning_rate": 8.60004e-05, "loss": 0.244, "step": 285000 }, { "epoch": 0.3302, "grad_norm": 0.22007732093334198, "learning_rate": 8.596040000000001e-05, "loss": 0.252, "step": 285100 }, { "epoch": 0.3304, "grad_norm": 0.1846906691789627, "learning_rate": 8.59204e-05, "loss": 0.2482, "step": 285200 }, { "epoch": 0.3306, "grad_norm": 0.19831278920173645, "learning_rate": 8.58804e-05, "loss": 0.2463, "step": 285300 }, { "epoch": 0.3308, "grad_norm": 0.16694426536560059, "learning_rate": 8.584039999999999e-05, "loss": 0.2461, "step": 285400 }, { "epoch": 0.331, "grad_norm": 0.22869306802749634, "learning_rate": 8.58004e-05, "loss": 0.2461, "step": 285500 }, { "epoch": 0.3312, "grad_norm": 0.29163485765457153, "learning_rate": 8.57604e-05, "loss": 0.2514, "step": 285600 }, { "epoch": 0.3314, "grad_norm": 0.20715934038162231, "learning_rate": 8.57204e-05, "loss": 0.2494, "step": 285700 }, { "epoch": 0.3316, "grad_norm": 0.2089393138885498, "learning_rate": 8.568040000000001e-05, "loss": 0.255, "step": 285800 }, { "epoch": 0.3318, "grad_norm": 0.1605835258960724, "learning_rate": 8.564040000000001e-05, "loss": 0.2502, "step": 285900 }, { "epoch": 0.332, "grad_norm": 0.21266242861747742, "learning_rate": 8.560040000000001e-05, "loss": 0.2506, "step": 286000 }, { "epoch": 0.3322, "grad_norm": 0.18077294528484344, "learning_rate": 8.55604e-05, "loss": 0.2463, "step": 286100 }, { "epoch": 0.3324, "grad_norm": 0.2691899538040161, "learning_rate": 8.55204e-05, "loss": 0.2484, "step": 286200 }, { "epoch": 0.3326, "grad_norm": 0.20911413431167603, "learning_rate": 8.54804e-05, "loss": 0.2492, "step": 286300 }, { "epoch": 0.3328, "grad_norm": 0.20955608785152435, "learning_rate": 8.544040000000001e-05, "loss": 0.2501, "step": 286400 }, { "epoch": 0.333, "grad_norm": 0.16261516511440277, "learning_rate": 8.54004e-05, "loss": 0.2481, "step": 286500 }, { "epoch": 0.3332, "grad_norm": 0.3086774945259094, "learning_rate": 8.536040000000001e-05, "loss": 0.2505, "step": 286600 }, { "epoch": 0.3334, "grad_norm": 0.16996116936206818, "learning_rate": 8.53204e-05, "loss": 0.25, "step": 286700 }, { "epoch": 0.3336, "grad_norm": 0.2675207853317261, "learning_rate": 8.528040000000001e-05, "loss": 0.2459, "step": 286800 }, { "epoch": 0.3338, "grad_norm": 0.26065778732299805, "learning_rate": 8.52404e-05, "loss": 0.2468, "step": 286900 }, { "epoch": 0.334, "grad_norm": 0.15775251388549805, "learning_rate": 8.52004e-05, "loss": 0.248, "step": 287000 }, { "epoch": 0.3342, "grad_norm": 0.17829616367816925, "learning_rate": 8.51604e-05, "loss": 0.2522, "step": 287100 }, { "epoch": 0.3344, "grad_norm": 0.283382385969162, "learning_rate": 8.51204e-05, "loss": 0.2504, "step": 287200 }, { "epoch": 0.3346, "grad_norm": 0.21750444173812866, "learning_rate": 8.50804e-05, "loss": 0.2458, "step": 287300 }, { "epoch": 0.3348, "grad_norm": 0.17604978382587433, "learning_rate": 8.504040000000001e-05, "loss": 0.2543, "step": 287400 }, { "epoch": 0.335, "grad_norm": 0.17278654873371124, "learning_rate": 8.500040000000001e-05, "loss": 0.2437, "step": 287500 }, { "epoch": 0.3352, "grad_norm": 0.1866379976272583, "learning_rate": 8.49604e-05, "loss": 0.2473, "step": 287600 }, { "epoch": 0.3354, "grad_norm": 0.18013985455036163, "learning_rate": 8.49204e-05, "loss": 0.2504, "step": 287700 }, { "epoch": 0.3356, "grad_norm": 0.1813124716281891, "learning_rate": 8.48804e-05, "loss": 0.25, "step": 287800 }, { "epoch": 0.3358, "grad_norm": 0.1880805343389511, "learning_rate": 8.484040000000001e-05, "loss": 0.2506, "step": 287900 }, { "epoch": 0.336, "grad_norm": 0.20966345071792603, "learning_rate": 8.48004e-05, "loss": 0.2508, "step": 288000 }, { "epoch": 0.3362, "grad_norm": 0.23822133243083954, "learning_rate": 8.476040000000001e-05, "loss": 0.2549, "step": 288100 }, { "epoch": 0.3364, "grad_norm": 0.17772118747234344, "learning_rate": 8.47204e-05, "loss": 0.2441, "step": 288200 }, { "epoch": 0.3366, "grad_norm": 0.22590018808841705, "learning_rate": 8.468040000000001e-05, "loss": 0.2517, "step": 288300 }, { "epoch": 0.3368, "grad_norm": 0.19911415874958038, "learning_rate": 8.46404e-05, "loss": 0.2495, "step": 288400 }, { "epoch": 0.337, "grad_norm": 0.19281023740768433, "learning_rate": 8.46004e-05, "loss": 0.2535, "step": 288500 }, { "epoch": 0.3372, "grad_norm": 0.18460847437381744, "learning_rate": 8.45604e-05, "loss": 0.2541, "step": 288600 }, { "epoch": 0.3374, "grad_norm": 0.2108038067817688, "learning_rate": 8.45204e-05, "loss": 0.252, "step": 288700 }, { "epoch": 0.3376, "grad_norm": 0.24235336482524872, "learning_rate": 8.44804e-05, "loss": 0.2482, "step": 288800 }, { "epoch": 0.3378, "grad_norm": 0.16646993160247803, "learning_rate": 8.444040000000001e-05, "loss": 0.254, "step": 288900 }, { "epoch": 0.338, "grad_norm": 0.2231166511774063, "learning_rate": 8.44004e-05, "loss": 0.2479, "step": 289000 }, { "epoch": 0.3382, "grad_norm": 0.24372661113739014, "learning_rate": 8.43604e-05, "loss": 0.2488, "step": 289100 }, { "epoch": 0.3384, "grad_norm": 0.1555611491203308, "learning_rate": 8.432039999999999e-05, "loss": 0.2532, "step": 289200 }, { "epoch": 0.3386, "grad_norm": 0.22455960512161255, "learning_rate": 8.42804e-05, "loss": 0.2474, "step": 289300 }, { "epoch": 0.3388, "grad_norm": 0.2038494348526001, "learning_rate": 8.424040000000001e-05, "loss": 0.2468, "step": 289400 }, { "epoch": 0.339, "grad_norm": 0.2508559226989746, "learning_rate": 8.42004e-05, "loss": 0.2489, "step": 289500 }, { "epoch": 0.3392, "grad_norm": 0.2195146679878235, "learning_rate": 8.416040000000001e-05, "loss": 0.2452, "step": 289600 }, { "epoch": 0.3394, "grad_norm": 0.19063441455364227, "learning_rate": 8.41204e-05, "loss": 0.2491, "step": 289700 }, { "epoch": 0.3396, "grad_norm": 0.2436915785074234, "learning_rate": 8.408040000000001e-05, "loss": 0.2508, "step": 289800 }, { "epoch": 0.3398, "grad_norm": 0.20626668632030487, "learning_rate": 8.40404e-05, "loss": 0.2497, "step": 289900 }, { "epoch": 0.34, "grad_norm": 0.1684507131576538, "learning_rate": 8.40004e-05, "loss": 0.2493, "step": 290000 }, { "epoch": 0.3402, "grad_norm": 0.16648483276367188, "learning_rate": 8.39604e-05, "loss": 0.2526, "step": 290100 }, { "epoch": 0.3404, "grad_norm": 0.1838679313659668, "learning_rate": 8.39204e-05, "loss": 0.2477, "step": 290200 }, { "epoch": 0.3406, "grad_norm": 0.204168438911438, "learning_rate": 8.38804e-05, "loss": 0.247, "step": 290300 }, { "epoch": 0.3408, "grad_norm": 0.18529953062534332, "learning_rate": 8.384040000000001e-05, "loss": 0.25, "step": 290400 }, { "epoch": 0.341, "grad_norm": 0.17540723085403442, "learning_rate": 8.38004e-05, "loss": 0.2467, "step": 290500 }, { "epoch": 0.3412, "grad_norm": 0.15867659449577332, "learning_rate": 8.376040000000001e-05, "loss": 0.2442, "step": 290600 }, { "epoch": 0.3414, "grad_norm": 0.24484454095363617, "learning_rate": 8.372039999999999e-05, "loss": 0.2486, "step": 290700 }, { "epoch": 0.3416, "grad_norm": 0.21966548264026642, "learning_rate": 8.36804e-05, "loss": 0.2487, "step": 290800 }, { "epoch": 0.3418, "grad_norm": 0.16184256970882416, "learning_rate": 8.36404e-05, "loss": 0.2464, "step": 290900 }, { "epoch": 0.342, "grad_norm": 0.2605167329311371, "learning_rate": 8.36004e-05, "loss": 0.2456, "step": 291000 }, { "epoch": 0.3422, "grad_norm": 0.15184715390205383, "learning_rate": 8.356040000000001e-05, "loss": 0.2544, "step": 291100 }, { "epoch": 0.3424, "grad_norm": 0.18287336826324463, "learning_rate": 8.35204e-05, "loss": 0.2461, "step": 291200 }, { "epoch": 0.3426, "grad_norm": 0.1705942153930664, "learning_rate": 8.348040000000001e-05, "loss": 0.2468, "step": 291300 }, { "epoch": 0.3428, "grad_norm": 0.4577696919441223, "learning_rate": 8.34404e-05, "loss": 0.2488, "step": 291400 }, { "epoch": 0.343, "grad_norm": 0.21272404491901398, "learning_rate": 8.34004e-05, "loss": 0.2522, "step": 291500 }, { "epoch": 0.3432, "grad_norm": 0.24788586795330048, "learning_rate": 8.33604e-05, "loss": 0.2437, "step": 291600 }, { "epoch": 0.3434, "grad_norm": 0.16285791993141174, "learning_rate": 8.33204e-05, "loss": 0.2455, "step": 291700 }, { "epoch": 0.3436, "grad_norm": 0.1585230529308319, "learning_rate": 8.32804e-05, "loss": 0.2454, "step": 291800 }, { "epoch": 0.3438, "grad_norm": 0.1985248178243637, "learning_rate": 8.324040000000001e-05, "loss": 0.2475, "step": 291900 }, { "epoch": 0.344, "grad_norm": 0.3097187280654907, "learning_rate": 8.32004e-05, "loss": 0.2479, "step": 292000 }, { "epoch": 0.3442, "grad_norm": 0.21880201995372772, "learning_rate": 8.316040000000001e-05, "loss": 0.2495, "step": 292100 }, { "epoch": 0.3444, "grad_norm": 0.1829034388065338, "learning_rate": 8.31204e-05, "loss": 0.2501, "step": 292200 }, { "epoch": 0.3446, "grad_norm": 0.1782861053943634, "learning_rate": 8.30804e-05, "loss": 0.2532, "step": 292300 }, { "epoch": 0.3448, "grad_norm": 0.22069762647151947, "learning_rate": 8.30404e-05, "loss": 0.2608, "step": 292400 }, { "epoch": 0.345, "grad_norm": 0.18594609200954437, "learning_rate": 8.30004e-05, "loss": 0.2552, "step": 292500 }, { "epoch": 0.3452, "grad_norm": 0.17892573773860931, "learning_rate": 8.29604e-05, "loss": 0.2488, "step": 292600 }, { "epoch": 0.3454, "grad_norm": 0.18807783722877502, "learning_rate": 8.292040000000001e-05, "loss": 0.2513, "step": 292700 }, { "epoch": 0.3456, "grad_norm": 0.19882932305335999, "learning_rate": 8.288040000000001e-05, "loss": 0.2489, "step": 292800 }, { "epoch": 0.3458, "grad_norm": 0.19057603180408478, "learning_rate": 8.28404e-05, "loss": 0.2479, "step": 292900 }, { "epoch": 0.346, "grad_norm": 0.2387564480304718, "learning_rate": 8.28004e-05, "loss": 0.2488, "step": 293000 }, { "epoch": 0.3462, "grad_norm": 0.22695794701576233, "learning_rate": 8.27604e-05, "loss": 0.2452, "step": 293100 }, { "epoch": 0.3464, "grad_norm": 0.15103566646575928, "learning_rate": 8.272040000000001e-05, "loss": 0.2488, "step": 293200 }, { "epoch": 0.3466, "grad_norm": 0.16485248506069183, "learning_rate": 8.26804e-05, "loss": 0.2457, "step": 293300 }, { "epoch": 0.3468, "grad_norm": 0.1788826584815979, "learning_rate": 8.264040000000001e-05, "loss": 0.2479, "step": 293400 }, { "epoch": 0.347, "grad_norm": 0.17442439496517181, "learning_rate": 8.26004e-05, "loss": 0.2471, "step": 293500 }, { "epoch": 0.3472, "grad_norm": 0.5051277875900269, "learning_rate": 8.256040000000001e-05, "loss": 0.2474, "step": 293600 }, { "epoch": 0.3474, "grad_norm": 0.40286529064178467, "learning_rate": 8.25204e-05, "loss": 0.2482, "step": 293700 }, { "epoch": 0.3476, "grad_norm": 0.2842089831829071, "learning_rate": 8.24804e-05, "loss": 0.2494, "step": 293800 }, { "epoch": 0.3478, "grad_norm": 0.17578501999378204, "learning_rate": 8.24404e-05, "loss": 0.2527, "step": 293900 }, { "epoch": 0.348, "grad_norm": 0.17574481666088104, "learning_rate": 8.24004e-05, "loss": 0.2463, "step": 294000 }, { "epoch": 0.3482, "grad_norm": 0.1960979551076889, "learning_rate": 8.23604e-05, "loss": 0.2495, "step": 294100 }, { "epoch": 0.3484, "grad_norm": 0.1842304915189743, "learning_rate": 8.232040000000001e-05, "loss": 0.25, "step": 294200 }, { "epoch": 0.3486, "grad_norm": 0.17555032670497894, "learning_rate": 8.22804e-05, "loss": 0.2487, "step": 294300 }, { "epoch": 0.3488, "grad_norm": 0.18717283010482788, "learning_rate": 8.224040000000001e-05, "loss": 0.2462, "step": 294400 }, { "epoch": 0.349, "grad_norm": 0.18235217034816742, "learning_rate": 8.220039999999999e-05, "loss": 0.2472, "step": 294500 }, { "epoch": 0.3492, "grad_norm": 0.19905789196491241, "learning_rate": 8.21604e-05, "loss": 0.2485, "step": 294600 }, { "epoch": 0.3494, "grad_norm": 0.19601567089557648, "learning_rate": 8.212040000000001e-05, "loss": 0.2519, "step": 294700 }, { "epoch": 0.3496, "grad_norm": 0.16348402202129364, "learning_rate": 8.20804e-05, "loss": 0.2463, "step": 294800 }, { "epoch": 0.3498, "grad_norm": 0.16451190412044525, "learning_rate": 8.204040000000001e-05, "loss": 0.251, "step": 294900 }, { "epoch": 0.35, "grad_norm": 0.20977216958999634, "learning_rate": 8.20004e-05, "loss": 0.2465, "step": 295000 }, { "epoch": 0.3502, "grad_norm": 0.3382442891597748, "learning_rate": 8.196040000000001e-05, "loss": 0.2456, "step": 295100 }, { "epoch": 0.3504, "grad_norm": 0.22302016615867615, "learning_rate": 8.19204e-05, "loss": 0.2469, "step": 295200 }, { "epoch": 0.3506, "grad_norm": 0.1620716005563736, "learning_rate": 8.18804e-05, "loss": 0.2435, "step": 295300 }, { "epoch": 0.3508, "grad_norm": 0.2126971334218979, "learning_rate": 8.18404e-05, "loss": 0.2444, "step": 295400 }, { "epoch": 0.351, "grad_norm": 0.1620679646730423, "learning_rate": 8.18004e-05, "loss": 0.2402, "step": 295500 }, { "epoch": 0.3512, "grad_norm": 0.19541557133197784, "learning_rate": 8.17604e-05, "loss": 0.2426, "step": 295600 }, { "epoch": 0.3514, "grad_norm": 0.18142946064472198, "learning_rate": 8.172040000000001e-05, "loss": 0.2452, "step": 295700 }, { "epoch": 0.3516, "grad_norm": 0.18419720232486725, "learning_rate": 8.16804e-05, "loss": 0.2466, "step": 295800 }, { "epoch": 0.3518, "grad_norm": 0.19173642992973328, "learning_rate": 8.164040000000001e-05, "loss": 0.2487, "step": 295900 }, { "epoch": 0.352, "grad_norm": 0.16842658817768097, "learning_rate": 8.16004e-05, "loss": 0.2508, "step": 296000 }, { "epoch": 0.3522, "grad_norm": 0.23482634127140045, "learning_rate": 8.15604e-05, "loss": 0.2421, "step": 296100 }, { "epoch": 0.3524, "grad_norm": 0.17169761657714844, "learning_rate": 8.15204e-05, "loss": 0.2454, "step": 296200 }, { "epoch": 0.3526, "grad_norm": 0.1711745262145996, "learning_rate": 8.14804e-05, "loss": 0.2424, "step": 296300 }, { "epoch": 0.3528, "grad_norm": 0.18024346232414246, "learning_rate": 8.144040000000001e-05, "loss": 0.244, "step": 296400 }, { "epoch": 0.353, "grad_norm": 0.468720018863678, "learning_rate": 8.14004e-05, "loss": 0.2452, "step": 296500 }, { "epoch": 0.3532, "grad_norm": 0.16344809532165527, "learning_rate": 8.136040000000001e-05, "loss": 0.2426, "step": 296600 }, { "epoch": 0.3534, "grad_norm": 0.22236470878124237, "learning_rate": 8.13204e-05, "loss": 0.2466, "step": 296700 }, { "epoch": 0.3536, "grad_norm": 0.19982776045799255, "learning_rate": 8.12804e-05, "loss": 0.2506, "step": 296800 }, { "epoch": 0.3538, "grad_norm": 0.1859310418367386, "learning_rate": 8.12404e-05, "loss": 0.251, "step": 296900 }, { "epoch": 0.354, "grad_norm": 0.21040932834148407, "learning_rate": 8.12004e-05, "loss": 0.2457, "step": 297000 }, { "epoch": 0.3542, "grad_norm": 0.1694370061159134, "learning_rate": 8.11604e-05, "loss": 0.2444, "step": 297100 }, { "epoch": 0.3544, "grad_norm": 0.17156536877155304, "learning_rate": 8.112040000000001e-05, "loss": 0.2424, "step": 297200 }, { "epoch": 0.3546, "grad_norm": 0.17718739807605743, "learning_rate": 8.10804e-05, "loss": 0.2411, "step": 297300 }, { "epoch": 0.3548, "grad_norm": 0.23720037937164307, "learning_rate": 8.104040000000001e-05, "loss": 0.2469, "step": 297400 }, { "epoch": 0.355, "grad_norm": 0.2064405083656311, "learning_rate": 8.10004e-05, "loss": 0.2415, "step": 297500 }, { "epoch": 0.3552, "grad_norm": 0.1649145632982254, "learning_rate": 8.09604e-05, "loss": 0.2421, "step": 297600 }, { "epoch": 0.3554, "grad_norm": 0.1422872543334961, "learning_rate": 8.09204e-05, "loss": 0.2414, "step": 297700 }, { "epoch": 0.3556, "grad_norm": 0.42385339736938477, "learning_rate": 8.08804e-05, "loss": 0.2425, "step": 297800 }, { "epoch": 0.3558, "grad_norm": 0.1627955287694931, "learning_rate": 8.08404e-05, "loss": 0.2481, "step": 297900 }, { "epoch": 0.356, "grad_norm": 0.1613747775554657, "learning_rate": 8.08004e-05, "loss": 0.2478, "step": 298000 }, { "epoch": 0.3562, "grad_norm": 0.23526862263679504, "learning_rate": 8.076040000000001e-05, "loss": 0.248, "step": 298100 }, { "epoch": 0.3564, "grad_norm": 0.18639631569385529, "learning_rate": 8.07204e-05, "loss": 0.2567, "step": 298200 }, { "epoch": 0.3566, "grad_norm": 0.2338380515575409, "learning_rate": 8.06804e-05, "loss": 0.2494, "step": 298300 }, { "epoch": 0.3568, "grad_norm": 0.165755957365036, "learning_rate": 8.06404e-05, "loss": 0.2518, "step": 298400 }, { "epoch": 0.357, "grad_norm": 0.1789596974849701, "learning_rate": 8.060040000000001e-05, "loss": 0.2483, "step": 298500 }, { "epoch": 0.3572, "grad_norm": 0.174707293510437, "learning_rate": 8.05604e-05, "loss": 0.2445, "step": 298600 }, { "epoch": 0.3574, "grad_norm": 0.15874457359313965, "learning_rate": 8.052040000000001e-05, "loss": 0.2526, "step": 298700 }, { "epoch": 0.3576, "grad_norm": 0.15661177039146423, "learning_rate": 8.04804e-05, "loss": 0.2447, "step": 298800 }, { "epoch": 0.3578, "grad_norm": 0.185741126537323, "learning_rate": 8.044040000000001e-05, "loss": 0.254, "step": 298900 }, { "epoch": 0.358, "grad_norm": 0.1975129246711731, "learning_rate": 8.04004e-05, "loss": 0.2453, "step": 299000 }, { "epoch": 0.3582, "grad_norm": 0.21340824663639069, "learning_rate": 8.03604e-05, "loss": 0.2436, "step": 299100 }, { "epoch": 0.3584, "grad_norm": 0.1729285567998886, "learning_rate": 8.03204e-05, "loss": 0.2495, "step": 299200 }, { "epoch": 0.3586, "grad_norm": 0.2982395589351654, "learning_rate": 8.02804e-05, "loss": 0.2443, "step": 299300 }, { "epoch": 0.3588, "grad_norm": 0.18718211352825165, "learning_rate": 8.02404e-05, "loss": 0.2421, "step": 299400 }, { "epoch": 0.359, "grad_norm": 0.18755905330181122, "learning_rate": 8.020040000000001e-05, "loss": 0.2517, "step": 299500 }, { "epoch": 0.3592, "grad_norm": 0.15775266289710999, "learning_rate": 8.01604e-05, "loss": 0.2471, "step": 299600 }, { "epoch": 0.3594, "grad_norm": 0.14446353912353516, "learning_rate": 8.012040000000001e-05, "loss": 0.2433, "step": 299700 }, { "epoch": 0.3596, "grad_norm": 0.17306815087795258, "learning_rate": 8.008039999999999e-05, "loss": 0.2464, "step": 299800 }, { "epoch": 0.3598, "grad_norm": 0.22161149978637695, "learning_rate": 8.00404e-05, "loss": 0.2467, "step": 299900 }, { "epoch": 0.36, "grad_norm": 0.15656442940235138, "learning_rate": 8.000040000000001e-05, "loss": 0.2445, "step": 300000 }, { "epoch": 0.3602, "grad_norm": 0.17966248095035553, "learning_rate": 7.99604e-05, "loss": 0.2451, "step": 300100 }, { "epoch": 0.3604, "grad_norm": 0.18749384582042694, "learning_rate": 7.992040000000001e-05, "loss": 0.242, "step": 300200 }, { "epoch": 0.3606, "grad_norm": 0.18940359354019165, "learning_rate": 7.98804e-05, "loss": 0.241, "step": 300300 }, { "epoch": 0.3608, "grad_norm": 0.1695726066827774, "learning_rate": 7.984040000000001e-05, "loss": 0.247, "step": 300400 }, { "epoch": 0.361, "grad_norm": 0.23196519911289215, "learning_rate": 7.98004e-05, "loss": 0.2518, "step": 300500 }, { "epoch": 0.3612, "grad_norm": 0.1548641175031662, "learning_rate": 7.97604e-05, "loss": 0.2424, "step": 300600 }, { "epoch": 0.3614, "grad_norm": 0.25332915782928467, "learning_rate": 7.97204e-05, "loss": 0.2428, "step": 300700 }, { "epoch": 0.3616, "grad_norm": 0.21986594796180725, "learning_rate": 7.96804e-05, "loss": 0.2447, "step": 300800 }, { "epoch": 0.3618, "grad_norm": 0.18515200912952423, "learning_rate": 7.96404e-05, "loss": 0.2424, "step": 300900 }, { "epoch": 0.362, "grad_norm": 0.23712636530399323, "learning_rate": 7.960040000000001e-05, "loss": 0.2513, "step": 301000 }, { "epoch": 0.3622, "grad_norm": 0.21687628328800201, "learning_rate": 7.95604e-05, "loss": 0.249, "step": 301100 }, { "epoch": 0.3624, "grad_norm": 0.22492177784442902, "learning_rate": 7.952040000000001e-05, "loss": 0.2463, "step": 301200 }, { "epoch": 0.3626, "grad_norm": 0.19002969563007355, "learning_rate": 7.94804e-05, "loss": 0.2455, "step": 301300 }, { "epoch": 0.3628, "grad_norm": 0.1988956481218338, "learning_rate": 7.94404e-05, "loss": 0.2405, "step": 301400 }, { "epoch": 0.363, "grad_norm": 0.1998603492975235, "learning_rate": 7.94004e-05, "loss": 0.2454, "step": 301500 }, { "epoch": 0.3632, "grad_norm": 0.23634324967861176, "learning_rate": 7.93604e-05, "loss": 0.2481, "step": 301600 }, { "epoch": 0.3634, "grad_norm": 0.17416614294052124, "learning_rate": 7.932040000000001e-05, "loss": 0.2418, "step": 301700 }, { "epoch": 0.3636, "grad_norm": 0.2007831186056137, "learning_rate": 7.92804e-05, "loss": 0.2538, "step": 301800 }, { "epoch": 0.3638, "grad_norm": 0.1813400238752365, "learning_rate": 7.924040000000001e-05, "loss": 0.2553, "step": 301900 }, { "epoch": 0.364, "grad_norm": 0.18407954275608063, "learning_rate": 7.92004e-05, "loss": 0.2429, "step": 302000 }, { "epoch": 0.3642, "grad_norm": 0.2030446082353592, "learning_rate": 7.91604e-05, "loss": 0.243, "step": 302100 }, { "epoch": 0.3644, "grad_norm": 0.18323349952697754, "learning_rate": 7.91204e-05, "loss": 0.2449, "step": 302200 }, { "epoch": 0.3646, "grad_norm": 0.1829667091369629, "learning_rate": 7.90804e-05, "loss": 0.2497, "step": 302300 }, { "epoch": 0.3648, "grad_norm": 0.21287867426872253, "learning_rate": 7.90404e-05, "loss": 0.2463, "step": 302400 }, { "epoch": 0.365, "grad_norm": 0.17376112937927246, "learning_rate": 7.900040000000001e-05, "loss": 0.241, "step": 302500 }, { "epoch": 0.3652, "grad_norm": 0.19770017266273499, "learning_rate": 7.89604e-05, "loss": 0.247, "step": 302600 }, { "epoch": 0.3654, "grad_norm": 0.2480800300836563, "learning_rate": 7.892040000000001e-05, "loss": 0.2408, "step": 302700 }, { "epoch": 0.3656, "grad_norm": 0.1723504662513733, "learning_rate": 7.88804e-05, "loss": 0.2475, "step": 302800 }, { "epoch": 0.3658, "grad_norm": 0.19746597111225128, "learning_rate": 7.88404e-05, "loss": 0.2387, "step": 302900 }, { "epoch": 0.366, "grad_norm": 0.2700865864753723, "learning_rate": 7.88004e-05, "loss": 0.2431, "step": 303000 }, { "epoch": 0.3662, "grad_norm": 0.18088355660438538, "learning_rate": 7.87604e-05, "loss": 0.2425, "step": 303100 }, { "epoch": 0.3664, "grad_norm": 0.24252882599830627, "learning_rate": 7.87204e-05, "loss": 0.2456, "step": 303200 }, { "epoch": 0.3666, "grad_norm": 0.18904949724674225, "learning_rate": 7.86804e-05, "loss": 0.2485, "step": 303300 }, { "epoch": 0.3668, "grad_norm": 0.2009989470243454, "learning_rate": 7.864040000000001e-05, "loss": 0.2457, "step": 303400 }, { "epoch": 0.367, "grad_norm": 0.21063704788684845, "learning_rate": 7.860040000000001e-05, "loss": 0.2437, "step": 303500 }, { "epoch": 0.3672, "grad_norm": 0.24128037691116333, "learning_rate": 7.85604e-05, "loss": 0.2519, "step": 303600 }, { "epoch": 0.3674, "grad_norm": 0.18868161737918854, "learning_rate": 7.85204e-05, "loss": 0.2456, "step": 303700 }, { "epoch": 0.3676, "grad_norm": 0.24962688982486725, "learning_rate": 7.84804e-05, "loss": 0.2483, "step": 303800 }, { "epoch": 0.3678, "grad_norm": 0.15526580810546875, "learning_rate": 7.84404e-05, "loss": 0.2507, "step": 303900 }, { "epoch": 0.368, "grad_norm": 0.21151374280452728, "learning_rate": 7.840040000000001e-05, "loss": 0.2587, "step": 304000 }, { "epoch": 0.3682, "grad_norm": 0.18684406578540802, "learning_rate": 7.83604e-05, "loss": 0.2472, "step": 304100 }, { "epoch": 0.3684, "grad_norm": 0.5539630651473999, "learning_rate": 7.832040000000001e-05, "loss": 0.2514, "step": 304200 }, { "epoch": 0.3686, "grad_norm": 0.31068259477615356, "learning_rate": 7.82804e-05, "loss": 0.2521, "step": 304300 }, { "epoch": 0.3688, "grad_norm": 0.15399712324142456, "learning_rate": 7.82404e-05, "loss": 0.245, "step": 304400 }, { "epoch": 0.369, "grad_norm": 0.16748294234275818, "learning_rate": 7.82004e-05, "loss": 0.2458, "step": 304500 }, { "epoch": 0.3692, "grad_norm": 0.35579004883766174, "learning_rate": 7.81604e-05, "loss": 0.2753, "step": 304600 }, { "epoch": 0.3694, "grad_norm": 0.28640273213386536, "learning_rate": 7.81204e-05, "loss": 0.3173, "step": 304700 }, { "epoch": 0.3696, "grad_norm": 0.5388140082359314, "learning_rate": 7.808040000000001e-05, "loss": 0.3084, "step": 304800 }, { "epoch": 0.3698, "grad_norm": 0.28529220819473267, "learning_rate": 7.80404e-05, "loss": 0.2922, "step": 304900 }, { "epoch": 0.37, "grad_norm": 0.465318888425827, "learning_rate": 7.800040000000001e-05, "loss": 0.3043, "step": 305000 }, { "epoch": 0.3702, "grad_norm": 0.25032395124435425, "learning_rate": 7.79604e-05, "loss": 0.3063, "step": 305100 }, { "epoch": 0.3704, "grad_norm": 0.4543182849884033, "learning_rate": 7.79204e-05, "loss": 0.2866, "step": 305200 }, { "epoch": 0.3706, "grad_norm": 0.305585652589798, "learning_rate": 7.788040000000001e-05, "loss": 0.2914, "step": 305300 }, { "epoch": 0.3708, "grad_norm": 0.9315662980079651, "learning_rate": 7.78404e-05, "loss": 0.2972, "step": 305400 }, { "epoch": 0.371, "grad_norm": 0.32617905735969543, "learning_rate": 7.780040000000001e-05, "loss": 0.3116, "step": 305500 }, { "epoch": 0.3712, "grad_norm": 0.3560982346534729, "learning_rate": 7.77604e-05, "loss": 0.3046, "step": 305600 }, { "epoch": 0.3714, "grad_norm": 0.23608624935150146, "learning_rate": 7.772040000000001e-05, "loss": 0.2764, "step": 305700 }, { "epoch": 0.3716, "grad_norm": 0.429745614528656, "learning_rate": 7.76804e-05, "loss": 0.3084, "step": 305800 }, { "epoch": 0.3718, "grad_norm": 0.181761234998703, "learning_rate": 7.76404e-05, "loss": 0.2848, "step": 305900 }, { "epoch": 0.372, "grad_norm": 0.2306230366230011, "learning_rate": 7.76004e-05, "loss": 0.2956, "step": 306000 }, { "epoch": 0.3722, "grad_norm": 0.22008122503757477, "learning_rate": 7.75604e-05, "loss": 0.2669, "step": 306100 }, { "epoch": 0.3724, "grad_norm": 0.1795935481786728, "learning_rate": 7.75204e-05, "loss": 0.2864, "step": 306200 }, { "epoch": 0.3726, "grad_norm": 0.28549113869667053, "learning_rate": 7.748040000000001e-05, "loss": 0.2774, "step": 306300 }, { "epoch": 0.3728, "grad_norm": 0.2546445429325104, "learning_rate": 7.74404e-05, "loss": 0.2714, "step": 306400 }, { "epoch": 0.373, "grad_norm": 0.23946426808834076, "learning_rate": 7.740040000000001e-05, "loss": 0.271, "step": 306500 }, { "epoch": 0.3732, "grad_norm": 0.4045504927635193, "learning_rate": 7.73604e-05, "loss": 0.3021, "step": 306600 }, { "epoch": 0.3734, "grad_norm": 0.32764193415641785, "learning_rate": 7.73204e-05, "loss": 0.2769, "step": 306700 }, { "epoch": 0.3736, "grad_norm": 0.2573786973953247, "learning_rate": 7.72804e-05, "loss": 0.2901, "step": 306800 }, { "epoch": 0.3738, "grad_norm": 0.34804463386535645, "learning_rate": 7.72404e-05, "loss": 0.2925, "step": 306900 }, { "epoch": 0.374, "grad_norm": 0.3770942986011505, "learning_rate": 7.720040000000001e-05, "loss": 0.2738, "step": 307000 }, { "epoch": 0.3742, "grad_norm": 0.27824512124061584, "learning_rate": 7.71604e-05, "loss": 0.28, "step": 307100 }, { "epoch": 0.3744, "grad_norm": 0.24019598960876465, "learning_rate": 7.712040000000001e-05, "loss": 0.2983, "step": 307200 }, { "epoch": 0.3746, "grad_norm": 0.283953458070755, "learning_rate": 7.70804e-05, "loss": 0.293, "step": 307300 }, { "epoch": 0.3748, "grad_norm": 0.2683544456958771, "learning_rate": 7.70404e-05, "loss": 0.2729, "step": 307400 }, { "epoch": 0.375, "grad_norm": 0.3089542090892792, "learning_rate": 7.70004e-05, "loss": 0.2916, "step": 307500 }, { "epoch": 0.3752, "grad_norm": 0.5444771647453308, "learning_rate": 7.69604e-05, "loss": 0.2795, "step": 307600 }, { "epoch": 0.3754, "grad_norm": 0.333789587020874, "learning_rate": 7.69204e-05, "loss": 0.2751, "step": 307700 }, { "epoch": 0.3756, "grad_norm": 0.7448511123657227, "learning_rate": 7.688040000000001e-05, "loss": 0.29, "step": 307800 }, { "epoch": 0.3758, "grad_norm": 0.17118731141090393, "learning_rate": 7.68404e-05, "loss": 0.2819, "step": 307900 }, { "epoch": 0.376, "grad_norm": 0.1759895384311676, "learning_rate": 7.680040000000001e-05, "loss": 0.2877, "step": 308000 }, { "epoch": 0.3762, "grad_norm": 0.26918289065361023, "learning_rate": 7.67604e-05, "loss": 0.2738, "step": 308100 }, { "epoch": 0.3764, "grad_norm": 0.2078232765197754, "learning_rate": 7.67204e-05, "loss": 0.2881, "step": 308200 }, { "epoch": 0.3766, "grad_norm": 0.6741946339607239, "learning_rate": 7.66804e-05, "loss": 0.2742, "step": 308300 }, { "epoch": 0.3768, "grad_norm": 0.2831641137599945, "learning_rate": 7.66404e-05, "loss": 0.3325, "step": 308400 }, { "epoch": 0.377, "grad_norm": 0.16423487663269043, "learning_rate": 7.66004e-05, "loss": 0.2887, "step": 308500 }, { "epoch": 0.3772, "grad_norm": 0.2626672089099884, "learning_rate": 7.65604e-05, "loss": 0.2761, "step": 308600 }, { "epoch": 0.3774, "grad_norm": 0.23534747958183289, "learning_rate": 7.652040000000001e-05, "loss": 0.2843, "step": 308700 }, { "epoch": 0.3776, "grad_norm": 0.4902610182762146, "learning_rate": 7.648040000000001e-05, "loss": 0.2959, "step": 308800 }, { "epoch": 0.3778, "grad_norm": 0.29607897996902466, "learning_rate": 7.644040000000001e-05, "loss": 0.2678, "step": 308900 }, { "epoch": 0.378, "grad_norm": 0.3284173011779785, "learning_rate": 7.64004e-05, "loss": 0.2764, "step": 309000 }, { "epoch": 0.3782, "grad_norm": 0.24199332296848297, "learning_rate": 7.63604e-05, "loss": 0.2766, "step": 309100 }, { "epoch": 0.3784, "grad_norm": 0.3074169158935547, "learning_rate": 7.63204e-05, "loss": 0.2901, "step": 309200 }, { "epoch": 0.3786, "grad_norm": 0.3491748869419098, "learning_rate": 7.628040000000001e-05, "loss": 0.2846, "step": 309300 }, { "epoch": 0.3788, "grad_norm": 0.46956080198287964, "learning_rate": 7.62404e-05, "loss": 0.2853, "step": 309400 }, { "epoch": 0.379, "grad_norm": 0.2703664004802704, "learning_rate": 7.620040000000001e-05, "loss": 0.3082, "step": 309500 }, { "epoch": 0.3792, "grad_norm": 0.25405412912368774, "learning_rate": 7.61604e-05, "loss": 0.2806, "step": 309600 }, { "epoch": 0.3794, "grad_norm": 0.39644813537597656, "learning_rate": 7.61204e-05, "loss": 0.2921, "step": 309700 }, { "epoch": 0.3796, "grad_norm": 0.16149291396141052, "learning_rate": 7.60804e-05, "loss": 0.2622, "step": 309800 }, { "epoch": 0.3798, "grad_norm": 0.314878910779953, "learning_rate": 7.60404e-05, "loss": 0.2703, "step": 309900 }, { "epoch": 0.38, "grad_norm": 0.2966255247592926, "learning_rate": 7.60004e-05, "loss": 0.2854, "step": 310000 }, { "epoch": 0.3802, "grad_norm": 0.20654204487800598, "learning_rate": 7.59604e-05, "loss": 0.29, "step": 310100 }, { "epoch": 0.3804, "grad_norm": 0.2614297568798065, "learning_rate": 7.59204e-05, "loss": 0.2918, "step": 310200 }, { "epoch": 0.3806, "grad_norm": 0.22196181118488312, "learning_rate": 7.588040000000001e-05, "loss": 0.2773, "step": 310300 }, { "epoch": 0.3808, "grad_norm": 0.19726967811584473, "learning_rate": 7.58404e-05, "loss": 0.2817, "step": 310400 }, { "epoch": 0.381, "grad_norm": 0.24383938312530518, "learning_rate": 7.58004e-05, "loss": 0.2837, "step": 310500 }, { "epoch": 0.3812, "grad_norm": 0.21394483745098114, "learning_rate": 7.576040000000001e-05, "loss": 0.2759, "step": 310600 }, { "epoch": 0.3814, "grad_norm": 0.2858792245388031, "learning_rate": 7.57204e-05, "loss": 0.291, "step": 310700 }, { "epoch": 0.3816, "grad_norm": 0.19695499539375305, "learning_rate": 7.568040000000001e-05, "loss": 0.2819, "step": 310800 }, { "epoch": 0.3818, "grad_norm": 0.21151651442050934, "learning_rate": 7.56404e-05, "loss": 0.2702, "step": 310900 }, { "epoch": 0.382, "grad_norm": 0.22047953307628632, "learning_rate": 7.560040000000001e-05, "loss": 0.2866, "step": 311000 }, { "epoch": 0.3822, "grad_norm": 0.5067945718765259, "learning_rate": 7.55604e-05, "loss": 0.2756, "step": 311100 }, { "epoch": 0.3824, "grad_norm": 0.43224334716796875, "learning_rate": 7.55204e-05, "loss": 0.288, "step": 311200 }, { "epoch": 0.3826, "grad_norm": 0.2836945652961731, "learning_rate": 7.54804e-05, "loss": 0.2803, "step": 311300 }, { "epoch": 0.3828, "grad_norm": 0.6460776925086975, "learning_rate": 7.54404e-05, "loss": 0.2686, "step": 311400 }, { "epoch": 0.383, "grad_norm": 0.3450317978858948, "learning_rate": 7.54004e-05, "loss": 0.2747, "step": 311500 }, { "epoch": 0.3832, "grad_norm": 0.2429281622171402, "learning_rate": 7.536040000000001e-05, "loss": 0.2859, "step": 311600 }, { "epoch": 0.3834, "grad_norm": 0.1564161479473114, "learning_rate": 7.53204e-05, "loss": 0.282, "step": 311700 }, { "epoch": 0.3836, "grad_norm": 0.242931067943573, "learning_rate": 7.528040000000001e-05, "loss": 0.2638, "step": 311800 }, { "epoch": 0.3838, "grad_norm": 0.22181962430477142, "learning_rate": 7.52404e-05, "loss": 0.2862, "step": 311900 }, { "epoch": 0.384, "grad_norm": 0.23662132024765015, "learning_rate": 7.52004e-05, "loss": 0.282, "step": 312000 }, { "epoch": 0.3842, "grad_norm": 0.25929373502731323, "learning_rate": 7.51604e-05, "loss": 0.2827, "step": 312100 }, { "epoch": 0.3844, "grad_norm": 0.3989902436733246, "learning_rate": 7.51204e-05, "loss": 0.2733, "step": 312200 }, { "epoch": 0.3846, "grad_norm": 0.26393795013427734, "learning_rate": 7.508040000000001e-05, "loss": 0.2827, "step": 312300 }, { "epoch": 0.3848, "grad_norm": 0.6005264520645142, "learning_rate": 7.50404e-05, "loss": 0.2917, "step": 312400 }, { "epoch": 0.385, "grad_norm": 0.23418720066547394, "learning_rate": 7.500040000000001e-05, "loss": 0.2657, "step": 312500 }, { "epoch": 0.3852, "grad_norm": 0.1971062570810318, "learning_rate": 7.496040000000002e-05, "loss": 0.2741, "step": 312600 }, { "epoch": 0.3854, "grad_norm": 0.9465832710266113, "learning_rate": 7.49204e-05, "loss": 0.2761, "step": 312700 }, { "epoch": 0.3856, "grad_norm": 0.2490047663450241, "learning_rate": 7.48804e-05, "loss": 0.2843, "step": 312800 }, { "epoch": 0.3858, "grad_norm": 0.26003503799438477, "learning_rate": 7.48404e-05, "loss": 0.2663, "step": 312900 }, { "epoch": 0.386, "grad_norm": 1.0622795820236206, "learning_rate": 7.48004e-05, "loss": 0.2784, "step": 313000 }, { "epoch": 0.3862, "grad_norm": 0.25771743059158325, "learning_rate": 7.476040000000001e-05, "loss": 0.2726, "step": 313100 }, { "epoch": 0.3864, "grad_norm": 0.3834233283996582, "learning_rate": 7.47204e-05, "loss": 0.2598, "step": 313200 }, { "epoch": 0.3866, "grad_norm": 0.19467617571353912, "learning_rate": 7.468040000000001e-05, "loss": 0.2667, "step": 313300 }, { "epoch": 0.3868, "grad_norm": 0.23708763718605042, "learning_rate": 7.46404e-05, "loss": 0.2619, "step": 313400 }, { "epoch": 0.387, "grad_norm": 0.2725387215614319, "learning_rate": 7.46004e-05, "loss": 0.2723, "step": 313500 }, { "epoch": 0.3872, "grad_norm": 0.23830577731132507, "learning_rate": 7.45604e-05, "loss": 0.26, "step": 313600 }, { "epoch": 0.3874, "grad_norm": 0.20995588600635529, "learning_rate": 7.45204e-05, "loss": 0.2563, "step": 313700 }, { "epoch": 0.3876, "grad_norm": 0.20928217470645905, "learning_rate": 7.44804e-05, "loss": 0.2839, "step": 313800 }, { "epoch": 0.3878, "grad_norm": 0.21170674264431, "learning_rate": 7.44404e-05, "loss": 0.2693, "step": 313900 }, { "epoch": 0.388, "grad_norm": 0.16965176165103912, "learning_rate": 7.440040000000001e-05, "loss": 0.2739, "step": 314000 }, { "epoch": 0.3882, "grad_norm": 0.32784977555274963, "learning_rate": 7.436040000000001e-05, "loss": 0.2638, "step": 314100 }, { "epoch": 0.3884, "grad_norm": 0.45630523562431335, "learning_rate": 7.432040000000001e-05, "loss": 0.2681, "step": 314200 }, { "epoch": 0.3886, "grad_norm": 0.30572786927223206, "learning_rate": 7.42804e-05, "loss": 0.2632, "step": 314300 }, { "epoch": 0.3888, "grad_norm": 0.21931806206703186, "learning_rate": 7.42404e-05, "loss": 0.2717, "step": 314400 }, { "epoch": 0.389, "grad_norm": 0.28573718667030334, "learning_rate": 7.42004e-05, "loss": 0.2684, "step": 314500 }, { "epoch": 0.3892, "grad_norm": 2.636169672012329, "learning_rate": 7.416040000000001e-05, "loss": 0.2664, "step": 314600 }, { "epoch": 0.3894, "grad_norm": 0.19433589279651642, "learning_rate": 7.41204e-05, "loss": 0.264, "step": 314700 }, { "epoch": 0.3896, "grad_norm": 0.2967517673969269, "learning_rate": 7.408040000000001e-05, "loss": 0.2744, "step": 314800 }, { "epoch": 0.3898, "grad_norm": 0.29649585485458374, "learning_rate": 7.40404e-05, "loss": 0.2656, "step": 314900 }, { "epoch": 0.39, "grad_norm": 0.2209979146718979, "learning_rate": 7.40004e-05, "loss": 0.275, "step": 315000 }, { "epoch": 0.3902, "grad_norm": 0.1720481663942337, "learning_rate": 7.39604e-05, "loss": 0.2581, "step": 315100 }, { "epoch": 0.3904, "grad_norm": 0.2020689994096756, "learning_rate": 7.39204e-05, "loss": 0.2617, "step": 315200 }, { "epoch": 0.3906, "grad_norm": 0.22698251903057098, "learning_rate": 7.38804e-05, "loss": 0.269, "step": 315300 }, { "epoch": 0.3908, "grad_norm": 0.2073138952255249, "learning_rate": 7.38404e-05, "loss": 0.2801, "step": 315400 }, { "epoch": 0.391, "grad_norm": 0.4935136139392853, "learning_rate": 7.38004e-05, "loss": 0.2653, "step": 315500 }, { "epoch": 0.3912, "grad_norm": 0.1566108763217926, "learning_rate": 7.376040000000001e-05, "loss": 0.258, "step": 315600 }, { "epoch": 0.3914, "grad_norm": 0.24738989770412445, "learning_rate": 7.37204e-05, "loss": 0.2634, "step": 315700 }, { "epoch": 0.3916, "grad_norm": 0.7126500010490417, "learning_rate": 7.36804e-05, "loss": 0.2653, "step": 315800 }, { "epoch": 0.3918, "grad_norm": 0.23705486953258514, "learning_rate": 7.36404e-05, "loss": 0.2677, "step": 315900 }, { "epoch": 0.392, "grad_norm": 0.3229648470878601, "learning_rate": 7.36004e-05, "loss": 0.2816, "step": 316000 }, { "epoch": 0.3922, "grad_norm": 0.20382308959960938, "learning_rate": 7.356040000000001e-05, "loss": 0.2606, "step": 316100 }, { "epoch": 0.3924, "grad_norm": 0.19805587828159332, "learning_rate": 7.35204e-05, "loss": 0.2673, "step": 316200 }, { "epoch": 0.3926, "grad_norm": 0.24583861231803894, "learning_rate": 7.348040000000001e-05, "loss": 0.273, "step": 316300 }, { "epoch": 0.3928, "grad_norm": 0.2229291796684265, "learning_rate": 7.34404e-05, "loss": 0.2791, "step": 316400 }, { "epoch": 0.393, "grad_norm": 0.21293219923973083, "learning_rate": 7.34004e-05, "loss": 0.2672, "step": 316500 }, { "epoch": 0.3932, "grad_norm": 0.20457857847213745, "learning_rate": 7.33604e-05, "loss": 0.261, "step": 316600 }, { "epoch": 0.3934, "grad_norm": 0.21731828153133392, "learning_rate": 7.33204e-05, "loss": 0.2681, "step": 316700 }, { "epoch": 0.3936, "grad_norm": 0.26696541905403137, "learning_rate": 7.32804e-05, "loss": 0.2708, "step": 316800 }, { "epoch": 0.3938, "grad_norm": 0.21308811008930206, "learning_rate": 7.324040000000001e-05, "loss": 0.2642, "step": 316900 }, { "epoch": 0.394, "grad_norm": 0.2161969393491745, "learning_rate": 7.32004e-05, "loss": 0.2633, "step": 317000 }, { "epoch": 0.3942, "grad_norm": 0.21060483157634735, "learning_rate": 7.316040000000001e-05, "loss": 0.2669, "step": 317100 }, { "epoch": 0.3944, "grad_norm": 0.2582820653915405, "learning_rate": 7.31204e-05, "loss": 0.2644, "step": 317200 }, { "epoch": 0.3946, "grad_norm": 0.24517767131328583, "learning_rate": 7.30804e-05, "loss": 0.2691, "step": 317300 }, { "epoch": 0.3948, "grad_norm": 0.2194310426712036, "learning_rate": 7.30404e-05, "loss": 0.2653, "step": 317400 }, { "epoch": 0.395, "grad_norm": 0.28793978691101074, "learning_rate": 7.30004e-05, "loss": 0.2643, "step": 317500 }, { "epoch": 0.3952, "grad_norm": 0.6977545022964478, "learning_rate": 7.296040000000001e-05, "loss": 0.2723, "step": 317600 }, { "epoch": 0.3954, "grad_norm": 0.2802855372428894, "learning_rate": 7.29204e-05, "loss": 0.3051, "step": 317700 }, { "epoch": 0.3956, "grad_norm": 0.2557845413684845, "learning_rate": 7.288040000000001e-05, "loss": 0.2639, "step": 317800 }, { "epoch": 0.3958, "grad_norm": 0.3409139811992645, "learning_rate": 7.284040000000001e-05, "loss": 0.279, "step": 317900 }, { "epoch": 0.396, "grad_norm": 0.18252715468406677, "learning_rate": 7.280040000000001e-05, "loss": 0.2664, "step": 318000 }, { "epoch": 0.3962, "grad_norm": 0.2673168182373047, "learning_rate": 7.27604e-05, "loss": 0.2685, "step": 318100 }, { "epoch": 0.3964, "grad_norm": 0.37415772676467896, "learning_rate": 7.27204e-05, "loss": 0.2564, "step": 318200 }, { "epoch": 0.3966, "grad_norm": 0.23668460547924042, "learning_rate": 7.26804e-05, "loss": 0.2792, "step": 318300 }, { "epoch": 0.3968, "grad_norm": 0.4940067231655121, "learning_rate": 7.264040000000001e-05, "loss": 0.2757, "step": 318400 }, { "epoch": 0.397, "grad_norm": 0.18225626647472382, "learning_rate": 7.26004e-05, "loss": 0.2789, "step": 318500 }, { "epoch": 0.3972, "grad_norm": 0.19703717529773712, "learning_rate": 7.256040000000001e-05, "loss": 0.2761, "step": 318600 }, { "epoch": 0.3974, "grad_norm": 0.21940729022026062, "learning_rate": 7.25204e-05, "loss": 0.291, "step": 318700 }, { "epoch": 0.3976, "grad_norm": 0.293050080537796, "learning_rate": 7.24804e-05, "loss": 0.2809, "step": 318800 }, { "epoch": 0.3978, "grad_norm": 0.45579320192337036, "learning_rate": 7.24404e-05, "loss": 0.2659, "step": 318900 }, { "epoch": 0.398, "grad_norm": 0.20021362602710724, "learning_rate": 7.24004e-05, "loss": 0.2687, "step": 319000 }, { "epoch": 0.3982, "grad_norm": 0.22239625453948975, "learning_rate": 7.23604e-05, "loss": 0.2789, "step": 319100 }, { "epoch": 0.3984, "grad_norm": 0.2077815979719162, "learning_rate": 7.23204e-05, "loss": 0.2563, "step": 319200 }, { "epoch": 0.3986, "grad_norm": 0.2690514028072357, "learning_rate": 7.228040000000001e-05, "loss": 0.2593, "step": 319300 }, { "epoch": 0.3988, "grad_norm": 0.18208642303943634, "learning_rate": 7.224040000000001e-05, "loss": 0.2709, "step": 319400 }, { "epoch": 0.399, "grad_norm": 0.16601015627384186, "learning_rate": 7.220040000000001e-05, "loss": 0.2738, "step": 319500 }, { "epoch": 0.3992, "grad_norm": 0.18831057846546173, "learning_rate": 7.21604e-05, "loss": 0.2703, "step": 319600 }, { "epoch": 0.3994, "grad_norm": 0.19717268645763397, "learning_rate": 7.21204e-05, "loss": 0.2563, "step": 319700 }, { "epoch": 0.3996, "grad_norm": 0.2667396366596222, "learning_rate": 7.20804e-05, "loss": 0.2578, "step": 319800 }, { "epoch": 0.3998, "grad_norm": 0.2868683934211731, "learning_rate": 7.204040000000001e-05, "loss": 0.2691, "step": 319900 }, { "epoch": 0.4, "grad_norm": 0.21218955516815186, "learning_rate": 7.20004e-05, "loss": 0.2637, "step": 320000 }, { "epoch": 0.4002, "grad_norm": 0.4244702160358429, "learning_rate": 7.196040000000001e-05, "loss": 0.2767, "step": 320100 }, { "epoch": 0.4004, "grad_norm": 0.23811425268650055, "learning_rate": 7.19204e-05, "loss": 0.2627, "step": 320200 }, { "epoch": 0.4006, "grad_norm": 0.20318666100502014, "learning_rate": 7.18804e-05, "loss": 0.2616, "step": 320300 }, { "epoch": 0.4008, "grad_norm": 0.3025484085083008, "learning_rate": 7.18404e-05, "loss": 0.2685, "step": 320400 }, { "epoch": 0.401, "grad_norm": 0.17960281670093536, "learning_rate": 7.18004e-05, "loss": 0.2733, "step": 320500 }, { "epoch": 0.4012, "grad_norm": 0.25798577070236206, "learning_rate": 7.17604e-05, "loss": 0.264, "step": 320600 }, { "epoch": 0.4014, "grad_norm": 0.1824919730424881, "learning_rate": 7.17204e-05, "loss": 0.2684, "step": 320700 }, { "epoch": 0.4016, "grad_norm": 0.23640932142734528, "learning_rate": 7.16804e-05, "loss": 0.265, "step": 320800 }, { "epoch": 0.4018, "grad_norm": 0.22346696257591248, "learning_rate": 7.164040000000001e-05, "loss": 0.2574, "step": 320900 }, { "epoch": 0.402, "grad_norm": 0.33641451597213745, "learning_rate": 7.16004e-05, "loss": 0.2644, "step": 321000 }, { "epoch": 0.4022, "grad_norm": 0.24027025699615479, "learning_rate": 7.15604e-05, "loss": 0.2803, "step": 321100 }, { "epoch": 0.4024, "grad_norm": 0.2125771939754486, "learning_rate": 7.15204e-05, "loss": 0.2733, "step": 321200 }, { "epoch": 0.4026, "grad_norm": 0.2774077355861664, "learning_rate": 7.14804e-05, "loss": 0.2781, "step": 321300 }, { "epoch": 0.4028, "grad_norm": 0.1831909418106079, "learning_rate": 7.144040000000001e-05, "loss": 0.2661, "step": 321400 }, { "epoch": 0.403, "grad_norm": 0.2540408968925476, "learning_rate": 7.14004e-05, "loss": 0.2801, "step": 321500 }, { "epoch": 0.4032, "grad_norm": 0.2425178438425064, "learning_rate": 7.136040000000001e-05, "loss": 0.2584, "step": 321600 }, { "epoch": 0.4034, "grad_norm": 0.1953432857990265, "learning_rate": 7.13204e-05, "loss": 0.2674, "step": 321700 }, { "epoch": 0.4036, "grad_norm": 0.31084179878234863, "learning_rate": 7.12804e-05, "loss": 0.2599, "step": 321800 }, { "epoch": 0.4038, "grad_norm": 0.17004917562007904, "learning_rate": 7.12404e-05, "loss": 0.272, "step": 321900 }, { "epoch": 0.404, "grad_norm": 0.2186584621667862, "learning_rate": 7.12004e-05, "loss": 0.2615, "step": 322000 }, { "epoch": 0.4042, "grad_norm": 0.2131323516368866, "learning_rate": 7.11604e-05, "loss": 0.2734, "step": 322100 }, { "epoch": 0.4044, "grad_norm": 0.29444190859794617, "learning_rate": 7.11204e-05, "loss": 0.2577, "step": 322200 }, { "epoch": 0.4046, "grad_norm": 0.2686418890953064, "learning_rate": 7.10804e-05, "loss": 0.2636, "step": 322300 }, { "epoch": 0.4048, "grad_norm": 0.2958160936832428, "learning_rate": 7.104040000000001e-05, "loss": 0.2611, "step": 322400 }, { "epoch": 0.405, "grad_norm": 0.24784350395202637, "learning_rate": 7.10004e-05, "loss": 0.2658, "step": 322500 }, { "epoch": 0.4052, "grad_norm": 0.21654903888702393, "learning_rate": 7.09604e-05, "loss": 0.2625, "step": 322600 }, { "epoch": 0.4054, "grad_norm": 0.17862387001514435, "learning_rate": 7.092039999999999e-05, "loss": 0.2611, "step": 322700 }, { "epoch": 0.4056, "grad_norm": 0.2753390371799469, "learning_rate": 7.08804e-05, "loss": 0.2626, "step": 322800 }, { "epoch": 0.4058, "grad_norm": 0.41417863965034485, "learning_rate": 7.084040000000001e-05, "loss": 0.262, "step": 322900 }, { "epoch": 0.406, "grad_norm": 0.6197725534439087, "learning_rate": 7.08004e-05, "loss": 0.2647, "step": 323000 }, { "epoch": 0.4062, "grad_norm": 0.31697189807891846, "learning_rate": 7.076040000000001e-05, "loss": 0.2633, "step": 323100 }, { "epoch": 0.4064, "grad_norm": 0.7414258718490601, "learning_rate": 7.072040000000001e-05, "loss": 0.271, "step": 323200 }, { "epoch": 0.4066, "grad_norm": 0.22181887924671173, "learning_rate": 7.068040000000001e-05, "loss": 0.2637, "step": 323300 }, { "epoch": 0.4068, "grad_norm": 0.24176335334777832, "learning_rate": 7.06404e-05, "loss": 0.2641, "step": 323400 }, { "epoch": 0.407, "grad_norm": 0.2450632005929947, "learning_rate": 7.06004e-05, "loss": 0.2634, "step": 323500 }, { "epoch": 0.4072, "grad_norm": 0.1967063993215561, "learning_rate": 7.05604e-05, "loss": 0.273, "step": 323600 }, { "epoch": 0.4074, "grad_norm": 0.16844309866428375, "learning_rate": 7.052040000000001e-05, "loss": 0.2664, "step": 323700 }, { "epoch": 0.4076, "grad_norm": 0.24496600031852722, "learning_rate": 7.04804e-05, "loss": 0.2706, "step": 323800 }, { "epoch": 0.4078, "grad_norm": 0.24819956719875336, "learning_rate": 7.044040000000001e-05, "loss": 0.2573, "step": 323900 }, { "epoch": 0.408, "grad_norm": 0.31527596712112427, "learning_rate": 7.04004e-05, "loss": 0.2602, "step": 324000 }, { "epoch": 0.4082, "grad_norm": 0.22740913927555084, "learning_rate": 7.03604e-05, "loss": 0.2614, "step": 324100 }, { "epoch": 0.4084, "grad_norm": 0.2220146805047989, "learning_rate": 7.03204e-05, "loss": 0.2682, "step": 324200 }, { "epoch": 0.4086, "grad_norm": 0.22316837310791016, "learning_rate": 7.02804e-05, "loss": 0.2612, "step": 324300 }, { "epoch": 0.4088, "grad_norm": 0.2431160807609558, "learning_rate": 7.02404e-05, "loss": 0.2753, "step": 324400 }, { "epoch": 0.409, "grad_norm": 0.2228887677192688, "learning_rate": 7.02004e-05, "loss": 0.2612, "step": 324500 }, { "epoch": 0.4092, "grad_norm": 0.25735148787498474, "learning_rate": 7.01604e-05, "loss": 0.2639, "step": 324600 }, { "epoch": 0.4094, "grad_norm": 0.19601066410541534, "learning_rate": 7.012040000000001e-05, "loss": 0.2515, "step": 324700 }, { "epoch": 0.4096, "grad_norm": 0.2028639167547226, "learning_rate": 7.008040000000001e-05, "loss": 0.2704, "step": 324800 }, { "epoch": 0.4098, "grad_norm": 0.18071521818637848, "learning_rate": 7.00404e-05, "loss": 0.2657, "step": 324900 }, { "epoch": 0.41, "grad_norm": 0.22790144383907318, "learning_rate": 7.00004e-05, "loss": 0.2852, "step": 325000 } ], "logging_steps": 100, "max_steps": 500000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.491078602752e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }