diff --git "a/checkpoint-558/trainer_state.json" "b/checkpoint-558/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-558/trainer_state.json" @@ -0,0 +1,3940 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5, + "eval_steps": 500, + "global_step": 558, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002688172043010753, + "grad_norm": 1.6433222600981285, + "learning_rate": 0.0, + "loss": 1.562, + "step": 1 + }, + { + "epoch": 0.005376344086021506, + "grad_norm": 1.6862631068558513, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.4706, + "step": 2 + }, + { + "epoch": 0.008064516129032258, + "grad_norm": 1.7423201097805276, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.5406, + "step": 3 + }, + { + "epoch": 0.010752688172043012, + "grad_norm": 1.7727625055064622, + "learning_rate": 3e-06, + "loss": 1.5182, + "step": 4 + }, + { + "epoch": 0.013440860215053764, + "grad_norm": 1.5457482765192463, + "learning_rate": 4.000000000000001e-06, + "loss": 1.5169, + "step": 5 + }, + { + "epoch": 0.016129032258064516, + "grad_norm": 1.5659007249743502, + "learning_rate": 5e-06, + "loss": 1.4922, + "step": 6 + }, + { + "epoch": 0.01881720430107527, + "grad_norm": 1.3878881126089677, + "learning_rate": 6e-06, + "loss": 1.4863, + "step": 7 + }, + { + "epoch": 0.021505376344086023, + "grad_norm": 1.295368020848385, + "learning_rate": 7e-06, + "loss": 1.4839, + "step": 8 + }, + { + "epoch": 0.024193548387096774, + "grad_norm": 1.589857887668944, + "learning_rate": 8.000000000000001e-06, + "loss": 1.4303, + "step": 9 + }, + { + "epoch": 0.026881720430107527, + "grad_norm": 2.60679604894195, + "learning_rate": 9e-06, + "loss": 1.3744, + "step": 10 + }, + { + "epoch": 0.02956989247311828, + "grad_norm": 0.8410885692002656, + "learning_rate": 1e-05, + "loss": 1.3498, + "step": 11 + }, + { + "epoch": 0.03225806451612903, + "grad_norm": 0.7927855266728604, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.3179, + "step": 12 + }, + { + "epoch": 0.03494623655913978, + "grad_norm": 0.6808035050220127, + "learning_rate": 1.2e-05, + "loss": 1.3268, + "step": 13 + }, + { + "epoch": 0.03763440860215054, + "grad_norm": 0.6602967909334083, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.2784, + "step": 14 + }, + { + "epoch": 0.04032258064516129, + "grad_norm": 0.5797556052811048, + "learning_rate": 1.4e-05, + "loss": 1.2949, + "step": 15 + }, + { + "epoch": 0.043010752688172046, + "grad_norm": 0.6000541560518325, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.288, + "step": 16 + }, + { + "epoch": 0.0456989247311828, + "grad_norm": 0.6494981992893607, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.2449, + "step": 17 + }, + { + "epoch": 0.04838709677419355, + "grad_norm": 0.6723097988215474, + "learning_rate": 1.7e-05, + "loss": 1.2102, + "step": 18 + }, + { + "epoch": 0.051075268817204304, + "grad_norm": 0.6702835925568053, + "learning_rate": 1.8e-05, + "loss": 1.2025, + "step": 19 + }, + { + "epoch": 0.053763440860215055, + "grad_norm": 0.625636082792655, + "learning_rate": 1.9e-05, + "loss": 1.2777, + "step": 20 + }, + { + "epoch": 0.056451612903225805, + "grad_norm": 0.6253912624763358, + "learning_rate": 2e-05, + "loss": 1.2669, + "step": 21 + }, + { + "epoch": 0.05913978494623656, + "grad_norm": 0.5910337660829342, + "learning_rate": 2.1000000000000002e-05, + "loss": 1.2654, + "step": 22 + }, + { + "epoch": 0.06182795698924731, + "grad_norm": 0.6304908028391322, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.2413, + "step": 23 + }, + { + "epoch": 0.06451612903225806, + "grad_norm": 0.5377853121890415, + "learning_rate": 2.3e-05, + "loss": 1.2109, + "step": 24 + }, + { + "epoch": 0.06720430107526881, + "grad_norm": 0.4970873703549533, + "learning_rate": 2.4e-05, + "loss": 1.1359, + "step": 25 + }, + { + "epoch": 0.06989247311827956, + "grad_norm": 0.5292734885521813, + "learning_rate": 2.5e-05, + "loss": 1.2236, + "step": 26 + }, + { + "epoch": 0.07258064516129033, + "grad_norm": 0.5428754620149544, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.2083, + "step": 27 + }, + { + "epoch": 0.07526881720430108, + "grad_norm": 0.5711123503896314, + "learning_rate": 2.7000000000000002e-05, + "loss": 1.2161, + "step": 28 + }, + { + "epoch": 0.07795698924731183, + "grad_norm": 0.49149041488377043, + "learning_rate": 2.8e-05, + "loss": 1.1454, + "step": 29 + }, + { + "epoch": 0.08064516129032258, + "grad_norm": 0.5285852530799724, + "learning_rate": 2.9e-05, + "loss": 1.1194, + "step": 30 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 0.5295555329242986, + "learning_rate": 3.0000000000000004e-05, + "loss": 1.1688, + "step": 31 + }, + { + "epoch": 0.08602150537634409, + "grad_norm": 0.465354706566009, + "learning_rate": 3.1e-05, + "loss": 1.1743, + "step": 32 + }, + { + "epoch": 0.08870967741935484, + "grad_norm": 0.4486072933924605, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.0818, + "step": 33 + }, + { + "epoch": 0.0913978494623656, + "grad_norm": 0.496727888984662, + "learning_rate": 3.3e-05, + "loss": 1.2101, + "step": 34 + }, + { + "epoch": 0.09408602150537634, + "grad_norm": 0.43899748210993167, + "learning_rate": 3.4e-05, + "loss": 1.1884, + "step": 35 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 0.4147227405541853, + "learning_rate": 3.5000000000000004e-05, + "loss": 1.0814, + "step": 36 + }, + { + "epoch": 0.09946236559139784, + "grad_norm": 0.48760701758721925, + "learning_rate": 3.6e-05, + "loss": 1.1212, + "step": 37 + }, + { + "epoch": 0.10215053763440861, + "grad_norm": 0.49917378567432974, + "learning_rate": 3.7000000000000005e-05, + "loss": 1.1984, + "step": 38 + }, + { + "epoch": 0.10483870967741936, + "grad_norm": 0.5304015628409972, + "learning_rate": 3.8e-05, + "loss": 1.1274, + "step": 39 + }, + { + "epoch": 0.10752688172043011, + "grad_norm": 0.4726408598975661, + "learning_rate": 3.9e-05, + "loss": 1.1323, + "step": 40 + }, + { + "epoch": 0.11021505376344086, + "grad_norm": 0.44174146995469904, + "learning_rate": 4e-05, + "loss": 1.1898, + "step": 41 + }, + { + "epoch": 0.11290322580645161, + "grad_norm": 0.5087279682773094, + "learning_rate": 3.999980086219931e-05, + "loss": 1.1469, + "step": 42 + }, + { + "epoch": 0.11559139784946236, + "grad_norm": 0.5626510931079601, + "learning_rate": 3.999920345276283e-05, + "loss": 1.1321, + "step": 43 + }, + { + "epoch": 0.11827956989247312, + "grad_norm": 0.47565220090788773, + "learning_rate": 3.999820778358724e-05, + "loss": 1.1453, + "step": 44 + }, + { + "epoch": 0.12096774193548387, + "grad_norm": 0.4431044005508681, + "learning_rate": 3.999681387450007e-05, + "loss": 1.1408, + "step": 45 + }, + { + "epoch": 0.12365591397849462, + "grad_norm": 0.47942624390584926, + "learning_rate": 3.999502175325932e-05, + "loss": 1.168, + "step": 46 + }, + { + "epoch": 0.12634408602150538, + "grad_norm": 0.43166434321061714, + "learning_rate": 3.999283145555291e-05, + "loss": 1.1087, + "step": 47 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 0.47105749411720044, + "learning_rate": 3.999024302499794e-05, + "loss": 1.0752, + "step": 48 + }, + { + "epoch": 0.13172043010752688, + "grad_norm": 0.3959072081415341, + "learning_rate": 3.998725651313984e-05, + "loss": 1.1011, + "step": 49 + }, + { + "epoch": 0.13440860215053763, + "grad_norm": 0.4416535692834609, + "learning_rate": 3.998387197945135e-05, + "loss": 1.1306, + "step": 50 + }, + { + "epoch": 0.13709677419354838, + "grad_norm": 0.4272647809985287, + "learning_rate": 3.9980089491331344e-05, + "loss": 1.1381, + "step": 51 + }, + { + "epoch": 0.13978494623655913, + "grad_norm": 0.47769854993592265, + "learning_rate": 3.997590912410345e-05, + "loss": 1.0976, + "step": 52 + }, + { + "epoch": 0.1424731182795699, + "grad_norm": 0.3877500456630632, + "learning_rate": 3.997133096101458e-05, + "loss": 1.128, + "step": 53 + }, + { + "epoch": 0.14516129032258066, + "grad_norm": 0.3869721085588235, + "learning_rate": 3.996635509323327e-05, + "loss": 1.1225, + "step": 54 + }, + { + "epoch": 0.1478494623655914, + "grad_norm": 0.47271590281090886, + "learning_rate": 3.9960981619847856e-05, + "loss": 1.1141, + "step": 55 + }, + { + "epoch": 0.15053763440860216, + "grad_norm": 0.4368206211090345, + "learning_rate": 3.99552106478645e-05, + "loss": 1.0872, + "step": 56 + }, + { + "epoch": 0.1532258064516129, + "grad_norm": 0.3872679475185707, + "learning_rate": 3.994904229220507e-05, + "loss": 1.1514, + "step": 57 + }, + { + "epoch": 0.15591397849462366, + "grad_norm": 0.406268890860899, + "learning_rate": 3.9942476675704854e-05, + "loss": 1.0965, + "step": 58 + }, + { + "epoch": 0.1586021505376344, + "grad_norm": 0.43172418498531184, + "learning_rate": 3.993551392911009e-05, + "loss": 1.1192, + "step": 59 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 0.4258357918752704, + "learning_rate": 3.9928154191075375e-05, + "loss": 1.0623, + "step": 60 + }, + { + "epoch": 0.1639784946236559, + "grad_norm": 0.4585556740184179, + "learning_rate": 3.9920397608160925e-05, + "loss": 1.1076, + "step": 61 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.44452627464263844, + "learning_rate": 3.991224433482961e-05, + "loss": 1.1107, + "step": 62 + }, + { + "epoch": 0.1693548387096774, + "grad_norm": 0.4787003491624029, + "learning_rate": 3.990369453344394e-05, + "loss": 1.1165, + "step": 63 + }, + { + "epoch": 0.17204301075268819, + "grad_norm": 0.4704549745433953, + "learning_rate": 3.989474837426277e-05, + "loss": 1.1541, + "step": 64 + }, + { + "epoch": 0.17473118279569894, + "grad_norm": 0.4026214434021435, + "learning_rate": 3.9885406035437953e-05, + "loss": 1.1166, + "step": 65 + }, + { + "epoch": 0.1774193548387097, + "grad_norm": 0.40057979364796353, + "learning_rate": 3.987566770301076e-05, + "loss": 1.0626, + "step": 66 + }, + { + "epoch": 0.18010752688172044, + "grad_norm": 0.4340486368362563, + "learning_rate": 3.98655335709082e-05, + "loss": 1.104, + "step": 67 + }, + { + "epoch": 0.1827956989247312, + "grad_norm": 0.42609639195543936, + "learning_rate": 3.985500384093917e-05, + "loss": 1.0893, + "step": 68 + }, + { + "epoch": 0.18548387096774194, + "grad_norm": 0.381378569874383, + "learning_rate": 3.984407872279037e-05, + "loss": 1.0433, + "step": 69 + }, + { + "epoch": 0.1881720430107527, + "grad_norm": 0.3903976348529897, + "learning_rate": 3.983275843402222e-05, + "loss": 1.1019, + "step": 70 + }, + { + "epoch": 0.19086021505376344, + "grad_norm": 0.3648695348221521, + "learning_rate": 3.982104320006446e-05, + "loss": 1.0992, + "step": 71 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 1.8993059639660952, + "learning_rate": 3.9808933254211665e-05, + "loss": 1.1056, + "step": 72 + }, + { + "epoch": 0.19623655913978494, + "grad_norm": 0.46580843289168206, + "learning_rate": 3.979642883761866e-05, + "loss": 1.1031, + "step": 73 + }, + { + "epoch": 0.1989247311827957, + "grad_norm": 0.449285515287558, + "learning_rate": 3.978353019929562e-05, + "loss": 1.1068, + "step": 74 + }, + { + "epoch": 0.20161290322580644, + "grad_norm": 0.5567418056951845, + "learning_rate": 3.977023759610321e-05, + "loss": 1.0446, + "step": 75 + }, + { + "epoch": 0.20430107526881722, + "grad_norm": 0.38684392317210076, + "learning_rate": 3.9756551292747405e-05, + "loss": 1.0377, + "step": 76 + }, + { + "epoch": 0.20698924731182797, + "grad_norm": 0.473773440244898, + "learning_rate": 3.974247156177423e-05, + "loss": 1.1396, + "step": 77 + }, + { + "epoch": 0.20967741935483872, + "grad_norm": 0.4177520757238314, + "learning_rate": 3.9727998683564355e-05, + "loss": 1.1008, + "step": 78 + }, + { + "epoch": 0.21236559139784947, + "grad_norm": 0.39719194878309766, + "learning_rate": 3.9713132946327494e-05, + "loss": 1.0215, + "step": 79 + }, + { + "epoch": 0.21505376344086022, + "grad_norm": 0.4105085260167095, + "learning_rate": 3.9697874646096675e-05, + "loss": 1.1115, + "step": 80 + }, + { + "epoch": 0.21774193548387097, + "grad_norm": 0.4087045401288919, + "learning_rate": 3.968222408672232e-05, + "loss": 1.0579, + "step": 81 + }, + { + "epoch": 0.22043010752688172, + "grad_norm": 0.39033402258475636, + "learning_rate": 3.9666181579866244e-05, + "loss": 1.0692, + "step": 82 + }, + { + "epoch": 0.22311827956989247, + "grad_norm": 0.41439706526743936, + "learning_rate": 3.964974744499539e-05, + "loss": 1.0865, + "step": 83 + }, + { + "epoch": 0.22580645161290322, + "grad_norm": 0.38234297411695073, + "learning_rate": 3.963292200937551e-05, + "loss": 1.0173, + "step": 84 + }, + { + "epoch": 0.22849462365591397, + "grad_norm": 0.5308750280660687, + "learning_rate": 3.961570560806461e-05, + "loss": 1.067, + "step": 85 + }, + { + "epoch": 0.23118279569892472, + "grad_norm": 0.43351295582441124, + "learning_rate": 3.959809858390634e-05, + "loss": 1.086, + "step": 86 + }, + { + "epoch": 0.23387096774193547, + "grad_norm": 0.42069712201952686, + "learning_rate": 3.9580101287523105e-05, + "loss": 1.1064, + "step": 87 + }, + { + "epoch": 0.23655913978494625, + "grad_norm": 0.42821523209412365, + "learning_rate": 3.95617140773091e-05, + "loss": 1.0263, + "step": 88 + }, + { + "epoch": 0.239247311827957, + "grad_norm": 0.4114502165683399, + "learning_rate": 3.954293731942319e-05, + "loss": 1.0729, + "step": 89 + }, + { + "epoch": 0.24193548387096775, + "grad_norm": 0.4131919780645225, + "learning_rate": 3.95237713877816e-05, + "loss": 1.0621, + "step": 90 + }, + { + "epoch": 0.2446236559139785, + "grad_norm": 0.4433939594965718, + "learning_rate": 3.950421666405048e-05, + "loss": 1.0805, + "step": 91 + }, + { + "epoch": 0.24731182795698925, + "grad_norm": 0.4056188018789589, + "learning_rate": 3.948427353763829e-05, + "loss": 1.0784, + "step": 92 + }, + { + "epoch": 0.25, + "grad_norm": 0.4642044159391645, + "learning_rate": 3.946394240568807e-05, + "loss": 1.0406, + "step": 93 + }, + { + "epoch": 0.25268817204301075, + "grad_norm": 0.4280982724994961, + "learning_rate": 3.944322367306951e-05, + "loss": 1.1117, + "step": 94 + }, + { + "epoch": 0.2553763440860215, + "grad_norm": 0.41758547723414086, + "learning_rate": 3.942211775237089e-05, + "loss": 1.0747, + "step": 95 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 0.4344009299837567, + "learning_rate": 3.940062506389089e-05, + "loss": 1.1249, + "step": 96 + }, + { + "epoch": 0.260752688172043, + "grad_norm": 0.3847297194838658, + "learning_rate": 3.937874603563015e-05, + "loss": 1.0977, + "step": 97 + }, + { + "epoch": 0.26344086021505375, + "grad_norm": 0.4959083398122344, + "learning_rate": 3.935648110328285e-05, + "loss": 1.041, + "step": 98 + }, + { + "epoch": 0.2661290322580645, + "grad_norm": 0.46262720954521647, + "learning_rate": 3.933383071022795e-05, + "loss": 1.0926, + "step": 99 + }, + { + "epoch": 0.26881720430107525, + "grad_norm": 0.4789561041937064, + "learning_rate": 3.93107953075204e-05, + "loss": 1.0701, + "step": 100 + }, + { + "epoch": 0.271505376344086, + "grad_norm": 0.4229869803365367, + "learning_rate": 3.928737535388214e-05, + "loss": 1.063, + "step": 101 + }, + { + "epoch": 0.27419354838709675, + "grad_norm": 0.43404703473814416, + "learning_rate": 3.9263571315692976e-05, + "loss": 1.0696, + "step": 102 + }, + { + "epoch": 0.2768817204301075, + "grad_norm": 0.4396716028324381, + "learning_rate": 3.923938366698129e-05, + "loss": 1.0317, + "step": 103 + }, + { + "epoch": 0.27956989247311825, + "grad_norm": 0.6860340156482403, + "learning_rate": 3.921481288941459e-05, + "loss": 1.0611, + "step": 104 + }, + { + "epoch": 0.28225806451612906, + "grad_norm": 0.39601683185098385, + "learning_rate": 3.9189859472289956e-05, + "loss": 1.0294, + "step": 105 + }, + { + "epoch": 0.2849462365591398, + "grad_norm": 0.39641986440862376, + "learning_rate": 3.9164523912524224e-05, + "loss": 1.0663, + "step": 106 + }, + { + "epoch": 0.28763440860215056, + "grad_norm": 0.3898209322812333, + "learning_rate": 3.913880671464418e-05, + "loss": 1.0671, + "step": 107 + }, + { + "epoch": 0.2903225806451613, + "grad_norm": 0.408678962590762, + "learning_rate": 3.911270839077644e-05, + "loss": 1.0224, + "step": 108 + }, + { + "epoch": 0.29301075268817206, + "grad_norm": 0.4681397312637908, + "learning_rate": 3.908622946063728e-05, + "loss": 1.091, + "step": 109 + }, + { + "epoch": 0.2956989247311828, + "grad_norm": 0.47955178042664964, + "learning_rate": 3.9059370451522295e-05, + "loss": 1.0961, + "step": 110 + }, + { + "epoch": 0.29838709677419356, + "grad_norm": 0.4229760577312693, + "learning_rate": 3.903213189829589e-05, + "loss": 1.0386, + "step": 111 + }, + { + "epoch": 0.3010752688172043, + "grad_norm": 0.39011319960684926, + "learning_rate": 3.900451434338062e-05, + "loss": 1.067, + "step": 112 + }, + { + "epoch": 0.30376344086021506, + "grad_norm": 0.39672904488910227, + "learning_rate": 3.8976518336746396e-05, + "loss": 1.0424, + "step": 113 + }, + { + "epoch": 0.3064516129032258, + "grad_norm": 0.49393594827425025, + "learning_rate": 3.894814443589954e-05, + "loss": 1.0695, + "step": 114 + }, + { + "epoch": 0.30913978494623656, + "grad_norm": 0.38254416729289076, + "learning_rate": 3.8919393205871676e-05, + "loss": 1.0801, + "step": 115 + }, + { + "epoch": 0.3118279569892473, + "grad_norm": 0.4456422459103533, + "learning_rate": 3.889026521920847e-05, + "loss": 1.0934, + "step": 116 + }, + { + "epoch": 0.31451612903225806, + "grad_norm": 0.39398196216047476, + "learning_rate": 3.886076105595825e-05, + "loss": 1.1011, + "step": 117 + }, + { + "epoch": 0.3172043010752688, + "grad_norm": 0.3949327527665007, + "learning_rate": 3.883088130366042e-05, + "loss": 1.018, + "step": 118 + }, + { + "epoch": 0.31989247311827956, + "grad_norm": 0.39254792724729387, + "learning_rate": 3.88006265573338e-05, + "loss": 1.0607, + "step": 119 + }, + { + "epoch": 0.3225806451612903, + "grad_norm": 0.5007199853312655, + "learning_rate": 3.876999741946478e-05, + "loss": 1.0609, + "step": 120 + }, + { + "epoch": 0.32526881720430106, + "grad_norm": 0.4619751408736227, + "learning_rate": 3.873899449999524e-05, + "loss": 1.0955, + "step": 121 + }, + { + "epoch": 0.3279569892473118, + "grad_norm": 0.48219172224114765, + "learning_rate": 3.870761841631051e-05, + "loss": 1.063, + "step": 122 + }, + { + "epoch": 0.33064516129032256, + "grad_norm": 0.4054037874416271, + "learning_rate": 3.867586979322703e-05, + "loss": 1.0907, + "step": 123 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.43161457507331874, + "learning_rate": 3.8643749262979896e-05, + "loss": 1.0666, + "step": 124 + }, + { + "epoch": 0.33602150537634407, + "grad_norm": 0.36751029685084174, + "learning_rate": 3.861125746521028e-05, + "loss": 1.0557, + "step": 125 + }, + { + "epoch": 0.3387096774193548, + "grad_norm": 0.46690938120869707, + "learning_rate": 3.8578395046952686e-05, + "loss": 1.1023, + "step": 126 + }, + { + "epoch": 0.34139784946236557, + "grad_norm": 0.3988094995343537, + "learning_rate": 3.85451626626221e-05, + "loss": 1.0717, + "step": 127 + }, + { + "epoch": 0.34408602150537637, + "grad_norm": 0.48432619617982536, + "learning_rate": 3.85115609740009e-05, + "loss": 1.0271, + "step": 128 + }, + { + "epoch": 0.3467741935483871, + "grad_norm": 0.5127948499632843, + "learning_rate": 3.8477590650225735e-05, + "loss": 1.0575, + "step": 129 + }, + { + "epoch": 0.34946236559139787, + "grad_norm": 0.4132091412639387, + "learning_rate": 3.8443252367774164e-05, + "loss": 1.0355, + "step": 130 + }, + { + "epoch": 0.3521505376344086, + "grad_norm": 0.4439631972175399, + "learning_rate": 3.8408546810451176e-05, + "loss": 1.0541, + "step": 131 + }, + { + "epoch": 0.3548387096774194, + "grad_norm": 0.3956247259769062, + "learning_rate": 3.837347466937562e-05, + "loss": 1.0672, + "step": 132 + }, + { + "epoch": 0.3575268817204301, + "grad_norm": 0.44952249373265674, + "learning_rate": 3.8338036642966396e-05, + "loss": 1.0444, + "step": 133 + }, + { + "epoch": 0.3602150537634409, + "grad_norm": 0.4449484078947791, + "learning_rate": 3.830223343692857e-05, + "loss": 1.0514, + "step": 134 + }, + { + "epoch": 0.3629032258064516, + "grad_norm": 0.3905509358873801, + "learning_rate": 3.826606576423931e-05, + "loss": 1.0394, + "step": 135 + }, + { + "epoch": 0.3655913978494624, + "grad_norm": 0.4183744146790331, + "learning_rate": 3.8229534345133695e-05, + "loss": 1.0212, + "step": 136 + }, + { + "epoch": 0.3682795698924731, + "grad_norm": 0.46086732418604737, + "learning_rate": 3.819263990709037e-05, + "loss": 0.994, + "step": 137 + }, + { + "epoch": 0.3709677419354839, + "grad_norm": 0.4468564375555911, + "learning_rate": 3.8155383184817064e-05, + "loss": 1.0279, + "step": 138 + }, + { + "epoch": 0.3736559139784946, + "grad_norm": 0.3966511312736679, + "learning_rate": 3.8117764920235945e-05, + "loss": 0.9992, + "step": 139 + }, + { + "epoch": 0.3763440860215054, + "grad_norm": 0.46461846433833476, + "learning_rate": 3.807978586246887e-05, + "loss": 1.088, + "step": 140 + }, + { + "epoch": 0.3790322580645161, + "grad_norm": 0.4254641795470929, + "learning_rate": 3.804144676782243e-05, + "loss": 1.0764, + "step": 141 + }, + { + "epoch": 0.3817204301075269, + "grad_norm": 0.42137203485219293, + "learning_rate": 3.800274839977293e-05, + "loss": 1.0422, + "step": 142 + }, + { + "epoch": 0.3844086021505376, + "grad_norm": 0.4172681789743796, + "learning_rate": 3.796369152895117e-05, + "loss": 1.0453, + "step": 143 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 0.4531431509751161, + "learning_rate": 3.792427693312707e-05, + "loss": 1.0389, + "step": 144 + }, + { + "epoch": 0.3897849462365591, + "grad_norm": 0.3782466419505299, + "learning_rate": 3.788450539719423e-05, + "loss": 1.025, + "step": 145 + }, + { + "epoch": 0.3924731182795699, + "grad_norm": 0.4655605897605627, + "learning_rate": 3.7844377713154264e-05, + "loss": 1.064, + "step": 146 + }, + { + "epoch": 0.3951612903225806, + "grad_norm": 0.4384836890227208, + "learning_rate": 3.780389468010106e-05, + "loss": 1.0397, + "step": 147 + }, + { + "epoch": 0.3978494623655914, + "grad_norm": 0.4844715439450037, + "learning_rate": 3.776305710420482e-05, + "loss": 1.1193, + "step": 148 + }, + { + "epoch": 0.40053763440860213, + "grad_norm": 0.41760675460607827, + "learning_rate": 3.7721865798696056e-05, + "loss": 1.0124, + "step": 149 + }, + { + "epoch": 0.4032258064516129, + "grad_norm": 0.7337537478769387, + "learning_rate": 3.7680321583849365e-05, + "loss": 1.0508, + "step": 150 + }, + { + "epoch": 0.40591397849462363, + "grad_norm": 0.44725816367920673, + "learning_rate": 3.76384252869671e-05, + "loss": 1.0434, + "step": 151 + }, + { + "epoch": 0.40860215053763443, + "grad_norm": 0.40870612635720194, + "learning_rate": 3.759617774236292e-05, + "loss": 1.068, + "step": 152 + }, + { + "epoch": 0.4112903225806452, + "grad_norm": 0.4534649483932217, + "learning_rate": 3.755357979134511e-05, + "loss": 1.0614, + "step": 153 + }, + { + "epoch": 0.41397849462365593, + "grad_norm": 0.41986572053185917, + "learning_rate": 3.751063228219993e-05, + "loss": 1.0391, + "step": 154 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.3717380879536067, + "learning_rate": 3.7467336070174604e-05, + "loss": 1.0378, + "step": 155 + }, + { + "epoch": 0.41935483870967744, + "grad_norm": 0.41848537015206944, + "learning_rate": 3.742369201746038e-05, + "loss": 1.0439, + "step": 156 + }, + { + "epoch": 0.4220430107526882, + "grad_norm": 0.43443932018052933, + "learning_rate": 3.737970099317535e-05, + "loss": 1.0197, + "step": 157 + }, + { + "epoch": 0.42473118279569894, + "grad_norm": 0.421554546653683, + "learning_rate": 3.7335363873347056e-05, + "loss": 1.0487, + "step": 158 + }, + { + "epoch": 0.4274193548387097, + "grad_norm": 0.8430023271255561, + "learning_rate": 3.729068154089519e-05, + "loss": 1.0333, + "step": 159 + }, + { + "epoch": 0.43010752688172044, + "grad_norm": 0.4363044724173691, + "learning_rate": 3.724565488561387e-05, + "loss": 1.0213, + "step": 160 + }, + { + "epoch": 0.4327956989247312, + "grad_norm": 0.5335682969510431, + "learning_rate": 3.720028480415401e-05, + "loss": 1.0205, + "step": 161 + }, + { + "epoch": 0.43548387096774194, + "grad_norm": 0.4056834135687678, + "learning_rate": 3.7154572200005446e-05, + "loss": 1.0311, + "step": 162 + }, + { + "epoch": 0.4381720430107527, + "grad_norm": 0.5322107401886871, + "learning_rate": 3.710851798347891e-05, + "loss": 1.0601, + "step": 163 + }, + { + "epoch": 0.44086021505376344, + "grad_norm": 0.4138677278304246, + "learning_rate": 3.7062123071687944e-05, + "loss": 1.0361, + "step": 164 + }, + { + "epoch": 0.4435483870967742, + "grad_norm": 0.4775100325512625, + "learning_rate": 3.701538838853062e-05, + "loss": 1.0194, + "step": 165 + }, + { + "epoch": 0.44623655913978494, + "grad_norm": 0.40839482534046995, + "learning_rate": 3.696831486467114e-05, + "loss": 1.0463, + "step": 166 + }, + { + "epoch": 0.4489247311827957, + "grad_norm": 0.3963093446633738, + "learning_rate": 3.6920903437521305e-05, + "loss": 1.0238, + "step": 167 + }, + { + "epoch": 0.45161290322580644, + "grad_norm": 0.4344752184390704, + "learning_rate": 3.6873155051221846e-05, + "loss": 1.0472, + "step": 168 + }, + { + "epoch": 0.4543010752688172, + "grad_norm": 0.4167014186949368, + "learning_rate": 3.6825070656623626e-05, + "loss": 1.0599, + "step": 169 + }, + { + "epoch": 0.45698924731182794, + "grad_norm": 0.43904590007956124, + "learning_rate": 3.677665121126871e-05, + "loss": 1.0559, + "step": 170 + }, + { + "epoch": 0.4596774193548387, + "grad_norm": 0.372185063148541, + "learning_rate": 3.6727897679371276e-05, + "loss": 1.0012, + "step": 171 + }, + { + "epoch": 0.46236559139784944, + "grad_norm": 0.43086731351488916, + "learning_rate": 3.667881103179844e-05, + "loss": 1.0133, + "step": 172 + }, + { + "epoch": 0.4650537634408602, + "grad_norm": 0.5796354347464544, + "learning_rate": 3.662939224605091e-05, + "loss": 1.0517, + "step": 173 + }, + { + "epoch": 0.46774193548387094, + "grad_norm": 0.4587453684541154, + "learning_rate": 3.657964230624351e-05, + "loss": 1.0164, + "step": 174 + }, + { + "epoch": 0.47043010752688175, + "grad_norm": 0.5102852182866393, + "learning_rate": 3.6529562203085595e-05, + "loss": 1.052, + "step": 175 + }, + { + "epoch": 0.4731182795698925, + "grad_norm": 0.4469591346380821, + "learning_rate": 3.6479152933861336e-05, + "loss": 1.0905, + "step": 176 + }, + { + "epoch": 0.47580645161290325, + "grad_norm": 0.45277428352010624, + "learning_rate": 3.642841550240983e-05, + "loss": 1.0961, + "step": 177 + }, + { + "epoch": 0.478494623655914, + "grad_norm": 0.45588595960031525, + "learning_rate": 3.6377350919105136e-05, + "loss": 1.0178, + "step": 178 + }, + { + "epoch": 0.48118279569892475, + "grad_norm": 0.6147997034643559, + "learning_rate": 3.632596020083612e-05, + "loss": 1.0148, + "step": 179 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 0.3734326271789308, + "learning_rate": 3.627424437098625e-05, + "loss": 1.0006, + "step": 180 + }, + { + "epoch": 0.48655913978494625, + "grad_norm": 0.4564187594173089, + "learning_rate": 3.6222204459413186e-05, + "loss": 1.0635, + "step": 181 + }, + { + "epoch": 0.489247311827957, + "grad_norm": 0.42811733614493086, + "learning_rate": 3.6169841502428285e-05, + "loss": 1.0469, + "step": 182 + }, + { + "epoch": 0.49193548387096775, + "grad_norm": 0.4227875509642681, + "learning_rate": 3.611715654277596e-05, + "loss": 1.0446, + "step": 183 + }, + { + "epoch": 0.4946236559139785, + "grad_norm": 0.40548546169007965, + "learning_rate": 3.60641506296129e-05, + "loss": 1.0564, + "step": 184 + }, + { + "epoch": 0.49731182795698925, + "grad_norm": 0.4161116484325749, + "learning_rate": 3.601082481848721e-05, + "loss": 0.9917, + "step": 185 + }, + { + "epoch": 0.5, + "grad_norm": 0.39180067540636987, + "learning_rate": 3.595718017131736e-05, + "loss": 1.0081, + "step": 186 + }, + { + "epoch": 0.5026881720430108, + "grad_norm": 0.5307122561583237, + "learning_rate": 3.5903217756371066e-05, + "loss": 0.9972, + "step": 187 + }, + { + "epoch": 0.5053763440860215, + "grad_norm": 0.4633315164676552, + "learning_rate": 3.5848938648243976e-05, + "loss": 1.0196, + "step": 188 + }, + { + "epoch": 0.5080645161290323, + "grad_norm": 0.43457272116367207, + "learning_rate": 3.579434392783832e-05, + "loss": 1.0429, + "step": 189 + }, + { + "epoch": 0.510752688172043, + "grad_norm": 0.42602042879132207, + "learning_rate": 3.5739434682341355e-05, + "loss": 1.0355, + "step": 190 + }, + { + "epoch": 0.5134408602150538, + "grad_norm": 0.37328410492227004, + "learning_rate": 3.568421200520371e-05, + "loss": 1.0158, + "step": 191 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 0.47901349260363574, + "learning_rate": 3.562867699611764e-05, + "loss": 1.006, + "step": 192 + }, + { + "epoch": 0.5188172043010753, + "grad_norm": 0.6800894155552869, + "learning_rate": 3.55728307609951e-05, + "loss": 1.0819, + "step": 193 + }, + { + "epoch": 0.521505376344086, + "grad_norm": 0.6815573295093794, + "learning_rate": 3.5516674411945747e-05, + "loss": 0.9767, + "step": 194 + }, + { + "epoch": 0.5241935483870968, + "grad_norm": 0.40923877696875666, + "learning_rate": 3.546020906725474e-05, + "loss": 1.0048, + "step": 195 + }, + { + "epoch": 0.5268817204301075, + "grad_norm": 0.39166638466881304, + "learning_rate": 3.540343585136056e-05, + "loss": 1.0115, + "step": 196 + }, + { + "epoch": 0.5295698924731183, + "grad_norm": 0.46039879078749524, + "learning_rate": 3.5346355894832515e-05, + "loss": 1.0274, + "step": 197 + }, + { + "epoch": 0.532258064516129, + "grad_norm": 0.435003701062386, + "learning_rate": 3.5288970334348324e-05, + "loss": 1.0262, + "step": 198 + }, + { + "epoch": 0.5349462365591398, + "grad_norm": 0.46422099557675184, + "learning_rate": 3.5231280312671426e-05, + "loss": 1.0406, + "step": 199 + }, + { + "epoch": 0.5376344086021505, + "grad_norm": 0.3946242892533647, + "learning_rate": 3.51732869786282e-05, + "loss": 1.0351, + "step": 200 + }, + { + "epoch": 0.5403225806451613, + "grad_norm": 0.4593963303455073, + "learning_rate": 3.511499148708517e-05, + "loss": 1.0161, + "step": 201 + }, + { + "epoch": 0.543010752688172, + "grad_norm": 0.43211273427185715, + "learning_rate": 3.505639499892591e-05, + "loss": 1.0339, + "step": 202 + }, + { + "epoch": 0.5456989247311828, + "grad_norm": 0.4638011311631454, + "learning_rate": 3.499749868102802e-05, + "loss": 1.0195, + "step": 203 + }, + { + "epoch": 0.5483870967741935, + "grad_norm": 0.4606785516075864, + "learning_rate": 3.4938303706239814e-05, + "loss": 1.0809, + "step": 204 + }, + { + "epoch": 0.5510752688172043, + "grad_norm": 0.4750835163830621, + "learning_rate": 3.487881125335699e-05, + "loss": 1.0104, + "step": 205 + }, + { + "epoch": 0.553763440860215, + "grad_norm": 0.48069623342657913, + "learning_rate": 3.4819022507099184e-05, + "loss": 1.0534, + "step": 206 + }, + { + "epoch": 0.5564516129032258, + "grad_norm": 0.4485052357605267, + "learning_rate": 3.475893865808633e-05, + "loss": 1.008, + "step": 207 + }, + { + "epoch": 0.5591397849462365, + "grad_norm": 0.45226568470539963, + "learning_rate": 3.4698560902815e-05, + "loss": 0.9859, + "step": 208 + }, + { + "epoch": 0.5618279569892473, + "grad_norm": 0.4556713744237398, + "learning_rate": 3.463789044363451e-05, + "loss": 1.0468, + "step": 209 + }, + { + "epoch": 0.5645161290322581, + "grad_norm": 0.40515419542450315, + "learning_rate": 3.4576928488723056e-05, + "loss": 1.0069, + "step": 210 + }, + { + "epoch": 0.5672043010752689, + "grad_norm": 0.407850239298829, + "learning_rate": 3.4515676252063595e-05, + "loss": 1.024, + "step": 211 + }, + { + "epoch": 0.5698924731182796, + "grad_norm": 0.4245125668059516, + "learning_rate": 3.445413495341971e-05, + "loss": 0.9842, + "step": 212 + }, + { + "epoch": 0.5725806451612904, + "grad_norm": 0.5282266357639802, + "learning_rate": 3.439230581831126e-05, + "loss": 1.0511, + "step": 213 + }, + { + "epoch": 0.5752688172043011, + "grad_norm": 0.46721556238008377, + "learning_rate": 3.433019007799007e-05, + "loss": 1.0722, + "step": 214 + }, + { + "epoch": 0.5779569892473119, + "grad_norm": 0.3998174935596331, + "learning_rate": 3.4267788969415315e-05, + "loss": 1.0417, + "step": 215 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 0.39836497217157424, + "learning_rate": 3.420510373522896e-05, + "loss": 0.9522, + "step": 216 + }, + { + "epoch": 0.5833333333333334, + "grad_norm": 0.5604060165845736, + "learning_rate": 3.4142135623730954e-05, + "loss": 1.0406, + "step": 217 + }, + { + "epoch": 0.5860215053763441, + "grad_norm": 0.4626752931850209, + "learning_rate": 3.4078885888854436e-05, + "loss": 1.0403, + "step": 218 + }, + { + "epoch": 0.5887096774193549, + "grad_norm": 0.4119865874583256, + "learning_rate": 3.4015355790140715e-05, + "loss": 0.974, + "step": 219 + }, + { + "epoch": 0.5913978494623656, + "grad_norm": 0.41688760669607, + "learning_rate": 3.39515465927142e-05, + "loss": 1.0354, + "step": 220 + }, + { + "epoch": 0.5940860215053764, + "grad_norm": 0.47263736408876167, + "learning_rate": 3.388745956725722e-05, + "loss": 1.0438, + "step": 221 + }, + { + "epoch": 0.5967741935483871, + "grad_norm": 0.48712838990373963, + "learning_rate": 3.3823095989984697e-05, + "loss": 0.9847, + "step": 222 + }, + { + "epoch": 0.5994623655913979, + "grad_norm": 0.39317905049275836, + "learning_rate": 3.3758457142618754e-05, + "loss": 0.9806, + "step": 223 + }, + { + "epoch": 0.6021505376344086, + "grad_norm": 0.484001386994586, + "learning_rate": 3.369354431236319e-05, + "loss": 1.0003, + "step": 224 + }, + { + "epoch": 0.6048387096774194, + "grad_norm": 0.3896751020684252, + "learning_rate": 3.362835879187783e-05, + "loss": 0.9314, + "step": 225 + }, + { + "epoch": 0.6075268817204301, + "grad_norm": 0.402131340210077, + "learning_rate": 3.356290187925278e-05, + "loss": 0.957, + "step": 226 + }, + { + "epoch": 0.6102150537634409, + "grad_norm": 0.4442069284277535, + "learning_rate": 3.349717487798261e-05, + "loss": 1.0651, + "step": 227 + }, + { + "epoch": 0.6129032258064516, + "grad_norm": 0.4075067959077034, + "learning_rate": 3.3431179096940375e-05, + "loss": 1.0117, + "step": 228 + }, + { + "epoch": 0.6155913978494624, + "grad_norm": 0.4595977891340027, + "learning_rate": 3.3364915850351525e-05, + "loss": 1.0277, + "step": 229 + }, + { + "epoch": 0.6182795698924731, + "grad_norm": 0.41565240224286376, + "learning_rate": 3.3298386457767804e-05, + "loss": 0.9873, + "step": 230 + }, + { + "epoch": 0.6209677419354839, + "grad_norm": 0.400290934516727, + "learning_rate": 3.3231592244040885e-05, + "loss": 1.0503, + "step": 231 + }, + { + "epoch": 0.6236559139784946, + "grad_norm": 0.43593503744528256, + "learning_rate": 3.3164534539296056e-05, + "loss": 1.0256, + "step": 232 + }, + { + "epoch": 0.6263440860215054, + "grad_norm": 0.4297576409774745, + "learning_rate": 3.309721467890571e-05, + "loss": 0.9873, + "step": 233 + }, + { + "epoch": 0.6290322580645161, + "grad_norm": 0.5286155107560961, + "learning_rate": 3.302963400346272e-05, + "loss": 1.0526, + "step": 234 + }, + { + "epoch": 0.6317204301075269, + "grad_norm": 0.4080215430723157, + "learning_rate": 3.296179385875381e-05, + "loss": 0.993, + "step": 235 + }, + { + "epoch": 0.6344086021505376, + "grad_norm": 0.4666697414536282, + "learning_rate": 3.2893695595732705e-05, + "loss": 0.9855, + "step": 236 + }, + { + "epoch": 0.6370967741935484, + "grad_norm": 0.44576593027115785, + "learning_rate": 3.282534057049322e-05, + "loss": 0.994, + "step": 237 + }, + { + "epoch": 0.6397849462365591, + "grad_norm": 0.45875921319019286, + "learning_rate": 3.275673014424231e-05, + "loss": 1.0695, + "step": 238 + }, + { + "epoch": 0.6424731182795699, + "grad_norm": 0.4483391985101821, + "learning_rate": 3.268786568327291e-05, + "loss": 1.0413, + "step": 239 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 0.3823024947210084, + "learning_rate": 3.261874855893675e-05, + "loss": 1.0634, + "step": 240 + }, + { + "epoch": 0.6478494623655914, + "grad_norm": 0.42590418591004187, + "learning_rate": 3.254938014761704e-05, + "loss": 1.1039, + "step": 241 + }, + { + "epoch": 0.6505376344086021, + "grad_norm": 0.4436207874701427, + "learning_rate": 3.2479761830701075e-05, + "loss": 1.0797, + "step": 242 + }, + { + "epoch": 0.6532258064516129, + "grad_norm": 0.5436242022516592, + "learning_rate": 3.240989499455269e-05, + "loss": 0.998, + "step": 243 + }, + { + "epoch": 0.6559139784946236, + "grad_norm": 0.42461660808494955, + "learning_rate": 3.2339781030484715e-05, + "loss": 1.0014, + "step": 244 + }, + { + "epoch": 0.6586021505376344, + "grad_norm": 0.4147658974390641, + "learning_rate": 3.2269421334731196e-05, + "loss": 1.0047, + "step": 245 + }, + { + "epoch": 0.6612903225806451, + "grad_norm": 0.3702000902999608, + "learning_rate": 3.219881730841964e-05, + "loss": 1.0057, + "step": 246 + }, + { + "epoch": 0.6639784946236559, + "grad_norm": 0.37405944820555137, + "learning_rate": 3.212797035754311e-05, + "loss": 0.9881, + "step": 247 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.39789221907192235, + "learning_rate": 3.205688189293219e-05, + "loss": 1.002, + "step": 248 + }, + { + "epoch": 0.6693548387096774, + "grad_norm": 0.35269099760384387, + "learning_rate": 3.198555333022694e-05, + "loss": 1.0445, + "step": 249 + }, + { + "epoch": 0.6720430107526881, + "grad_norm": 0.39171670743365294, + "learning_rate": 3.191398608984867e-05, + "loss": 0.9873, + "step": 250 + }, + { + "epoch": 0.6747311827956989, + "grad_norm": 0.36377972714827284, + "learning_rate": 3.184218159697166e-05, + "loss": 0.9678, + "step": 251 + }, + { + "epoch": 0.6774193548387096, + "grad_norm": 0.4760701686418637, + "learning_rate": 3.177014128149479e-05, + "loss": 1.0475, + "step": 252 + }, + { + "epoch": 0.6801075268817204, + "grad_norm": 0.36306748600915323, + "learning_rate": 3.169786657801306e-05, + "loss": 0.9737, + "step": 253 + }, + { + "epoch": 0.6827956989247311, + "grad_norm": 0.36397370143939106, + "learning_rate": 3.162535892578903e-05, + "loss": 1.0009, + "step": 254 + }, + { + "epoch": 0.6854838709677419, + "grad_norm": 0.41923544253489314, + "learning_rate": 3.155261976872412e-05, + "loss": 0.9855, + "step": 255 + }, + { + "epoch": 0.6881720430107527, + "grad_norm": 0.4349008134787599, + "learning_rate": 3.147965055532991e-05, + "loss": 0.9843, + "step": 256 + }, + { + "epoch": 0.6908602150537635, + "grad_norm": 0.4403161475473632, + "learning_rate": 3.1406452738699284e-05, + "loss": 0.9932, + "step": 257 + }, + { + "epoch": 0.6935483870967742, + "grad_norm": 0.4088632034626185, + "learning_rate": 3.1333027776477454e-05, + "loss": 1.0175, + "step": 258 + }, + { + "epoch": 0.696236559139785, + "grad_norm": 0.4089626667866183, + "learning_rate": 3.125937713083296e-05, + "loss": 0.9957, + "step": 259 + }, + { + "epoch": 0.6989247311827957, + "grad_norm": 0.44005061948101687, + "learning_rate": 3.118550226842857e-05, + "loss": 0.9902, + "step": 260 + }, + { + "epoch": 0.7016129032258065, + "grad_norm": 1.1016022022748841, + "learning_rate": 3.111140466039205e-05, + "loss": 0.991, + "step": 261 + }, + { + "epoch": 0.7043010752688172, + "grad_norm": 0.39448956783294353, + "learning_rate": 3.103708578228686e-05, + "loss": 1.0041, + "step": 262 + }, + { + "epoch": 0.706989247311828, + "grad_norm": 0.41388488702273174, + "learning_rate": 3.0962547114082804e-05, + "loss": 0.9928, + "step": 263 + }, + { + "epoch": 0.7096774193548387, + "grad_norm": 0.4065224464102798, + "learning_rate": 3.088779014012652e-05, + "loss": 0.9859, + "step": 264 + }, + { + "epoch": 0.7123655913978495, + "grad_norm": 0.39952347811781436, + "learning_rate": 3.0812816349111956e-05, + "loss": 0.9613, + "step": 265 + }, + { + "epoch": 0.7150537634408602, + "grad_norm": 0.43554876713734897, + "learning_rate": 3.073762723405069e-05, + "loss": 1.0289, + "step": 266 + }, + { + "epoch": 0.717741935483871, + "grad_norm": 0.469813057633801, + "learning_rate": 3.066222429224221e-05, + "loss": 1.0438, + "step": 267 + }, + { + "epoch": 0.7204301075268817, + "grad_norm": 0.4353123605440106, + "learning_rate": 3.0586609025244144e-05, + "loss": 1.0017, + "step": 268 + }, + { + "epoch": 0.7231182795698925, + "grad_norm": 0.40010712539262144, + "learning_rate": 3.051078293884226e-05, + "loss": 1.0254, + "step": 269 + }, + { + "epoch": 0.7258064516129032, + "grad_norm": 0.41179768187019394, + "learning_rate": 3.0434747543020585e-05, + "loss": 1.0167, + "step": 270 + }, + { + "epoch": 0.728494623655914, + "grad_norm": 0.39261397155250993, + "learning_rate": 3.0358504351931265e-05, + "loss": 0.9987, + "step": 271 + }, + { + "epoch": 0.7311827956989247, + "grad_norm": 0.4037853365263608, + "learning_rate": 3.0282054883864434e-05, + "loss": 1.0016, + "step": 272 + }, + { + "epoch": 0.7338709677419355, + "grad_norm": 0.3920371074761728, + "learning_rate": 3.0205400661218e-05, + "loss": 0.9427, + "step": 273 + }, + { + "epoch": 0.7365591397849462, + "grad_norm": 0.4525036893342772, + "learning_rate": 3.0128543210467273e-05, + "loss": 1.0566, + "step": 274 + }, + { + "epoch": 0.739247311827957, + "grad_norm": 0.41264407607647574, + "learning_rate": 3.0051484062134632e-05, + "loss": 0.9899, + "step": 275 + }, + { + "epoch": 0.7419354838709677, + "grad_norm": 0.37437706613357397, + "learning_rate": 2.9974224750759017e-05, + "loss": 0.9817, + "step": 276 + }, + { + "epoch": 0.7446236559139785, + "grad_norm": 0.3844600838817203, + "learning_rate": 2.9896766814865355e-05, + "loss": 1.0263, + "step": 277 + }, + { + "epoch": 0.7473118279569892, + "grad_norm": 0.4310511049000039, + "learning_rate": 2.9819111796933948e-05, + "loss": 0.9781, + "step": 278 + }, + { + "epoch": 0.75, + "grad_norm": 0.40281595760365946, + "learning_rate": 2.9741261243369746e-05, + "loss": 1.0273, + "step": 279 + }, + { + "epoch": 0.7526881720430108, + "grad_norm": 0.4498302856339957, + "learning_rate": 2.9663216704471547e-05, + "loss": 0.9886, + "step": 280 + }, + { + "epoch": 0.7553763440860215, + "grad_norm": 0.4350406167421517, + "learning_rate": 2.958497973440114e-05, + "loss": 1.0247, + "step": 281 + }, + { + "epoch": 0.7580645161290323, + "grad_norm": 0.46748351737565624, + "learning_rate": 2.9506551891152334e-05, + "loss": 1.0072, + "step": 282 + }, + { + "epoch": 0.760752688172043, + "grad_norm": 0.3998308958015181, + "learning_rate": 2.9427934736519962e-05, + "loss": 1.076, + "step": 283 + }, + { + "epoch": 0.7634408602150538, + "grad_norm": 0.42326867383664013, + "learning_rate": 2.9349129836068732e-05, + "loss": 0.9895, + "step": 284 + }, + { + "epoch": 0.7661290322580645, + "grad_norm": 0.3949205497118407, + "learning_rate": 2.9270138759102108e-05, + "loss": 1.027, + "step": 285 + }, + { + "epoch": 0.7688172043010753, + "grad_norm": 0.40826149975955933, + "learning_rate": 2.919096307863104e-05, + "loss": 1.0128, + "step": 286 + }, + { + "epoch": 0.771505376344086, + "grad_norm": 0.6045575439891937, + "learning_rate": 2.9111604371342593e-05, + "loss": 0.9806, + "step": 287 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 0.3906743864943639, + "learning_rate": 2.903206421756862e-05, + "loss": 1.0126, + "step": 288 + }, + { + "epoch": 0.7768817204301075, + "grad_norm": 0.37994713789537804, + "learning_rate": 2.8952344201254253e-05, + "loss": 0.9984, + "step": 289 + }, + { + "epoch": 0.7795698924731183, + "grad_norm": 0.4560671009564336, + "learning_rate": 2.8872445909926358e-05, + "loss": 0.9846, + "step": 290 + }, + { + "epoch": 0.782258064516129, + "grad_norm": 0.40231158085064994, + "learning_rate": 2.8792370934661948e-05, + "loss": 1.0403, + "step": 291 + }, + { + "epoch": 0.7849462365591398, + "grad_norm": 0.4776678536973747, + "learning_rate": 2.8712120870056455e-05, + "loss": 1.0327, + "step": 292 + }, + { + "epoch": 0.7876344086021505, + "grad_norm": 0.45302618010000684, + "learning_rate": 2.8631697314192012e-05, + "loss": 1.0126, + "step": 293 + }, + { + "epoch": 0.7903225806451613, + "grad_norm": 0.4332121059542856, + "learning_rate": 2.8551101868605644e-05, + "loss": 1.0475, + "step": 294 + }, + { + "epoch": 0.793010752688172, + "grad_norm": 0.4498441085262953, + "learning_rate": 2.8470336138257315e-05, + "loss": 1.0178, + "step": 295 + }, + { + "epoch": 0.7956989247311828, + "grad_norm": 0.39208633969875073, + "learning_rate": 2.8389401731498018e-05, + "loss": 1.0127, + "step": 296 + }, + { + "epoch": 0.7983870967741935, + "grad_norm": 0.4042053763726035, + "learning_rate": 2.8308300260037734e-05, + "loss": 0.9732, + "step": 297 + }, + { + "epoch": 0.8010752688172043, + "grad_norm": 0.42842239164240437, + "learning_rate": 2.8227033338913318e-05, + "loss": 1.0152, + "step": 298 + }, + { + "epoch": 0.803763440860215, + "grad_norm": 0.3807866452863404, + "learning_rate": 2.814560258645638e-05, + "loss": 1.0189, + "step": 299 + }, + { + "epoch": 0.8064516129032258, + "grad_norm": 0.43852909963759557, + "learning_rate": 2.8064009624260994e-05, + "loss": 1.0084, + "step": 300 + }, + { + "epoch": 0.8091397849462365, + "grad_norm": 0.5122035327018767, + "learning_rate": 2.7982256077151482e-05, + "loss": 1.0098, + "step": 301 + }, + { + "epoch": 0.8118279569892473, + "grad_norm": 0.38079784946729706, + "learning_rate": 2.7900343573150003e-05, + "loss": 1.0097, + "step": 302 + }, + { + "epoch": 0.8145161290322581, + "grad_norm": 0.3583539130301541, + "learning_rate": 2.7818273743444132e-05, + "loss": 0.9964, + "step": 303 + }, + { + "epoch": 0.8172043010752689, + "grad_norm": 0.3813956107048218, + "learning_rate": 2.7736048222354414e-05, + "loss": 0.9761, + "step": 304 + }, + { + "epoch": 0.8198924731182796, + "grad_norm": 0.3901758217275271, + "learning_rate": 2.7653668647301797e-05, + "loss": 1.0117, + "step": 305 + }, + { + "epoch": 0.8225806451612904, + "grad_norm": 0.41237780052722667, + "learning_rate": 2.757113665877502e-05, + "loss": 0.9653, + "step": 306 + }, + { + "epoch": 0.8252688172043011, + "grad_norm": 0.457306901223017, + "learning_rate": 2.748845390029794e-05, + "loss": 1.0524, + "step": 307 + }, + { + "epoch": 0.8279569892473119, + "grad_norm": 0.3791723859065832, + "learning_rate": 2.740562201839684e-05, + "loss": 0.9861, + "step": 308 + }, + { + "epoch": 0.8306451612903226, + "grad_norm": 0.500338650948681, + "learning_rate": 2.7322642662567592e-05, + "loss": 0.9705, + "step": 309 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.4052884593861236, + "learning_rate": 2.7239517485242836e-05, + "loss": 0.9892, + "step": 310 + }, + { + "epoch": 0.8360215053763441, + "grad_norm": 0.3969000439893693, + "learning_rate": 2.715624814175907e-05, + "loss": 0.9883, + "step": 311 + }, + { + "epoch": 0.8387096774193549, + "grad_norm": 0.5254585071566374, + "learning_rate": 2.7072836290323698e-05, + "loss": 1.08, + "step": 312 + }, + { + "epoch": 0.8413978494623656, + "grad_norm": 0.5111475952965409, + "learning_rate": 2.698928359198197e-05, + "loss": 1.0526, + "step": 313 + }, + { + "epoch": 0.8440860215053764, + "grad_norm": 0.4717493748353866, + "learning_rate": 2.6905591710583957e-05, + "loss": 1.0137, + "step": 314 + }, + { + "epoch": 0.8467741935483871, + "grad_norm": 0.3838063749897804, + "learning_rate": 2.6821762312751368e-05, + "loss": 0.9901, + "step": 315 + }, + { + "epoch": 0.8494623655913979, + "grad_norm": 0.3456617314343378, + "learning_rate": 2.6737797067844403e-05, + "loss": 1.0034, + "step": 316 + }, + { + "epoch": 0.8521505376344086, + "grad_norm": 0.37971130684639953, + "learning_rate": 2.6653697647928485e-05, + "loss": 0.9552, + "step": 317 + }, + { + "epoch": 0.8548387096774194, + "grad_norm": 0.3820801267530888, + "learning_rate": 2.656946572774095e-05, + "loss": 0.9236, + "step": 318 + }, + { + "epoch": 0.8575268817204301, + "grad_norm": 0.4114917943590629, + "learning_rate": 2.648510298465775e-05, + "loss": 1.0, + "step": 319 + }, + { + "epoch": 0.8602150537634409, + "grad_norm": 0.4185665498381875, + "learning_rate": 2.6400611098659988e-05, + "loss": 1.0435, + "step": 320 + }, + { + "epoch": 0.8629032258064516, + "grad_norm": 0.36227121606774076, + "learning_rate": 2.6315991752300503e-05, + "loss": 0.9797, + "step": 321 + }, + { + "epoch": 0.8655913978494624, + "grad_norm": 0.40186567244596927, + "learning_rate": 2.623124663067034e-05, + "loss": 1.0071, + "step": 322 + }, + { + "epoch": 0.8682795698924731, + "grad_norm": 0.3833356371805648, + "learning_rate": 2.6146377421365225e-05, + "loss": 1.0159, + "step": 323 + }, + { + "epoch": 0.8709677419354839, + "grad_norm": 0.41469411381713683, + "learning_rate": 2.6061385814451913e-05, + "loss": 1.0277, + "step": 324 + }, + { + "epoch": 0.8736559139784946, + "grad_norm": 0.92622435409038, + "learning_rate": 2.5976273502434584e-05, + "loss": 1.0001, + "step": 325 + }, + { + "epoch": 0.8763440860215054, + "grad_norm": 0.4316506228630945, + "learning_rate": 2.5891042180221094e-05, + "loss": 1.0712, + "step": 326 + }, + { + "epoch": 0.8790322580645161, + "grad_norm": 0.42656057546508047, + "learning_rate": 2.580569354508925e-05, + "loss": 1.0074, + "step": 327 + }, + { + "epoch": 0.8817204301075269, + "grad_norm": 0.3789318712710433, + "learning_rate": 2.5720229296653006e-05, + "loss": 1.0355, + "step": 328 + }, + { + "epoch": 0.8844086021505376, + "grad_norm": 0.367154670317836, + "learning_rate": 2.5634651136828597e-05, + "loss": 1.0394, + "step": 329 + }, + { + "epoch": 0.8870967741935484, + "grad_norm": 0.4735001007157819, + "learning_rate": 2.554896076980069e-05, + "loss": 1.0552, + "step": 330 + }, + { + "epoch": 0.8897849462365591, + "grad_norm": 0.4390567460028508, + "learning_rate": 2.54631599019884e-05, + "loss": 1.0043, + "step": 331 + }, + { + "epoch": 0.8924731182795699, + "grad_norm": 0.3642787415401991, + "learning_rate": 2.5377250242011338e-05, + "loss": 0.9854, + "step": 332 + }, + { + "epoch": 0.8951612903225806, + "grad_norm": 0.4524235630593109, + "learning_rate": 2.5291233500655584e-05, + "loss": 1.0029, + "step": 333 + }, + { + "epoch": 0.8978494623655914, + "grad_norm": 0.4097887869063476, + "learning_rate": 2.52051113908396e-05, + "loss": 1.0122, + "step": 334 + }, + { + "epoch": 0.9005376344086021, + "grad_norm": 0.3852040955735104, + "learning_rate": 2.5118885627580155e-05, + "loss": 0.9779, + "step": 335 + }, + { + "epoch": 0.9032258064516129, + "grad_norm": 0.40481656602470306, + "learning_rate": 2.5032557927958116e-05, + "loss": 1.0125, + "step": 336 + }, + { + "epoch": 0.9059139784946236, + "grad_norm": 0.4118716752579493, + "learning_rate": 2.494613001108431e-05, + "loss": 1.0364, + "step": 337 + }, + { + "epoch": 0.9086021505376344, + "grad_norm": 0.4489453038959667, + "learning_rate": 2.485960359806528e-05, + "loss": 1.0436, + "step": 338 + }, + { + "epoch": 0.9112903225806451, + "grad_norm": 0.41112406404210244, + "learning_rate": 2.4772980411968975e-05, + "loss": 0.9545, + "step": 339 + }, + { + "epoch": 0.9139784946236559, + "grad_norm": 0.4856093390929945, + "learning_rate": 2.468626217779047e-05, + "loss": 0.9854, + "step": 340 + }, + { + "epoch": 0.9166666666666666, + "grad_norm": 0.37523760134058665, + "learning_rate": 2.4599450622417615e-05, + "loss": 0.9699, + "step": 341 + }, + { + "epoch": 0.9193548387096774, + "grad_norm": 0.4064413347216363, + "learning_rate": 2.4512547474596624e-05, + "loss": 1.0083, + "step": 342 + }, + { + "epoch": 0.9220430107526881, + "grad_norm": 0.44550717714004195, + "learning_rate": 2.4425554464897675e-05, + "loss": 1.0175, + "step": 343 + }, + { + "epoch": 0.9247311827956989, + "grad_norm": 0.44076297740074416, + "learning_rate": 2.433847332568042e-05, + "loss": 0.9718, + "step": 344 + }, + { + "epoch": 0.9274193548387096, + "grad_norm": 0.4971040038925624, + "learning_rate": 2.4251305791059533e-05, + "loss": 1.0317, + "step": 345 + }, + { + "epoch": 0.9301075268817204, + "grad_norm": 0.35978037050758516, + "learning_rate": 2.416405359687012e-05, + "loss": 0.9693, + "step": 346 + }, + { + "epoch": 0.9327956989247311, + "grad_norm": 0.41817202738352904, + "learning_rate": 2.4076718480633178e-05, + "loss": 0.9764, + "step": 347 + }, + { + "epoch": 0.9354838709677419, + "grad_norm": 0.4130988765844788, + "learning_rate": 2.398930218152101e-05, + "loss": 0.9548, + "step": 348 + }, + { + "epoch": 0.9381720430107527, + "grad_norm": 0.47899471351234146, + "learning_rate": 2.390180644032257e-05, + "loss": 0.9965, + "step": 349 + }, + { + "epoch": 0.9408602150537635, + "grad_norm": 0.3639159912649112, + "learning_rate": 2.38142329994088e-05, + "loss": 0.945, + "step": 350 + }, + { + "epoch": 0.9435483870967742, + "grad_norm": 0.41552533932477614, + "learning_rate": 2.372658360269796e-05, + "loss": 0.976, + "step": 351 + }, + { + "epoch": 0.946236559139785, + "grad_norm": 0.4127471276078075, + "learning_rate": 2.363885999562084e-05, + "loss": 1.0493, + "step": 352 + }, + { + "epoch": 0.9489247311827957, + "grad_norm": 0.42874463629780296, + "learning_rate": 2.3551063925086072e-05, + "loss": 1.0003, + "step": 353 + }, + { + "epoch": 0.9516129032258065, + "grad_norm": 0.4542236208271591, + "learning_rate": 2.3463197139445284e-05, + "loss": 1.0189, + "step": 354 + }, + { + "epoch": 0.9543010752688172, + "grad_norm": 0.8840248169596676, + "learning_rate": 2.3375261388458318e-05, + "loss": 1.0006, + "step": 355 + }, + { + "epoch": 0.956989247311828, + "grad_norm": 0.47762507803159143, + "learning_rate": 2.3287258423258405e-05, + "loss": 1.0101, + "step": 356 + }, + { + "epoch": 0.9596774193548387, + "grad_norm": 0.42765004964798886, + "learning_rate": 2.3199189996317205e-05, + "loss": 0.9896, + "step": 357 + }, + { + "epoch": 0.9623655913978495, + "grad_norm": 0.4236101839000849, + "learning_rate": 2.3111057861410026e-05, + "loss": 0.9931, + "step": 358 + }, + { + "epoch": 0.9650537634408602, + "grad_norm": 0.38884571703952686, + "learning_rate": 2.3022863773580813e-05, + "loss": 0.9394, + "step": 359 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 0.5378824587688318, + "learning_rate": 2.2934609489107236e-05, + "loss": 0.9842, + "step": 360 + }, + { + "epoch": 0.9704301075268817, + "grad_norm": 0.39925462372416454, + "learning_rate": 2.2846296765465708e-05, + "loss": 1.0026, + "step": 361 + }, + { + "epoch": 0.9731182795698925, + "grad_norm": 0.9592078982505338, + "learning_rate": 2.2757927361296376e-05, + "loss": 1.0332, + "step": 362 + }, + { + "epoch": 0.9758064516129032, + "grad_norm": 0.4396877320552629, + "learning_rate": 2.2669503036368124e-05, + "loss": 0.9971, + "step": 363 + }, + { + "epoch": 0.978494623655914, + "grad_norm": 0.38966539914800313, + "learning_rate": 2.2581025551543516e-05, + "loss": 0.9469, + "step": 364 + }, + { + "epoch": 0.9811827956989247, + "grad_norm": 0.4216276354211585, + "learning_rate": 2.249249666874372e-05, + "loss": 1.0322, + "step": 365 + }, + { + "epoch": 0.9838709677419355, + "grad_norm": 0.4351959975704115, + "learning_rate": 2.240391815091344e-05, + "loss": 0.962, + "step": 366 + }, + { + "epoch": 0.9865591397849462, + "grad_norm": 0.35811079366878923, + "learning_rate": 2.2315291761985803e-05, + "loss": 0.9937, + "step": 367 + }, + { + "epoch": 0.989247311827957, + "grad_norm": 0.3605918004740936, + "learning_rate": 2.222661926684722e-05, + "loss": 0.991, + "step": 368 + }, + { + "epoch": 0.9919354838709677, + "grad_norm": 0.4176512601533839, + "learning_rate": 2.2137902431302264e-05, + "loss": 1.0332, + "step": 369 + }, + { + "epoch": 0.9946236559139785, + "grad_norm": 0.42340462982190896, + "learning_rate": 2.2049143022038472e-05, + "loss": 0.9922, + "step": 370 + }, + { + "epoch": 0.9973118279569892, + "grad_norm": 0.420010163587815, + "learning_rate": 2.196034280659122e-05, + "loss": 1.0155, + "step": 371 + }, + { + "epoch": 1.0, + "grad_norm": 0.41657151819377736, + "learning_rate": 2.1871503553308447e-05, + "loss": 0.9901, + "step": 372 + }, + { + "epoch": 1.0026881720430108, + "grad_norm": 0.37625522072539047, + "learning_rate": 2.178262703131552e-05, + "loss": 0.9968, + "step": 373 + }, + { + "epoch": 1.0053763440860215, + "grad_norm": 0.3372266500196924, + "learning_rate": 2.169371501047995e-05, + "loss": 0.9412, + "step": 374 + }, + { + "epoch": 1.0080645161290323, + "grad_norm": 0.4054609590993035, + "learning_rate": 2.160476926137616e-05, + "loss": 0.9854, + "step": 375 + }, + { + "epoch": 1.010752688172043, + "grad_norm": 0.4699260706715865, + "learning_rate": 2.1515791555250236e-05, + "loss": 0.9842, + "step": 376 + }, + { + "epoch": 1.0134408602150538, + "grad_norm": 0.4110199971228794, + "learning_rate": 2.1426783663984648e-05, + "loss": 0.9907, + "step": 377 + }, + { + "epoch": 1.0161290322580645, + "grad_norm": 0.37859507690512056, + "learning_rate": 2.133774736006297e-05, + "loss": 0.9802, + "step": 378 + }, + { + "epoch": 1.0188172043010753, + "grad_norm": 0.3899354406871148, + "learning_rate": 2.1248684416534586e-05, + "loss": 0.9572, + "step": 379 + }, + { + "epoch": 1.021505376344086, + "grad_norm": 0.440058465578572, + "learning_rate": 2.115959660697935e-05, + "loss": 0.9844, + "step": 380 + }, + { + "epoch": 1.0241935483870968, + "grad_norm": 0.3520254725000719, + "learning_rate": 2.1070485705472305e-05, + "loss": 0.9814, + "step": 381 + }, + { + "epoch": 1.0268817204301075, + "grad_norm": 0.3539722695025887, + "learning_rate": 2.0981353486548363e-05, + "loss": 0.9639, + "step": 382 + }, + { + "epoch": 1.0295698924731183, + "grad_norm": 0.4194473931929923, + "learning_rate": 2.0892201725166918e-05, + "loss": 0.9535, + "step": 383 + }, + { + "epoch": 1.032258064516129, + "grad_norm": 0.40610578232489386, + "learning_rate": 2.0803032196676542e-05, + "loss": 0.9503, + "step": 384 + }, + { + "epoch": 1.0349462365591398, + "grad_norm": 0.39740618952578477, + "learning_rate": 2.0713846676779613e-05, + "loss": 0.9649, + "step": 385 + }, + { + "epoch": 1.0376344086021505, + "grad_norm": 0.40362301965301367, + "learning_rate": 2.0624646941496957e-05, + "loss": 0.9439, + "step": 386 + }, + { + "epoch": 1.0403225806451613, + "grad_norm": 0.3941991828164143, + "learning_rate": 2.0535434767132495e-05, + "loss": 0.9714, + "step": 387 + }, + { + "epoch": 1.043010752688172, + "grad_norm": 0.43714978534391263, + "learning_rate": 2.0446211930237828e-05, + "loss": 0.9825, + "step": 388 + }, + { + "epoch": 1.0456989247311828, + "grad_norm": 0.4288060089603866, + "learning_rate": 2.0356980207576923e-05, + "loss": 0.955, + "step": 389 + }, + { + "epoch": 1.0483870967741935, + "grad_norm": 0.42501218262610596, + "learning_rate": 2.026774137609068e-05, + "loss": 0.9294, + "step": 390 + }, + { + "epoch": 1.0510752688172043, + "grad_norm": 0.3797408497083259, + "learning_rate": 2.017849721286155e-05, + "loss": 0.925, + "step": 391 + }, + { + "epoch": 1.053763440860215, + "grad_norm": 0.4192330864087085, + "learning_rate": 2.0089249495078186e-05, + "loss": 0.9942, + "step": 392 + }, + { + "epoch": 1.0564516129032258, + "grad_norm": 0.432069127816113, + "learning_rate": 2e-05, + "loss": 0.991, + "step": 393 + }, + { + "epoch": 1.0591397849462365, + "grad_norm": 0.4099267346630584, + "learning_rate": 1.991075050492182e-05, + "loss": 0.9895, + "step": 394 + }, + { + "epoch": 1.0618279569892473, + "grad_norm": 0.4495162016467118, + "learning_rate": 1.9821502787138457e-05, + "loss": 0.9581, + "step": 395 + }, + { + "epoch": 1.064516129032258, + "grad_norm": 0.4164111570075975, + "learning_rate": 1.973225862390933e-05, + "loss": 0.9372, + "step": 396 + }, + { + "epoch": 1.0672043010752688, + "grad_norm": 0.42182057950960955, + "learning_rate": 1.964301979242308e-05, + "loss": 0.8968, + "step": 397 + }, + { + "epoch": 1.0698924731182795, + "grad_norm": 0.40971128479229557, + "learning_rate": 1.955378806976218e-05, + "loss": 0.9579, + "step": 398 + }, + { + "epoch": 1.0725806451612903, + "grad_norm": 0.3788909632878751, + "learning_rate": 1.9464565232867512e-05, + "loss": 0.9528, + "step": 399 + }, + { + "epoch": 1.075268817204301, + "grad_norm": 0.43822514312132327, + "learning_rate": 1.9375353058503054e-05, + "loss": 0.9564, + "step": 400 + }, + { + "epoch": 1.0779569892473118, + "grad_norm": 0.3639778816066556, + "learning_rate": 1.9286153323220393e-05, + "loss": 0.9153, + "step": 401 + }, + { + "epoch": 1.0806451612903225, + "grad_norm": 0.43609982064320735, + "learning_rate": 1.9196967803323464e-05, + "loss": 0.8832, + "step": 402 + }, + { + "epoch": 1.0833333333333333, + "grad_norm": 0.39633091853296737, + "learning_rate": 1.9107798274833092e-05, + "loss": 0.9292, + "step": 403 + }, + { + "epoch": 1.086021505376344, + "grad_norm": 0.3522476588548908, + "learning_rate": 1.901864651345164e-05, + "loss": 0.942, + "step": 404 + }, + { + "epoch": 1.0887096774193548, + "grad_norm": 0.3273883068895211, + "learning_rate": 1.8929514294527698e-05, + "loss": 0.8661, + "step": 405 + }, + { + "epoch": 1.0913978494623655, + "grad_norm": 0.4476510585706847, + "learning_rate": 1.8840403393020663e-05, + "loss": 0.9595, + "step": 406 + }, + { + "epoch": 1.0940860215053763, + "grad_norm": 0.3901195297026457, + "learning_rate": 1.875131558346542e-05, + "loss": 0.9486, + "step": 407 + }, + { + "epoch": 1.096774193548387, + "grad_norm": 0.3485725672680906, + "learning_rate": 1.866225263993703e-05, + "loss": 0.8658, + "step": 408 + }, + { + "epoch": 1.0994623655913978, + "grad_norm": 0.41331554397279857, + "learning_rate": 1.8573216336015355e-05, + "loss": 0.9012, + "step": 409 + }, + { + "epoch": 1.1021505376344085, + "grad_norm": 0.4151212580924691, + "learning_rate": 1.848420844474977e-05, + "loss": 0.9663, + "step": 410 + }, + { + "epoch": 1.1048387096774193, + "grad_norm": 0.38617856497225356, + "learning_rate": 1.839523073862385e-05, + "loss": 0.9066, + "step": 411 + }, + { + "epoch": 1.10752688172043, + "grad_norm": 0.3813756613319048, + "learning_rate": 1.8306284989520055e-05, + "loss": 0.9128, + "step": 412 + }, + { + "epoch": 1.1102150537634408, + "grad_norm": 0.3756411514924831, + "learning_rate": 1.8217372968684483e-05, + "loss": 0.9689, + "step": 413 + }, + { + "epoch": 1.1129032258064515, + "grad_norm": 0.40020922129279124, + "learning_rate": 1.8128496446691563e-05, + "loss": 0.9321, + "step": 414 + }, + { + "epoch": 1.1155913978494623, + "grad_norm": 0.37642937721697783, + "learning_rate": 1.8039657193408788e-05, + "loss": 0.9186, + "step": 415 + }, + { + "epoch": 1.118279569892473, + "grad_norm": 0.42399625828357296, + "learning_rate": 1.795085697796153e-05, + "loss": 0.9169, + "step": 416 + }, + { + "epoch": 1.120967741935484, + "grad_norm": 0.4030377840102742, + "learning_rate": 1.786209756869775e-05, + "loss": 0.9179, + "step": 417 + }, + { + "epoch": 1.1236559139784945, + "grad_norm": 0.4079171223898751, + "learning_rate": 1.7773380733152786e-05, + "loss": 0.9482, + "step": 418 + }, + { + "epoch": 1.1263440860215055, + "grad_norm": 0.38158416365856496, + "learning_rate": 1.76847082380142e-05, + "loss": 0.9027, + "step": 419 + }, + { + "epoch": 1.129032258064516, + "grad_norm": 0.4124329106576186, + "learning_rate": 1.7596081849086562e-05, + "loss": 0.8622, + "step": 420 + }, + { + "epoch": 1.131720430107527, + "grad_norm": 0.33896062702559865, + "learning_rate": 1.7507503331256283e-05, + "loss": 0.9051, + "step": 421 + }, + { + "epoch": 1.1344086021505375, + "grad_norm": 0.4010835912883116, + "learning_rate": 1.741897444845649e-05, + "loss": 0.9089, + "step": 422 + }, + { + "epoch": 1.1370967741935485, + "grad_norm": 0.39948068332931136, + "learning_rate": 1.7330496963631883e-05, + "loss": 0.9289, + "step": 423 + }, + { + "epoch": 1.139784946236559, + "grad_norm": 0.41161766967963404, + "learning_rate": 1.7242072638703627e-05, + "loss": 0.8759, + "step": 424 + }, + { + "epoch": 1.14247311827957, + "grad_norm": 0.3528790684520857, + "learning_rate": 1.7153703234534302e-05, + "loss": 0.933, + "step": 425 + }, + { + "epoch": 1.1451612903225807, + "grad_norm": 0.3520898679904786, + "learning_rate": 1.7065390510892767e-05, + "loss": 0.9317, + "step": 426 + }, + { + "epoch": 1.1478494623655915, + "grad_norm": 0.3879958084060956, + "learning_rate": 1.6977136226419187e-05, + "loss": 0.912, + "step": 427 + }, + { + "epoch": 1.1505376344086022, + "grad_norm": 0.38003669279752317, + "learning_rate": 1.6888942138589977e-05, + "loss": 0.8905, + "step": 428 + }, + { + "epoch": 1.153225806451613, + "grad_norm": 0.38751859131970406, + "learning_rate": 1.68008100036828e-05, + "loss": 0.9537, + "step": 429 + }, + { + "epoch": 1.1559139784946237, + "grad_norm": 0.35387610231825767, + "learning_rate": 1.67127415767416e-05, + "loss": 0.9069, + "step": 430 + }, + { + "epoch": 1.1586021505376345, + "grad_norm": 0.3791427138266714, + "learning_rate": 1.6624738611541685e-05, + "loss": 0.9218, + "step": 431 + }, + { + "epoch": 1.1612903225806452, + "grad_norm": 0.3802278199985049, + "learning_rate": 1.6536802860554723e-05, + "loss": 0.8637, + "step": 432 + }, + { + "epoch": 1.163978494623656, + "grad_norm": 0.4032654259366595, + "learning_rate": 1.6448936074913938e-05, + "loss": 0.8948, + "step": 433 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.5392331130434859, + "learning_rate": 1.6361140004379165e-05, + "loss": 0.9121, + "step": 434 + }, + { + "epoch": 1.1693548387096775, + "grad_norm": 0.4241698900501277, + "learning_rate": 1.6273416397302043e-05, + "loss": 0.9073, + "step": 435 + }, + { + "epoch": 1.1720430107526882, + "grad_norm": 0.4332038553957603, + "learning_rate": 1.6185767000591202e-05, + "loss": 0.9443, + "step": 436 + }, + { + "epoch": 1.174731182795699, + "grad_norm": 0.36935972988007515, + "learning_rate": 1.609819355967744e-05, + "loss": 0.9264, + "step": 437 + }, + { + "epoch": 1.1774193548387097, + "grad_norm": 0.3451086959697611, + "learning_rate": 1.6010697818478996e-05, + "loss": 0.8848, + "step": 438 + }, + { + "epoch": 1.1801075268817205, + "grad_norm": 0.4120432714099029, + "learning_rate": 1.5923281519366832e-05, + "loss": 0.901, + "step": 439 + }, + { + "epoch": 1.1827956989247312, + "grad_norm": 0.45212322860187754, + "learning_rate": 1.5835946403129886e-05, + "loss": 0.906, + "step": 440 + }, + { + "epoch": 1.185483870967742, + "grad_norm": 0.3604005645749173, + "learning_rate": 1.5748694208940467e-05, + "loss": 0.8777, + "step": 441 + }, + { + "epoch": 1.1881720430107527, + "grad_norm": 0.3604056072515275, + "learning_rate": 1.5661526674319582e-05, + "loss": 0.919, + "step": 442 + }, + { + "epoch": 1.1908602150537635, + "grad_norm": 0.34313703093643527, + "learning_rate": 1.557444553510233e-05, + "loss": 0.9329, + "step": 443 + }, + { + "epoch": 1.1935483870967742, + "grad_norm": 0.37375432993164387, + "learning_rate": 1.548745252540339e-05, + "loss": 0.9481, + "step": 444 + }, + { + "epoch": 1.196236559139785, + "grad_norm": 0.43080367488426863, + "learning_rate": 1.5400549377582392e-05, + "loss": 0.894, + "step": 445 + }, + { + "epoch": 1.1989247311827957, + "grad_norm": 0.3750508182193543, + "learning_rate": 1.5313737822209532e-05, + "loss": 0.922, + "step": 446 + }, + { + "epoch": 1.2016129032258065, + "grad_norm": 0.4276115589621554, + "learning_rate": 1.5227019588031035e-05, + "loss": 0.8569, + "step": 447 + }, + { + "epoch": 1.2043010752688172, + "grad_norm": 0.36909790172392043, + "learning_rate": 1.5140396401934725e-05, + "loss": 0.8656, + "step": 448 + }, + { + "epoch": 1.206989247311828, + "grad_norm": 0.44266858883203647, + "learning_rate": 1.5053869988915691e-05, + "loss": 0.9371, + "step": 449 + }, + { + "epoch": 1.2096774193548387, + "grad_norm": 0.3725440482769685, + "learning_rate": 1.4967442072041895e-05, + "loss": 0.9103, + "step": 450 + }, + { + "epoch": 1.2123655913978495, + "grad_norm": 0.36815385084670166, + "learning_rate": 1.4881114372419854e-05, + "loss": 0.8499, + "step": 451 + }, + { + "epoch": 1.2150537634408602, + "grad_norm": 0.3730907651609142, + "learning_rate": 1.47948886091604e-05, + "loss": 0.928, + "step": 452 + }, + { + "epoch": 1.217741935483871, + "grad_norm": 0.4105069671252027, + "learning_rate": 1.4708766499344424e-05, + "loss": 0.8816, + "step": 453 + }, + { + "epoch": 1.2204301075268817, + "grad_norm": 0.3606969658371548, + "learning_rate": 1.462274975798867e-05, + "loss": 0.8993, + "step": 454 + }, + { + "epoch": 1.2231182795698925, + "grad_norm": 0.3819716903785989, + "learning_rate": 1.4536840098011613e-05, + "loss": 0.9016, + "step": 455 + }, + { + "epoch": 1.2258064516129032, + "grad_norm": 0.3468890936243422, + "learning_rate": 1.4451039230199317e-05, + "loss": 0.8533, + "step": 456 + }, + { + "epoch": 1.228494623655914, + "grad_norm": 0.4219447671378407, + "learning_rate": 1.4365348863171406e-05, + "loss": 0.8838, + "step": 457 + }, + { + "epoch": 1.2311827956989247, + "grad_norm": 0.38132389434045155, + "learning_rate": 1.4279770703347008e-05, + "loss": 0.9027, + "step": 458 + }, + { + "epoch": 1.2338709677419355, + "grad_norm": 0.40246629352796237, + "learning_rate": 1.4194306454910757e-05, + "loss": 0.9246, + "step": 459 + }, + { + "epoch": 1.2365591397849462, + "grad_norm": 0.3909042227194992, + "learning_rate": 1.410895781977891e-05, + "loss": 0.8486, + "step": 460 + }, + { + "epoch": 1.239247311827957, + "grad_norm": 0.3674117695470089, + "learning_rate": 1.4023726497565422e-05, + "loss": 0.8977, + "step": 461 + }, + { + "epoch": 1.2419354838709677, + "grad_norm": 0.41824933124811753, + "learning_rate": 1.3938614185548094e-05, + "loss": 0.8888, + "step": 462 + }, + { + "epoch": 1.2446236559139785, + "grad_norm": 0.40078055707466587, + "learning_rate": 1.385362257863478e-05, + "loss": 0.8963, + "step": 463 + }, + { + "epoch": 1.2473118279569892, + "grad_norm": 0.3611909331897887, + "learning_rate": 1.3768753369329664e-05, + "loss": 0.9065, + "step": 464 + }, + { + "epoch": 1.25, + "grad_norm": 0.3889978301002571, + "learning_rate": 1.3684008247699505e-05, + "loss": 0.8655, + "step": 465 + }, + { + "epoch": 1.2526881720430108, + "grad_norm": 0.3967333759192253, + "learning_rate": 1.3599388901340019e-05, + "loss": 0.9289, + "step": 466 + }, + { + "epoch": 1.2553763440860215, + "grad_norm": 0.3930903953239825, + "learning_rate": 1.3514897015342257e-05, + "loss": 0.9004, + "step": 467 + }, + { + "epoch": 1.2580645161290323, + "grad_norm": 0.39782516966072734, + "learning_rate": 1.343053427225905e-05, + "loss": 0.9356, + "step": 468 + }, + { + "epoch": 1.260752688172043, + "grad_norm": 0.3755719466909933, + "learning_rate": 1.3346302352071525e-05, + "loss": 0.9308, + "step": 469 + }, + { + "epoch": 1.2634408602150538, + "grad_norm": 0.4574705672937729, + "learning_rate": 1.3262202932155602e-05, + "loss": 0.8518, + "step": 470 + }, + { + "epoch": 1.2661290322580645, + "grad_norm": 0.387011509571043, + "learning_rate": 1.3178237687248632e-05, + "loss": 0.9082, + "step": 471 + }, + { + "epoch": 1.2688172043010753, + "grad_norm": 0.4362071151177883, + "learning_rate": 1.3094408289416052e-05, + "loss": 0.8711, + "step": 472 + }, + { + "epoch": 1.271505376344086, + "grad_norm": 0.39574828340378626, + "learning_rate": 1.3010716408018037e-05, + "loss": 0.8927, + "step": 473 + }, + { + "epoch": 1.2741935483870968, + "grad_norm": 0.38582577248876887, + "learning_rate": 1.2927163709676305e-05, + "loss": 0.8953, + "step": 474 + }, + { + "epoch": 1.2768817204301075, + "grad_norm": 0.39993052025189596, + "learning_rate": 1.2843751858240938e-05, + "loss": 0.8545, + "step": 475 + }, + { + "epoch": 1.2795698924731183, + "grad_norm": 0.3758518397155174, + "learning_rate": 1.276048251475717e-05, + "loss": 0.8853, + "step": 476 + }, + { + "epoch": 1.282258064516129, + "grad_norm": 0.3673034097829057, + "learning_rate": 1.267735733743242e-05, + "loss": 0.8618, + "step": 477 + }, + { + "epoch": 1.2849462365591398, + "grad_norm": 0.3579444429196611, + "learning_rate": 1.2594377981603167e-05, + "loss": 0.9049, + "step": 478 + }, + { + "epoch": 1.2876344086021505, + "grad_norm": 0.3512023955004902, + "learning_rate": 1.251154609970206e-05, + "loss": 0.905, + "step": 479 + }, + { + "epoch": 1.2903225806451613, + "grad_norm": 0.3821816087256963, + "learning_rate": 1.2428863341224988e-05, + "loss": 0.8603, + "step": 480 + }, + { + "epoch": 1.293010752688172, + "grad_norm": 0.4133864396732673, + "learning_rate": 1.2346331352698206e-05, + "loss": 0.8936, + "step": 481 + }, + { + "epoch": 1.2956989247311828, + "grad_norm": 0.4291865828849791, + "learning_rate": 1.2263951777645588e-05, + "loss": 0.8951, + "step": 482 + }, + { + "epoch": 1.2983870967741935, + "grad_norm": 0.3960987966319793, + "learning_rate": 1.2181726256555877e-05, + "loss": 0.8656, + "step": 483 + }, + { + "epoch": 1.3010752688172043, + "grad_norm": 0.3666707325539653, + "learning_rate": 1.2099656426850004e-05, + "loss": 0.9047, + "step": 484 + }, + { + "epoch": 1.303763440860215, + "grad_norm": 0.36431675184531326, + "learning_rate": 1.2017743922848518e-05, + "loss": 0.8792, + "step": 485 + }, + { + "epoch": 1.3064516129032258, + "grad_norm": 0.4255014642528041, + "learning_rate": 1.1935990375739011e-05, + "loss": 0.8804, + "step": 486 + }, + { + "epoch": 1.3091397849462365, + "grad_norm": 0.381858451178781, + "learning_rate": 1.1854397413543626e-05, + "loss": 0.919, + "step": 487 + }, + { + "epoch": 1.3118279569892473, + "grad_norm": 0.35522559885326904, + "learning_rate": 1.177296666108669e-05, + "loss": 0.9347, + "step": 488 + }, + { + "epoch": 1.314516129032258, + "grad_norm": 0.37099789206174566, + "learning_rate": 1.1691699739962275e-05, + "loss": 0.936, + "step": 489 + }, + { + "epoch": 1.3172043010752688, + "grad_norm": 0.36298491750372225, + "learning_rate": 1.1610598268501982e-05, + "loss": 0.8553, + "step": 490 + }, + { + "epoch": 1.3198924731182795, + "grad_norm": 0.3634911000699515, + "learning_rate": 1.1529663861742692e-05, + "loss": 0.9017, + "step": 491 + }, + { + "epoch": 1.3225806451612903, + "grad_norm": 0.4152917791390396, + "learning_rate": 1.1448898131394364e-05, + "loss": 0.8788, + "step": 492 + }, + { + "epoch": 1.325268817204301, + "grad_norm": 0.4244046126280729, + "learning_rate": 1.1368302685807984e-05, + "loss": 0.9105, + "step": 493 + }, + { + "epoch": 1.3279569892473118, + "grad_norm": 0.4305679319947305, + "learning_rate": 1.1287879129943558e-05, + "loss": 0.8712, + "step": 494 + }, + { + "epoch": 1.3306451612903225, + "grad_norm": 0.3815632433540905, + "learning_rate": 1.1207629065338063e-05, + "loss": 0.9274, + "step": 495 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.395491462517581, + "learning_rate": 1.1127554090073639e-05, + "loss": 0.8852, + "step": 496 + }, + { + "epoch": 1.336021505376344, + "grad_norm": 0.3412204028961129, + "learning_rate": 1.1047655798745752e-05, + "loss": 0.909, + "step": 497 + }, + { + "epoch": 1.3387096774193548, + "grad_norm": 0.42252187704467103, + "learning_rate": 1.0967935782431382e-05, + "loss": 0.9152, + "step": 498 + }, + { + "epoch": 1.3413978494623655, + "grad_norm": 0.3660726577260679, + "learning_rate": 1.0888395628657413e-05, + "loss": 0.9087, + "step": 499 + }, + { + "epoch": 1.3440860215053765, + "grad_norm": 0.41163724258371354, + "learning_rate": 1.0809036921368966e-05, + "loss": 0.8401, + "step": 500 + }, + { + "epoch": 1.346774193548387, + "grad_norm": 0.4662876589603711, + "learning_rate": 1.0729861240897892e-05, + "loss": 0.8499, + "step": 501 + }, + { + "epoch": 1.349462365591398, + "grad_norm": 0.38293219107966625, + "learning_rate": 1.0650870163931275e-05, + "loss": 0.8773, + "step": 502 + }, + { + "epoch": 1.3521505376344085, + "grad_norm": 0.4100231412619493, + "learning_rate": 1.0572065263480046e-05, + "loss": 0.874, + "step": 503 + }, + { + "epoch": 1.3548387096774195, + "grad_norm": 0.3609567532765614, + "learning_rate": 1.0493448108847669e-05, + "loss": 0.9064, + "step": 504 + }, + { + "epoch": 1.35752688172043, + "grad_norm": 0.3995575869896119, + "learning_rate": 1.0415020265598872e-05, + "loss": 0.8599, + "step": 505 + }, + { + "epoch": 1.360215053763441, + "grad_norm": 0.39891198490046775, + "learning_rate": 1.0336783295528454e-05, + "loss": 0.8696, + "step": 506 + }, + { + "epoch": 1.3629032258064515, + "grad_norm": 0.3869673460119376, + "learning_rate": 1.0258738756630255e-05, + "loss": 0.8882, + "step": 507 + }, + { + "epoch": 1.3655913978494625, + "grad_norm": 0.3785572445627051, + "learning_rate": 1.0180888203066059e-05, + "loss": 0.8653, + "step": 508 + }, + { + "epoch": 1.368279569892473, + "grad_norm": 0.4042858657113599, + "learning_rate": 1.0103233185134647e-05, + "loss": 0.8197, + "step": 509 + }, + { + "epoch": 1.370967741935484, + "grad_norm": 0.3930704370351199, + "learning_rate": 1.0025775249240993e-05, + "loss": 0.8542, + "step": 510 + }, + { + "epoch": 1.3736559139784945, + "grad_norm": 0.35542964413505407, + "learning_rate": 9.948515937865375e-06, + "loss": 0.8476, + "step": 511 + }, + { + "epoch": 1.3763440860215055, + "grad_norm": 0.42702317687809166, + "learning_rate": 9.871456789532736e-06, + "loss": 0.9089, + "step": 512 + }, + { + "epoch": 1.379032258064516, + "grad_norm": 0.3930984004263597, + "learning_rate": 9.794599338782011e-06, + "loss": 0.9033, + "step": 513 + }, + { + "epoch": 1.381720430107527, + "grad_norm": 0.36398813175120787, + "learning_rate": 9.717945116135568e-06, + "loss": 0.8921, + "step": 514 + }, + { + "epoch": 1.3844086021505375, + "grad_norm": 0.3716360571507653, + "learning_rate": 9.641495648068739e-06, + "loss": 0.8826, + "step": 515 + }, + { + "epoch": 1.3870967741935485, + "grad_norm": 0.5477375096224166, + "learning_rate": 9.56525245697942e-06, + "loss": 0.9088, + "step": 516 + }, + { + "epoch": 1.389784946236559, + "grad_norm": 0.35084597720275207, + "learning_rate": 9.489217061157744e-06, + "loss": 0.8777, + "step": 517 + }, + { + "epoch": 1.39247311827957, + "grad_norm": 0.41785033927597015, + "learning_rate": 9.413390974755864e-06, + "loss": 0.8934, + "step": 518 + }, + { + "epoch": 1.3951612903225805, + "grad_norm": 0.3942271377890524, + "learning_rate": 9.337775707757792e-06, + "loss": 0.8795, + "step": 519 + }, + { + "epoch": 1.3978494623655915, + "grad_norm": 0.43266747223335905, + "learning_rate": 9.262372765949319e-06, + "loss": 0.9344, + "step": 520 + }, + { + "epoch": 1.400537634408602, + "grad_norm": 0.3555286580841176, + "learning_rate": 9.187183650888056e-06, + "loss": 0.8561, + "step": 521 + }, + { + "epoch": 1.403225806451613, + "grad_norm": 0.4326861883163117, + "learning_rate": 9.112209859873479e-06, + "loss": 0.8748, + "step": 522 + }, + { + "epoch": 1.4059139784946235, + "grad_norm": 0.3911645714163196, + "learning_rate": 9.037452885917197e-06, + "loss": 0.8741, + "step": 523 + }, + { + "epoch": 1.4086021505376345, + "grad_norm": 0.3721004990469904, + "learning_rate": 8.962914217713148e-06, + "loss": 0.9123, + "step": 524 + }, + { + "epoch": 1.4112903225806452, + "grad_norm": 0.3504669386782379, + "learning_rate": 8.888595339607961e-06, + "loss": 0.9166, + "step": 525 + }, + { + "epoch": 1.413978494623656, + "grad_norm": 0.38727418818749926, + "learning_rate": 8.814497731571432e-06, + "loss": 0.8756, + "step": 526 + }, + { + "epoch": 1.4166666666666667, + "grad_norm": 0.3530651014121482, + "learning_rate": 8.74062286916705e-06, + "loss": 0.8953, + "step": 527 + }, + { + "epoch": 1.4193548387096775, + "grad_norm": 0.9061717130965162, + "learning_rate": 8.666972223522559e-06, + "loss": 0.898, + "step": 528 + }, + { + "epoch": 1.4220430107526882, + "grad_norm": 0.3892987110948611, + "learning_rate": 8.593547261300716e-06, + "loss": 0.8561, + "step": 529 + }, + { + "epoch": 1.424731182795699, + "grad_norm": 0.38394033117089926, + "learning_rate": 8.520349444670093e-06, + "loss": 0.8868, + "step": 530 + }, + { + "epoch": 1.4274193548387097, + "grad_norm": 0.4557984907315737, + "learning_rate": 8.447380231275889e-06, + "loss": 0.8617, + "step": 531 + }, + { + "epoch": 1.4301075268817205, + "grad_norm": 0.375452470598354, + "learning_rate": 8.374641074210979e-06, + "loss": 0.8734, + "step": 532 + }, + { + "epoch": 1.4327956989247312, + "grad_norm": 0.4144183699699819, + "learning_rate": 8.30213342198694e-06, + "loss": 0.8523, + "step": 533 + }, + { + "epoch": 1.435483870967742, + "grad_norm": 0.42069408475654074, + "learning_rate": 8.229858718505212e-06, + "loss": 0.8791, + "step": 534 + }, + { + "epoch": 1.4381720430107527, + "grad_norm": 0.48297854323803846, + "learning_rate": 8.157818403028343e-06, + "loss": 0.8565, + "step": 535 + }, + { + "epoch": 1.4408602150537635, + "grad_norm": 0.39205140724296417, + "learning_rate": 8.086013910151334e-06, + "loss": 0.8831, + "step": 536 + }, + { + "epoch": 1.4435483870967742, + "grad_norm": 0.4124811246961915, + "learning_rate": 8.014446669773061e-06, + "loss": 0.8508, + "step": 537 + }, + { + "epoch": 1.446236559139785, + "grad_norm": 0.3697888279992508, + "learning_rate": 7.943118107067813e-06, + "loss": 0.8964, + "step": 538 + }, + { + "epoch": 1.4489247311827957, + "grad_norm": 0.35775344448605834, + "learning_rate": 7.872029642456895e-06, + "loss": 0.8804, + "step": 539 + }, + { + "epoch": 1.4516129032258065, + "grad_norm": 0.4050701342223883, + "learning_rate": 7.801182691580362e-06, + "loss": 0.8798, + "step": 540 + }, + { + "epoch": 1.4543010752688172, + "grad_norm": 0.37810189432390856, + "learning_rate": 7.730578665268815e-06, + "loss": 0.9053, + "step": 541 + }, + { + "epoch": 1.456989247311828, + "grad_norm": 0.4016865992606119, + "learning_rate": 7.66021896951529e-06, + "loss": 0.8787, + "step": 542 + }, + { + "epoch": 1.4596774193548387, + "grad_norm": 0.3745942539375088, + "learning_rate": 7.590105005447317e-06, + "loss": 0.8693, + "step": 543 + }, + { + "epoch": 1.4623655913978495, + "grad_norm": 0.4773383301892084, + "learning_rate": 7.520238169298937e-06, + "loss": 0.8696, + "step": 544 + }, + { + "epoch": 1.4650537634408602, + "grad_norm": 0.3915177577878335, + "learning_rate": 7.450619852382959e-06, + "loss": 0.8985, + "step": 545 + }, + { + "epoch": 1.467741935483871, + "grad_norm": 0.4002539636329483, + "learning_rate": 7.381251441063255e-06, + "loss": 0.8545, + "step": 546 + }, + { + "epoch": 1.4704301075268817, + "grad_norm": 0.4556350475396439, + "learning_rate": 7.312134316727093e-06, + "loss": 0.8445, + "step": 547 + }, + { + "epoch": 1.4731182795698925, + "grad_norm": 0.4195728788592859, + "learning_rate": 7.243269855757693e-06, + "loss": 0.9239, + "step": 548 + }, + { + "epoch": 1.4758064516129032, + "grad_norm": 0.44054388742759737, + "learning_rate": 7.1746594295067826e-06, + "loss": 0.9248, + "step": 549 + }, + { + "epoch": 1.478494623655914, + "grad_norm": 0.42709109156924885, + "learning_rate": 7.106304404267304e-06, + "loss": 0.8424, + "step": 550 + }, + { + "epoch": 1.4811827956989247, + "grad_norm": 0.39224047779710847, + "learning_rate": 7.0382061412461935e-06, + "loss": 0.8638, + "step": 551 + }, + { + "epoch": 1.4838709677419355, + "grad_norm": 0.3480064313882409, + "learning_rate": 6.970365996537285e-06, + "loss": 0.866, + "step": 552 + }, + { + "epoch": 1.4865591397849462, + "grad_norm": 0.4470655366145503, + "learning_rate": 6.902785321094301e-06, + "loss": 0.8854, + "step": 553 + }, + { + "epoch": 1.489247311827957, + "grad_norm": 0.427599328724393, + "learning_rate": 6.8354654607039535e-06, + "loss": 0.8864, + "step": 554 + }, + { + "epoch": 1.4919354838709677, + "grad_norm": 0.41347796402657694, + "learning_rate": 6.768407755959119e-06, + "loss": 0.8928, + "step": 555 + }, + { + "epoch": 1.4946236559139785, + "grad_norm": 0.3826360606173244, + "learning_rate": 6.701613542232202e-06, + "loss": 0.9089, + "step": 556 + }, + { + "epoch": 1.4973118279569892, + "grad_norm": 0.36892665307361505, + "learning_rate": 6.635084149648481e-06, + "loss": 0.8484, + "step": 557 + }, + { + "epoch": 1.5, + "grad_norm": 0.36406926057608424, + "learning_rate": 6.568820903059632e-06, + "loss": 0.8632, + "step": 558 + } + ], + "logging_steps": 1, + "max_steps": 744, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 186, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2315107631628288.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}