{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 372, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002688172043010753, "grad_norm": 1.6433222600981285, "learning_rate": 0.0, "loss": 1.562, "step": 1 }, { "epoch": 0.005376344086021506, "grad_norm": 1.6862631068558513, "learning_rate": 1.0000000000000002e-06, "loss": 1.4706, "step": 2 }, { "epoch": 0.008064516129032258, "grad_norm": 1.7423201097805276, "learning_rate": 2.0000000000000003e-06, "loss": 1.5406, "step": 3 }, { "epoch": 0.010752688172043012, "grad_norm": 1.7727625055064622, "learning_rate": 3e-06, "loss": 1.5182, "step": 4 }, { "epoch": 0.013440860215053764, "grad_norm": 1.5457482765192463, "learning_rate": 4.000000000000001e-06, "loss": 1.5169, "step": 5 }, { "epoch": 0.016129032258064516, "grad_norm": 1.5659007249743502, "learning_rate": 5e-06, "loss": 1.4922, "step": 6 }, { "epoch": 0.01881720430107527, "grad_norm": 1.3878881126089677, "learning_rate": 6e-06, "loss": 1.4863, "step": 7 }, { "epoch": 0.021505376344086023, "grad_norm": 1.295368020848385, "learning_rate": 7e-06, "loss": 1.4839, "step": 8 }, { "epoch": 0.024193548387096774, "grad_norm": 1.589857887668944, "learning_rate": 8.000000000000001e-06, "loss": 1.4303, "step": 9 }, { "epoch": 0.026881720430107527, "grad_norm": 2.60679604894195, "learning_rate": 9e-06, "loss": 1.3744, "step": 10 }, { "epoch": 0.02956989247311828, "grad_norm": 0.8410885692002656, "learning_rate": 1e-05, "loss": 1.3498, "step": 11 }, { "epoch": 0.03225806451612903, "grad_norm": 0.7927855266728604, "learning_rate": 1.1000000000000001e-05, "loss": 1.3179, "step": 12 }, { "epoch": 0.03494623655913978, "grad_norm": 0.6808035050220127, "learning_rate": 1.2e-05, "loss": 1.3268, "step": 13 }, { "epoch": 0.03763440860215054, "grad_norm": 0.6602967909334083, "learning_rate": 1.3000000000000001e-05, "loss": 1.2784, "step": 14 }, { "epoch": 0.04032258064516129, "grad_norm": 0.5797556052811048, "learning_rate": 1.4e-05, "loss": 1.2949, "step": 15 }, { "epoch": 0.043010752688172046, "grad_norm": 0.6000541560518325, "learning_rate": 1.5000000000000002e-05, "loss": 1.288, "step": 16 }, { "epoch": 0.0456989247311828, "grad_norm": 0.6494981992893607, "learning_rate": 1.6000000000000003e-05, "loss": 1.2449, "step": 17 }, { "epoch": 0.04838709677419355, "grad_norm": 0.6723097988215474, "learning_rate": 1.7e-05, "loss": 1.2102, "step": 18 }, { "epoch": 0.051075268817204304, "grad_norm": 0.6702835925568053, "learning_rate": 1.8e-05, "loss": 1.2025, "step": 19 }, { "epoch": 0.053763440860215055, "grad_norm": 0.625636082792655, "learning_rate": 1.9e-05, "loss": 1.2777, "step": 20 }, { "epoch": 0.056451612903225805, "grad_norm": 0.6253912624763358, "learning_rate": 2e-05, "loss": 1.2669, "step": 21 }, { "epoch": 0.05913978494623656, "grad_norm": 0.5910337660829342, "learning_rate": 2.1000000000000002e-05, "loss": 1.2654, "step": 22 }, { "epoch": 0.06182795698924731, "grad_norm": 0.6304908028391322, "learning_rate": 2.2000000000000003e-05, "loss": 1.2413, "step": 23 }, { "epoch": 0.06451612903225806, "grad_norm": 0.5377853121890415, "learning_rate": 2.3e-05, "loss": 1.2109, "step": 24 }, { "epoch": 0.06720430107526881, "grad_norm": 0.4970873703549533, "learning_rate": 2.4e-05, "loss": 1.1359, "step": 25 }, { "epoch": 0.06989247311827956, "grad_norm": 0.5292734885521813, "learning_rate": 2.5e-05, "loss": 1.2236, "step": 26 }, { "epoch": 0.07258064516129033, "grad_norm": 0.5428754620149544, "learning_rate": 2.6000000000000002e-05, "loss": 1.2083, "step": 27 }, { "epoch": 0.07526881720430108, "grad_norm": 0.5711123503896314, "learning_rate": 2.7000000000000002e-05, "loss": 1.2161, "step": 28 }, { "epoch": 0.07795698924731183, "grad_norm": 0.49149041488377043, "learning_rate": 2.8e-05, "loss": 1.1454, "step": 29 }, { "epoch": 0.08064516129032258, "grad_norm": 0.5285852530799724, "learning_rate": 2.9e-05, "loss": 1.1194, "step": 30 }, { "epoch": 0.08333333333333333, "grad_norm": 0.5295555329242986, "learning_rate": 3.0000000000000004e-05, "loss": 1.1688, "step": 31 }, { "epoch": 0.08602150537634409, "grad_norm": 0.465354706566009, "learning_rate": 3.1e-05, "loss": 1.1743, "step": 32 }, { "epoch": 0.08870967741935484, "grad_norm": 0.4486072933924605, "learning_rate": 3.2000000000000005e-05, "loss": 1.0818, "step": 33 }, { "epoch": 0.0913978494623656, "grad_norm": 0.496727888984662, "learning_rate": 3.3e-05, "loss": 1.2101, "step": 34 }, { "epoch": 0.09408602150537634, "grad_norm": 0.43899748210993167, "learning_rate": 3.4e-05, "loss": 1.1884, "step": 35 }, { "epoch": 0.0967741935483871, "grad_norm": 0.4147227405541853, "learning_rate": 3.5000000000000004e-05, "loss": 1.0814, "step": 36 }, { "epoch": 0.09946236559139784, "grad_norm": 0.48760701758721925, "learning_rate": 3.6e-05, "loss": 1.1212, "step": 37 }, { "epoch": 0.10215053763440861, "grad_norm": 0.49917378567432974, "learning_rate": 3.7000000000000005e-05, "loss": 1.1984, "step": 38 }, { "epoch": 0.10483870967741936, "grad_norm": 0.5304015628409972, "learning_rate": 3.8e-05, "loss": 1.1274, "step": 39 }, { "epoch": 0.10752688172043011, "grad_norm": 0.4726408598975661, "learning_rate": 3.9e-05, "loss": 1.1323, "step": 40 }, { "epoch": 0.11021505376344086, "grad_norm": 0.44174146995469904, "learning_rate": 4e-05, "loss": 1.1898, "step": 41 }, { "epoch": 0.11290322580645161, "grad_norm": 0.5087279682773094, "learning_rate": 3.999980086219931e-05, "loss": 1.1469, "step": 42 }, { "epoch": 0.11559139784946236, "grad_norm": 0.5626510931079601, "learning_rate": 3.999920345276283e-05, "loss": 1.1321, "step": 43 }, { "epoch": 0.11827956989247312, "grad_norm": 0.47565220090788773, "learning_rate": 3.999820778358724e-05, "loss": 1.1453, "step": 44 }, { "epoch": 0.12096774193548387, "grad_norm": 0.4431044005508681, "learning_rate": 3.999681387450007e-05, "loss": 1.1408, "step": 45 }, { "epoch": 0.12365591397849462, "grad_norm": 0.47942624390584926, "learning_rate": 3.999502175325932e-05, "loss": 1.168, "step": 46 }, { "epoch": 0.12634408602150538, "grad_norm": 0.43166434321061714, "learning_rate": 3.999283145555291e-05, "loss": 1.1087, "step": 47 }, { "epoch": 0.12903225806451613, "grad_norm": 0.47105749411720044, "learning_rate": 3.999024302499794e-05, "loss": 1.0752, "step": 48 }, { "epoch": 0.13172043010752688, "grad_norm": 0.3959072081415341, "learning_rate": 3.998725651313984e-05, "loss": 1.1011, "step": 49 }, { "epoch": 0.13440860215053763, "grad_norm": 0.4416535692834609, "learning_rate": 3.998387197945135e-05, "loss": 1.1306, "step": 50 }, { "epoch": 0.13709677419354838, "grad_norm": 0.4272647809985287, "learning_rate": 3.9980089491331344e-05, "loss": 1.1381, "step": 51 }, { "epoch": 0.13978494623655913, "grad_norm": 0.47769854993592265, "learning_rate": 3.997590912410345e-05, "loss": 1.0976, "step": 52 }, { "epoch": 0.1424731182795699, "grad_norm": 0.3877500456630632, "learning_rate": 3.997133096101458e-05, "loss": 1.128, "step": 53 }, { "epoch": 0.14516129032258066, "grad_norm": 0.3869721085588235, "learning_rate": 3.996635509323327e-05, "loss": 1.1225, "step": 54 }, { "epoch": 0.1478494623655914, "grad_norm": 0.47271590281090886, "learning_rate": 3.9960981619847856e-05, "loss": 1.1141, "step": 55 }, { "epoch": 0.15053763440860216, "grad_norm": 0.4368206211090345, "learning_rate": 3.99552106478645e-05, "loss": 1.0872, "step": 56 }, { "epoch": 0.1532258064516129, "grad_norm": 0.3872679475185707, "learning_rate": 3.994904229220507e-05, "loss": 1.1514, "step": 57 }, { "epoch": 0.15591397849462366, "grad_norm": 0.406268890860899, "learning_rate": 3.9942476675704854e-05, "loss": 1.0965, "step": 58 }, { "epoch": 0.1586021505376344, "grad_norm": 0.43172418498531184, "learning_rate": 3.993551392911009e-05, "loss": 1.1192, "step": 59 }, { "epoch": 0.16129032258064516, "grad_norm": 0.4258357918752704, "learning_rate": 3.9928154191075375e-05, "loss": 1.0623, "step": 60 }, { "epoch": 0.1639784946236559, "grad_norm": 0.4585556740184179, "learning_rate": 3.9920397608160925e-05, "loss": 1.1076, "step": 61 }, { "epoch": 0.16666666666666666, "grad_norm": 0.44452627464263844, "learning_rate": 3.991224433482961e-05, "loss": 1.1107, "step": 62 }, { "epoch": 0.1693548387096774, "grad_norm": 0.4787003491624029, "learning_rate": 3.990369453344394e-05, "loss": 1.1165, "step": 63 }, { "epoch": 0.17204301075268819, "grad_norm": 0.4704549745433953, "learning_rate": 3.989474837426277e-05, "loss": 1.1541, "step": 64 }, { "epoch": 0.17473118279569894, "grad_norm": 0.4026214434021435, "learning_rate": 3.9885406035437953e-05, "loss": 1.1166, "step": 65 }, { "epoch": 0.1774193548387097, "grad_norm": 0.40057979364796353, "learning_rate": 3.987566770301076e-05, "loss": 1.0626, "step": 66 }, { "epoch": 0.18010752688172044, "grad_norm": 0.4340486368362563, "learning_rate": 3.98655335709082e-05, "loss": 1.104, "step": 67 }, { "epoch": 0.1827956989247312, "grad_norm": 0.42609639195543936, "learning_rate": 3.985500384093917e-05, "loss": 1.0893, "step": 68 }, { "epoch": 0.18548387096774194, "grad_norm": 0.381378569874383, "learning_rate": 3.984407872279037e-05, "loss": 1.0433, "step": 69 }, { "epoch": 0.1881720430107527, "grad_norm": 0.3903976348529897, "learning_rate": 3.983275843402222e-05, "loss": 1.1019, "step": 70 }, { "epoch": 0.19086021505376344, "grad_norm": 0.3648695348221521, "learning_rate": 3.982104320006446e-05, "loss": 1.0992, "step": 71 }, { "epoch": 0.1935483870967742, "grad_norm": 1.8993059639660952, "learning_rate": 3.9808933254211665e-05, "loss": 1.1056, "step": 72 }, { "epoch": 0.19623655913978494, "grad_norm": 0.46580843289168206, "learning_rate": 3.979642883761866e-05, "loss": 1.1031, "step": 73 }, { "epoch": 0.1989247311827957, "grad_norm": 0.449285515287558, "learning_rate": 3.978353019929562e-05, "loss": 1.1068, "step": 74 }, { "epoch": 0.20161290322580644, "grad_norm": 0.5567418056951845, "learning_rate": 3.977023759610321e-05, "loss": 1.0446, "step": 75 }, { "epoch": 0.20430107526881722, "grad_norm": 0.38684392317210076, "learning_rate": 3.9756551292747405e-05, "loss": 1.0377, "step": 76 }, { "epoch": 0.20698924731182797, "grad_norm": 0.473773440244898, "learning_rate": 3.974247156177423e-05, "loss": 1.1396, "step": 77 }, { "epoch": 0.20967741935483872, "grad_norm": 0.4177520757238314, "learning_rate": 3.9727998683564355e-05, "loss": 1.1008, "step": 78 }, { "epoch": 0.21236559139784947, "grad_norm": 0.39719194878309766, "learning_rate": 3.9713132946327494e-05, "loss": 1.0215, "step": 79 }, { "epoch": 0.21505376344086022, "grad_norm": 0.4105085260167095, "learning_rate": 3.9697874646096675e-05, "loss": 1.1115, "step": 80 }, { "epoch": 0.21774193548387097, "grad_norm": 0.4087045401288919, "learning_rate": 3.968222408672232e-05, "loss": 1.0579, "step": 81 }, { "epoch": 0.22043010752688172, "grad_norm": 0.39033402258475636, "learning_rate": 3.9666181579866244e-05, "loss": 1.0692, "step": 82 }, { "epoch": 0.22311827956989247, "grad_norm": 0.41439706526743936, "learning_rate": 3.964974744499539e-05, "loss": 1.0865, "step": 83 }, { "epoch": 0.22580645161290322, "grad_norm": 0.38234297411695073, "learning_rate": 3.963292200937551e-05, "loss": 1.0173, "step": 84 }, { "epoch": 0.22849462365591397, "grad_norm": 0.5308750280660687, "learning_rate": 3.961570560806461e-05, "loss": 1.067, "step": 85 }, { "epoch": 0.23118279569892472, "grad_norm": 0.43351295582441124, "learning_rate": 3.959809858390634e-05, "loss": 1.086, "step": 86 }, { "epoch": 0.23387096774193547, "grad_norm": 0.42069712201952686, "learning_rate": 3.9580101287523105e-05, "loss": 1.1064, "step": 87 }, { "epoch": 0.23655913978494625, "grad_norm": 0.42821523209412365, "learning_rate": 3.95617140773091e-05, "loss": 1.0263, "step": 88 }, { "epoch": 0.239247311827957, "grad_norm": 0.4114502165683399, "learning_rate": 3.954293731942319e-05, "loss": 1.0729, "step": 89 }, { "epoch": 0.24193548387096775, "grad_norm": 0.4131919780645225, "learning_rate": 3.95237713877816e-05, "loss": 1.0621, "step": 90 }, { "epoch": 0.2446236559139785, "grad_norm": 0.4433939594965718, "learning_rate": 3.950421666405048e-05, "loss": 1.0805, "step": 91 }, { "epoch": 0.24731182795698925, "grad_norm": 0.4056188018789589, "learning_rate": 3.948427353763829e-05, "loss": 1.0784, "step": 92 }, { "epoch": 0.25, "grad_norm": 0.4642044159391645, "learning_rate": 3.946394240568807e-05, "loss": 1.0406, "step": 93 }, { "epoch": 0.25268817204301075, "grad_norm": 0.4280982724994961, "learning_rate": 3.944322367306951e-05, "loss": 1.1117, "step": 94 }, { "epoch": 0.2553763440860215, "grad_norm": 0.41758547723414086, "learning_rate": 3.942211775237089e-05, "loss": 1.0747, "step": 95 }, { "epoch": 0.25806451612903225, "grad_norm": 0.4344009299837567, "learning_rate": 3.940062506389089e-05, "loss": 1.1249, "step": 96 }, { "epoch": 0.260752688172043, "grad_norm": 0.3847297194838658, "learning_rate": 3.937874603563015e-05, "loss": 1.0977, "step": 97 }, { "epoch": 0.26344086021505375, "grad_norm": 0.4959083398122344, "learning_rate": 3.935648110328285e-05, "loss": 1.041, "step": 98 }, { "epoch": 0.2661290322580645, "grad_norm": 0.46262720954521647, "learning_rate": 3.933383071022795e-05, "loss": 1.0926, "step": 99 }, { "epoch": 0.26881720430107525, "grad_norm": 0.4789561041937064, "learning_rate": 3.93107953075204e-05, "loss": 1.0701, "step": 100 }, { "epoch": 0.271505376344086, "grad_norm": 0.4229869803365367, "learning_rate": 3.928737535388214e-05, "loss": 1.063, "step": 101 }, { "epoch": 0.27419354838709675, "grad_norm": 0.43404703473814416, "learning_rate": 3.9263571315692976e-05, "loss": 1.0696, "step": 102 }, { "epoch": 0.2768817204301075, "grad_norm": 0.4396716028324381, "learning_rate": 3.923938366698129e-05, "loss": 1.0317, "step": 103 }, { "epoch": 0.27956989247311825, "grad_norm": 0.6860340156482403, "learning_rate": 3.921481288941459e-05, "loss": 1.0611, "step": 104 }, { "epoch": 0.28225806451612906, "grad_norm": 0.39601683185098385, "learning_rate": 3.9189859472289956e-05, "loss": 1.0294, "step": 105 }, { "epoch": 0.2849462365591398, "grad_norm": 0.39641986440862376, "learning_rate": 3.9164523912524224e-05, "loss": 1.0663, "step": 106 }, { "epoch": 0.28763440860215056, "grad_norm": 0.3898209322812333, "learning_rate": 3.913880671464418e-05, "loss": 1.0671, "step": 107 }, { "epoch": 0.2903225806451613, "grad_norm": 0.408678962590762, "learning_rate": 3.911270839077644e-05, "loss": 1.0224, "step": 108 }, { "epoch": 0.29301075268817206, "grad_norm": 0.4681397312637908, "learning_rate": 3.908622946063728e-05, "loss": 1.091, "step": 109 }, { "epoch": 0.2956989247311828, "grad_norm": 0.47955178042664964, "learning_rate": 3.9059370451522295e-05, "loss": 1.0961, "step": 110 }, { "epoch": 0.29838709677419356, "grad_norm": 0.4229760577312693, "learning_rate": 3.903213189829589e-05, "loss": 1.0386, "step": 111 }, { "epoch": 0.3010752688172043, "grad_norm": 0.39011319960684926, "learning_rate": 3.900451434338062e-05, "loss": 1.067, "step": 112 }, { "epoch": 0.30376344086021506, "grad_norm": 0.39672904488910227, "learning_rate": 3.8976518336746396e-05, "loss": 1.0424, "step": 113 }, { "epoch": 0.3064516129032258, "grad_norm": 0.49393594827425025, "learning_rate": 3.894814443589954e-05, "loss": 1.0695, "step": 114 }, { "epoch": 0.30913978494623656, "grad_norm": 0.38254416729289076, "learning_rate": 3.8919393205871676e-05, "loss": 1.0801, "step": 115 }, { "epoch": 0.3118279569892473, "grad_norm": 0.4456422459103533, "learning_rate": 3.889026521920847e-05, "loss": 1.0934, "step": 116 }, { "epoch": 0.31451612903225806, "grad_norm": 0.39398196216047476, "learning_rate": 3.886076105595825e-05, "loss": 1.1011, "step": 117 }, { "epoch": 0.3172043010752688, "grad_norm": 0.3949327527665007, "learning_rate": 3.883088130366042e-05, "loss": 1.018, "step": 118 }, { "epoch": 0.31989247311827956, "grad_norm": 0.39254792724729387, "learning_rate": 3.88006265573338e-05, "loss": 1.0607, "step": 119 }, { "epoch": 0.3225806451612903, "grad_norm": 0.5007199853312655, "learning_rate": 3.876999741946478e-05, "loss": 1.0609, "step": 120 }, { "epoch": 0.32526881720430106, "grad_norm": 0.4619751408736227, "learning_rate": 3.873899449999524e-05, "loss": 1.0955, "step": 121 }, { "epoch": 0.3279569892473118, "grad_norm": 0.48219172224114765, "learning_rate": 3.870761841631051e-05, "loss": 1.063, "step": 122 }, { "epoch": 0.33064516129032256, "grad_norm": 0.4054037874416271, "learning_rate": 3.867586979322703e-05, "loss": 1.0907, "step": 123 }, { "epoch": 0.3333333333333333, "grad_norm": 0.43161457507331874, "learning_rate": 3.8643749262979896e-05, "loss": 1.0666, "step": 124 }, { "epoch": 0.33602150537634407, "grad_norm": 0.36751029685084174, "learning_rate": 3.861125746521028e-05, "loss": 1.0557, "step": 125 }, { "epoch": 0.3387096774193548, "grad_norm": 0.46690938120869707, "learning_rate": 3.8578395046952686e-05, "loss": 1.1023, "step": 126 }, { "epoch": 0.34139784946236557, "grad_norm": 0.3988094995343537, "learning_rate": 3.85451626626221e-05, "loss": 1.0717, "step": 127 }, { "epoch": 0.34408602150537637, "grad_norm": 0.48432619617982536, "learning_rate": 3.85115609740009e-05, "loss": 1.0271, "step": 128 }, { "epoch": 0.3467741935483871, "grad_norm": 0.5127948499632843, "learning_rate": 3.8477590650225735e-05, "loss": 1.0575, "step": 129 }, { "epoch": 0.34946236559139787, "grad_norm": 0.4132091412639387, "learning_rate": 3.8443252367774164e-05, "loss": 1.0355, "step": 130 }, { "epoch": 0.3521505376344086, "grad_norm": 0.4439631972175399, "learning_rate": 3.8408546810451176e-05, "loss": 1.0541, "step": 131 }, { "epoch": 0.3548387096774194, "grad_norm": 0.3956247259769062, "learning_rate": 3.837347466937562e-05, "loss": 1.0672, "step": 132 }, { "epoch": 0.3575268817204301, "grad_norm": 0.44952249373265674, "learning_rate": 3.8338036642966396e-05, "loss": 1.0444, "step": 133 }, { "epoch": 0.3602150537634409, "grad_norm": 0.4449484078947791, "learning_rate": 3.830223343692857e-05, "loss": 1.0514, "step": 134 }, { "epoch": 0.3629032258064516, "grad_norm": 0.3905509358873801, "learning_rate": 3.826606576423931e-05, "loss": 1.0394, "step": 135 }, { "epoch": 0.3655913978494624, "grad_norm": 0.4183744146790331, "learning_rate": 3.8229534345133695e-05, "loss": 1.0212, "step": 136 }, { "epoch": 0.3682795698924731, "grad_norm": 0.46086732418604737, "learning_rate": 3.819263990709037e-05, "loss": 0.994, "step": 137 }, { "epoch": 0.3709677419354839, "grad_norm": 0.4468564375555911, "learning_rate": 3.8155383184817064e-05, "loss": 1.0279, "step": 138 }, { "epoch": 0.3736559139784946, "grad_norm": 0.3966511312736679, "learning_rate": 3.8117764920235945e-05, "loss": 0.9992, "step": 139 }, { "epoch": 0.3763440860215054, "grad_norm": 0.46461846433833476, "learning_rate": 3.807978586246887e-05, "loss": 1.088, "step": 140 }, { "epoch": 0.3790322580645161, "grad_norm": 0.4254641795470929, "learning_rate": 3.804144676782243e-05, "loss": 1.0764, "step": 141 }, { "epoch": 0.3817204301075269, "grad_norm": 0.42137203485219293, "learning_rate": 3.800274839977293e-05, "loss": 1.0422, "step": 142 }, { "epoch": 0.3844086021505376, "grad_norm": 0.4172681789743796, "learning_rate": 3.796369152895117e-05, "loss": 1.0453, "step": 143 }, { "epoch": 0.3870967741935484, "grad_norm": 0.4531431509751161, "learning_rate": 3.792427693312707e-05, "loss": 1.0389, "step": 144 }, { "epoch": 0.3897849462365591, "grad_norm": 0.3782466419505299, "learning_rate": 3.788450539719423e-05, "loss": 1.025, "step": 145 }, { "epoch": 0.3924731182795699, "grad_norm": 0.4655605897605627, "learning_rate": 3.7844377713154264e-05, "loss": 1.064, "step": 146 }, { "epoch": 0.3951612903225806, "grad_norm": 0.4384836890227208, "learning_rate": 3.780389468010106e-05, "loss": 1.0397, "step": 147 }, { "epoch": 0.3978494623655914, "grad_norm": 0.4844715439450037, "learning_rate": 3.776305710420482e-05, "loss": 1.1193, "step": 148 }, { "epoch": 0.40053763440860213, "grad_norm": 0.41760675460607827, "learning_rate": 3.7721865798696056e-05, "loss": 1.0124, "step": 149 }, { "epoch": 0.4032258064516129, "grad_norm": 0.7337537478769387, "learning_rate": 3.7680321583849365e-05, "loss": 1.0508, "step": 150 }, { "epoch": 0.40591397849462363, "grad_norm": 0.44725816367920673, "learning_rate": 3.76384252869671e-05, "loss": 1.0434, "step": 151 }, { "epoch": 0.40860215053763443, "grad_norm": 0.40870612635720194, "learning_rate": 3.759617774236292e-05, "loss": 1.068, "step": 152 }, { "epoch": 0.4112903225806452, "grad_norm": 0.4534649483932217, "learning_rate": 3.755357979134511e-05, "loss": 1.0614, "step": 153 }, { "epoch": 0.41397849462365593, "grad_norm": 0.41986572053185917, "learning_rate": 3.751063228219993e-05, "loss": 1.0391, "step": 154 }, { "epoch": 0.4166666666666667, "grad_norm": 0.3717380879536067, "learning_rate": 3.7467336070174604e-05, "loss": 1.0378, "step": 155 }, { "epoch": 0.41935483870967744, "grad_norm": 0.41848537015206944, "learning_rate": 3.742369201746038e-05, "loss": 1.0439, "step": 156 }, { "epoch": 0.4220430107526882, "grad_norm": 0.43443932018052933, "learning_rate": 3.737970099317535e-05, "loss": 1.0197, "step": 157 }, { "epoch": 0.42473118279569894, "grad_norm": 0.421554546653683, "learning_rate": 3.7335363873347056e-05, "loss": 1.0487, "step": 158 }, { "epoch": 0.4274193548387097, "grad_norm": 0.8430023271255561, "learning_rate": 3.729068154089519e-05, "loss": 1.0333, "step": 159 }, { "epoch": 0.43010752688172044, "grad_norm": 0.4363044724173691, "learning_rate": 3.724565488561387e-05, "loss": 1.0213, "step": 160 }, { "epoch": 0.4327956989247312, "grad_norm": 0.5335682969510431, "learning_rate": 3.720028480415401e-05, "loss": 1.0205, "step": 161 }, { "epoch": 0.43548387096774194, "grad_norm": 0.4056834135687678, "learning_rate": 3.7154572200005446e-05, "loss": 1.0311, "step": 162 }, { "epoch": 0.4381720430107527, "grad_norm": 0.5322107401886871, "learning_rate": 3.710851798347891e-05, "loss": 1.0601, "step": 163 }, { "epoch": 0.44086021505376344, "grad_norm": 0.4138677278304246, "learning_rate": 3.7062123071687944e-05, "loss": 1.0361, "step": 164 }, { "epoch": 0.4435483870967742, "grad_norm": 0.4775100325512625, "learning_rate": 3.701538838853062e-05, "loss": 1.0194, "step": 165 }, { "epoch": 0.44623655913978494, "grad_norm": 0.40839482534046995, "learning_rate": 3.696831486467114e-05, "loss": 1.0463, "step": 166 }, { "epoch": 0.4489247311827957, "grad_norm": 0.3963093446633738, "learning_rate": 3.6920903437521305e-05, "loss": 1.0238, "step": 167 }, { "epoch": 0.45161290322580644, "grad_norm": 0.4344752184390704, "learning_rate": 3.6873155051221846e-05, "loss": 1.0472, "step": 168 }, { "epoch": 0.4543010752688172, "grad_norm": 0.4167014186949368, "learning_rate": 3.6825070656623626e-05, "loss": 1.0599, "step": 169 }, { "epoch": 0.45698924731182794, "grad_norm": 0.43904590007956124, "learning_rate": 3.677665121126871e-05, "loss": 1.0559, "step": 170 }, { "epoch": 0.4596774193548387, "grad_norm": 0.372185063148541, "learning_rate": 3.6727897679371276e-05, "loss": 1.0012, "step": 171 }, { "epoch": 0.46236559139784944, "grad_norm": 0.43086731351488916, "learning_rate": 3.667881103179844e-05, "loss": 1.0133, "step": 172 }, { "epoch": 0.4650537634408602, "grad_norm": 0.5796354347464544, "learning_rate": 3.662939224605091e-05, "loss": 1.0517, "step": 173 }, { "epoch": 0.46774193548387094, "grad_norm": 0.4587453684541154, "learning_rate": 3.657964230624351e-05, "loss": 1.0164, "step": 174 }, { "epoch": 0.47043010752688175, "grad_norm": 0.5102852182866393, "learning_rate": 3.6529562203085595e-05, "loss": 1.052, "step": 175 }, { "epoch": 0.4731182795698925, "grad_norm": 0.4469591346380821, "learning_rate": 3.6479152933861336e-05, "loss": 1.0905, "step": 176 }, { "epoch": 0.47580645161290325, "grad_norm": 0.45277428352010624, "learning_rate": 3.642841550240983e-05, "loss": 1.0961, "step": 177 }, { "epoch": 0.478494623655914, "grad_norm": 0.45588595960031525, "learning_rate": 3.6377350919105136e-05, "loss": 1.0178, "step": 178 }, { "epoch": 0.48118279569892475, "grad_norm": 0.6147997034643559, "learning_rate": 3.632596020083612e-05, "loss": 1.0148, "step": 179 }, { "epoch": 0.4838709677419355, "grad_norm": 0.3734326271789308, "learning_rate": 3.627424437098625e-05, "loss": 1.0006, "step": 180 }, { "epoch": 0.48655913978494625, "grad_norm": 0.4564187594173089, "learning_rate": 3.6222204459413186e-05, "loss": 1.0635, "step": 181 }, { "epoch": 0.489247311827957, "grad_norm": 0.42811733614493086, "learning_rate": 3.6169841502428285e-05, "loss": 1.0469, "step": 182 }, { "epoch": 0.49193548387096775, "grad_norm": 0.4227875509642681, "learning_rate": 3.611715654277596e-05, "loss": 1.0446, "step": 183 }, { "epoch": 0.4946236559139785, "grad_norm": 0.40548546169007965, "learning_rate": 3.60641506296129e-05, "loss": 1.0564, "step": 184 }, { "epoch": 0.49731182795698925, "grad_norm": 0.4161116484325749, "learning_rate": 3.601082481848721e-05, "loss": 0.9917, "step": 185 }, { "epoch": 0.5, "grad_norm": 0.39180067540636987, "learning_rate": 3.595718017131736e-05, "loss": 1.0081, "step": 186 }, { "epoch": 0.5026881720430108, "grad_norm": 0.5307122561583237, "learning_rate": 3.5903217756371066e-05, "loss": 0.9972, "step": 187 }, { "epoch": 0.5053763440860215, "grad_norm": 0.4633315164676552, "learning_rate": 3.5848938648243976e-05, "loss": 1.0196, "step": 188 }, { "epoch": 0.5080645161290323, "grad_norm": 0.43457272116367207, "learning_rate": 3.579434392783832e-05, "loss": 1.0429, "step": 189 }, { "epoch": 0.510752688172043, "grad_norm": 0.42602042879132207, "learning_rate": 3.5739434682341355e-05, "loss": 1.0355, "step": 190 }, { "epoch": 0.5134408602150538, "grad_norm": 0.37328410492227004, "learning_rate": 3.568421200520371e-05, "loss": 1.0158, "step": 191 }, { "epoch": 0.5161290322580645, "grad_norm": 0.47901349260363574, "learning_rate": 3.562867699611764e-05, "loss": 1.006, "step": 192 }, { "epoch": 0.5188172043010753, "grad_norm": 0.6800894155552869, "learning_rate": 3.55728307609951e-05, "loss": 1.0819, "step": 193 }, { "epoch": 0.521505376344086, "grad_norm": 0.6815573295093794, "learning_rate": 3.5516674411945747e-05, "loss": 0.9767, "step": 194 }, { "epoch": 0.5241935483870968, "grad_norm": 0.40923877696875666, "learning_rate": 3.546020906725474e-05, "loss": 1.0048, "step": 195 }, { "epoch": 0.5268817204301075, "grad_norm": 0.39166638466881304, "learning_rate": 3.540343585136056e-05, "loss": 1.0115, "step": 196 }, { "epoch": 0.5295698924731183, "grad_norm": 0.46039879078749524, "learning_rate": 3.5346355894832515e-05, "loss": 1.0274, "step": 197 }, { "epoch": 0.532258064516129, "grad_norm": 0.435003701062386, "learning_rate": 3.5288970334348324e-05, "loss": 1.0262, "step": 198 }, { "epoch": 0.5349462365591398, "grad_norm": 0.46422099557675184, "learning_rate": 3.5231280312671426e-05, "loss": 1.0406, "step": 199 }, { "epoch": 0.5376344086021505, "grad_norm": 0.3946242892533647, "learning_rate": 3.51732869786282e-05, "loss": 1.0351, "step": 200 }, { "epoch": 0.5403225806451613, "grad_norm": 0.4593963303455073, "learning_rate": 3.511499148708517e-05, "loss": 1.0161, "step": 201 }, { "epoch": 0.543010752688172, "grad_norm": 0.43211273427185715, "learning_rate": 3.505639499892591e-05, "loss": 1.0339, "step": 202 }, { "epoch": 0.5456989247311828, "grad_norm": 0.4638011311631454, "learning_rate": 3.499749868102802e-05, "loss": 1.0195, "step": 203 }, { "epoch": 0.5483870967741935, "grad_norm": 0.4606785516075864, "learning_rate": 3.4938303706239814e-05, "loss": 1.0809, "step": 204 }, { "epoch": 0.5510752688172043, "grad_norm": 0.4750835163830621, "learning_rate": 3.487881125335699e-05, "loss": 1.0104, "step": 205 }, { "epoch": 0.553763440860215, "grad_norm": 0.48069623342657913, "learning_rate": 3.4819022507099184e-05, "loss": 1.0534, "step": 206 }, { "epoch": 0.5564516129032258, "grad_norm": 0.4485052357605267, "learning_rate": 3.475893865808633e-05, "loss": 1.008, "step": 207 }, { "epoch": 0.5591397849462365, "grad_norm": 0.45226568470539963, "learning_rate": 3.4698560902815e-05, "loss": 0.9859, "step": 208 }, { "epoch": 0.5618279569892473, "grad_norm": 0.4556713744237398, "learning_rate": 3.463789044363451e-05, "loss": 1.0468, "step": 209 }, { "epoch": 0.5645161290322581, "grad_norm": 0.40515419542450315, "learning_rate": 3.4576928488723056e-05, "loss": 1.0069, "step": 210 }, { "epoch": 0.5672043010752689, "grad_norm": 0.407850239298829, "learning_rate": 3.4515676252063595e-05, "loss": 1.024, "step": 211 }, { "epoch": 0.5698924731182796, "grad_norm": 0.4245125668059516, "learning_rate": 3.445413495341971e-05, "loss": 0.9842, "step": 212 }, { "epoch": 0.5725806451612904, "grad_norm": 0.5282266357639802, "learning_rate": 3.439230581831126e-05, "loss": 1.0511, "step": 213 }, { "epoch": 0.5752688172043011, "grad_norm": 0.46721556238008377, "learning_rate": 3.433019007799007e-05, "loss": 1.0722, "step": 214 }, { "epoch": 0.5779569892473119, "grad_norm": 0.3998174935596331, "learning_rate": 3.4267788969415315e-05, "loss": 1.0417, "step": 215 }, { "epoch": 0.5806451612903226, "grad_norm": 0.39836497217157424, "learning_rate": 3.420510373522896e-05, "loss": 0.9522, "step": 216 }, { "epoch": 0.5833333333333334, "grad_norm": 0.5604060165845736, "learning_rate": 3.4142135623730954e-05, "loss": 1.0406, "step": 217 }, { "epoch": 0.5860215053763441, "grad_norm": 0.4626752931850209, "learning_rate": 3.4078885888854436e-05, "loss": 1.0403, "step": 218 }, { "epoch": 0.5887096774193549, "grad_norm": 0.4119865874583256, "learning_rate": 3.4015355790140715e-05, "loss": 0.974, "step": 219 }, { "epoch": 0.5913978494623656, "grad_norm": 0.41688760669607, "learning_rate": 3.39515465927142e-05, "loss": 1.0354, "step": 220 }, { "epoch": 0.5940860215053764, "grad_norm": 0.47263736408876167, "learning_rate": 3.388745956725722e-05, "loss": 1.0438, "step": 221 }, { "epoch": 0.5967741935483871, "grad_norm": 0.48712838990373963, "learning_rate": 3.3823095989984697e-05, "loss": 0.9847, "step": 222 }, { "epoch": 0.5994623655913979, "grad_norm": 0.39317905049275836, "learning_rate": 3.3758457142618754e-05, "loss": 0.9806, "step": 223 }, { "epoch": 0.6021505376344086, "grad_norm": 0.484001386994586, "learning_rate": 3.369354431236319e-05, "loss": 1.0003, "step": 224 }, { "epoch": 0.6048387096774194, "grad_norm": 0.3896751020684252, "learning_rate": 3.362835879187783e-05, "loss": 0.9314, "step": 225 }, { "epoch": 0.6075268817204301, "grad_norm": 0.402131340210077, "learning_rate": 3.356290187925278e-05, "loss": 0.957, "step": 226 }, { "epoch": 0.6102150537634409, "grad_norm": 0.4442069284277535, "learning_rate": 3.349717487798261e-05, "loss": 1.0651, "step": 227 }, { "epoch": 0.6129032258064516, "grad_norm": 0.4075067959077034, "learning_rate": 3.3431179096940375e-05, "loss": 1.0117, "step": 228 }, { "epoch": 0.6155913978494624, "grad_norm": 0.4595977891340027, "learning_rate": 3.3364915850351525e-05, "loss": 1.0277, "step": 229 }, { "epoch": 0.6182795698924731, "grad_norm": 0.41565240224286376, "learning_rate": 3.3298386457767804e-05, "loss": 0.9873, "step": 230 }, { "epoch": 0.6209677419354839, "grad_norm": 0.400290934516727, "learning_rate": 3.3231592244040885e-05, "loss": 1.0503, "step": 231 }, { "epoch": 0.6236559139784946, "grad_norm": 0.43593503744528256, "learning_rate": 3.3164534539296056e-05, "loss": 1.0256, "step": 232 }, { "epoch": 0.6263440860215054, "grad_norm": 0.4297576409774745, "learning_rate": 3.309721467890571e-05, "loss": 0.9873, "step": 233 }, { "epoch": 0.6290322580645161, "grad_norm": 0.5286155107560961, "learning_rate": 3.302963400346272e-05, "loss": 1.0526, "step": 234 }, { "epoch": 0.6317204301075269, "grad_norm": 0.4080215430723157, "learning_rate": 3.296179385875381e-05, "loss": 0.993, "step": 235 }, { "epoch": 0.6344086021505376, "grad_norm": 0.4666697414536282, "learning_rate": 3.2893695595732705e-05, "loss": 0.9855, "step": 236 }, { "epoch": 0.6370967741935484, "grad_norm": 0.44576593027115785, "learning_rate": 3.282534057049322e-05, "loss": 0.994, "step": 237 }, { "epoch": 0.6397849462365591, "grad_norm": 0.45875921319019286, "learning_rate": 3.275673014424231e-05, "loss": 1.0695, "step": 238 }, { "epoch": 0.6424731182795699, "grad_norm": 0.4483391985101821, "learning_rate": 3.268786568327291e-05, "loss": 1.0413, "step": 239 }, { "epoch": 0.6451612903225806, "grad_norm": 0.3823024947210084, "learning_rate": 3.261874855893675e-05, "loss": 1.0634, "step": 240 }, { "epoch": 0.6478494623655914, "grad_norm": 0.42590418591004187, "learning_rate": 3.254938014761704e-05, "loss": 1.1039, "step": 241 }, { "epoch": 0.6505376344086021, "grad_norm": 0.4436207874701427, "learning_rate": 3.2479761830701075e-05, "loss": 1.0797, "step": 242 }, { "epoch": 0.6532258064516129, "grad_norm": 0.5436242022516592, "learning_rate": 3.240989499455269e-05, "loss": 0.998, "step": 243 }, { "epoch": 0.6559139784946236, "grad_norm": 0.42461660808494955, "learning_rate": 3.2339781030484715e-05, "loss": 1.0014, "step": 244 }, { "epoch": 0.6586021505376344, "grad_norm": 0.4147658974390641, "learning_rate": 3.2269421334731196e-05, "loss": 1.0047, "step": 245 }, { "epoch": 0.6612903225806451, "grad_norm": 0.3702000902999608, "learning_rate": 3.219881730841964e-05, "loss": 1.0057, "step": 246 }, { "epoch": 0.6639784946236559, "grad_norm": 0.37405944820555137, "learning_rate": 3.212797035754311e-05, "loss": 0.9881, "step": 247 }, { "epoch": 0.6666666666666666, "grad_norm": 0.39789221907192235, "learning_rate": 3.205688189293219e-05, "loss": 1.002, "step": 248 }, { "epoch": 0.6693548387096774, "grad_norm": 0.35269099760384387, "learning_rate": 3.198555333022694e-05, "loss": 1.0445, "step": 249 }, { "epoch": 0.6720430107526881, "grad_norm": 0.39171670743365294, "learning_rate": 3.191398608984867e-05, "loss": 0.9873, "step": 250 }, { "epoch": 0.6747311827956989, "grad_norm": 0.36377972714827284, "learning_rate": 3.184218159697166e-05, "loss": 0.9678, "step": 251 }, { "epoch": 0.6774193548387096, "grad_norm": 0.4760701686418637, "learning_rate": 3.177014128149479e-05, "loss": 1.0475, "step": 252 }, { "epoch": 0.6801075268817204, "grad_norm": 0.36306748600915323, "learning_rate": 3.169786657801306e-05, "loss": 0.9737, "step": 253 }, { "epoch": 0.6827956989247311, "grad_norm": 0.36397370143939106, "learning_rate": 3.162535892578903e-05, "loss": 1.0009, "step": 254 }, { "epoch": 0.6854838709677419, "grad_norm": 0.41923544253489314, "learning_rate": 3.155261976872412e-05, "loss": 0.9855, "step": 255 }, { "epoch": 0.6881720430107527, "grad_norm": 0.4349008134787599, "learning_rate": 3.147965055532991e-05, "loss": 0.9843, "step": 256 }, { "epoch": 0.6908602150537635, "grad_norm": 0.4403161475473632, "learning_rate": 3.1406452738699284e-05, "loss": 0.9932, "step": 257 }, { "epoch": 0.6935483870967742, "grad_norm": 0.4088632034626185, "learning_rate": 3.1333027776477454e-05, "loss": 1.0175, "step": 258 }, { "epoch": 0.696236559139785, "grad_norm": 0.4089626667866183, "learning_rate": 3.125937713083296e-05, "loss": 0.9957, "step": 259 }, { "epoch": 0.6989247311827957, "grad_norm": 0.44005061948101687, "learning_rate": 3.118550226842857e-05, "loss": 0.9902, "step": 260 }, { "epoch": 0.7016129032258065, "grad_norm": 1.1016022022748841, "learning_rate": 3.111140466039205e-05, "loss": 0.991, "step": 261 }, { "epoch": 0.7043010752688172, "grad_norm": 0.39448956783294353, "learning_rate": 3.103708578228686e-05, "loss": 1.0041, "step": 262 }, { "epoch": 0.706989247311828, "grad_norm": 0.41388488702273174, "learning_rate": 3.0962547114082804e-05, "loss": 0.9928, "step": 263 }, { "epoch": 0.7096774193548387, "grad_norm": 0.4065224464102798, "learning_rate": 3.088779014012652e-05, "loss": 0.9859, "step": 264 }, { "epoch": 0.7123655913978495, "grad_norm": 0.39952347811781436, "learning_rate": 3.0812816349111956e-05, "loss": 0.9613, "step": 265 }, { "epoch": 0.7150537634408602, "grad_norm": 0.43554876713734897, "learning_rate": 3.073762723405069e-05, "loss": 1.0289, "step": 266 }, { "epoch": 0.717741935483871, "grad_norm": 0.469813057633801, "learning_rate": 3.066222429224221e-05, "loss": 1.0438, "step": 267 }, { "epoch": 0.7204301075268817, "grad_norm": 0.4353123605440106, "learning_rate": 3.0586609025244144e-05, "loss": 1.0017, "step": 268 }, { "epoch": 0.7231182795698925, "grad_norm": 0.40010712539262144, "learning_rate": 3.051078293884226e-05, "loss": 1.0254, "step": 269 }, { "epoch": 0.7258064516129032, "grad_norm": 0.41179768187019394, "learning_rate": 3.0434747543020585e-05, "loss": 1.0167, "step": 270 }, { "epoch": 0.728494623655914, "grad_norm": 0.39261397155250993, "learning_rate": 3.0358504351931265e-05, "loss": 0.9987, "step": 271 }, { "epoch": 0.7311827956989247, "grad_norm": 0.4037853365263608, "learning_rate": 3.0282054883864434e-05, "loss": 1.0016, "step": 272 }, { "epoch": 0.7338709677419355, "grad_norm": 0.3920371074761728, "learning_rate": 3.0205400661218e-05, "loss": 0.9427, "step": 273 }, { "epoch": 0.7365591397849462, "grad_norm": 0.4525036893342772, "learning_rate": 3.0128543210467273e-05, "loss": 1.0566, "step": 274 }, { "epoch": 0.739247311827957, "grad_norm": 0.41264407607647574, "learning_rate": 3.0051484062134632e-05, "loss": 0.9899, "step": 275 }, { "epoch": 0.7419354838709677, "grad_norm": 0.37437706613357397, "learning_rate": 2.9974224750759017e-05, "loss": 0.9817, "step": 276 }, { "epoch": 0.7446236559139785, "grad_norm": 0.3844600838817203, "learning_rate": 2.9896766814865355e-05, "loss": 1.0263, "step": 277 }, { "epoch": 0.7473118279569892, "grad_norm": 0.4310511049000039, "learning_rate": 2.9819111796933948e-05, "loss": 0.9781, "step": 278 }, { "epoch": 0.75, "grad_norm": 0.40281595760365946, "learning_rate": 2.9741261243369746e-05, "loss": 1.0273, "step": 279 }, { "epoch": 0.7526881720430108, "grad_norm": 0.4498302856339957, "learning_rate": 2.9663216704471547e-05, "loss": 0.9886, "step": 280 }, { "epoch": 0.7553763440860215, "grad_norm": 0.4350406167421517, "learning_rate": 2.958497973440114e-05, "loss": 1.0247, "step": 281 }, { "epoch": 0.7580645161290323, "grad_norm": 0.46748351737565624, "learning_rate": 2.9506551891152334e-05, "loss": 1.0072, "step": 282 }, { "epoch": 0.760752688172043, "grad_norm": 0.3998308958015181, "learning_rate": 2.9427934736519962e-05, "loss": 1.076, "step": 283 }, { "epoch": 0.7634408602150538, "grad_norm": 0.42326867383664013, "learning_rate": 2.9349129836068732e-05, "loss": 0.9895, "step": 284 }, { "epoch": 0.7661290322580645, "grad_norm": 0.3949205497118407, "learning_rate": 2.9270138759102108e-05, "loss": 1.027, "step": 285 }, { "epoch": 0.7688172043010753, "grad_norm": 0.40826149975955933, "learning_rate": 2.919096307863104e-05, "loss": 1.0128, "step": 286 }, { "epoch": 0.771505376344086, "grad_norm": 0.6045575439891937, "learning_rate": 2.9111604371342593e-05, "loss": 0.9806, "step": 287 }, { "epoch": 0.7741935483870968, "grad_norm": 0.3906743864943639, "learning_rate": 2.903206421756862e-05, "loss": 1.0126, "step": 288 }, { "epoch": 0.7768817204301075, "grad_norm": 0.37994713789537804, "learning_rate": 2.8952344201254253e-05, "loss": 0.9984, "step": 289 }, { "epoch": 0.7795698924731183, "grad_norm": 0.4560671009564336, "learning_rate": 2.8872445909926358e-05, "loss": 0.9846, "step": 290 }, { "epoch": 0.782258064516129, "grad_norm": 0.40231158085064994, "learning_rate": 2.8792370934661948e-05, "loss": 1.0403, "step": 291 }, { "epoch": 0.7849462365591398, "grad_norm": 0.4776678536973747, "learning_rate": 2.8712120870056455e-05, "loss": 1.0327, "step": 292 }, { "epoch": 0.7876344086021505, "grad_norm": 0.45302618010000684, "learning_rate": 2.8631697314192012e-05, "loss": 1.0126, "step": 293 }, { "epoch": 0.7903225806451613, "grad_norm": 0.4332121059542856, "learning_rate": 2.8551101868605644e-05, "loss": 1.0475, "step": 294 }, { "epoch": 0.793010752688172, "grad_norm": 0.4498441085262953, "learning_rate": 2.8470336138257315e-05, "loss": 1.0178, "step": 295 }, { "epoch": 0.7956989247311828, "grad_norm": 0.39208633969875073, "learning_rate": 2.8389401731498018e-05, "loss": 1.0127, "step": 296 }, { "epoch": 0.7983870967741935, "grad_norm": 0.4042053763726035, "learning_rate": 2.8308300260037734e-05, "loss": 0.9732, "step": 297 }, { "epoch": 0.8010752688172043, "grad_norm": 0.42842239164240437, "learning_rate": 2.8227033338913318e-05, "loss": 1.0152, "step": 298 }, { "epoch": 0.803763440860215, "grad_norm": 0.3807866452863404, "learning_rate": 2.814560258645638e-05, "loss": 1.0189, "step": 299 }, { "epoch": 0.8064516129032258, "grad_norm": 0.43852909963759557, "learning_rate": 2.8064009624260994e-05, "loss": 1.0084, "step": 300 }, { "epoch": 0.8091397849462365, "grad_norm": 0.5122035327018767, "learning_rate": 2.7982256077151482e-05, "loss": 1.0098, "step": 301 }, { "epoch": 0.8118279569892473, "grad_norm": 0.38079784946729706, "learning_rate": 2.7900343573150003e-05, "loss": 1.0097, "step": 302 }, { "epoch": 0.8145161290322581, "grad_norm": 0.3583539130301541, "learning_rate": 2.7818273743444132e-05, "loss": 0.9964, "step": 303 }, { "epoch": 0.8172043010752689, "grad_norm": 0.3813956107048218, "learning_rate": 2.7736048222354414e-05, "loss": 0.9761, "step": 304 }, { "epoch": 0.8198924731182796, "grad_norm": 0.3901758217275271, "learning_rate": 2.7653668647301797e-05, "loss": 1.0117, "step": 305 }, { "epoch": 0.8225806451612904, "grad_norm": 0.41237780052722667, "learning_rate": 2.757113665877502e-05, "loss": 0.9653, "step": 306 }, { "epoch": 0.8252688172043011, "grad_norm": 0.457306901223017, "learning_rate": 2.748845390029794e-05, "loss": 1.0524, "step": 307 }, { "epoch": 0.8279569892473119, "grad_norm": 0.3791723859065832, "learning_rate": 2.740562201839684e-05, "loss": 0.9861, "step": 308 }, { "epoch": 0.8306451612903226, "grad_norm": 0.500338650948681, "learning_rate": 2.7322642662567592e-05, "loss": 0.9705, "step": 309 }, { "epoch": 0.8333333333333334, "grad_norm": 0.4052884593861236, "learning_rate": 2.7239517485242836e-05, "loss": 0.9892, "step": 310 }, { "epoch": 0.8360215053763441, "grad_norm": 0.3969000439893693, "learning_rate": 2.715624814175907e-05, "loss": 0.9883, "step": 311 }, { "epoch": 0.8387096774193549, "grad_norm": 0.5254585071566374, "learning_rate": 2.7072836290323698e-05, "loss": 1.08, "step": 312 }, { "epoch": 0.8413978494623656, "grad_norm": 0.5111475952965409, "learning_rate": 2.698928359198197e-05, "loss": 1.0526, "step": 313 }, { "epoch": 0.8440860215053764, "grad_norm": 0.4717493748353866, "learning_rate": 2.6905591710583957e-05, "loss": 1.0137, "step": 314 }, { "epoch": 0.8467741935483871, "grad_norm": 0.3838063749897804, "learning_rate": 2.6821762312751368e-05, "loss": 0.9901, "step": 315 }, { "epoch": 0.8494623655913979, "grad_norm": 0.3456617314343378, "learning_rate": 2.6737797067844403e-05, "loss": 1.0034, "step": 316 }, { "epoch": 0.8521505376344086, "grad_norm": 0.37971130684639953, "learning_rate": 2.6653697647928485e-05, "loss": 0.9552, "step": 317 }, { "epoch": 0.8548387096774194, "grad_norm": 0.3820801267530888, "learning_rate": 2.656946572774095e-05, "loss": 0.9236, "step": 318 }, { "epoch": 0.8575268817204301, "grad_norm": 0.4114917943590629, "learning_rate": 2.648510298465775e-05, "loss": 1.0, "step": 319 }, { "epoch": 0.8602150537634409, "grad_norm": 0.4185665498381875, "learning_rate": 2.6400611098659988e-05, "loss": 1.0435, "step": 320 }, { "epoch": 0.8629032258064516, "grad_norm": 0.36227121606774076, "learning_rate": 2.6315991752300503e-05, "loss": 0.9797, "step": 321 }, { "epoch": 0.8655913978494624, "grad_norm": 0.40186567244596927, "learning_rate": 2.623124663067034e-05, "loss": 1.0071, "step": 322 }, { "epoch": 0.8682795698924731, "grad_norm": 0.3833356371805648, "learning_rate": 2.6146377421365225e-05, "loss": 1.0159, "step": 323 }, { "epoch": 0.8709677419354839, "grad_norm": 0.41469411381713683, "learning_rate": 2.6061385814451913e-05, "loss": 1.0277, "step": 324 }, { "epoch": 0.8736559139784946, "grad_norm": 0.92622435409038, "learning_rate": 2.5976273502434584e-05, "loss": 1.0001, "step": 325 }, { "epoch": 0.8763440860215054, "grad_norm": 0.4316506228630945, "learning_rate": 2.5891042180221094e-05, "loss": 1.0712, "step": 326 }, { "epoch": 0.8790322580645161, "grad_norm": 0.42656057546508047, "learning_rate": 2.580569354508925e-05, "loss": 1.0074, "step": 327 }, { "epoch": 0.8817204301075269, "grad_norm": 0.3789318712710433, "learning_rate": 2.5720229296653006e-05, "loss": 1.0355, "step": 328 }, { "epoch": 0.8844086021505376, "grad_norm": 0.367154670317836, "learning_rate": 2.5634651136828597e-05, "loss": 1.0394, "step": 329 }, { "epoch": 0.8870967741935484, "grad_norm": 0.4735001007157819, "learning_rate": 2.554896076980069e-05, "loss": 1.0552, "step": 330 }, { "epoch": 0.8897849462365591, "grad_norm": 0.4390567460028508, "learning_rate": 2.54631599019884e-05, "loss": 1.0043, "step": 331 }, { "epoch": 0.8924731182795699, "grad_norm": 0.3642787415401991, "learning_rate": 2.5377250242011338e-05, "loss": 0.9854, "step": 332 }, { "epoch": 0.8951612903225806, "grad_norm": 0.4524235630593109, "learning_rate": 2.5291233500655584e-05, "loss": 1.0029, "step": 333 }, { "epoch": 0.8978494623655914, "grad_norm": 0.4097887869063476, "learning_rate": 2.52051113908396e-05, "loss": 1.0122, "step": 334 }, { "epoch": 0.9005376344086021, "grad_norm": 0.3852040955735104, "learning_rate": 2.5118885627580155e-05, "loss": 0.9779, "step": 335 }, { "epoch": 0.9032258064516129, "grad_norm": 0.40481656602470306, "learning_rate": 2.5032557927958116e-05, "loss": 1.0125, "step": 336 }, { "epoch": 0.9059139784946236, "grad_norm": 0.4118716752579493, "learning_rate": 2.494613001108431e-05, "loss": 1.0364, "step": 337 }, { "epoch": 0.9086021505376344, "grad_norm": 0.4489453038959667, "learning_rate": 2.485960359806528e-05, "loss": 1.0436, "step": 338 }, { "epoch": 0.9112903225806451, "grad_norm": 0.41112406404210244, "learning_rate": 2.4772980411968975e-05, "loss": 0.9545, "step": 339 }, { "epoch": 0.9139784946236559, "grad_norm": 0.4856093390929945, "learning_rate": 2.468626217779047e-05, "loss": 0.9854, "step": 340 }, { "epoch": 0.9166666666666666, "grad_norm": 0.37523760134058665, "learning_rate": 2.4599450622417615e-05, "loss": 0.9699, "step": 341 }, { "epoch": 0.9193548387096774, "grad_norm": 0.4064413347216363, "learning_rate": 2.4512547474596624e-05, "loss": 1.0083, "step": 342 }, { "epoch": 0.9220430107526881, "grad_norm": 0.44550717714004195, "learning_rate": 2.4425554464897675e-05, "loss": 1.0175, "step": 343 }, { "epoch": 0.9247311827956989, "grad_norm": 0.44076297740074416, "learning_rate": 2.433847332568042e-05, "loss": 0.9718, "step": 344 }, { "epoch": 0.9274193548387096, "grad_norm": 0.4971040038925624, "learning_rate": 2.4251305791059533e-05, "loss": 1.0317, "step": 345 }, { "epoch": 0.9301075268817204, "grad_norm": 0.35978037050758516, "learning_rate": 2.416405359687012e-05, "loss": 0.9693, "step": 346 }, { "epoch": 0.9327956989247311, "grad_norm": 0.41817202738352904, "learning_rate": 2.4076718480633178e-05, "loss": 0.9764, "step": 347 }, { "epoch": 0.9354838709677419, "grad_norm": 0.4130988765844788, "learning_rate": 2.398930218152101e-05, "loss": 0.9548, "step": 348 }, { "epoch": 0.9381720430107527, "grad_norm": 0.47899471351234146, "learning_rate": 2.390180644032257e-05, "loss": 0.9965, "step": 349 }, { "epoch": 0.9408602150537635, "grad_norm": 0.3639159912649112, "learning_rate": 2.38142329994088e-05, "loss": 0.945, "step": 350 }, { "epoch": 0.9435483870967742, "grad_norm": 0.41552533932477614, "learning_rate": 2.372658360269796e-05, "loss": 0.976, "step": 351 }, { "epoch": 0.946236559139785, "grad_norm": 0.4127471276078075, "learning_rate": 2.363885999562084e-05, "loss": 1.0493, "step": 352 }, { "epoch": 0.9489247311827957, "grad_norm": 0.42874463629780296, "learning_rate": 2.3551063925086072e-05, "loss": 1.0003, "step": 353 }, { "epoch": 0.9516129032258065, "grad_norm": 0.4542236208271591, "learning_rate": 2.3463197139445284e-05, "loss": 1.0189, "step": 354 }, { "epoch": 0.9543010752688172, "grad_norm": 0.8840248169596676, "learning_rate": 2.3375261388458318e-05, "loss": 1.0006, "step": 355 }, { "epoch": 0.956989247311828, "grad_norm": 0.47762507803159143, "learning_rate": 2.3287258423258405e-05, "loss": 1.0101, "step": 356 }, { "epoch": 0.9596774193548387, "grad_norm": 0.42765004964798886, "learning_rate": 2.3199189996317205e-05, "loss": 0.9896, "step": 357 }, { "epoch": 0.9623655913978495, "grad_norm": 0.4236101839000849, "learning_rate": 2.3111057861410026e-05, "loss": 0.9931, "step": 358 }, { "epoch": 0.9650537634408602, "grad_norm": 0.38884571703952686, "learning_rate": 2.3022863773580813e-05, "loss": 0.9394, "step": 359 }, { "epoch": 0.967741935483871, "grad_norm": 0.5378824587688318, "learning_rate": 2.2934609489107236e-05, "loss": 0.9842, "step": 360 }, { "epoch": 0.9704301075268817, "grad_norm": 0.39925462372416454, "learning_rate": 2.2846296765465708e-05, "loss": 1.0026, "step": 361 }, { "epoch": 0.9731182795698925, "grad_norm": 0.9592078982505338, "learning_rate": 2.2757927361296376e-05, "loss": 1.0332, "step": 362 }, { "epoch": 0.9758064516129032, "grad_norm": 0.4396877320552629, "learning_rate": 2.2669503036368124e-05, "loss": 0.9971, "step": 363 }, { "epoch": 0.978494623655914, "grad_norm": 0.38966539914800313, "learning_rate": 2.2581025551543516e-05, "loss": 0.9469, "step": 364 }, { "epoch": 0.9811827956989247, "grad_norm": 0.4216276354211585, "learning_rate": 2.249249666874372e-05, "loss": 1.0322, "step": 365 }, { "epoch": 0.9838709677419355, "grad_norm": 0.4351959975704115, "learning_rate": 2.240391815091344e-05, "loss": 0.962, "step": 366 }, { "epoch": 0.9865591397849462, "grad_norm": 0.35811079366878923, "learning_rate": 2.2315291761985803e-05, "loss": 0.9937, "step": 367 }, { "epoch": 0.989247311827957, "grad_norm": 0.3605918004740936, "learning_rate": 2.222661926684722e-05, "loss": 0.991, "step": 368 }, { "epoch": 0.9919354838709677, "grad_norm": 0.4176512601533839, "learning_rate": 2.2137902431302264e-05, "loss": 1.0332, "step": 369 }, { "epoch": 0.9946236559139785, "grad_norm": 0.42340462982190896, "learning_rate": 2.2049143022038472e-05, "loss": 0.9922, "step": 370 }, { "epoch": 0.9973118279569892, "grad_norm": 0.420010163587815, "learning_rate": 2.196034280659122e-05, "loss": 1.0155, "step": 371 }, { "epoch": 1.0, "grad_norm": 0.41657151819377736, "learning_rate": 2.1871503553308447e-05, "loss": 0.9901, "step": 372 } ], "logging_steps": 1, "max_steps": 744, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 186, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1543405087752192.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }