| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.5, | |
| "eval_steps": 500, | |
| "global_step": 186, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.002688172043010753, | |
| "grad_norm": 1.6433222600981285, | |
| "learning_rate": 0.0, | |
| "loss": 1.562, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.005376344086021506, | |
| "grad_norm": 1.6862631068558513, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": 1.4706, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.008064516129032258, | |
| "grad_norm": 1.7423201097805276, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 1.5406, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.010752688172043012, | |
| "grad_norm": 1.7727625055064622, | |
| "learning_rate": 3e-06, | |
| "loss": 1.5182, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.013440860215053764, | |
| "grad_norm": 1.5457482765192463, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 1.5169, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.016129032258064516, | |
| "grad_norm": 1.5659007249743502, | |
| "learning_rate": 5e-06, | |
| "loss": 1.4922, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.01881720430107527, | |
| "grad_norm": 1.3878881126089677, | |
| "learning_rate": 6e-06, | |
| "loss": 1.4863, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.021505376344086023, | |
| "grad_norm": 1.295368020848385, | |
| "learning_rate": 7e-06, | |
| "loss": 1.4839, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.024193548387096774, | |
| "grad_norm": 1.589857887668944, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 1.4303, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.026881720430107527, | |
| "grad_norm": 2.60679604894195, | |
| "learning_rate": 9e-06, | |
| "loss": 1.3744, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.02956989247311828, | |
| "grad_norm": 0.8410885692002656, | |
| "learning_rate": 1e-05, | |
| "loss": 1.3498, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.03225806451612903, | |
| "grad_norm": 0.7927855266728604, | |
| "learning_rate": 1.1000000000000001e-05, | |
| "loss": 1.3179, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.03494623655913978, | |
| "grad_norm": 0.6808035050220127, | |
| "learning_rate": 1.2e-05, | |
| "loss": 1.3268, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.03763440860215054, | |
| "grad_norm": 0.6602967909334083, | |
| "learning_rate": 1.3000000000000001e-05, | |
| "loss": 1.2784, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.04032258064516129, | |
| "grad_norm": 0.5797556052811048, | |
| "learning_rate": 1.4e-05, | |
| "loss": 1.2949, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.043010752688172046, | |
| "grad_norm": 0.6000541560518325, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": 1.288, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.0456989247311828, | |
| "grad_norm": 0.6494981992893607, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 1.2449, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.04838709677419355, | |
| "grad_norm": 0.6723097988215474, | |
| "learning_rate": 1.7e-05, | |
| "loss": 1.2102, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.051075268817204304, | |
| "grad_norm": 0.6702835925568053, | |
| "learning_rate": 1.8e-05, | |
| "loss": 1.2025, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.053763440860215055, | |
| "grad_norm": 0.625636082792655, | |
| "learning_rate": 1.9e-05, | |
| "loss": 1.2777, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.056451612903225805, | |
| "grad_norm": 0.6253912624763358, | |
| "learning_rate": 2e-05, | |
| "loss": 1.2669, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.05913978494623656, | |
| "grad_norm": 0.5910337660829342, | |
| "learning_rate": 2.1000000000000002e-05, | |
| "loss": 1.2654, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.06182795698924731, | |
| "grad_norm": 0.6304908028391322, | |
| "learning_rate": 2.2000000000000003e-05, | |
| "loss": 1.2413, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.06451612903225806, | |
| "grad_norm": 0.5377853121890415, | |
| "learning_rate": 2.3e-05, | |
| "loss": 1.2109, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.06720430107526881, | |
| "grad_norm": 0.4970873703549533, | |
| "learning_rate": 2.4e-05, | |
| "loss": 1.1359, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.06989247311827956, | |
| "grad_norm": 0.5292734885521813, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.2236, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.07258064516129033, | |
| "grad_norm": 0.5428754620149544, | |
| "learning_rate": 2.6000000000000002e-05, | |
| "loss": 1.2083, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.07526881720430108, | |
| "grad_norm": 0.5711123503896314, | |
| "learning_rate": 2.7000000000000002e-05, | |
| "loss": 1.2161, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.07795698924731183, | |
| "grad_norm": 0.49149041488377043, | |
| "learning_rate": 2.8e-05, | |
| "loss": 1.1454, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.08064516129032258, | |
| "grad_norm": 0.5285852530799724, | |
| "learning_rate": 2.9e-05, | |
| "loss": 1.1194, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.08333333333333333, | |
| "grad_norm": 0.5295555329242986, | |
| "learning_rate": 3.0000000000000004e-05, | |
| "loss": 1.1688, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.08602150537634409, | |
| "grad_norm": 0.465354706566009, | |
| "learning_rate": 3.1e-05, | |
| "loss": 1.1743, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.08870967741935484, | |
| "grad_norm": 0.4486072933924605, | |
| "learning_rate": 3.2000000000000005e-05, | |
| "loss": 1.0818, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.0913978494623656, | |
| "grad_norm": 0.496727888984662, | |
| "learning_rate": 3.3e-05, | |
| "loss": 1.2101, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.09408602150537634, | |
| "grad_norm": 0.43899748210993167, | |
| "learning_rate": 3.4e-05, | |
| "loss": 1.1884, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.0967741935483871, | |
| "grad_norm": 0.4147227405541853, | |
| "learning_rate": 3.5000000000000004e-05, | |
| "loss": 1.0814, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.09946236559139784, | |
| "grad_norm": 0.48760701758721925, | |
| "learning_rate": 3.6e-05, | |
| "loss": 1.1212, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.10215053763440861, | |
| "grad_norm": 0.49917378567432974, | |
| "learning_rate": 3.7000000000000005e-05, | |
| "loss": 1.1984, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.10483870967741936, | |
| "grad_norm": 0.5304015628409972, | |
| "learning_rate": 3.8e-05, | |
| "loss": 1.1274, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.10752688172043011, | |
| "grad_norm": 0.4726408598975661, | |
| "learning_rate": 3.9e-05, | |
| "loss": 1.1323, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.11021505376344086, | |
| "grad_norm": 0.44174146995469904, | |
| "learning_rate": 4e-05, | |
| "loss": 1.1898, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.11290322580645161, | |
| "grad_norm": 0.5087279682773094, | |
| "learning_rate": 3.999980086219931e-05, | |
| "loss": 1.1469, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.11559139784946236, | |
| "grad_norm": 0.5626510931079601, | |
| "learning_rate": 3.999920345276283e-05, | |
| "loss": 1.1321, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.11827956989247312, | |
| "grad_norm": 0.47565220090788773, | |
| "learning_rate": 3.999820778358724e-05, | |
| "loss": 1.1453, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.12096774193548387, | |
| "grad_norm": 0.4431044005508681, | |
| "learning_rate": 3.999681387450007e-05, | |
| "loss": 1.1408, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.12365591397849462, | |
| "grad_norm": 0.47942624390584926, | |
| "learning_rate": 3.999502175325932e-05, | |
| "loss": 1.168, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.12634408602150538, | |
| "grad_norm": 0.43166434321061714, | |
| "learning_rate": 3.999283145555291e-05, | |
| "loss": 1.1087, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.12903225806451613, | |
| "grad_norm": 0.47105749411720044, | |
| "learning_rate": 3.999024302499794e-05, | |
| "loss": 1.0752, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.13172043010752688, | |
| "grad_norm": 0.3959072081415341, | |
| "learning_rate": 3.998725651313984e-05, | |
| "loss": 1.1011, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.13440860215053763, | |
| "grad_norm": 0.4416535692834609, | |
| "learning_rate": 3.998387197945135e-05, | |
| "loss": 1.1306, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.13709677419354838, | |
| "grad_norm": 0.4272647809985287, | |
| "learning_rate": 3.9980089491331344e-05, | |
| "loss": 1.1381, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.13978494623655913, | |
| "grad_norm": 0.47769854993592265, | |
| "learning_rate": 3.997590912410345e-05, | |
| "loss": 1.0976, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.1424731182795699, | |
| "grad_norm": 0.3877500456630632, | |
| "learning_rate": 3.997133096101458e-05, | |
| "loss": 1.128, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.14516129032258066, | |
| "grad_norm": 0.3869721085588235, | |
| "learning_rate": 3.996635509323327e-05, | |
| "loss": 1.1225, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.1478494623655914, | |
| "grad_norm": 0.47271590281090886, | |
| "learning_rate": 3.9960981619847856e-05, | |
| "loss": 1.1141, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.15053763440860216, | |
| "grad_norm": 0.4368206211090345, | |
| "learning_rate": 3.99552106478645e-05, | |
| "loss": 1.0872, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.1532258064516129, | |
| "grad_norm": 0.3872679475185707, | |
| "learning_rate": 3.994904229220507e-05, | |
| "loss": 1.1514, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.15591397849462366, | |
| "grad_norm": 0.406268890860899, | |
| "learning_rate": 3.9942476675704854e-05, | |
| "loss": 1.0965, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.1586021505376344, | |
| "grad_norm": 0.43172418498531184, | |
| "learning_rate": 3.993551392911009e-05, | |
| "loss": 1.1192, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.16129032258064516, | |
| "grad_norm": 0.4258357918752704, | |
| "learning_rate": 3.9928154191075375e-05, | |
| "loss": 1.0623, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.1639784946236559, | |
| "grad_norm": 0.4585556740184179, | |
| "learning_rate": 3.9920397608160925e-05, | |
| "loss": 1.1076, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.16666666666666666, | |
| "grad_norm": 0.44452627464263844, | |
| "learning_rate": 3.991224433482961e-05, | |
| "loss": 1.1107, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.1693548387096774, | |
| "grad_norm": 0.4787003491624029, | |
| "learning_rate": 3.990369453344394e-05, | |
| "loss": 1.1165, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.17204301075268819, | |
| "grad_norm": 0.4704549745433953, | |
| "learning_rate": 3.989474837426277e-05, | |
| "loss": 1.1541, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.17473118279569894, | |
| "grad_norm": 0.4026214434021435, | |
| "learning_rate": 3.9885406035437953e-05, | |
| "loss": 1.1166, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.1774193548387097, | |
| "grad_norm": 0.40057979364796353, | |
| "learning_rate": 3.987566770301076e-05, | |
| "loss": 1.0626, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.18010752688172044, | |
| "grad_norm": 0.4340486368362563, | |
| "learning_rate": 3.98655335709082e-05, | |
| "loss": 1.104, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.1827956989247312, | |
| "grad_norm": 0.42609639195543936, | |
| "learning_rate": 3.985500384093917e-05, | |
| "loss": 1.0893, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.18548387096774194, | |
| "grad_norm": 0.381378569874383, | |
| "learning_rate": 3.984407872279037e-05, | |
| "loss": 1.0433, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.1881720430107527, | |
| "grad_norm": 0.3903976348529897, | |
| "learning_rate": 3.983275843402222e-05, | |
| "loss": 1.1019, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.19086021505376344, | |
| "grad_norm": 0.3648695348221521, | |
| "learning_rate": 3.982104320006446e-05, | |
| "loss": 1.0992, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.1935483870967742, | |
| "grad_norm": 1.8993059639660952, | |
| "learning_rate": 3.9808933254211665e-05, | |
| "loss": 1.1056, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.19623655913978494, | |
| "grad_norm": 0.46580843289168206, | |
| "learning_rate": 3.979642883761866e-05, | |
| "loss": 1.1031, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.1989247311827957, | |
| "grad_norm": 0.449285515287558, | |
| "learning_rate": 3.978353019929562e-05, | |
| "loss": 1.1068, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.20161290322580644, | |
| "grad_norm": 0.5567418056951845, | |
| "learning_rate": 3.977023759610321e-05, | |
| "loss": 1.0446, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.20430107526881722, | |
| "grad_norm": 0.38684392317210076, | |
| "learning_rate": 3.9756551292747405e-05, | |
| "loss": 1.0377, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.20698924731182797, | |
| "grad_norm": 0.473773440244898, | |
| "learning_rate": 3.974247156177423e-05, | |
| "loss": 1.1396, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.20967741935483872, | |
| "grad_norm": 0.4177520757238314, | |
| "learning_rate": 3.9727998683564355e-05, | |
| "loss": 1.1008, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.21236559139784947, | |
| "grad_norm": 0.39719194878309766, | |
| "learning_rate": 3.9713132946327494e-05, | |
| "loss": 1.0215, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.21505376344086022, | |
| "grad_norm": 0.4105085260167095, | |
| "learning_rate": 3.9697874646096675e-05, | |
| "loss": 1.1115, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.21774193548387097, | |
| "grad_norm": 0.4087045401288919, | |
| "learning_rate": 3.968222408672232e-05, | |
| "loss": 1.0579, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.22043010752688172, | |
| "grad_norm": 0.39033402258475636, | |
| "learning_rate": 3.9666181579866244e-05, | |
| "loss": 1.0692, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.22311827956989247, | |
| "grad_norm": 0.41439706526743936, | |
| "learning_rate": 3.964974744499539e-05, | |
| "loss": 1.0865, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.22580645161290322, | |
| "grad_norm": 0.38234297411695073, | |
| "learning_rate": 3.963292200937551e-05, | |
| "loss": 1.0173, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.22849462365591397, | |
| "grad_norm": 0.5308750280660687, | |
| "learning_rate": 3.961570560806461e-05, | |
| "loss": 1.067, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.23118279569892472, | |
| "grad_norm": 0.43351295582441124, | |
| "learning_rate": 3.959809858390634e-05, | |
| "loss": 1.086, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.23387096774193547, | |
| "grad_norm": 0.42069712201952686, | |
| "learning_rate": 3.9580101287523105e-05, | |
| "loss": 1.1064, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.23655913978494625, | |
| "grad_norm": 0.42821523209412365, | |
| "learning_rate": 3.95617140773091e-05, | |
| "loss": 1.0263, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.239247311827957, | |
| "grad_norm": 0.4114502165683399, | |
| "learning_rate": 3.954293731942319e-05, | |
| "loss": 1.0729, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.24193548387096775, | |
| "grad_norm": 0.4131919780645225, | |
| "learning_rate": 3.95237713877816e-05, | |
| "loss": 1.0621, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.2446236559139785, | |
| "grad_norm": 0.4433939594965718, | |
| "learning_rate": 3.950421666405048e-05, | |
| "loss": 1.0805, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.24731182795698925, | |
| "grad_norm": 0.4056188018789589, | |
| "learning_rate": 3.948427353763829e-05, | |
| "loss": 1.0784, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.4642044159391645, | |
| "learning_rate": 3.946394240568807e-05, | |
| "loss": 1.0406, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.25268817204301075, | |
| "grad_norm": 0.4280982724994961, | |
| "learning_rate": 3.944322367306951e-05, | |
| "loss": 1.1117, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.2553763440860215, | |
| "grad_norm": 0.41758547723414086, | |
| "learning_rate": 3.942211775237089e-05, | |
| "loss": 1.0747, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.25806451612903225, | |
| "grad_norm": 0.4344009299837567, | |
| "learning_rate": 3.940062506389089e-05, | |
| "loss": 1.1249, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.260752688172043, | |
| "grad_norm": 0.3847297194838658, | |
| "learning_rate": 3.937874603563015e-05, | |
| "loss": 1.0977, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.26344086021505375, | |
| "grad_norm": 0.4959083398122344, | |
| "learning_rate": 3.935648110328285e-05, | |
| "loss": 1.041, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.2661290322580645, | |
| "grad_norm": 0.46262720954521647, | |
| "learning_rate": 3.933383071022795e-05, | |
| "loss": 1.0926, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.26881720430107525, | |
| "grad_norm": 0.4789561041937064, | |
| "learning_rate": 3.93107953075204e-05, | |
| "loss": 1.0701, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.271505376344086, | |
| "grad_norm": 0.4229869803365367, | |
| "learning_rate": 3.928737535388214e-05, | |
| "loss": 1.063, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.27419354838709675, | |
| "grad_norm": 0.43404703473814416, | |
| "learning_rate": 3.9263571315692976e-05, | |
| "loss": 1.0696, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.2768817204301075, | |
| "grad_norm": 0.4396716028324381, | |
| "learning_rate": 3.923938366698129e-05, | |
| "loss": 1.0317, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.27956989247311825, | |
| "grad_norm": 0.6860340156482403, | |
| "learning_rate": 3.921481288941459e-05, | |
| "loss": 1.0611, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.28225806451612906, | |
| "grad_norm": 0.39601683185098385, | |
| "learning_rate": 3.9189859472289956e-05, | |
| "loss": 1.0294, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.2849462365591398, | |
| "grad_norm": 0.39641986440862376, | |
| "learning_rate": 3.9164523912524224e-05, | |
| "loss": 1.0663, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.28763440860215056, | |
| "grad_norm": 0.3898209322812333, | |
| "learning_rate": 3.913880671464418e-05, | |
| "loss": 1.0671, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.2903225806451613, | |
| "grad_norm": 0.408678962590762, | |
| "learning_rate": 3.911270839077644e-05, | |
| "loss": 1.0224, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.29301075268817206, | |
| "grad_norm": 0.4681397312637908, | |
| "learning_rate": 3.908622946063728e-05, | |
| "loss": 1.091, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.2956989247311828, | |
| "grad_norm": 0.47955178042664964, | |
| "learning_rate": 3.9059370451522295e-05, | |
| "loss": 1.0961, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.29838709677419356, | |
| "grad_norm": 0.4229760577312693, | |
| "learning_rate": 3.903213189829589e-05, | |
| "loss": 1.0386, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.3010752688172043, | |
| "grad_norm": 0.39011319960684926, | |
| "learning_rate": 3.900451434338062e-05, | |
| "loss": 1.067, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.30376344086021506, | |
| "grad_norm": 0.39672904488910227, | |
| "learning_rate": 3.8976518336746396e-05, | |
| "loss": 1.0424, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.3064516129032258, | |
| "grad_norm": 0.49393594827425025, | |
| "learning_rate": 3.894814443589954e-05, | |
| "loss": 1.0695, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.30913978494623656, | |
| "grad_norm": 0.38254416729289076, | |
| "learning_rate": 3.8919393205871676e-05, | |
| "loss": 1.0801, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.3118279569892473, | |
| "grad_norm": 0.4456422459103533, | |
| "learning_rate": 3.889026521920847e-05, | |
| "loss": 1.0934, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.31451612903225806, | |
| "grad_norm": 0.39398196216047476, | |
| "learning_rate": 3.886076105595825e-05, | |
| "loss": 1.1011, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.3172043010752688, | |
| "grad_norm": 0.3949327527665007, | |
| "learning_rate": 3.883088130366042e-05, | |
| "loss": 1.018, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.31989247311827956, | |
| "grad_norm": 0.39254792724729387, | |
| "learning_rate": 3.88006265573338e-05, | |
| "loss": 1.0607, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.3225806451612903, | |
| "grad_norm": 0.5007199853312655, | |
| "learning_rate": 3.876999741946478e-05, | |
| "loss": 1.0609, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.32526881720430106, | |
| "grad_norm": 0.4619751408736227, | |
| "learning_rate": 3.873899449999524e-05, | |
| "loss": 1.0955, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.3279569892473118, | |
| "grad_norm": 0.48219172224114765, | |
| "learning_rate": 3.870761841631051e-05, | |
| "loss": 1.063, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.33064516129032256, | |
| "grad_norm": 0.4054037874416271, | |
| "learning_rate": 3.867586979322703e-05, | |
| "loss": 1.0907, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 0.43161457507331874, | |
| "learning_rate": 3.8643749262979896e-05, | |
| "loss": 1.0666, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.33602150537634407, | |
| "grad_norm": 0.36751029685084174, | |
| "learning_rate": 3.861125746521028e-05, | |
| "loss": 1.0557, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.3387096774193548, | |
| "grad_norm": 0.46690938120869707, | |
| "learning_rate": 3.8578395046952686e-05, | |
| "loss": 1.1023, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.34139784946236557, | |
| "grad_norm": 0.3988094995343537, | |
| "learning_rate": 3.85451626626221e-05, | |
| "loss": 1.0717, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.34408602150537637, | |
| "grad_norm": 0.48432619617982536, | |
| "learning_rate": 3.85115609740009e-05, | |
| "loss": 1.0271, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.3467741935483871, | |
| "grad_norm": 0.5127948499632843, | |
| "learning_rate": 3.8477590650225735e-05, | |
| "loss": 1.0575, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.34946236559139787, | |
| "grad_norm": 0.4132091412639387, | |
| "learning_rate": 3.8443252367774164e-05, | |
| "loss": 1.0355, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.3521505376344086, | |
| "grad_norm": 0.4439631972175399, | |
| "learning_rate": 3.8408546810451176e-05, | |
| "loss": 1.0541, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.3548387096774194, | |
| "grad_norm": 0.3956247259769062, | |
| "learning_rate": 3.837347466937562e-05, | |
| "loss": 1.0672, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.3575268817204301, | |
| "grad_norm": 0.44952249373265674, | |
| "learning_rate": 3.8338036642966396e-05, | |
| "loss": 1.0444, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.3602150537634409, | |
| "grad_norm": 0.4449484078947791, | |
| "learning_rate": 3.830223343692857e-05, | |
| "loss": 1.0514, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.3629032258064516, | |
| "grad_norm": 0.3905509358873801, | |
| "learning_rate": 3.826606576423931e-05, | |
| "loss": 1.0394, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.3655913978494624, | |
| "grad_norm": 0.4183744146790331, | |
| "learning_rate": 3.8229534345133695e-05, | |
| "loss": 1.0212, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.3682795698924731, | |
| "grad_norm": 0.46086732418604737, | |
| "learning_rate": 3.819263990709037e-05, | |
| "loss": 0.994, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.3709677419354839, | |
| "grad_norm": 0.4468564375555911, | |
| "learning_rate": 3.8155383184817064e-05, | |
| "loss": 1.0279, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.3736559139784946, | |
| "grad_norm": 0.3966511312736679, | |
| "learning_rate": 3.8117764920235945e-05, | |
| "loss": 0.9992, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.3763440860215054, | |
| "grad_norm": 0.46461846433833476, | |
| "learning_rate": 3.807978586246887e-05, | |
| "loss": 1.088, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.3790322580645161, | |
| "grad_norm": 0.4254641795470929, | |
| "learning_rate": 3.804144676782243e-05, | |
| "loss": 1.0764, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.3817204301075269, | |
| "grad_norm": 0.42137203485219293, | |
| "learning_rate": 3.800274839977293e-05, | |
| "loss": 1.0422, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.3844086021505376, | |
| "grad_norm": 0.4172681789743796, | |
| "learning_rate": 3.796369152895117e-05, | |
| "loss": 1.0453, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.3870967741935484, | |
| "grad_norm": 0.4531431509751161, | |
| "learning_rate": 3.792427693312707e-05, | |
| "loss": 1.0389, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.3897849462365591, | |
| "grad_norm": 0.3782466419505299, | |
| "learning_rate": 3.788450539719423e-05, | |
| "loss": 1.025, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.3924731182795699, | |
| "grad_norm": 0.4655605897605627, | |
| "learning_rate": 3.7844377713154264e-05, | |
| "loss": 1.064, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.3951612903225806, | |
| "grad_norm": 0.4384836890227208, | |
| "learning_rate": 3.780389468010106e-05, | |
| "loss": 1.0397, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.3978494623655914, | |
| "grad_norm": 0.4844715439450037, | |
| "learning_rate": 3.776305710420482e-05, | |
| "loss": 1.1193, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.40053763440860213, | |
| "grad_norm": 0.41760675460607827, | |
| "learning_rate": 3.7721865798696056e-05, | |
| "loss": 1.0124, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.4032258064516129, | |
| "grad_norm": 0.7337537478769387, | |
| "learning_rate": 3.7680321583849365e-05, | |
| "loss": 1.0508, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.40591397849462363, | |
| "grad_norm": 0.44725816367920673, | |
| "learning_rate": 3.76384252869671e-05, | |
| "loss": 1.0434, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.40860215053763443, | |
| "grad_norm": 0.40870612635720194, | |
| "learning_rate": 3.759617774236292e-05, | |
| "loss": 1.068, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.4112903225806452, | |
| "grad_norm": 0.4534649483932217, | |
| "learning_rate": 3.755357979134511e-05, | |
| "loss": 1.0614, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.41397849462365593, | |
| "grad_norm": 0.41986572053185917, | |
| "learning_rate": 3.751063228219993e-05, | |
| "loss": 1.0391, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.4166666666666667, | |
| "grad_norm": 0.3717380879536067, | |
| "learning_rate": 3.7467336070174604e-05, | |
| "loss": 1.0378, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.41935483870967744, | |
| "grad_norm": 0.41848537015206944, | |
| "learning_rate": 3.742369201746038e-05, | |
| "loss": 1.0439, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.4220430107526882, | |
| "grad_norm": 0.43443932018052933, | |
| "learning_rate": 3.737970099317535e-05, | |
| "loss": 1.0197, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.42473118279569894, | |
| "grad_norm": 0.421554546653683, | |
| "learning_rate": 3.7335363873347056e-05, | |
| "loss": 1.0487, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.4274193548387097, | |
| "grad_norm": 0.8430023271255561, | |
| "learning_rate": 3.729068154089519e-05, | |
| "loss": 1.0333, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.43010752688172044, | |
| "grad_norm": 0.4363044724173691, | |
| "learning_rate": 3.724565488561387e-05, | |
| "loss": 1.0213, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.4327956989247312, | |
| "grad_norm": 0.5335682969510431, | |
| "learning_rate": 3.720028480415401e-05, | |
| "loss": 1.0205, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.43548387096774194, | |
| "grad_norm": 0.4056834135687678, | |
| "learning_rate": 3.7154572200005446e-05, | |
| "loss": 1.0311, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.4381720430107527, | |
| "grad_norm": 0.5322107401886871, | |
| "learning_rate": 3.710851798347891e-05, | |
| "loss": 1.0601, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.44086021505376344, | |
| "grad_norm": 0.4138677278304246, | |
| "learning_rate": 3.7062123071687944e-05, | |
| "loss": 1.0361, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.4435483870967742, | |
| "grad_norm": 0.4775100325512625, | |
| "learning_rate": 3.701538838853062e-05, | |
| "loss": 1.0194, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.44623655913978494, | |
| "grad_norm": 0.40839482534046995, | |
| "learning_rate": 3.696831486467114e-05, | |
| "loss": 1.0463, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.4489247311827957, | |
| "grad_norm": 0.3963093446633738, | |
| "learning_rate": 3.6920903437521305e-05, | |
| "loss": 1.0238, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.45161290322580644, | |
| "grad_norm": 0.4344752184390704, | |
| "learning_rate": 3.6873155051221846e-05, | |
| "loss": 1.0472, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.4543010752688172, | |
| "grad_norm": 0.4167014186949368, | |
| "learning_rate": 3.6825070656623626e-05, | |
| "loss": 1.0599, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.45698924731182794, | |
| "grad_norm": 0.43904590007956124, | |
| "learning_rate": 3.677665121126871e-05, | |
| "loss": 1.0559, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.4596774193548387, | |
| "grad_norm": 0.372185063148541, | |
| "learning_rate": 3.6727897679371276e-05, | |
| "loss": 1.0012, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.46236559139784944, | |
| "grad_norm": 0.43086731351488916, | |
| "learning_rate": 3.667881103179844e-05, | |
| "loss": 1.0133, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.4650537634408602, | |
| "grad_norm": 0.5796354347464544, | |
| "learning_rate": 3.662939224605091e-05, | |
| "loss": 1.0517, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.46774193548387094, | |
| "grad_norm": 0.4587453684541154, | |
| "learning_rate": 3.657964230624351e-05, | |
| "loss": 1.0164, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.47043010752688175, | |
| "grad_norm": 0.5102852182866393, | |
| "learning_rate": 3.6529562203085595e-05, | |
| "loss": 1.052, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.4731182795698925, | |
| "grad_norm": 0.4469591346380821, | |
| "learning_rate": 3.6479152933861336e-05, | |
| "loss": 1.0905, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.47580645161290325, | |
| "grad_norm": 0.45277428352010624, | |
| "learning_rate": 3.642841550240983e-05, | |
| "loss": 1.0961, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.478494623655914, | |
| "grad_norm": 0.45588595960031525, | |
| "learning_rate": 3.6377350919105136e-05, | |
| "loss": 1.0178, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.48118279569892475, | |
| "grad_norm": 0.6147997034643559, | |
| "learning_rate": 3.632596020083612e-05, | |
| "loss": 1.0148, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.4838709677419355, | |
| "grad_norm": 0.3734326271789308, | |
| "learning_rate": 3.627424437098625e-05, | |
| "loss": 1.0006, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.48655913978494625, | |
| "grad_norm": 0.4564187594173089, | |
| "learning_rate": 3.6222204459413186e-05, | |
| "loss": 1.0635, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.489247311827957, | |
| "grad_norm": 0.42811733614493086, | |
| "learning_rate": 3.6169841502428285e-05, | |
| "loss": 1.0469, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.49193548387096775, | |
| "grad_norm": 0.4227875509642681, | |
| "learning_rate": 3.611715654277596e-05, | |
| "loss": 1.0446, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.4946236559139785, | |
| "grad_norm": 0.40548546169007965, | |
| "learning_rate": 3.60641506296129e-05, | |
| "loss": 1.0564, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.49731182795698925, | |
| "grad_norm": 0.4161116484325749, | |
| "learning_rate": 3.601082481848721e-05, | |
| "loss": 0.9917, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.39180067540636987, | |
| "learning_rate": 3.595718017131736e-05, | |
| "loss": 1.0081, | |
| "step": 186 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 744, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 186, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 771702543876096.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |