{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.5787781350482315, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012861736334405145, "grad_norm": 1.1440598964691162, "learning_rate": 0.0001, "loss": 2.6398, "step": 1 }, { "epoch": 0.02572347266881029, "grad_norm": 1.078170657157898, "learning_rate": 9.949748743718594e-05, "loss": 2.6721, "step": 2 }, { "epoch": 0.03858520900321544, "grad_norm": 0.9559459686279297, "learning_rate": 9.899497487437186e-05, "loss": 2.6911, "step": 3 }, { "epoch": 0.05144694533762058, "grad_norm": 1.0017387866973877, "learning_rate": 9.84924623115578e-05, "loss": 2.479, "step": 4 }, { "epoch": 0.06430868167202572, "grad_norm": 1.0636959075927734, "learning_rate": 9.798994974874372e-05, "loss": 2.335, "step": 5 }, { "epoch": 0.07717041800643087, "grad_norm": 1.0521775484085083, "learning_rate": 9.748743718592965e-05, "loss": 2.1896, "step": 6 }, { "epoch": 0.09003215434083602, "grad_norm": 1.0728856325149536, "learning_rate": 9.698492462311559e-05, "loss": 2.0778, "step": 7 }, { "epoch": 0.10289389067524116, "grad_norm": 1.031778335571289, "learning_rate": 9.64824120603015e-05, "loss": 1.9751, "step": 8 }, { "epoch": 0.1157556270096463, "grad_norm": 1.076965093612671, "learning_rate": 9.597989949748745e-05, "loss": 1.7265, "step": 9 }, { "epoch": 0.12861736334405144, "grad_norm": 1.214233636856079, "learning_rate": 9.547738693467337e-05, "loss": 1.729, "step": 10 }, { "epoch": 0.1414790996784566, "grad_norm": 0.7060278058052063, "learning_rate": 9.49748743718593e-05, "loss": 1.9542, "step": 11 }, { "epoch": 0.15434083601286175, "grad_norm": 0.827420711517334, "learning_rate": 9.447236180904523e-05, "loss": 1.4983, "step": 12 }, { "epoch": 0.16720257234726688, "grad_norm": 0.6821541786193848, "learning_rate": 9.396984924623115e-05, "loss": 1.4999, "step": 13 }, { "epoch": 0.18006430868167203, "grad_norm": 0.7033599615097046, "learning_rate": 9.34673366834171e-05, "loss": 1.4026, "step": 14 }, { "epoch": 0.19292604501607716, "grad_norm": 0.80780029296875, "learning_rate": 9.296482412060302e-05, "loss": 1.4809, "step": 15 }, { "epoch": 0.2057877813504823, "grad_norm": 0.7556090950965881, "learning_rate": 9.246231155778895e-05, "loss": 1.4065, "step": 16 }, { "epoch": 0.21864951768488747, "grad_norm": 0.6941238641738892, "learning_rate": 9.195979899497488e-05, "loss": 1.4728, "step": 17 }, { "epoch": 0.2315112540192926, "grad_norm": 0.7150428295135498, "learning_rate": 9.14572864321608e-05, "loss": 1.3528, "step": 18 }, { "epoch": 0.24437299035369775, "grad_norm": 0.5909086465835571, "learning_rate": 9.095477386934675e-05, "loss": 1.3859, "step": 19 }, { "epoch": 0.2572347266881029, "grad_norm": 0.7633894681930542, "learning_rate": 9.045226130653267e-05, "loss": 1.3405, "step": 20 }, { "epoch": 0.27009646302250806, "grad_norm": 0.6418241262435913, "learning_rate": 8.99497487437186e-05, "loss": 1.3162, "step": 21 }, { "epoch": 0.2829581993569132, "grad_norm": 0.8278843760490417, "learning_rate": 8.944723618090453e-05, "loss": 1.3934, "step": 22 }, { "epoch": 0.2958199356913183, "grad_norm": 0.7349154353141785, "learning_rate": 8.894472361809045e-05, "loss": 1.3461, "step": 23 }, { "epoch": 0.3086816720257235, "grad_norm": 0.736482560634613, "learning_rate": 8.84422110552764e-05, "loss": 1.2912, "step": 24 }, { "epoch": 0.3215434083601286, "grad_norm": 0.8143345713615417, "learning_rate": 8.793969849246232e-05, "loss": 1.3445, "step": 25 
}, { "epoch": 0.33440514469453375, "grad_norm": 0.6922888159751892, "learning_rate": 8.743718592964825e-05, "loss": 1.36, "step": 26 }, { "epoch": 0.34726688102893893, "grad_norm": 0.7785637378692627, "learning_rate": 8.693467336683418e-05, "loss": 1.276, "step": 27 }, { "epoch": 0.36012861736334406, "grad_norm": 0.7577537298202515, "learning_rate": 8.64321608040201e-05, "loss": 1.2323, "step": 28 }, { "epoch": 0.3729903536977492, "grad_norm": 0.7919734716415405, "learning_rate": 8.592964824120603e-05, "loss": 1.4053, "step": 29 }, { "epoch": 0.3858520900321543, "grad_norm": 0.821456789970398, "learning_rate": 8.542713567839196e-05, "loss": 1.251, "step": 30 }, { "epoch": 0.3987138263665595, "grad_norm": 0.8000316023826599, "learning_rate": 8.49246231155779e-05, "loss": 1.3022, "step": 31 }, { "epoch": 0.4115755627009646, "grad_norm": 0.7909243702888489, "learning_rate": 8.442211055276383e-05, "loss": 1.2509, "step": 32 }, { "epoch": 0.42443729903536975, "grad_norm": 0.896696150302887, "learning_rate": 8.391959798994975e-05, "loss": 1.2068, "step": 33 }, { "epoch": 0.43729903536977494, "grad_norm": 0.8683031797409058, "learning_rate": 8.341708542713568e-05, "loss": 1.3849, "step": 34 }, { "epoch": 0.45016077170418006, "grad_norm": 0.8988001346588135, "learning_rate": 8.291457286432161e-05, "loss": 1.1935, "step": 35 }, { "epoch": 0.4630225080385852, "grad_norm": 0.8507208824157715, "learning_rate": 8.241206030150754e-05, "loss": 1.2605, "step": 36 }, { "epoch": 0.4758842443729904, "grad_norm": 0.9080713987350464, "learning_rate": 8.190954773869348e-05, "loss": 1.3304, "step": 37 }, { "epoch": 0.4887459807073955, "grad_norm": 0.9472977519035339, "learning_rate": 8.14070351758794e-05, "loss": 1.3325, "step": 38 }, { "epoch": 0.5016077170418006, "grad_norm": 0.9067745804786682, "learning_rate": 8.090452261306533e-05, "loss": 1.195, "step": 39 }, { "epoch": 0.5144694533762058, "grad_norm": 0.7469335198402405, "learning_rate": 8.040201005025126e-05, "loss": 1.348, "step": 40 }, { "epoch": 0.5273311897106109, "grad_norm": 0.915685772895813, "learning_rate": 7.989949748743719e-05, "loss": 1.2406, "step": 41 }, { "epoch": 0.5401929260450161, "grad_norm": 0.6561890244483948, "learning_rate": 7.939698492462313e-05, "loss": 1.4992, "step": 42 }, { "epoch": 0.5530546623794212, "grad_norm": 0.7920067310333252, "learning_rate": 7.889447236180904e-05, "loss": 1.1802, "step": 43 }, { "epoch": 0.5659163987138264, "grad_norm": 0.7714186906814575, "learning_rate": 7.839195979899498e-05, "loss": 1.2086, "step": 44 }, { "epoch": 0.5787781350482315, "grad_norm": 0.7048550844192505, "learning_rate": 7.788944723618091e-05, "loss": 1.166, "step": 45 }, { "epoch": 0.5916398713826366, "grad_norm": 0.5653746128082275, "learning_rate": 7.738693467336684e-05, "loss": 1.3876, "step": 46 }, { "epoch": 0.6045016077170418, "grad_norm": 0.6872064471244812, "learning_rate": 7.688442211055277e-05, "loss": 1.1946, "step": 47 }, { "epoch": 0.617363344051447, "grad_norm": 0.6310857534408569, "learning_rate": 7.638190954773869e-05, "loss": 1.1509, "step": 48 }, { "epoch": 0.6302250803858521, "grad_norm": 0.7433875203132629, "learning_rate": 7.587939698492463e-05, "loss": 1.1792, "step": 49 }, { "epoch": 0.6430868167202572, "grad_norm": 0.7774860262870789, "learning_rate": 7.537688442211056e-05, "loss": 1.3531, "step": 50 }, { "epoch": 0.6559485530546624, "grad_norm": 0.6565365195274353, "learning_rate": 7.487437185929649e-05, "loss": 1.2448, "step": 51 }, { "epoch": 0.6688102893890675, "grad_norm": 0.6934945583343506, 
"learning_rate": 7.437185929648241e-05, "loss": 1.3109, "step": 52 }, { "epoch": 0.6816720257234726, "grad_norm": 0.662295401096344, "learning_rate": 7.386934673366834e-05, "loss": 1.3463, "step": 53 }, { "epoch": 0.6945337620578779, "grad_norm": 0.6406717300415039, "learning_rate": 7.336683417085427e-05, "loss": 1.0897, "step": 54 }, { "epoch": 0.707395498392283, "grad_norm": 0.7470242977142334, "learning_rate": 7.28643216080402e-05, "loss": 1.1834, "step": 55 }, { "epoch": 0.7202572347266881, "grad_norm": 0.6413742899894714, "learning_rate": 7.236180904522614e-05, "loss": 1.313, "step": 56 }, { "epoch": 0.7331189710610932, "grad_norm": 0.64141845703125, "learning_rate": 7.185929648241206e-05, "loss": 1.322, "step": 57 }, { "epoch": 0.7459807073954984, "grad_norm": 0.6849397420883179, "learning_rate": 7.135678391959799e-05, "loss": 1.1466, "step": 58 }, { "epoch": 0.7588424437299035, "grad_norm": 0.7390387058258057, "learning_rate": 7.085427135678392e-05, "loss": 1.1692, "step": 59 }, { "epoch": 0.7717041800643086, "grad_norm": 0.7512691617012024, "learning_rate": 7.035175879396985e-05, "loss": 1.2447, "step": 60 }, { "epoch": 0.7845659163987139, "grad_norm": 0.7736090421676636, "learning_rate": 6.984924623115579e-05, "loss": 1.1723, "step": 61 }, { "epoch": 0.797427652733119, "grad_norm": 0.7576032280921936, "learning_rate": 6.93467336683417e-05, "loss": 1.2074, "step": 62 }, { "epoch": 0.8102893890675241, "grad_norm": 0.7597335577011108, "learning_rate": 6.884422110552764e-05, "loss": 1.1425, "step": 63 }, { "epoch": 0.8231511254019293, "grad_norm": 0.774594247341156, "learning_rate": 6.834170854271357e-05, "loss": 1.3168, "step": 64 }, { "epoch": 0.8360128617363344, "grad_norm": 0.7835249900817871, "learning_rate": 6.78391959798995e-05, "loss": 0.9976, "step": 65 }, { "epoch": 0.8488745980707395, "grad_norm": 0.915006160736084, "learning_rate": 6.733668341708544e-05, "loss": 1.2015, "step": 66 }, { "epoch": 0.8617363344051447, "grad_norm": 0.7381256818771362, "learning_rate": 6.683417085427135e-05, "loss": 1.1394, "step": 67 }, { "epoch": 0.8745980707395499, "grad_norm": 0.6990655064582825, "learning_rate": 6.633165829145729e-05, "loss": 1.1879, "step": 68 }, { "epoch": 0.887459807073955, "grad_norm": 0.7526246309280396, "learning_rate": 6.582914572864322e-05, "loss": 1.2836, "step": 69 }, { "epoch": 0.9003215434083601, "grad_norm": 0.6938768029212952, "learning_rate": 6.532663316582915e-05, "loss": 1.1957, "step": 70 }, { "epoch": 0.9131832797427653, "grad_norm": 0.7444002032279968, "learning_rate": 6.482412060301508e-05, "loss": 1.2171, "step": 71 }, { "epoch": 0.9260450160771704, "grad_norm": 0.7161276340484619, "learning_rate": 6.4321608040201e-05, "loss": 1.1213, "step": 72 }, { "epoch": 0.9389067524115756, "grad_norm": 0.7644099593162537, "learning_rate": 6.381909547738694e-05, "loss": 1.0821, "step": 73 }, { "epoch": 0.9517684887459807, "grad_norm": 0.6624590754508972, "learning_rate": 6.331658291457287e-05, "loss": 1.233, "step": 74 }, { "epoch": 0.9646302250803859, "grad_norm": 0.7556670308113098, "learning_rate": 6.28140703517588e-05, "loss": 1.2724, "step": 75 }, { "epoch": 0.977491961414791, "grad_norm": 0.7611079812049866, "learning_rate": 6.231155778894473e-05, "loss": 1.1659, "step": 76 }, { "epoch": 0.9903536977491961, "grad_norm": 0.7112507224082947, "learning_rate": 6.180904522613065e-05, "loss": 1.1363, "step": 77 }, { "epoch": 1.0064308681672025, "grad_norm": 1.1092808246612549, "learning_rate": 6.130653266331658e-05, "loss": 1.6764, "step": 78 }, { "epoch": 
1.0192926045016077, "grad_norm": 0.5934129357337952, "learning_rate": 6.080402010050251e-05, "loss": 1.0156, "step": 79 }, { "epoch": 1.0321543408360128, "grad_norm": 0.6461953520774841, "learning_rate": 6.030150753768844e-05, "loss": 1.116, "step": 80 }, { "epoch": 1.045016077170418, "grad_norm": 0.6718643307685852, "learning_rate": 5.979899497487438e-05, "loss": 1.1288, "step": 81 }, { "epoch": 1.0578778135048232, "grad_norm": 0.6698076128959656, "learning_rate": 5.929648241206031e-05, "loss": 1.1071, "step": 82 }, { "epoch": 1.0707395498392283, "grad_norm": 0.630225419998169, "learning_rate": 5.879396984924623e-05, "loss": 1.0615, "step": 83 }, { "epoch": 1.0836012861736335, "grad_norm": 0.7957659959793091, "learning_rate": 5.829145728643216e-05, "loss": 1.1099, "step": 84 }, { "epoch": 1.0964630225080385, "grad_norm": 0.7359949946403503, "learning_rate": 5.778894472361809e-05, "loss": 1.1107, "step": 85 }, { "epoch": 1.1093247588424437, "grad_norm": 0.7295353412628174, "learning_rate": 5.728643216080403e-05, "loss": 1.093, "step": 86 }, { "epoch": 1.122186495176849, "grad_norm": 0.6929362416267395, "learning_rate": 5.6783919597989955e-05, "loss": 1.0855, "step": 87 }, { "epoch": 1.135048231511254, "grad_norm": 0.6977247595787048, "learning_rate": 5.628140703517588e-05, "loss": 1.1056, "step": 88 }, { "epoch": 1.1479099678456592, "grad_norm": 0.7273219227790833, "learning_rate": 5.577889447236181e-05, "loss": 1.1319, "step": 89 }, { "epoch": 1.1607717041800643, "grad_norm": 0.7561770081520081, "learning_rate": 5.527638190954774e-05, "loss": 1.1406, "step": 90 }, { "epoch": 1.1736334405144695, "grad_norm": 0.7354375720024109, "learning_rate": 5.477386934673368e-05, "loss": 1.8666, "step": 91 }, { "epoch": 1.1864951768488745, "grad_norm": 0.6309502124786377, "learning_rate": 5.4271356783919604e-05, "loss": 0.9311, "step": 92 }, { "epoch": 1.1993569131832797, "grad_norm": 0.6576774716377258, "learning_rate": 5.376884422110553e-05, "loss": 0.9488, "step": 93 }, { "epoch": 1.212218649517685, "grad_norm": 0.8312212228775024, "learning_rate": 5.3266331658291455e-05, "loss": 1.2609, "step": 94 }, { "epoch": 1.22508038585209, "grad_norm": 0.6623194813728333, "learning_rate": 5.276381909547739e-05, "loss": 0.9066, "step": 95 }, { "epoch": 1.2379421221864952, "grad_norm": 0.8220139741897583, "learning_rate": 5.226130653266332e-05, "loss": 1.2762, "step": 96 }, { "epoch": 1.2508038585209003, "grad_norm": 0.8111358880996704, "learning_rate": 5.175879396984925e-05, "loss": 1.1121, "step": 97 }, { "epoch": 1.2636655948553055, "grad_norm": 0.8110450506210327, "learning_rate": 5.125628140703518e-05, "loss": 1.1078, "step": 98 }, { "epoch": 1.2765273311897105, "grad_norm": 0.7321817874908447, "learning_rate": 5.0753768844221104e-05, "loss": 1.1252, "step": 99 }, { "epoch": 1.2893890675241158, "grad_norm": 0.7316034436225891, "learning_rate": 5.0251256281407036e-05, "loss": 1.0643, "step": 100 }, { "epoch": 1.302250803858521, "grad_norm": 0.7192660570144653, "learning_rate": 4.974874371859297e-05, "loss": 1.1715, "step": 101 }, { "epoch": 1.315112540192926, "grad_norm": 0.7240356206893921, "learning_rate": 4.92462311557789e-05, "loss": 1.0688, "step": 102 }, { "epoch": 1.3279742765273312, "grad_norm": 0.6924866437911987, "learning_rate": 4.874371859296483e-05, "loss": 0.9077, "step": 103 }, { "epoch": 1.3408360128617363, "grad_norm": 0.7501733303070068, "learning_rate": 4.824120603015075e-05, "loss": 1.5946, "step": 104 }, { "epoch": 1.3536977491961415, "grad_norm": 0.807327926158905, "learning_rate": 
4.7738693467336685e-05, "loss": 1.1421, "step": 105 }, { "epoch": 1.3665594855305465, "grad_norm": 0.7556862235069275, "learning_rate": 4.723618090452262e-05, "loss": 0.9923, "step": 106 }, { "epoch": 1.3794212218649518, "grad_norm": 0.7407500743865967, "learning_rate": 4.673366834170855e-05, "loss": 0.9658, "step": 107 }, { "epoch": 1.392282958199357, "grad_norm": 0.7646963596343994, "learning_rate": 4.6231155778894475e-05, "loss": 1.0564, "step": 108 }, { "epoch": 1.405144694533762, "grad_norm": 0.782166063785553, "learning_rate": 4.57286432160804e-05, "loss": 0.9601, "step": 109 }, { "epoch": 1.4180064308681672, "grad_norm": 0.7942412495613098, "learning_rate": 4.522613065326633e-05, "loss": 1.0054, "step": 110 }, { "epoch": 1.4308681672025725, "grad_norm": 0.7960532307624817, "learning_rate": 4.4723618090452266e-05, "loss": 1.0791, "step": 111 }, { "epoch": 1.4437299035369775, "grad_norm": 0.7725895643234253, "learning_rate": 4.42211055276382e-05, "loss": 1.0099, "step": 112 }, { "epoch": 1.4565916398713825, "grad_norm": 0.8653061985969543, "learning_rate": 4.3718592964824124e-05, "loss": 1.168, "step": 113 }, { "epoch": 1.4694533762057878, "grad_norm": 0.8689178824424744, "learning_rate": 4.321608040201005e-05, "loss": 1.1026, "step": 114 }, { "epoch": 1.482315112540193, "grad_norm": 0.823765754699707, "learning_rate": 4.271356783919598e-05, "loss": 1.3272, "step": 115 }, { "epoch": 1.495176848874598, "grad_norm": 0.715006947517395, "learning_rate": 4.2211055276381914e-05, "loss": 0.9572, "step": 116 }, { "epoch": 1.5080385852090032, "grad_norm": 0.9381377696990967, "learning_rate": 4.170854271356784e-05, "loss": 1.0988, "step": 117 }, { "epoch": 1.5209003215434085, "grad_norm": 0.7912129759788513, "learning_rate": 4.120603015075377e-05, "loss": 1.0784, "step": 118 }, { "epoch": 1.5337620578778135, "grad_norm": 0.8803995251655579, "learning_rate": 4.07035175879397e-05, "loss": 1.1845, "step": 119 }, { "epoch": 1.5466237942122185, "grad_norm": 0.7034085988998413, "learning_rate": 4.020100502512563e-05, "loss": 1.016, "step": 120 }, { "epoch": 1.5594855305466238, "grad_norm": 0.8203994035720825, "learning_rate": 3.969849246231156e-05, "loss": 1.063, "step": 121 }, { "epoch": 1.572347266881029, "grad_norm": 0.712227463722229, "learning_rate": 3.919597989949749e-05, "loss": 1.033, "step": 122 }, { "epoch": 1.585209003215434, "grad_norm": 0.7563191056251526, "learning_rate": 3.869346733668342e-05, "loss": 1.0089, "step": 123 }, { "epoch": 1.5980707395498392, "grad_norm": 0.8773857355117798, "learning_rate": 3.8190954773869346e-05, "loss": 1.4975, "step": 124 }, { "epoch": 1.6109324758842445, "grad_norm": 0.6820680499076843, "learning_rate": 3.768844221105528e-05, "loss": 0.8996, "step": 125 }, { "epoch": 1.6237942122186495, "grad_norm": 0.7820584774017334, "learning_rate": 3.7185929648241204e-05, "loss": 0.9505, "step": 126 }, { "epoch": 1.6366559485530545, "grad_norm": 0.7456291913986206, "learning_rate": 3.668341708542714e-05, "loss": 0.9537, "step": 127 }, { "epoch": 1.6495176848874598, "grad_norm": 0.9636368155479431, "learning_rate": 3.618090452261307e-05, "loss": 1.1609, "step": 128 }, { "epoch": 1.662379421221865, "grad_norm": 0.83909672498703, "learning_rate": 3.5678391959798995e-05, "loss": 1.0519, "step": 129 }, { "epoch": 1.67524115755627, "grad_norm": 0.9138084053993225, "learning_rate": 3.517587939698493e-05, "loss": 0.977, "step": 130 }, { "epoch": 1.6881028938906752, "grad_norm": 0.8310115337371826, "learning_rate": 3.467336683417085e-05, "loss": 1.0701, "step": 131 }, { 
"epoch": 1.7009646302250805, "grad_norm": 0.8289808630943298, "learning_rate": 3.4170854271356785e-05, "loss": 1.0607, "step": 132 }, { "epoch": 1.7138263665594855, "grad_norm": 0.9004020094871521, "learning_rate": 3.366834170854272e-05, "loss": 1.3209, "step": 133 }, { "epoch": 1.7266881028938905, "grad_norm": 0.7358340620994568, "learning_rate": 3.3165829145728643e-05, "loss": 0.9646, "step": 134 }, { "epoch": 1.739549839228296, "grad_norm": 0.8724610805511475, "learning_rate": 3.2663316582914576e-05, "loss": 0.9834, "step": 135 }, { "epoch": 1.752411575562701, "grad_norm": 0.9433433413505554, "learning_rate": 3.21608040201005e-05, "loss": 0.9332, "step": 136 }, { "epoch": 1.765273311897106, "grad_norm": 0.9258175492286682, "learning_rate": 3.1658291457286434e-05, "loss": 1.0992, "step": 137 }, { "epoch": 1.7781350482315113, "grad_norm": 0.8666226267814636, "learning_rate": 3.1155778894472366e-05, "loss": 1.0032, "step": 138 }, { "epoch": 1.7909967845659165, "grad_norm": 0.9296693205833435, "learning_rate": 3.065326633165829e-05, "loss": 1.082, "step": 139 }, { "epoch": 1.8038585209003215, "grad_norm": 0.8250362873077393, "learning_rate": 3.015075376884422e-05, "loss": 0.8665, "step": 140 }, { "epoch": 1.8167202572347267, "grad_norm": 0.9356509447097778, "learning_rate": 2.9648241206030153e-05, "loss": 1.1933, "step": 141 }, { "epoch": 1.829581993569132, "grad_norm": 0.935892641544342, "learning_rate": 2.914572864321608e-05, "loss": 1.0981, "step": 142 }, { "epoch": 1.842443729903537, "grad_norm": 0.8205680251121521, "learning_rate": 2.8643216080402015e-05, "loss": 1.0501, "step": 143 }, { "epoch": 1.855305466237942, "grad_norm": 0.9360036253929138, "learning_rate": 2.814070351758794e-05, "loss": 1.1749, "step": 144 }, { "epoch": 1.8681672025723473, "grad_norm": 0.9020069241523743, "learning_rate": 2.763819095477387e-05, "loss": 1.0402, "step": 145 }, { "epoch": 1.8810289389067525, "grad_norm": 0.7428980469703674, "learning_rate": 2.7135678391959802e-05, "loss": 0.9236, "step": 146 }, { "epoch": 1.8938906752411575, "grad_norm": 0.8405928611755371, "learning_rate": 2.6633165829145728e-05, "loss": 1.2336, "step": 147 }, { "epoch": 1.9067524115755627, "grad_norm": 0.954319417476654, "learning_rate": 2.613065326633166e-05, "loss": 1.0338, "step": 148 }, { "epoch": 1.919614147909968, "grad_norm": 0.836933970451355, "learning_rate": 2.562814070351759e-05, "loss": 0.9513, "step": 149 }, { "epoch": 1.932475884244373, "grad_norm": 0.8751674294471741, "learning_rate": 2.5125628140703518e-05, "loss": 1.1418, "step": 150 }, { "epoch": 1.945337620578778, "grad_norm": 0.7700965404510498, "learning_rate": 2.462311557788945e-05, "loss": 1.0421, "step": 151 }, { "epoch": 1.9581993569131833, "grad_norm": 0.8978580832481384, "learning_rate": 2.4120603015075376e-05, "loss": 1.124, "step": 152 }, { "epoch": 1.9710610932475885, "grad_norm": 0.8578283786773682, "learning_rate": 2.361809045226131e-05, "loss": 1.1622, "step": 153 }, { "epoch": 1.9839228295819935, "grad_norm": 0.8631901741027832, "learning_rate": 2.3115577889447238e-05, "loss": 1.0854, "step": 154 }, { "epoch": 1.9967845659163987, "grad_norm": 1.506197452545166, "learning_rate": 2.2613065326633167e-05, "loss": 1.5602, "step": 155 }, { "epoch": 2.012861736334405, "grad_norm": 0.8579190969467163, "learning_rate": 2.21105527638191e-05, "loss": 1.0135, "step": 156 }, { "epoch": 2.0257234726688105, "grad_norm": 0.791259229183197, "learning_rate": 2.1608040201005025e-05, "loss": 1.063, "step": 157 }, { "epoch": 2.0385852090032155, "grad_norm": 
0.7824810743331909, "learning_rate": 2.1105527638190957e-05, "loss": 0.9797, "step": 158 }, { "epoch": 2.0514469453376205, "grad_norm": 0.7896823287010193, "learning_rate": 2.0603015075376886e-05, "loss": 0.9882, "step": 159 }, { "epoch": 2.0643086816720255, "grad_norm": 0.8175792098045349, "learning_rate": 2.0100502512562815e-05, "loss": 0.9608, "step": 160 }, { "epoch": 2.077170418006431, "grad_norm": 0.819749116897583, "learning_rate": 1.9597989949748744e-05, "loss": 1.0491, "step": 161 }, { "epoch": 2.090032154340836, "grad_norm": 0.8855689764022827, "learning_rate": 1.9095477386934673e-05, "loss": 1.0228, "step": 162 }, { "epoch": 2.102893890675241, "grad_norm": 0.8208020925521851, "learning_rate": 1.8592964824120602e-05, "loss": 0.9609, "step": 163 }, { "epoch": 2.1157556270096465, "grad_norm": 0.7887352108955383, "learning_rate": 1.8090452261306535e-05, "loss": 0.9935, "step": 164 }, { "epoch": 2.1286173633440515, "grad_norm": 0.7535598874092102, "learning_rate": 1.7587939698492464e-05, "loss": 1.0082, "step": 165 }, { "epoch": 2.1414790996784565, "grad_norm": 0.8124526739120483, "learning_rate": 1.7085427135678393e-05, "loss": 1.0349, "step": 166 }, { "epoch": 2.154340836012862, "grad_norm": 0.8419150114059448, "learning_rate": 1.6582914572864322e-05, "loss": 0.9734, "step": 167 }, { "epoch": 2.167202572347267, "grad_norm": 0.7929818034172058, "learning_rate": 1.608040201005025e-05, "loss": 0.9844, "step": 168 }, { "epoch": 2.180064308681672, "grad_norm": 0.8568313121795654, "learning_rate": 1.5577889447236183e-05, "loss": 0.9512, "step": 169 }, { "epoch": 2.192926045016077, "grad_norm": 0.7779914736747742, "learning_rate": 1.507537688442211e-05, "loss": 1.4218, "step": 170 }, { "epoch": 2.2057877813504825, "grad_norm": 0.8760497570037842, "learning_rate": 1.457286432160804e-05, "loss": 0.9962, "step": 171 }, { "epoch": 2.2186495176848875, "grad_norm": 0.9114797711372375, "learning_rate": 1.407035175879397e-05, "loss": 0.9851, "step": 172 }, { "epoch": 2.2315112540192925, "grad_norm": 0.9769560694694519, "learning_rate": 1.3567839195979901e-05, "loss": 0.9286, "step": 173 }, { "epoch": 2.244372990353698, "grad_norm": 0.8264016509056091, "learning_rate": 1.306532663316583e-05, "loss": 1.105, "step": 174 }, { "epoch": 2.257234726688103, "grad_norm": 0.845151960849762, "learning_rate": 1.2562814070351759e-05, "loss": 1.0246, "step": 175 }, { "epoch": 2.270096463022508, "grad_norm": 0.8815337419509888, "learning_rate": 1.2060301507537688e-05, "loss": 1.123, "step": 176 }, { "epoch": 2.282958199356913, "grad_norm": 0.8236774802207947, "learning_rate": 1.1557788944723619e-05, "loss": 1.0557, "step": 177 }, { "epoch": 2.2958199356913185, "grad_norm": 0.9845472574234009, "learning_rate": 1.105527638190955e-05, "loss": 1.0297, "step": 178 }, { "epoch": 2.3086816720257235, "grad_norm": 0.9996894001960754, "learning_rate": 1.0552763819095479e-05, "loss": 0.9584, "step": 179 }, { "epoch": 2.3215434083601285, "grad_norm": 0.873965859413147, "learning_rate": 1.0050251256281408e-05, "loss": 0.9794, "step": 180 }, { "epoch": 2.334405144694534, "grad_norm": 0.8431399464607239, "learning_rate": 9.547738693467337e-06, "loss": 0.982, "step": 181 }, { "epoch": 2.347266881028939, "grad_norm": 0.8434001207351685, "learning_rate": 9.045226130653267e-06, "loss": 0.9264, "step": 182 }, { "epoch": 2.360128617363344, "grad_norm": 0.9110084176063538, "learning_rate": 8.542713567839196e-06, "loss": 0.9953, "step": 183 }, { "epoch": 2.372990353697749, "grad_norm": 0.7942981719970703, "learning_rate": 
8.040201005025125e-06, "loss": 1.0177, "step": 184 }, { "epoch": 2.3858520900321545, "grad_norm": 0.8465241193771362, "learning_rate": 7.537688442211055e-06, "loss": 0.9383, "step": 185 }, { "epoch": 2.3987138263665595, "grad_norm": 0.9114559888839722, "learning_rate": 7.035175879396985e-06, "loss": 1.0373, "step": 186 }, { "epoch": 2.4115755627009645, "grad_norm": 0.9119040369987488, "learning_rate": 6.532663316582915e-06, "loss": 0.9853, "step": 187 }, { "epoch": 2.42443729903537, "grad_norm": 0.8535617589950562, "learning_rate": 6.030150753768844e-06, "loss": 1.0043, "step": 188 }, { "epoch": 2.437299035369775, "grad_norm": 0.8427352905273438, "learning_rate": 5.527638190954775e-06, "loss": 0.9835, "step": 189 }, { "epoch": 2.45016077170418, "grad_norm": 0.8029307723045349, "learning_rate": 5.025125628140704e-06, "loss": 1.0006, "step": 190 }, { "epoch": 2.463022508038585, "grad_norm": 0.7396455407142639, "learning_rate": 4.522613065326634e-06, "loss": 1.2503, "step": 191 }, { "epoch": 2.4758842443729905, "grad_norm": 0.9325175881385803, "learning_rate": 4.020100502512563e-06, "loss": 1.014, "step": 192 }, { "epoch": 2.4887459807073955, "grad_norm": 0.8711210489273071, "learning_rate": 3.5175879396984926e-06, "loss": 0.8779, "step": 193 }, { "epoch": 2.5016077170418005, "grad_norm": 0.9022113680839539, "learning_rate": 3.015075376884422e-06, "loss": 1.0297, "step": 194 }, { "epoch": 2.514469453376206, "grad_norm": 0.8506854176521301, "learning_rate": 2.512562814070352e-06, "loss": 0.8196, "step": 195 }, { "epoch": 2.527331189710611, "grad_norm": 0.9505279064178467, "learning_rate": 2.0100502512562813e-06, "loss": 0.9124, "step": 196 }, { "epoch": 2.540192926045016, "grad_norm": 0.8015897870063782, "learning_rate": 1.507537688442211e-06, "loss": 0.9492, "step": 197 }, { "epoch": 2.553054662379421, "grad_norm": 0.8406469821929932, "learning_rate": 1.0050251256281407e-06, "loss": 0.9474, "step": 198 }, { "epoch": 2.5659163987138265, "grad_norm": 0.8296630382537842, "learning_rate": 5.025125628140703e-07, "loss": 0.9759, "step": 199 }, { "epoch": 2.5787781350482315, "grad_norm": 0.8576996922492981, "learning_rate": 0.0, "loss": 0.849, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.879413794353971e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }
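
The object above appears to be the state file that the Hugging Face Trainer writes alongside a checkpoint (commonly trainer_state.json): log_history holds one record per logging step (logging_steps is 1 here) with the fractional epoch, gradient norm, learning rate, and training loss, while global_step, max_steps, and total_flos describe the run as a whole. The learning rate decays from 1e-4 to 0 in equal increments over the 200 steps, consistent with a linear schedule. Below is a minimal sketch of how such a file can be summarized; the filename "trainer_state.json" is an assumption, not something stated in the dump.

# A minimal sketch, assuming the JSON above is saved as "trainer_state.json"
# (hypothetical path) in the standard Hugging Face Trainer state layout.
# It groups log_history records by whole epoch and prints the mean training loss.
import json
from collections import defaultdict

with open("trainer_state.json") as f:  # assumed filename
    state = json.load(f)

history = state["log_history"]
losses_by_epoch = defaultdict(list)
for record in history:
    # "epoch" is a fraction (e.g. 1.31); bucket records by its integer part.
    losses_by_epoch[int(record["epoch"])].append(record["loss"])

for epoch, losses in sorted(losses_by_epoch.items()):
    mean_loss = sum(losses) / len(losses)
    print(f"epoch {epoch}: mean loss {mean_loss:.4f} over {len(losses)} logged steps")

print("final step:", state["global_step"], "- final loss:", history[-1]["loss"])

Run against the data above, this would report the mean loss falling from roughly 1.5 in epoch 0 to about 1.0 in epoch 2, matching the per-step trend visible in log_history.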