{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02092050209205021, "grad_norm": 0.07870777149790437, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.6306002140045166, "logits/rejected": -2.576826572418213, "logps/chosen": -1.0156770944595337, "logps/rejected": -1.187302827835083, "loss": 0.6931, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 4.107605946046533e-06, "rewards/margins": 6.403818588296417e-06, "rewards/rejected": -2.296213096997235e-06, "step": 10 }, { "epoch": 0.04184100418410042, "grad_norm": 0.07901324239389455, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.6465299129486084, "logits/rejected": -2.615328311920166, "logps/chosen": -1.072858452796936, "logps/rejected": -1.1568002700805664, "loss": 0.6931, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 5.4746556997997686e-05, "rewards/margins": 2.0796986063942313e-05, "rewards/rejected": 3.394957457203418e-05, "step": 20 }, { "epoch": 0.06276150627615062, "grad_norm": 0.06023523282455774, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.6788735389709473, "logits/rejected": -2.601842164993286, "logps/chosen": -0.9440663456916809, "logps/rejected": -1.1466290950775146, "loss": 0.6931, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0002553145168349147, "rewards/margins": 6.28855632385239e-05, "rewards/rejected": 0.00019242893904447556, "step": 30 }, { "epoch": 0.08368200836820083, "grad_norm": 0.07304770397797064, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.655829906463623, "logits/rejected": -2.574197292327881, "logps/chosen": -0.9297592043876648, "logps/rejected": -1.1075788736343384, "loss": 0.6929, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0005508430185727775, "rewards/margins": 7.169989112298936e-05, "rewards/rejected": 0.0004791432002093643, "step": 40 }, { "epoch": 0.10460251046025104, "grad_norm": 0.10718963835004064, "learning_rate": 4.999733114418725e-07, "logits/chosen": -2.5783586502075195, "logits/rejected": -2.5689890384674072, "logps/chosen": -1.1152536869049072, "logps/rejected": -1.2834819555282593, "loss": 0.6926, "rewards/accuracies": 0.6875, "rewards/chosen": 0.00013837238657288253, "rewards/margins": 0.0015539798187091947, "rewards/rejected": -0.0014156072866171598, "step": 50 }, { "epoch": 0.12552301255230125, "grad_norm": 0.746842093919948, "learning_rate": 4.990398100856366e-07, "logits/chosen": -2.4468166828155518, "logits/rejected": -2.4004383087158203, "logps/chosen": -1.0850234031677246, "logps/rejected": -1.3491606712341309, "loss": 0.6921, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.0004565033013932407, "rewards/margins": 0.00173471518792212, "rewards/rejected": -0.0021912185475230217, "step": 60 }, { "epoch": 0.14644351464435146, "grad_norm": 0.4410404107128935, "learning_rate": 4.967775735898179e-07, "logits/chosen": -1.8864914178848267, "logits/rejected": -1.7537386417388916, "logps/chosen": -1.3654557466506958, "logps/rejected": -1.9325025081634521, "loss": 0.6915, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0032336972653865814, "rewards/margins": 0.004393851850181818, "rewards/rejected": -0.007627549115568399, "step": 70 }, { "epoch": 0.16736401673640167, "grad_norm": 1.1545608141293162, "learning_rate": 4.931986719649298e-07, "logits/chosen": 0.8977311253547668, "logits/rejected": 0.8506044149398804, "logps/chosen": -1.9253437519073486, "logps/rejected": -2.216801643371582, "loss": 0.6897, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.008814895525574684, "rewards/margins": 0.0028276063967496157, "rewards/rejected": -0.01164250262081623, "step": 80 }, { "epoch": 0.18828451882845187, "grad_norm": 0.6816222180173854, "learning_rate": 4.883222001996351e-07, "logits/chosen": 2.8068461418151855, "logits/rejected": 3.1994242668151855, "logps/chosen": -3.0255942344665527, "logps/rejected": -4.264664649963379, "loss": 0.6871, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.019984012469649315, "rewards/margins": 0.011252423748373985, "rewards/rejected": -0.0312364362180233, "step": 90 }, { "epoch": 0.20920502092050208, "grad_norm": 2.826730795468728, "learning_rate": 4.821741763807186e-07, "logits/chosen": 3.5266506671905518, "logits/rejected": 4.590703010559082, "logps/chosen": -8.408990859985352, "logps/rejected": -11.962701797485352, "loss": 0.6803, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.07384248822927475, "rewards/margins": 0.0347314216196537, "rewards/rejected": -0.10857391357421875, "step": 100 }, { "epoch": 0.20920502092050208, "eval_logits/chosen": 4.672222137451172, "eval_logits/rejected": 5.161917209625244, "eval_logps/chosen": -19.255094528198242, "eval_logps/rejected": -23.496896743774414, "eval_loss": 0.6740710735321045, "eval_rewards/accuracies": 0.60546875, "eval_rewards/chosen": -0.18297159671783447, "eval_rewards/margins": 0.04080774635076523, "eval_rewards/rejected": -0.2237793505191803, "eval_runtime": 101.5736, "eval_samples_per_second": 19.69, "eval_steps_per_second": 0.315, "step": 100 }, { "epoch": 0.2301255230125523, "grad_norm": 3.6684021455052416, "learning_rate": 4.747874028753375e-07, "logits/chosen": 4.655301094055176, "logits/rejected": 5.133517265319824, "logps/chosen": -21.937213897705078, "logps/rejected": -27.49126625061035, "loss": 0.6637, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.20875534415245056, "rewards/margins": 0.05406813696026802, "rewards/rejected": -0.2628234922885895, "step": 110 }, { "epoch": 0.2510460251046025, "grad_norm": 3.7695575614366703, "learning_rate": 4.662012913161997e-07, "logits/chosen": 3.5462818145751953, "logits/rejected": 4.2834906578063965, "logps/chosen": -20.185375213623047, "logps/rejected": -30.037878036499023, "loss": 0.6586, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.19164811074733734, "rewards/margins": 0.09708912670612335, "rewards/rejected": -0.2887372672557831, "step": 120 }, { "epoch": 0.2719665271966527, "grad_norm": 5.06646121428358, "learning_rate": 4.5646165232345103e-07, "logits/chosen": 3.5566489696502686, "logits/rejected": 3.932034969329834, "logps/chosen": -26.674022674560547, "logps/rejected": -33.86052703857422, "loss": 0.6538, "rewards/accuracies": 0.59375, "rewards/chosen": -0.25690948963165283, "rewards/margins": 0.0707937479019165, "rewards/rejected": -0.32770323753356934, "step": 130 }, { "epoch": 0.2928870292887029, "grad_norm": 6.761925793876663, "learning_rate": 4.456204510851956e-07, "logits/chosen": 3.1402018070220947, "logits/rejected": 3.708683490753174, "logps/chosen": -21.81682777404785, "logps/rejected": -37.612449645996094, "loss": 0.6431, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.20800963044166565, "rewards/margins": 0.15619885921478271, "rewards/rejected": -0.36420848965644836, "step": 140 }, { "epoch": 0.3138075313807531, "grad_norm": 8.735857187801553, "learning_rate": 4.337355301007335e-07, "logits/chosen": 3.099961519241333, "logits/rejected": 3.81238055229187, "logps/chosen": -23.575672149658203, "logps/rejected": -36.89662551879883, "loss": 0.6395, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.2260146588087082, "rewards/margins": 0.1314857304096222, "rewards/rejected": -0.3575003743171692, "step": 150 }, { "epoch": 0.33472803347280333, "grad_norm": 6.923549888626696, "learning_rate": 4.2087030056579986e-07, "logits/chosen": 3.2462105751037598, "logits/rejected": 3.581221103668213, "logps/chosen": -21.311235427856445, "logps/rejected": -36.60784149169922, "loss": 0.6369, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.2027742564678192, "rewards/margins": 0.151877760887146, "rewards/rejected": -0.3546520173549652, "step": 160 }, { "epoch": 0.35564853556485354, "grad_norm": 7.240479894508596, "learning_rate": 4.070934040463998e-07, "logits/chosen": 3.023836612701416, "logits/rejected": 3.3588485717773438, "logps/chosen": -23.696680068969727, "logps/rejected": -36.59798049926758, "loss": 0.6339, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.22723570466041565, "rewards/margins": 0.12730170786380768, "rewards/rejected": -0.3545374274253845, "step": 170 }, { "epoch": 0.37656903765690375, "grad_norm": 9.14137324995646, "learning_rate": 3.9247834624635404e-07, "logits/chosen": 2.783609390258789, "logits/rejected": 3.172799587249756, "logps/chosen": -22.1041259765625, "logps/rejected": -37.87432098388672, "loss": 0.6295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2107868194580078, "rewards/margins": 0.15683028101921082, "rewards/rejected": -0.36761707067489624, "step": 180 }, { "epoch": 0.39748953974895396, "grad_norm": 9.772844987285321, "learning_rate": 3.7710310482256523e-07, "logits/chosen": 2.702998399734497, "logits/rejected": 2.8405003547668457, "logps/chosen": -26.399200439453125, "logps/rejected": -40.94459915161133, "loss": 0.6258, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.25330352783203125, "rewards/margins": 0.14452621340751648, "rewards/rejected": -0.3978297710418701, "step": 190 }, { "epoch": 0.41841004184100417, "grad_norm": 8.573703981557731, "learning_rate": 3.610497133404795e-07, "logits/chosen": 2.867384433746338, "logits/rejected": 3.402123212814331, "logps/chosen": -25.27474021911621, "logps/rejected": -39.822689056396484, "loss": 0.6333, "rewards/accuracies": 0.71875, "rewards/chosen": -0.24253730475902557, "rewards/margins": 0.14454875886440277, "rewards/rejected": -0.38708609342575073, "step": 200 }, { "epoch": 0.41841004184100417, "eval_logits/chosen": 2.205869197845459, "eval_logits/rejected": 2.0618207454681396, "eval_logps/chosen": -27.025375366210938, "eval_logps/rejected": -43.748695373535156, "eval_loss": 0.6317887902259827, "eval_rewards/accuracies": 0.70703125, "eval_rewards/chosen": -0.2606744170188904, "eval_rewards/margins": 0.16562291979789734, "eval_rewards/rejected": -0.4262973666191101, "eval_runtime": 102.633, "eval_samples_per_second": 19.487, "eval_steps_per_second": 0.312, "step": 200 }, { "epoch": 0.4393305439330544, "grad_norm": 10.103691000737252, "learning_rate": 3.4440382358952115e-07, "logits/chosen": 2.632490634918213, "logits/rejected": 2.8943495750427246, "logps/chosen": -26.576526641845703, "logps/rejected": -39.573448181152344, "loss": 0.6294, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2549188733100891, "rewards/margins": 0.12898248434066772, "rewards/rejected": -0.38390129804611206, "step": 210 }, { "epoch": 0.4602510460251046, "grad_norm": 8.68088284973258, "learning_rate": 3.272542485937368e-07, "logits/chosen": 2.8408799171447754, "logits/rejected": 3.1735482215881348, "logps/chosen": -28.81484031677246, "logps/rejected": -44.7733268737793, "loss": 0.6277, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2777389883995056, "rewards/margins": 0.15828576683998108, "rewards/rejected": -0.43602481484413147, "step": 220 }, { "epoch": 0.4811715481171548, "grad_norm": 9.381367466525985, "learning_rate": 3.096924887558854e-07, "logits/chosen": 2.756274700164795, "logits/rejected": 3.042253017425537, "logps/chosen": -20.681415557861328, "logps/rejected": -41.724674224853516, "loss": 0.6228, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.195968359708786, "rewards/margins": 0.20941869914531708, "rewards/rejected": -0.4053870737552643, "step": 230 }, { "epoch": 0.502092050209205, "grad_norm": 9.698557591113461, "learning_rate": 2.9181224366319943e-07, "logits/chosen": 2.9523372650146484, "logits/rejected": 3.568504810333252, "logps/chosen": -25.638051986694336, "logps/rejected": -40.016014099121094, "loss": 0.6288, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.24575933814048767, "rewards/margins": 0.14226052165031433, "rewards/rejected": -0.3880198299884796, "step": 240 }, { "epoch": 0.5230125523012552, "grad_norm": 7.996022666238754, "learning_rate": 2.7370891215954565e-07, "logits/chosen": 2.842616558074951, "logits/rejected": 3.386730670928955, "logps/chosen": -16.753459930419922, "logps/rejected": -32.727378845214844, "loss": 0.6271, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.157147616147995, "rewards/margins": 0.15919244289398193, "rewards/rejected": -0.31634002923965454, "step": 250 }, { "epoch": 0.5439330543933054, "grad_norm": 9.49367392875526, "learning_rate": 2.55479083351317e-07, "logits/chosen": 2.8096323013305664, "logits/rejected": 2.8222105503082275, "logps/chosen": -26.005813598632812, "logps/rejected": -45.71054458618164, "loss": 0.6241, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.24941372871398926, "rewards/margins": 0.19658246636390686, "rewards/rejected": -0.4459961950778961, "step": 260 }, { "epoch": 0.5648535564853556, "grad_norm": 14.638817153423537, "learning_rate": 2.3722002126275822e-07, "logits/chosen": 2.8568336963653564, "logits/rejected": 3.19636869430542, "logps/chosen": -22.6005916595459, "logps/rejected": -40.90196990966797, "loss": 0.6224, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.21595272421836853, "rewards/margins": 0.1823262870311737, "rewards/rejected": -0.3982790410518646, "step": 270 }, { "epoch": 0.5857740585774058, "grad_norm": 10.46787470104777, "learning_rate": 2.19029145890313e-07, "logits/chosen": 2.681917428970337, "logits/rejected": 2.849825382232666, "logps/chosen": -21.89919090270996, "logps/rejected": -37.612815856933594, "loss": 0.6265, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2091476172208786, "rewards/margins": 0.1555161476135254, "rewards/rejected": -0.3646637797355652, "step": 280 }, { "epoch": 0.606694560669456, "grad_norm": 7.74921044050114, "learning_rate": 2.0100351342479216e-07, "logits/chosen": 2.8385300636291504, "logits/rejected": 2.8731513023376465, "logps/chosen": -24.38838768005371, "logps/rejected": -42.81266784667969, "loss": 0.6231, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.23373980820178986, "rewards/margins": 0.18318690359592438, "rewards/rejected": -0.41692671179771423, "step": 290 }, { "epoch": 0.6276150627615062, "grad_norm": 8.46554493519494, "learning_rate": 1.8323929841460178e-07, "logits/chosen": 3.3211822509765625, "logits/rejected": 3.4708034992218018, "logps/chosen": -22.359912872314453, "logps/rejected": -38.96550369262695, "loss": 0.6202, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.21357199549674988, "rewards/margins": 0.16511210799217224, "rewards/rejected": -0.3786841034889221, "step": 300 }, { "epoch": 0.6276150627615062, "eval_logits/chosen": 2.999788284301758, "eval_logits/rejected": 3.0356078147888184, "eval_logps/chosen": -20.559364318847656, "eval_logps/rejected": -40.0822639465332, "eval_loss": 0.6255786418914795, "eval_rewards/accuracies": 0.71875, "eval_rewards/chosen": -0.19601428508758545, "eval_rewards/margins": 0.1936187446117401, "eval_rewards/rejected": -0.38963305950164795, "eval_runtime": 102.0512, "eval_samples_per_second": 19.598, "eval_steps_per_second": 0.314, "step": 300 }, { "epoch": 0.6485355648535565, "grad_norm": 9.417564911888258, "learning_rate": 1.6583128063291573e-07, "logits/chosen": 2.8539745807647705, "logits/rejected": 3.0914368629455566, "logps/chosen": -20.804203033447266, "logps/rejected": -41.28700637817383, "loss": 0.621, "rewards/accuracies": 0.75, "rewards/chosen": -0.1979689598083496, "rewards/margins": 0.20332905650138855, "rewards/rejected": -0.40129804611206055, "step": 310 }, { "epoch": 0.6694560669456067, "grad_norm": 8.668181751551456, "learning_rate": 1.488723393865766e-07, "logits/chosen": 2.763029098510742, "logits/rejected": 3.5786728858947754, "logps/chosen": -25.161691665649414, "logps/rejected": -41.82555389404297, "loss": 0.6159, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.24154594540596008, "rewards/margins": 0.16512060165405273, "rewards/rejected": -0.4066665768623352, "step": 320 }, { "epoch": 0.6903765690376569, "grad_norm": 9.135541705526716, "learning_rate": 1.3245295796480788e-07, "logits/chosen": 2.6453309059143066, "logits/rejected": 3.213120937347412, "logps/chosen": -28.319080352783203, "logps/rejected": -45.0162467956543, "loss": 0.6209, "rewards/accuracies": 0.65625, "rewards/chosen": -0.27260464429855347, "rewards/margins": 0.16600076854228973, "rewards/rejected": -0.438605397939682, "step": 330 }, { "epoch": 0.7112970711297071, "grad_norm": 9.485727128957436, "learning_rate": 1.1666074087171627e-07, "logits/chosen": 3.553588390350342, "logits/rejected": 3.5263543128967285, "logps/chosen": -23.71384048461914, "logps/rejected": -40.1160774230957, "loss": 0.6268, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.22761042416095734, "rewards/margins": 0.16276967525482178, "rewards/rejected": -0.3903800845146179, "step": 340 }, { "epoch": 0.7322175732217573, "grad_norm": 9.579729502767362, "learning_rate": 1.0157994641835734e-07, "logits/chosen": 3.2801671028137207, "logits/rejected": 3.6302173137664795, "logps/chosen": -23.160703659057617, "logps/rejected": -39.75481414794922, "loss": 0.6148, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.22147643566131592, "rewards/margins": 0.1643747091293335, "rewards/rejected": -0.3858511745929718, "step": 350 }, { "epoch": 0.7531380753138075, "grad_norm": 9.56792620122, "learning_rate": 8.729103716819111e-08, "logits/chosen": 2.997981548309326, "logits/rejected": 3.285050868988037, "logps/chosen": -25.180973052978516, "logps/rejected": -41.596805572509766, "loss": 0.6213, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.2415219247341156, "rewards/margins": 0.16281357407569885, "rewards/rejected": -0.40433549880981445, "step": 360 }, { "epoch": 0.7740585774058577, "grad_norm": 9.46320162833878, "learning_rate": 7.387025063449081e-08, "logits/chosen": 2.827638864517212, "logits/rejected": 3.168362855911255, "logps/chosen": -20.08965301513672, "logps/rejected": -41.27653121948242, "loss": 0.6151, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1905418187379837, "rewards/margins": 0.21073243021965027, "rewards/rejected": -0.40127426385879517, "step": 370 }, { "epoch": 0.7949790794979079, "grad_norm": 11.287573566387135, "learning_rate": 6.138919252022435e-08, "logits/chosen": 2.806427001953125, "logits/rejected": 2.9862289428710938, "logps/chosen": -26.241718292236328, "logps/rejected": -40.23133850097656, "loss": 0.6247, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.25202974677085876, "rewards/margins": 0.1393345296382904, "rewards/rejected": -0.39136427640914917, "step": 380 }, { "epoch": 0.8158995815899581, "grad_norm": 13.141931276110663, "learning_rate": 4.991445467064689e-08, "logits/chosen": 3.050666093826294, "logits/rejected": 3.1723926067352295, "logps/chosen": -28.155725479125977, "logps/rejected": -46.16926193237305, "loss": 0.6197, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2713962495326996, "rewards/margins": 0.17873115837574005, "rewards/rejected": -0.45012742280960083, "step": 390 }, { "epoch": 0.8368200836820083, "grad_norm": 7.223640139685516, "learning_rate": 3.9507259776993954e-08, "logits/chosen": 2.904824733734131, "logits/rejected": 2.7292914390563965, "logps/chosen": -28.01041603088379, "logps/rejected": -43.947017669677734, "loss": 0.6195, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.27024754881858826, "rewards/margins": 0.1586446762084961, "rewards/rejected": -0.42889222502708435, "step": 400 }, { "epoch": 0.8368200836820083, "eval_logits/chosen": 2.7595362663269043, "eval_logits/rejected": 2.7050070762634277, "eval_logps/chosen": -23.82366180419922, "eval_logps/rejected": -43.26374435424805, "eval_loss": 0.6218914985656738, "eval_rewards/accuracies": 0.71875, "eval_rewards/chosen": -0.2286572903394699, "eval_rewards/margins": 0.19279056787490845, "eval_rewards/rejected": -0.42144784331321716, "eval_runtime": 101.9605, "eval_samples_per_second": 19.615, "eval_steps_per_second": 0.314, "step": 400 }, { "epoch": 0.8577405857740585, "grad_norm": 9.798220212045168, "learning_rate": 3.022313472693447e-08, "logits/chosen": 2.254669427871704, "logits/rejected": 2.458228588104248, "logps/chosen": -25.69635009765625, "logps/rejected": -43.98773956298828, "loss": 0.6191, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.24772438406944275, "rewards/margins": 0.18161071836948395, "rewards/rejected": -0.42933517694473267, "step": 410 }, { "epoch": 0.8786610878661087, "grad_norm": 9.457071183698215, "learning_rate": 2.2111614344599684e-08, "logits/chosen": 2.807976484298706, "logits/rejected": 3.5379626750946045, "logps/chosen": -23.7137393951416, "logps/rejected": -40.27167510986328, "loss": 0.6151, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.22669024765491486, "rewards/margins": 0.16417010128498077, "rewards/rejected": -0.390860378742218, "step": 420 }, { "epoch": 0.899581589958159, "grad_norm": 11.679593436076072, "learning_rate": 1.521597710086439e-08, "logits/chosen": 2.7207865715026855, "logits/rejected": 3.395256757736206, "logps/chosen": -21.971521377563477, "logps/rejected": -40.920936584472656, "loss": 0.6232, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.20926229655742645, "rewards/margins": 0.18839535117149353, "rewards/rejected": -0.3976576328277588, "step": 430 }, { "epoch": 0.9205020920502092, "grad_norm": 8.417656516106161, "learning_rate": 9.57301420397924e-09, "logits/chosen": 3.1824350357055664, "logits/rejected": 3.9073855876922607, "logps/chosen": -25.478689193725586, "logps/rejected": -40.70270538330078, "loss": 0.6223, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.24391062557697296, "rewards/margins": 0.15080411732196808, "rewards/rejected": -0.3947147727012634, "step": 440 }, { "epoch": 0.9414225941422594, "grad_norm": 8.425685268345296, "learning_rate": 5.212833302556258e-09, "logits/chosen": 3.0830206871032715, "logits/rejected": 3.034968614578247, "logps/chosen": -21.253185272216797, "logps/rejected": -42.495853424072266, "loss": 0.6214, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.20228211581707, "rewards/margins": 0.21119949221611023, "rewards/rejected": -0.41348162293434143, "step": 450 }, { "epoch": 0.9623430962343096, "grad_norm": 7.6775915287366585, "learning_rate": 2.158697848236607e-09, "logits/chosen": 3.344025135040283, "logits/rejected": 3.607654571533203, "logps/chosen": -26.998760223388672, "logps/rejected": -40.048553466796875, "loss": 0.6173, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.26005202531814575, "rewards/margins": 0.12917304039001465, "rewards/rejected": -0.3892250657081604, "step": 460 }, { "epoch": 0.9832635983263598, "grad_norm": 9.905482891237243, "learning_rate": 4.269029751107489e-10, "logits/chosen": 3.274850368499756, "logits/rejected": 3.1045289039611816, "logps/chosen": -22.779346466064453, "logps/rejected": -39.72507095336914, "loss": 0.6284, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.217888742685318, "rewards/margins": 0.168381929397583, "rewards/rejected": -0.386270672082901, "step": 470 }, { "epoch": 1.0, "step": 478, "total_flos": 0.0, "train_loss": 0.6403242514223234, "train_runtime": 11960.3252, "train_samples_per_second": 5.111, "train_steps_per_second": 0.04 } ], "logging_steps": 10, "max_steps": 478, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }