{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998441639395356, "eval_steps": 500, "global_step": 401, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0024933769674302633, "grad_norm": 133.0, "learning_rate": 4.375e-08, "logits/chosen": 0.9333375096321106, "logits/rejected": 0.8665135502815247, "logps/chosen": -1.574099063873291, "logps/rejected": -1.2997534275054932, "loss": 6.6643, "rewards/accuracies": 0.46875, "rewards/chosen": -15.74099063873291, "rewards/margins": -2.7434558868408203, "rewards/rejected": -12.997533798217773, "step": 1 }, { "epoch": 0.004986753934860527, "grad_norm": 78.5, "learning_rate": 8.75e-08, "logits/chosen": 1.007162094116211, "logits/rejected": 0.9319976568222046, "logps/chosen": -1.5873029232025146, "logps/rejected": -1.1813093423843384, "loss": 6.8875, "rewards/accuracies": 0.5, "rewards/chosen": -15.873027801513672, "rewards/margins": -4.059934616088867, "rewards/rejected": -11.813094139099121, "step": 2 }, { "epoch": 0.0074801309022907905, "grad_norm": 145.0, "learning_rate": 1.3125e-07, "logits/chosen": 1.015642523765564, "logits/rejected": 0.8658874034881592, "logps/chosen": -2.187445640563965, "logps/rejected": -1.3217400312423706, "loss": 10.8975, "rewards/accuracies": 0.34375, "rewards/chosen": -21.87445831298828, "rewards/margins": -8.657057762145996, "rewards/rejected": -13.217399597167969, "step": 3 }, { "epoch": 0.009973507869721053, "grad_norm": 80.5, "learning_rate": 1.75e-07, "logits/chosen": 1.0409551858901978, "logits/rejected": 0.9476256966590881, "logps/chosen": -1.4537204504013062, "logps/rejected": -1.1356033086776733, "loss": 5.5006, "rewards/accuracies": 0.5625, "rewards/chosen": -14.537205696105957, "rewards/margins": -3.181171417236328, "rewards/rejected": -11.356032371520996, "step": 4 }, { "epoch": 0.012466884837151316, "grad_norm": 58.75, "learning_rate": 2.1875e-07, "logits/chosen": 0.9139229655265808, "logits/rejected": 1.0454109907150269, "logps/chosen": -1.550325632095337, "logps/rejected": -1.160091757774353, "loss": 6.5305, "rewards/accuracies": 0.40625, "rewards/chosen": -15.503257751464844, "rewards/margins": -3.902338981628418, "rewards/rejected": -11.60091781616211, "step": 5 }, { "epoch": 0.014960261804581581, "grad_norm": 123.5, "learning_rate": 2.625e-07, "logits/chosen": 0.9995524883270264, "logits/rejected": 0.9891590476036072, "logps/chosen": -2.1153974533081055, "logps/rejected": -1.3793702125549316, "loss": 10.0926, "rewards/accuracies": 0.46875, "rewards/chosen": -21.153976440429688, "rewards/margins": -7.360274314880371, "rewards/rejected": -13.793702125549316, "step": 6 }, { "epoch": 0.017453638772011844, "grad_norm": 122.5, "learning_rate": 3.0625e-07, "logits/chosen": 0.9477364420890808, "logits/rejected": 0.8998125791549683, "logps/chosen": -2.1086533069610596, "logps/rejected": -1.1814693212509155, "loss": 10.9058, "rewards/accuracies": 0.3125, "rewards/chosen": -21.086532592773438, "rewards/margins": -9.271841049194336, "rewards/rejected": -11.814691543579102, "step": 7 }, { "epoch": 0.019947015739442107, "grad_norm": 117.5, "learning_rate": 3.5e-07, "logits/chosen": 0.9074363112449646, "logits/rejected": 1.0108835697174072, "logps/chosen": -2.171971082687378, "logps/rejected": -1.2922351360321045, "loss": 11.804, "rewards/accuracies": 0.375, "rewards/chosen": -21.719709396362305, "rewards/margins": -8.797359466552734, "rewards/rejected": -12.922350883483887, "step": 8 }, { "epoch": 0.02244039270687237, "grad_norm": 92.0, "learning_rate": 3.9375e-07, "logits/chosen": 0.9447617530822754, "logits/rejected": 0.8549212217330933, "logps/chosen": -2.003368616104126, "logps/rejected": -1.2754697799682617, "loss": 9.4959, "rewards/accuracies": 0.40625, "rewards/chosen": -20.033687591552734, "rewards/margins": -7.278989791870117, "rewards/rejected": -12.754697799682617, "step": 9 }, { "epoch": 0.024933769674302633, "grad_norm": 98.5, "learning_rate": 4.375e-07, "logits/chosen": 1.009035587310791, "logits/rejected": 0.895173192024231, "logps/chosen": -1.9958442449569702, "logps/rejected": -1.3750892877578735, "loss": 8.6131, "rewards/accuracies": 0.4375, "rewards/chosen": -19.95844078063965, "rewards/margins": -6.207549095153809, "rewards/rejected": -13.750892639160156, "step": 10 }, { "epoch": 0.027427146641732895, "grad_norm": 110.0, "learning_rate": 4.812499999999999e-07, "logits/chosen": 0.9430880546569824, "logits/rejected": 0.9480469226837158, "logps/chosen": -2.0413639545440674, "logps/rejected": -1.3464946746826172, "loss": 9.6928, "rewards/accuracies": 0.40625, "rewards/chosen": -20.413639068603516, "rewards/margins": -6.948694229125977, "rewards/rejected": -13.464945793151855, "step": 11 }, { "epoch": 0.029920523609163162, "grad_norm": 152.0, "learning_rate": 5.25e-07, "logits/chosen": 0.9941633343696594, "logits/rejected": 0.7915381193161011, "logps/chosen": -2.5496878623962402, "logps/rejected": -1.5264402627944946, "loss": 12.4115, "rewards/accuracies": 0.34375, "rewards/chosen": -25.496877670288086, "rewards/margins": -10.232475280761719, "rewards/rejected": -15.26440143585205, "step": 12 }, { "epoch": 0.03241390057659342, "grad_norm": 78.5, "learning_rate": 5.6875e-07, "logits/chosen": 0.8952471017837524, "logits/rejected": 0.8926589488983154, "logps/chosen": -1.597143530845642, "logps/rejected": -1.355407476425171, "loss": 6.8413, "rewards/accuracies": 0.59375, "rewards/chosen": -15.971436500549316, "rewards/margins": -2.41736102104187, "rewards/rejected": -13.554075241088867, "step": 13 }, { "epoch": 0.03490727754402369, "grad_norm": 128.0, "learning_rate": 6.125e-07, "logits/chosen": 1.050255537033081, "logits/rejected": 0.8761364221572876, "logps/chosen": -1.834416151046753, "logps/rejected": -1.298659324645996, "loss": 7.9545, "rewards/accuracies": 0.46875, "rewards/chosen": -18.344160079956055, "rewards/margins": -5.357568740844727, "rewards/rejected": -12.986591339111328, "step": 14 }, { "epoch": 0.03740065451145395, "grad_norm": 133.0, "learning_rate": 6.5625e-07, "logits/chosen": 1.0313760042190552, "logits/rejected": 0.914068341255188, "logps/chosen": -2.0879173278808594, "logps/rejected": -1.2106117010116577, "loss": 10.301, "rewards/accuracies": 0.34375, "rewards/chosen": -20.879173278808594, "rewards/margins": -8.773056030273438, "rewards/rejected": -12.10611629486084, "step": 15 }, { "epoch": 0.039894031478884213, "grad_norm": 56.5, "learning_rate": 7e-07, "logits/chosen": 0.9663585424423218, "logits/rejected": 0.9516808986663818, "logps/chosen": -1.7078903913497925, "logps/rejected": -1.2222994565963745, "loss": 7.0132, "rewards/accuracies": 0.4375, "rewards/chosen": -17.078907012939453, "rewards/margins": -4.8559112548828125, "rewards/rejected": -12.222993850708008, "step": 16 }, { "epoch": 0.04238740844631448, "grad_norm": 96.5, "learning_rate": 6.999883476391534e-07, "logits/chosen": 1.0192354917526245, "logits/rejected": 0.9732477068901062, "logps/chosen": -1.774751901626587, "logps/rejected": -1.0946956872940063, "loss": 8.648, "rewards/accuracies": 0.40625, "rewards/chosen": -17.747520446777344, "rewards/margins": -6.800562858581543, "rewards/rejected": -10.946956634521484, "step": 17 }, { "epoch": 0.04488078541374474, "grad_norm": 121.0, "learning_rate": 6.999533913324853e-07, "logits/chosen": 0.981746256351471, "logits/rejected": 0.9062566757202148, "logps/chosen": -2.0760321617126465, "logps/rejected": -2.2810633182525635, "loss": 10.1792, "rewards/accuracies": 0.3125, "rewards/chosen": -20.76032066345215, "rewards/margins": 2.050312042236328, "rewards/rejected": -22.810632705688477, "step": 18 }, { "epoch": 0.047374162381175006, "grad_norm": 67.0, "learning_rate": 6.998951334075586e-07, "logits/chosen": 1.0017695426940918, "logits/rejected": 0.9386453032493591, "logps/chosen": -1.5593485832214355, "logps/rejected": -1.3584306240081787, "loss": 5.5006, "rewards/accuracies": 0.625, "rewards/chosen": -15.593484878540039, "rewards/margins": -2.0091779232025146, "rewards/rejected": -13.584305763244629, "step": 19 }, { "epoch": 0.049867539348605265, "grad_norm": 83.5, "learning_rate": 6.998135777434723e-07, "logits/chosen": 0.9819589853286743, "logits/rejected": 0.9480808973312378, "logps/chosen": -1.7974135875701904, "logps/rejected": -1.2320420742034912, "loss": 8.011, "rewards/accuracies": 0.46875, "rewards/chosen": -17.974136352539062, "rewards/margins": -5.653716564178467, "rewards/rejected": -12.32042121887207, "step": 20 }, { "epoch": 0.05236091631603553, "grad_norm": 123.0, "learning_rate": 6.99708729770604e-07, "logits/chosen": 0.9151750802993774, "logits/rejected": 0.9027111530303955, "logps/chosen": -1.9205522537231445, "logps/rejected": -1.6000399589538574, "loss": 8.9355, "rewards/accuracies": 0.40625, "rewards/chosen": -19.205522537231445, "rewards/margins": -3.205122232437134, "rewards/rejected": -16.00040054321289, "step": 21 }, { "epoch": 0.05485429328346579, "grad_norm": 53.25, "learning_rate": 6.995805964702472e-07, "logits/chosen": 0.9063746333122253, "logits/rejected": 0.9599690437316895, "logps/chosen": -1.5286774635314941, "logps/rejected": -1.182100772857666, "loss": 6.2756, "rewards/accuracies": 0.34375, "rewards/chosen": -15.286775588989258, "rewards/margins": -3.4657678604125977, "rewards/rejected": -11.821005821228027, "step": 22 }, { "epoch": 0.05734767025089606, "grad_norm": 115.5, "learning_rate": 6.994291863741474e-07, "logits/chosen": 0.9865818619728088, "logits/rejected": 0.8803253173828125, "logps/chosen": -1.8937522172927856, "logps/rejected": -1.1672096252441406, "loss": 9.051, "rewards/accuracies": 0.4375, "rewards/chosen": -18.937522888183594, "rewards/margins": -7.265425205230713, "rewards/rejected": -11.672097206115723, "step": 23 }, { "epoch": 0.059841047218326324, "grad_norm": 120.0, "learning_rate": 6.992545095639337e-07, "logits/chosen": 0.8972434997558594, "logits/rejected": 0.8747442960739136, "logps/chosen": -2.372899055480957, "logps/rejected": -1.4153152704238892, "loss": 11.7809, "rewards/accuracies": 0.3125, "rewards/chosen": -23.728988647460938, "rewards/margins": -9.575835227966309, "rewards/rejected": -14.153154373168945, "step": 24 }, { "epoch": 0.06233442418575658, "grad_norm": 58.5, "learning_rate": 6.990565776704475e-07, "logits/chosen": 0.9191975593566895, "logits/rejected": 0.908176839351654, "logps/chosen": -1.6683969497680664, "logps/rejected": -1.231262445449829, "loss": 7.8375, "rewards/accuracies": 0.4375, "rewards/chosen": -16.683971405029297, "rewards/margins": -4.371344566345215, "rewards/rejected": -12.31262493133545, "step": 25 }, { "epoch": 0.06482780115318684, "grad_norm": 120.5, "learning_rate": 6.988354038729676e-07, "logits/chosen": 0.9013136625289917, "logits/rejected": 0.7893968820571899, "logps/chosen": -2.127075433731079, "logps/rejected": -1.3035414218902588, "loss": 10.6297, "rewards/accuracies": 0.34375, "rewards/chosen": -21.270755767822266, "rewards/margins": -8.23534107208252, "rewards/rejected": -13.03541374206543, "step": 26 }, { "epoch": 0.06732117812061711, "grad_norm": 82.0, "learning_rate": 6.985910028983336e-07, "logits/chosen": 0.9725473523139954, "logits/rejected": 0.9624121189117432, "logps/chosen": -2.005342483520508, "logps/rejected": -1.2962756156921387, "loss": 8.3075, "rewards/accuracies": 0.1875, "rewards/chosen": -20.053424835205078, "rewards/margins": -7.09066915512085, "rewards/rejected": -12.962756156921387, "step": 27 }, { "epoch": 0.06981455508804738, "grad_norm": 52.25, "learning_rate": 6.983233910199648e-07, "logits/chosen": 0.8846550583839417, "logits/rejected": 0.9423845410346985, "logps/chosen": -1.6742780208587646, "logps/rejected": -1.188732385635376, "loss": 7.3535, "rewards/accuracies": 0.40625, "rewards/chosen": -16.742778778076172, "rewards/margins": -4.8554558753967285, "rewards/rejected": -11.887323379516602, "step": 28 }, { "epoch": 0.07230793205547764, "grad_norm": 74.0, "learning_rate": 6.98032586056776e-07, "logits/chosen": 0.9702792167663574, "logits/rejected": 0.8728958368301392, "logps/chosen": -1.8141976594924927, "logps/rejected": -1.3005653619766235, "loss": 7.5819, "rewards/accuracies": 0.40625, "rewards/chosen": -18.14197540283203, "rewards/margins": -5.136322975158691, "rewards/rejected": -13.005653381347656, "step": 29 }, { "epoch": 0.0748013090229079, "grad_norm": 115.5, "learning_rate": 6.977186073719925e-07, "logits/chosen": 0.855915904045105, "logits/rejected": 0.7963756918907166, "logps/chosen": -1.9207674264907837, "logps/rejected": -1.16620934009552, "loss": 9.6435, "rewards/accuracies": 0.34375, "rewards/chosen": -19.20767593383789, "rewards/margins": -7.545581340789795, "rewards/rejected": -11.662094116210938, "step": 30 }, { "epoch": 0.07729468599033816, "grad_norm": 32.5, "learning_rate": 6.973814758718596e-07, "logits/chosen": 0.9370359182357788, "logits/rejected": 0.896599531173706, "logps/chosen": -1.3457211256027222, "logps/rejected": -1.0361064672470093, "loss": 4.8073, "rewards/accuracies": 0.46875, "rewards/chosen": -13.457212448120117, "rewards/margins": -3.0961475372314453, "rewards/rejected": -10.361063957214355, "step": 31 }, { "epoch": 0.07978806295776843, "grad_norm": 68.0, "learning_rate": 6.97021214004251e-07, "logits/chosen": 0.8998004198074341, "logits/rejected": 0.9092382192611694, "logps/chosen": -1.5766998529434204, "logps/rejected": -1.1409169435501099, "loss": 6.4345, "rewards/accuracies": 0.4375, "rewards/chosen": -15.766998291015625, "rewards/margins": -4.357827663421631, "rewards/rejected": -11.409171104431152, "step": 32 }, { "epoch": 0.0822814399251987, "grad_norm": 76.0, "learning_rate": 6.96637845757174e-07, "logits/chosen": 0.8456138372421265, "logits/rejected": 0.9118346571922302, "logps/chosen": -2.059769868850708, "logps/rejected": -1.3298571109771729, "loss": 9.2994, "rewards/accuracies": 0.34375, "rewards/chosen": -20.597698211669922, "rewards/margins": -7.299127101898193, "rewards/rejected": -13.29857063293457, "step": 33 }, { "epoch": 0.08477481689262896, "grad_norm": 50.0, "learning_rate": 6.962313966571722e-07, "logits/chosen": 0.8960351347923279, "logits/rejected": 0.8999559879302979, "logps/chosen": -1.4601362943649292, "logps/rejected": -1.4213840961456299, "loss": 4.6079, "rewards/accuracies": 0.46875, "rewards/chosen": -14.601361274719238, "rewards/margins": -0.3875225782394409, "rewards/rejected": -14.21384048461914, "step": 34 }, { "epoch": 0.08726819386005921, "grad_norm": 47.0, "learning_rate": 6.958018937676262e-07, "logits/chosen": 0.9134461879730225, "logits/rejected": 0.8920255899429321, "logps/chosen": -1.46458101272583, "logps/rejected": -1.2648781538009644, "loss": 5.2894, "rewards/accuracies": 0.4375, "rewards/chosen": -14.6458101272583, "rewards/margins": -1.99702787399292, "rewards/rejected": -12.648781776428223, "step": 35 }, { "epoch": 0.08976157082748948, "grad_norm": 86.5, "learning_rate": 6.953493656869511e-07, "logits/chosen": 0.9218010902404785, "logits/rejected": 0.7793766260147095, "logps/chosen": -1.7202703952789307, "logps/rejected": -1.3021219968795776, "loss": 6.3929, "rewards/accuracies": 0.40625, "rewards/chosen": -17.20270538330078, "rewards/margins": -4.181485176086426, "rewards/rejected": -13.021220207214355, "step": 36 }, { "epoch": 0.09225494779491974, "grad_norm": 55.0, "learning_rate": 6.948738425466925e-07, "logits/chosen": 0.9479645490646362, "logits/rejected": 0.8090993762016296, "logps/chosen": -1.6109609603881836, "logps/rejected": -1.395875334739685, "loss": 5.8692, "rewards/accuracies": 0.53125, "rewards/chosen": -16.10961151123047, "rewards/margins": -2.1508564949035645, "rewards/rejected": -13.95875358581543, "step": 37 }, { "epoch": 0.09474832476235001, "grad_norm": 32.75, "learning_rate": 6.943753560095204e-07, "logits/chosen": 1.020020604133606, "logits/rejected": 0.9307425618171692, "logps/chosen": -1.447858452796936, "logps/rejected": -1.0777596235275269, "loss": 5.3116, "rewards/accuracies": 0.375, "rewards/chosen": -14.478584289550781, "rewards/margins": -3.700988292694092, "rewards/rejected": -10.777596473693848, "step": 38 }, { "epoch": 0.09724170172978028, "grad_norm": 45.0, "learning_rate": 6.938539392671203e-07, "logits/chosen": 0.939849317073822, "logits/rejected": 0.9025396108627319, "logps/chosen": -1.6659669876098633, "logps/rejected": -1.1725908517837524, "loss": 7.0117, "rewards/accuracies": 0.3125, "rewards/chosen": -16.659669876098633, "rewards/margins": -4.933763027191162, "rewards/rejected": -11.725908279418945, "step": 39 }, { "epoch": 0.09973507869721053, "grad_norm": 74.5, "learning_rate": 6.933096270379841e-07, "logits/chosen": 0.996893584728241, "logits/rejected": 0.912053108215332, "logps/chosen": -1.2696326971054077, "logps/rejected": -1.1286218166351318, "loss": 4.2095, "rewards/accuracies": 0.5625, "rewards/chosen": -12.69632625579834, "rewards/margins": -1.410109281539917, "rewards/rejected": -11.286218643188477, "step": 40 }, { "epoch": 0.1022284556646408, "grad_norm": 47.0, "learning_rate": 6.927424555650974e-07, "logits/chosen": 0.9594122171401978, "logits/rejected": 0.8550945520401001, "logps/chosen": -1.5375633239746094, "logps/rejected": -1.2417051792144775, "loss": 5.0733, "rewards/accuracies": 0.375, "rewards/chosen": -15.375633239746094, "rewards/margins": -2.9585819244384766, "rewards/rejected": -12.4170503616333, "step": 41 }, { "epoch": 0.10472183263207106, "grad_norm": 44.0, "learning_rate": 6.921524626135268e-07, "logits/chosen": 0.8996063470840454, "logits/rejected": 0.9653378129005432, "logps/chosen": -1.763725996017456, "logps/rejected": -1.0993306636810303, "loss": 8.0476, "rewards/accuracies": 0.21875, "rewards/chosen": -17.637258529663086, "rewards/margins": -6.643953323364258, "rewards/rejected": -10.993307113647461, "step": 42 }, { "epoch": 0.10721520959950133, "grad_norm": 42.25, "learning_rate": 6.915396874679055e-07, "logits/chosen": 1.0091477632522583, "logits/rejected": 0.9392642974853516, "logps/chosen": -1.2002838850021362, "logps/rejected": -1.0848746299743652, "loss": 3.0284, "rewards/accuracies": 0.5, "rewards/chosen": -12.002839088439941, "rewards/margins": -1.1540918350219727, "rewards/rejected": -10.848746299743652, "step": 43 }, { "epoch": 0.10970858656693158, "grad_norm": 40.25, "learning_rate": 6.909041709298168e-07, "logits/chosen": 0.8822853565216064, "logits/rejected": 0.8290736079216003, "logps/chosen": -1.4588274955749512, "logps/rejected": -1.2779145240783691, "loss": 4.9219, "rewards/accuracies": 0.34375, "rewards/chosen": -14.588274955749512, "rewards/margins": -1.8091294765472412, "rewards/rejected": -12.779145240783691, "step": 44 }, { "epoch": 0.11220196353436185, "grad_norm": 56.75, "learning_rate": 6.902459553150779e-07, "logits/chosen": 0.9077208638191223, "logits/rejected": 0.7896067500114441, "logps/chosen": -1.4615594148635864, "logps/rejected": -1.2456190586090088, "loss": 5.1754, "rewards/accuracies": 0.53125, "rewards/chosen": -14.615594863891602, "rewards/margins": -2.1594033241271973, "rewards/rejected": -12.456191062927246, "step": 45 }, { "epoch": 0.11469534050179211, "grad_norm": 67.0, "learning_rate": 6.895650844509226e-07, "logits/chosen": 0.9100595116615295, "logits/rejected": 0.7619892358779907, "logps/chosen": -1.6750259399414062, "logps/rejected": -1.2229750156402588, "loss": 6.2716, "rewards/accuracies": 0.34375, "rewards/chosen": -16.75025749206543, "rewards/margins": -4.5205078125, "rewards/rejected": -12.229750633239746, "step": 46 }, { "epoch": 0.11718871746922238, "grad_norm": 70.0, "learning_rate": 6.88861603673082e-07, "logits/chosen": 0.8918619751930237, "logits/rejected": 0.9012125134468079, "logps/chosen": -1.64901602268219, "logps/rejected": -1.265884518623352, "loss": 6.6894, "rewards/accuracies": 0.53125, "rewards/chosen": -16.490161895751953, "rewards/margins": -3.831315279006958, "rewards/rejected": -12.658845901489258, "step": 47 }, { "epoch": 0.11968209443665265, "grad_norm": 51.0, "learning_rate": 6.88135559822767e-07, "logits/chosen": 0.8720345497131348, "logits/rejected": 0.7581244707107544, "logps/chosen": -1.872530221939087, "logps/rejected": -1.4055814743041992, "loss": 6.6813, "rewards/accuracies": 0.3125, "rewards/chosen": -18.72530174255371, "rewards/margins": -4.669487476348877, "rewards/rejected": -14.055814743041992, "step": 48 }, { "epoch": 0.1221754714040829, "grad_norm": 52.75, "learning_rate": 6.873870012435486e-07, "logits/chosen": 0.8616499900817871, "logits/rejected": 0.7673721313476562, "logps/chosen": -1.3419944047927856, "logps/rejected": -1.2395105361938477, "loss": 3.3706, "rewards/accuracies": 0.5, "rewards/chosen": -13.419943809509277, "rewards/margins": -1.0248385667800903, "rewards/rejected": -12.395105361938477, "step": 49 }, { "epoch": 0.12466884837151317, "grad_norm": 56.25, "learning_rate": 6.866159777781393e-07, "logits/chosen": 0.8702710866928101, "logits/rejected": 0.7436060309410095, "logps/chosen": -1.6595778465270996, "logps/rejected": -1.1382110118865967, "loss": 6.7412, "rewards/accuracies": 0.34375, "rewards/chosen": -16.595779418945312, "rewards/margins": -5.213669300079346, "rewards/rejected": -11.382110595703125, "step": 50 }, { "epoch": 0.12716222533894342, "grad_norm": 50.75, "learning_rate": 6.858225407650741e-07, "logits/chosen": 0.7868949174880981, "logits/rejected": 0.8334120512008667, "logps/chosen": -1.7013866901397705, "logps/rejected": -1.3084328174591064, "loss": 6.2144, "rewards/accuracies": 0.34375, "rewards/chosen": -17.013864517211914, "rewards/margins": -3.929537296295166, "rewards/rejected": -13.084327697753906, "step": 51 }, { "epoch": 0.12965560230637369, "grad_norm": 65.0, "learning_rate": 6.850067430352923e-07, "logits/chosen": 0.8779257535934448, "logits/rejected": 0.7302612066268921, "logps/chosen": -1.9540197849273682, "logps/rejected": -1.4614614248275757, "loss": 6.5912, "rewards/accuracies": 0.3125, "rewards/chosen": -19.540199279785156, "rewards/margins": -4.925583362579346, "rewards/rejected": -14.61461353302002, "step": 52 }, { "epoch": 0.13214897927380395, "grad_norm": 79.0, "learning_rate": 6.8416863890862e-07, "logits/chosen": 0.91861492395401, "logits/rejected": 0.8185287714004517, "logps/chosen": -1.457578182220459, "logps/rejected": -1.275127649307251, "loss": 5.29, "rewards/accuracies": 0.40625, "rewards/chosen": -14.575782775878906, "rewards/margins": -1.8245068788528442, "rewards/rejected": -12.751276016235352, "step": 53 }, { "epoch": 0.13464235624123422, "grad_norm": 25.625, "learning_rate": 6.833082841901524e-07, "logits/chosen": 0.8008706569671631, "logits/rejected": 0.7791386246681213, "logps/chosen": -1.2828067541122437, "logps/rejected": -1.14899480342865, "loss": 3.6665, "rewards/accuracies": 0.5625, "rewards/chosen": -12.828067779541016, "rewards/margins": -1.3381190299987793, "rewards/rejected": -11.489947319030762, "step": 54 }, { "epoch": 0.13713573320866448, "grad_norm": 37.5, "learning_rate": 6.82425736166539e-07, "logits/chosen": 0.8428397178649902, "logits/rejected": 0.819983720779419, "logps/chosen": -1.5656299591064453, "logps/rejected": -1.6019946336746216, "loss": 5.8169, "rewards/accuracies": 0.3125, "rewards/chosen": -15.656298637390137, "rewards/margins": 0.3636472821235657, "rewards/rejected": -16.019947052001953, "step": 55 }, { "epoch": 0.13962911017609475, "grad_norm": 43.75, "learning_rate": 6.815210536021685e-07, "logits/chosen": 0.7473218441009521, "logits/rejected": 0.7424555420875549, "logps/chosen": -1.4687354564666748, "logps/rejected": -1.2596654891967773, "loss": 5.3807, "rewards/accuracies": 0.46875, "rewards/chosen": -14.687355041503906, "rewards/margins": -2.090701103210449, "rewards/rejected": -12.596653938293457, "step": 56 }, { "epoch": 0.14212248714352502, "grad_norm": 33.5, "learning_rate": 6.805942967352563e-07, "logits/chosen": 0.8693878650665283, "logits/rejected": 0.8091084361076355, "logps/chosen": -1.4544310569763184, "logps/rejected": -1.1222821474075317, "loss": 5.2342, "rewards/accuracies": 0.3125, "rewards/chosen": -14.544310569763184, "rewards/margins": -3.3214893341064453, "rewards/rejected": -11.222820281982422, "step": 57 }, { "epoch": 0.14461586411095528, "grad_norm": 60.75, "learning_rate": 6.796455272738337e-07, "logits/chosen": 0.8443146347999573, "logits/rejected": 0.7834912538528442, "logps/chosen": -1.630685806274414, "logps/rejected": -2.097377061843872, "loss": 5.0217, "rewards/accuracies": 0.28125, "rewards/chosen": -16.306856155395508, "rewards/margins": 4.6669135093688965, "rewards/rejected": -20.973772048950195, "step": 58 }, { "epoch": 0.14710924107838555, "grad_norm": 34.0, "learning_rate": 6.78674808391638e-07, "logits/chosen": 0.7124283313751221, "logits/rejected": 0.7266104221343994, "logps/chosen": -1.5309463739395142, "logps/rejected": -1.204602599143982, "loss": 4.9925, "rewards/accuracies": 0.3125, "rewards/chosen": -15.309463500976562, "rewards/margins": -3.263436794281006, "rewards/rejected": -12.046026229858398, "step": 59 }, { "epoch": 0.1496026180458158, "grad_norm": 31.25, "learning_rate": 6.776822047239079e-07, "logits/chosen": 0.810710608959198, "logits/rejected": 0.7433085441589355, "logps/chosen": -1.3407872915267944, "logps/rejected": -1.1019080877304077, "loss": 4.0638, "rewards/accuracies": 0.4375, "rewards/chosen": -13.407873153686523, "rewards/margins": -2.3887932300567627, "rewards/rejected": -11.019081115722656, "step": 60 }, { "epoch": 0.15209599501324605, "grad_norm": 35.75, "learning_rate": 6.766677823630784e-07, "logits/chosen": 0.9204759001731873, "logits/rejected": 0.8126802444458008, "logps/chosen": -1.3521380424499512, "logps/rejected": -1.230940341949463, "loss": 3.1759, "rewards/accuracies": 0.46875, "rewards/chosen": -13.521379470825195, "rewards/margins": -1.2119766473770142, "rewards/rejected": -12.309402465820312, "step": 61 }, { "epoch": 0.15458937198067632, "grad_norm": 74.5, "learning_rate": 6.756316088543799e-07, "logits/chosen": 0.8732976317405701, "logits/rejected": 0.7553092837333679, "logps/chosen": -1.6522544622421265, "logps/rejected": -1.304805874824524, "loss": 5.2966, "rewards/accuracies": 0.3125, "rewards/chosen": -16.522544860839844, "rewards/margins": -3.474484920501709, "rewards/rejected": -13.048059463500977, "step": 62 }, { "epoch": 0.1570827489481066, "grad_norm": 32.25, "learning_rate": 6.74573753191342e-07, "logits/chosen": 0.8279662728309631, "logits/rejected": 0.7906845808029175, "logps/chosen": -1.3082658052444458, "logps/rejected": -1.215187907218933, "loss": 3.2516, "rewards/accuracies": 0.40625, "rewards/chosen": -13.082658767700195, "rewards/margins": -0.9307788610458374, "rewards/rejected": -12.15187931060791, "step": 63 }, { "epoch": 0.15957612591553685, "grad_norm": 30.0, "learning_rate": 6.734942858111986e-07, "logits/chosen": 0.8267450332641602, "logits/rejected": 0.7294779419898987, "logps/chosen": -1.272381067276001, "logps/rejected": -1.2460516691207886, "loss": 3.548, "rewards/accuracies": 0.59375, "rewards/chosen": -12.723810195922852, "rewards/margins": -0.26329320669174194, "rewards/rejected": -12.460516929626465, "step": 64 }, { "epoch": 0.16206950288296712, "grad_norm": 50.75, "learning_rate": 6.723932785901975e-07, "logits/chosen": 0.9013331532478333, "logits/rejected": 0.8166715502738953, "logps/chosen": -1.563563346862793, "logps/rejected": -1.2308857440948486, "loss": 4.8669, "rewards/accuracies": 0.34375, "rewards/chosen": -15.63563346862793, "rewards/margins": -3.3267745971679688, "rewards/rejected": -12.308857917785645, "step": 65 }, { "epoch": 0.1645628798503974, "grad_norm": 18.75, "learning_rate": 6.712708048388158e-07, "logits/chosen": 0.833111047744751, "logits/rejected": 0.7176869511604309, "logps/chosen": -1.2247819900512695, "logps/rejected": -1.460402488708496, "loss": 2.2855, "rewards/accuracies": 0.6875, "rewards/chosen": -12.247820854187012, "rewards/margins": 2.3562047481536865, "rewards/rejected": -14.604024887084961, "step": 66 }, { "epoch": 0.16705625681782765, "grad_norm": 41.75, "learning_rate": 6.701269392968773e-07, "logits/chosen": 0.8795142769813538, "logits/rejected": 0.7385881543159485, "logps/chosen": -1.5149438381195068, "logps/rejected": -1.4080404043197632, "loss": 3.8984, "rewards/accuracies": 0.53125, "rewards/chosen": -15.14943790435791, "rewards/margins": -1.0690345764160156, "rewards/rejected": -14.080402374267578, "step": 67 }, { "epoch": 0.16954963378525792, "grad_norm": 37.0, "learning_rate": 6.689617581285765e-07, "logits/chosen": 0.8711040616035461, "logits/rejected": 0.6986596584320068, "logps/chosen": -1.6020938158035278, "logps/rejected": -1.350814938545227, "loss": 4.9801, "rewards/accuracies": 0.40625, "rewards/chosen": -16.02094078063965, "rewards/margins": -2.5127904415130615, "rewards/rejected": -13.508148193359375, "step": 68 }, { "epoch": 0.17204301075268819, "grad_norm": 45.25, "learning_rate": 6.677753389174075e-07, "logits/chosen": 0.9395517706871033, "logits/rejected": 0.7759240865707397, "logps/chosen": -1.5319843292236328, "logps/rejected": -1.3424811363220215, "loss": 5.1017, "rewards/accuracies": 0.5625, "rewards/chosen": -15.319843292236328, "rewards/margins": -1.8950309753417969, "rewards/rejected": -13.424812316894531, "step": 69 }, { "epoch": 0.17453638772011842, "grad_norm": 35.0, "learning_rate": 6.665677606609973e-07, "logits/chosen": 0.8715901374816895, "logits/rejected": 0.8017496466636658, "logps/chosen": -1.5180387496948242, "logps/rejected": -1.3085150718688965, "loss": 4.5983, "rewards/accuracies": 0.4375, "rewards/chosen": -15.180386543273926, "rewards/margins": -2.095235824584961, "rewards/rejected": -13.085149765014648, "step": 70 }, { "epoch": 0.1770297646875487, "grad_norm": 42.75, "learning_rate": 6.653391037658466e-07, "logits/chosen": 0.8521101474761963, "logits/rejected": 0.7697039246559143, "logps/chosen": -1.5679848194122314, "logps/rejected": -1.3208036422729492, "loss": 4.539, "rewards/accuracies": 0.375, "rewards/chosen": -15.679848670959473, "rewards/margins": -2.471813201904297, "rewards/rejected": -13.208035469055176, "step": 71 }, { "epoch": 0.17952314165497896, "grad_norm": 46.5, "learning_rate": 6.640894500419754e-07, "logits/chosen": 0.9186801314353943, "logits/rejected": 0.7545082569122314, "logps/chosen": -1.5185034275054932, "logps/rejected": -1.2024480104446411, "loss": 5.072, "rewards/accuracies": 0.3125, "rewards/chosen": -15.185033798217773, "rewards/margins": -3.160555601119995, "rewards/rejected": -12.024478912353516, "step": 72 }, { "epoch": 0.18201651862240922, "grad_norm": 23.375, "learning_rate": 6.628188826974758e-07, "logits/chosen": 0.8491867780685425, "logits/rejected": 0.7822642922401428, "logps/chosen": -1.1384882926940918, "logps/rejected": -1.1687763929367065, "loss": 2.5731, "rewards/accuracies": 0.5, "rewards/chosen": -11.384883880615234, "rewards/margins": 0.3028792440891266, "rewards/rejected": -11.687764167785645, "step": 73 }, { "epoch": 0.1845098955898395, "grad_norm": 33.0, "learning_rate": 6.615274863329715e-07, "logits/chosen": 0.9214451909065247, "logits/rejected": 0.75420743227005, "logps/chosen": -1.5405924320220947, "logps/rejected": -1.6105780601501465, "loss": 2.1388, "rewards/accuracies": 0.53125, "rewards/chosen": -15.405925750732422, "rewards/margins": 0.6998560428619385, "rewards/rejected": -16.10578155517578, "step": 74 }, { "epoch": 0.18700327255726976, "grad_norm": 39.5, "learning_rate": 6.602153469359852e-07, "logits/chosen": 0.905957043170929, "logits/rejected": 0.7297846078872681, "logps/chosen": -1.4268043041229248, "logps/rejected": -1.3724501132965088, "loss": 2.9419, "rewards/accuracies": 0.4375, "rewards/chosen": -14.268043518066406, "rewards/margins": -0.5435430407524109, "rewards/rejected": -13.72450065612793, "step": 75 }, { "epoch": 0.18949664952470002, "grad_norm": 34.5, "learning_rate": 6.588825518752124e-07, "logits/chosen": 0.9336991310119629, "logits/rejected": 0.7428035736083984, "logps/chosen": -1.4046045541763306, "logps/rejected": -1.1234869956970215, "loss": 4.3453, "rewards/accuracies": 0.34375, "rewards/chosen": -14.046045303344727, "rewards/margins": -2.8111753463745117, "rewards/rejected": -11.234869003295898, "step": 76 }, { "epoch": 0.1919900264921303, "grad_norm": 40.75, "learning_rate": 6.575291898947046e-07, "logits/chosen": 0.8886721134185791, "logits/rejected": 0.6734512448310852, "logps/chosen": -1.4164619445800781, "logps/rejected": -1.3936028480529785, "loss": 3.5175, "rewards/accuracies": 0.5, "rewards/chosen": -14.164620399475098, "rewards/margins": -0.22859010100364685, "rewards/rejected": -13.936028480529785, "step": 77 }, { "epoch": 0.19448340345956056, "grad_norm": 41.75, "learning_rate": 6.561553511079596e-07, "logits/chosen": 0.829595148563385, "logits/rejected": 0.6838914155960083, "logps/chosen": -1.5907689332962036, "logps/rejected": -1.4232121706008911, "loss": 4.2889, "rewards/accuracies": 0.375, "rewards/chosen": -15.907690048217773, "rewards/margins": -1.675569772720337, "rewards/rejected": -14.232120513916016, "step": 78 }, { "epoch": 0.1969767804269908, "grad_norm": 34.5, "learning_rate": 6.54761126991922e-07, "logits/chosen": 0.9110915660858154, "logits/rejected": 0.6820324063301086, "logps/chosen": -1.547090768814087, "logps/rejected": -1.3237884044647217, "loss": 4.8103, "rewards/accuracies": 0.40625, "rewards/chosen": -15.470909118652344, "rewards/margins": -2.2330236434936523, "rewards/rejected": -13.237884521484375, "step": 79 }, { "epoch": 0.19947015739442106, "grad_norm": 43.75, "learning_rate": 6.533466103808918e-07, "logits/chosen": 0.8135228157043457, "logits/rejected": 0.7062645554542542, "logps/chosen": -1.5417137145996094, "logps/rejected": -1.3554742336273193, "loss": 5.119, "rewards/accuracies": 0.46875, "rewards/chosen": -15.417137145996094, "rewards/margins": -1.8623945713043213, "rewards/rejected": -13.554742813110352, "step": 80 }, { "epoch": 0.20196353436185133, "grad_norm": 64.5, "learning_rate": 6.519118954603431e-07, "logits/chosen": 0.818507194519043, "logits/rejected": 0.7929250001907349, "logps/chosen": -1.6561720371246338, "logps/rejected": -1.3400187492370605, "loss": 5.316, "rewards/accuracies": 0.3125, "rewards/chosen": -16.56171989440918, "rewards/margins": -3.161534309387207, "rewards/rejected": -13.400186538696289, "step": 81 }, { "epoch": 0.2044569113292816, "grad_norm": 22.75, "learning_rate": 6.504570777606531e-07, "logits/chosen": 0.8459409475326538, "logits/rejected": 0.7011772990226746, "logps/chosen": -1.3367918729782104, "logps/rejected": -1.2118648290634155, "loss": 3.5423, "rewards/accuracies": 0.53125, "rewards/chosen": -13.367918014526367, "rewards/margins": -1.249271273612976, "rewards/rejected": -12.118647575378418, "step": 82 }, { "epoch": 0.20695028829671186, "grad_norm": 25.875, "learning_rate": 6.489822541507404e-07, "logits/chosen": 0.8798666596412659, "logits/rejected": 0.7069228887557983, "logps/chosen": -1.1269609928131104, "logps/rejected": -1.1012687683105469, "loss": 2.5165, "rewards/accuracies": 0.53125, "rewards/chosen": -11.269609451293945, "rewards/margins": -0.25692227482795715, "rewards/rejected": -11.012688636779785, "step": 83 }, { "epoch": 0.20944366526414213, "grad_norm": 30.875, "learning_rate": 6.474875228316158e-07, "logits/chosen": 0.9361159801483154, "logits/rejected": 0.8077545762062073, "logps/chosen": -1.4038376808166504, "logps/rejected": -1.357134222984314, "loss": 3.4071, "rewards/accuracies": 0.46875, "rewards/chosen": -14.038376808166504, "rewards/margins": -0.4670344293117523, "rewards/rejected": -13.571342468261719, "step": 84 }, { "epoch": 0.2119370422315724, "grad_norm": 20.625, "learning_rate": 6.459729833298434e-07, "logits/chosen": 0.7581954002380371, "logits/rejected": 0.7710189819335938, "logps/chosen": -1.2664942741394043, "logps/rejected": -1.2973535060882568, "loss": 3.0325, "rewards/accuracies": 0.59375, "rewards/chosen": -12.66494369506836, "rewards/margins": 0.3085915148258209, "rewards/rejected": -12.973533630371094, "step": 85 }, { "epoch": 0.21443041919900266, "grad_norm": 38.25, "learning_rate": 6.444387364909134e-07, "logits/chosen": 0.8360967636108398, "logits/rejected": 0.7465887069702148, "logps/chosen": -1.4347429275512695, "logps/rejected": -1.4330288171768188, "loss": 3.0653, "rewards/accuracies": 0.5, "rewards/chosen": -14.347427368164062, "rewards/margins": -0.01714131236076355, "rewards/rejected": -14.33028793334961, "step": 86 }, { "epoch": 0.21692379616643293, "grad_norm": 25.375, "learning_rate": 6.428848844725274e-07, "logits/chosen": 0.7691155672073364, "logits/rejected": 0.6017144322395325, "logps/chosen": -1.2951093912124634, "logps/rejected": -1.3574622869491577, "loss": 2.8385, "rewards/accuracies": 0.5625, "rewards/chosen": -12.951093673706055, "rewards/margins": 0.623528778553009, "rewards/rejected": -13.574623107910156, "step": 87 }, { "epoch": 0.21941717313386316, "grad_norm": 48.0, "learning_rate": 6.413115307377965e-07, "logits/chosen": 0.8395971059799194, "logits/rejected": 0.6882689595222473, "logps/chosen": -1.4701257944107056, "logps/rejected": -1.4139220714569092, "loss": 3.306, "rewards/accuracies": 0.4375, "rewards/chosen": -14.701258659362793, "rewards/margins": -0.5620384216308594, "rewards/rejected": -14.1392183303833, "step": 88 }, { "epoch": 0.22191055010129343, "grad_norm": 31.625, "learning_rate": 6.397187800483519e-07, "logits/chosen": 0.8466267585754395, "logits/rejected": 0.6940711140632629, "logps/chosen": -1.4214099645614624, "logps/rejected": -1.3584471940994263, "loss": 2.831, "rewards/accuracies": 0.59375, "rewards/chosen": -14.214098930358887, "rewards/margins": -0.6296274662017822, "rewards/rejected": -13.58447265625, "step": 89 }, { "epoch": 0.2244039270687237, "grad_norm": 33.5, "learning_rate": 6.381067384573693e-07, "logits/chosen": 0.8270119428634644, "logits/rejected": 0.65580815076828, "logps/chosen": -1.4739896059036255, "logps/rejected": -1.2760796546936035, "loss": 3.8132, "rewards/accuracies": 0.40625, "rewards/chosen": -14.739895820617676, "rewards/margins": -1.9790987968444824, "rewards/rejected": -12.760797500610352, "step": 90 }, { "epoch": 0.22689730403615396, "grad_norm": 27.375, "learning_rate": 6.364755133025077e-07, "logits/chosen": 0.8560658693313599, "logits/rejected": 0.6389474868774414, "logps/chosen": -1.2929621934890747, "logps/rejected": -1.887449860572815, "loss": 2.5581, "rewards/accuracies": 0.59375, "rewards/chosen": -12.929622650146484, "rewards/margins": 5.944877624511719, "rewards/rejected": -18.87449836730957, "step": 91 }, { "epoch": 0.22939068100358423, "grad_norm": 51.25, "learning_rate": 6.348252131987621e-07, "logits/chosen": 0.9491753578186035, "logits/rejected": 0.5920038819313049, "logps/chosen": -1.7384018898010254, "logps/rejected": -1.4728763103485107, "loss": 4.2467, "rewards/accuracies": 0.4375, "rewards/chosen": -17.38401985168457, "rewards/margins": -2.6552560329437256, "rewards/rejected": -14.728763580322266, "step": 92 }, { "epoch": 0.2318840579710145, "grad_norm": 41.75, "learning_rate": 6.331559480312316e-07, "logits/chosen": 0.8945069313049316, "logits/rejected": 0.6501726508140564, "logps/chosen": -1.6423015594482422, "logps/rejected": -1.5272799730300903, "loss": 3.7742, "rewards/accuracies": 0.46875, "rewards/chosen": -16.423015594482422, "rewards/margins": -1.150214433670044, "rewards/rejected": -15.272799491882324, "step": 93 }, { "epoch": 0.23437743493844476, "grad_norm": 27.875, "learning_rate": 6.314678289478021e-07, "logits/chosen": 0.868090033531189, "logits/rejected": 0.7094947695732117, "logps/chosen": -1.3805748224258423, "logps/rejected": -1.3794375658035278, "loss": 2.4021, "rewards/accuracies": 0.5, "rewards/chosen": -13.805749893188477, "rewards/margins": -0.011373043060302734, "rewards/rejected": -13.794376373291016, "step": 94 }, { "epoch": 0.23687081190587503, "grad_norm": 30.625, "learning_rate": 6.297609683517465e-07, "logits/chosen": 0.9310474395751953, "logits/rejected": 0.7228609323501587, "logps/chosen": -1.339646339416504, "logps/rejected": -1.4403069019317627, "loss": 2.1866, "rewards/accuracies": 0.65625, "rewards/chosen": -13.396462440490723, "rewards/margins": 1.0066064596176147, "rewards/rejected": -14.403070449829102, "step": 95 }, { "epoch": 0.2393641888733053, "grad_norm": 27.875, "learning_rate": 6.280354798942394e-07, "logits/chosen": 0.8475272059440613, "logits/rejected": 0.7736526727676392, "logps/chosen": -1.3078885078430176, "logps/rejected": -1.3328254222869873, "loss": 2.2729, "rewards/accuracies": 0.5625, "rewards/chosen": -13.078886985778809, "rewards/margins": 0.24936795234680176, "rewards/rejected": -13.328254699707031, "step": 96 }, { "epoch": 0.24185756584073553, "grad_norm": 23.125, "learning_rate": 6.262914784667902e-07, "logits/chosen": 0.8516014814376831, "logits/rejected": 0.6699912548065186, "logps/chosen": -1.2598787546157837, "logps/rejected": -1.2950444221496582, "loss": 3.0957, "rewards/accuracies": 0.625, "rewards/chosen": -12.598786354064941, "rewards/margins": 0.35165655612945557, "rewards/rejected": -12.950444221496582, "step": 97 }, { "epoch": 0.2443509428081658, "grad_norm": 37.75, "learning_rate": 6.245290801935929e-07, "logits/chosen": 0.8076661229133606, "logits/rejected": 0.6437760591506958, "logps/chosen": -1.4585728645324707, "logps/rejected": -1.3383572101593018, "loss": 4.0758, "rewards/accuracies": 0.40625, "rewards/chosen": -14.58572769165039, "rewards/margins": -1.2021559476852417, "rewards/rejected": -13.38357162475586, "step": 98 }, { "epoch": 0.24684431977559607, "grad_norm": 22.875, "learning_rate": 6.227484024237941e-07, "logits/chosen": 0.8829818367958069, "logits/rejected": 0.6302488446235657, "logps/chosen": -1.3252794742584229, "logps/rejected": -1.350675344467163, "loss": 2.156, "rewards/accuracies": 0.59375, "rewards/chosen": -13.252795219421387, "rewards/margins": 0.2539580166339874, "rewards/rejected": -13.506753921508789, "step": 99 }, { "epoch": 0.24933769674302633, "grad_norm": 42.25, "learning_rate": 6.209495637236789e-07, "logits/chosen": 0.7620182037353516, "logits/rejected": 0.7404342889785767, "logps/chosen": -1.7423349618911743, "logps/rejected": -1.6396631002426147, "loss": 4.6426, "rewards/accuracies": 0.53125, "rewards/chosen": -17.423349380493164, "rewards/margins": -1.026718258857727, "rewards/rejected": -16.396631240844727, "step": 100 }, { "epoch": 0.2518310737104566, "grad_norm": 44.25, "learning_rate": 6.191326838687767e-07, "logits/chosen": 0.8130788803100586, "logits/rejected": 0.6447663307189941, "logps/chosen": -1.6222593784332275, "logps/rejected": -1.539342999458313, "loss": 3.8946, "rewards/accuracies": 0.59375, "rewards/chosen": -16.222591400146484, "rewards/margins": -0.8291639089584351, "rewards/rejected": -15.393428802490234, "step": 101 }, { "epoch": 0.25432445067788684, "grad_norm": 23.25, "learning_rate": 6.172978838358858e-07, "logits/chosen": 0.8688798546791077, "logits/rejected": 0.7208373546600342, "logps/chosen": -1.2495828866958618, "logps/rejected": -1.16335129737854, "loss": 3.3786, "rewards/accuracies": 0.53125, "rewards/chosen": -12.495828628540039, "rewards/margins": -0.8623146414756775, "rewards/rejected": -11.633513450622559, "step": 102 }, { "epoch": 0.25681782764531713, "grad_norm": 28.625, "learning_rate": 6.154452857950179e-07, "logits/chosen": 0.867901086807251, "logits/rejected": 0.6571163535118103, "logps/chosen": -1.4274240732192993, "logps/rejected": -1.2099568843841553, "loss": 4.0273, "rewards/accuracies": 0.40625, "rewards/chosen": -14.274239540100098, "rewards/margins": -2.174670934677124, "rewards/rejected": -12.099568367004395, "step": 103 }, { "epoch": 0.25931120461274737, "grad_norm": 17.5, "learning_rate": 6.135750131012639e-07, "logits/chosen": 0.8423357009887695, "logits/rejected": 0.7953418493270874, "logps/chosen": -1.1816288232803345, "logps/rejected": -1.4284311532974243, "loss": 1.5765, "rewards/accuracies": 0.84375, "rewards/chosen": -11.816287994384766, "rewards/margins": 2.468022346496582, "rewards/rejected": -14.284311294555664, "step": 104 }, { "epoch": 0.26180458158017766, "grad_norm": 48.0, "learning_rate": 6.116871902865795e-07, "logits/chosen": 0.7953894138336182, "logits/rejected": 0.6910791993141174, "logps/chosen": -1.4984780550003052, "logps/rejected": -1.359675645828247, "loss": 4.2421, "rewards/accuracies": 0.46875, "rewards/chosen": -14.984780311584473, "rewards/margins": -1.3880234956741333, "rewards/rejected": -13.596756935119629, "step": 105 }, { "epoch": 0.2642979585476079, "grad_norm": 14.5, "learning_rate": 6.097819430514944e-07, "logits/chosen": 0.8314008712768555, "logits/rejected": 0.6421066522598267, "logps/chosen": -1.1923877000808716, "logps/rejected": -1.403716802597046, "loss": 1.3615, "rewards/accuracies": 0.6875, "rewards/chosen": -11.92387580871582, "rewards/margins": 2.1132919788360596, "rewards/rejected": -14.037168502807617, "step": 106 }, { "epoch": 0.2667913355150382, "grad_norm": 41.25, "learning_rate": 6.078593982567416e-07, "logits/chosen": 0.9006607532501221, "logits/rejected": 0.7951247096061707, "logps/chosen": -1.5271791219711304, "logps/rejected": -1.3939223289489746, "loss": 3.7453, "rewards/accuracies": 0.5, "rewards/chosen": -15.271790504455566, "rewards/margins": -1.332566738128662, "rewards/rejected": -13.939225196838379, "step": 107 }, { "epoch": 0.26928471248246844, "grad_norm": 47.25, "learning_rate": 6.059196839148109e-07, "logits/chosen": 0.7548659443855286, "logits/rejected": 0.6844202280044556, "logps/chosen": -1.4953826665878296, "logps/rejected": -1.211591362953186, "loss": 5.099, "rewards/accuracies": 0.375, "rewards/chosen": -14.953826904296875, "rewards/margins": -2.8379130363464355, "rewards/rejected": -12.115914344787598, "step": 108 }, { "epoch": 0.27177808944989873, "grad_norm": 26.75, "learning_rate": 6.039629291814247e-07, "logits/chosen": 0.7883430123329163, "logits/rejected": 0.6593764424324036, "logps/chosen": -1.4129087924957275, "logps/rejected": -1.6393111944198608, "loss": 2.0234, "rewards/accuracies": 0.71875, "rewards/chosen": -14.12908935546875, "rewards/margins": 2.2640252113342285, "rewards/rejected": -16.39311408996582, "step": 109 }, { "epoch": 0.27427146641732897, "grad_norm": 47.25, "learning_rate": 6.019892643469387e-07, "logits/chosen": 0.8495079874992371, "logits/rejected": 0.7186658978462219, "logps/chosen": -1.4737249612808228, "logps/rejected": -1.3164616823196411, "loss": 3.8864, "rewards/accuracies": 0.4375, "rewards/chosen": -14.737249374389648, "rewards/margins": -1.5726318359375, "rewards/rejected": -13.164617538452148, "step": 110 }, { "epoch": 0.2767648433847592, "grad_norm": 60.5, "learning_rate": 5.999988208276662e-07, "logits/chosen": 0.8825462460517883, "logits/rejected": 0.6535596251487732, "logps/chosen": -1.5816519260406494, "logps/rejected": -1.498726725578308, "loss": 3.1086, "rewards/accuracies": 0.40625, "rewards/chosen": -15.816520690917969, "rewards/margins": -0.8292534351348877, "rewards/rejected": -14.987266540527344, "step": 111 }, { "epoch": 0.2792582203521895, "grad_norm": 54.0, "learning_rate": 5.979917311571282e-07, "logits/chosen": 0.8688668012619019, "logits/rejected": 0.5492098927497864, "logps/chosen": -1.4838958978652954, "logps/rejected": -1.6213023662567139, "loss": 2.3478, "rewards/accuracies": 0.4375, "rewards/chosen": -14.838959693908691, "rewards/margins": 1.3740637302398682, "rewards/rejected": -16.213022232055664, "step": 112 }, { "epoch": 0.28175159731961974, "grad_norm": 46.5, "learning_rate": 5.959681289772278e-07, "logits/chosen": 0.842609703540802, "logits/rejected": 0.6387814283370972, "logps/chosen": -1.5294029712677002, "logps/rejected": -1.7203947305679321, "loss": 2.5737, "rewards/accuracies": 0.5625, "rewards/chosen": -15.294027328491211, "rewards/margins": 1.9099199771881104, "rewards/rejected": -17.203948974609375, "step": 113 }, { "epoch": 0.28424497428705003, "grad_norm": 22.875, "learning_rate": 5.939281490293527e-07, "logits/chosen": 0.7885753512382507, "logits/rejected": 0.7003703713417053, "logps/chosen": -1.6169934272766113, "logps/rejected": -1.616774082183838, "loss": 3.128, "rewards/accuracies": 0.53125, "rewards/chosen": -16.169931411743164, "rewards/margins": -0.002192378044128418, "rewards/rejected": -16.167739868164062, "step": 114 }, { "epoch": 0.2867383512544803, "grad_norm": 125.5, "learning_rate": 5.918719271454026e-07, "logits/chosen": 0.8902820944786072, "logits/rejected": 0.6495590806007385, "logps/chosen": -1.7944972515106201, "logps/rejected": -1.6713979244232178, "loss": 3.4653, "rewards/accuracies": 0.40625, "rewards/chosen": -17.94497299194336, "rewards/margins": -1.2309918403625488, "rewards/rejected": -16.71398162841797, "step": 115 }, { "epoch": 0.28923172822191057, "grad_norm": 12.5, "learning_rate": 5.897996002387454e-07, "logits/chosen": 0.9350267648696899, "logits/rejected": 0.7698911428451538, "logps/chosen": -1.3168952465057373, "logps/rejected": -1.5409971475601196, "loss": 2.018, "rewards/accuracies": 0.71875, "rewards/chosen": -13.168952941894531, "rewards/margins": 2.241018533706665, "rewards/rejected": -15.409971237182617, "step": 116 }, { "epoch": 0.2917251051893408, "grad_norm": 35.25, "learning_rate": 5.877113062951007e-07, "logits/chosen": 0.9151044487953186, "logits/rejected": 0.7181938886642456, "logps/chosen": -1.3629308938980103, "logps/rejected": -2.3142240047454834, "loss": 2.7597, "rewards/accuracies": 0.59375, "rewards/chosen": -13.62930965423584, "rewards/margins": 9.512930870056152, "rewards/rejected": -23.142240524291992, "step": 117 }, { "epoch": 0.2942184821567711, "grad_norm": 19.75, "learning_rate": 5.856071843633516e-07, "logits/chosen": 0.8448264598846436, "logits/rejected": 0.6548407077789307, "logps/chosen": -1.355668544769287, "logps/rejected": -1.4345015287399292, "loss": 2.6585, "rewards/accuracies": 0.59375, "rewards/chosen": -13.556684494018555, "rewards/margins": 0.7883304953575134, "rewards/rejected": -14.345015525817871, "step": 118 }, { "epoch": 0.29671185912420134, "grad_norm": 52.0, "learning_rate": 5.834873745462869e-07, "logits/chosen": 0.9469012022018433, "logits/rejected": 0.6909551620483398, "logps/chosen": -1.5371216535568237, "logps/rejected": -1.9698981046676636, "loss": 1.7712, "rewards/accuracies": 0.71875, "rewards/chosen": -15.3712158203125, "rewards/margins": 4.327763557434082, "rewards/rejected": -19.698978424072266, "step": 119 }, { "epoch": 0.2992052360916316, "grad_norm": 38.75, "learning_rate": 5.813520179912718e-07, "logits/chosen": 0.8846210241317749, "logits/rejected": 0.6549557447433472, "logps/chosen": -1.5691332817077637, "logps/rejected": -1.859965443611145, "loss": 1.9083, "rewards/accuracies": 0.46875, "rewards/chosen": -15.691333770751953, "rewards/margins": 2.9083199501037598, "rewards/rejected": -18.599653244018555, "step": 120 }, { "epoch": 0.30169861305906187, "grad_norm": 52.75, "learning_rate": 5.792012568808498e-07, "logits/chosen": 0.9424107074737549, "logits/rejected": 0.638304591178894, "logps/chosen": -1.7227492332458496, "logps/rejected": -1.9438109397888184, "loss": 2.7587, "rewards/accuracies": 0.5625, "rewards/chosen": -17.227493286132812, "rewards/margins": 2.2106146812438965, "rewards/rejected": -19.438106536865234, "step": 121 }, { "epoch": 0.3041919900264921, "grad_norm": 30.125, "learning_rate": 5.770352344232754e-07, "logits/chosen": 0.9350774884223938, "logits/rejected": 0.7812179327011108, "logps/chosen": -1.4625705480575562, "logps/rejected": -1.6399712562561035, "loss": 2.1925, "rewards/accuracies": 0.53125, "rewards/chosen": -14.62570571899414, "rewards/margins": 1.7740064859390259, "rewards/rejected": -16.39971351623535, "step": 122 }, { "epoch": 0.3066853669939224, "grad_norm": 38.75, "learning_rate": 5.748540948429791e-07, "logits/chosen": 0.8861021995544434, "logits/rejected": 0.5621581077575684, "logps/chosen": -1.7297865152359009, "logps/rejected": -2.025303840637207, "loss": 2.116, "rewards/accuracies": 0.625, "rewards/chosen": -17.297866821289062, "rewards/margins": 2.955172538757324, "rewards/rejected": -20.25303840637207, "step": 123 }, { "epoch": 0.30917874396135264, "grad_norm": 40.0, "learning_rate": 5.726579833709629e-07, "logits/chosen": 0.8791552782058716, "logits/rejected": 0.7237104773521423, "logps/chosen": -1.5754930973052979, "logps/rejected": -1.760854959487915, "loss": 1.9028, "rewards/accuracies": 0.53125, "rewards/chosen": -15.75493049621582, "rewards/margins": 1.853618860244751, "rewards/rejected": -17.608549118041992, "step": 124 }, { "epoch": 0.31167212092878294, "grad_norm": 52.25, "learning_rate": 5.704470462351321e-07, "logits/chosen": 0.8605432510375977, "logits/rejected": 0.6145266890525818, "logps/chosen": -1.4967951774597168, "logps/rejected": -1.6985702514648438, "loss": 2.7418, "rewards/accuracies": 0.5625, "rewards/chosen": -14.9679536819458, "rewards/margins": 2.0177483558654785, "rewards/rejected": -16.985700607299805, "step": 125 }, { "epoch": 0.3141654978962132, "grad_norm": 9.25, "learning_rate": 5.682214306505567e-07, "logits/chosen": 0.89193195104599, "logits/rejected": 0.7236483097076416, "logps/chosen": -1.4118638038635254, "logps/rejected": -1.9812034368515015, "loss": 1.6725, "rewards/accuracies": 0.75, "rewards/chosen": -14.118638038635254, "rewards/margins": 5.693397521972656, "rewards/rejected": -19.812034606933594, "step": 126 }, { "epoch": 0.31665887486364347, "grad_norm": 19.0, "learning_rate": 5.659812848096706e-07, "logits/chosen": 0.7631481289863586, "logits/rejected": 0.6791519522666931, "logps/chosen": -1.5167012214660645, "logps/rejected": -1.6185718774795532, "loss": 3.444, "rewards/accuracies": 0.5625, "rewards/chosen": -15.167011260986328, "rewards/margins": 1.0187066793441772, "rewards/rejected": -16.185718536376953, "step": 127 }, { "epoch": 0.3191522518310737, "grad_norm": 55.75, "learning_rate": 5.637267578724034e-07, "logits/chosen": 0.847726047039032, "logits/rejected": 0.693824291229248, "logps/chosen": -1.5810160636901855, "logps/rejected": -1.9167366027832031, "loss": 2.9597, "rewards/accuracies": 0.46875, "rewards/chosen": -15.810161590576172, "rewards/margins": 3.357205867767334, "rewards/rejected": -19.16736602783203, "step": 128 }, { "epoch": 0.32164562879850395, "grad_norm": 72.5, "learning_rate": 5.614579999562487e-07, "logits/chosen": 0.878848135471344, "logits/rejected": 0.7662035822868347, "logps/chosen": -1.6665140390396118, "logps/rejected": -1.7739882469177246, "loss": 3.1744, "rewards/accuracies": 0.46875, "rewards/chosen": -16.66514015197754, "rewards/margins": 1.0747425556182861, "rewards/rejected": -17.73988151550293, "step": 129 }, { "epoch": 0.32413900576593424, "grad_norm": 61.5, "learning_rate": 5.591751621262691e-07, "logits/chosen": 0.8593266010284424, "logits/rejected": 0.7886440753936768, "logps/chosen": -1.1743977069854736, "logps/rejected": -1.3935869932174683, "loss": 1.9932, "rewards/accuracies": 0.5625, "rewards/chosen": -11.743976593017578, "rewards/margins": 2.1918928623199463, "rewards/rejected": -13.935870170593262, "step": 130 }, { "epoch": 0.3266323827333645, "grad_norm": 23.75, "learning_rate": 5.568783963850368e-07, "logits/chosen": 0.9685453176498413, "logits/rejected": 0.7054411768913269, "logps/chosen": -1.598836898803711, "logps/rejected": -1.8996286392211914, "loss": 2.1934, "rewards/accuracies": 0.5625, "rewards/chosen": -15.988369941711426, "rewards/margins": 3.0079164505004883, "rewards/rejected": -18.996286392211914, "step": 131 }, { "epoch": 0.3291257597007948, "grad_norm": 22.0, "learning_rate": 5.545678556625129e-07, "logits/chosen": 0.8639561533927917, "logits/rejected": 0.6618623733520508, "logps/chosen": -1.7690205574035645, "logps/rejected": -2.254978895187378, "loss": 1.9177, "rewards/accuracies": 0.625, "rewards/chosen": -17.69020652770996, "rewards/margins": 4.859582901000977, "rewards/rejected": -22.549787521362305, "step": 132 }, { "epoch": 0.331619136668225, "grad_norm": 27.5, "learning_rate": 5.522436938058645e-07, "logits/chosen": 0.8631035089492798, "logits/rejected": 0.7001104950904846, "logps/chosen": -1.5964336395263672, "logps/rejected": -2.130333185195923, "loss": 1.625, "rewards/accuracies": 0.75, "rewards/chosen": -15.964335441589355, "rewards/margins": 5.338994979858398, "rewards/rejected": -21.303333282470703, "step": 133 }, { "epoch": 0.3341125136356553, "grad_norm": 59.0, "learning_rate": 5.49906065569221e-07, "logits/chosen": 0.733770489692688, "logits/rejected": 0.5061658620834351, "logps/chosen": -1.5350837707519531, "logps/rejected": -1.8332159519195557, "loss": 2.7709, "rewards/accuracies": 0.5625, "rewards/chosen": -15.350838661193848, "rewards/margins": 2.981321334838867, "rewards/rejected": -18.3321590423584, "step": 134 }, { "epoch": 0.33660589060308554, "grad_norm": 13.1875, "learning_rate": 5.475551266033692e-07, "logits/chosen": 0.9098625183105469, "logits/rejected": 0.7151045203208923, "logps/chosen": -1.388254165649414, "logps/rejected": -1.944246530532837, "loss": 1.4884, "rewards/accuracies": 0.75, "rewards/chosen": -13.88254165649414, "rewards/margins": 5.559926509857178, "rewards/rejected": -19.442468643188477, "step": 135 }, { "epoch": 0.33909926757051584, "grad_norm": 36.75, "learning_rate": 5.451910334453903e-07, "logits/chosen": 0.9809038639068604, "logits/rejected": 0.6819513440132141, "logps/chosen": -1.6769332885742188, "logps/rejected": -2.2928450107574463, "loss": 1.2734, "rewards/accuracies": 0.78125, "rewards/chosen": -16.769332885742188, "rewards/margins": 6.159116268157959, "rewards/rejected": -22.928447723388672, "step": 136 }, { "epoch": 0.3415926445379461, "grad_norm": 111.5, "learning_rate": 5.428139435082358e-07, "logits/chosen": 0.9270225763320923, "logits/rejected": 0.6331555843353271, "logps/chosen": -1.6441551446914673, "logps/rejected": -1.7793883085250854, "loss": 2.9125, "rewards/accuracies": 0.46875, "rewards/chosen": -16.441551208496094, "rewards/margins": 1.3523308038711548, "rewards/rejected": -17.793882369995117, "step": 137 }, { "epoch": 0.34408602150537637, "grad_norm": 19.25, "learning_rate": 5.404240150702472e-07, "logits/chosen": 0.9672467708587646, "logits/rejected": 0.8573353886604309, "logps/chosen": -1.3790785074234009, "logps/rejected": -1.8634607791900635, "loss": 1.7018, "rewards/accuracies": 0.65625, "rewards/chosen": -13.79078483581543, "rewards/margins": 4.8438215255737305, "rewards/rejected": -18.634607315063477, "step": 138 }, { "epoch": 0.3465793984728066, "grad_norm": 83.0, "learning_rate": 5.38021407264616e-07, "logits/chosen": 0.8024469614028931, "logits/rejected": 0.5433262586593628, "logps/chosen": -1.3546580076217651, "logps/rejected": -1.5655174255371094, "loss": 2.6886, "rewards/accuracies": 0.65625, "rewards/chosen": -13.546581268310547, "rewards/margins": 2.108593702316284, "rewards/rejected": -15.655172348022461, "step": 139 }, { "epoch": 0.34907277544023685, "grad_norm": 56.5, "learning_rate": 5.356062800687886e-07, "logits/chosen": 0.7994624972343445, "logits/rejected": 0.6035336256027222, "logps/chosen": -1.2650129795074463, "logps/rejected": -1.3674687147140503, "loss": 2.4405, "rewards/accuracies": 0.59375, "rewards/chosen": -12.650128364562988, "rewards/margins": 1.0245567560195923, "rewards/rejected": -13.67468547821045, "step": 140 }, { "epoch": 0.35156615240766714, "grad_norm": 60.75, "learning_rate": 5.331787942938142e-07, "logits/chosen": 1.0324114561080933, "logits/rejected": 0.7126603126525879, "logps/chosen": -1.5447206497192383, "logps/rejected": -1.9410955905914307, "loss": 1.5742, "rewards/accuracies": 0.625, "rewards/chosen": -15.447206497192383, "rewards/margins": 3.963749885559082, "rewards/rejected": -19.41095733642578, "step": 141 }, { "epoch": 0.3540595293750974, "grad_norm": 14.4375, "learning_rate": 5.307391115736366e-07, "logits/chosen": 0.7712888717651367, "logits/rejected": 0.5555048584938049, "logps/chosen": -1.2323440313339233, "logps/rejected": -1.6426218748092651, "loss": 1.5398, "rewards/accuracies": 0.6875, "rewards/chosen": -12.32343864440918, "rewards/margins": 4.102778434753418, "rewards/rejected": -16.42621612548828, "step": 142 }, { "epoch": 0.3565529063425277, "grad_norm": 42.25, "learning_rate": 5.282873943543326e-07, "logits/chosen": 0.8940728306770325, "logits/rejected": 0.7413418292999268, "logps/chosen": -1.296794056892395, "logps/rejected": -1.8393501043319702, "loss": 1.7974, "rewards/accuracies": 0.59375, "rewards/chosen": -12.967940330505371, "rewards/margins": 5.425559997558594, "rewards/rejected": -18.39349937438965, "step": 143 }, { "epoch": 0.3590462833099579, "grad_norm": 31.75, "learning_rate": 5.258238058832948e-07, "logits/chosen": 0.9329725503921509, "logits/rejected": 0.5702534914016724, "logps/chosen": -1.3792263269424438, "logps/rejected": -1.757681965827942, "loss": 2.1616, "rewards/accuracies": 0.65625, "rewards/chosen": -13.792261123657227, "rewards/margins": 3.784557342529297, "rewards/rejected": -17.576818466186523, "step": 144 }, { "epoch": 0.3615396602773882, "grad_norm": 78.5, "learning_rate": 5.233485101983624e-07, "logits/chosen": 0.9451256990432739, "logits/rejected": 0.8186403512954712, "logps/chosen": -1.5383343696594238, "logps/rejected": -2.494551181793213, "loss": 1.4328, "rewards/accuracies": 0.6875, "rewards/chosen": -15.383341789245605, "rewards/margins": 9.562170028686523, "rewards/rejected": -24.945512771606445, "step": 145 }, { "epoch": 0.36403303724481845, "grad_norm": 58.5, "learning_rate": 5.208616721168984e-07, "logits/chosen": 0.9742121696472168, "logits/rejected": 0.7483265995979309, "logps/chosen": -1.6329911947250366, "logps/rejected": -2.0833840370178223, "loss": 1.8646, "rewards/accuracies": 0.6875, "rewards/chosen": -16.329910278320312, "rewards/margins": 4.503929138183594, "rewards/rejected": -20.833839416503906, "step": 146 }, { "epoch": 0.36652641421224874, "grad_norm": 29.5, "learning_rate": 5.183634572248153e-07, "logits/chosen": 0.8174174427986145, "logits/rejected": 0.7698001265525818, "logps/chosen": -1.255910038948059, "logps/rejected": -1.4298808574676514, "loss": 2.2763, "rewards/accuracies": 0.5, "rewards/chosen": -12.559102058410645, "rewards/margins": 1.739708423614502, "rewards/rejected": -14.298810005187988, "step": 147 }, { "epoch": 0.369019791179679, "grad_norm": 161.0, "learning_rate": 5.158540318655495e-07, "logits/chosen": 1.1192365884780884, "logits/rejected": 0.7937313914299011, "logps/chosen": -1.7974631786346436, "logps/rejected": -2.402998924255371, "loss": 2.2646, "rewards/accuracies": 0.5625, "rewards/chosen": -17.974632263183594, "rewards/margins": 6.055357933044434, "rewards/rejected": -24.02998924255371, "step": 148 }, { "epoch": 0.3715131681471092, "grad_norm": 13.4375, "learning_rate": 5.133335631289858e-07, "logits/chosen": 1.004485011100769, "logits/rejected": 0.6550527215003967, "logps/chosen": -1.4417423009872437, "logps/rejected": -2.1560442447662354, "loss": 1.3901, "rewards/accuracies": 0.65625, "rewards/chosen": -14.4174222946167, "rewards/margins": 7.1430182456970215, "rewards/rejected": -21.560441970825195, "step": 149 }, { "epoch": 0.3740065451145395, "grad_norm": 29.75, "learning_rate": 5.10802218840331e-07, "logits/chosen": 0.8932673335075378, "logits/rejected": 0.695792019367218, "logps/chosen": -1.3724555969238281, "logps/rejected": -1.7769482135772705, "loss": 1.7406, "rewards/accuracies": 0.65625, "rewards/chosen": -13.724554061889648, "rewards/margins": 4.044928073883057, "rewards/rejected": -17.76948356628418, "step": 150 }, { "epoch": 0.37649992208196975, "grad_norm": 38.25, "learning_rate": 5.0826016754894e-07, "logits/chosen": 0.9987000823020935, "logits/rejected": 0.6120975017547607, "logps/chosen": -1.7447395324707031, "logps/rejected": -2.424745559692383, "loss": 2.0385, "rewards/accuracies": 0.625, "rewards/chosen": -17.4473934173584, "rewards/margins": 6.800059795379639, "rewards/rejected": -24.247455596923828, "step": 151 }, { "epoch": 0.37899329904940005, "grad_norm": 43.5, "learning_rate": 5.057075785170923e-07, "logits/chosen": 0.7949992418289185, "logits/rejected": 0.735917866230011, "logps/chosen": -1.4737513065338135, "logps/rejected": -1.7997541427612305, "loss": 2.4462, "rewards/accuracies": 0.4375, "rewards/chosen": -14.737512588500977, "rewards/margins": 3.2600276470184326, "rewards/rejected": -17.997541427612305, "step": 152 }, { "epoch": 0.3814866760168303, "grad_norm": 34.75, "learning_rate": 5.031446217087223e-07, "logits/chosen": 0.7635215520858765, "logits/rejected": 0.6593471765518188, "logps/chosen": -1.4680148363113403, "logps/rejected": -1.8192330598831177, "loss": 2.3192, "rewards/accuracies": 0.5625, "rewards/chosen": -14.680147171020508, "rewards/margins": 3.5121822357177734, "rewards/rejected": -18.19232940673828, "step": 153 }, { "epoch": 0.3839800529842606, "grad_norm": 18.625, "learning_rate": 5.005714677781016e-07, "logits/chosen": 0.8512160778045654, "logits/rejected": 0.638878583908081, "logps/chosen": -1.239166259765625, "logps/rejected": -1.7152905464172363, "loss": 1.1124, "rewards/accuracies": 0.625, "rewards/chosen": -12.39166259765625, "rewards/margins": 4.761242866516113, "rewards/rejected": -17.15290641784668, "step": 154 }, { "epoch": 0.3864734299516908, "grad_norm": 16.375, "learning_rate": 4.979882880584766e-07, "logits/chosen": 0.9124481678009033, "logits/rejected": 0.7296810150146484, "logps/chosen": -1.7560640573501587, "logps/rejected": -2.781906843185425, "loss": 1.6899, "rewards/accuracies": 0.71875, "rewards/chosen": -17.56064224243164, "rewards/margins": 10.258424758911133, "rewards/rejected": -27.81906509399414, "step": 155 }, { "epoch": 0.3889668069191211, "grad_norm": 30.125, "learning_rate": 4.953952545506602e-07, "logits/chosen": 0.8763688802719116, "logits/rejected": 0.7317189574241638, "logps/chosen": -1.6232566833496094, "logps/rejected": -2.2681305408477783, "loss": 1.9121, "rewards/accuracies": 0.59375, "rewards/chosen": -16.232566833496094, "rewards/margins": 6.448739051818848, "rewards/rejected": -22.681304931640625, "step": 156 }, { "epoch": 0.39146018388655135, "grad_norm": 23.125, "learning_rate": 4.927925399115788e-07, "logits/chosen": 0.8235619068145752, "logits/rejected": 0.7919750213623047, "logps/chosen": -1.391683578491211, "logps/rejected": -1.6939644813537598, "loss": 2.2898, "rewards/accuracies": 0.625, "rewards/chosen": -13.916834831237793, "rewards/margins": 3.0228097438812256, "rewards/rejected": -16.939645767211914, "step": 157 }, { "epoch": 0.3939535608539816, "grad_norm": 58.0, "learning_rate": 4.901803174427757e-07, "logits/chosen": 0.890289306640625, "logits/rejected": 0.6626406311988831, "logps/chosen": -1.6668946743011475, "logps/rejected": -2.7818055152893066, "loss": 1.1016, "rewards/accuracies": 0.625, "rewards/chosen": -16.668947219848633, "rewards/margins": 11.14910888671875, "rewards/rejected": -27.81805419921875, "step": 158 }, { "epoch": 0.3964469378214119, "grad_norm": 50.25, "learning_rate": 4.875587610788733e-07, "logits/chosen": 0.7171937227249146, "logits/rejected": 0.6810190677642822, "logps/chosen": -1.645186424255371, "logps/rejected": -2.06756854057312, "loss": 2.5663, "rewards/accuracies": 0.5625, "rewards/chosen": -16.45186424255371, "rewards/margins": 4.22382116317749, "rewards/rejected": -20.67568588256836, "step": 159 }, { "epoch": 0.3989403147888421, "grad_norm": 19.375, "learning_rate": 4.849280453759897e-07, "logits/chosen": 0.9262104630470276, "logits/rejected": 0.7050573229789734, "logps/chosen": -1.6274131536483765, "logps/rejected": -2.1605324745178223, "loss": 1.2244, "rewards/accuracies": 0.6875, "rewards/chosen": -16.274131774902344, "rewards/margins": 5.331192970275879, "rewards/rejected": -21.60532569885254, "step": 160 }, { "epoch": 0.4014336917562724, "grad_norm": 83.0, "learning_rate": 4.822883455001173e-07, "logits/chosen": 0.9184644818305969, "logits/rejected": 0.8644086122512817, "logps/chosen": -1.5301023721694946, "logps/rejected": -1.876584768295288, "loss": 2.0259, "rewards/accuracies": 0.625, "rewards/chosen": -15.30102252960205, "rewards/margins": 3.464823007583618, "rewards/rejected": -18.76584815979004, "step": 161 }, { "epoch": 0.40392706872370265, "grad_norm": 24.5, "learning_rate": 4.796398372154588e-07, "logits/chosen": 1.0671634674072266, "logits/rejected": 0.8774153590202332, "logps/chosen": -1.6217372417449951, "logps/rejected": -2.3855130672454834, "loss": 1.4698, "rewards/accuracies": 0.65625, "rewards/chosen": -16.21737289428711, "rewards/margins": 7.637757301330566, "rewards/rejected": -23.85512924194336, "step": 162 }, { "epoch": 0.40642044569113295, "grad_norm": 44.0, "learning_rate": 4.769826968727243e-07, "logits/chosen": 0.80574631690979, "logits/rejected": 0.6158944964408875, "logps/chosen": -1.5703632831573486, "logps/rejected": -2.269869327545166, "loss": 1.3586, "rewards/accuracies": 0.625, "rewards/chosen": -15.703633308410645, "rewards/margins": 6.995059967041016, "rewards/rejected": -22.698694229125977, "step": 163 }, { "epoch": 0.4089138226585632, "grad_norm": 27.5, "learning_rate": 4.743171013973885e-07, "logits/chosen": 0.935499370098114, "logits/rejected": 0.7237244844436646, "logps/chosen": -1.7726106643676758, "logps/rejected": -2.6084468364715576, "loss": 1.447, "rewards/accuracies": 0.6875, "rewards/chosen": -17.72610855102539, "rewards/margins": 8.358359336853027, "rewards/rejected": -26.08446502685547, "step": 164 }, { "epoch": 0.4114071996259935, "grad_norm": 30.0, "learning_rate": 4.716432282779106e-07, "logits/chosen": 0.9203133583068848, "logits/rejected": 0.7862353920936584, "logps/chosen": -1.4431755542755127, "logps/rejected": -2.1590194702148438, "loss": 1.4126, "rewards/accuracies": 0.71875, "rewards/chosen": -14.431756019592285, "rewards/margins": 7.158439636230469, "rewards/rejected": -21.590194702148438, "step": 165 }, { "epoch": 0.4139005765934237, "grad_norm": 100.5, "learning_rate": 4.6896125555391575e-07, "logits/chosen": 0.9510793685913086, "logits/rejected": 0.7097218036651611, "logps/chosen": -1.377150535583496, "logps/rejected": -1.8154629468917847, "loss": 1.436, "rewards/accuracies": 0.625, "rewards/chosen": -13.771505355834961, "rewards/margins": 4.383124351501465, "rewards/rejected": -18.15462875366211, "step": 166 }, { "epoch": 0.41639395356085396, "grad_norm": 40.5, "learning_rate": 4.662713618043413e-07, "logits/chosen": 0.9421004056930542, "logits/rejected": 0.6513608694076538, "logps/chosen": -1.4433151483535767, "logps/rejected": -1.7160542011260986, "loss": 1.3431, "rewards/accuracies": 0.65625, "rewards/chosen": -14.433152198791504, "rewards/margins": 2.7273917198181152, "rewards/rejected": -17.16054344177246, "step": 167 }, { "epoch": 0.41888733052828425, "grad_norm": 78.5, "learning_rate": 4.635737261355447e-07, "logits/chosen": 0.8841539621353149, "logits/rejected": 0.7275552153587341, "logps/chosen": -1.617548942565918, "logps/rejected": -2.5178287029266357, "loss": 1.7514, "rewards/accuracies": 0.53125, "rewards/chosen": -16.17548942565918, "rewards/margins": 9.002798080444336, "rewards/rejected": -25.178287506103516, "step": 168 }, { "epoch": 0.4213807074957145, "grad_norm": 61.5, "learning_rate": 4.608685281693789e-07, "logits/chosen": 0.795113205909729, "logits/rejected": 0.7205825448036194, "logps/chosen": -1.5723981857299805, "logps/rejected": -1.8851563930511475, "loss": 2.6762, "rewards/accuracies": 0.53125, "rewards/chosen": -15.723981857299805, "rewards/margins": 3.1275830268859863, "rewards/rejected": -18.851564407348633, "step": 169 }, { "epoch": 0.4238740844631448, "grad_norm": 40.5, "learning_rate": 4.581559480312316e-07, "logits/chosen": 0.9474557042121887, "logits/rejected": 0.7945749759674072, "logps/chosen": -1.8188387155532837, "logps/rejected": -2.6367805004119873, "loss": 1.3681, "rewards/accuracies": 0.71875, "rewards/chosen": -18.18838882446289, "rewards/margins": 8.179415702819824, "rewards/rejected": -26.36780548095703, "step": 170 }, { "epoch": 0.426367461430575, "grad_norm": 33.25, "learning_rate": 4.5543616633803197e-07, "logits/chosen": 0.7378120422363281, "logits/rejected": 0.7000318169593811, "logps/chosen": -1.4147385358810425, "logps/rejected": -1.8963797092437744, "loss": 1.899, "rewards/accuracies": 0.53125, "rewards/chosen": -14.147384643554688, "rewards/margins": 4.816410541534424, "rewards/rejected": -18.963794708251953, "step": 171 }, { "epoch": 0.4288608383980053, "grad_norm": 35.25, "learning_rate": 4.527093641862241e-07, "logits/chosen": 0.9072024822235107, "logits/rejected": 0.7587930560112, "logps/chosen": -1.2699742317199707, "logps/rejected": -1.702739953994751, "loss": 1.4364, "rewards/accuracies": 0.65625, "rewards/chosen": -12.699743270874023, "rewards/margins": 4.327658176422119, "rewards/rejected": -17.027400970458984, "step": 172 }, { "epoch": 0.43135421536543556, "grad_norm": 25.75, "learning_rate": 4.499757231397087e-07, "logits/chosen": 0.8443821668624878, "logits/rejected": 0.6597446203231812, "logps/chosen": -1.509061336517334, "logps/rejected": -2.0712409019470215, "loss": 1.2708, "rewards/accuracies": 0.65625, "rewards/chosen": -15.09061336517334, "rewards/margins": 5.6217942237854, "rewards/rejected": -20.7124080657959, "step": 173 }, { "epoch": 0.43384759233286585, "grad_norm": 23.375, "learning_rate": 4.4723542521775385e-07, "logits/chosen": 1.0543487071990967, "logits/rejected": 0.5649646520614624, "logps/chosen": -1.4722059965133667, "logps/rejected": -2.189124584197998, "loss": 0.8446, "rewards/accuracies": 0.75, "rewards/chosen": -14.72205924987793, "rewards/margins": 7.169185638427734, "rewards/rejected": -21.891244888305664, "step": 174 }, { "epoch": 0.4363409693002961, "grad_norm": 54.75, "learning_rate": 4.444886528828749e-07, "logits/chosen": 0.9907981157302856, "logits/rejected": 0.7723469138145447, "logps/chosen": -1.8176202774047852, "logps/rejected": -2.3857648372650146, "loss": 1.7344, "rewards/accuracies": 0.59375, "rewards/chosen": -18.17620277404785, "rewards/margins": 5.6814446449279785, "rewards/rejected": -23.857648849487305, "step": 175 }, { "epoch": 0.4388343462677263, "grad_norm": 31.5, "learning_rate": 4.417355890286857e-07, "logits/chosen": 0.9411242008209229, "logits/rejected": 0.7533101439476013, "logps/chosen": -1.6791445016860962, "logps/rejected": -2.381438732147217, "loss": 1.8322, "rewards/accuracies": 0.6875, "rewards/chosen": -16.791446685791016, "rewards/margins": 7.022940635681152, "rewards/rejected": -23.81438446044922, "step": 176 }, { "epoch": 0.4413277232351566, "grad_norm": 51.25, "learning_rate": 4.389764169677205e-07, "logits/chosen": 0.862296462059021, "logits/rejected": 0.7431577444076538, "logps/chosen": -1.3871877193450928, "logps/rejected": -1.9420627355575562, "loss": 1.2998, "rewards/accuracies": 0.65625, "rewards/chosen": -13.87187671661377, "rewards/margins": 5.548751354217529, "rewards/rejected": -19.420629501342773, "step": 177 }, { "epoch": 0.44382110020258686, "grad_norm": 41.0, "learning_rate": 4.3621132041922745e-07, "logits/chosen": 0.8196381330490112, "logits/rejected": 0.735532820224762, "logps/chosen": -1.3557261228561401, "logps/rejected": -2.2253174781799316, "loss": 1.2671, "rewards/accuracies": 0.625, "rewards/chosen": -13.557262420654297, "rewards/margins": 8.695913314819336, "rewards/rejected": -22.253175735473633, "step": 178 }, { "epoch": 0.44631447717001715, "grad_norm": 28.75, "learning_rate": 4.334404834969368e-07, "logits/chosen": 1.0182719230651855, "logits/rejected": 0.8464354872703552, "logps/chosen": -1.3779345750808716, "logps/rejected": -1.8232632875442505, "loss": 1.3614, "rewards/accuracies": 0.59375, "rewards/chosen": -13.779345512390137, "rewards/margins": 4.4532856941223145, "rewards/rejected": -18.23263168334961, "step": 179 }, { "epoch": 0.4488078541374474, "grad_norm": 18.375, "learning_rate": 4.306640906968011e-07, "logits/chosen": 0.927130401134491, "logits/rejected": 0.7001396417617798, "logps/chosen": -1.3739848136901855, "logps/rejected": -2.3601279258728027, "loss": 0.5738, "rewards/accuracies": 0.875, "rewards/chosen": -13.739850044250488, "rewards/margins": 9.861430168151855, "rewards/rejected": -23.601280212402344, "step": 180 }, { "epoch": 0.4513012311048777, "grad_norm": 46.25, "learning_rate": 4.2788232688471e-07, "logits/chosen": 0.858923077583313, "logits/rejected": 0.7578305006027222, "logps/chosen": -1.2482776641845703, "logps/rejected": -1.7487417459487915, "loss": 1.0749, "rewards/accuracies": 0.6875, "rewards/chosen": -12.482775688171387, "rewards/margins": 5.004642486572266, "rewards/rejected": -17.487417221069336, "step": 181 }, { "epoch": 0.4537946080723079, "grad_norm": 83.0, "learning_rate": 4.2509537728418233e-07, "logits/chosen": 0.8518757224082947, "logits/rejected": 0.7721596360206604, "logps/chosen": -1.3393375873565674, "logps/rejected": -1.7837915420532227, "loss": 1.2853, "rewards/accuracies": 0.6875, "rewards/chosen": -13.393375396728516, "rewards/margins": 4.444540977478027, "rewards/rejected": -17.83791732788086, "step": 182 }, { "epoch": 0.4562879850397382, "grad_norm": 56.0, "learning_rate": 4.223034274640317e-07, "logits/chosen": 0.9242639541625977, "logits/rejected": 0.7321256995201111, "logps/chosen": -1.6946762800216675, "logps/rejected": -2.9650654792785645, "loss": 1.0034, "rewards/accuracies": 0.75, "rewards/chosen": -16.94676399230957, "rewards/margins": 12.703892707824707, "rewards/rejected": -29.65065574645996, "step": 183 }, { "epoch": 0.45878136200716846, "grad_norm": 38.75, "learning_rate": 4.195066633260109e-07, "logits/chosen": 0.8796188831329346, "logits/rejected": 0.6841633319854736, "logps/chosen": -1.3098094463348389, "logps/rejected": -1.709314227104187, "loss": 1.0082, "rewards/accuracies": 0.75, "rewards/chosen": -13.09809398651123, "rewards/margins": 3.995047092437744, "rewards/rejected": -17.0931396484375, "step": 184 }, { "epoch": 0.4612747389745987, "grad_norm": 49.75, "learning_rate": 4.1670527109243414e-07, "logits/chosen": 0.8437327146530151, "logits/rejected": 0.7233911156654358, "logps/chosen": -1.552445888519287, "logps/rejected": -2.1319706439971924, "loss": 1.2603, "rewards/accuracies": 0.71875, "rewards/chosen": -15.524458885192871, "rewards/margins": 5.795248031616211, "rewards/rejected": -21.3197078704834, "step": 185 }, { "epoch": 0.463768115942029, "grad_norm": 45.5, "learning_rate": 4.138994372937766e-07, "logits/chosen": 0.9246405363082886, "logits/rejected": 0.7257117629051208, "logps/chosen": -1.5023407936096191, "logps/rejected": -2.219346046447754, "loss": 1.2028, "rewards/accuracies": 0.6875, "rewards/chosen": -15.023408889770508, "rewards/margins": 7.170053958892822, "rewards/rejected": -22.193462371826172, "step": 186 }, { "epoch": 0.46626149290945923, "grad_norm": 74.0, "learning_rate": 4.110893487562548e-07, "logits/chosen": 0.7957507371902466, "logits/rejected": 0.7296849489212036, "logps/chosen": -1.3323700428009033, "logps/rejected": -2.052271842956543, "loss": 0.6391, "rewards/accuracies": 0.71875, "rewards/chosen": -13.323701858520508, "rewards/margins": 7.199017524719238, "rewards/rejected": -20.52271842956543, "step": 187 }, { "epoch": 0.4687548698768895, "grad_norm": 56.25, "learning_rate": 4.082751925893869e-07, "logits/chosen": 0.8817852735519409, "logits/rejected": 0.7720733880996704, "logps/chosen": -1.196410059928894, "logps/rejected": -1.500748634338379, "loss": 0.9032, "rewards/accuracies": 0.71875, "rewards/chosen": -11.96410083770752, "rewards/margins": 3.0433857440948486, "rewards/rejected": -15.007488250732422, "step": 188 }, { "epoch": 0.47124824684431976, "grad_norm": 41.0, "learning_rate": 4.054571561735334e-07, "logits/chosen": 0.9272749423980713, "logits/rejected": 0.6019188761711121, "logps/chosen": -1.804772138595581, "logps/rejected": -2.7550244331359863, "loss": 0.5996, "rewards/accuracies": 0.71875, "rewards/chosen": -18.04772186279297, "rewards/margins": 9.502524375915527, "rewards/rejected": -27.550243377685547, "step": 189 }, { "epoch": 0.47374162381175006, "grad_norm": 15.6875, "learning_rate": 4.026354271474214e-07, "logits/chosen": 0.9149619340896606, "logits/rejected": 0.6641325950622559, "logps/chosen": -1.705862283706665, "logps/rejected": -3.0835325717926025, "loss": 1.0064, "rewards/accuracies": 0.71875, "rewards/chosen": -17.058624267578125, "rewards/margins": 13.776700019836426, "rewards/rejected": -30.835325241088867, "step": 190 }, { "epoch": 0.4762350007791803, "grad_norm": 23.0, "learning_rate": 3.998101933956498e-07, "logits/chosen": 0.8473320007324219, "logits/rejected": 0.7866963744163513, "logps/chosen": -1.4231493473052979, "logps/rejected": -2.1398186683654785, "loss": 0.7511, "rewards/accuracies": 0.75, "rewards/chosen": -14.23149299621582, "rewards/margins": 7.1666951179504395, "rewards/rejected": -21.39818572998047, "step": 191 }, { "epoch": 0.4787283777466106, "grad_norm": 39.5, "learning_rate": 3.969816430361794e-07, "logits/chosen": 0.8237781524658203, "logits/rejected": 0.7161869406700134, "logps/chosen": -1.8508167266845703, "logps/rejected": -2.9980063438415527, "loss": 0.7146, "rewards/accuracies": 0.75, "rewards/chosen": -18.508167266845703, "rewards/margins": 11.471895217895508, "rewards/rejected": -29.980064392089844, "step": 192 }, { "epoch": 0.48122175471404083, "grad_norm": 75.0, "learning_rate": 3.9414996440780724e-07, "logits/chosen": 0.9529024958610535, "logits/rejected": 0.8278242349624634, "logps/chosen": -1.8834271430969238, "logps/rejected": -2.4769580364227295, "loss": 1.046, "rewards/accuracies": 0.71875, "rewards/chosen": -18.834270477294922, "rewards/margins": 5.935309410095215, "rewards/rejected": -24.769580841064453, "step": 193 }, { "epoch": 0.48371513168147107, "grad_norm": 62.0, "learning_rate": 3.913153460576256e-07, "logits/chosen": 0.916070818901062, "logits/rejected": 0.6884597539901733, "logps/chosen": -1.893513560295105, "logps/rejected": -2.9602129459381104, "loss": 1.0144, "rewards/accuracies": 0.75, "rewards/chosen": -18.935134887695312, "rewards/margins": 10.66699504852295, "rewards/rejected": -29.602130889892578, "step": 194 }, { "epoch": 0.48620850864890136, "grad_norm": 18.125, "learning_rate": 3.8847797672846825e-07, "logits/chosen": 0.9603822231292725, "logits/rejected": 0.6512764692306519, "logps/chosen": -1.7449413537979126, "logps/rejected": -2.7406604290008545, "loss": 0.3528, "rewards/accuracies": 0.875, "rewards/chosen": -17.44941520690918, "rewards/margins": 9.957185745239258, "rewards/rejected": -27.40660285949707, "step": 195 }, { "epoch": 0.4887018856163316, "grad_norm": 10.75, "learning_rate": 3.8563804534634246e-07, "logits/chosen": 0.9687063694000244, "logits/rejected": 0.8893125057220459, "logps/chosen": -1.372796654701233, "logps/rejected": -2.2664504051208496, "loss": 0.533, "rewards/accuracies": 0.8125, "rewards/chosen": -13.727968215942383, "rewards/margins": 8.93653678894043, "rewards/rejected": -22.664501190185547, "step": 196 }, { "epoch": 0.4911952625837619, "grad_norm": 48.75, "learning_rate": 3.827957410078494e-07, "logits/chosen": 0.8412132859230042, "logits/rejected": 0.7297399044036865, "logps/chosen": -2.030590772628784, "logps/rejected": -3.408313035964966, "loss": 0.7652, "rewards/accuracies": 0.75, "rewards/chosen": -20.305906295776367, "rewards/margins": 13.777222633361816, "rewards/rejected": -34.0831298828125, "step": 197 }, { "epoch": 0.49368863955119213, "grad_norm": 25.0, "learning_rate": 3.799512529675939e-07, "logits/chosen": 0.8365733623504639, "logits/rejected": 0.7946135997772217, "logps/chosen": -1.8182940483093262, "logps/rejected": -2.8680472373962402, "loss": 0.6624, "rewards/accuracies": 0.875, "rewards/chosen": -18.182941436767578, "rewards/margins": 10.497532844543457, "rewards/rejected": -28.680471420288086, "step": 198 }, { "epoch": 0.4961820165186224, "grad_norm": 49.25, "learning_rate": 3.7710477062558195e-07, "logits/chosen": 0.8030841946601868, "logits/rejected": 0.6840673685073853, "logps/chosen": -1.7462419271469116, "logps/rejected": -2.6651408672332764, "loss": 0.9539, "rewards/accuracies": 0.65625, "rewards/chosen": -17.462419509887695, "rewards/margins": 9.188987731933594, "rewards/rejected": -26.651405334472656, "step": 199 }, { "epoch": 0.49867539348605266, "grad_norm": 37.25, "learning_rate": 3.742564835146099e-07, "logits/chosen": 0.940216064453125, "logits/rejected": 0.7382882833480835, "logps/chosen": -1.5715973377227783, "logps/rejected": -2.2499136924743652, "loss": 0.5224, "rewards/accuracies": 0.84375, "rewards/chosen": -15.715973854064941, "rewards/margins": 6.7831621170043945, "rewards/rejected": -22.499134063720703, "step": 200 }, { "epoch": 0.501168770453483, "grad_norm": 53.5, "learning_rate": 3.71406581287645e-07, "logits/chosen": 0.8017429113388062, "logits/rejected": 0.7019472122192383, "logps/chosen": -1.5708627700805664, "logps/rejected": -2.446748733520508, "loss": 0.6145, "rewards/accuracies": 0.78125, "rewards/chosen": -15.70862865447998, "rewards/margins": 8.758858680725098, "rewards/rejected": -24.467487335205078, "step": 201 }, { "epoch": 0.5036621474209132, "grad_norm": 24.5, "learning_rate": 3.6855525370519617e-07, "logits/chosen": 0.9191329479217529, "logits/rejected": 0.7709681987762451, "logps/chosen": -1.2503210306167603, "logps/rejected": -1.8521945476531982, "loss": 0.5184, "rewards/accuracies": 0.78125, "rewards/chosen": -12.50321102142334, "rewards/margins": 6.018735408782959, "rewards/rejected": -18.52194595336914, "step": 202 }, { "epoch": 0.5061555243883434, "grad_norm": 17.375, "learning_rate": 3.6570269062268025e-07, "logits/chosen": 0.7203347682952881, "logits/rejected": 0.7312765717506409, "logps/chosen": -1.9442358016967773, "logps/rejected": -3.1624157428741455, "loss": 0.6896, "rewards/accuracies": 0.8125, "rewards/chosen": -19.44235610961914, "rewards/margins": 12.18179988861084, "rewards/rejected": -31.624156951904297, "step": 203 }, { "epoch": 0.5086489013557737, "grad_norm": 14.125, "learning_rate": 3.6284908197777915e-07, "logits/chosen": 0.7811324596405029, "logits/rejected": 0.7788522839546204, "logps/chosen": -1.5343513488769531, "logps/rejected": -2.655756711959839, "loss": 0.4037, "rewards/accuracies": 0.84375, "rewards/chosen": -15.343514442443848, "rewards/margins": 11.214055061340332, "rewards/rejected": -26.557571411132812, "step": 204 }, { "epoch": 0.511142278323204, "grad_norm": 9.75, "learning_rate": 3.599946177777936e-07, "logits/chosen": 0.9504005908966064, "logits/rejected": 0.8734852075576782, "logps/chosen": -1.6099568605422974, "logps/rejected": -2.527747869491577, "loss": 0.4034, "rewards/accuracies": 0.84375, "rewards/chosen": -16.099567413330078, "rewards/margins": 9.177909851074219, "rewards/rejected": -25.277477264404297, "step": 205 }, { "epoch": 0.5136356552906343, "grad_norm": 9.0, "learning_rate": 3.571394880869919e-07, "logits/chosen": 1.0245471000671387, "logits/rejected": 0.8590348958969116, "logps/chosen": -1.5646387338638306, "logps/rejected": -2.8300962448120117, "loss": 0.633, "rewards/accuracies": 0.78125, "rewards/chosen": -15.64638614654541, "rewards/margins": 12.65457534790039, "rewards/rejected": -28.300960540771484, "step": 206 }, { "epoch": 0.5161290322580645, "grad_norm": 11.9375, "learning_rate": 3.5428388301395325e-07, "logits/chosen": 0.9345250129699707, "logits/rejected": 0.8547608852386475, "logps/chosen": -1.4048100709915161, "logps/rejected": -2.2128334045410156, "loss": 0.5043, "rewards/accuracies": 0.78125, "rewards/chosen": -14.048102378845215, "rewards/margins": 8.080232620239258, "rewards/rejected": -22.128334045410156, "step": 207 }, { "epoch": 0.5186224092254947, "grad_norm": 12.375, "learning_rate": 3.514279926989105e-07, "logits/chosen": 0.9688948392868042, "logits/rejected": 0.7790014743804932, "logps/chosen": -2.1355183124542236, "logps/rejected": -3.5980281829833984, "loss": 0.5437, "rewards/accuracies": 0.84375, "rewards/chosen": -21.35518455505371, "rewards/margins": 14.625100135803223, "rewards/rejected": -35.98028564453125, "step": 208 }, { "epoch": 0.5211157861929251, "grad_norm": 8.5, "learning_rate": 3.485720073010896e-07, "logits/chosen": 0.9008550643920898, "logits/rejected": 0.8881810307502747, "logps/chosen": -1.938570261001587, "logps/rejected": -3.0095808506011963, "loss": 0.513, "rewards/accuracies": 0.78125, "rewards/chosen": -19.38570213317871, "rewards/margins": 10.710105895996094, "rewards/rejected": -30.095808029174805, "step": 209 }, { "epoch": 0.5236091631603553, "grad_norm": 24.5, "learning_rate": 3.457161169860469e-07, "logits/chosen": 0.9138238430023193, "logits/rejected": 0.6945370435714722, "logps/chosen": -1.7868579626083374, "logps/rejected": -3.283820390701294, "loss": 0.5155, "rewards/accuracies": 0.84375, "rewards/chosen": -17.868579864501953, "rewards/margins": 14.969620704650879, "rewards/rejected": -32.83820343017578, "step": 210 }, { "epoch": 0.5261025401277856, "grad_norm": 5.09375, "learning_rate": 3.428605119130082e-07, "logits/chosen": 0.8236789703369141, "logits/rejected": 0.8235811591148376, "logps/chosen": -1.940079689025879, "logps/rejected": -3.2608189582824707, "loss": 0.2783, "rewards/accuracies": 0.875, "rewards/chosen": -19.400798797607422, "rewards/margins": 13.207389831542969, "rewards/rejected": -32.60818862915039, "step": 211 }, { "epoch": 0.5285959170952158, "grad_norm": 45.75, "learning_rate": 3.4000538222220635e-07, "logits/chosen": 0.9403684139251709, "logits/rejected": 0.8005753755569458, "logps/chosen": -1.6408517360687256, "logps/rejected": -2.543820858001709, "loss": 0.4837, "rewards/accuracies": 0.84375, "rewards/chosen": -16.408517837524414, "rewards/margins": 9.029691696166992, "rewards/rejected": -25.438209533691406, "step": 212 }, { "epoch": 0.531089294062646, "grad_norm": 8.625, "learning_rate": 3.37150918022221e-07, "logits/chosen": 0.8799944519996643, "logits/rejected": 0.7955228090286255, "logps/chosen": -1.9793920516967773, "logps/rejected": -3.432222843170166, "loss": 0.216, "rewards/accuracies": 0.875, "rewards/chosen": -19.79391860961914, "rewards/margins": 14.528306007385254, "rewards/rejected": -34.32222366333008, "step": 213 }, { "epoch": 0.5335826710300764, "grad_norm": 7.96875, "learning_rate": 3.342973093773199e-07, "logits/chosen": 0.9032948017120361, "logits/rejected": 0.8324267268180847, "logps/chosen": -1.3809956312179565, "logps/rejected": -2.4675354957580566, "loss": 0.3747, "rewards/accuracies": 0.90625, "rewards/chosen": -13.809956550598145, "rewards/margins": 10.865400314331055, "rewards/rejected": -24.675355911254883, "step": 214 }, { "epoch": 0.5360760479975066, "grad_norm": 5.875, "learning_rate": 3.314447462948038e-07, "logits/chosen": 0.8150188326835632, "logits/rejected": 0.7412484884262085, "logps/chosen": -1.7949788570404053, "logps/rejected": -3.1454625129699707, "loss": 0.5245, "rewards/accuracies": 0.875, "rewards/chosen": -17.94978904724121, "rewards/margins": 13.504838943481445, "rewards/rejected": -31.454627990722656, "step": 215 }, { "epoch": 0.5385694249649369, "grad_norm": 6.46875, "learning_rate": 3.285934187123551e-07, "logits/chosen": 0.9771428108215332, "logits/rejected": 0.7578872442245483, "logps/chosen": -1.5606952905654907, "logps/rejected": -2.352637529373169, "loss": 0.9085, "rewards/accuracies": 0.71875, "rewards/chosen": -15.606952667236328, "rewards/margins": 7.9194231033325195, "rewards/rejected": -23.526376724243164, "step": 216 }, { "epoch": 0.5410628019323671, "grad_norm": 8.8125, "learning_rate": 3.2574351648539017e-07, "logits/chosen": 0.8879974484443665, "logits/rejected": 0.7966049909591675, "logps/chosen": -1.748363733291626, "logps/rejected": -2.8166420459747314, "loss": 0.7279, "rewards/accuracies": 0.78125, "rewards/chosen": -17.4836368560791, "rewards/margins": 10.682784080505371, "rewards/rejected": -28.16642189025879, "step": 217 }, { "epoch": 0.5435561788997975, "grad_norm": 12.875, "learning_rate": 3.228952293744181e-07, "logits/chosen": 0.9608930349349976, "logits/rejected": 0.7884482741355896, "logps/chosen": -1.9304460287094116, "logps/rejected": -3.123302459716797, "loss": 0.6031, "rewards/accuracies": 0.8125, "rewards/chosen": -19.304460525512695, "rewards/margins": 11.928570747375488, "rewards/rejected": -31.233030319213867, "step": 218 }, { "epoch": 0.5460495558672277, "grad_norm": 13.6875, "learning_rate": 3.200487470324062e-07, "logits/chosen": 0.9692325592041016, "logits/rejected": 0.8839060068130493, "logps/chosen": -1.7750276327133179, "logps/rejected": -3.3542592525482178, "loss": 0.4743, "rewards/accuracies": 0.8125, "rewards/chosen": -17.750276565551758, "rewards/margins": 15.792311668395996, "rewards/rejected": -33.54258728027344, "step": 219 }, { "epoch": 0.5485429328346579, "grad_norm": 15.6875, "learning_rate": 3.172042589921506e-07, "logits/chosen": 0.9265443086624146, "logits/rejected": 0.8019118309020996, "logps/chosen": -1.751523494720459, "logps/rejected": -2.899667501449585, "loss": 0.6666, "rewards/accuracies": 0.71875, "rewards/chosen": -17.515233993530273, "rewards/margins": 11.481440544128418, "rewards/rejected": -28.996675491333008, "step": 220 }, { "epoch": 0.5510363098020882, "grad_norm": 27.375, "learning_rate": 3.1436195465365767e-07, "logits/chosen": 0.8846260905265808, "logits/rejected": 0.8169682621955872, "logps/chosen": -1.4631338119506836, "logps/rejected": -2.289689064025879, "loss": 0.5698, "rewards/accuracies": 0.84375, "rewards/chosen": -14.631338119506836, "rewards/margins": 8.265554428100586, "rewards/rejected": -22.896892547607422, "step": 221 }, { "epoch": 0.5535296867695184, "grad_norm": 13.875, "learning_rate": 3.115220232715318e-07, "logits/chosen": 0.8759758472442627, "logits/rejected": 0.8523691296577454, "logps/chosen": -1.8312069177627563, "logps/rejected": -3.232063055038452, "loss": 0.645, "rewards/accuracies": 0.78125, "rewards/chosen": -18.312068939208984, "rewards/margins": 14.008562088012695, "rewards/rejected": -32.32063293457031, "step": 222 }, { "epoch": 0.5560230637369488, "grad_norm": 5.84375, "learning_rate": 3.086846539423744e-07, "logits/chosen": 0.8589959740638733, "logits/rejected": 0.7877765893936157, "logps/chosen": -1.3714457750320435, "logps/rejected": -2.5221285820007324, "loss": 0.584, "rewards/accuracies": 0.875, "rewards/chosen": -13.714457511901855, "rewards/margins": 11.506828308105469, "rewards/rejected": -25.221284866333008, "step": 223 }, { "epoch": 0.558516440704379, "grad_norm": 12.75, "learning_rate": 3.0585003559219284e-07, "logits/chosen": 0.7336137294769287, "logits/rejected": 0.8082336187362671, "logps/chosen": -2.2451255321502686, "logps/rejected": -4.2201972007751465, "loss": 0.6203, "rewards/accuracies": 0.8125, "rewards/chosen": -22.451255798339844, "rewards/margins": 19.750713348388672, "rewards/rejected": -42.20196533203125, "step": 224 }, { "epoch": 0.5610098176718092, "grad_norm": 50.0, "learning_rate": 3.030183569638207e-07, "logits/chosen": 0.7706287503242493, "logits/rejected": 0.7501264810562134, "logps/chosen": -1.5991909503936768, "logps/rejected": -2.917886972427368, "loss": 0.2763, "rewards/accuracies": 0.875, "rewards/chosen": -15.99190902709961, "rewards/margins": 13.186960220336914, "rewards/rejected": -29.178869247436523, "step": 225 }, { "epoch": 0.5635031946392395, "grad_norm": 1.609375, "learning_rate": 3.001898066043502e-07, "logits/chosen": 0.9699455499649048, "logits/rejected": 0.8485396504402161, "logps/chosen": -2.2904715538024902, "logps/rejected": -4.465200424194336, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -22.90471649169922, "rewards/margins": 21.74728775024414, "rewards/rejected": -44.652008056640625, "step": 226 }, { "epoch": 0.5659965716066698, "grad_norm": 5.8125, "learning_rate": 2.973645728525786e-07, "logits/chosen": 0.8099995851516724, "logits/rejected": 0.6409150958061218, "logps/chosen": -1.6182489395141602, "logps/rejected": -2.964346408843994, "loss": 0.3709, "rewards/accuracies": 0.875, "rewards/chosen": -16.1824893951416, "rewards/margins": 13.460972785949707, "rewards/rejected": -29.643461227416992, "step": 227 }, { "epoch": 0.5684899485741001, "grad_norm": 33.5, "learning_rate": 2.9454284382646654e-07, "logits/chosen": 0.8826979398727417, "logits/rejected": 0.7478980422019958, "logps/chosen": -1.6775342226028442, "logps/rejected": -3.144256830215454, "loss": 0.7344, "rewards/accuracies": 0.84375, "rewards/chosen": -16.77534294128418, "rewards/margins": 14.667223930358887, "rewards/rejected": -31.44256591796875, "step": 228 }, { "epoch": 0.5709833255415303, "grad_norm": 8.0625, "learning_rate": 2.917248074106132e-07, "logits/chosen": 0.7391412854194641, "logits/rejected": 0.7339631915092468, "logps/chosen": -1.612046241760254, "logps/rejected": -2.4834823608398438, "loss": 0.46, "rewards/accuracies": 0.84375, "rewards/chosen": -16.120464324951172, "rewards/margins": 8.714361190795898, "rewards/rejected": -24.834821701049805, "step": 229 }, { "epoch": 0.5734767025089605, "grad_norm": 10.5625, "learning_rate": 2.889106512437452e-07, "logits/chosen": 0.7340772151947021, "logits/rejected": 0.8868482708930969, "logps/chosen": -1.7300639152526855, "logps/rejected": -2.9352312088012695, "loss": 0.454, "rewards/accuracies": 0.8125, "rewards/chosen": -17.30063819885254, "rewards/margins": 12.051673889160156, "rewards/rejected": -29.352313995361328, "step": 230 }, { "epoch": 0.5759700794763908, "grad_norm": 9.8125, "learning_rate": 2.8610056270622344e-07, "logits/chosen": 0.9421735405921936, "logits/rejected": 0.7073564529418945, "logps/chosen": -1.7943646907806396, "logps/rejected": -3.0728399753570557, "loss": 0.4413, "rewards/accuracies": 0.8125, "rewards/chosen": -17.943645477294922, "rewards/margins": 12.784753799438477, "rewards/rejected": -30.7283992767334, "step": 231 }, { "epoch": 0.5784634564438211, "grad_norm": 14.4375, "learning_rate": 2.8329472890756593e-07, "logits/chosen": 0.8662580251693726, "logits/rejected": 0.844997227191925, "logps/chosen": -1.5802185535430908, "logps/rejected": -2.640042304992676, "loss": 0.8093, "rewards/accuracies": 0.8125, "rewards/chosen": -15.802186965942383, "rewards/margins": 10.598236083984375, "rewards/rejected": -26.40042495727539, "step": 232 }, { "epoch": 0.5809568334112514, "grad_norm": 37.25, "learning_rate": 2.8049333667398917e-07, "logits/chosen": 0.9215195775032043, "logits/rejected": 0.8155514597892761, "logps/chosen": -2.146245241165161, "logps/rejected": -3.7037928104400635, "loss": 0.6597, "rewards/accuracies": 0.875, "rewards/chosen": -21.462451934814453, "rewards/margins": 15.575474739074707, "rewards/rejected": -37.037925720214844, "step": 233 }, { "epoch": 0.5834502103786816, "grad_norm": 10.375, "learning_rate": 2.776965725359684e-07, "logits/chosen": 0.8086358308792114, "logits/rejected": 0.7704899907112122, "logps/chosen": -1.5882647037506104, "logps/rejected": -2.976109743118286, "loss": 0.6318, "rewards/accuracies": 0.84375, "rewards/chosen": -15.882646560668945, "rewards/margins": 13.878451347351074, "rewards/rejected": -29.761098861694336, "step": 234 }, { "epoch": 0.5859435873461118, "grad_norm": 9.9375, "learning_rate": 2.7490462271581774e-07, "logits/chosen": 0.9086362719535828, "logits/rejected": 0.8243853449821472, "logps/chosen": -1.8874664306640625, "logps/rejected": -3.1539669036865234, "loss": 0.7813, "rewards/accuracies": 0.8125, "rewards/chosen": -18.874664306640625, "rewards/margins": 12.665003776550293, "rewards/rejected": -31.539669036865234, "step": 235 }, { "epoch": 0.5884369643135422, "grad_norm": 9.375, "learning_rate": 2.7211767311529e-07, "logits/chosen": 0.8527828454971313, "logits/rejected": 0.8761582374572754, "logps/chosen": -1.5832545757293701, "logps/rejected": -2.610931396484375, "loss": 0.7349, "rewards/accuracies": 0.75, "rewards/chosen": -15.832547187805176, "rewards/margins": 10.276766777038574, "rewards/rejected": -26.10931396484375, "step": 236 }, { "epoch": 0.5909303412809724, "grad_norm": 8.3125, "learning_rate": 2.6933590930319903e-07, "logits/chosen": 0.7043416500091553, "logits/rejected": 0.7324919104576111, "logps/chosen": -1.7684717178344727, "logps/rejected": -3.180450916290283, "loss": 0.4904, "rewards/accuracies": 0.75, "rewards/chosen": -17.68471908569336, "rewards/margins": 14.11978816986084, "rewards/rejected": -31.804506301879883, "step": 237 }, { "epoch": 0.5934237182484027, "grad_norm": 7.78125, "learning_rate": 2.665595165030632e-07, "logits/chosen": 0.7452791929244995, "logits/rejected": 0.7571016550064087, "logps/chosen": -1.6962933540344238, "logps/rejected": -4.225780963897705, "loss": 0.0719, "rewards/accuracies": 0.96875, "rewards/chosen": -16.962934494018555, "rewards/margins": 25.29487419128418, "rewards/rejected": -42.257808685302734, "step": 238 }, { "epoch": 0.5959170952158329, "grad_norm": 11.375, "learning_rate": 2.637886795807726e-07, "logits/chosen": 0.81926429271698, "logits/rejected": 0.8182339072227478, "logps/chosen": -1.742372751235962, "logps/rejected": -3.2170917987823486, "loss": 0.3052, "rewards/accuracies": 0.875, "rewards/chosen": -17.42372703552246, "rewards/margins": 14.747193336486816, "rewards/rejected": -32.17091751098633, "step": 239 }, { "epoch": 0.5984104721832632, "grad_norm": 7.34375, "learning_rate": 2.6102358303227965e-07, "logits/chosen": 0.8492619395256042, "logits/rejected": 0.8256470561027527, "logps/chosen": -1.6656391620635986, "logps/rejected": -3.1693525314331055, "loss": 0.713, "rewards/accuracies": 0.78125, "rewards/chosen": -16.656391143798828, "rewards/margins": 15.03713607788086, "rewards/rejected": -31.693523406982422, "step": 240 }, { "epoch": 0.6009038491506935, "grad_norm": 12.375, "learning_rate": 2.5826441097131433e-07, "logits/chosen": 0.7694429755210876, "logits/rejected": 0.7062366008758545, "logps/chosen": -1.8840895891189575, "logps/rejected": -3.3917837142944336, "loss": 0.4917, "rewards/accuracies": 0.90625, "rewards/chosen": -18.84089469909668, "rewards/margins": 15.07693862915039, "rewards/rejected": -33.9178352355957, "step": 241 }, { "epoch": 0.6033972261181237, "grad_norm": 3.140625, "learning_rate": 2.555113471171251e-07, "logits/chosen": 0.7511383295059204, "logits/rejected": 0.8419240713119507, "logps/chosen": -1.99148428440094, "logps/rejected": -3.644866943359375, "loss": 0.1984, "rewards/accuracies": 0.9375, "rewards/chosen": -19.914844512939453, "rewards/margins": 16.533824920654297, "rewards/rejected": -36.44866943359375, "step": 242 }, { "epoch": 0.605890603085554, "grad_norm": 34.0, "learning_rate": 2.527645747822462e-07, "logits/chosen": 0.7965211272239685, "logits/rejected": 0.7004488706588745, "logps/chosen": -1.90436851978302, "logps/rejected": -3.2234106063842773, "loss": 0.3643, "rewards/accuracies": 0.90625, "rewards/chosen": -19.043685913085938, "rewards/margins": 13.19041919708252, "rewards/rejected": -32.23410415649414, "step": 243 }, { "epoch": 0.6083839800529842, "grad_norm": 9.125, "learning_rate": 2.5002427686029125e-07, "logits/chosen": 0.9241939783096313, "logits/rejected": 0.8476071953773499, "logps/chosen": -1.663865566253662, "logps/rejected": -2.624908685684204, "loss": 0.5361, "rewards/accuracies": 0.75, "rewards/chosen": -16.638656616210938, "rewards/margins": 9.610431671142578, "rewards/rejected": -26.249088287353516, "step": 244 }, { "epoch": 0.6108773570204146, "grad_norm": 9.9375, "learning_rate": 2.472906358137759e-07, "logits/chosen": 0.7417331337928772, "logits/rejected": 0.67648845911026, "logps/chosen": -1.45391845703125, "logps/rejected": -2.7644288539886475, "loss": 0.451, "rewards/accuracies": 0.84375, "rewards/chosen": -14.539186477661133, "rewards/margins": 13.105106353759766, "rewards/rejected": -27.644290924072266, "step": 245 }, { "epoch": 0.6133707339878448, "grad_norm": 13.9375, "learning_rate": 2.445638336619681e-07, "logits/chosen": 0.8194867968559265, "logits/rejected": 0.7898424863815308, "logps/chosen": -1.7679089307785034, "logps/rejected": -3.1125354766845703, "loss": 0.5163, "rewards/accuracies": 0.8125, "rewards/chosen": -17.679088592529297, "rewards/margins": 13.446268081665039, "rewards/rejected": -31.12535858154297, "step": 246 }, { "epoch": 0.615864110955275, "grad_norm": 9.8125, "learning_rate": 2.418440519687684e-07, "logits/chosen": 0.9577868580818176, "logits/rejected": 0.7647145986557007, "logps/chosen": -1.611385703086853, "logps/rejected": -2.760087728500366, "loss": 0.6587, "rewards/accuracies": 0.71875, "rewards/chosen": -16.11385726928711, "rewards/margins": 11.487018585205078, "rewards/rejected": -27.600875854492188, "step": 247 }, { "epoch": 0.6183574879227053, "grad_norm": 60.75, "learning_rate": 2.391314718306212e-07, "logits/chosen": 0.8142352104187012, "logits/rejected": 0.7828744053840637, "logps/chosen": -1.1173535585403442, "logps/rejected": -1.752410650253296, "loss": 0.6811, "rewards/accuracies": 0.8125, "rewards/chosen": -11.173534393310547, "rewards/margins": 6.350571632385254, "rewards/rejected": -17.52410888671875, "step": 248 }, { "epoch": 0.6208508648901355, "grad_norm": 12.375, "learning_rate": 2.3642627386445537e-07, "logits/chosen": 0.8487910628318787, "logits/rejected": 0.8330541849136353, "logps/chosen": -1.5378450155258179, "logps/rejected": -2.359829902648926, "loss": 0.8157, "rewards/accuracies": 0.75, "rewards/chosen": -15.378450393676758, "rewards/margins": 8.219846725463867, "rewards/rejected": -23.598297119140625, "step": 249 }, { "epoch": 0.6233442418575659, "grad_norm": 6.25, "learning_rate": 2.3372863819565868e-07, "logits/chosen": 0.8798298239707947, "logits/rejected": 0.7591216564178467, "logps/chosen": -1.6283077001571655, "logps/rejected": -3.044259548187256, "loss": 0.4055, "rewards/accuracies": 0.8125, "rewards/chosen": -16.2830753326416, "rewards/margins": 14.159520149230957, "rewards/rejected": -30.442594528198242, "step": 250 }, { "epoch": 0.6258376188249961, "grad_norm": 6.90625, "learning_rate": 2.310387444460842e-07, "logits/chosen": 0.8435265421867371, "logits/rejected": 0.6582808494567871, "logps/chosen": -1.9542193412780762, "logps/rejected": -3.527535915374756, "loss": 0.2511, "rewards/accuracies": 0.875, "rewards/chosen": -19.542194366455078, "rewards/margins": 15.733165740966797, "rewards/rejected": -35.275360107421875, "step": 251 }, { "epoch": 0.6283309957924264, "grad_norm": 12.5625, "learning_rate": 2.2835677172208942e-07, "logits/chosen": 0.9236465692520142, "logits/rejected": 0.7837573885917664, "logps/chosen": -1.5361783504486084, "logps/rejected": -2.8093583583831787, "loss": 0.4637, "rewards/accuracies": 0.84375, "rewards/chosen": -15.361783981323242, "rewards/margins": 12.731797218322754, "rewards/rejected": -28.093584060668945, "step": 252 }, { "epoch": 0.6308243727598566, "grad_norm": 7.25, "learning_rate": 2.2568289860261148e-07, "logits/chosen": 0.8141547441482544, "logits/rejected": 0.734942615032196, "logps/chosen": -1.593569040298462, "logps/rejected": -3.0460009574890137, "loss": 0.4794, "rewards/accuracies": 0.8125, "rewards/chosen": -15.935691833496094, "rewards/margins": 14.52431869506836, "rewards/rejected": -30.46000862121582, "step": 253 }, { "epoch": 0.6333177497272869, "grad_norm": 19.875, "learning_rate": 2.2301730312727568e-07, "logits/chosen": 0.8214707374572754, "logits/rejected": 0.7384806871414185, "logps/chosen": -1.9334094524383545, "logps/rejected": -3.117767095565796, "loss": 0.5317, "rewards/accuracies": 0.84375, "rewards/chosen": -19.334095001220703, "rewards/margins": 11.843574523925781, "rewards/rejected": -31.177671432495117, "step": 254 }, { "epoch": 0.6358111266947172, "grad_norm": 4.375, "learning_rate": 2.203601627845411e-07, "logits/chosen": 0.9514889717102051, "logits/rejected": 0.8385657072067261, "logps/chosen": -2.093611001968384, "logps/rejected": -4.076337814331055, "loss": 0.1654, "rewards/accuracies": 0.90625, "rewards/chosen": -20.936111450195312, "rewards/margins": 19.8272705078125, "rewards/rejected": -40.76338195800781, "step": 255 }, { "epoch": 0.6383045036621474, "grad_norm": 32.5, "learning_rate": 2.1771165449988274e-07, "logits/chosen": 1.076192855834961, "logits/rejected": 0.8149666786193848, "logps/chosen": -1.584862232208252, "logps/rejected": -2.5359246730804443, "loss": 0.4198, "rewards/accuracies": 0.84375, "rewards/chosen": -15.84862232208252, "rewards/margins": 9.510624885559082, "rewards/rejected": -25.3592472076416, "step": 256 }, { "epoch": 0.6407978806295777, "grad_norm": 10.9375, "learning_rate": 2.1507195462401042e-07, "logits/chosen": 0.8264479041099548, "logits/rejected": 0.8565191626548767, "logps/chosen": -1.590366244316101, "logps/rejected": -2.96756911277771, "loss": 0.7281, "rewards/accuracies": 0.65625, "rewards/chosen": -15.903663635253906, "rewards/margins": 13.772027969360352, "rewards/rejected": -29.675691604614258, "step": 257 }, { "epoch": 0.6432912575970079, "grad_norm": 18.75, "learning_rate": 2.1244123892112674e-07, "logits/chosen": 0.8875083923339844, "logits/rejected": 0.8365639448165894, "logps/chosen": -1.9993164539337158, "logps/rejected": -4.3548431396484375, "loss": 0.4588, "rewards/accuracies": 0.84375, "rewards/chosen": -19.993162155151367, "rewards/margins": 23.555269241333008, "rewards/rejected": -43.548431396484375, "step": 258 }, { "epoch": 0.6457846345644382, "grad_norm": 5.09375, "learning_rate": 2.0981968255722427e-07, "logits/chosen": 0.9401863217353821, "logits/rejected": 0.8267409801483154, "logps/chosen": -1.508123755455017, "logps/rejected": -2.8934311866760254, "loss": 0.2964, "rewards/accuracies": 0.84375, "rewards/chosen": -15.08123779296875, "rewards/margins": 13.853076934814453, "rewards/rejected": -28.93431282043457, "step": 259 }, { "epoch": 0.6482780115318685, "grad_norm": 10.125, "learning_rate": 2.072074600884213e-07, "logits/chosen": 0.7929245233535767, "logits/rejected": 0.7758727669715881, "logps/chosen": -1.806505560874939, "logps/rejected": -3.316180944442749, "loss": 0.6586, "rewards/accuracies": 0.78125, "rewards/chosen": -18.0650577545166, "rewards/margins": 15.09675407409668, "rewards/rejected": -33.16181182861328, "step": 260 }, { "epoch": 0.6507713884992987, "grad_norm": 6.96875, "learning_rate": 2.0460474544933978e-07, "logits/chosen": 0.7232526540756226, "logits/rejected": 0.7585304975509644, "logps/chosen": -1.4770225286483765, "logps/rejected": -2.5309412479400635, "loss": 0.423, "rewards/accuracies": 0.875, "rewards/chosen": -14.770224571228027, "rewards/margins": 10.539185523986816, "rewards/rejected": -25.309412002563477, "step": 261 }, { "epoch": 0.653264765466729, "grad_norm": 5.1875, "learning_rate": 2.020117119415233e-07, "logits/chosen": 0.7610968351364136, "logits/rejected": 0.6675768494606018, "logps/chosen": -1.518571376800537, "logps/rejected": -2.640080690383911, "loss": 0.3495, "rewards/accuracies": 0.875, "rewards/chosen": -15.185713768005371, "rewards/margins": 11.215094566345215, "rewards/rejected": -26.400808334350586, "step": 262 }, { "epoch": 0.6557581424341593, "grad_norm": 20.125, "learning_rate": 1.9942853222189841e-07, "logits/chosen": 0.8614793419837952, "logits/rejected": 0.7701671719551086, "logps/chosen": -1.5696934461593628, "logps/rejected": -2.8778579235076904, "loss": 0.6096, "rewards/accuracies": 0.875, "rewards/chosen": -15.69693374633789, "rewards/margins": 13.081643104553223, "rewards/rejected": -28.778575897216797, "step": 263 }, { "epoch": 0.6582515194015895, "grad_norm": 12.375, "learning_rate": 1.968553782912778e-07, "logits/chosen": 0.8768056631088257, "logits/rejected": 0.8102119565010071, "logps/chosen": -1.6998172998428345, "logps/rejected": -2.9253602027893066, "loss": 0.625, "rewards/accuracies": 0.75, "rewards/chosen": -16.998172760009766, "rewards/margins": 12.255431175231934, "rewards/rejected": -29.253602981567383, "step": 264 }, { "epoch": 0.6607448963690198, "grad_norm": 29.0, "learning_rate": 1.942924214829077e-07, "logits/chosen": 0.9345517158508301, "logits/rejected": 0.7886137962341309, "logps/chosen": -1.9977731704711914, "logps/rejected": -3.9683516025543213, "loss": 0.5431, "rewards/accuracies": 0.8125, "rewards/chosen": -19.97772979736328, "rewards/margins": 19.705781936645508, "rewards/rejected": -39.68351364135742, "step": 265 }, { "epoch": 0.66323827333645, "grad_norm": 4.125, "learning_rate": 1.9173983245106005e-07, "logits/chosen": 0.9463739395141602, "logits/rejected": 0.8353683948516846, "logps/chosen": -1.8554211854934692, "logps/rejected": -3.5142271518707275, "loss": 0.2197, "rewards/accuracies": 0.90625, "rewards/chosen": -18.554210662841797, "rewards/margins": 16.58806037902832, "rewards/rejected": -35.14227294921875, "step": 266 }, { "epoch": 0.6657316503038803, "grad_norm": 31.25, "learning_rate": 1.891977811596689e-07, "logits/chosen": 1.0108263492584229, "logits/rejected": 0.723067581653595, "logps/chosen": -1.615850567817688, "logps/rejected": -2.9190895557403564, "loss": 0.7786, "rewards/accuracies": 0.78125, "rewards/chosen": -16.158506393432617, "rewards/margins": 13.032387733459473, "rewards/rejected": -29.19089698791504, "step": 267 }, { "epoch": 0.6682250272713106, "grad_norm": 3.640625, "learning_rate": 1.8666643687101418e-07, "logits/chosen": 0.922001302242279, "logits/rejected": 0.8183608651161194, "logps/chosen": -1.845801830291748, "logps/rejected": -3.9838411808013916, "loss": 0.2435, "rewards/accuracies": 0.90625, "rewards/chosen": -18.458017349243164, "rewards/margins": 21.380395889282227, "rewards/rejected": -39.83841323852539, "step": 268 }, { "epoch": 0.6707184042387409, "grad_norm": 9.625, "learning_rate": 1.8414596813445047e-07, "logits/chosen": 0.9229664206504822, "logits/rejected": 0.8024593591690063, "logps/chosen": -1.5461751222610474, "logps/rejected": -2.713073968887329, "loss": 0.4835, "rewards/accuracies": 0.78125, "rewards/chosen": -15.461751937866211, "rewards/margins": 11.668989181518555, "rewards/rejected": -27.130741119384766, "step": 269 }, { "epoch": 0.6732117812061711, "grad_norm": 4.78125, "learning_rate": 1.8163654277518476e-07, "logits/chosen": 0.8847929835319519, "logits/rejected": 0.7221932411193848, "logps/chosen": -1.56783127784729, "logps/rejected": -2.7949106693267822, "loss": 0.357, "rewards/accuracies": 0.875, "rewards/chosen": -15.678312301635742, "rewards/margins": 12.270795822143555, "rewards/rejected": -27.949108123779297, "step": 270 }, { "epoch": 0.6757051581736013, "grad_norm": 5.1875, "learning_rate": 1.7913832788310162e-07, "logits/chosen": 0.9237401485443115, "logits/rejected": 0.8515968322753906, "logps/chosen": -1.6207122802734375, "logps/rejected": -2.975242853164673, "loss": 0.3603, "rewards/accuracies": 0.875, "rewards/chosen": -16.207122802734375, "rewards/margins": 13.54530143737793, "rewards/rejected": -29.752422332763672, "step": 271 }, { "epoch": 0.6781985351410317, "grad_norm": 24.375, "learning_rate": 1.7665148980163747e-07, "logits/chosen": 0.9174185991287231, "logits/rejected": 0.8517237901687622, "logps/chosen": -1.9268598556518555, "logps/rejected": -3.653189182281494, "loss": 0.5412, "rewards/accuracies": 0.78125, "rewards/chosen": -19.268598556518555, "rewards/margins": 17.263296127319336, "rewards/rejected": -36.531890869140625, "step": 272 }, { "epoch": 0.6806919121084619, "grad_norm": 20.125, "learning_rate": 1.741761941167051e-07, "logits/chosen": 0.8469513654708862, "logits/rejected": 0.7570927739143372, "logps/chosen": -1.7269822359085083, "logps/rejected": -3.0970540046691895, "loss": 0.4379, "rewards/accuracies": 0.84375, "rewards/chosen": -17.26982307434082, "rewards/margins": 13.700716018676758, "rewards/rejected": -30.970539093017578, "step": 273 }, { "epoch": 0.6831852890758922, "grad_norm": 7.65625, "learning_rate": 1.7171260564566735e-07, "logits/chosen": 0.853800892829895, "logits/rejected": 0.6823726892471313, "logps/chosen": -1.6823272705078125, "logps/rejected": -3.1744813919067383, "loss": 0.4328, "rewards/accuracies": 0.84375, "rewards/chosen": -16.823274612426758, "rewards/margins": 14.921540260314941, "rewards/rejected": -31.744813919067383, "step": 274 }, { "epoch": 0.6856786660433224, "grad_norm": 7.5625, "learning_rate": 1.6926088842636336e-07, "logits/chosen": 0.8564770817756653, "logits/rejected": 0.7224562168121338, "logps/chosen": -1.7104108333587646, "logps/rejected": -3.030578851699829, "loss": 0.34, "rewards/accuracies": 0.8125, "rewards/chosen": -17.104108810424805, "rewards/margins": 13.201680183410645, "rewards/rejected": -30.3057918548584, "step": 275 }, { "epoch": 0.6881720430107527, "grad_norm": 5.375, "learning_rate": 1.6682120570618583e-07, "logits/chosen": 0.9256489276885986, "logits/rejected": 0.8403459787368774, "logps/chosen": -1.7236558198928833, "logps/rejected": -3.5449249744415283, "loss": 0.2784, "rewards/accuracies": 0.90625, "rewards/chosen": -17.23655891418457, "rewards/margins": 18.212690353393555, "rewards/rejected": -35.449249267578125, "step": 276 }, { "epoch": 0.690665419978183, "grad_norm": 6.75, "learning_rate": 1.6439371993121142e-07, "logits/chosen": 1.0069345235824585, "logits/rejected": 0.8647799491882324, "logps/chosen": -1.7778537273406982, "logps/rejected": -3.3906145095825195, "loss": 0.4693, "rewards/accuracies": 0.84375, "rewards/chosen": -17.77853775024414, "rewards/margins": 16.127605438232422, "rewards/rejected": -33.90614318847656, "step": 277 }, { "epoch": 0.6931587969456132, "grad_norm": 7.15625, "learning_rate": 1.61978592735384e-07, "logits/chosen": 0.7570043802261353, "logits/rejected": 0.7392297387123108, "logps/chosen": -1.772491455078125, "logps/rejected": -3.0083491802215576, "loss": 0.3305, "rewards/accuracies": 0.78125, "rewards/chosen": -17.72491455078125, "rewards/margins": 12.358576774597168, "rewards/rejected": -30.0834903717041, "step": 278 }, { "epoch": 0.6956521739130435, "grad_norm": 6.6875, "learning_rate": 1.595759849297528e-07, "logits/chosen": 0.9452332258224487, "logits/rejected": 0.8405147790908813, "logps/chosen": -1.5206505060195923, "logps/rejected": -2.9708354473114014, "loss": 0.7058, "rewards/accuracies": 0.6875, "rewards/chosen": -15.206504821777344, "rewards/margins": 14.501848220825195, "rewards/rejected": -29.708354949951172, "step": 279 }, { "epoch": 0.6981455508804737, "grad_norm": 5.78125, "learning_rate": 1.5718605649176415e-07, "logits/chosen": 0.9056351780891418, "logits/rejected": 0.759840190410614, "logps/chosen": -1.3903148174285889, "logps/rejected": -2.4290876388549805, "loss": 0.4354, "rewards/accuracies": 0.84375, "rewards/chosen": -13.90314769744873, "rewards/margins": 10.387725830078125, "rewards/rejected": -24.290874481201172, "step": 280 }, { "epoch": 0.700638927847904, "grad_norm": 6.0625, "learning_rate": 1.5480896655460975e-07, "logits/chosen": 0.8469112515449524, "logits/rejected": 0.7188205718994141, "logps/chosen": -1.4428998231887817, "logps/rejected": -3.476562261581421, "loss": 0.4048, "rewards/accuracies": 0.8125, "rewards/chosen": -14.428997993469238, "rewards/margins": 20.336626052856445, "rewards/rejected": -34.765625, "step": 281 }, { "epoch": 0.7031323048153343, "grad_norm": 6.78125, "learning_rate": 1.5244487339663086e-07, "logits/chosen": 0.9786227941513062, "logits/rejected": 0.9008299112319946, "logps/chosen": -2.115980386734009, "logps/rejected": -3.8103702068328857, "loss": 0.3597, "rewards/accuracies": 0.90625, "rewards/chosen": -21.159805297851562, "rewards/margins": 16.943897247314453, "rewards/rejected": -38.103702545166016, "step": 282 }, { "epoch": 0.7056256817827645, "grad_norm": 9.3125, "learning_rate": 1.5009393443077906e-07, "logits/chosen": 0.9762454032897949, "logits/rejected": 0.8306148648262024, "logps/chosen": -1.981116771697998, "logps/rejected": -3.2973973751068115, "loss": 0.4867, "rewards/accuracies": 0.8125, "rewards/chosen": -19.811168670654297, "rewards/margins": 13.16280460357666, "rewards/rejected": -32.973976135253906, "step": 283 }, { "epoch": 0.7081190587501948, "grad_norm": 22.625, "learning_rate": 1.477563061941355e-07, "logits/chosen": 1.017063856124878, "logits/rejected": 0.7016565799713135, "logps/chosen": -1.3156154155731201, "logps/rejected": -2.303849458694458, "loss": 0.6619, "rewards/accuracies": 0.8125, "rewards/chosen": -13.15615463256836, "rewards/margins": 9.882339477539062, "rewards/rejected": -23.038494110107422, "step": 284 }, { "epoch": 0.7106124357176251, "grad_norm": 12.9375, "learning_rate": 1.4543214433748714e-07, "logits/chosen": 1.039493203163147, "logits/rejected": 0.8438839912414551, "logps/chosen": -1.7385656833648682, "logps/rejected": -3.1600584983825684, "loss": 0.4472, "rewards/accuracies": 0.84375, "rewards/chosen": -17.385656356811523, "rewards/margins": 14.21492862701416, "rewards/rejected": -31.6005859375, "step": 285 }, { "epoch": 0.7131058126850554, "grad_norm": 6.125, "learning_rate": 1.4312160361496325e-07, "logits/chosen": 0.880534291267395, "logits/rejected": 0.8419840335845947, "logps/chosen": -1.7119375467300415, "logps/rejected": -3.064938545227051, "loss": 0.5029, "rewards/accuracies": 0.84375, "rewards/chosen": -17.11937713623047, "rewards/margins": 13.530012130737305, "rewards/rejected": -30.64938735961914, "step": 286 }, { "epoch": 0.7155991896524856, "grad_norm": 8.8125, "learning_rate": 1.4082483787373093e-07, "logits/chosen": 0.8826863765716553, "logits/rejected": 0.8228853940963745, "logps/chosen": -1.5570282936096191, "logps/rejected": -2.6813974380493164, "loss": 0.8264, "rewards/accuracies": 0.75, "rewards/chosen": -15.570282936096191, "rewards/margins": 11.243692398071289, "rewards/rejected": -26.813976287841797, "step": 287 }, { "epoch": 0.7180925666199158, "grad_norm": 7.6875, "learning_rate": 1.3854200004375123e-07, "logits/chosen": 0.752357542514801, "logits/rejected": 0.7416955828666687, "logps/chosen": -1.8245292901992798, "logps/rejected": -3.410393714904785, "loss": 0.2918, "rewards/accuracies": 0.90625, "rewards/chosen": -18.24529266357422, "rewards/margins": 15.858641624450684, "rewards/rejected": -34.10393142700195, "step": 288 }, { "epoch": 0.7205859435873461, "grad_norm": 6.375, "learning_rate": 1.3627324212759662e-07, "logits/chosen": 0.9414355754852295, "logits/rejected": 0.7949234843254089, "logps/chosen": -1.5395060777664185, "logps/rejected": -2.7976861000061035, "loss": 0.5103, "rewards/accuracies": 0.875, "rewards/chosen": -15.395059585571289, "rewards/margins": 12.581799507141113, "rewards/rejected": -27.97686195373535, "step": 289 }, { "epoch": 0.7230793205547764, "grad_norm": 13.375, "learning_rate": 1.3401871519032942e-07, "logits/chosen": 0.7719554305076599, "logits/rejected": 0.8289276957511902, "logps/chosen": -1.5537012815475464, "logps/rejected": -2.931002140045166, "loss": 0.4564, "rewards/accuracies": 0.78125, "rewards/chosen": -15.537013053894043, "rewards/margins": 13.7730073928833, "rewards/rejected": -29.310020446777344, "step": 290 }, { "epoch": 0.7255726975222067, "grad_norm": 12.0, "learning_rate": 1.317785693494433e-07, "logits/chosen": 0.906543493270874, "logits/rejected": 0.8372653126716614, "logps/chosen": -1.877508282661438, "logps/rejected": -3.658639669418335, "loss": 0.5423, "rewards/accuracies": 0.78125, "rewards/chosen": -18.775081634521484, "rewards/margins": 17.811315536499023, "rewards/rejected": -36.58639907836914, "step": 291 }, { "epoch": 0.7280660744896369, "grad_norm": 20.25, "learning_rate": 1.2955295376486793e-07, "logits/chosen": 0.9387526512145996, "logits/rejected": 0.8902648687362671, "logps/chosen": -1.6775869131088257, "logps/rejected": -3.076120138168335, "loss": 0.8689, "rewards/accuracies": 0.75, "rewards/chosen": -16.775869369506836, "rewards/margins": 13.985333442687988, "rewards/rejected": -30.76120376586914, "step": 292 }, { "epoch": 0.7305594514570671, "grad_norm": 14.0625, "learning_rate": 1.273420166290371e-07, "logits/chosen": 0.771159827709198, "logits/rejected": 0.7604851126670837, "logps/chosen": -1.4995477199554443, "logps/rejected": -2.7952613830566406, "loss": 0.5616, "rewards/accuracies": 0.78125, "rewards/chosen": -14.995477676391602, "rewards/margins": 12.957136154174805, "rewards/rejected": -27.952613830566406, "step": 293 }, { "epoch": 0.7330528284244975, "grad_norm": 5.625, "learning_rate": 1.2514590515702093e-07, "logits/chosen": 0.9259358048439026, "logits/rejected": 0.8557572364807129, "logps/chosen": -1.718395471572876, "logps/rejected": -3.260573387145996, "loss": 0.4787, "rewards/accuracies": 0.84375, "rewards/chosen": -17.183956146240234, "rewards/margins": 15.421775817871094, "rewards/rejected": -32.60573196411133, "step": 294 }, { "epoch": 0.7355462053919277, "grad_norm": 3.984375, "learning_rate": 1.2296476557672452e-07, "logits/chosen": 0.9226200580596924, "logits/rejected": 0.7464591264724731, "logps/chosen": -1.7733253240585327, "logps/rejected": -3.0839881896972656, "loss": 0.4696, "rewards/accuracies": 0.875, "rewards/chosen": -17.733253479003906, "rewards/margins": 13.10662841796875, "rewards/rejected": -30.839881896972656, "step": 295 }, { "epoch": 0.738039582359358, "grad_norm": 12.8125, "learning_rate": 1.2079874311915026e-07, "logits/chosen": 0.9862551689147949, "logits/rejected": 0.8426701426506042, "logps/chosen": -1.5399045944213867, "logps/rejected": -2.9282631874084473, "loss": 0.5123, "rewards/accuracies": 0.78125, "rewards/chosen": -15.399044036865234, "rewards/margins": 13.883587837219238, "rewards/rejected": -29.282634735107422, "step": 296 }, { "epoch": 0.7405329593267882, "grad_norm": 11.3125, "learning_rate": 1.1864798200872824e-07, "logits/chosen": 0.9428563714027405, "logits/rejected": 0.7972367405891418, "logps/chosen": -1.6001325845718384, "logps/rejected": -3.5637686252593994, "loss": 0.2544, "rewards/accuracies": 0.90625, "rewards/chosen": -16.001325607299805, "rewards/margins": 19.63636016845703, "rewards/rejected": -35.6376838684082, "step": 297 }, { "epoch": 0.7430263362942184, "grad_norm": 27.75, "learning_rate": 1.1651262545371318e-07, "logits/chosen": 0.8185573816299438, "logits/rejected": 0.8264700174331665, "logps/chosen": -1.9273895025253296, "logps/rejected": -3.5442724227905273, "loss": 0.3429, "rewards/accuracies": 0.90625, "rewards/chosen": -19.273895263671875, "rewards/margins": 16.16883087158203, "rewards/rejected": -35.442726135253906, "step": 298 }, { "epoch": 0.7455197132616488, "grad_norm": 10.25, "learning_rate": 1.1439281563664836e-07, "logits/chosen": 0.8733742833137512, "logits/rejected": 0.8228683471679688, "logps/chosen": -2.0226247310638428, "logps/rejected": -3.6978607177734375, "loss": 0.2416, "rewards/accuracies": 0.9375, "rewards/chosen": -20.226245880126953, "rewards/margins": 16.75235939025879, "rewards/rejected": -36.978607177734375, "step": 299 }, { "epoch": 0.748013090229079, "grad_norm": 23.75, "learning_rate": 1.1228869370489933e-07, "logits/chosen": 0.8455230593681335, "logits/rejected": 0.726607620716095, "logps/chosen": -1.7042028903961182, "logps/rejected": -2.9396235942840576, "loss": 0.6624, "rewards/accuracies": 0.8125, "rewards/chosen": -17.042028427124023, "rewards/margins": 12.354209899902344, "rewards/rejected": -29.396238327026367, "step": 300 }, { "epoch": 0.7505064671965093, "grad_norm": 16.625, "learning_rate": 1.1020039976125454e-07, "logits/chosen": 0.862872838973999, "logits/rejected": 0.7240791320800781, "logps/chosen": -1.6873464584350586, "logps/rejected": -3.173642635345459, "loss": 0.4094, "rewards/accuracies": 0.84375, "rewards/chosen": -16.873464584350586, "rewards/margins": 14.862963676452637, "rewards/rejected": -31.73642921447754, "step": 301 }, { "epoch": 0.7529998441639395, "grad_norm": 20.625, "learning_rate": 1.0812807285459737e-07, "logits/chosen": 0.8827072978019714, "logits/rejected": 0.7801661491394043, "logps/chosen": -1.760999321937561, "logps/rejected": -3.0102696418762207, "loss": 0.1915, "rewards/accuracies": 0.9375, "rewards/chosen": -17.60999298095703, "rewards/margins": 12.492703437805176, "rewards/rejected": -30.10269546508789, "step": 302 }, { "epoch": 0.7554932211313699, "grad_norm": 9.875, "learning_rate": 1.0607185097064733e-07, "logits/chosen": 0.9539688229560852, "logits/rejected": 0.8203067183494568, "logps/chosen": -1.5383775234222412, "logps/rejected": -2.5994794368743896, "loss": 0.6321, "rewards/accuracies": 0.78125, "rewards/chosen": -15.38377571105957, "rewards/margins": 10.61102294921875, "rewards/rejected": -25.994796752929688, "step": 303 }, { "epoch": 0.7579865980988001, "grad_norm": 10.375, "learning_rate": 1.0403187102277212e-07, "logits/chosen": 0.9740419387817383, "logits/rejected": 0.7236615419387817, "logps/chosen": -1.680724024772644, "logps/rejected": -3.213937759399414, "loss": 0.5017, "rewards/accuracies": 0.75, "rewards/chosen": -16.807239532470703, "rewards/margins": 15.332136154174805, "rewards/rejected": -32.13937759399414, "step": 304 }, { "epoch": 0.7604799750662303, "grad_norm": 8.5, "learning_rate": 1.020082688428718e-07, "logits/chosen": 0.7849897146224976, "logits/rejected": 0.7147915959358215, "logps/chosen": -1.7177461385726929, "logps/rejected": -3.196275234222412, "loss": 0.4481, "rewards/accuracies": 0.84375, "rewards/chosen": -17.17746353149414, "rewards/margins": 14.785287857055664, "rewards/rejected": -31.962751388549805, "step": 305 }, { "epoch": 0.7629733520336606, "grad_norm": 6.84375, "learning_rate": 1.0000117917233373e-07, "logits/chosen": 0.7844271659851074, "logits/rejected": 0.795640230178833, "logps/chosen": -1.8986274003982544, "logps/rejected": -3.815453052520752, "loss": 0.2918, "rewards/accuracies": 0.875, "rewards/chosen": -18.98627471923828, "rewards/margins": 19.168254852294922, "rewards/rejected": -38.1545295715332, "step": 306 }, { "epoch": 0.7654667290010908, "grad_norm": 22.375, "learning_rate": 9.801073565306134e-08, "logits/chosen": 0.915310800075531, "logits/rejected": 0.8614601492881775, "logps/chosen": -1.577059030532837, "logps/rejected": -2.661583185195923, "loss": 0.7258, "rewards/accuracies": 0.71875, "rewards/chosen": -15.770591735839844, "rewards/margins": 10.84524154663086, "rewards/rejected": -26.61583137512207, "step": 307 }, { "epoch": 0.7679601059685212, "grad_norm": 11.5, "learning_rate": 9.603707081857533e-08, "logits/chosen": 0.8341223001480103, "logits/rejected": 0.7446467876434326, "logps/chosen": -2.0905356407165527, "logps/rejected": -3.864170551300049, "loss": 0.2911, "rewards/accuracies": 0.875, "rewards/chosen": -20.90535545349121, "rewards/margins": 17.73634910583496, "rewards/rejected": -38.64170837402344, "step": 308 }, { "epoch": 0.7704534829359514, "grad_norm": 9.4375, "learning_rate": 9.40803160851891e-08, "logits/chosen": 0.9718061685562134, "logits/rejected": 0.9494335651397705, "logps/chosen": -1.6537656784057617, "logps/rejected": -3.119168758392334, "loss": 0.9953, "rewards/accuracies": 0.78125, "rewards/chosen": -16.537656784057617, "rewards/margins": 14.654030799865723, "rewards/rejected": -31.191692352294922, "step": 309 }, { "epoch": 0.7729468599033816, "grad_norm": 6.1875, "learning_rate": 9.214060174325823e-08, "logits/chosen": 0.7993795871734619, "logits/rejected": 0.7918787002563477, "logps/chosen": -1.9169942140579224, "logps/rejected": -3.608771800994873, "loss": 0.4286, "rewards/accuracies": 0.9375, "rewards/chosen": -19.16994285583496, "rewards/margins": 16.91777229309082, "rewards/rejected": -36.08771514892578, "step": 310 }, { "epoch": 0.7754402368708119, "grad_norm": 76.0, "learning_rate": 9.021805694850552e-08, "logits/chosen": 0.7791964411735535, "logits/rejected": 0.6525046229362488, "logps/chosen": -1.878448724746704, "logps/rejected": -3.2219111919403076, "loss": 0.3889, "rewards/accuracies": 0.96875, "rewards/chosen": -18.784488677978516, "rewards/margins": 13.434623718261719, "rewards/rejected": -32.21910858154297, "step": 311 }, { "epoch": 0.7779336138382422, "grad_norm": 4.09375, "learning_rate": 8.831280971342049e-08, "logits/chosen": 0.8384397625923157, "logits/rejected": 0.8411078453063965, "logps/chosen": -1.9580962657928467, "logps/rejected": -3.684387445449829, "loss": 0.4734, "rewards/accuracies": 0.875, "rewards/chosen": -19.580963134765625, "rewards/margins": 17.26291275024414, "rewards/rejected": -36.8438720703125, "step": 312 }, { "epoch": 0.7804269908056725, "grad_norm": 7.46875, "learning_rate": 8.642498689873619e-08, "logits/chosen": 0.9194357395172119, "logits/rejected": 0.7971946597099304, "logps/chosen": -1.6777794361114502, "logps/rejected": -2.920085906982422, "loss": 0.6, "rewards/accuracies": 0.84375, "rewards/chosen": -16.777795791625977, "rewards/margins": 12.423064231872559, "rewards/rejected": -29.20086097717285, "step": 313 }, { "epoch": 0.7829203677731027, "grad_norm": 25.0, "learning_rate": 8.45547142049821e-08, "logits/chosen": 0.8890621066093445, "logits/rejected": 0.6691703796386719, "logps/chosen": -1.6438565254211426, "logps/rejected": -3.2583494186401367, "loss": 0.2676, "rewards/accuracies": 0.875, "rewards/chosen": -16.43856430053711, "rewards/margins": 16.14493179321289, "rewards/rejected": -32.58349609375, "step": 314 }, { "epoch": 0.7854137447405329, "grad_norm": 36.75, "learning_rate": 8.270211616411413e-08, "logits/chosen": 0.8961160182952881, "logits/rejected": 0.7380497455596924, "logps/chosen": -1.8019180297851562, "logps/rejected": -3.853311061859131, "loss": 0.4376, "rewards/accuracies": 0.84375, "rewards/chosen": -18.019180297851562, "rewards/margins": 20.513931274414062, "rewards/rejected": -38.533111572265625, "step": 315 }, { "epoch": 0.7879071217079632, "grad_norm": 4.9375, "learning_rate": 8.086731613122324e-08, "logits/chosen": 0.8375217914581299, "logits/rejected": 0.706248939037323, "logps/chosen": -1.8641583919525146, "logps/rejected": -3.3603389263153076, "loss": 0.203, "rewards/accuracies": 0.90625, "rewards/chosen": -18.641584396362305, "rewards/margins": 14.961803436279297, "rewards/rejected": -33.60338592529297, "step": 316 }, { "epoch": 0.7904004986753935, "grad_norm": 3.71875, "learning_rate": 7.905043627632113e-08, "logits/chosen": 0.7290425300598145, "logits/rejected": 0.7092160582542419, "logps/chosen": -1.6382381916046143, "logps/rejected": -3.2959959506988525, "loss": 0.2101, "rewards/accuracies": 0.875, "rewards/chosen": -16.382381439208984, "rewards/margins": 16.577579498291016, "rewards/rejected": -32.9599609375, "step": 317 }, { "epoch": 0.7928938756428238, "grad_norm": 6.71875, "learning_rate": 7.725159757620596e-08, "logits/chosen": 0.9056103825569153, "logits/rejected": 0.8917890787124634, "logps/chosen": -1.4776190519332886, "logps/rejected": -2.642958641052246, "loss": 0.46, "rewards/accuracies": 0.84375, "rewards/chosen": -14.776190757751465, "rewards/margins": 11.653392791748047, "rewards/rejected": -26.429582595825195, "step": 318 }, { "epoch": 0.795387252610254, "grad_norm": 7.28125, "learning_rate": 7.547091980640708e-08, "logits/chosen": 0.7614390850067139, "logits/rejected": 0.7574427127838135, "logps/chosen": -1.3012231588363647, "logps/rejected": -2.685375928878784, "loss": 0.4226, "rewards/accuracies": 0.84375, "rewards/chosen": -13.012231826782227, "rewards/margins": 13.841525077819824, "rewards/rejected": -26.853755950927734, "step": 319 }, { "epoch": 0.7978806295776842, "grad_norm": 18.625, "learning_rate": 7.370852153320973e-08, "logits/chosen": 0.9617218971252441, "logits/rejected": 0.7541022896766663, "logps/chosen": -1.5465266704559326, "logps/rejected": -2.6077213287353516, "loss": 0.6237, "rewards/accuracies": 0.8125, "rewards/chosen": -15.465266227722168, "rewards/margins": 10.611949920654297, "rewards/rejected": -26.07721710205078, "step": 320 }, { "epoch": 0.8003740065451146, "grad_norm": 4.96875, "learning_rate": 7.196452010576056e-08, "logits/chosen": 0.8094066381454468, "logits/rejected": 0.7924161553382874, "logps/chosen": -2.0370168685913086, "logps/rejected": -3.8806991577148438, "loss": 0.2498, "rewards/accuracies": 0.90625, "rewards/chosen": -20.370168685913086, "rewards/margins": 18.436824798583984, "rewards/rejected": -38.80699157714844, "step": 321 }, { "epoch": 0.8028673835125448, "grad_norm": 11.625, "learning_rate": 7.023903164825346e-08, "logits/chosen": 0.9718628525733948, "logits/rejected": 0.8176442384719849, "logps/chosen": -2.1258928775787354, "logps/rejected": -4.15927267074585, "loss": 0.6349, "rewards/accuracies": 0.875, "rewards/chosen": -21.258926391601562, "rewards/margins": 20.333797454833984, "rewards/rejected": -41.59272384643555, "step": 322 }, { "epoch": 0.8053607604799751, "grad_norm": 5.78125, "learning_rate": 6.853217105219782e-08, "logits/chosen": 0.7881964445114136, "logits/rejected": 0.6961764693260193, "logps/chosen": -1.541295051574707, "logps/rejected": -2.8516957759857178, "loss": 0.2766, "rewards/accuracies": 0.90625, "rewards/chosen": -15.412951469421387, "rewards/margins": 13.104007720947266, "rewards/rejected": -28.516956329345703, "step": 323 }, { "epoch": 0.8078541374474053, "grad_norm": 6.96875, "learning_rate": 6.684405196876843e-08, "logits/chosen": 0.9054229259490967, "logits/rejected": 0.799680233001709, "logps/chosen": -1.280112624168396, "logps/rejected": -2.1591079235076904, "loss": 0.6875, "rewards/accuracies": 0.78125, "rewards/chosen": -12.801126480102539, "rewards/margins": 8.789952278137207, "rewards/rejected": -21.591079711914062, "step": 324 }, { "epoch": 0.8103475144148355, "grad_norm": 11.875, "learning_rate": 6.517478680123776e-08, "logits/chosen": 0.8642288446426392, "logits/rejected": 0.825298547744751, "logps/chosen": -1.4944114685058594, "logps/rejected": -2.5201451778411865, "loss": 0.7636, "rewards/accuracies": 0.71875, "rewards/chosen": -14.944114685058594, "rewards/margins": 10.257339477539062, "rewards/rejected": -25.201452255249023, "step": 325 }, { "epoch": 0.8128408913822659, "grad_norm": 7.59375, "learning_rate": 6.352448669749224e-08, "logits/chosen": 0.9343512654304504, "logits/rejected": 0.8261175155639648, "logps/chosen": -2.1113977432250977, "logps/rejected": -4.173766136169434, "loss": 0.3753, "rewards/accuracies": 0.875, "rewards/chosen": -21.11397933959961, "rewards/margins": 20.623685836791992, "rewards/rejected": -41.73766326904297, "step": 326 }, { "epoch": 0.8153342683496961, "grad_norm": 5.25, "learning_rate": 6.189326154263068e-08, "logits/chosen": 0.7759539484977722, "logits/rejected": 0.7987840175628662, "logps/chosen": -1.8643192052841187, "logps/rejected": -3.6068685054779053, "loss": 0.4069, "rewards/accuracies": 0.84375, "rewards/chosen": -18.643192291259766, "rewards/margins": 17.425495147705078, "rewards/rejected": -36.068687438964844, "step": 327 }, { "epoch": 0.8178276453171264, "grad_norm": 22.875, "learning_rate": 6.028121995164812e-08, "logits/chosen": 0.8969675302505493, "logits/rejected": 0.7524930238723755, "logps/chosen": -1.4999438524246216, "logps/rejected": -2.7047371864318848, "loss": 0.6928, "rewards/accuracies": 0.78125, "rewards/chosen": -14.999438285827637, "rewards/margins": 12.047935485839844, "rewards/rejected": -27.04737091064453, "step": 328 }, { "epoch": 0.8203210222845566, "grad_norm": 5.0, "learning_rate": 5.868846926220346e-08, "logits/chosen": 0.9210751056671143, "logits/rejected": 0.8755130767822266, "logps/chosen": -2.071080446243286, "logps/rejected": -4.128433704376221, "loss": 0.3026, "rewards/accuracies": 0.875, "rewards/chosen": -20.710805892944336, "rewards/margins": 20.573535919189453, "rewards/rejected": -41.284339904785156, "step": 329 }, { "epoch": 0.822814399251987, "grad_norm": 6.375, "learning_rate": 5.7115115527472575e-08, "logits/chosen": 0.746177077293396, "logits/rejected": 0.7545452117919922, "logps/chosen": -1.6824297904968262, "logps/rejected": -2.987593650817871, "loss": 0.3245, "rewards/accuracies": 0.875, "rewards/chosen": -16.824296951293945, "rewards/margins": 13.051637649536133, "rewards/rejected": -29.875934600830078, "step": 330 }, { "epoch": 0.8253077762194172, "grad_norm": 9.1875, "learning_rate": 5.556126350908654e-08, "logits/chosen": 0.8064150810241699, "logits/rejected": 0.7365544438362122, "logps/chosen": -1.7546963691711426, "logps/rejected": -3.0438730716705322, "loss": 0.4564, "rewards/accuracies": 0.84375, "rewards/chosen": -17.54696273803711, "rewards/margins": 12.891766548156738, "rewards/rejected": -30.438732147216797, "step": 331 }, { "epoch": 0.8278011531868474, "grad_norm": 40.75, "learning_rate": 5.402701667015655e-08, "logits/chosen": 0.8081064820289612, "logits/rejected": 0.8690564632415771, "logps/chosen": -2.0253567695617676, "logps/rejected": -3.4268417358398438, "loss": 0.591, "rewards/accuracies": 0.75, "rewards/chosen": -20.253568649291992, "rewards/margins": 14.014848709106445, "rewards/rejected": -34.26841735839844, "step": 332 }, { "epoch": 0.8302945301542777, "grad_norm": 5.90625, "learning_rate": 5.2512477168384125e-08, "logits/chosen": 0.7250826954841614, "logits/rejected": 0.6569004654884338, "logps/chosen": -1.7179774045944214, "logps/rejected": -3.111191987991333, "loss": 0.4474, "rewards/accuracies": 0.875, "rewards/chosen": -17.179773330688477, "rewards/margins": 13.932147979736328, "rewards/rejected": -31.111919403076172, "step": 333 }, { "epoch": 0.8327879071217079, "grad_norm": 5.5625, "learning_rate": 5.101774584925959e-08, "logits/chosen": 0.7951087355613708, "logits/rejected": 0.7745989561080933, "logps/chosen": -1.7096567153930664, "logps/rejected": -3.168893814086914, "loss": 0.3182, "rewards/accuracies": 0.84375, "rewards/chosen": -17.096567153930664, "rewards/margins": 14.592374801635742, "rewards/rejected": -31.688940048217773, "step": 334 }, { "epoch": 0.8352812840891383, "grad_norm": 11.8125, "learning_rate": 4.9542922239346865e-08, "logits/chosen": 0.9192527532577515, "logits/rejected": 0.8001135587692261, "logps/chosen": -1.9547454118728638, "logps/rejected": -3.6320178508758545, "loss": 0.2258, "rewards/accuracies": 0.875, "rewards/chosen": -19.547454833984375, "rewards/margins": 16.772724151611328, "rewards/rejected": -36.3201789855957, "step": 335 }, { "epoch": 0.8377746610565685, "grad_norm": 9.1875, "learning_rate": 4.8088104539656715e-08, "logits/chosen": 0.7919576168060303, "logits/rejected": 0.8408181667327881, "logps/chosen": -1.7132326364517212, "logps/rejected": -3.2617900371551514, "loss": 0.6176, "rewards/accuracies": 0.78125, "rewards/chosen": -17.132326126098633, "rewards/margins": 15.485575675964355, "rewards/rejected": -32.61790084838867, "step": 336 }, { "epoch": 0.8402680380239987, "grad_norm": 10.8125, "learning_rate": 4.665338961910819e-08, "logits/chosen": 0.9263704419136047, "logits/rejected": 0.9631155729293823, "logps/chosen": -1.8150866031646729, "logps/rejected": -3.4056591987609863, "loss": 0.3432, "rewards/accuracies": 0.8125, "rewards/chosen": -18.15086555480957, "rewards/margins": 15.905729293823242, "rewards/rejected": -34.05659103393555, "step": 337 }, { "epoch": 0.842761414991429, "grad_norm": 14.8125, "learning_rate": 4.5238873008078036e-08, "logits/chosen": 0.92448890209198, "logits/rejected": 0.9032832980155945, "logps/chosen": -1.8629943132400513, "logps/rejected": -3.826144218444824, "loss": 0.4574, "rewards/accuracies": 0.84375, "rewards/chosen": -18.62994384765625, "rewards/margins": 19.631500244140625, "rewards/rejected": -38.261444091796875, "step": 338 }, { "epoch": 0.8452547919588593, "grad_norm": 5.9375, "learning_rate": 4.38446488920405e-08, "logits/chosen": 0.7789740562438965, "logits/rejected": 0.7178948521614075, "logps/chosen": -1.713463544845581, "logps/rejected": -3.125110149383545, "loss": 0.1509, "rewards/accuracies": 0.96875, "rewards/chosen": -17.13463592529297, "rewards/margins": 14.11646556854248, "rewards/rejected": -31.251100540161133, "step": 339 }, { "epoch": 0.8477481689262896, "grad_norm": 11.5, "learning_rate": 4.247081010529546e-08, "logits/chosen": 0.7394505739212036, "logits/rejected": 0.7483058571815491, "logps/chosen": -1.7241424322128296, "logps/rejected": -2.999985456466675, "loss": 0.9044, "rewards/accuracies": 0.71875, "rewards/chosen": -17.241424560546875, "rewards/margins": 12.758427619934082, "rewards/rejected": -29.999855041503906, "step": 340 }, { "epoch": 0.8502415458937198, "grad_norm": 5.34375, "learning_rate": 4.1117448124787594e-08, "logits/chosen": 0.8453940153121948, "logits/rejected": 0.7934137582778931, "logps/chosen": -1.7638615369796753, "logps/rejected": -3.4477648735046387, "loss": 0.3345, "rewards/accuracies": 0.875, "rewards/chosen": -17.63861656188965, "rewards/margins": 16.839033126831055, "rewards/rejected": -34.4776496887207, "step": 341 }, { "epoch": 0.85273492286115, "grad_norm": 8.9375, "learning_rate": 3.9784653064014826e-08, "logits/chosen": 0.9908114671707153, "logits/rejected": 0.7525830268859863, "logps/chosen": -1.7432222366333008, "logps/rejected": -3.2565882205963135, "loss": 0.7245, "rewards/accuracies": 0.71875, "rewards/chosen": -17.432220458984375, "rewards/margins": 15.133658409118652, "rewards/rejected": -32.565879821777344, "step": 342 }, { "epoch": 0.8552282998285803, "grad_norm": 6.96875, "learning_rate": 3.8472513667028556e-08, "logits/chosen": 0.9397574067115784, "logits/rejected": 0.7737162709236145, "logps/chosen": -1.6045491695404053, "logps/rejected": -2.63539981842041, "loss": 0.4466, "rewards/accuracies": 0.875, "rewards/chosen": -16.04549217224121, "rewards/margins": 10.30850601196289, "rewards/rejected": -26.35399627685547, "step": 343 }, { "epoch": 0.8577216767960106, "grad_norm": 4.8125, "learning_rate": 3.7181117302524304e-08, "logits/chosen": 1.0774602890014648, "logits/rejected": 0.7839712500572205, "logps/chosen": -1.9381461143493652, "logps/rejected": -3.418266773223877, "loss": 0.4184, "rewards/accuracies": 0.875, "rewards/chosen": -19.381460189819336, "rewards/margins": 14.801210403442383, "rewards/rejected": -34.18267059326172, "step": 344 }, { "epoch": 0.8602150537634409, "grad_norm": 11.5, "learning_rate": 3.591054995802462e-08, "logits/chosen": 0.8521052598953247, "logits/rejected": 0.8215041756629944, "logps/chosen": -1.5074630975723267, "logps/rejected": -2.6447994709014893, "loss": 0.7877, "rewards/accuracies": 0.78125, "rewards/chosen": -15.074629783630371, "rewards/margins": 11.373364448547363, "rewards/rejected": -26.447994232177734, "step": 345 }, { "epoch": 0.8627084307308711, "grad_norm": 10.3125, "learning_rate": 3.466089623415333e-08, "logits/chosen": 0.8286025524139404, "logits/rejected": 0.7513220310211182, "logps/chosen": -2.0260732173919678, "logps/rejected": -3.53950572013855, "loss": 0.5153, "rewards/accuracies": 0.8125, "rewards/chosen": -20.260732650756836, "rewards/margins": 15.134326934814453, "rewards/rejected": -35.39506149291992, "step": 346 }, { "epoch": 0.8652018076983014, "grad_norm": 4.40625, "learning_rate": 3.3432239339002654e-08, "logits/chosen": 0.6205800175666809, "logits/rejected": 0.8249342441558838, "logps/chosen": -1.9891235828399658, "logps/rejected": -3.901944398880005, "loss": 0.3429, "rewards/accuracies": 0.875, "rewards/chosen": -19.891237258911133, "rewards/margins": 19.12820816040039, "rewards/rejected": -39.01944351196289, "step": 347 }, { "epoch": 0.8676951846657317, "grad_norm": 6.34375, "learning_rate": 3.222466108259252e-08, "logits/chosen": 0.9737166166305542, "logits/rejected": 0.8952223658561707, "logps/chosen": -1.9070756435394287, "logps/rejected": -3.703439712524414, "loss": 0.3237, "rewards/accuracies": 0.90625, "rewards/chosen": -19.07075309753418, "rewards/margins": 17.96364402770996, "rewards/rejected": -37.03439712524414, "step": 348 }, { "epoch": 0.8701885616331619, "grad_norm": 6.65625, "learning_rate": 3.10382418714235e-08, "logits/chosen": 0.9149331450462341, "logits/rejected": 0.8159484267234802, "logps/chosen": -1.6051839590072632, "logps/rejected": -2.925374984741211, "loss": 0.5309, "rewards/accuracies": 0.84375, "rewards/chosen": -16.051841735839844, "rewards/margins": 13.201909065246582, "rewards/rejected": -29.25374984741211, "step": 349 }, { "epoch": 0.8726819386005922, "grad_norm": 6.65625, "learning_rate": 2.9873060703122815e-08, "logits/chosen": 0.9303115606307983, "logits/rejected": 0.8033692836761475, "logps/chosen": -2.0454282760620117, "logps/rejected": -3.6858325004577637, "loss": 0.3837, "rewards/accuracies": 0.84375, "rewards/chosen": -20.454280853271484, "rewards/margins": 16.40404510498047, "rewards/rejected": -36.85832595825195, "step": 350 }, { "epoch": 0.8751753155680224, "grad_norm": 8.0, "learning_rate": 2.8729195161184243e-08, "logits/chosen": 0.7548041939735413, "logits/rejected": 0.8524357080459595, "logps/chosen": -1.8255373239517212, "logps/rejected": -3.674532890319824, "loss": 0.5723, "rewards/accuracies": 0.84375, "rewards/chosen": -18.255373001098633, "rewards/margins": 18.489957809448242, "rewards/rejected": -36.745330810546875, "step": 351 }, { "epoch": 0.8776686925354527, "grad_norm": 15.8125, "learning_rate": 2.7606721409802498e-08, "logits/chosen": 0.9838480353355408, "logits/rejected": 0.8637805581092834, "logps/chosen": -1.717268705368042, "logps/rejected": -2.822934627532959, "loss": 0.7475, "rewards/accuracies": 0.75, "rewards/chosen": -17.17268943786621, "rewards/margins": 11.056660652160645, "rewards/rejected": -28.229345321655273, "step": 352 }, { "epoch": 0.880162069502883, "grad_norm": 6.34375, "learning_rate": 2.650571418880144e-08, "logits/chosen": 0.8108838796615601, "logits/rejected": 0.793830394744873, "logps/chosen": -1.8453660011291504, "logps/rejected": -3.455012083053589, "loss": 0.3673, "rewards/accuracies": 0.875, "rewards/chosen": -18.453659057617188, "rewards/margins": 16.096466064453125, "rewards/rejected": -34.55012512207031, "step": 353 }, { "epoch": 0.8826554464703132, "grad_norm": 5.15625, "learning_rate": 2.5426246808657902e-08, "logits/chosen": 0.7718413472175598, "logits/rejected": 0.7736707925796509, "logps/chosen": -1.9948774576187134, "logps/rejected": -3.8196072578430176, "loss": 0.2475, "rewards/accuracies": 0.90625, "rewards/chosen": -19.948774337768555, "rewards/margins": 18.247299194335938, "rewards/rejected": -38.19607162475586, "step": 354 }, { "epoch": 0.8851488234377435, "grad_norm": 8.25, "learning_rate": 2.4368391145620064e-08, "logits/chosen": 0.8589321374893188, "logits/rejected": 0.7836854457855225, "logps/chosen": -1.6310110092163086, "logps/rejected": -2.939948320388794, "loss": 0.2476, "rewards/accuracies": 0.875, "rewards/chosen": -16.31011199951172, "rewards/margins": 13.089373588562012, "rewards/rejected": -29.39948272705078, "step": 355 }, { "epoch": 0.8876422004051737, "grad_norm": 5.53125, "learning_rate": 2.3332217636921637e-08, "logits/chosen": 0.9285929203033447, "logits/rejected": 0.8569374084472656, "logps/chosen": -1.9147915840148926, "logps/rejected": -3.808230400085449, "loss": 0.3004, "rewards/accuracies": 0.875, "rewards/chosen": -19.147912979125977, "rewards/margins": 18.934389114379883, "rewards/rejected": -38.08230209350586, "step": 356 }, { "epoch": 0.8901355773726041, "grad_norm": 10.6875, "learning_rate": 2.2317795276091977e-08, "logits/chosen": 0.8501561880111694, "logits/rejected": 0.8791577219963074, "logps/chosen": -1.746106505393982, "logps/rejected": -3.179072856903076, "loss": 0.8857, "rewards/accuracies": 0.71875, "rewards/chosen": -17.4610652923584, "rewards/margins": 14.329660415649414, "rewards/rejected": -31.79072380065918, "step": 357 }, { "epoch": 0.8926289543400343, "grad_norm": 9.1875, "learning_rate": 2.1325191608361908e-08, "logits/chosen": 0.8351438641548157, "logits/rejected": 0.8163132667541504, "logps/chosen": -1.5546523332595825, "logps/rejected": -2.714953660964966, "loss": 0.4302, "rewards/accuracies": 0.84375, "rewards/chosen": -15.546524047851562, "rewards/margins": 11.603012084960938, "rewards/rejected": -27.1495361328125, "step": 358 }, { "epoch": 0.8951223313074645, "grad_norm": 12.625, "learning_rate": 2.035447272616638e-08, "logits/chosen": 0.8828765153884888, "logits/rejected": 0.7569836378097534, "logps/chosen": -1.804396629333496, "logps/rejected": -3.248098134994507, "loss": 0.4005, "rewards/accuracies": 0.78125, "rewards/chosen": -18.043964385986328, "rewards/margins": 14.437012672424316, "rewards/rejected": -32.480979919433594, "step": 359 }, { "epoch": 0.8976157082748948, "grad_norm": 4.96875, "learning_rate": 1.9405703264743645e-08, "logits/chosen": 0.8172731995582581, "logits/rejected": 0.7823519706726074, "logps/chosen": -1.4623509645462036, "logps/rejected": -2.556082010269165, "loss": 0.3153, "rewards/accuracies": 0.84375, "rewards/chosen": -14.62350845336914, "rewards/margins": 10.937310218811035, "rewards/rejected": -25.56081771850586, "step": 360 }, { "epoch": 0.900109085242325, "grad_norm": 10.4375, "learning_rate": 1.8478946397831535e-08, "logits/chosen": 0.8463267683982849, "logits/rejected": 0.8544177412986755, "logps/chosen": -1.8382923603057861, "logps/rejected": -3.775862693786621, "loss": 0.405, "rewards/accuracies": 0.90625, "rewards/chosen": -18.382923126220703, "rewards/margins": 19.375703811645508, "rewards/rejected": -37.75862503051758, "step": 361 }, { "epoch": 0.9026024622097554, "grad_norm": 8.5, "learning_rate": 1.7574263833461018e-08, "logits/chosen": 0.85582435131073, "logits/rejected": 0.750614583492279, "logps/chosen": -1.5502458810806274, "logps/rejected": -2.6339385509490967, "loss": 0.4212, "rewards/accuracies": 0.78125, "rewards/chosen": -15.502457618713379, "rewards/margins": 10.836931228637695, "rewards/rejected": -26.33938980102539, "step": 362 }, { "epoch": 0.9050958391771856, "grad_norm": 8.25, "learning_rate": 1.6691715809847622e-08, "logits/chosen": 1.001466989517212, "logits/rejected": 0.8945422768592834, "logps/chosen": -1.4286935329437256, "logps/rejected": -2.590465545654297, "loss": 0.6361, "rewards/accuracies": 0.71875, "rewards/chosen": -14.28693675994873, "rewards/margins": 11.617722511291504, "rewards/rejected": -25.90465545654297, "step": 363 }, { "epoch": 0.9075892161446159, "grad_norm": 27.25, "learning_rate": 1.5831361091380085e-08, "logits/chosen": 1.0156899690628052, "logits/rejected": 0.9483416080474854, "logps/chosen": -2.2000174522399902, "logps/rejected": -3.7587168216705322, "loss": 0.7706, "rewards/accuracies": 0.65625, "rewards/chosen": -22.000173568725586, "rewards/margins": 15.586994171142578, "rewards/rejected": -37.5871696472168, "step": 364 }, { "epoch": 0.9100825931120461, "grad_norm": 11.9375, "learning_rate": 1.4993256964707667e-08, "logits/chosen": 0.9330320358276367, "logits/rejected": 0.7909821271896362, "logps/chosen": -1.7297008037567139, "logps/rejected": -3.1835391521453857, "loss": 0.6938, "rewards/accuracies": 0.71875, "rewards/chosen": -17.297008514404297, "rewards/margins": 14.538382530212402, "rewards/rejected": -31.835391998291016, "step": 365 }, { "epoch": 0.9125759700794764, "grad_norm": 150.0, "learning_rate": 1.4177459234925959e-08, "logits/chosen": 1.0243542194366455, "logits/rejected": 0.8424570560455322, "logps/chosen": -1.55518639087677, "logps/rejected": -2.423957109451294, "loss": 0.74, "rewards/accuracies": 0.625, "rewards/chosen": -15.551864624023438, "rewards/margins": 8.687705993652344, "rewards/rejected": -24.23957061767578, "step": 366 }, { "epoch": 0.9150693470469067, "grad_norm": 3.8125, "learning_rate": 1.3384022221860707e-08, "logits/chosen": 0.7477589845657349, "logits/rejected": 0.7240265011787415, "logps/chosen": -1.9058078527450562, "logps/rejected": -4.386825084686279, "loss": 0.1929, "rewards/accuracies": 0.9375, "rewards/chosen": -19.05807876586914, "rewards/margins": 24.810171127319336, "rewards/rejected": -43.868247985839844, "step": 367 }, { "epoch": 0.9175627240143369, "grad_norm": 11.3125, "learning_rate": 1.2612998756451366e-08, "logits/chosen": 0.8763638734817505, "logits/rejected": 0.8146540522575378, "logps/chosen": -1.8290818929672241, "logps/rejected": -3.116455554962158, "loss": 0.6755, "rewards/accuracies": 0.71875, "rewards/chosen": -18.290821075439453, "rewards/margins": 12.873735427856445, "rewards/rejected": -31.164554595947266, "step": 368 }, { "epoch": 0.9200561009817672, "grad_norm": 45.0, "learning_rate": 1.1864440177232976e-08, "logits/chosen": 0.8767358660697937, "logits/rejected": 0.7798057794570923, "logps/chosen": -1.9157465696334839, "logps/rejected": -4.174086570739746, "loss": 0.3354, "rewards/accuracies": 0.875, "rewards/chosen": -19.1574649810791, "rewards/margins": 22.58340072631836, "rewards/rejected": -41.74085998535156, "step": 369 }, { "epoch": 0.9225494779491974, "grad_norm": 7.6875, "learning_rate": 1.1138396326917977e-08, "logits/chosen": 0.9398146867752075, "logits/rejected": 0.9706467390060425, "logps/chosen": -2.1085903644561768, "logps/rejected": -3.980117082595825, "loss": 0.438, "rewards/accuracies": 0.90625, "rewards/chosen": -21.085905075073242, "rewards/margins": 18.71526336669922, "rewards/rejected": -39.80117416381836, "step": 370 }, { "epoch": 0.9250428549166277, "grad_norm": 6.59375, "learning_rate": 1.0434915549077461e-08, "logits/chosen": 0.9370230436325073, "logits/rejected": 0.6799491047859192, "logps/chosen": -2.0628821849823, "logps/rejected": -4.021778106689453, "loss": 0.2515, "rewards/accuracies": 0.875, "rewards/chosen": -20.628820419311523, "rewards/margins": 19.588960647583008, "rewards/rejected": -40.21778106689453, "step": 371 }, { "epoch": 0.927536231884058, "grad_norm": 4.65625, "learning_rate": 9.754044684922053e-09, "logits/chosen": 0.9780002236366272, "logits/rejected": 0.8844839334487915, "logps/chosen": -2.1081106662750244, "logps/rejected": -3.9642934799194336, "loss": 0.2881, "rewards/accuracies": 0.875, "rewards/chosen": -21.08110809326172, "rewards/margins": 18.56182861328125, "rewards/rejected": -39.64293670654297, "step": 372 }, { "epoch": 0.9300296088514882, "grad_norm": 38.0, "learning_rate": 9.095829070183286e-09, "logits/chosen": 0.8360333442687988, "logits/rejected": 0.7579271793365479, "logps/chosen": -1.7824090719223022, "logps/rejected": -3.0244534015655518, "loss": 0.8543, "rewards/accuracies": 0.78125, "rewards/chosen": -17.82408905029297, "rewards/margins": 12.42044448852539, "rewards/rejected": -30.24453353881836, "step": 373 }, { "epoch": 0.9325229858189185, "grad_norm": 9.625, "learning_rate": 8.460312532094555e-09, "logits/chosen": 0.85768723487854, "logits/rejected": 0.8098315000534058, "logps/chosen": -1.6384191513061523, "logps/rejected": -2.991260051727295, "loss": 0.3512, "rewards/accuracies": 0.875, "rewards/chosen": -16.38418960571289, "rewards/margins": 13.528410911560059, "rewards/rejected": -29.9126033782959, "step": 374 }, { "epoch": 0.9350163627863488, "grad_norm": 164.0, "learning_rate": 7.847537386473157e-09, "logits/chosen": 0.7866430878639221, "logits/rejected": 0.8129922747612, "logps/chosen": -1.9942247867584229, "logps/rejected": -3.5987548828125, "loss": 0.4176, "rewards/accuracies": 0.84375, "rewards/chosen": -19.942249298095703, "rewards/margins": 16.04530143737793, "rewards/rejected": -35.987548828125, "step": 375 }, { "epoch": 0.937509739753779, "grad_norm": 7.46875, "learning_rate": 7.257544434902646e-09, "logits/chosen": 0.7069447040557861, "logits/rejected": 0.7614144086837769, "logps/chosen": -1.3751400709152222, "logps/rejected": -2.30409574508667, "loss": 0.4299, "rewards/accuracies": 0.78125, "rewards/chosen": -13.751401901245117, "rewards/margins": 9.289555549621582, "rewards/rejected": -23.040958404541016, "step": 376 }, { "epoch": 0.9400031167212093, "grad_norm": 11.75, "learning_rate": 6.690372962015922e-09, "logits/chosen": 0.7851680517196655, "logits/rejected": 0.7357572913169861, "logps/chosen": -1.5484297275543213, "logps/rejected": -2.7063143253326416, "loss": 0.835, "rewards/accuracies": 0.78125, "rewards/chosen": -15.484295845031738, "rewards/margins": 11.578847885131836, "rewards/rejected": -27.063142776489258, "step": 377 }, { "epoch": 0.9424964936886395, "grad_norm": 17.625, "learning_rate": 6.146060732879643e-09, "logits/chosen": 0.9812300801277161, "logits/rejected": 0.9034566879272461, "logps/chosen": -1.6774706840515137, "logps/rejected": -2.880139112472534, "loss": 0.711, "rewards/accuracies": 0.6875, "rewards/chosen": -16.774707794189453, "rewards/margins": 12.026679992675781, "rewards/rejected": -28.801387786865234, "step": 378 }, { "epoch": 0.9449898706560698, "grad_norm": 8.6875, "learning_rate": 5.624643990479616e-09, "logits/chosen": 0.8070354461669922, "logits/rejected": 0.8890936374664307, "logps/chosen": -1.5846179723739624, "logps/rejected": -2.867779016494751, "loss": 0.9569, "rewards/accuracies": 0.75, "rewards/chosen": -15.846179962158203, "rewards/margins": 12.831609725952148, "rewards/rejected": -28.67778778076172, "step": 379 }, { "epoch": 0.9474832476235001, "grad_norm": 12.0, "learning_rate": 5.126157453307456e-09, "logits/chosen": 0.9138520359992981, "logits/rejected": 0.8435475826263428, "logps/chosen": -1.5853928327560425, "logps/rejected": -3.102871894836426, "loss": 0.3206, "rewards/accuracies": 0.90625, "rewards/chosen": -15.853928565979004, "rewards/margins": 15.174790382385254, "rewards/rejected": -31.028718948364258, "step": 380 }, { "epoch": 0.9499766245909304, "grad_norm": 5.6875, "learning_rate": 4.6506343130488956e-09, "logits/chosen": 0.7391858100891113, "logits/rejected": 0.7543048858642578, "logps/chosen": -2.2143211364746094, "logps/rejected": -4.464791774749756, "loss": 0.2126, "rewards/accuracies": 0.875, "rewards/chosen": -22.14321517944336, "rewards/margins": 22.50470542907715, "rewards/rejected": -44.647911071777344, "step": 381 }, { "epoch": 0.9524700015583606, "grad_norm": 28.375, "learning_rate": 4.198106232373788e-09, "logits/chosen": 0.8407728672027588, "logits/rejected": 0.7748773694038391, "logps/chosen": -1.5643658638000488, "logps/rejected": -2.904953956604004, "loss": 0.6156, "rewards/accuracies": 0.84375, "rewards/chosen": -15.643659591674805, "rewards/margins": 13.405879020690918, "rewards/rejected": -29.049535751342773, "step": 382 }, { "epoch": 0.9549633785257908, "grad_norm": 5.5, "learning_rate": 3.768603342827719e-09, "logits/chosen": 0.7649537324905396, "logits/rejected": 0.9055894613265991, "logps/chosen": -2.0387561321258545, "logps/rejected": -3.687187671661377, "loss": 0.2725, "rewards/accuracies": 0.875, "rewards/chosen": -20.387561798095703, "rewards/margins": 16.484315872192383, "rewards/rejected": -36.87187957763672, "step": 383 }, { "epoch": 0.9574567554932212, "grad_norm": 9.625, "learning_rate": 3.3621542428259764e-09, "logits/chosen": 0.7531914710998535, "logits/rejected": 0.7303828001022339, "logps/chosen": -1.985339641571045, "logps/rejected": -3.589801549911499, "loss": 0.2374, "rewards/accuracies": 0.90625, "rewards/chosen": -19.853397369384766, "rewards/margins": 16.044618606567383, "rewards/rejected": -35.898014068603516, "step": 384 }, { "epoch": 0.9599501324606514, "grad_norm": 13.1875, "learning_rate": 2.978785995748928e-09, "logits/chosen": 0.8882652521133423, "logits/rejected": 0.7716068029403687, "logps/chosen": -1.404790997505188, "logps/rejected": -2.1909685134887695, "loss": 1.0033, "rewards/accuracies": 0.71875, "rewards/chosen": -14.047908782958984, "rewards/margins": 7.8617753982543945, "rewards/rejected": -21.909685134887695, "step": 385 }, { "epoch": 0.9624435094280817, "grad_norm": 13.8125, "learning_rate": 2.618524128140309e-09, "logits/chosen": 0.8234500885009766, "logits/rejected": 0.826442301273346, "logps/chosen": -1.7514184713363647, "logps/rejected": -3.4685299396514893, "loss": 0.4523, "rewards/accuracies": 0.875, "rewards/chosen": -17.51418685913086, "rewards/margins": 17.171112060546875, "rewards/rejected": -34.685298919677734, "step": 386 }, { "epoch": 0.9649368863955119, "grad_norm": 19.0, "learning_rate": 2.2813926280074225e-09, "logits/chosen": 0.9154322147369385, "logits/rejected": 0.7984371781349182, "logps/chosen": -1.6743313074111938, "logps/rejected": -2.603154182434082, "loss": 0.6813, "rewards/accuracies": 0.75, "rewards/chosen": -16.74331283569336, "rewards/margins": 9.288228988647461, "rewards/rejected": -26.03154182434082, "step": 387 }, { "epoch": 0.9674302633629421, "grad_norm": 9.6875, "learning_rate": 1.9674139432240056e-09, "logits/chosen": 0.7447303533554077, "logits/rejected": 0.6416030526161194, "logps/chosen": -1.896831750869751, "logps/rejected": -3.2503926753997803, "loss": 0.2141, "rewards/accuracies": 0.90625, "rewards/chosen": -18.96831703186035, "rewards/margins": 13.535609245300293, "rewards/rejected": -32.50392532348633, "step": 388 }, { "epoch": 0.9699236403303725, "grad_norm": 3.5, "learning_rate": 1.6766089800352934e-09, "logits/chosen": 0.8328643441200256, "logits/rejected": 0.8856253623962402, "logps/chosen": -2.1792397499084473, "logps/rejected": -4.523174285888672, "loss": 0.2019, "rewards/accuracies": 0.9375, "rewards/chosen": -21.792396545410156, "rewards/margins": 23.43934440612793, "rewards/rejected": -45.23174285888672, "step": 389 }, { "epoch": 0.9724170172978027, "grad_norm": 8.25, "learning_rate": 1.408997101666326e-09, "logits/chosen": 0.8109475374221802, "logits/rejected": 0.9047868251800537, "logps/chosen": -2.0790956020355225, "logps/rejected": -4.099780559539795, "loss": 0.1869, "rewards/accuracies": 0.90625, "rewards/chosen": -20.790956497192383, "rewards/margins": 20.206846237182617, "rewards/rejected": -40.997806549072266, "step": 390 }, { "epoch": 0.974910394265233, "grad_norm": 5.375, "learning_rate": 1.1645961270323746e-09, "logits/chosen": 0.8082598447799683, "logits/rejected": 0.7497880458831787, "logps/chosen": -1.4980417490005493, "logps/rejected": -3.6578102111816406, "loss": 0.3026, "rewards/accuracies": 0.875, "rewards/chosen": -14.98041820526123, "rewards/margins": 21.59768295288086, "rewards/rejected": -36.578102111816406, "step": 391 }, { "epoch": 0.9774037712326632, "grad_norm": 14.0625, "learning_rate": 9.434223295524958e-10, "logits/chosen": 0.822067141532898, "logits/rejected": 0.8411962985992432, "logps/chosen": -1.5677428245544434, "logps/rejected": -2.5952415466308594, "loss": 0.421, "rewards/accuracies": 0.9375, "rewards/chosen": -15.677427291870117, "rewards/margins": 10.274986267089844, "rewards/rejected": -25.95241355895996, "step": 392 }, { "epoch": 0.9798971482000935, "grad_norm": 41.0, "learning_rate": 7.454904360661762e-10, "logits/chosen": 0.7836760878562927, "logits/rejected": 0.7335962653160095, "logps/chosen": -1.7851057052612305, "logps/rejected": -3.307037830352783, "loss": 0.6381, "rewards/accuracies": 0.78125, "rewards/chosen": -17.851055145263672, "rewards/margins": 15.219318389892578, "rewards/rejected": -33.07037353515625, "step": 393 }, { "epoch": 0.9823905251675238, "grad_norm": 7.6875, "learning_rate": 5.708136258525231e-10, "logits/chosen": 1.0210167169570923, "logits/rejected": 0.8218429088592529, "logps/chosen": -1.745602011680603, "logps/rejected": -3.2325327396392822, "loss": 0.4579, "rewards/accuracies": 0.84375, "rewards/chosen": -17.45602035522461, "rewards/margins": 14.869308471679688, "rewards/rejected": -32.3253288269043, "step": 394 }, { "epoch": 0.984883902134954, "grad_norm": 8.8125, "learning_rate": 4.194035297527765e-10, "logits/chosen": 0.9661321640014648, "logits/rejected": 0.8232787847518921, "logps/chosen": -1.6603198051452637, "logps/rejected": -3.042840003967285, "loss": 0.5841, "rewards/accuracies": 0.8125, "rewards/chosen": -16.603199005126953, "rewards/margins": 13.825201034545898, "rewards/rejected": -30.42839813232422, "step": 395 }, { "epoch": 0.9873772791023843, "grad_norm": 9.875, "learning_rate": 2.912702293959901e-10, "logits/chosen": 0.9294121861457825, "logits/rejected": 0.7912936210632324, "logps/chosen": -1.8600192070007324, "logps/rejected": -3.488579750061035, "loss": 0.3076, "rewards/accuracies": 0.90625, "rewards/chosen": -18.600191116333008, "rewards/margins": 16.28560447692871, "rewards/rejected": -34.885799407958984, "step": 396 }, { "epoch": 0.9898706560698145, "grad_norm": 6.28125, "learning_rate": 1.8642225652760746e-10, "logits/chosen": 1.0502732992172241, "logits/rejected": 0.7951204180717468, "logps/chosen": -1.755967617034912, "logps/rejected": -3.1554126739501953, "loss": 0.4734, "rewards/accuracies": 0.84375, "rewards/chosen": -17.559675216674805, "rewards/margins": 13.994451522827148, "rewards/rejected": -31.554126739501953, "step": 397 }, { "epoch": 0.9923640330372449, "grad_norm": 6.15625, "learning_rate": 1.0486659244136054e-10, "logits/chosen": 0.8521815538406372, "logits/rejected": 0.8380050659179688, "logps/chosen": -1.743971586227417, "logps/rejected": -3.0718655586242676, "loss": 0.2066, "rewards/accuracies": 0.875, "rewards/chosen": -17.439714431762695, "rewards/margins": 13.27894115447998, "rewards/rejected": -30.71865463256836, "step": 398 }, { "epoch": 0.9948574100046751, "grad_norm": 8.0625, "learning_rate": 4.6608667514608234e-11, "logits/chosen": 0.8208640217781067, "logits/rejected": 0.7938324213027954, "logps/chosen": -1.5473930835723877, "logps/rejected": -2.6816587448120117, "loss": 0.3252, "rewards/accuracies": 0.875, "rewards/chosen": -15.473930358886719, "rewards/margins": 11.342655181884766, "rewards/rejected": -26.816585540771484, "step": 399 }, { "epoch": 0.9973507869721053, "grad_norm": 4.71875, "learning_rate": 1.1652360846531317e-11, "logits/chosen": 1.0007308721542358, "logits/rejected": 0.8393873572349548, "logps/chosen": -2.072484016418457, "logps/rejected": -4.150708198547363, "loss": 0.3247, "rewards/accuracies": 0.84375, "rewards/chosen": -20.724838256835938, "rewards/margins": 20.782241821289062, "rewards/rejected": -41.507083892822266, "step": 400 }, { "epoch": 0.9998441639395356, "grad_norm": 9.0, "learning_rate": 0.0, "logits/chosen": 0.8794471025466919, "logits/rejected": 0.8441964983940125, "logps/chosen": -1.8147742748260498, "logps/rejected": -3.1960084438323975, "loss": 0.3852, "rewards/accuracies": 0.875, "rewards/chosen": -18.147741317749023, "rewards/margins": 13.812341690063477, "rewards/rejected": -31.9600830078125, "step": 401 }, { "epoch": 0.9998441639395356, "step": 401, "total_flos": 5.68672318443248e+18, "train_loss": 2.14272621887599, "train_runtime": 89392.3847, "train_samples_per_second": 0.144, "train_steps_per_second": 0.004 } ], "logging_steps": 1, "max_steps": 401, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 110, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.68672318443248e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }