diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,11629 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9992652461425422, + "eval_steps": 100, + "global_step": 765, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001306229079924892, + "grad_norm": 5.53417315287533, + "learning_rate": 6.493506493506494e-09, + "logits/chosen": -0.7533285021781921, + "logits/rejected": -0.8020980358123779, + "logps/chosen": -306.6681823730469, + "logps/rejected": -328.7090148925781, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.002612458159849784, + "grad_norm": 5.368357576960736, + "learning_rate": 1.2987012987012988e-08, + "logits/chosen": -0.9330071210861206, + "logits/rejected": -0.9560132026672363, + "logps/chosen": -330.1785888671875, + "logps/rejected": -321.1015625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.0039186872397746755, + "grad_norm": 6.432629734255936, + "learning_rate": 1.9480519480519478e-08, + "logits/chosen": -0.7450475692749023, + "logits/rejected": -0.7653542757034302, + "logps/chosen": -332.4261779785156, + "logps/rejected": -324.4579772949219, + "loss": 0.6931, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0014441871317103505, + "rewards/margins": 0.0007401347393169999, + "rewards/rejected": -0.0021843216381967068, + "step": 3 + }, + { + "epoch": 0.005224916319699568, + "grad_norm": 7.0047837477379895, + "learning_rate": 2.5974025974025976e-08, + "logits/chosen": -0.7317397594451904, + "logits/rejected": -0.7434061169624329, + "logps/chosen": -350.59619140625, + "logps/rejected": -409.7134704589844, + "loss": 0.6926, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0004230784543324262, + "rewards/margins": 0.00048499589320272207, + "rewards/rejected": -6.191732245497406e-05, + "step": 4 + }, + { + "epoch": 0.006531145399624459, + "grad_norm": 6.366176631380578, + "learning_rate": 3.246753246753246e-08, + "logits/chosen": -0.680822491645813, + "logits/rejected": -0.6706495881080627, + "logps/chosen": -346.29913330078125, + "logps/rejected": -405.939453125, + "loss": 0.6933, + "rewards/accuracies": 0.4375, + "rewards/chosen": 3.0529568903148174e-05, + "rewards/margins": -0.0002498173853382468, + "rewards/rejected": 0.00028034677961841226, + "step": 5 + }, + { + "epoch": 0.007837374479549351, + "grad_norm": 5.6539667591600695, + "learning_rate": 3.8961038961038956e-08, + "logits/chosen": -0.8521619439125061, + "logits/rejected": -0.8027850985527039, + "logps/chosen": -337.5583190917969, + "logps/rejected": -355.25732421875, + "loss": 0.6934, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0013673019129782915, + "rewards/margins": 0.0007385491626337171, + "rewards/rejected": 0.0006287526921369135, + "step": 6 + }, + { + "epoch": 0.009143603559474243, + "grad_norm": 5.560604956235579, + "learning_rate": 4.545454545454545e-08, + "logits/chosen": -0.7255852818489075, + "logits/rejected": -0.7422863245010376, + "logps/chosen": -319.12176513671875, + "logps/rejected": -319.749267578125, + "loss": 0.6933, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.0008336210157722235, + "rewards/margins": 0.001907060039229691, + "rewards/rejected": -0.0010734390234574676, + "step": 7 + }, + { + "epoch": 0.010449832639399135, + "grad_norm": 5.955487253622147, + "learning_rate": 5.194805194805195e-08, + "logits/chosen": -0.7017946243286133, + "logits/rejected": -0.7843577265739441, + "logps/chosen": -278.2752380371094, + "logps/rejected": -350.0848388671875, + "loss": 0.6932, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.001695701852440834, + "rewards/margins": 0.000880406005308032, + "rewards/rejected": -0.002576107857748866, + "step": 8 + }, + { + "epoch": 0.011756061719324026, + "grad_norm": 6.006797093418932, + "learning_rate": 5.844155844155844e-08, + "logits/chosen": -0.6909061074256897, + "logits/rejected": -0.6938877105712891, + "logps/chosen": -325.1270751953125, + "logps/rejected": -367.7559814453125, + "loss": 0.6931, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0005253530107438564, + "rewards/margins": -8.34012171253562e-05, + "rewards/rejected": -0.00044195164809934795, + "step": 9 + }, + { + "epoch": 0.013062290799248918, + "grad_norm": 5.748997143664564, + "learning_rate": 6.493506493506492e-08, + "logits/chosen": -0.8251830339431763, + "logits/rejected": -0.8110605478286743, + "logps/chosen": -347.2880859375, + "logps/rejected": -325.6681823730469, + "loss": 0.6927, + "rewards/accuracies": 0.40625, + "rewards/chosen": 0.0017610812792554498, + "rewards/margins": 0.0014071224723011255, + "rewards/rejected": 0.0003539586905390024, + "step": 10 + }, + { + "epoch": 0.01436851987917381, + "grad_norm": 5.904174312860514, + "learning_rate": 7.142857142857142e-08, + "logits/chosen": -0.9508811235427856, + "logits/rejected": -0.9571268558502197, + "logps/chosen": -372.4806213378906, + "logps/rejected": -368.0173645019531, + "loss": 0.6932, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.001803419436328113, + "rewards/margins": 0.0010235118679702282, + "rewards/rejected": 0.0007799076847732067, + "step": 11 + }, + { + "epoch": 0.015674748959098702, + "grad_norm": 5.508795346436636, + "learning_rate": 7.792207792207791e-08, + "logits/chosen": -0.825182318687439, + "logits/rejected": -0.843744158744812, + "logps/chosen": -314.36944580078125, + "logps/rejected": -337.3401794433594, + "loss": 0.6936, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.00026215065736323595, + "rewards/margins": -0.002879395382478833, + "rewards/rejected": 0.0026172446087002754, + "step": 12 + }, + { + "epoch": 0.016980978039023594, + "grad_norm": 5.203593985403037, + "learning_rate": 8.441558441558441e-08, + "logits/chosen": -0.8466988801956177, + "logits/rejected": -0.8560088276863098, + "logps/chosen": -377.123046875, + "logps/rejected": -355.28460693359375, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": -6.914167897775769e-06, + "rewards/margins": -0.00111465435475111, + "rewards/rejected": 0.0011077403323724866, + "step": 13 + }, + { + "epoch": 0.018287207118948486, + "grad_norm": 4.714653883292506, + "learning_rate": 9.09090909090909e-08, + "logits/chosen": -0.6836709976196289, + "logits/rejected": -0.6261293888092041, + "logps/chosen": -336.0246276855469, + "logps/rejected": -342.3045349121094, + "loss": 0.6929, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.00016624919953756034, + "rewards/margins": 0.0006826830212958157, + "rewards/rejected": -0.0008489321917295456, + "step": 14 + }, + { + "epoch": 0.01959343619887338, + "grad_norm": 6.248100770125027, + "learning_rate": 9.74025974025974e-08, + "logits/chosen": -0.8154281973838806, + "logits/rejected": -0.8204558491706848, + "logps/chosen": -365.2437744140625, + "logps/rejected": -361.426025390625, + "loss": 0.693, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0007468414260074496, + "rewards/margins": 0.0015008283080533147, + "rewards/rejected": -0.0022476697340607643, + "step": 15 + }, + { + "epoch": 0.02089966527879827, + "grad_norm": 5.297998460944868, + "learning_rate": 1.038961038961039e-07, + "logits/chosen": -0.817639172077179, + "logits/rejected": -0.8031401634216309, + "logps/chosen": -360.5960388183594, + "logps/rejected": -339.3817138671875, + "loss": 0.6928, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0016389607917517424, + "rewards/margins": 0.001264180988073349, + "rewards/rejected": -0.002903142012655735, + "step": 16 + }, + { + "epoch": 0.022205894358723163, + "grad_norm": 6.012963273370913, + "learning_rate": 1.1038961038961038e-07, + "logits/chosen": -0.8308616876602173, + "logits/rejected": -0.8376795053482056, + "logps/chosen": -368.58013916015625, + "logps/rejected": -363.4211730957031, + "loss": 0.693, + "rewards/accuracies": 0.53125, + "rewards/chosen": 5.0950038712471724e-05, + "rewards/margins": 0.0007496691541746259, + "rewards/rejected": -0.0006987190572544932, + "step": 17 + }, + { + "epoch": 0.02351212343864805, + "grad_norm": 5.25011457146126, + "learning_rate": 1.1688311688311688e-07, + "logits/chosen": -0.8814838528633118, + "logits/rejected": -0.9372345209121704, + "logps/chosen": -336.8822937011719, + "logps/rejected": -394.75213623046875, + "loss": 0.6926, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.0012537740403786302, + "rewards/margins": 0.00014532334171235561, + "rewards/rejected": -0.0013990971492603421, + "step": 18 + }, + { + "epoch": 0.024818352518572943, + "grad_norm": 6.447271557198901, + "learning_rate": 1.2337662337662337e-07, + "logits/chosen": -0.8274356126785278, + "logits/rejected": -0.8105829358100891, + "logps/chosen": -400.1302490234375, + "logps/rejected": -395.794677734375, + "loss": 0.693, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.0009712266619317234, + "rewards/margins": -0.000335605232976377, + "rewards/rejected": -0.0006356216035783291, + "step": 19 + }, + { + "epoch": 0.026124581598497836, + "grad_norm": 5.223075966265418, + "learning_rate": 1.2987012987012984e-07, + "logits/chosen": -0.8048669099807739, + "logits/rejected": -0.7924157381057739, + "logps/chosen": -364.3175964355469, + "logps/rejected": -345.0390625, + "loss": 0.6931, + "rewards/accuracies": 0.34375, + "rewards/chosen": -0.002713401336222887, + "rewards/margins": -0.0027847718447446823, + "rewards/rejected": 7.137066859286278e-05, + "step": 20 + }, + { + "epoch": 0.027430810678422728, + "grad_norm": 5.073866807536054, + "learning_rate": 1.3636363636363635e-07, + "logits/chosen": -0.710981011390686, + "logits/rejected": -0.7615970373153687, + "logps/chosen": -309.5833435058594, + "logps/rejected": -340.5343322753906, + "loss": 0.6935, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.002394826617091894, + "rewards/margins": -0.0006454420508816838, + "rewards/rejected": -0.0017493844497948885, + "step": 21 + }, + { + "epoch": 0.02873703975834762, + "grad_norm": 5.890261968335633, + "learning_rate": 1.4285714285714285e-07, + "logits/chosen": -0.861196756362915, + "logits/rejected": -0.839537501335144, + "logps/chosen": -328.5140075683594, + "logps/rejected": -330.8149108886719, + "loss": 0.6928, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.002065961482003331, + "rewards/margins": 0.001079261302947998, + "rewards/rejected": -0.0031452225521206856, + "step": 22 + }, + { + "epoch": 0.030043268838272512, + "grad_norm": 6.2911479429566, + "learning_rate": 1.4935064935064935e-07, + "logits/chosen": -0.8220208883285522, + "logits/rejected": -0.7974289655685425, + "logps/chosen": -359.7161560058594, + "logps/rejected": -353.8252868652344, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.002723624696955085, + "rewards/margins": -0.0005458975210785866, + "rewards/rejected": -0.0021777271758764982, + "step": 23 + }, + { + "epoch": 0.031349497918197404, + "grad_norm": 5.635570219614167, + "learning_rate": 1.5584415584415582e-07, + "logits/chosen": -0.8430062532424927, + "logits/rejected": -0.8447110652923584, + "logps/chosen": -374.71612548828125, + "logps/rejected": -394.64373779296875, + "loss": 0.6925, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0024116707500070333, + "rewards/margins": 0.001287098159082234, + "rewards/rejected": -0.0036987685598433018, + "step": 24 + }, + { + "epoch": 0.032655726998122296, + "grad_norm": 6.5390462612025635, + "learning_rate": 1.6233766233766232e-07, + "logits/chosen": -0.8247517943382263, + "logits/rejected": -0.7864022254943848, + "logps/chosen": -328.7938537597656, + "logps/rejected": -388.9433288574219, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.002558019245043397, + "rewards/margins": 0.0008082650601863861, + "rewards/rejected": -0.0033662840723991394, + "step": 25 + }, + { + "epoch": 0.03396195607804719, + "grad_norm": 5.607040791661334, + "learning_rate": 1.6883116883116883e-07, + "logits/chosen": -0.7527787685394287, + "logits/rejected": -0.788528561592102, + "logps/chosen": -371.24884033203125, + "logps/rejected": -365.76025390625, + "loss": 0.6919, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.002985129365697503, + "rewards/margins": 0.0011370348511263728, + "rewards/rejected": -0.004122164100408554, + "step": 26 + }, + { + "epoch": 0.03526818515797208, + "grad_norm": 6.015200706987167, + "learning_rate": 1.7532467532467533e-07, + "logits/chosen": -0.7967982888221741, + "logits/rejected": -0.7809445858001709, + "logps/chosen": -316.6277160644531, + "logps/rejected": -321.981201171875, + "loss": 0.692, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0029974528588354588, + "rewards/margins": 0.0033523414749652147, + "rewards/rejected": -0.006349794566631317, + "step": 27 + }, + { + "epoch": 0.03657441423789697, + "grad_norm": 5.313916762315249, + "learning_rate": 1.818181818181818e-07, + "logits/chosen": -0.7769748568534851, + "logits/rejected": -0.7578511238098145, + "logps/chosen": -289.66497802734375, + "logps/rejected": -307.5926208496094, + "loss": 0.6926, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0026655124966055155, + "rewards/margins": 0.0017671298701316118, + "rewards/rejected": -0.004432642366737127, + "step": 28 + }, + { + "epoch": 0.037880643317821865, + "grad_norm": 6.03004406279561, + "learning_rate": 1.883116883116883e-07, + "logits/chosen": -0.7647740840911865, + "logits/rejected": -0.7874799966812134, + "logps/chosen": -355.2894592285156, + "logps/rejected": -344.6125183105469, + "loss": 0.6923, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.00052436109399423, + "rewards/margins": 0.0032959890086203814, + "rewards/rejected": -0.0038203501608222723, + "step": 29 + }, + { + "epoch": 0.03918687239774676, + "grad_norm": 5.689135947815352, + "learning_rate": 1.948051948051948e-07, + "logits/chosen": -0.8144919872283936, + "logits/rejected": -0.790741503238678, + "logps/chosen": -331.15130615234375, + "logps/rejected": -396.5157470703125, + "loss": 0.6919, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0018128703813999891, + "rewards/margins": 0.0031230710446834564, + "rewards/rejected": -0.004935941658914089, + "step": 30 + }, + { + "epoch": 0.04049310147767165, + "grad_norm": 5.393155495079252, + "learning_rate": 2.012987012987013e-07, + "logits/chosen": -0.7429512739181519, + "logits/rejected": -0.7122764587402344, + "logps/chosen": -338.0105895996094, + "logps/rejected": -338.2264404296875, + "loss": 0.6922, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.003354718443006277, + "rewards/margins": 0.004183335229754448, + "rewards/rejected": -0.007538053207099438, + "step": 31 + }, + { + "epoch": 0.04179933055759654, + "grad_norm": 5.668901019910231, + "learning_rate": 2.077922077922078e-07, + "logits/chosen": -0.8369336128234863, + "logits/rejected": -0.8760527968406677, + "logps/chosen": -320.0756530761719, + "logps/rejected": -340.4056701660156, + "loss": 0.6928, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.003008706495165825, + "rewards/margins": -0.00027210236294195056, + "rewards/rejected": -0.0027366040740162134, + "step": 32 + }, + { + "epoch": 0.04310555963752143, + "grad_norm": 5.826830283729323, + "learning_rate": 2.1428571428571426e-07, + "logits/chosen": -0.8723607063293457, + "logits/rejected": -0.8672237396240234, + "logps/chosen": -382.5314636230469, + "logps/rejected": -384.20733642578125, + "loss": 0.6915, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.0007879233453422785, + "rewards/margins": 0.004684963263571262, + "rewards/rejected": -0.005472886376082897, + "step": 33 + }, + { + "epoch": 0.044411788717446325, + "grad_norm": 5.158846671276125, + "learning_rate": 2.2077922077922076e-07, + "logits/chosen": -0.9048483967781067, + "logits/rejected": -0.9139261841773987, + "logps/chosen": -373.8707275390625, + "logps/rejected": -382.60357666015625, + "loss": 0.6917, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.008929147385060787, + "rewards/margins": 0.004028785042464733, + "rewards/rejected": -0.01295793242752552, + "step": 34 + }, + { + "epoch": 0.04571801779737122, + "grad_norm": 5.559939318000692, + "learning_rate": 2.2727272727272726e-07, + "logits/chosen": -0.7409634590148926, + "logits/rejected": -0.7191226482391357, + "logps/chosen": -348.6598205566406, + "logps/rejected": -328.2349548339844, + "loss": 0.6925, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.007357947528362274, + "rewards/margins": 0.0007142568356357515, + "rewards/rejected": -0.008072204887866974, + "step": 35 + }, + { + "epoch": 0.0470242468772961, + "grad_norm": 9.154842170611309, + "learning_rate": 2.3376623376623376e-07, + "logits/chosen": -0.8227452635765076, + "logits/rejected": -0.8269484639167786, + "logps/chosen": -357.90045166015625, + "logps/rejected": -402.8288269042969, + "loss": 0.6908, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.009749974124133587, + "rewards/margins": 0.0010247562313452363, + "rewards/rejected": -0.010774729773402214, + "step": 36 + }, + { + "epoch": 0.048330475957220995, + "grad_norm": 5.7031636677945015, + "learning_rate": 2.4025974025974024e-07, + "logits/chosen": -0.7661442756652832, + "logits/rejected": -0.7485809326171875, + "logps/chosen": -342.1812438964844, + "logps/rejected": -372.6727294921875, + "loss": 0.6919, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.008897833526134491, + "rewards/margins": 0.0031514859292656183, + "rewards/rejected": -0.01204932015389204, + "step": 37 + }, + { + "epoch": 0.04963670503714589, + "grad_norm": 4.892465982370716, + "learning_rate": 2.4675324675324674e-07, + "logits/chosen": -0.7579087018966675, + "logits/rejected": -0.7431960105895996, + "logps/chosen": -325.8722229003906, + "logps/rejected": -308.6358642578125, + "loss": 0.6913, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.004845607094466686, + "rewards/margins": -5.3153024055063725e-05, + "rewards/rejected": -0.004792453721165657, + "step": 38 + }, + { + "epoch": 0.05094293411707078, + "grad_norm": 6.881014592563438, + "learning_rate": 2.532467532467532e-07, + "logits/chosen": -0.8480383157730103, + "logits/rejected": -0.8337075710296631, + "logps/chosen": -359.5284423828125, + "logps/rejected": -349.55572509765625, + "loss": 0.691, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.007051127031445503, + "rewards/margins": 0.004876017104834318, + "rewards/rejected": -0.011927143670618534, + "step": 39 + }, + { + "epoch": 0.05224916319699567, + "grad_norm": 5.493754973788018, + "learning_rate": 2.597402597402597e-07, + "logits/chosen": -0.8774313926696777, + "logits/rejected": -0.8068287372589111, + "logps/chosen": -361.03448486328125, + "logps/rejected": -361.7033386230469, + "loss": 0.6909, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.0067117949947714806, + "rewards/margins": 0.005268721375614405, + "rewards/rejected": -0.011980515904724598, + "step": 40 + }, + { + "epoch": 0.05355539227692056, + "grad_norm": 6.457786146530932, + "learning_rate": 2.662337662337662e-07, + "logits/chosen": -0.8075283765792847, + "logits/rejected": -0.8117259740829468, + "logps/chosen": -326.5600891113281, + "logps/rejected": -381.46124267578125, + "loss": 0.6901, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.010759281925857067, + "rewards/margins": 0.00531811686232686, + "rewards/rejected": -0.016077399253845215, + "step": 41 + }, + { + "epoch": 0.054861621356845455, + "grad_norm": 5.567861196809157, + "learning_rate": 2.727272727272727e-07, + "logits/chosen": -0.8133203387260437, + "logits/rejected": -0.7968176603317261, + "logps/chosen": -356.2418212890625, + "logps/rejected": -365.59423828125, + "loss": 0.691, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.007873620837926865, + "rewards/margins": 0.004118986427783966, + "rewards/rejected": -0.011992606334388256, + "step": 42 + }, + { + "epoch": 0.05616785043677035, + "grad_norm": 7.195302776766147, + "learning_rate": 2.792207792207792e-07, + "logits/chosen": -0.6616829633712769, + "logits/rejected": -0.7046526670455933, + "logps/chosen": -355.0251770019531, + "logps/rejected": -368.4796142578125, + "loss": 0.6901, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.005020341835916042, + "rewards/margins": 0.0022294616792351007, + "rewards/rejected": -0.007249803282320499, + "step": 43 + }, + { + "epoch": 0.05747407951669524, + "grad_norm": 5.570002780795404, + "learning_rate": 2.857142857142857e-07, + "logits/chosen": -0.8667637705802917, + "logits/rejected": -0.8768635988235474, + "logps/chosen": -373.51416015625, + "logps/rejected": -368.0703430175781, + "loss": 0.6904, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.017643045634031296, + "rewards/margins": 0.008519397117197514, + "rewards/rejected": -0.026162443682551384, + "step": 44 + }, + { + "epoch": 0.05878030859662013, + "grad_norm": 6.762630790620927, + "learning_rate": 2.922077922077922e-07, + "logits/chosen": -0.8118594884872437, + "logits/rejected": -0.8025345206260681, + "logps/chosen": -397.09771728515625, + "logps/rejected": -412.2569274902344, + "loss": 0.6872, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.0086355684325099, + "rewards/margins": 0.010753954760730267, + "rewards/rejected": -0.019389525055885315, + "step": 45 + }, + { + "epoch": 0.060086537676545024, + "grad_norm": 6.109227844723321, + "learning_rate": 2.987012987012987e-07, + "logits/chosen": -0.6671664714813232, + "logits/rejected": -0.6598337888717651, + "logps/chosen": -353.7411193847656, + "logps/rejected": -359.293701171875, + "loss": 0.6894, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.014533436857163906, + "rewards/margins": 0.0012778714299201965, + "rewards/rejected": -0.015811307355761528, + "step": 46 + }, + { + "epoch": 0.061392766756469916, + "grad_norm": 6.70995116363929, + "learning_rate": 3.0519480519480515e-07, + "logits/chosen": -0.7817015647888184, + "logits/rejected": -0.7327536344528198, + "logps/chosen": -389.1075744628906, + "logps/rejected": -378.82916259765625, + "loss": 0.6882, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.011939141899347305, + "rewards/margins": 0.013087021186947823, + "rewards/rejected": -0.025026164948940277, + "step": 47 + }, + { + "epoch": 0.06269899583639481, + "grad_norm": 5.945912831172837, + "learning_rate": 3.1168831168831165e-07, + "logits/chosen": -0.7871267795562744, + "logits/rejected": -0.8593096733093262, + "logps/chosen": -368.50732421875, + "logps/rejected": -399.8733825683594, + "loss": 0.6879, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.016629451885819435, + "rewards/margins": 0.009660568088293076, + "rewards/rejected": -0.02629002183675766, + "step": 48 + }, + { + "epoch": 0.0640052249163197, + "grad_norm": 5.761148234193876, + "learning_rate": 3.1818181818181815e-07, + "logits/chosen": -0.8592261075973511, + "logits/rejected": -0.8339124917984009, + "logps/chosen": -342.5096435546875, + "logps/rejected": -364.2750244140625, + "loss": 0.6882, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.012762600556015968, + "rewards/margins": 0.007272784598171711, + "rewards/rejected": -0.020035386085510254, + "step": 49 + }, + { + "epoch": 0.06531145399624459, + "grad_norm": 6.025184839765024, + "learning_rate": 3.2467532467532465e-07, + "logits/chosen": -0.8174738883972168, + "logits/rejected": -0.8106712102890015, + "logps/chosen": -361.9267578125, + "logps/rejected": -387.04803466796875, + "loss": 0.6863, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.014780733734369278, + "rewards/margins": 0.008798524737358093, + "rewards/rejected": -0.02357925847172737, + "step": 50 + }, + { + "epoch": 0.06661768307616948, + "grad_norm": 6.1254838917404975, + "learning_rate": 3.3116883116883115e-07, + "logits/chosen": -0.8324989676475525, + "logits/rejected": -0.858439564704895, + "logps/chosen": -397.22735595703125, + "logps/rejected": -415.16827392578125, + "loss": 0.6897, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.022905468940734863, + "rewards/margins": 0.007425696589052677, + "rewards/rejected": -0.030331166461110115, + "step": 51 + }, + { + "epoch": 0.06792391215609438, + "grad_norm": 5.1791059625450835, + "learning_rate": 3.3766233766233765e-07, + "logits/chosen": -0.7436783909797668, + "logits/rejected": -0.6987237930297852, + "logps/chosen": -337.4371032714844, + "logps/rejected": -393.50238037109375, + "loss": 0.6894, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.015715861693024635, + "rewards/margins": 0.009158104658126831, + "rewards/rejected": -0.024873966351151466, + "step": 52 + }, + { + "epoch": 0.06923014123601927, + "grad_norm": 5.593768325173036, + "learning_rate": 3.4415584415584415e-07, + "logits/chosen": -0.8788232207298279, + "logits/rejected": -0.8599483966827393, + "logps/chosen": -354.47039794921875, + "logps/rejected": -345.22979736328125, + "loss": 0.6863, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02064862847328186, + "rewards/margins": 0.010548464953899384, + "rewards/rejected": -0.031197093427181244, + "step": 53 + }, + { + "epoch": 0.07053637031594416, + "grad_norm": 6.190297216841535, + "learning_rate": 3.5064935064935066e-07, + "logits/chosen": -0.8456141948699951, + "logits/rejected": -0.8253046870231628, + "logps/chosen": -354.0450439453125, + "logps/rejected": -398.3966369628906, + "loss": 0.6877, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.022301586344838142, + "rewards/margins": 0.011066760867834091, + "rewards/rejected": -0.033368345350027084, + "step": 54 + }, + { + "epoch": 0.07184259939586905, + "grad_norm": 6.081492793618372, + "learning_rate": 3.5714285714285716e-07, + "logits/chosen": -0.8616874814033508, + "logits/rejected": -0.8373209834098816, + "logps/chosen": -374.4599914550781, + "logps/rejected": -398.0825500488281, + "loss": 0.6842, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.028480036184191704, + "rewards/margins": 0.02368302457034588, + "rewards/rejected": -0.05216306075453758, + "step": 55 + }, + { + "epoch": 0.07314882847579394, + "grad_norm": 5.829629571122562, + "learning_rate": 3.636363636363636e-07, + "logits/chosen": -0.7987987399101257, + "logits/rejected": -0.7420557737350464, + "logps/chosen": -336.6307067871094, + "logps/rejected": -336.77435302734375, + "loss": 0.685, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0241280198097229, + "rewards/margins": 0.024407856166362762, + "rewards/rejected": -0.04853588342666626, + "step": 56 + }, + { + "epoch": 0.07445505755571884, + "grad_norm": 7.817816498473776, + "learning_rate": 3.701298701298701e-07, + "logits/chosen": -0.7835097312927246, + "logits/rejected": -0.8163594007492065, + "logps/chosen": -323.8734436035156, + "logps/rejected": -351.6636047363281, + "loss": 0.682, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.013543693348765373, + "rewards/margins": 0.026210352778434753, + "rewards/rejected": -0.03975404426455498, + "step": 57 + }, + { + "epoch": 0.07576128663564373, + "grad_norm": 5.328210622720547, + "learning_rate": 3.766233766233766e-07, + "logits/chosen": -0.864203691482544, + "logits/rejected": -0.8562058806419373, + "logps/chosen": -342.7728271484375, + "logps/rejected": -363.4197692871094, + "loss": 0.6852, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.0332660973072052, + "rewards/margins": 0.01524202898144722, + "rewards/rejected": -0.04850813001394272, + "step": 58 + }, + { + "epoch": 0.07706751571556862, + "grad_norm": 5.834247719910111, + "learning_rate": 3.831168831168831e-07, + "logits/chosen": -0.958035945892334, + "logits/rejected": -0.9945738315582275, + "logps/chosen": -376.7722473144531, + "logps/rejected": -405.8808288574219, + "loss": 0.6835, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.033054254949092865, + "rewards/margins": 0.016461899504065514, + "rewards/rejected": -0.04951614886522293, + "step": 59 + }, + { + "epoch": 0.07837374479549351, + "grad_norm": 5.941158721215624, + "learning_rate": 3.896103896103896e-07, + "logits/chosen": -0.8537582159042358, + "logits/rejected": -0.8429221510887146, + "logps/chosen": -367.89715576171875, + "logps/rejected": -388.77142333984375, + "loss": 0.6804, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.024989962577819824, + "rewards/margins": 0.03262363001704216, + "rewards/rejected": -0.05761359632015228, + "step": 60 + }, + { + "epoch": 0.0796799738754184, + "grad_norm": 5.888918338304251, + "learning_rate": 3.961038961038961e-07, + "logits/chosen": -0.8765754699707031, + "logits/rejected": -0.8787387609481812, + "logps/chosen": -344.7409973144531, + "logps/rejected": -372.68890380859375, + "loss": 0.6814, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.040865350514650345, + "rewards/margins": 0.035499751567840576, + "rewards/rejected": -0.07636509835720062, + "step": 61 + }, + { + "epoch": 0.0809862029553433, + "grad_norm": 6.390786373294418, + "learning_rate": 4.025974025974026e-07, + "logits/chosen": -0.8274524211883545, + "logits/rejected": -0.7997250556945801, + "logps/chosen": -365.7718200683594, + "logps/rejected": -375.0115661621094, + "loss": 0.6842, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.04738666117191315, + "rewards/margins": 0.021708115935325623, + "rewards/rejected": -0.06909477710723877, + "step": 62 + }, + { + "epoch": 0.08229243203526819, + "grad_norm": 6.591255688502251, + "learning_rate": 4.090909090909091e-07, + "logits/chosen": -0.9231469035148621, + "logits/rejected": -0.9118146300315857, + "logps/chosen": -400.7498779296875, + "logps/rejected": -383.37457275390625, + "loss": 0.6779, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.0491667315363884, + "rewards/margins": 0.02765682339668274, + "rewards/rejected": -0.07682356238365173, + "step": 63 + }, + { + "epoch": 0.08359866111519308, + "grad_norm": 6.116625419008146, + "learning_rate": 4.155844155844156e-07, + "logits/chosen": -0.8560450077056885, + "logits/rejected": -0.8375284075737, + "logps/chosen": -407.3421630859375, + "logps/rejected": -427.3675231933594, + "loss": 0.6778, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06571590155363083, + "rewards/margins": 0.03016790747642517, + "rewards/rejected": -0.095883809030056, + "step": 64 + }, + { + "epoch": 0.08490489019511797, + "grad_norm": 6.560476520141482, + "learning_rate": 4.22077922077922e-07, + "logits/chosen": -0.6509677767753601, + "logits/rejected": -0.6574783325195312, + "logps/chosen": -342.76953125, + "logps/rejected": -382.87091064453125, + "loss": 0.6751, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.06003749370574951, + "rewards/margins": 0.036659955978393555, + "rewards/rejected": -0.09669744968414307, + "step": 65 + }, + { + "epoch": 0.08621111927504287, + "grad_norm": 7.017591808922279, + "learning_rate": 4.285714285714285e-07, + "logits/chosen": -0.8858566284179688, + "logits/rejected": -0.9486796855926514, + "logps/chosen": -410.815673828125, + "logps/rejected": -477.1343078613281, + "loss": 0.6714, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.048973143100738525, + "rewards/margins": 0.07083471864461899, + "rewards/rejected": -0.11980785429477692, + "step": 66 + }, + { + "epoch": 0.08751734835496776, + "grad_norm": 6.342251911458394, + "learning_rate": 4.35064935064935e-07, + "logits/chosen": -0.7848328948020935, + "logits/rejected": -0.8182150721549988, + "logps/chosen": -309.9764099121094, + "logps/rejected": -375.5473937988281, + "loss": 0.671, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.06594257056713104, + "rewards/margins": 0.06243371590971947, + "rewards/rejected": -0.1283762902021408, + "step": 67 + }, + { + "epoch": 0.08882357743489265, + "grad_norm": 8.15248197984531, + "learning_rate": 4.415584415584415e-07, + "logits/chosen": -0.8365911245346069, + "logits/rejected": -0.8488500118255615, + "logps/chosen": -359.4619140625, + "logps/rejected": -402.00555419921875, + "loss": 0.6739, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07775245606899261, + "rewards/margins": 0.059413328766822815, + "rewards/rejected": -0.13716578483581543, + "step": 68 + }, + { + "epoch": 0.09012980651481754, + "grad_norm": 6.142817657239046, + "learning_rate": 4.48051948051948e-07, + "logits/chosen": -0.6473894119262695, + "logits/rejected": -0.6304734349250793, + "logps/chosen": -354.01153564453125, + "logps/rejected": -414.6365661621094, + "loss": 0.6733, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07209914922714233, + "rewards/margins": 0.0502709299325943, + "rewards/rejected": -0.12237009406089783, + "step": 69 + }, + { + "epoch": 0.09143603559474243, + "grad_norm": 6.086995660494217, + "learning_rate": 4.545454545454545e-07, + "logits/chosen": -0.9957101941108704, + "logits/rejected": -0.9663684368133545, + "logps/chosen": -380.725341796875, + "logps/rejected": -373.6209716796875, + "loss": 0.6734, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12014191597700119, + "rewards/margins": 0.0520767904818058, + "rewards/rejected": -0.1722187101840973, + "step": 70 + }, + { + "epoch": 0.09274226467466731, + "grad_norm": 5.584922482955073, + "learning_rate": 4.61038961038961e-07, + "logits/chosen": -0.9089574813842773, + "logits/rejected": -0.8867455720901489, + "logps/chosen": -350.91229248046875, + "logps/rejected": -343.4925842285156, + "loss": 0.6709, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.11282657086849213, + "rewards/margins": 0.05241067707538605, + "rewards/rejected": -0.16523723304271698, + "step": 71 + }, + { + "epoch": 0.0940484937545922, + "grad_norm": 6.927477029011778, + "learning_rate": 4.675324675324675e-07, + "logits/chosen": -0.7093459963798523, + "logits/rejected": -0.7499104142189026, + "logps/chosen": -345.0081787109375, + "logps/rejected": -417.9425048828125, + "loss": 0.6599, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.09345806390047073, + "rewards/margins": 0.07576846331357956, + "rewards/rejected": -0.1692265421152115, + "step": 72 + }, + { + "epoch": 0.0953547228345171, + "grad_norm": 6.702250099967103, + "learning_rate": 4.7402597402597397e-07, + "logits/chosen": -0.818958044052124, + "logits/rejected": -0.8174691200256348, + "logps/chosen": -404.3327941894531, + "logps/rejected": -415.90478515625, + "loss": 0.6667, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.12455743551254272, + "rewards/margins": 0.05297121778130531, + "rewards/rejected": -0.17752866446971893, + "step": 73 + }, + { + "epoch": 0.09666095191444199, + "grad_norm": 6.092144279530064, + "learning_rate": 4.805194805194805e-07, + "logits/chosen": -0.7657275795936584, + "logits/rejected": -0.7777887582778931, + "logps/chosen": -379.3691711425781, + "logps/rejected": -428.69580078125, + "loss": 0.6674, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1482219696044922, + "rewards/margins": 0.06344159692525864, + "rewards/rejected": -0.21166357398033142, + "step": 74 + }, + { + "epoch": 0.09796718099436688, + "grad_norm": 6.813649848962726, + "learning_rate": 4.87012987012987e-07, + "logits/chosen": -0.7140544652938843, + "logits/rejected": -0.7011992931365967, + "logps/chosen": -348.525390625, + "logps/rejected": -352.27520751953125, + "loss": 0.6522, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16398543119430542, + "rewards/margins": 0.07341369986534119, + "rewards/rejected": -0.23739910125732422, + "step": 75 + }, + { + "epoch": 0.09927341007429177, + "grad_norm": 5.924445401824014, + "learning_rate": 4.935064935064935e-07, + "logits/chosen": -0.860354483127594, + "logits/rejected": -0.8541557788848877, + "logps/chosen": -377.2205810546875, + "logps/rejected": -426.15020751953125, + "loss": 0.6664, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.19610106945037842, + "rewards/margins": 0.06874527782201767, + "rewards/rejected": -0.2648463547229767, + "step": 76 + }, + { + "epoch": 0.10057963915421667, + "grad_norm": 6.276805637384467, + "learning_rate": 5e-07, + "logits/chosen": -0.7837976217269897, + "logits/rejected": -0.8085183501243591, + "logps/chosen": -337.7972106933594, + "logps/rejected": -410.7466735839844, + "loss": 0.6488, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18771220743656158, + "rewards/margins": 0.13911239802837372, + "rewards/rejected": -0.3268246054649353, + "step": 77 + }, + { + "epoch": 0.10188586823414156, + "grad_norm": 7.1132775197267835, + "learning_rate": 4.999973936536504e-07, + "logits/chosen": -0.8305960297584534, + "logits/rejected": -0.870134711265564, + "logps/chosen": -373.0445556640625, + "logps/rejected": -421.777587890625, + "loss": 0.6519, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.20363560318946838, + "rewards/margins": 0.08339838683605194, + "rewards/rejected": -0.2870340049266815, + "step": 78 + }, + { + "epoch": 0.10319209731406645, + "grad_norm": 7.439532614597457, + "learning_rate": 4.99989574668946e-07, + "logits/chosen": -0.66015625, + "logits/rejected": -0.6870557069778442, + "logps/chosen": -347.6210632324219, + "logps/rejected": -369.9632568359375, + "loss": 0.6488, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2359056919813156, + "rewards/margins": 0.1219453290104866, + "rewards/rejected": -0.3578509986400604, + "step": 79 + }, + { + "epoch": 0.10449832639399134, + "grad_norm": 7.430436150693382, + "learning_rate": 4.999765432089186e-07, + "logits/chosen": -0.9603585600852966, + "logits/rejected": -0.9237443208694458, + "logps/chosen": -396.7093200683594, + "logps/rejected": -388.0702819824219, + "loss": 0.6565, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.2579508423805237, + "rewards/margins": 0.1478283703327179, + "rewards/rejected": -0.4057792127132416, + "step": 80 + }, + { + "epoch": 0.10580455547391623, + "grad_norm": 7.07421240398173, + "learning_rate": 4.999582995452841e-07, + "logits/chosen": -0.9165019989013672, + "logits/rejected": -0.918879508972168, + "logps/chosen": -442.6611328125, + "logps/rejected": -442.1038513183594, + "loss": 0.6375, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.3471214771270752, + "rewards/margins": 0.2095186412334442, + "rewards/rejected": -0.5566401481628418, + "step": 81 + }, + { + "epoch": 0.10711078455384113, + "grad_norm": 7.473916645229775, + "learning_rate": 4.999348440584371e-07, + "logits/chosen": -0.831704318523407, + "logits/rejected": -0.8036379814147949, + "logps/chosen": -387.5152282714844, + "logps/rejected": -406.57977294921875, + "loss": 0.6279, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3682708144187927, + "rewards/margins": 0.06527732312679291, + "rewards/rejected": -0.43354812264442444, + "step": 82 + }, + { + "epoch": 0.10841701363376602, + "grad_norm": 6.758897060359514, + "learning_rate": 4.999061772374425e-07, + "logits/chosen": -0.8267980217933655, + "logits/rejected": -0.8135631084442139, + "logps/chosen": -360.9224853515625, + "logps/rejected": -410.9493103027344, + "loss": 0.6362, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.31843888759613037, + "rewards/margins": 0.1723370999097824, + "rewards/rejected": -0.490776002407074, + "step": 83 + }, + { + "epoch": 0.10972324271369091, + "grad_norm": 7.324188272112449, + "learning_rate": 4.998722996800258e-07, + "logits/chosen": -0.893416166305542, + "logits/rejected": -0.8873583674430847, + "logps/chosen": -422.30242919921875, + "logps/rejected": -506.13873291015625, + "loss": 0.6302, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.37821486592292786, + "rewards/margins": 0.3631640672683716, + "rewards/rejected": -0.7413789629936218, + "step": 84 + }, + { + "epoch": 0.1110294717936158, + "grad_norm": 7.071494771750178, + "learning_rate": 4.998332120925598e-07, + "logits/chosen": -0.8114030957221985, + "logits/rejected": -0.8294497132301331, + "logps/chosen": -377.10308837890625, + "logps/rejected": -420.3814392089844, + "loss": 0.6317, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3377629518508911, + "rewards/margins": 0.20530584454536438, + "rewards/rejected": -0.5430688261985779, + "step": 85 + }, + { + "epoch": 0.1123357008735407, + "grad_norm": 7.702160752771187, + "learning_rate": 4.997889152900512e-07, + "logits/chosen": -0.8865103125572205, + "logits/rejected": -0.8853594660758972, + "logps/chosen": -379.38360595703125, + "logps/rejected": -412.70294189453125, + "loss": 0.6422, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43963319063186646, + "rewards/margins": 0.18820373713970184, + "rewards/rejected": -0.6278368830680847, + "step": 86 + }, + { + "epoch": 0.11364192995346559, + "grad_norm": 7.899877403482623, + "learning_rate": 4.997394101961223e-07, + "logits/chosen": -0.738101601600647, + "logits/rejected": -0.7562313079833984, + "logps/chosen": -445.0351257324219, + "logps/rejected": -553.4995727539062, + "loss": 0.5974, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5740631222724915, + "rewards/margins": 0.38850170373916626, + "rewards/rejected": -0.9625648856163025, + "step": 87 + }, + { + "epoch": 0.11494815903339048, + "grad_norm": 7.056492791536779, + "learning_rate": 4.996846978429924e-07, + "logits/chosen": -0.8737019896507263, + "logits/rejected": -0.9100275039672852, + "logps/chosen": -349.7705383300781, + "logps/rejected": -389.4776611328125, + "loss": 0.6083, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.39956849813461304, + "rewards/margins": 0.17830373346805573, + "rewards/rejected": -0.5778722167015076, + "step": 88 + }, + { + "epoch": 0.11625438811331537, + "grad_norm": 7.42785869199897, + "learning_rate": 4.996247793714564e-07, + "logits/chosen": -0.8866239786148071, + "logits/rejected": -0.8441641330718994, + "logps/chosen": -401.234130859375, + "logps/rejected": -407.03680419921875, + "loss": 0.6002, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.5768677592277527, + "rewards/margins": 0.22373563051223755, + "rewards/rejected": -0.8006033897399902, + "step": 89 + }, + { + "epoch": 0.11756061719324026, + "grad_norm": 7.132596880162971, + "learning_rate": 4.995596560308606e-07, + "logits/chosen": -0.9549114108085632, + "logits/rejected": -0.9279428720474243, + "logps/chosen": -373.385009765625, + "logps/rejected": -369.7181091308594, + "loss": 0.6257, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5690935850143433, + "rewards/margins": 0.09207704663276672, + "rewards/rejected": -0.6611706614494324, + "step": 90 + }, + { + "epoch": 0.11886684627316516, + "grad_norm": 8.167268203168634, + "learning_rate": 4.994893291790767e-07, + "logits/chosen": -0.8127573132514954, + "logits/rejected": -0.857043981552124, + "logps/chosen": -386.78863525390625, + "logps/rejected": -473.54925537109375, + "loss": 0.6402, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6578102707862854, + "rewards/margins": 0.35495108366012573, + "rewards/rejected": -1.0127613544464111, + "step": 91 + }, + { + "epoch": 0.12017307535309005, + "grad_norm": 8.536222808457058, + "learning_rate": 4.99413800282474e-07, + "logits/chosen": -0.6337046027183533, + "logits/rejected": -0.6360065937042236, + "logps/chosen": -335.1719055175781, + "logps/rejected": -412.2095031738281, + "loss": 0.6204, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.596889078617096, + "rewards/margins": 0.350045770406723, + "rewards/rejected": -0.9469348192214966, + "step": 92 + }, + { + "epoch": 0.12147930443301494, + "grad_norm": 7.4400186544457485, + "learning_rate": 4.993330709158879e-07, + "logits/chosen": -0.9430676698684692, + "logits/rejected": -0.9217436909675598, + "logps/chosen": -443.1399230957031, + "logps/rejected": -472.9926452636719, + "loss": 0.6013, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.693689227104187, + "rewards/margins": 0.39686375856399536, + "rewards/rejected": -1.0905530452728271, + "step": 93 + }, + { + "epoch": 0.12278553351293983, + "grad_norm": 7.040196818071325, + "learning_rate": 4.992471427625881e-07, + "logits/chosen": -0.9236155152320862, + "logits/rejected": -0.9282536506652832, + "logps/chosen": -437.7912292480469, + "logps/rejected": -467.1312255859375, + "loss": 0.5769, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8054531216621399, + "rewards/margins": 0.33481860160827637, + "rewards/rejected": -1.1402716636657715, + "step": 94 + }, + { + "epoch": 0.12409176259286472, + "grad_norm": 7.456887694852602, + "learning_rate": 4.99156017614243e-07, + "logits/chosen": -0.8159753680229187, + "logits/rejected": -0.8799095749855042, + "logps/chosen": -410.9903259277344, + "logps/rejected": -498.1451416015625, + "loss": 0.5744, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7265411019325256, + "rewards/margins": 0.4974522292613983, + "rewards/rejected": -1.2239933013916016, + "step": 95 + }, + { + "epoch": 0.12539799167278962, + "grad_norm": 9.486122843338816, + "learning_rate": 4.990596973708818e-07, + "logits/chosen": -0.9695833325386047, + "logits/rejected": -0.9918654561042786, + "logps/chosen": -425.7803649902344, + "logps/rejected": -433.4793701171875, + "loss": 0.601, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8133587837219238, + "rewards/margins": 0.20483535528182983, + "rewards/rejected": -1.0181939601898193, + "step": 96 + }, + { + "epoch": 0.1267042207527145, + "grad_norm": 7.446348571195673, + "learning_rate": 4.989581840408562e-07, + "logits/chosen": -0.8435705900192261, + "logits/rejected": -0.8851275444030762, + "logps/chosen": -423.5701904296875, + "logps/rejected": -482.48553466796875, + "loss": 0.5832, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8618156909942627, + "rewards/margins": 0.42404988408088684, + "rewards/rejected": -1.2858655452728271, + "step": 97 + }, + { + "epoch": 0.1280104498326394, + "grad_norm": 7.488734645813898, + "learning_rate": 4.988514797407971e-07, + "logits/chosen": -0.8705964088439941, + "logits/rejected": -0.8357868790626526, + "logps/chosen": -428.7393798828125, + "logps/rejected": -447.38116455078125, + "loss": 0.584, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8960607647895813, + "rewards/margins": 0.2598303556442261, + "rewards/rejected": -1.1558911800384521, + "step": 98 + }, + { + "epoch": 0.1293166789125643, + "grad_norm": 8.76021914794472, + "learning_rate": 4.987395866955715e-07, + "logits/chosen": -0.8333250284194946, + "logits/rejected": -0.7638567686080933, + "logps/chosen": -388.22039794921875, + "logps/rejected": -438.1500244140625, + "loss": 0.6623, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0235893726348877, + "rewards/margins": 0.34661492705345154, + "rewards/rejected": -1.3702044486999512, + "step": 99 + }, + { + "epoch": 0.13062290799248918, + "grad_norm": 7.857957421669487, + "learning_rate": 4.986225072382356e-07, + "logits/chosen": -0.8902202844619751, + "logits/rejected": -0.9488434791564941, + "logps/chosen": -463.49908447265625, + "logps/rejected": -563.2487182617188, + "loss": 0.5603, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.0943914651870728, + "rewards/margins": 0.6192384958267212, + "rewards/rejected": -1.713630199432373, + "step": 100 + }, + { + "epoch": 0.13062290799248918, + "eval_logits/chosen": -0.72847580909729, + "eval_logits/rejected": -0.7241373062133789, + "eval_logps/chosen": -452.3144836425781, + "eval_logps/rejected": -505.3885192871094, + "eval_loss": 0.5761846303939819, + "eval_rewards/accuracies": 0.7620000243186951, + "eval_rewards/chosen": -1.0828338861465454, + "eval_rewards/margins": 0.4697641432285309, + "eval_rewards/rejected": -1.5525977611541748, + "eval_runtime": 304.0817, + "eval_samples_per_second": 6.577, + "eval_steps_per_second": 0.411, + "step": 100 + }, + { + "epoch": 0.13192913707241408, + "grad_norm": 7.635265083517575, + "learning_rate": 4.985002438099865e-07, + "logits/chosen": -0.9624214768409729, + "logits/rejected": -0.8964027166366577, + "logps/chosen": -556.7987060546875, + "logps/rejected": -576.9541015625, + "loss": 0.5888, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.1731356382369995, + "rewards/margins": 0.6622936725616455, + "rewards/rejected": -1.8354291915893555, + "step": 101 + }, + { + "epoch": 0.13323536615233897, + "grad_norm": 7.247392624675164, + "learning_rate": 4.983727989601106e-07, + "logits/chosen": -1.0717601776123047, + "logits/rejected": -1.0006539821624756, + "logps/chosen": -481.7669677734375, + "logps/rejected": -495.4006042480469, + "loss": 0.5706, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2065238952636719, + "rewards/margins": 0.4556655287742615, + "rewards/rejected": -1.6621893644332886, + "step": 102 + }, + { + "epoch": 0.13454159523226386, + "grad_norm": 7.552359919662666, + "learning_rate": 4.982401753459316e-07, + "logits/chosen": -0.8172162175178528, + "logits/rejected": -0.8234974145889282, + "logps/chosen": -443.9198303222656, + "logps/rejected": -477.24822998046875, + "loss": 0.5737, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0775035619735718, + "rewards/margins": 0.36101657152175903, + "rewards/rejected": -1.4385201930999756, + "step": 103 + }, + { + "epoch": 0.13584782431218875, + "grad_norm": 7.922818559159025, + "learning_rate": 4.981023757327539e-07, + "logits/chosen": -0.8530606031417847, + "logits/rejected": -0.9096869230270386, + "logps/chosen": -455.50677490234375, + "logps/rejected": -595.2265625, + "loss": 0.5405, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1649234294891357, + "rewards/margins": 0.7843038439750671, + "rewards/rejected": -1.9492273330688477, + "step": 104 + }, + { + "epoch": 0.13715405339211365, + "grad_norm": 8.198566814604895, + "learning_rate": 4.979594029938057e-07, + "logits/chosen": -0.9811791181564331, + "logits/rejected": -0.9687495231628418, + "logps/chosen": -421.53704833984375, + "logps/rejected": -469.3525390625, + "loss": 0.5213, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1149628162384033, + "rewards/margins": 0.4379804730415344, + "rewards/rejected": -1.552943229675293, + "step": 105 + }, + { + "epoch": 0.13846028247203854, + "grad_norm": 9.074632820163128, + "learning_rate": 4.978112601101787e-07, + "logits/chosen": -0.8650058507919312, + "logits/rejected": -0.7921556234359741, + "logps/chosen": -453.47271728515625, + "logps/rejected": -440.36822509765625, + "loss": 0.5685, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2446175813674927, + "rewards/margins": 0.22569072246551514, + "rewards/rejected": -1.4703084230422974, + "step": 106 + }, + { + "epoch": 0.13976651155196343, + "grad_norm": 8.97067518042141, + "learning_rate": 4.976579501707664e-07, + "logits/chosen": -0.9297839403152466, + "logits/rejected": -0.9460043907165527, + "logps/chosen": -449.9891662597656, + "logps/rejected": -488.8280334472656, + "loss": 0.6154, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3405613899230957, + "rewards/margins": 0.4242904484272003, + "rewards/rejected": -1.7648519277572632, + "step": 107 + }, + { + "epoch": 0.14107274063188832, + "grad_norm": 9.517296734066722, + "learning_rate": 4.97499476372199e-07, + "logits/chosen": -0.889796257019043, + "logits/rejected": -0.9328855276107788, + "logps/chosen": -496.8541564941406, + "logps/rejected": -586.541015625, + "loss": 0.5356, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3333938121795654, + "rewards/margins": 0.5474119186401367, + "rewards/rejected": -1.8808057308197021, + "step": 108 + }, + { + "epoch": 0.1423789697118132, + "grad_norm": 10.357543065107334, + "learning_rate": 4.973358420187775e-07, + "logits/chosen": -0.9294272065162659, + "logits/rejected": -0.9437241554260254, + "logps/chosen": -470.52130126953125, + "logps/rejected": -586.981689453125, + "loss": 0.5847, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3664886951446533, + "rewards/margins": 0.8229946494102478, + "rewards/rejected": -2.189483165740967, + "step": 109 + }, + { + "epoch": 0.1436851987917381, + "grad_norm": 9.267300039036453, + "learning_rate": 4.971670505224043e-07, + "logits/chosen": -0.6836638450622559, + "logits/rejected": -0.7107027173042297, + "logps/chosen": -498.22015380859375, + "logps/rejected": -625.4920043945312, + "loss": 0.5078, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.5284080505371094, + "rewards/margins": 0.881400465965271, + "rewards/rejected": -2.409808397293091, + "step": 110 + }, + { + "epoch": 0.144991427871663, + "grad_norm": 9.927126856410316, + "learning_rate": 4.969931054025121e-07, + "logits/chosen": -0.8554806709289551, + "logits/rejected": -0.8764801025390625, + "logps/chosen": -453.3316345214844, + "logps/rejected": -546.2518310546875, + "loss": 0.581, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.422972559928894, + "rewards/margins": 0.7001107335090637, + "rewards/rejected": -2.1230833530426025, + "step": 111 + }, + { + "epoch": 0.1462976569515879, + "grad_norm": 8.326539836612254, + "learning_rate": 4.968140102859908e-07, + "logits/chosen": -0.8901181817054749, + "logits/rejected": -0.8997009992599487, + "logps/chosen": -451.3870849609375, + "logps/rejected": -563.731201171875, + "loss": 0.5226, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4622441530227661, + "rewards/margins": 0.8325024843215942, + "rewards/rejected": -2.2947468757629395, + "step": 112 + }, + { + "epoch": 0.14760388603151278, + "grad_norm": 12.633047907372701, + "learning_rate": 4.966297689071116e-07, + "logits/chosen": -0.8476269841194153, + "logits/rejected": -0.9035442471504211, + "logps/chosen": -531.0331420898438, + "logps/rejected": -632.9279174804688, + "loss": 0.5371, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5847188234329224, + "rewards/margins": 0.6562604904174805, + "rewards/rejected": -2.2409791946411133, + "step": 113 + }, + { + "epoch": 0.14891011511143767, + "grad_norm": 9.252739358001087, + "learning_rate": 4.964403851074493e-07, + "logits/chosen": -0.9159296154975891, + "logits/rejected": -0.9023479223251343, + "logps/chosen": -538.9677124023438, + "logps/rejected": -565.4589233398438, + "loss": 0.5988, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9100488424301147, + "rewards/margins": 0.4137873351573944, + "rewards/rejected": -2.323836088180542, + "step": 114 + }, + { + "epoch": 0.15021634419136257, + "grad_norm": 8.76761591596254, + "learning_rate": 4.962458628358021e-07, + "logits/chosen": -0.758056640625, + "logits/rejected": -0.795865535736084, + "logps/chosen": -458.5084228515625, + "logps/rejected": -566.1270141601562, + "loss": 0.5715, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5470147132873535, + "rewards/margins": 0.7053811550140381, + "rewards/rejected": -2.2523956298828125, + "step": 115 + }, + { + "epoch": 0.15152257327128746, + "grad_norm": 8.875080902313666, + "learning_rate": 4.960462061481092e-07, + "logits/chosen": -0.8292801976203918, + "logits/rejected": -0.8678185939788818, + "logps/chosen": -449.1661376953125, + "logps/rejected": -524.7813110351562, + "loss": 0.5424, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5658118724822998, + "rewards/margins": 0.7005196809768677, + "rewards/rejected": -2.266331434249878, + "step": 116 + }, + { + "epoch": 0.15282880235121235, + "grad_norm": 9.014607302875838, + "learning_rate": 4.958414192073665e-07, + "logits/chosen": -0.9082063436508179, + "logits/rejected": -0.8753992915153503, + "logps/chosen": -563.1096801757812, + "logps/rejected": -673.3067016601562, + "loss": 0.5354, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8120057582855225, + "rewards/margins": 0.3947896659374237, + "rewards/rejected": -2.2067954540252686, + "step": 117 + }, + { + "epoch": 0.15413503143113724, + "grad_norm": 11.06100420906947, + "learning_rate": 4.956315062835396e-07, + "logits/chosen": -0.8366618156433105, + "logits/rejected": -0.8464934229850769, + "logps/chosen": -440.1483154296875, + "logps/rejected": -477.4801025390625, + "loss": 0.5435, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5700161457061768, + "rewards/margins": 0.3956853747367859, + "rewards/rejected": -1.9657015800476074, + "step": 118 + }, + { + "epoch": 0.15544126051106213, + "grad_norm": 12.513622831753462, + "learning_rate": 4.954164717534748e-07, + "logits/chosen": -0.8261559009552002, + "logits/rejected": -0.7612857222557068, + "logps/chosen": -460.90472412109375, + "logps/rejected": -490.856201171875, + "loss": 0.5343, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5897612571716309, + "rewards/margins": 0.3713700473308563, + "rewards/rejected": -1.9611313343048096, + "step": 119 + }, + { + "epoch": 0.15674748959098703, + "grad_norm": 8.160394952244957, + "learning_rate": 4.951963201008075e-07, + "logits/chosen": -0.8434547185897827, + "logits/rejected": -0.8719228506088257, + "logps/chosen": -502.63629150390625, + "logps/rejected": -562.45654296875, + "loss": 0.5379, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5950428247451782, + "rewards/margins": 0.6352701187133789, + "rewards/rejected": -2.2303130626678467, + "step": 120 + }, + { + "epoch": 0.15805371867091192, + "grad_norm": 8.672085140532355, + "learning_rate": 4.949710559158699e-07, + "logits/chosen": -0.9285653829574585, + "logits/rejected": -0.9329188466072083, + "logps/chosen": -584.3574829101562, + "logps/rejected": -701.239013671875, + "loss": 0.5285, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.002269744873047, + "rewards/margins": 0.9513869285583496, + "rewards/rejected": -2.9536566734313965, + "step": 121 + }, + { + "epoch": 0.1593599477508368, + "grad_norm": 9.635911922607669, + "learning_rate": 4.947406838955933e-07, + "logits/chosen": -0.7806838750839233, + "logits/rejected": -0.7795370221138, + "logps/chosen": -526.0626831054688, + "logps/rejected": -689.8364868164062, + "loss": 0.5524, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6684224605560303, + "rewards/margins": 1.2676262855529785, + "rewards/rejected": -2.936048746109009, + "step": 122 + }, + { + "epoch": 0.1606661768307617, + "grad_norm": 8.522293868309866, + "learning_rate": 4.945052088434123e-07, + "logits/chosen": -1.0120658874511719, + "logits/rejected": -1.0085734128952026, + "logps/chosen": -587.6144409179688, + "logps/rejected": -654.2259521484375, + "loss": 0.5153, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9400118589401245, + "rewards/margins": 0.6181426048278809, + "rewards/rejected": -2.558154582977295, + "step": 123 + }, + { + "epoch": 0.1619724059106866, + "grad_norm": 9.48236040451038, + "learning_rate": 4.942646356691631e-07, + "logits/chosen": -0.7166386246681213, + "logits/rejected": -0.7875579595565796, + "logps/chosen": -539.0728759765625, + "logps/rejected": -672.5732421875, + "loss": 0.5045, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.182424545288086, + "rewards/margins": 0.7280668616294861, + "rewards/rejected": -2.910491466522217, + "step": 124 + }, + { + "epoch": 0.1632786349906115, + "grad_norm": 8.526936686673514, + "learning_rate": 4.940189693889818e-07, + "logits/chosen": -0.988465428352356, + "logits/rejected": -1.012081265449524, + "logps/chosen": -544.8677978515625, + "logps/rejected": -642.533447265625, + "loss": 0.4646, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7634834051132202, + "rewards/margins": 0.7361648678779602, + "rewards/rejected": -2.499648332595825, + "step": 125 + }, + { + "epoch": 0.16458486407053638, + "grad_norm": 12.786714115254995, + "learning_rate": 4.937682151251997e-07, + "logits/chosen": -0.9385434985160828, + "logits/rejected": -0.8883792161941528, + "logps/chosen": -572.0708618164062, + "logps/rejected": -620.1557006835938, + "loss": 0.5329, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0728726387023926, + "rewards/margins": 0.634572446346283, + "rewards/rejected": -2.7074453830718994, + "step": 126 + }, + { + "epoch": 0.16589109315046127, + "grad_norm": 9.282917945488935, + "learning_rate": 4.935123781062365e-07, + "logits/chosen": -0.9921805262565613, + "logits/rejected": -0.9778479337692261, + "logps/chosen": -545.0863647460938, + "logps/rejected": -655.0972290039062, + "loss": 0.5103, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.6734533309936523, + "rewards/margins": 1.0550148487091064, + "rewards/rejected": -2.7284679412841797, + "step": 127 + }, + { + "epoch": 0.16719732223038616, + "grad_norm": 9.229465913733174, + "learning_rate": 4.932514636664913e-07, + "logits/chosen": -0.9057982563972473, + "logits/rejected": -0.8632020950317383, + "logps/chosen": -469.0540771484375, + "logps/rejected": -525.9947509765625, + "loss": 0.4629, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.562605619430542, + "rewards/margins": 0.8607323169708252, + "rewards/rejected": -2.423337936401367, + "step": 128 + }, + { + "epoch": 0.16850355131031106, + "grad_norm": 8.962747801495956, + "learning_rate": 4.929854772462311e-07, + "logits/chosen": -0.8092485070228577, + "logits/rejected": -0.7825321555137634, + "logps/chosen": -504.3341064453125, + "logps/rejected": -535.06396484375, + "loss": 0.5647, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.720110535621643, + "rewards/margins": 0.45117801427841187, + "rewards/rejected": -2.1712887287139893, + "step": 129 + }, + { + "epoch": 0.16980978039023595, + "grad_norm": 9.67145433756404, + "learning_rate": 4.927144243914781e-07, + "logits/chosen": -0.804796040058136, + "logits/rejected": -0.8286614418029785, + "logps/chosen": -493.044677734375, + "logps/rejected": -635.99560546875, + "loss": 0.4706, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.03037691116333, + "rewards/margins": 1.352684497833252, + "rewards/rejected": -3.383061170578003, + "step": 130 + }, + { + "epoch": 0.17111600947016084, + "grad_norm": 8.961180482268233, + "learning_rate": 4.924383107538929e-07, + "logits/chosen": -0.9036099910736084, + "logits/rejected": -0.9011116027832031, + "logps/chosen": -551.7821655273438, + "logps/rejected": -679.7416381835938, + "loss": 0.5407, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1092605590820312, + "rewards/margins": 0.9267193675041199, + "rewards/rejected": -3.035979747772217, + "step": 131 + }, + { + "epoch": 0.17242223855008573, + "grad_norm": 11.220325409910862, + "learning_rate": 4.921571420906578e-07, + "logits/chosen": -0.872887372970581, + "logits/rejected": -0.8407180309295654, + "logps/chosen": -550.0153198242188, + "logps/rejected": -656.0322875976562, + "loss": 0.5722, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.172218084335327, + "rewards/margins": 0.5464658141136169, + "rewards/rejected": -2.718684196472168, + "step": 132 + }, + { + "epoch": 0.17372846763001062, + "grad_norm": 9.18653059128084, + "learning_rate": 4.918709242643563e-07, + "logits/chosen": -0.8891811370849609, + "logits/rejected": -0.9292304515838623, + "logps/chosen": -517.5161743164062, + "logps/rejected": -619.11328125, + "loss": 0.4595, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.8330395221710205, + "rewards/margins": 0.7570334076881409, + "rewards/rejected": -2.5900731086730957, + "step": 133 + }, + { + "epoch": 0.17503469670993552, + "grad_norm": 9.18457403544812, + "learning_rate": 4.915796632428505e-07, + "logits/chosen": -0.9480959177017212, + "logits/rejected": -0.9999913573265076, + "logps/chosen": -586.74169921875, + "logps/rejected": -696.614013671875, + "loss": 0.4629, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.009488105773926, + "rewards/margins": 0.9081063866615295, + "rewards/rejected": -2.9175941944122314, + "step": 134 + }, + { + "epoch": 0.1763409257898604, + "grad_norm": 11.727197619914456, + "learning_rate": 4.912833650991573e-07, + "logits/chosen": -0.8458810448646545, + "logits/rejected": -0.8598436713218689, + "logps/chosen": -524.3538818359375, + "logps/rejected": -614.8167724609375, + "loss": 0.5901, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.777543544769287, + "rewards/margins": 1.0405200719833374, + "rewards/rejected": -2.818063735961914, + "step": 135 + }, + { + "epoch": 0.1776471548697853, + "grad_norm": 9.191709773975337, + "learning_rate": 4.909820360113213e-07, + "logits/chosen": -0.9621294736862183, + "logits/rejected": -1.0132098197937012, + "logps/chosen": -567.4052734375, + "logps/rejected": -681.6702880859375, + "loss": 0.4652, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.1079905033111572, + "rewards/margins": 0.9598952531814575, + "rewards/rejected": -3.0678858757019043, + "step": 136 + }, + { + "epoch": 0.1789533839497102, + "grad_norm": 9.346469893179457, + "learning_rate": 4.906756822622864e-07, + "logits/chosen": -0.8825656175613403, + "logits/rejected": -0.9310536980628967, + "logps/chosen": -524.7208862304688, + "logps/rejected": -718.4395141601562, + "loss": 0.4943, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8902442455291748, + "rewards/margins": 1.395514726638794, + "rewards/rejected": -3.2857589721679688, + "step": 137 + }, + { + "epoch": 0.18025961302963509, + "grad_norm": 8.885243669741756, + "learning_rate": 4.903643102397643e-07, + "logits/chosen": -0.8142070174217224, + "logits/rejected": -0.8311284780502319, + "logps/chosen": -557.0999145507812, + "logps/rejected": -668.8630981445312, + "loss": 0.4997, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.0316572189331055, + "rewards/margins": 0.9219380617141724, + "rewards/rejected": -2.9535951614379883, + "step": 138 + }, + { + "epoch": 0.18156584210955998, + "grad_norm": 10.223180789734753, + "learning_rate": 4.900479264361018e-07, + "logits/chosen": -0.9322543740272522, + "logits/rejected": -0.9504646062850952, + "logps/chosen": -603.5999755859375, + "logps/rejected": -702.2202758789062, + "loss": 0.4762, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.2028281688690186, + "rewards/margins": 0.9659155011177063, + "rewards/rejected": -3.16874361038208, + "step": 139 + }, + { + "epoch": 0.18287207118948487, + "grad_norm": 9.20081752161527, + "learning_rate": 4.897265374481447e-07, + "logits/chosen": -0.8732661008834839, + "logits/rejected": -0.9553130269050598, + "logps/chosen": -511.3672790527344, + "logps/rejected": -646.978759765625, + "loss": 0.468, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8701015710830688, + "rewards/margins": 0.940824031829834, + "rewards/rejected": -2.8109254837036133, + "step": 140 + }, + { + "epoch": 0.18417830026940976, + "grad_norm": 9.432685979588154, + "learning_rate": 4.894001499771015e-07, + "logits/chosen": -0.9715933799743652, + "logits/rejected": -0.8695637583732605, + "logps/chosen": -649.057373046875, + "logps/rejected": -643.0631103515625, + "loss": 0.4503, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4942514896392822, + "rewards/margins": 0.5338544249534607, + "rewards/rejected": -3.0281057357788086, + "step": 141 + }, + { + "epoch": 0.18548452934933463, + "grad_norm": 13.104777321859196, + "learning_rate": 4.890687708284024e-07, + "logits/chosen": -0.9079119563102722, + "logits/rejected": -0.9527072906494141, + "logps/chosen": -589.8755493164062, + "logps/rejected": -756.660400390625, + "loss": 0.4617, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2556536197662354, + "rewards/margins": 1.3938080072402954, + "rewards/rejected": -3.649461507797241, + "step": 142 + }, + { + "epoch": 0.18679075842925952, + "grad_norm": 10.528385278196993, + "learning_rate": 4.887324069115581e-07, + "logits/chosen": -0.9083209037780762, + "logits/rejected": -0.8825032711029053, + "logps/chosen": -600.8740234375, + "logps/rejected": -660.4830322265625, + "loss": 0.5546, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.31591796875, + "rewards/margins": 0.6303241848945618, + "rewards/rejected": -2.946242332458496, + "step": 143 + }, + { + "epoch": 0.1880969875091844, + "grad_norm": 13.644717006371213, + "learning_rate": 4.883910652400155e-07, + "logits/chosen": -0.8000559210777283, + "logits/rejected": -0.8278550505638123, + "logps/chosen": -562.575927734375, + "logps/rejected": -690.1162109375, + "loss": 0.4827, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.350778102874756, + "rewards/margins": 0.9358606338500977, + "rewards/rejected": -3.2866384983062744, + "step": 144 + }, + { + "epoch": 0.1894032165891093, + "grad_norm": 11.442706123817796, + "learning_rate": 4.880447529310118e-07, + "logits/chosen": -0.7975379824638367, + "logits/rejected": -0.8202475309371948, + "logps/chosen": -589.868408203125, + "logps/rejected": -746.5616455078125, + "loss": 0.5009, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.4633235931396484, + "rewards/margins": 1.171708106994629, + "rewards/rejected": -3.6350317001342773, + "step": 145 + }, + { + "epoch": 0.1907094456690342, + "grad_norm": 10.565974328419516, + "learning_rate": 4.876934772054251e-07, + "logits/chosen": -0.7488982081413269, + "logits/rejected": -0.810793399810791, + "logps/chosen": -613.113525390625, + "logps/rejected": -894.685302734375, + "loss": 0.4131, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.57572865486145, + "rewards/margins": 2.021328926086426, + "rewards/rejected": -4.597057342529297, + "step": 146 + }, + { + "epoch": 0.1920156747489591, + "grad_norm": 11.457022715404966, + "learning_rate": 4.873372453876254e-07, + "logits/chosen": -0.8528122305870056, + "logits/rejected": -0.9192670583724976, + "logps/chosen": -590.0909423828125, + "logps/rejected": -798.7685546875, + "loss": 0.4564, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5091986656188965, + "rewards/margins": 1.6112761497497559, + "rewards/rejected": -4.120474815368652, + "step": 147 + }, + { + "epoch": 0.19332190382888398, + "grad_norm": 13.441936360102783, + "learning_rate": 4.869760649053207e-07, + "logits/chosen": -0.8818020820617676, + "logits/rejected": -0.8745623230934143, + "logps/chosen": -628.12109375, + "logps/rejected": -758.0457763671875, + "loss": 0.5465, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.561213493347168, + "rewards/margins": 1.085404872894287, + "rewards/rejected": -3.646618366241455, + "step": 148 + }, + { + "epoch": 0.19462813290880887, + "grad_norm": 11.44437228846766, + "learning_rate": 4.866099432894024e-07, + "logits/chosen": -0.7465147376060486, + "logits/rejected": -0.7887036800384521, + "logps/chosen": -581.6749267578125, + "logps/rejected": -763.8986206054688, + "loss": 0.4528, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.4309072494506836, + "rewards/margins": 1.333343505859375, + "rewards/rejected": -3.7642509937286377, + "step": 149 + }, + { + "epoch": 0.19593436198873376, + "grad_norm": 10.261817592185366, + "learning_rate": 4.862388881737881e-07, + "logits/chosen": -0.7357074022293091, + "logits/rejected": -0.7889531850814819, + "logps/chosen": -592.6793823242188, + "logps/rejected": -763.1603393554688, + "loss": 0.4495, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.7371535301208496, + "rewards/margins": 1.1889464855194092, + "rewards/rejected": -3.926100254058838, + "step": 150 + }, + { + "epoch": 0.19724059106865865, + "grad_norm": 9.944126517301331, + "learning_rate": 4.858629072952634e-07, + "logits/chosen": -0.7918027639389038, + "logits/rejected": -0.8364720344543457, + "logps/chosen": -590.2254028320312, + "logps/rejected": -745.180908203125, + "loss": 0.4616, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.5804126262664795, + "rewards/margins": 0.9752347469329834, + "rewards/rejected": -3.555647134780884, + "step": 151 + }, + { + "epoch": 0.19854682014858355, + "grad_norm": 8.866243575571254, + "learning_rate": 4.854820084933192e-07, + "logits/chosen": -0.8944368958473206, + "logits/rejected": -0.9187857508659363, + "logps/chosen": -660.8124389648438, + "logps/rejected": -761.919677734375, + "loss": 0.4311, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6248433589935303, + "rewards/margins": 1.121117115020752, + "rewards/rejected": -3.745960235595703, + "step": 152 + }, + { + "epoch": 0.19985304922850844, + "grad_norm": 14.410936928898042, + "learning_rate": 4.850961997099892e-07, + "logits/chosen": -0.9639002680778503, + "logits/rejected": -0.9395177364349365, + "logps/chosen": -597.547607421875, + "logps/rejected": -680.5858764648438, + "loss": 0.4692, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.559708595275879, + "rewards/margins": 0.8148014545440674, + "rewards/rejected": -3.374509811401367, + "step": 153 + }, + { + "epoch": 0.20115927830843333, + "grad_norm": 11.24933528745637, + "learning_rate": 4.847054889896838e-07, + "logits/chosen": -0.8532655239105225, + "logits/rejected": -0.9007124900817871, + "logps/chosen": -621.040771484375, + "logps/rejected": -810.769287109375, + "loss": 0.4895, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.8787829875946045, + "rewards/margins": 1.3338385820388794, + "rewards/rejected": -4.212621212005615, + "step": 154 + }, + { + "epoch": 0.20246550738835822, + "grad_norm": 17.235667735611003, + "learning_rate": 4.843098844790228e-07, + "logits/chosen": -0.8356879353523254, + "logits/rejected": -0.7729538083076477, + "logps/chosen": -607.823486328125, + "logps/rejected": -642.9387817382812, + "loss": 0.5611, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.6873245239257812, + "rewards/margins": 0.5770570039749146, + "rewards/rejected": -3.2643816471099854, + "step": 155 + }, + { + "epoch": 0.20377173646828312, + "grad_norm": 15.801541834876094, + "learning_rate": 4.83909394426665e-07, + "logits/chosen": -0.8892329931259155, + "logits/rejected": -0.8549962043762207, + "logps/chosen": -718.0670166015625, + "logps/rejected": -855.0306396484375, + "loss": 0.5509, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.1425247192382812, + "rewards/margins": 1.2918095588684082, + "rewards/rejected": -4.434334754943848, + "step": 156 + }, + { + "epoch": 0.205077965548208, + "grad_norm": 22.929329377097073, + "learning_rate": 4.83504027183137e-07, + "logits/chosen": -0.9463051557540894, + "logits/rejected": -0.9906477332115173, + "logps/chosen": -636.51904296875, + "logps/rejected": -810.24169921875, + "loss": 0.4337, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.714447498321533, + "rewards/margins": 1.3694745302200317, + "rewards/rejected": -4.083921432495117, + "step": 157 + }, + { + "epoch": 0.2063841946281329, + "grad_norm": 11.639690209880769, + "learning_rate": 4.83093791200658e-07, + "logits/chosen": -0.7416931390762329, + "logits/rejected": -0.7359007596969604, + "logps/chosen": -532.9208984375, + "logps/rejected": -670.9202880859375, + "loss": 0.4256, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3252341747283936, + "rewards/margins": 1.2705352306365967, + "rewards/rejected": -3.595769166946411, + "step": 158 + }, + { + "epoch": 0.2076904237080578, + "grad_norm": 15.270391743947712, + "learning_rate": 4.826786950329646e-07, + "logits/chosen": -0.8616015315055847, + "logits/rejected": -0.8714800477027893, + "logps/chosen": -603.2241821289062, + "logps/rejected": -741.3072509765625, + "loss": 0.5139, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.4090378284454346, + "rewards/margins": 1.1060690879821777, + "rewards/rejected": -3.5151071548461914, + "step": 159 + }, + { + "epoch": 0.20899665278798268, + "grad_norm": 8.63825492932275, + "learning_rate": 4.822587473351316e-07, + "logits/chosen": -0.8719635009765625, + "logits/rejected": -0.9493328928947449, + "logps/chosen": -539.3067016601562, + "logps/rejected": -690.457275390625, + "loss": 0.423, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1986799240112305, + "rewards/margins": 1.2904539108276367, + "rewards/rejected": -3.489133834838867, + "step": 160 + }, + { + "epoch": 0.21030288186790758, + "grad_norm": 14.424579182139063, + "learning_rate": 4.818339568633926e-07, + "logits/chosen": -0.940915048122406, + "logits/rejected": -0.9314707517623901, + "logps/chosen": -586.869384765625, + "logps/rejected": -641.973876953125, + "loss": 0.4428, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.6642723083496094, + "rewards/margins": 0.6295632123947144, + "rewards/rejected": -3.2938356399536133, + "step": 161 + }, + { + "epoch": 0.21160911094783247, + "grad_norm": 10.626168680350675, + "learning_rate": 4.81404332474956e-07, + "logits/chosen": -0.7765993475914001, + "logits/rejected": -0.783936619758606, + "logps/chosen": -682.2221069335938, + "logps/rejected": -807.446533203125, + "loss": 0.4454, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9070258140563965, + "rewards/margins": 1.1761747598648071, + "rewards/rejected": -4.083200454711914, + "step": 162 + }, + { + "epoch": 0.21291534002775736, + "grad_norm": 13.762353067778275, + "learning_rate": 4.809698831278217e-07, + "logits/chosen": -0.9006916284561157, + "logits/rejected": -0.882185697555542, + "logps/chosen": -582.73046875, + "logps/rejected": -688.5364379882812, + "loss": 0.4343, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.400038242340088, + "rewards/margins": 1.1299452781677246, + "rewards/rejected": -3.5299835205078125, + "step": 163 + }, + { + "epoch": 0.21422156910768225, + "grad_norm": 15.59615888833518, + "learning_rate": 4.805306178805933e-07, + "logits/chosen": -0.8027355074882507, + "logits/rejected": -0.7851669788360596, + "logps/chosen": -634.9528198242188, + "logps/rejected": -750.8024291992188, + "loss": 0.5245, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.8227155208587646, + "rewards/margins": 1.1176807880401611, + "rewards/rejected": -3.940396308898926, + "step": 164 + }, + { + "epoch": 0.21552779818760714, + "grad_norm": 10.146904391466114, + "learning_rate": 4.800865458922898e-07, + "logits/chosen": -0.8721917867660522, + "logits/rejected": -0.9246646165847778, + "logps/chosen": -511.4060974121094, + "logps/rejected": -643.2660522460938, + "loss": 0.4669, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.18168568611145, + "rewards/margins": 0.9130581617355347, + "rewards/rejected": -3.0947437286376953, + "step": 165 + }, + { + "epoch": 0.21683402726753204, + "grad_norm": 13.069612323257635, + "learning_rate": 4.796376764221546e-07, + "logits/chosen": -1.075876235961914, + "logits/rejected": -1.0736372470855713, + "logps/chosen": -720.34033203125, + "logps/rejected": -769.7064819335938, + "loss": 0.4463, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6499907970428467, + "rewards/margins": 0.8865436911582947, + "rewards/rejected": -3.536534309387207, + "step": 166 + }, + { + "epoch": 0.21814025634745693, + "grad_norm": 13.680715613764924, + "learning_rate": 4.791840188294619e-07, + "logits/chosen": -0.9063047766685486, + "logits/rejected": -0.8700284361839294, + "logps/chosen": -595.5171508789062, + "logps/rejected": -717.4872436523438, + "loss": 0.4196, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.565876007080078, + "rewards/margins": 1.157697081565857, + "rewards/rejected": -3.7235727310180664, + "step": 167 + }, + { + "epoch": 0.21944648542738182, + "grad_norm": 26.410003114025304, + "learning_rate": 4.787255825733224e-07, + "logits/chosen": -0.8095680475234985, + "logits/rejected": -0.8170436024665833, + "logps/chosen": -659.5059204101562, + "logps/rejected": -702.8510131835938, + "loss": 0.4668, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.0381665229797363, + "rewards/margins": 0.7416744828224182, + "rewards/rejected": -3.7798409461975098, + "step": 168 + }, + { + "epoch": 0.2207527145073067, + "grad_norm": 12.27297441753684, + "learning_rate": 4.782623772124855e-07, + "logits/chosen": -0.7508028149604797, + "logits/rejected": -0.845954954624176, + "logps/chosen": -651.4632568359375, + "logps/rejected": -898.3553466796875, + "loss": 0.4654, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.235539674758911, + "rewards/margins": 1.770397424697876, + "rewards/rejected": -5.005937099456787, + "step": 169 + }, + { + "epoch": 0.2220589435872316, + "grad_norm": 11.56139650007245, + "learning_rate": 4.777944124051395e-07, + "logits/chosen": -0.8815621137619019, + "logits/rejected": -0.8710463047027588, + "logps/chosen": -595.5137329101562, + "logps/rejected": -665.1507568359375, + "loss": 0.4484, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6465539932250977, + "rewards/margins": 0.8060583472251892, + "rewards/rejected": -3.4526124000549316, + "step": 170 + }, + { + "epoch": 0.2233651726671565, + "grad_norm": 16.94182109300071, + "learning_rate": 4.773216979087119e-07, + "logits/chosen": -0.8682816028594971, + "logits/rejected": -0.9051592946052551, + "logps/chosen": -668.440185546875, + "logps/rejected": -770.4189453125, + "loss": 0.5038, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.168666124343872, + "rewards/margins": 0.9612530469894409, + "rewards/rejected": -4.129919528961182, + "step": 171 + }, + { + "epoch": 0.2246714017470814, + "grad_norm": 13.911281202814001, + "learning_rate": 4.768442435796639e-07, + "logits/chosen": -0.9317069053649902, + "logits/rejected": -0.9124513864517212, + "logps/chosen": -647.5184936523438, + "logps/rejected": -706.0861206054688, + "loss": 0.4405, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8290905952453613, + "rewards/margins": 0.7692254781723022, + "rewards/rejected": -3.598315954208374, + "step": 172 + }, + { + "epoch": 0.22597763082700628, + "grad_norm": 15.550040420673614, + "learning_rate": 4.7636205937328664e-07, + "logits/chosen": -0.7879063487052917, + "logits/rejected": -0.6880006790161133, + "logps/chosen": -639.0218505859375, + "logps/rejected": -654.2172241210938, + "loss": 0.5298, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.9527385234832764, + "rewards/margins": 0.6208396553993225, + "rewards/rejected": -3.573578357696533, + "step": 173 + }, + { + "epoch": 0.22728385990693117, + "grad_norm": 15.622889611331605, + "learning_rate": 4.758751553434922e-07, + "logits/chosen": -0.8385047316551208, + "logits/rejected": -0.8385977745056152, + "logps/chosen": -650.1749877929688, + "logps/rejected": -773.8004760742188, + "loss": 0.4689, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.798471450805664, + "rewards/margins": 1.2958422899246216, + "rewards/rejected": -4.094313621520996, + "step": 174 + }, + { + "epoch": 0.22859008898685607, + "grad_norm": 10.630697667093651, + "learning_rate": 4.753835416426051e-07, + "logits/chosen": -0.7726784348487854, + "logits/rejected": -0.7915071845054626, + "logps/chosen": -616.429443359375, + "logps/rejected": -742.1764526367188, + "loss": 0.4527, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.6264476776123047, + "rewards/margins": 1.346327304840088, + "rewards/rejected": -3.9727747440338135, + "step": 175 + }, + { + "epoch": 0.22989631806678096, + "grad_norm": 21.621277056488225, + "learning_rate": 4.748872285211498e-07, + "logits/chosen": -0.766167163848877, + "logits/rejected": -0.7375807762145996, + "logps/chosen": -612.8403930664062, + "logps/rejected": -690.181884765625, + "loss": 0.495, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.520641326904297, + "rewards/margins": 0.8460010886192322, + "rewards/rejected": -3.3666419982910156, + "step": 176 + }, + { + "epoch": 0.23120254714670585, + "grad_norm": 21.05225240395737, + "learning_rate": 4.743862263276376e-07, + "logits/chosen": -0.7831792831420898, + "logits/rejected": -0.7660663723945618, + "logps/chosen": -684.431640625, + "logps/rejected": -809.43798828125, + "loss": 0.4497, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.232773542404175, + "rewards/margins": 1.158035159111023, + "rewards/rejected": -4.390808582305908, + "step": 177 + }, + { + "epoch": 0.23250877622663074, + "grad_norm": 15.628810528649288, + "learning_rate": 4.738805455083502e-07, + "logits/chosen": -0.8534025549888611, + "logits/rejected": -0.8425536155700684, + "logps/chosen": -617.8245239257812, + "logps/rejected": -676.0670166015625, + "loss": 0.5264, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6181697845458984, + "rewards/margins": 0.8310192823410034, + "rewards/rejected": -3.4491891860961914, + "step": 178 + }, + { + "epoch": 0.23381500530655563, + "grad_norm": 12.752693104616785, + "learning_rate": 4.7337019660712254e-07, + "logits/chosen": -0.8026778101921082, + "logits/rejected": -0.8057217001914978, + "logps/chosen": -564.225830078125, + "logps/rejected": -793.5078125, + "loss": 0.471, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.4254250526428223, + "rewards/margins": 1.4830188751220703, + "rewards/rejected": -3.9084439277648926, + "step": 179 + }, + { + "epoch": 0.23512123438648053, + "grad_norm": 14.774928040537697, + "learning_rate": 4.7285519026512267e-07, + "logits/chosen": -0.8348312377929688, + "logits/rejected": -0.8021372556686401, + "logps/chosen": -633.8662109375, + "logps/rejected": -700.7567138671875, + "loss": 0.4983, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.7143614292144775, + "rewards/margins": 0.7620492577552795, + "rewards/rejected": -3.4764111042022705, + "step": 180 + }, + { + "epoch": 0.23642746346640542, + "grad_norm": 11.622812039115109, + "learning_rate": 4.723355372206297e-07, + "logits/chosen": -0.8993362188339233, + "logits/rejected": -0.9162436127662659, + "logps/chosen": -591.3616333007812, + "logps/rejected": -703.9219970703125, + "loss": 0.4848, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3152692317962646, + "rewards/margins": 0.9252865314483643, + "rewards/rejected": -3.240555763244629, + "step": 181 + }, + { + "epoch": 0.2377336925463303, + "grad_norm": 15.982569238831562, + "learning_rate": 4.718112483088102e-07, + "logits/chosen": -0.695371150970459, + "logits/rejected": -0.7330318093299866, + "logps/chosen": -648.0274047851562, + "logps/rejected": -778.1810302734375, + "loss": 0.5135, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.06677508354187, + "rewards/margins": 1.2116183042526245, + "rewards/rejected": -4.278393745422363, + "step": 182 + }, + { + "epoch": 0.2390399216262552, + "grad_norm": 14.477017877144629, + "learning_rate": 4.7128233446149205e-07, + "logits/chosen": -0.8352495431900024, + "logits/rejected": -0.7390860915184021, + "logps/chosen": -579.0858154296875, + "logps/rejected": -652.7052001953125, + "loss": 0.4737, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.529910087585449, + "rewards/margins": 0.8141626119613647, + "rewards/rejected": -3.3440728187561035, + "step": 183 + }, + { + "epoch": 0.2403461507061801, + "grad_norm": 14.969475270165255, + "learning_rate": 4.7074880670693673e-07, + "logits/chosen": -0.8044037818908691, + "logits/rejected": -0.7950783967971802, + "logps/chosen": -594.140869140625, + "logps/rejected": -658.424560546875, + "loss": 0.509, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.541053295135498, + "rewards/margins": 0.5490034818649292, + "rewards/rejected": -3.090056896209717, + "step": 184 + }, + { + "epoch": 0.241652379786105, + "grad_norm": 9.781489605816779, + "learning_rate": 4.702106761696091e-07, + "logits/chosen": -0.701477587223053, + "logits/rejected": -0.785467267036438, + "logps/chosen": -580.6058349609375, + "logps/rejected": -789.3968505859375, + "loss": 0.4721, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.574538230895996, + "rewards/margins": 1.637567162513733, + "rewards/rejected": -4.2121052742004395, + "step": 185 + }, + { + "epoch": 0.24295860886602988, + "grad_norm": 12.354874815334579, + "learning_rate": 4.6966795406994564e-07, + "logits/chosen": -0.8601551055908203, + "logits/rejected": -0.8328098654747009, + "logps/chosen": -630.858642578125, + "logps/rejected": -707.3577880859375, + "loss": 0.4537, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7662644386291504, + "rewards/margins": 0.8900072574615479, + "rewards/rejected": -3.6562719345092773, + "step": 186 + }, + { + "epoch": 0.24426483794595477, + "grad_norm": 17.448659280928165, + "learning_rate": 4.6912065172412046e-07, + "logits/chosen": -0.9235214591026306, + "logits/rejected": -0.8834742307662964, + "logps/chosen": -673.1686401367188, + "logps/rejected": -738.9641723632812, + "loss": 0.4663, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8677878379821777, + "rewards/margins": 0.9876227378845215, + "rewards/rejected": -3.8554108142852783, + "step": 187 + }, + { + "epoch": 0.24557106702587966, + "grad_norm": 12.47966408616588, + "learning_rate": 4.685687805438094e-07, + "logits/chosen": -0.8535528182983398, + "logits/rejected": -0.8522934317588806, + "logps/chosen": -629.5390625, + "logps/rejected": -729.8989868164062, + "loss": 0.4595, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8061907291412354, + "rewards/margins": 0.9717488884925842, + "rewards/rejected": -3.7779393196105957, + "step": 188 + }, + { + "epoch": 0.24687729610580456, + "grad_norm": 19.228380348984462, + "learning_rate": 4.680123520359519e-07, + "logits/chosen": -0.7098379731178284, + "logits/rejected": -0.6918249130249023, + "logps/chosen": -640.8948364257812, + "logps/rejected": -736.1873779296875, + "loss": 0.4534, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9255459308624268, + "rewards/margins": 1.0353283882141113, + "rewards/rejected": -3.960874080657959, + "step": 189 + }, + { + "epoch": 0.24818352518572945, + "grad_norm": 13.78796204485732, + "learning_rate": 4.674513778025112e-07, + "logits/chosen": -0.7351849675178528, + "logits/rejected": -0.7797478437423706, + "logps/chosen": -515.9529418945312, + "logps/rejected": -650.210693359375, + "loss": 0.4358, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3447022438049316, + "rewards/margins": 1.2216882705688477, + "rewards/rejected": -3.566390037536621, + "step": 190 + }, + { + "epoch": 0.24948975426565434, + "grad_norm": 16.198151345068265, + "learning_rate": 4.6688586954023255e-07, + "logits/chosen": -0.649688184261322, + "logits/rejected": -0.7320199012756348, + "logps/chosen": -569.9924926757812, + "logps/rejected": -789.763916015625, + "loss": 0.3996, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.4391114711761475, + "rewards/margins": 1.7697255611419678, + "rewards/rejected": -4.208836555480957, + "step": 191 + }, + { + "epoch": 0.25079598334557923, + "grad_norm": 11.119564441609715, + "learning_rate": 4.663158390403991e-07, + "logits/chosen": -0.8746720552444458, + "logits/rejected": -0.8720070123672485, + "logps/chosen": -556.7640991210938, + "logps/rejected": -648.1425170898438, + "loss": 0.4835, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.4596023559570312, + "rewards/margins": 0.8872272968292236, + "rewards/rejected": -3.346829652786255, + "step": 192 + }, + { + "epoch": 0.2521022124255041, + "grad_norm": 16.9682810475937, + "learning_rate": 4.657412981885861e-07, + "logits/chosen": -0.8757528066635132, + "logits/rejected": -0.8742597699165344, + "logps/chosen": -655.8448486328125, + "logps/rejected": -716.5648803710938, + "loss": 0.4632, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7153611183166504, + "rewards/margins": 0.9040799140930176, + "rewards/rejected": -3.619441032409668, + "step": 193 + }, + { + "epoch": 0.253408441505429, + "grad_norm": 12.940046755041541, + "learning_rate": 4.651622589644132e-07, + "logits/chosen": -0.853613555431366, + "logits/rejected": -0.9235856533050537, + "logps/chosen": -631.1458740234375, + "logps/rejected": -784.771240234375, + "loss": 0.4331, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.815965175628662, + "rewards/margins": 1.0641415119171143, + "rewards/rejected": -3.8801069259643555, + "step": 194 + }, + { + "epoch": 0.2547146705853539, + "grad_norm": 15.053926223340259, + "learning_rate": 4.6457873344129443e-07, + "logits/chosen": -0.8052849769592285, + "logits/rejected": -0.867551863193512, + "logps/chosen": -657.406494140625, + "logps/rejected": -894.9517211914062, + "loss": 0.473, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.0834543704986572, + "rewards/margins": 1.7164838314056396, + "rewards/rejected": -4.799938201904297, + "step": 195 + }, + { + "epoch": 0.2560208996652788, + "grad_norm": 14.927847637838813, + "learning_rate": 4.639907337861869e-07, + "logits/chosen": -0.8298658728599548, + "logits/rejected": -0.8566642999649048, + "logps/chosen": -635.9237060546875, + "logps/rejected": -793.529541015625, + "loss": 0.4053, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.755927562713623, + "rewards/margins": 1.3652548789978027, + "rewards/rejected": -4.121182441711426, + "step": 196 + }, + { + "epoch": 0.2573271287452037, + "grad_norm": 12.418504870497374, + "learning_rate": 4.6339827225933657e-07, + "logits/chosen": -0.8944472670555115, + "logits/rejected": -0.8577872514724731, + "logps/chosen": -650.7933959960938, + "logps/rejected": -742.0443725585938, + "loss": 0.4086, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.070491313934326, + "rewards/margins": 1.0094940662384033, + "rewards/rejected": -4.079985618591309, + "step": 197 + }, + { + "epoch": 0.2586333578251286, + "grad_norm": 12.08101650781618, + "learning_rate": 4.62801361214023e-07, + "logits/chosen": -0.8086925745010376, + "logits/rejected": -0.856459379196167, + "logps/chosen": -634.4454956054688, + "logps/rejected": -836.5366821289062, + "loss": 0.4246, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.9828410148620605, + "rewards/margins": 1.5875557661056519, + "rewards/rejected": -4.570396900177002, + "step": 198 + }, + { + "epoch": 0.2599395869050535, + "grad_norm": 19.136812624206406, + "learning_rate": 4.622000130963014e-07, + "logits/chosen": -0.7772096395492554, + "logits/rejected": -0.8084428310394287, + "logps/chosen": -703.0916748046875, + "logps/rejected": -838.975830078125, + "loss": 0.4351, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.5541677474975586, + "rewards/margins": 1.2203888893127441, + "rewards/rejected": -4.7745561599731445, + "step": 199 + }, + { + "epoch": 0.26124581598497837, + "grad_norm": 30.844882622332946, + "learning_rate": 4.6159424044474383e-07, + "logits/chosen": -0.783661425113678, + "logits/rejected": -0.7943572402000427, + "logps/chosen": -686.8040161132812, + "logps/rejected": -793.8935546875, + "loss": 0.5441, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.420450210571289, + "rewards/margins": 1.065230369567871, + "rewards/rejected": -4.485680103302002, + "step": 200 + }, + { + "epoch": 0.26124581598497837, + "eval_logits/chosen": -0.6964080929756165, + "eval_logits/rejected": -0.6965639591217041, + "eval_logps/chosen": -685.1904907226562, + "eval_logps/rejected": -860.1481323242188, + "eval_loss": 0.44446808099746704, + "eval_rewards/accuracies": 0.8360000252723694, + "eval_rewards/chosen": -3.4115946292877197, + "eval_rewards/margins": 1.6885993480682373, + "eval_rewards/rejected": -5.100193500518799, + "eval_runtime": 305.4451, + "eval_samples_per_second": 6.548, + "eval_steps_per_second": 0.409, + "step": 200 + }, + { + "epoch": 0.26255204506490326, + "grad_norm": 17.760792922849937, + "learning_rate": 4.6098405589017676e-07, + "logits/chosen": -0.8347344398498535, + "logits/rejected": -0.8454535603523254, + "logps/chosen": -638.2152099609375, + "logps/rejected": -787.5073852539062, + "loss": 0.429, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0217223167419434, + "rewards/margins": 1.3569830656051636, + "rewards/rejected": -4.3787055015563965, + "step": 201 + }, + { + "epoch": 0.26385827414482815, + "grad_norm": 18.25367163242615, + "learning_rate": 4.6036947215541856e-07, + "logits/chosen": -0.7643561363220215, + "logits/rejected": -0.839961051940918, + "logps/chosen": -753.9393310546875, + "logps/rejected": -926.6439208984375, + "loss": 0.5115, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.067323207855225, + "rewards/margins": 1.2652134895324707, + "rewards/rejected": -5.332536697387695, + "step": 202 + }, + { + "epoch": 0.26516450322475305, + "grad_norm": 16.18688044045776, + "learning_rate": 4.597505020550138e-07, + "logits/chosen": -0.8352375030517578, + "logits/rejected": -0.7859711647033691, + "logps/chosen": -640.655517578125, + "logps/rejected": -761.5785522460938, + "loss": 0.499, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.453089714050293, + "rewards/margins": 1.1171209812164307, + "rewards/rejected": -4.570209980010986, + "step": 203 + }, + { + "epoch": 0.26647073230467794, + "grad_norm": 21.13978640832832, + "learning_rate": 4.591271584949662e-07, + "logits/chosen": -0.8258641958236694, + "logits/rejected": -0.7898882627487183, + "logps/chosen": -699.2488403320312, + "logps/rejected": -853.1764526367188, + "loss": 0.4811, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.18013858795166, + "rewards/margins": 1.710571050643921, + "rewards/rejected": -4.89070987701416, + "step": 204 + }, + { + "epoch": 0.26777696138460283, + "grad_norm": 13.936241022062031, + "learning_rate": 4.584994544724695e-07, + "logits/chosen": -0.9291080236434937, + "logits/rejected": -0.8747403621673584, + "logps/chosen": -621.3516235351562, + "logps/rejected": -753.9003295898438, + "loss": 0.3705, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.6660590171813965, + "rewards/margins": 1.369396448135376, + "rewards/rejected": -4.035455703735352, + "step": 205 + }, + { + "epoch": 0.2690831904645277, + "grad_norm": 24.239952793904404, + "learning_rate": 4.578674030756363e-07, + "logits/chosen": -0.8430845141410828, + "logits/rejected": -0.874817430973053, + "logps/chosen": -669.2354736328125, + "logps/rejected": -831.9165649414062, + "loss": 0.3822, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.265414237976074, + "rewards/margins": 1.3706064224243164, + "rewards/rejected": -4.636020660400391, + "step": 206 + }, + { + "epoch": 0.2703894195444526, + "grad_norm": 16.98616197060073, + "learning_rate": 4.572310174832255e-07, + "logits/chosen": -0.7017495036125183, + "logits/rejected": -0.6992954015731812, + "logps/chosen": -630.5468139648438, + "logps/rejected": -817.4756469726562, + "loss": 0.3992, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.040794849395752, + "rewards/margins": 1.788942813873291, + "rewards/rejected": -4.829737663269043, + "step": 207 + }, + { + "epoch": 0.2716956486243775, + "grad_norm": 15.180321612991502, + "learning_rate": 4.565903109643672e-07, + "logits/chosen": -0.9050486087799072, + "logits/rejected": -0.8393621444702148, + "logps/chosen": -680.2135009765625, + "logps/rejected": -749.2343139648438, + "loss": 0.4198, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.655306339263916, + "rewards/margins": 1.145632028579712, + "rewards/rejected": -3.800938367843628, + "step": 208 + }, + { + "epoch": 0.2730018777043024, + "grad_norm": 25.61838659954735, + "learning_rate": 4.5594529687828607e-07, + "logits/chosen": -0.678799033164978, + "logits/rejected": -0.6235646605491638, + "logps/chosen": -606.0216674804688, + "logps/rejected": -865.0751953125, + "loss": 0.4466, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.032256841659546, + "rewards/margins": 2.4543349742889404, + "rewards/rejected": -5.486591815948486, + "step": 209 + }, + { + "epoch": 0.2743081067842273, + "grad_norm": 16.73237051103879, + "learning_rate": 4.5529598867402314e-07, + "logits/chosen": -0.7634575963020325, + "logits/rejected": -0.7480574250221252, + "logps/chosen": -672.0314331054688, + "logps/rejected": -834.9887084960938, + "loss": 0.3801, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.0546722412109375, + "rewards/margins": 1.4492563009262085, + "rewards/rejected": -4.5039286613464355, + "step": 210 + }, + { + "epoch": 0.2756143358641522, + "grad_norm": 14.802819908395904, + "learning_rate": 4.5464239989015483e-07, + "logits/chosen": -0.8287503123283386, + "logits/rejected": -0.8498775959014893, + "logps/chosen": -639.4356689453125, + "logps/rejected": -721.876708984375, + "loss": 0.4012, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.190981388092041, + "rewards/margins": 0.6815940141677856, + "rewards/rejected": -3.872575521469116, + "step": 211 + }, + { + "epoch": 0.2769205649440771, + "grad_norm": 15.793263099422985, + "learning_rate": 4.5398454415451126e-07, + "logits/chosen": -0.8768157362937927, + "logits/rejected": -0.9070101976394653, + "logps/chosen": -717.0108032226562, + "logps/rejected": -846.1328125, + "loss": 0.4161, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.556455612182617, + "rewards/margins": 1.2665857076644897, + "rewards/rejected": -4.823040962219238, + "step": 212 + }, + { + "epoch": 0.27822679402400197, + "grad_norm": 17.10594111843166, + "learning_rate": 4.5332243518389136e-07, + "logits/chosen": -0.7659977078437805, + "logits/rejected": -0.8257431983947754, + "logps/chosen": -705.0125732421875, + "logps/rejected": -928.66259765625, + "loss": 0.3759, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.354266405105591, + "rewards/margins": 1.808074712753296, + "rewards/rejected": -5.162341117858887, + "step": 213 + }, + { + "epoch": 0.27953302310392686, + "grad_norm": 24.144228395118713, + "learning_rate": 4.526560867837776e-07, + "logits/chosen": -0.8346929550170898, + "logits/rejected": -0.8240299820899963, + "logps/chosen": -769.806396484375, + "logps/rejected": -960.2931518554688, + "loss": 0.4983, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.8731882572174072, + "rewards/margins": 1.9701744318008423, + "rewards/rejected": -5.843362331390381, + "step": 214 + }, + { + "epoch": 0.28083925218385175, + "grad_norm": 16.56926567578079, + "learning_rate": 4.5198551284804773e-07, + "logits/chosen": -0.8138917088508606, + "logits/rejected": -0.7641476392745972, + "logps/chosen": -751.6607666015625, + "logps/rejected": -855.57763671875, + "loss": 0.4315, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.096513271331787, + "rewards/margins": 1.1268495321273804, + "rewards/rejected": -5.223363399505615, + "step": 215 + }, + { + "epoch": 0.28214548126377664, + "grad_norm": 20.387481706576544, + "learning_rate": 4.5131072735868523e-07, + "logits/chosen": -0.8751257061958313, + "logits/rejected": -0.8823789358139038, + "logps/chosen": -729.88818359375, + "logps/rejected": -880.8683471679688, + "loss": 0.4153, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.5861120223999023, + "rewards/margins": 1.625636339187622, + "rewards/rejected": -5.211748123168945, + "step": 216 + }, + { + "epoch": 0.28345171034370154, + "grad_norm": 22.915151414898517, + "learning_rate": 4.506317443854877e-07, + "logits/chosen": -0.6338347792625427, + "logits/rejected": -0.6184822916984558, + "logps/chosen": -762.0576782226562, + "logps/rejected": -895.22607421875, + "loss": 0.4508, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.526608943939209, + "rewards/margins": 1.1892971992492676, + "rewards/rejected": -5.715906143188477, + "step": 217 + }, + { + "epoch": 0.2847579394236264, + "grad_norm": 13.159897885289423, + "learning_rate": 4.4994857808577337e-07, + "logits/chosen": -0.6982518434524536, + "logits/rejected": -0.7118569612503052, + "logps/chosen": -781.4166259765625, + "logps/rejected": -1003.0726928710938, + "loss": 0.308, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.0865607261657715, + "rewards/margins": 2.2177541255950928, + "rewards/rejected": -6.304314613342285, + "step": 218 + }, + { + "epoch": 0.2860641685035513, + "grad_norm": 27.302561844802497, + "learning_rate": 4.492612427040863e-07, + "logits/chosen": -0.8502603769302368, + "logits/rejected": -0.8385042548179626, + "logps/chosen": -834.5120849609375, + "logps/rejected": -1042.2188720703125, + "loss": 0.4398, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.505267143249512, + "rewards/margins": 2.019343852996826, + "rewards/rejected": -6.524610996246338, + "step": 219 + }, + { + "epoch": 0.2873703975834762, + "grad_norm": 16.41161061540287, + "learning_rate": 4.4856975257189896e-07, + "logits/chosen": -0.7951454520225525, + "logits/rejected": -0.8055264949798584, + "logps/chosen": -727.2957153320312, + "logps/rejected": -966.2659301757812, + "loss": 0.4247, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.9063920974731445, + "rewards/margins": 2.326056718826294, + "rewards/rejected": -6.232448577880859, + "step": 220 + }, + { + "epoch": 0.2886766266634011, + "grad_norm": 15.073743400941652, + "learning_rate": 4.478741221073135e-07, + "logits/chosen": -0.8535463809967041, + "logits/rejected": -0.850567638874054, + "logps/chosen": -794.3897705078125, + "logps/rejected": -891.7565307617188, + "loss": 0.3925, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.9899892807006836, + "rewards/margins": 1.1424330472946167, + "rewards/rejected": -5.132421970367432, + "step": 221 + }, + { + "epoch": 0.289982855743326, + "grad_norm": 15.717093144659811, + "learning_rate": 4.471743658147614e-07, + "logits/chosen": -0.805902361869812, + "logits/rejected": -0.7969594597816467, + "logps/chosen": -721.6943969726562, + "logps/rejected": -866.4282836914062, + "loss": 0.4412, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.9946467876434326, + "rewards/margins": 1.2327009439468384, + "rewards/rejected": -5.2273478507995605, + "step": 222 + }, + { + "epoch": 0.2912890848232509, + "grad_norm": 14.694800440693287, + "learning_rate": 4.4647049828470075e-07, + "logits/chosen": -0.7446881532669067, + "logits/rejected": -0.7209632396697998, + "logps/chosen": -654.7391357421875, + "logps/rejected": -803.6948852539062, + "loss": 0.3623, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.3331480026245117, + "rewards/margins": 1.561418056488037, + "rewards/rejected": -4.894566059112549, + "step": 223 + }, + { + "epoch": 0.2925953139031758, + "grad_norm": 15.526264148405597, + "learning_rate": 4.4576253419331205e-07, + "logits/chosen": -0.7109897136688232, + "logits/rejected": -0.7546270489692688, + "logps/chosen": -685.828857421875, + "logps/rejected": -840.8103637695312, + "loss": 0.4788, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.5382773876190186, + "rewards/margins": 1.3709080219268799, + "rewards/rejected": -4.909185409545898, + "step": 224 + }, + { + "epoch": 0.29390154298310067, + "grad_norm": 22.88069996920225, + "learning_rate": 4.450504883021923e-07, + "logits/chosen": -1.0191212892532349, + "logits/rejected": -0.9649794101715088, + "logps/chosen": -757.391845703125, + "logps/rejected": -843.6371459960938, + "loss": 0.4204, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7928903102874756, + "rewards/margins": 1.3336069583892822, + "rewards/rejected": -5.126497268676758, + "step": 225 + }, + { + "epoch": 0.29520777206302556, + "grad_norm": 13.434275589157789, + "learning_rate": 4.4433437545804715e-07, + "logits/chosen": -0.8132909536361694, + "logits/rejected": -0.7702129483222961, + "logps/chosen": -753.9683227539062, + "logps/rejected": -931.2380981445312, + "loss": 0.4057, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.648000955581665, + "rewards/margins": 1.8615243434906006, + "rewards/rejected": -5.509525299072266, + "step": 226 + }, + { + "epoch": 0.29651400114295046, + "grad_norm": 12.235894599708347, + "learning_rate": 4.436142105923814e-07, + "logits/chosen": -0.9691725969314575, + "logits/rejected": -1.0009491443634033, + "logps/chosen": -696.4452514648438, + "logps/rejected": -795.2962036132812, + "loss": 0.3845, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3394429683685303, + "rewards/margins": 1.029392123222351, + "rewards/rejected": -4.368834972381592, + "step": 227 + }, + { + "epoch": 0.29782023022287535, + "grad_norm": 22.271476941430596, + "learning_rate": 4.4289000872118767e-07, + "logits/chosen": -0.8251463174819946, + "logits/rejected": -0.8388761281967163, + "logps/chosen": -622.3071899414062, + "logps/rejected": -791.090087890625, + "loss": 0.3786, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.2112112045288086, + "rewards/margins": 1.293255090713501, + "rewards/rejected": -4.5044660568237305, + "step": 228 + }, + { + "epoch": 0.29912645930280024, + "grad_norm": 18.189840903303466, + "learning_rate": 4.4216178494463295e-07, + "logits/chosen": -0.9312427043914795, + "logits/rejected": -0.9146935343742371, + "logps/chosen": -785.6533203125, + "logps/rejected": -880.157958984375, + "loss": 0.3356, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6715898513793945, + "rewards/margins": 1.162749171257019, + "rewards/rejected": -4.834339141845703, + "step": 229 + }, + { + "epoch": 0.30043268838272513, + "grad_norm": 26.36376504008489, + "learning_rate": 4.4142955444674463e-07, + "logits/chosen": -0.9649460315704346, + "logits/rejected": -0.9716936349868774, + "logps/chosen": -664.906005859375, + "logps/rejected": -912.1724853515625, + "loss": 0.4067, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.310150384902954, + "rewards/margins": 1.8418421745300293, + "rewards/rejected": -5.1519927978515625, + "step": 230 + }, + { + "epoch": 0.30173891746265, + "grad_norm": 19.94846396440649, + "learning_rate": 4.406933324950928e-07, + "logits/chosen": -0.9337835311889648, + "logits/rejected": -0.9488065242767334, + "logps/chosen": -800.484375, + "logps/rejected": -963.2352294921875, + "loss": 0.375, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.947709798812866, + "rewards/margins": 1.5597035884857178, + "rewards/rejected": -5.507413387298584, + "step": 231 + }, + { + "epoch": 0.3030451465425749, + "grad_norm": 16.356495739331635, + "learning_rate": 4.3995313444047254e-07, + "logits/chosen": -0.9251805543899536, + "logits/rejected": -0.8757014274597168, + "logps/chosen": -750.8514404296875, + "logps/rejected": -866.1437377929688, + "loss": 0.4655, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.803636074066162, + "rewards/margins": 1.2165488004684448, + "rewards/rejected": -5.020185470581055, + "step": 232 + }, + { + "epoch": 0.3043513756224998, + "grad_norm": 21.24285096426969, + "learning_rate": 4.3920897571658406e-07, + "logits/chosen": -0.8091636896133423, + "logits/rejected": -0.8993147611618042, + "logps/chosen": -768.8638916015625, + "logps/rejected": -1060.4085693359375, + "loss": 0.4114, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.014678478240967, + "rewards/margins": 2.306403875350952, + "rewards/rejected": -6.321082592010498, + "step": 233 + }, + { + "epoch": 0.3056576047024247, + "grad_norm": 17.134518173916593, + "learning_rate": 4.384608718397102e-07, + "logits/chosen": -0.7329398393630981, + "logits/rejected": -0.7744626402854919, + "logps/chosen": -693.506103515625, + "logps/rejected": -891.974609375, + "loss": 0.4762, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.8318393230438232, + "rewards/margins": 1.836269736289978, + "rewards/rejected": -5.668108940124512, + "step": 234 + }, + { + "epoch": 0.3069638337823496, + "grad_norm": 16.555347617715572, + "learning_rate": 4.377088384083935e-07, + "logits/chosen": -0.8090454339981079, + "logits/rejected": -0.8010983467102051, + "logps/chosen": -734.9536743164062, + "logps/rejected": -961.3272705078125, + "loss": 0.3279, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.9544529914855957, + "rewards/margins": 2.0194091796875, + "rewards/rejected": -5.9738616943359375, + "step": 235 + }, + { + "epoch": 0.3082700628622745, + "grad_norm": 21.692512442920936, + "learning_rate": 4.369528911031105e-07, + "logits/chosen": -0.6334437727928162, + "logits/rejected": -0.7008833289146423, + "logps/chosen": -765.7869873046875, + "logps/rejected": -925.2149047851562, + "loss": 0.3784, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.122602939605713, + "rewards/margins": 1.3997082710266113, + "rewards/rejected": -5.522310733795166, + "step": 236 + }, + { + "epoch": 0.3095762919421994, + "grad_norm": 19.148346287583163, + "learning_rate": 4.3619304568594546e-07, + "logits/chosen": -0.878132700920105, + "logits/rejected": -0.9558409452438354, + "logps/chosen": -673.8487548828125, + "logps/rejected": -857.96240234375, + "loss": 0.4497, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.3650078773498535, + "rewards/margins": 1.5477778911590576, + "rewards/rejected": -4.912785530090332, + "step": 237 + }, + { + "epoch": 0.31088252102212427, + "grad_norm": 15.517792339257628, + "learning_rate": 4.354293180002608e-07, + "logits/chosen": -0.7494323253631592, + "logits/rejected": -0.7071195840835571, + "logps/chosen": -731.3680419921875, + "logps/rejected": -777.686767578125, + "loss": 0.3642, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.9493746757507324, + "rewards/margins": 0.709790050983429, + "rewards/rejected": -4.6591644287109375, + "step": 238 + }, + { + "epoch": 0.31218875010204916, + "grad_norm": 19.81836418744902, + "learning_rate": 4.346617239703676e-07, + "logits/chosen": -0.7905477285385132, + "logits/rejected": -0.7663273811340332, + "logps/chosen": -812.0971069335938, + "logps/rejected": -957.8253173828125, + "loss": 0.3362, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.33413553237915, + "rewards/margins": 1.6475704908370972, + "rewards/rejected": -5.981706142425537, + "step": 239 + }, + { + "epoch": 0.31349497918197405, + "grad_norm": 21.50199670306285, + "learning_rate": 4.338902796011929e-07, + "logits/chosen": -0.8615242838859558, + "logits/rejected": -0.8263786435127258, + "logps/chosen": -781.7965698242188, + "logps/rejected": -923.489990234375, + "loss": 0.3566, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.6246654987335205, + "rewards/margins": 1.7381658554077148, + "rewards/rejected": -5.362831115722656, + "step": 240 + }, + { + "epoch": 0.31480120826189895, + "grad_norm": 18.40707890032469, + "learning_rate": 4.331150009779465e-07, + "logits/chosen": -0.8664236068725586, + "logits/rejected": -0.8414658904075623, + "logps/chosen": -730.3052368164062, + "logps/rejected": -858.5838623046875, + "loss": 0.429, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.8383097648620605, + "rewards/margins": 1.3162572383880615, + "rewards/rejected": -5.154566764831543, + "step": 241 + }, + { + "epoch": 0.31610743734182384, + "grad_norm": 19.439482180801054, + "learning_rate": 4.323359042657853e-07, + "logits/chosen": -0.7379111051559448, + "logits/rejected": -0.71939617395401, + "logps/chosen": -810.045166015625, + "logps/rejected": -912.1922607421875, + "loss": 0.3901, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.5196075439453125, + "rewards/margins": 0.9989256858825684, + "rewards/rejected": -5.518532752990723, + "step": 242 + }, + { + "epoch": 0.31741366642174873, + "grad_norm": 18.81095569934508, + "learning_rate": 4.3155300570947624e-07, + "logits/chosen": -0.8109769821166992, + "logits/rejected": -0.7917392253875732, + "logps/chosen": -820.1282958984375, + "logps/rejected": -953.7001342773438, + "loss": 0.376, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.9693336486816406, + "rewards/margins": 1.540510654449463, + "rewards/rejected": -5.5098443031311035, + "step": 243 + }, + { + "epoch": 0.3187198955016736, + "grad_norm": 18.232556176954414, + "learning_rate": 4.307663216330577e-07, + "logits/chosen": -0.824482262134552, + "logits/rejected": -0.859273374080658, + "logps/chosen": -725.1882934570312, + "logps/rejected": -920.5584716796875, + "loss": 0.4058, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.7261176109313965, + "rewards/margins": 1.9005296230316162, + "rewards/rejected": -5.626646995544434, + "step": 244 + }, + { + "epoch": 0.3200261245815985, + "grad_norm": 20.052726394978947, + "learning_rate": 4.2997586843949896e-07, + "logits/chosen": -0.8550601005554199, + "logits/rejected": -0.872449517250061, + "logps/chosen": -739.6688842773438, + "logps/rejected": -893.7305908203125, + "loss": 0.4682, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.840355157852173, + "rewards/margins": 1.4003340005874634, + "rewards/rejected": -5.240689277648926, + "step": 245 + }, + { + "epoch": 0.3213323536615234, + "grad_norm": 22.120568271547505, + "learning_rate": 4.2918166261035847e-07, + "logits/chosen": -0.5994831323623657, + "logits/rejected": -0.6662572622299194, + "logps/chosen": -685.1893920898438, + "logps/rejected": -880.3656005859375, + "loss": 0.3879, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6331100463867188, + "rewards/margins": 1.4999750852584839, + "rewards/rejected": -5.133085250854492, + "step": 246 + }, + { + "epoch": 0.3226385827414483, + "grad_norm": 25.04758767065529, + "learning_rate": 4.283837207054399e-07, + "logits/chosen": -0.6785750985145569, + "logits/rejected": -0.69914710521698, + "logps/chosen": -704.94091796875, + "logps/rejected": -909.0919799804688, + "loss": 0.4266, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.776233673095703, + "rewards/margins": 1.8772923946380615, + "rewards/rejected": -5.6535258293151855, + "step": 247 + }, + { + "epoch": 0.3239448118213732, + "grad_norm": 15.071461682352357, + "learning_rate": 4.2758205936244706e-07, + "logits/chosen": -0.7484068870544434, + "logits/rejected": -0.7148491144180298, + "logps/chosen": -770.5116577148438, + "logps/rejected": -923.0064086914062, + "loss": 0.3902, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.398849010467529, + "rewards/margins": 1.488804817199707, + "rewards/rejected": -5.8876543045043945, + "step": 248 + }, + { + "epoch": 0.3252510409012981, + "grad_norm": 16.115715937528535, + "learning_rate": 4.2677669529663686e-07, + "logits/chosen": -0.8120733499526978, + "logits/rejected": -0.7861438393592834, + "logps/chosen": -716.216064453125, + "logps/rejected": -879.9732666015625, + "loss": 0.344, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.722062587738037, + "rewards/margins": 1.7555803060531616, + "rewards/rejected": -5.477643013000488, + "step": 249 + }, + { + "epoch": 0.326557269981223, + "grad_norm": 27.76120754393567, + "learning_rate": 4.259676453004708e-07, + "logits/chosen": -0.7561047077178955, + "logits/rejected": -0.7340124845504761, + "logps/chosen": -738.0452880859375, + "logps/rejected": -907.1728515625, + "loss": 0.4392, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.9837238788604736, + "rewards/margins": 1.5732449293136597, + "rewards/rejected": -5.556968688964844, + "step": 250 + }, + { + "epoch": 0.32786349906114787, + "grad_norm": 15.979247504106482, + "learning_rate": 4.25154926243265e-07, + "logits/chosen": -0.8759682178497314, + "logits/rejected": -0.7738591432571411, + "logps/chosen": -714.5944213867188, + "logps/rejected": -795.8113403320312, + "loss": 0.3328, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2611489295959473, + "rewards/margins": 1.2338060140609741, + "rewards/rejected": -4.494955539703369, + "step": 251 + }, + { + "epoch": 0.32916972814107276, + "grad_norm": 21.82665055739408, + "learning_rate": 4.2433855507083816e-07, + "logits/chosen": -0.8485604524612427, + "logits/rejected": -0.8467884659767151, + "logps/chosen": -714.1653442382812, + "logps/rejected": -866.3451538085938, + "loss": 0.4448, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.7272872924804688, + "rewards/margins": 1.4408416748046875, + "rewards/rejected": -5.168128967285156, + "step": 252 + }, + { + "epoch": 0.33047595722099765, + "grad_norm": 21.215293194514476, + "learning_rate": 4.235185488051585e-07, + "logits/chosen": -0.706048846244812, + "logits/rejected": -0.7180212140083313, + "logps/chosen": -710.47314453125, + "logps/rejected": -951.2412109375, + "loss": 0.3518, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.7834746837615967, + "rewards/margins": 1.9697420597076416, + "rewards/rejected": -5.75321626663208, + "step": 253 + }, + { + "epoch": 0.33178218630092254, + "grad_norm": 30.62127400438521, + "learning_rate": 4.226949245439887e-07, + "logits/chosen": -0.7309754490852356, + "logits/rejected": -0.7566152215003967, + "logps/chosen": -622.594970703125, + "logps/rejected": -773.9696044921875, + "loss": 0.3909, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.568760871887207, + "rewards/margins": 1.102934718132019, + "rewards/rejected": -4.671695709228516, + "step": 254 + }, + { + "epoch": 0.33308841538084744, + "grad_norm": 13.591190068285586, + "learning_rate": 4.2186769946052945e-07, + "logits/chosen": -0.8123141527175903, + "logits/rejected": -0.783890426158905, + "logps/chosen": -756.3726196289062, + "logps/rejected": -836.5651245117188, + "loss": 0.3886, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.8985278606414795, + "rewards/margins": 1.0888140201568604, + "rewards/rejected": -4.98734188079834, + "step": 255 + }, + { + "epoch": 0.33439464446077233, + "grad_norm": 16.629705733383222, + "learning_rate": 4.210368908030614e-07, + "logits/chosen": -0.9059042930603027, + "logits/rejected": -0.9054913520812988, + "logps/chosen": -769.614013671875, + "logps/rejected": -895.432861328125, + "loss": 0.394, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.691370725631714, + "rewards/margins": 1.422187328338623, + "rewards/rejected": -5.113557815551758, + "step": 256 + }, + { + "epoch": 0.3357008735406972, + "grad_norm": 14.971829751828695, + "learning_rate": 4.202025158945855e-07, + "logits/chosen": -0.8011924624443054, + "logits/rejected": -0.8006505966186523, + "logps/chosen": -684.6802368164062, + "logps/rejected": -919.0767822265625, + "loss": 0.366, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.046587944030762, + "rewards/margins": 1.879364252090454, + "rewards/rejected": -5.925952434539795, + "step": 257 + }, + { + "epoch": 0.3370071026206221, + "grad_norm": 19.668038427375784, + "learning_rate": 4.1936459213246166e-07, + "logits/chosen": -0.8572171926498413, + "logits/rejected": -0.8683452010154724, + "logps/chosen": -809.2364501953125, + "logps/rejected": -1078.41845703125, + "loss": 0.3368, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.206806182861328, + "rewards/margins": 2.6465835571289062, + "rewards/rejected": -6.853388786315918, + "step": 258 + }, + { + "epoch": 0.338313331700547, + "grad_norm": 39.6303014381974, + "learning_rate": 4.185231369880461e-07, + "logits/chosen": -0.7904149293899536, + "logits/rejected": -0.7628904581069946, + "logps/chosen": -836.9259033203125, + "logps/rejected": -1044.5078125, + "loss": 0.6005, + "rewards/accuracies": 0.65625, + "rewards/chosen": -5.005545616149902, + "rewards/margins": 1.8065614700317383, + "rewards/rejected": -6.812107563018799, + "step": 259 + }, + { + "epoch": 0.3396195607804719, + "grad_norm": 18.040502881336984, + "learning_rate": 4.176781680063274e-07, + "logits/chosen": -0.8526827692985535, + "logits/rejected": -0.8415613174438477, + "logps/chosen": -822.579833984375, + "logps/rejected": -931.8614501953125, + "loss": 0.3968, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.4847869873046875, + "rewards/margins": 1.229368805885315, + "rewards/rejected": -5.714155673980713, + "step": 260 + }, + { + "epoch": 0.3409257898603968, + "grad_norm": 18.615431463139885, + "learning_rate": 4.1682970280555987e-07, + "logits/chosen": -0.7734108567237854, + "logits/rejected": -0.7589735388755798, + "logps/chosen": -727.7169189453125, + "logps/rejected": -957.0603637695312, + "loss": 0.376, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.171051025390625, + "rewards/margins": 2.155472993850708, + "rewards/rejected": -6.326523780822754, + "step": 261 + }, + { + "epoch": 0.3422320189403217, + "grad_norm": 14.85047813726669, + "learning_rate": 4.1597775907689706e-07, + "logits/chosen": -0.9151461720466614, + "logits/rejected": -0.8985196352005005, + "logps/chosen": -783.2546997070312, + "logps/rejected": -905.3399658203125, + "loss": 0.4107, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.076227188110352, + "rewards/margins": 1.4259974956512451, + "rewards/rejected": -5.502224922180176, + "step": 262 + }, + { + "epoch": 0.3435382480202466, + "grad_norm": 18.614910926586862, + "learning_rate": 4.1512235458402243e-07, + "logits/chosen": -0.8314370512962341, + "logits/rejected": -0.8302509784698486, + "logps/chosen": -783.2666625976562, + "logps/rejected": -1022.0087890625, + "loss": 0.4005, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.511577606201172, + "rewards/margins": 2.0172276496887207, + "rewards/rejected": -6.528805732727051, + "step": 263 + }, + { + "epoch": 0.34484447710017146, + "grad_norm": 17.858743835450216, + "learning_rate": 4.142635071627789e-07, + "logits/chosen": -0.7118708491325378, + "logits/rejected": -0.7446046471595764, + "logps/chosen": -711.8963012695312, + "logps/rejected": -930.8863525390625, + "loss": 0.3693, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.9071197509765625, + "rewards/margins": 1.9890848398208618, + "rewards/rejected": -5.896204948425293, + "step": 264 + }, + { + "epoch": 0.34615070618009636, + "grad_norm": 36.928248348050545, + "learning_rate": 4.1340123472079736e-07, + "logits/chosen": -0.783199667930603, + "logits/rejected": -0.8067238330841064, + "logps/chosen": -670.074951171875, + "logps/rejected": -888.4215698242188, + "loss": 0.3255, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.4405863285064697, + "rewards/margins": 2.0715410709381104, + "rewards/rejected": -5.512126922607422, + "step": 265 + }, + { + "epoch": 0.34745693526002125, + "grad_norm": 16.839612613063686, + "learning_rate": 4.125355552371226e-07, + "logits/chosen": -0.8244245648384094, + "logits/rejected": -0.7949088215827942, + "logps/chosen": -735.464599609375, + "logps/rejected": -927.4498901367188, + "loss": 0.4041, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.125042915344238, + "rewards/margins": 1.8313933610916138, + "rewards/rejected": -5.956435203552246, + "step": 266 + }, + { + "epoch": 0.34876316433994614, + "grad_norm": 16.593790276306056, + "learning_rate": 4.116664867618394e-07, + "logits/chosen": -0.7892999649047852, + "logits/rejected": -0.8079116940498352, + "logps/chosen": -764.3323974609375, + "logps/rejected": -880.1994018554688, + "loss": 0.3958, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.445393085479736, + "rewards/margins": 1.059816837310791, + "rewards/rejected": -5.505209922790527, + "step": 267 + }, + { + "epoch": 0.35006939341987103, + "grad_norm": 15.64984000116705, + "learning_rate": 4.1079404741569513e-07, + "logits/chosen": -0.743574857711792, + "logits/rejected": -0.7336382269859314, + "logps/chosen": -741.19677734375, + "logps/rejected": -862.8565063476562, + "loss": 0.3945, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.141275882720947, + "rewards/margins": 1.4089856147766113, + "rewards/rejected": -5.550261497497559, + "step": 268 + }, + { + "epoch": 0.3513756224997959, + "grad_norm": 23.080229037991064, + "learning_rate": 4.099182553897228e-07, + "logits/chosen": -0.7218165397644043, + "logits/rejected": -0.7307446599006653, + "logps/chosen": -690.0428466796875, + "logps/rejected": -1017.659423828125, + "loss": 0.3222, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.46806263923645, + "rewards/margins": 2.848270893096924, + "rewards/rejected": -6.316333770751953, + "step": 269 + }, + { + "epoch": 0.3526818515797208, + "grad_norm": 15.137850783640578, + "learning_rate": 4.0903912894486115e-07, + "logits/chosen": -0.8584780693054199, + "logits/rejected": -0.8682304620742798, + "logps/chosen": -786.788330078125, + "logps/rejected": -964.2037963867188, + "loss": 0.3801, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.453627586364746, + "rewards/margins": 1.5645616054534912, + "rewards/rejected": -6.018187999725342, + "step": 270 + }, + { + "epoch": 0.3539880806596457, + "grad_norm": 16.920756196804312, + "learning_rate": 4.0815668641157407e-07, + "logits/chosen": -0.720942497253418, + "logits/rejected": -0.7908264398574829, + "logps/chosen": -785.8327026367188, + "logps/rejected": -1009.3907470703125, + "loss": 0.3295, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.186331748962402, + "rewards/margins": 1.9950276613235474, + "rewards/rejected": -6.181359767913818, + "step": 271 + }, + { + "epoch": 0.3552943097395706, + "grad_norm": 19.264848572771506, + "learning_rate": 4.072709461894687e-07, + "logits/chosen": -0.7569836974143982, + "logits/rejected": -0.7545579671859741, + "logps/chosen": -749.757080078125, + "logps/rejected": -933.4493408203125, + "loss": 0.3748, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.149528503417969, + "rewards/margins": 1.973623275756836, + "rewards/rejected": -6.123151779174805, + "step": 272 + }, + { + "epoch": 0.3566005388194955, + "grad_norm": 16.611848455705292, + "learning_rate": 4.063819267469113e-07, + "logits/chosen": -0.7995561957359314, + "logits/rejected": -0.8192765712738037, + "logps/chosen": -752.6700439453125, + "logps/rejected": -1020.6839599609375, + "loss": 0.363, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.0336713790893555, + "rewards/margins": 2.4031214714050293, + "rewards/rejected": -6.436792850494385, + "step": 273 + }, + { + "epoch": 0.3579067678994204, + "grad_norm": 28.426846813580198, + "learning_rate": 4.054896466206426e-07, + "logits/chosen": -0.7028381824493408, + "logits/rejected": -0.68387770652771, + "logps/chosen": -789.5866088867188, + "logps/rejected": -966.2410888671875, + "loss": 0.4124, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.212189674377441, + "rewards/margins": 1.875523328781128, + "rewards/rejected": -6.087713241577148, + "step": 274 + }, + { + "epoch": 0.3592129969793453, + "grad_norm": 21.783109948267267, + "learning_rate": 4.0459412441539097e-07, + "logits/chosen": -0.720589280128479, + "logits/rejected": -0.7432644367218018, + "logps/chosen": -799.8099365234375, + "logps/rejected": -975.6666259765625, + "loss": 0.3745, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.347837448120117, + "rewards/margins": 1.7226536273956299, + "rewards/rejected": -6.070490837097168, + "step": 275 + }, + { + "epoch": 0.36051922605927017, + "grad_norm": 19.53557011898164, + "learning_rate": 4.036953788034846e-07, + "logits/chosen": -0.7636262774467468, + "logits/rejected": -0.7430158257484436, + "logps/chosen": -754.807861328125, + "logps/rejected": -938.4634399414062, + "loss": 0.4592, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.003164768218994, + "rewards/margins": 1.8176988363265991, + "rewards/rejected": -5.820863723754883, + "step": 276 + }, + { + "epoch": 0.36182545513919506, + "grad_norm": 22.123611426990557, + "learning_rate": 4.027934285244623e-07, + "logits/chosen": -0.7706893682479858, + "logits/rejected": -0.7902205586433411, + "logps/chosen": -764.242919921875, + "logps/rejected": -956.5457763671875, + "loss": 0.303, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.8078622817993164, + "rewards/margins": 2.1033294200897217, + "rewards/rejected": -5.911191940307617, + "step": 277 + }, + { + "epoch": 0.36313168421911995, + "grad_norm": 18.678151028721693, + "learning_rate": 4.0188829238468256e-07, + "logits/chosen": -0.8828777074813843, + "logits/rejected": -0.9020895957946777, + "logps/chosen": -643.5447387695312, + "logps/rejected": -933.9203491210938, + "loss": 0.4152, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.4640512466430664, + "rewards/margins": 2.1585946083068848, + "rewards/rejected": -5.622645378112793, + "step": 278 + }, + { + "epoch": 0.36443791329904485, + "grad_norm": 20.975474287569014, + "learning_rate": 4.0097998925693166e-07, + "logits/chosen": -0.8540270924568176, + "logits/rejected": -0.8402454853057861, + "logps/chosen": -715.0292358398438, + "logps/rejected": -820.9437866210938, + "loss": 0.395, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.91107439994812, + "rewards/margins": 1.1521496772766113, + "rewards/rejected": -5.063223838806152, + "step": 279 + }, + { + "epoch": 0.36574414237896974, + "grad_norm": 26.519174108929906, + "learning_rate": 4.0006853808002984e-07, + "logits/chosen": -0.7778640389442444, + "logits/rejected": -0.792134702205658, + "logps/chosen": -669.5437622070312, + "logps/rejected": -876.0143432617188, + "loss": 0.3501, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.469069242477417, + "rewards/margins": 1.8177546262741089, + "rewards/rejected": -5.2868242263793945, + "step": 280 + }, + { + "epoch": 0.36705037145889463, + "grad_norm": 26.476765515958245, + "learning_rate": 3.9915395785843674e-07, + "logits/chosen": -0.8218634128570557, + "logits/rejected": -0.8803665637969971, + "logps/chosen": -788.5381469726562, + "logps/rejected": -1056.4686279296875, + "loss": 0.3851, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.204514503479004, + "rewards/margins": 2.040257692337036, + "rewards/rejected": -6.244772434234619, + "step": 281 + }, + { + "epoch": 0.3683566005388195, + "grad_norm": 18.11399655933868, + "learning_rate": 3.9823626766185493e-07, + "logits/chosen": -0.7796139121055603, + "logits/rejected": -0.8175578117370605, + "logps/chosen": -761.0985717773438, + "logps/rejected": -1089.9530029296875, + "loss": 0.3791, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.092470645904541, + "rewards/margins": 2.7390975952148438, + "rewards/rejected": -6.831568717956543, + "step": 282 + }, + { + "epoch": 0.36966282961874436, + "grad_norm": 17.708912321185966, + "learning_rate": 3.973154866248323e-07, + "logits/chosen": -0.7699542045593262, + "logits/rejected": -0.7915944457054138, + "logps/chosen": -720.2528686523438, + "logps/rejected": -922.4497680664062, + "loss": 0.3909, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.033339500427246, + "rewards/margins": 1.768364667892456, + "rewards/rejected": -5.801704406738281, + "step": 283 + }, + { + "epoch": 0.37096905869866925, + "grad_norm": 24.59750532391127, + "learning_rate": 3.963916339463632e-07, + "logits/chosen": -0.8112433552742004, + "logits/rejected": -0.8277667760848999, + "logps/chosen": -811.3380126953125, + "logps/rejected": -985.3678588867188, + "loss": 0.4022, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.430903434753418, + "rewards/margins": 1.9277082681655884, + "rewards/rejected": -6.358611106872559, + "step": 284 + }, + { + "epoch": 0.37227528777859414, + "grad_norm": 17.110729368478943, + "learning_rate": 3.954647288894882e-07, + "logits/chosen": -0.7523562908172607, + "logits/rejected": -0.7668542861938477, + "logps/chosen": -688.250732421875, + "logps/rejected": -911.3945922851562, + "loss": 0.359, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.7461307048797607, + "rewards/margins": 2.113868474960327, + "rewards/rejected": -5.859999179840088, + "step": 285 + }, + { + "epoch": 0.37358151685851904, + "grad_norm": 25.899394843198557, + "learning_rate": 3.9453479078089215e-07, + "logits/chosen": -0.8517543077468872, + "logits/rejected": -0.861216127872467, + "logps/chosen": -755.353759765625, + "logps/rejected": -947.4692993164062, + "loss": 0.377, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.596562147140503, + "rewards/margins": 1.961177110671997, + "rewards/rejected": -5.5577392578125, + "step": 286 + }, + { + "epoch": 0.37488774593844393, + "grad_norm": 19.089362499056715, + "learning_rate": 3.936018390105013e-07, + "logits/chosen": -0.7259124517440796, + "logits/rejected": -0.7561787962913513, + "logps/chosen": -753.0985717773438, + "logps/rejected": -1026.896728515625, + "loss": 0.3647, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.9283385276794434, + "rewards/margins": 2.4963464736938477, + "rewards/rejected": -6.424685478210449, + "step": 287 + }, + { + "epoch": 0.3761939750183688, + "grad_norm": 18.268500447333547, + "learning_rate": 3.926658930310793e-07, + "logits/chosen": -0.8416800498962402, + "logits/rejected": -0.8493793606758118, + "logps/chosen": -744.7431640625, + "logps/rejected": -862.709228515625, + "loss": 0.4318, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.217617511749268, + "rewards/margins": 1.3080933094024658, + "rewards/rejected": -5.525710582733154, + "step": 288 + }, + { + "epoch": 0.3775002040982937, + "grad_norm": 24.401828710049934, + "learning_rate": 3.9172697235782113e-07, + "logits/chosen": -0.8084543347358704, + "logits/rejected": -0.8261023759841919, + "logps/chosen": -656.1014404296875, + "logps/rejected": -815.1503295898438, + "loss": 0.3557, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.5779876708984375, + "rewards/margins": 1.3531869649887085, + "rewards/rejected": -4.9311747550964355, + "step": 289 + }, + { + "epoch": 0.3788064331782186, + "grad_norm": 26.87923966516802, + "learning_rate": 3.907850965679467e-07, + "logits/chosen": -0.9053621888160706, + "logits/rejected": -0.8963493704795837, + "logps/chosen": -796.131103515625, + "logps/rejected": -926.56787109375, + "loss": 0.4174, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.860959529876709, + "rewards/margins": 1.4262628555297852, + "rewards/rejected": -5.287222385406494, + "step": 290 + }, + { + "epoch": 0.3801126622581435, + "grad_norm": 19.905113208319822, + "learning_rate": 3.898402853002921e-07, + "logits/chosen": -0.7205644845962524, + "logits/rejected": -0.7387488484382629, + "logps/chosen": -710.8943481445312, + "logps/rejected": -1005.5984497070312, + "loss": 0.3575, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.6093287467956543, + "rewards/margins": 2.4270055294036865, + "rewards/rejected": -6.03633451461792, + "step": 291 + }, + { + "epoch": 0.3814188913380684, + "grad_norm": 18.78689097302734, + "learning_rate": 3.8889255825490053e-07, + "logits/chosen": -0.8066189289093018, + "logits/rejected": -0.8041746616363525, + "logps/chosen": -678.9967041015625, + "logps/rejected": -807.6049194335938, + "loss": 0.4242, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.634415626525879, + "rewards/margins": 1.2685359716415405, + "rewards/rejected": -4.902951240539551, + "step": 292 + }, + { + "epoch": 0.3827251204179933, + "grad_norm": 24.124003964607752, + "learning_rate": 3.879419351926115e-07, + "logits/chosen": -0.8516795039176941, + "logits/rejected": -0.8285474181175232, + "logps/chosen": -713.4191284179688, + "logps/rejected": -836.4865112304688, + "loss": 0.448, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.8255889415740967, + "rewards/margins": 1.2763196229934692, + "rewards/rejected": -5.1019086837768555, + "step": 293 + }, + { + "epoch": 0.3840313494979182, + "grad_norm": 24.512379687313626, + "learning_rate": 3.8698843593464843e-07, + "logits/chosen": -0.874547004699707, + "logits/rejected": -0.8514626622200012, + "logps/chosen": -662.3638916015625, + "logps/rejected": -867.0076293945312, + "loss": 0.382, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.4917638301849365, + "rewards/margins": 2.1748902797698975, + "rewards/rejected": -5.66665506362915, + "step": 294 + }, + { + "epoch": 0.38533757857784307, + "grad_norm": 23.28648570945252, + "learning_rate": 3.860320803622059e-07, + "logits/chosen": -0.8211836814880371, + "logits/rejected": -0.7879142761230469, + "logps/chosen": -709.470947265625, + "logps/rejected": -909.6597290039062, + "loss": 0.3984, + "rewards/accuracies": 0.59375, + "rewards/chosen": -3.751055955886841, + "rewards/margins": 1.771988034248352, + "rewards/rejected": -5.523044109344482, + "step": 295 + }, + { + "epoch": 0.38664380765776796, + "grad_norm": 15.966272480655949, + "learning_rate": 3.850728884160347e-07, + "logits/chosen": -0.9212681651115417, + "logits/rejected": -0.8988990783691406, + "logps/chosen": -700.7277221679688, + "logps/rejected": -839.85302734375, + "loss": 0.4332, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.487891674041748, + "rewards/margins": 1.4060091972351074, + "rewards/rejected": -4.8939008712768555, + "step": 296 + }, + { + "epoch": 0.38795003673769285, + "grad_norm": 16.62431980117085, + "learning_rate": 3.841108800960264e-07, + "logits/chosen": -0.8558894991874695, + "logits/rejected": -0.8787685036659241, + "logps/chosen": -702.3900756835938, + "logps/rejected": -1032.5880126953125, + "loss": 0.3388, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6224589347839355, + "rewards/margins": 2.8511135578155518, + "rewards/rejected": -6.473572731018066, + "step": 297 + }, + { + "epoch": 0.38925626581761774, + "grad_norm": 20.505033729319532, + "learning_rate": 3.831460754607958e-07, + "logits/chosen": -0.8187196254730225, + "logits/rejected": -0.8213932514190674, + "logps/chosen": -654.1541748046875, + "logps/rejected": -828.8375244140625, + "loss": 0.449, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.3246121406555176, + "rewards/margins": 1.8316363096237183, + "rewards/rejected": -5.156248092651367, + "step": 298 + }, + { + "epoch": 0.39056249489754263, + "grad_norm": 24.663180868372333, + "learning_rate": 3.821784946272633e-07, + "logits/chosen": -0.7171926498413086, + "logits/rejected": -0.6735442876815796, + "logps/chosen": -678.7763671875, + "logps/rejected": -844.7467041015625, + "loss": 0.3789, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.276444911956787, + "rewards/margins": 1.5180835723876953, + "rewards/rejected": -4.794528007507324, + "step": 299 + }, + { + "epoch": 0.3918687239774675, + "grad_norm": 17.49083772410938, + "learning_rate": 3.8120815777023506e-07, + "logits/chosen": -0.7320696115493774, + "logits/rejected": -0.7526537179946899, + "logps/chosen": -622.1019897460938, + "logps/rejected": -824.1396484375, + "loss": 0.3586, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.15362548828125, + "rewards/margins": 1.5491368770599365, + "rewards/rejected": -4.702762603759766, + "step": 300 + }, + { + "epoch": 0.3918687239774675, + "eval_logits/chosen": -0.765253484249115, + "eval_logits/rejected": -0.7676687240600586, + "eval_logps/chosen": -685.0309448242188, + "eval_logps/rejected": -878.11181640625, + "eval_loss": 0.39491522312164307, + "eval_rewards/accuracies": 0.871999979019165, + "eval_rewards/chosen": -3.4099984169006348, + "eval_rewards/margins": 1.8698322772979736, + "eval_rewards/rejected": -5.2798309326171875, + "eval_runtime": 306.3629, + "eval_samples_per_second": 6.528, + "eval_steps_per_second": 0.408, + "step": 300 + }, + { + "epoch": 0.3931749530573924, + "grad_norm": 16.091751836838906, + "learning_rate": 3.8023508512198257e-07, + "logits/chosen": -0.868710994720459, + "logits/rejected": -0.8930901288986206, + "logps/chosen": -620.3997192382812, + "logps/rejected": -734.6292114257812, + "loss": 0.3495, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.891201972961426, + "rewards/margins": 1.2318321466445923, + "rewards/rejected": -4.1230340003967285, + "step": 301 + }, + { + "epoch": 0.3944811821373173, + "grad_norm": 16.748258638354255, + "learning_rate": 3.792592969718204e-07, + "logits/chosen": -0.8345980644226074, + "logits/rejected": -0.8519657850265503, + "logps/chosen": -716.6856689453125, + "logps/rejected": -984.464599609375, + "loss": 0.3587, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.8957595825195312, + "rewards/margins": 2.130659580230713, + "rewards/rejected": -6.026419162750244, + "step": 302 + }, + { + "epoch": 0.3957874112172422, + "grad_norm": 16.663033219846664, + "learning_rate": 3.7828081366568384e-07, + "logits/chosen": -0.8669158220291138, + "logits/rejected": -0.8916921615600586, + "logps/chosen": -657.3521118164062, + "logps/rejected": -780.7493896484375, + "loss": 0.3705, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.228039503097534, + "rewards/margins": 1.194130301475525, + "rewards/rejected": -4.4221696853637695, + "step": 303 + }, + { + "epoch": 0.3970936402971671, + "grad_norm": 13.630350117251616, + "learning_rate": 3.772996556057039e-07, + "logits/chosen": -0.8360152840614319, + "logits/rejected": -0.8600199222564697, + "logps/chosen": -676.6741943359375, + "logps/rejected": -839.611083984375, + "loss": 0.355, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.482692003250122, + "rewards/margins": 1.5282418727874756, + "rewards/rejected": -5.010933876037598, + "step": 304 + }, + { + "epoch": 0.398399869377092, + "grad_norm": 13.096658139864836, + "learning_rate": 3.763158432497823e-07, + "logits/chosen": -0.7596051692962646, + "logits/rejected": -0.7697958946228027, + "logps/chosen": -722.1881103515625, + "logps/rejected": -954.9688720703125, + "loss": 0.3335, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.748467206954956, + "rewards/margins": 1.9416289329528809, + "rewards/rejected": -5.690095901489258, + "step": 305 + }, + { + "epoch": 0.3997060984570169, + "grad_norm": 15.636359990948772, + "learning_rate": 3.753293971111652e-07, + "logits/chosen": -1.0189201831817627, + "logits/rejected": -1.0032825469970703, + "logps/chosen": -795.2416381835938, + "logps/rejected": -1025.907958984375, + "loss": 0.3618, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.307117938995361, + "rewards/margins": 2.324575185775757, + "rewards/rejected": -6.631693363189697, + "step": 306 + }, + { + "epoch": 0.40101232753694177, + "grad_norm": 14.176580126233755, + "learning_rate": 3.743403377580148e-07, + "logits/chosen": -0.7651489973068237, + "logits/rejected": -0.8138822317123413, + "logps/chosen": -788.5110473632812, + "logps/rejected": -976.249267578125, + "loss": 0.3584, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.725651264190674, + "rewards/margins": 1.478232502937317, + "rewards/rejected": -6.203884124755859, + "step": 307 + }, + { + "epoch": 0.40231855661686666, + "grad_norm": 18.74087147989152, + "learning_rate": 3.7334868581298104e-07, + "logits/chosen": -0.7951301336288452, + "logits/rejected": -0.8030416965484619, + "logps/chosen": -865.9967651367188, + "logps/rejected": -1085.788330078125, + "loss": 0.3421, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.567200183868408, + "rewards/margins": 1.9825466871261597, + "rewards/rejected": -6.549746513366699, + "step": 308 + }, + { + "epoch": 0.40362478569679155, + "grad_norm": 36.18623439913893, + "learning_rate": 3.7235446195277136e-07, + "logits/chosen": -0.8499747514724731, + "logits/rejected": -0.8248738646507263, + "logps/chosen": -713.7739868164062, + "logps/rejected": -1034.135009765625, + "loss": 0.287, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.9644153118133545, + "rewards/margins": 2.9848814010620117, + "rewards/rejected": -6.949296951293945, + "step": 309 + }, + { + "epoch": 0.40493101477671645, + "grad_norm": 32.30699716936894, + "learning_rate": 3.713576869077195e-07, + "logits/chosen": -0.8620212078094482, + "logits/rejected": -0.8468830585479736, + "logps/chosen": -833.579833984375, + "logps/rejected": -1144.938232421875, + "loss": 0.4218, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.6529130935668945, + "rewards/margins": 3.0569937229156494, + "rewards/rejected": -7.709907054901123, + "step": 310 + }, + { + "epoch": 0.40623724385664134, + "grad_norm": 16.10176328410453, + "learning_rate": 3.703583814613536e-07, + "logits/chosen": -0.7625004649162292, + "logits/rejected": -0.7636492252349854, + "logps/chosen": -889.8790893554688, + "logps/rejected": -1146.9530029296875, + "loss": 0.3173, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.2467522621154785, + "rewards/margins": 2.842636823654175, + "rewards/rejected": -8.089387893676758, + "step": 311 + }, + { + "epoch": 0.40754347293656623, + "grad_norm": 16.447526660324478, + "learning_rate": 3.693565664499623e-07, + "logits/chosen": -0.961453914642334, + "logits/rejected": -0.9357421398162842, + "logps/chosen": -755.412841796875, + "logps/rejected": -965.876953125, + "loss": 0.3669, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.1061601638793945, + "rewards/margins": 2.250901937484741, + "rewards/rejected": -6.357061862945557, + "step": 312 + }, + { + "epoch": 0.4088497020164911, + "grad_norm": 23.371894242649518, + "learning_rate": 3.683522627621608e-07, + "logits/chosen": -0.854593813419342, + "logits/rejected": -0.9100630879402161, + "logps/chosen": -789.1729736328125, + "logps/rejected": -1074.300048828125, + "loss": 0.356, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.489620208740234, + "rewards/margins": 2.253740072250366, + "rewards/rejected": -6.7433600425720215, + "step": 313 + }, + { + "epoch": 0.410155931096416, + "grad_norm": 32.176515119654795, + "learning_rate": 3.6734549133845533e-07, + "logits/chosen": -0.7982116341590881, + "logits/rejected": -0.8008893132209778, + "logps/chosen": -879.0695190429688, + "logps/rejected": -1155.3798828125, + "loss": 0.3758, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.085649013519287, + "rewards/margins": 2.7232906818389893, + "rewards/rejected": -7.8089399337768555, + "step": 314 + }, + { + "epoch": 0.4114621601763409, + "grad_norm": 26.366438342209516, + "learning_rate": 3.6633627317080585e-07, + "logits/chosen": -0.9103017449378967, + "logits/rejected": -0.841745138168335, + "logps/chosen": -873.0088500976562, + "logps/rejected": -1075.5986328125, + "loss": 0.3927, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.0356855392456055, + "rewards/margins": 2.1732540130615234, + "rewards/rejected": -7.208939552307129, + "step": 315 + }, + { + "epoch": 0.4127683892562658, + "grad_norm": 21.575409938356536, + "learning_rate": 3.653246293021891e-07, + "logits/chosen": -0.8488960266113281, + "logits/rejected": -0.8882749080657959, + "logps/chosen": -883.19287109375, + "logps/rejected": -1041.235107421875, + "loss": 0.4297, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.056049346923828, + "rewards/margins": 1.5542391538619995, + "rewards/rejected": -6.610287666320801, + "step": 316 + }, + { + "epoch": 0.4140746183361907, + "grad_norm": 35.21304677674138, + "learning_rate": 3.643105808261596e-07, + "logits/chosen": -0.8114643692970276, + "logits/rejected": -0.8472386002540588, + "logps/chosen": -771.3401489257812, + "logps/rejected": -962.2191162109375, + "loss": 0.4264, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.822422504425049, + "rewards/margins": 1.6922321319580078, + "rewards/rejected": -6.514655113220215, + "step": 317 + }, + { + "epoch": 0.4153808474161156, + "grad_norm": 24.020179547425144, + "learning_rate": 3.632941488864097e-07, + "logits/chosen": -0.7898439168930054, + "logits/rejected": -0.8224785923957825, + "logps/chosen": -779.1013793945312, + "logps/rejected": -1024.5921630859375, + "loss": 0.3937, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.604560852050781, + "rewards/margins": 2.217036247253418, + "rewards/rejected": -6.821596622467041, + "step": 318 + }, + { + "epoch": 0.4166870764960405, + "grad_norm": 15.576349233981643, + "learning_rate": 3.6227535467632867e-07, + "logits/chosen": -0.9034079313278198, + "logits/rejected": -0.9171915054321289, + "logps/chosen": -718.1586303710938, + "logps/rejected": -1026.34228515625, + "loss": 0.3409, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.507500171661377, + "rewards/margins": 2.7573251724243164, + "rewards/rejected": -6.264824867248535, + "step": 319 + }, + { + "epoch": 0.41799330557596537, + "grad_norm": 23.449788537469892, + "learning_rate": 3.6125421943856125e-07, + "logits/chosen": -0.7588320970535278, + "logits/rejected": -0.7619040012359619, + "logps/chosen": -728.4111328125, + "logps/rejected": -954.530029296875, + "loss": 0.3494, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.0672736167907715, + "rewards/margins": 2.065455913543701, + "rewards/rejected": -6.132730007171631, + "step": 320 + }, + { + "epoch": 0.41929953465589026, + "grad_norm": 18.658517623897286, + "learning_rate": 3.602307644645641e-07, + "logits/chosen": -0.9201053380966187, + "logits/rejected": -0.9317169785499573, + "logps/chosen": -665.0424194335938, + "logps/rejected": -868.5579833984375, + "loss": 0.2884, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.754164457321167, + "rewards/margins": 1.7947040796279907, + "rewards/rejected": -5.548868179321289, + "step": 321 + }, + { + "epoch": 0.42060576373581515, + "grad_norm": 15.532608997201159, + "learning_rate": 3.5920501109416233e-07, + "logits/chosen": -0.977518618106842, + "logits/rejected": -0.9513765573501587, + "logps/chosen": -797.5126953125, + "logps/rejected": -904.82470703125, + "loss": 0.3584, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.851003646850586, + "rewards/margins": 1.2614593505859375, + "rewards/rejected": -5.112462997436523, + "step": 322 + }, + { + "epoch": 0.42191199281574004, + "grad_norm": 21.182996168241864, + "learning_rate": 3.581769807151044e-07, + "logits/chosen": -0.8207509517669678, + "logits/rejected": -0.8763917684555054, + "logps/chosen": -709.5821533203125, + "logps/rejected": -980.6561279296875, + "loss": 0.383, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7620749473571777, + "rewards/margins": 2.246896505355835, + "rewards/rejected": -6.008971214294434, + "step": 323 + }, + { + "epoch": 0.42321822189566494, + "grad_norm": 16.443088156761927, + "learning_rate": 3.571466947626162e-07, + "logits/chosen": -0.8285816311836243, + "logits/rejected": -0.814357340335846, + "logps/chosen": -751.146240234375, + "logps/rejected": -968.7061767578125, + "loss": 0.2996, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.3344221115112305, + "rewards/margins": 1.9379419088363647, + "rewards/rejected": -6.272363662719727, + "step": 324 + }, + { + "epoch": 0.42452445097558983, + "grad_norm": 19.619563047541227, + "learning_rate": 3.5611417471895376e-07, + "logits/chosen": -0.7870075106620789, + "logits/rejected": -0.7580903172492981, + "logps/chosen": -721.9214477539062, + "logps/rejected": -890.1439208984375, + "loss": 0.2708, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.170993804931641, + "rewards/margins": 1.6512360572814941, + "rewards/rejected": -5.822230339050293, + "step": 325 + }, + { + "epoch": 0.4258306800555147, + "grad_norm": 18.263410236851353, + "learning_rate": 3.5507944211295604e-07, + "logits/chosen": -0.89299476146698, + "logits/rejected": -0.8813902139663696, + "logps/chosen": -851.031982421875, + "logps/rejected": -1084.912841796875, + "loss": 0.3693, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.2356858253479, + "rewards/margins": 2.128682851791382, + "rewards/rejected": -6.364368915557861, + "step": 326 + }, + { + "epoch": 0.4271369091354396, + "grad_norm": 24.469596399437656, + "learning_rate": 3.540425185195953e-07, + "logits/chosen": -0.7243574261665344, + "logits/rejected": -0.7720382213592529, + "logps/chosen": -872.6907348632812, + "logps/rejected": -1353.14599609375, + "loss": 0.408, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.031065464019775, + "rewards/margins": 3.7641468048095703, + "rewards/rejected": -8.795212745666504, + "step": 327 + }, + { + "epoch": 0.4284431382153645, + "grad_norm": 27.430992219157226, + "learning_rate": 3.5300342555952787e-07, + "logits/chosen": -0.7673375606536865, + "logits/rejected": -0.8125264048576355, + "logps/chosen": -761.0677490234375, + "logps/rejected": -919.9730834960938, + "loss": 0.4373, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.35603141784668, + "rewards/margins": 1.2068922519683838, + "rewards/rejected": -5.562923431396484, + "step": 328 + }, + { + "epoch": 0.4297493672952894, + "grad_norm": 17.69204592084202, + "learning_rate": 3.519621848986428e-07, + "logits/chosen": -0.883451521396637, + "logits/rejected": -0.9020692110061646, + "logps/chosen": -846.7744140625, + "logps/rejected": -1188.0194091796875, + "loss": 0.3338, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.060979843139648, + "rewards/margins": 2.9336440563201904, + "rewards/rejected": -7.994624137878418, + "step": 329 + }, + { + "epoch": 0.4310555963752143, + "grad_norm": 20.036121655806124, + "learning_rate": 3.5091881824761046e-07, + "logits/chosen": -0.8099783062934875, + "logits/rejected": -0.8211954832077026, + "logps/chosen": -773.2403564453125, + "logps/rejected": -1035.19580078125, + "loss": 0.3221, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.553930282592773, + "rewards/margins": 2.4084632396698, + "rewards/rejected": -6.962393283843994, + "step": 330 + }, + { + "epoch": 0.4323618254551392, + "grad_norm": 29.84146239469266, + "learning_rate": 3.4987334736142977e-07, + "logits/chosen": -0.8917319774627686, + "logits/rejected": -0.7940698862075806, + "logps/chosen": -839.36376953125, + "logps/rejected": -1059.7322998046875, + "loss": 0.4586, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.834168434143066, + "rewards/margins": 2.5644521713256836, + "rewards/rejected": -7.39862060546875, + "step": 331 + }, + { + "epoch": 0.4336680545350641, + "grad_norm": 21.68856790635962, + "learning_rate": 3.4882579403897455e-07, + "logits/chosen": -0.7471975088119507, + "logits/rejected": -0.8152725696563721, + "logps/chosen": -822.88818359375, + "logps/rejected": -1091.7850341796875, + "loss": 0.3349, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.2126054763793945, + "rewards/margins": 2.201547622680664, + "rewards/rejected": -7.414153099060059, + "step": 332 + }, + { + "epoch": 0.43497428361498897, + "grad_norm": 23.858340474183475, + "learning_rate": 3.4777618012253895e-07, + "logits/chosen": -0.9458297491073608, + "logits/rejected": -0.9789966344833374, + "logps/chosen": -846.14990234375, + "logps/rejected": -1049.368408203125, + "loss": 0.3866, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.42270565032959, + "rewards/margins": 2.1395516395568848, + "rewards/rejected": -6.562256336212158, + "step": 333 + }, + { + "epoch": 0.43628051269491386, + "grad_norm": 20.135393062791458, + "learning_rate": 3.4672452749738233e-07, + "logits/chosen": -0.6803852319717407, + "logits/rejected": -0.7442112565040588, + "logps/chosen": -839.7537231445312, + "logps/rejected": -1007.471923828125, + "loss": 0.4086, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.06041955947876, + "rewards/margins": 1.4089503288269043, + "rewards/rejected": -6.469369888305664, + "step": 334 + }, + { + "epoch": 0.43758674177483875, + "grad_norm": 26.30301422269973, + "learning_rate": 3.4567085809127245e-07, + "logits/chosen": -0.927352249622345, + "logits/rejected": -0.856540322303772, + "logps/chosen": -872.8140869140625, + "logps/rejected": -1098.021728515625, + "loss": 0.4098, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.658236503601074, + "rewards/margins": 2.292365312576294, + "rewards/rejected": -6.950601577758789, + "step": 335 + }, + { + "epoch": 0.43889297085476364, + "grad_norm": 23.07833880886756, + "learning_rate": 3.446151938740285e-07, + "logits/chosen": -0.8015051484107971, + "logits/rejected": -0.8258290886878967, + "logps/chosen": -785.102294921875, + "logps/rejected": -1044.4542236328125, + "loss": 0.4308, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.3944268226623535, + "rewards/margins": 2.6531472206115723, + "rewards/rejected": -7.047574043273926, + "step": 336 + }, + { + "epoch": 0.44019919993468853, + "grad_norm": 24.733070230983582, + "learning_rate": 3.4355755685706326e-07, + "logits/chosen": -0.8332501649856567, + "logits/rejected": -0.8956983685493469, + "logps/chosen": -841.2762451171875, + "logps/rejected": -1415.9942626953125, + "loss": 0.4168, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.841058731079102, + "rewards/margins": 5.052248954772949, + "rewards/rejected": -9.893306732177734, + "step": 337 + }, + { + "epoch": 0.4415054290146134, + "grad_norm": 19.097211954684813, + "learning_rate": 3.4249796909292374e-07, + "logits/chosen": -0.8756859302520752, + "logits/rejected": -0.8802890181541443, + "logps/chosen": -720.0657958984375, + "logps/rejected": -893.16748046875, + "loss": 0.4286, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.8131256103515625, + "rewards/margins": 1.430503249168396, + "rewards/rejected": -5.24362850189209, + "step": 338 + }, + { + "epoch": 0.4428116580945383, + "grad_norm": 16.02571558109891, + "learning_rate": 3.4143645267483137e-07, + "logits/chosen": -0.8234140276908875, + "logits/rejected": -0.8196402788162231, + "logps/chosen": -729.0518798828125, + "logps/rejected": -888.3472900390625, + "loss": 0.3755, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.293505668640137, + "rewards/margins": 1.6021445989608765, + "rewards/rejected": -5.895649433135986, + "step": 339 + }, + { + "epoch": 0.4441178871744632, + "grad_norm": 20.094294388002286, + "learning_rate": 3.403730297362219e-07, + "logits/chosen": -0.8382161855697632, + "logits/rejected": -0.8710712194442749, + "logps/chosen": -740.0045166015625, + "logps/rejected": -877.9818115234375, + "loss": 0.4001, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.060654163360596, + "rewards/margins": 1.3980674743652344, + "rewards/rejected": -5.458722114562988, + "step": 340 + }, + { + "epoch": 0.4454241162543881, + "grad_norm": 15.094516807495772, + "learning_rate": 3.3930772245028317e-07, + "logits/chosen": -0.761610746383667, + "logits/rejected": -0.8118141293525696, + "logps/chosen": -753.0406494140625, + "logps/rejected": -1056.1932373046875, + "loss": 0.3754, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.8587474822998047, + "rewards/margins": 2.5299229621887207, + "rewards/rejected": -6.388670921325684, + "step": 341 + }, + { + "epoch": 0.446730345334313, + "grad_norm": 19.17347194519931, + "learning_rate": 3.382405530294933e-07, + "logits/chosen": -0.8792775869369507, + "logits/rejected": -0.8162313103675842, + "logps/chosen": -714.256103515625, + "logps/rejected": -881.3817749023438, + "loss": 0.3459, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3989529609680176, + "rewards/margins": 1.8843724727630615, + "rewards/rejected": -5.2833251953125, + "step": 342 + }, + { + "epoch": 0.4480365744142379, + "grad_norm": 14.512362955820704, + "learning_rate": 3.371715437251571e-07, + "logits/chosen": -0.8337914943695068, + "logits/rejected": -0.858870804309845, + "logps/chosen": -704.8893432617188, + "logps/rejected": -1020.823974609375, + "loss": 0.353, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.540440082550049, + "rewards/margins": 2.77099871635437, + "rewards/rejected": -6.311439037322998, + "step": 343 + }, + { + "epoch": 0.4493428034941628, + "grad_norm": 14.767671827120036, + "learning_rate": 3.3610071682694286e-07, + "logits/chosen": -0.9238380193710327, + "logits/rejected": -0.8974184989929199, + "logps/chosen": -788.702880859375, + "logps/rejected": -924.5913696289062, + "loss": 0.3612, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.856344699859619, + "rewards/margins": 1.5991241931915283, + "rewards/rejected": -5.455468654632568, + "step": 344 + }, + { + "epoch": 0.45064903257408767, + "grad_norm": 13.867972850755454, + "learning_rate": 3.3502809466241653e-07, + "logits/chosen": -1.0090371370315552, + "logits/rejected": -0.9271538257598877, + "logps/chosen": -864.546875, + "logps/rejected": -1048.196044921875, + "loss": 0.3424, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.667964458465576, + "rewards/margins": 2.3645546436309814, + "rewards/rejected": -7.032519340515137, + "step": 345 + }, + { + "epoch": 0.45195526165401256, + "grad_norm": 20.821023709654465, + "learning_rate": 3.3395369959657713e-07, + "logits/chosen": -0.8493632674217224, + "logits/rejected": -0.8992824554443359, + "logps/chosen": -694.7129516601562, + "logps/rejected": -967.9573974609375, + "loss": 0.3349, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.8514938354492188, + "rewards/margins": 2.331794261932373, + "rewards/rejected": -6.183287620544434, + "step": 346 + }, + { + "epoch": 0.45326149073393746, + "grad_norm": 20.674015602908195, + "learning_rate": 3.3287755403139e-07, + "logits/chosen": -0.829691469669342, + "logits/rejected": -0.8424278497695923, + "logps/chosen": -786.063232421875, + "logps/rejected": -927.3167724609375, + "loss": 0.3809, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.13981294631958, + "rewards/margins": 1.3104791641235352, + "rewards/rejected": -5.450291633605957, + "step": 347 + }, + { + "epoch": 0.45456771981386235, + "grad_norm": 14.753612377748723, + "learning_rate": 3.3179968040531945e-07, + "logits/chosen": -0.7737857699394226, + "logits/rejected": -0.8464547395706177, + "logps/chosen": -736.8883056640625, + "logps/rejected": -1053.45166015625, + "loss": 0.3447, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.1345133781433105, + "rewards/margins": 2.341533899307251, + "rewards/rejected": -6.476047992706299, + "step": 348 + }, + { + "epoch": 0.45587394889378724, + "grad_norm": 17.764757170781934, + "learning_rate": 3.3072010119286155e-07, + "logits/chosen": -0.8078272342681885, + "logits/rejected": -0.8359349966049194, + "logps/chosen": -783.0404663085938, + "logps/rejected": -923.0152587890625, + "loss": 0.346, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.535928249359131, + "rewards/margins": 1.3883693218231201, + "rewards/rejected": -5.92429780960083, + "step": 349 + }, + { + "epoch": 0.45718017797371213, + "grad_norm": 15.68293110394188, + "learning_rate": 3.2963883890407495e-07, + "logits/chosen": -0.992402195930481, + "logits/rejected": -0.9728306531906128, + "logps/chosen": -829.182373046875, + "logps/rejected": -916.9439086914062, + "loss": 0.3691, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.468806266784668, + "rewards/margins": 1.1914604902267456, + "rewards/rejected": -5.660266876220703, + "step": 350 + }, + { + "epoch": 0.458486407053637, + "grad_norm": 17.46952938405253, + "learning_rate": 3.28555916084112e-07, + "logits/chosen": -0.6900120973587036, + "logits/rejected": -0.7500667572021484, + "logps/chosen": -700.6179809570312, + "logps/rejected": -979.9500122070312, + "loss": 0.3445, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.12177848815918, + "rewards/margins": 2.488147258758545, + "rewards/rejected": -6.609926223754883, + "step": 351 + }, + { + "epoch": 0.4597926361335619, + "grad_norm": 22.1790777909453, + "learning_rate": 3.274713553127479e-07, + "logits/chosen": -0.8638774156570435, + "logits/rejected": -0.7936422824859619, + "logps/chosen": -765.1031494140625, + "logps/rejected": -930.421875, + "loss": 0.3737, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.624390602111816, + "rewards/margins": 1.6826943159103394, + "rewards/rejected": -6.307085037231445, + "step": 352 + }, + { + "epoch": 0.4610988652134868, + "grad_norm": 12.479143269697133, + "learning_rate": 3.263851792039109e-07, + "logits/chosen": -0.9091026782989502, + "logits/rejected": -0.9592025279998779, + "logps/chosen": -741.4254760742188, + "logps/rejected": -963.5419921875, + "loss": 0.3, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.949568748474121, + "rewards/margins": 1.8915400505065918, + "rewards/rejected": -5.841108322143555, + "step": 353 + }, + { + "epoch": 0.4624050942934117, + "grad_norm": 18.630079829400888, + "learning_rate": 3.252974104052101e-07, + "logits/chosen": -0.7867209911346436, + "logits/rejected": -0.7810537815093994, + "logps/chosen": -818.9255981445312, + "logps/rejected": -1026.5875244140625, + "loss": 0.376, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.769900321960449, + "rewards/margins": 2.1134841442108154, + "rewards/rejected": -6.883384704589844, + "step": 354 + }, + { + "epoch": 0.4637113233733366, + "grad_norm": 20.191982248042134, + "learning_rate": 3.2420807159746327e-07, + "logits/chosen": -0.9127646684646606, + "logits/rejected": -0.8559162616729736, + "logps/chosen": -896.9894409179688, + "logps/rejected": -1082.9798583984375, + "loss": 0.3528, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.027846336364746, + "rewards/margins": 2.029517889022827, + "rewards/rejected": -7.057364463806152, + "step": 355 + }, + { + "epoch": 0.4650175524532615, + "grad_norm": 34.75961013205875, + "learning_rate": 3.2311718549422435e-07, + "logits/chosen": -0.8626306056976318, + "logits/rejected": -0.8488350510597229, + "logps/chosen": -848.584716796875, + "logps/rejected": -985.186767578125, + "loss": 0.3659, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.735561847686768, + "rewards/margins": 1.573469877243042, + "rewards/rejected": -6.309031963348389, + "step": 356 + }, + { + "epoch": 0.4663237815331864, + "grad_norm": 16.615213616984725, + "learning_rate": 3.220247748413094e-07, + "logits/chosen": -0.9426769614219666, + "logits/rejected": -0.9294852018356323, + "logps/chosen": -804.0889282226562, + "logps/rejected": -981.0759887695312, + "loss": 0.2838, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.3830060958862305, + "rewards/margins": 1.6862914562225342, + "rewards/rejected": -6.069297790527344, + "step": 357 + }, + { + "epoch": 0.46763001061311127, + "grad_norm": 21.324802872506126, + "learning_rate": 3.209308624163225e-07, + "logits/chosen": -0.942538857460022, + "logits/rejected": -0.951295018196106, + "logps/chosen": -875.485107421875, + "logps/rejected": -1008.2823486328125, + "loss": 0.3164, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.925025463104248, + "rewards/margins": 1.5497174263000488, + "rewards/rejected": -6.474742889404297, + "step": 358 + }, + { + "epoch": 0.46893623969303616, + "grad_norm": 24.770720365271597, + "learning_rate": 3.1983547102818096e-07, + "logits/chosen": -0.9059143662452698, + "logits/rejected": -0.8871181607246399, + "logps/chosen": -854.4793090820312, + "logps/rejected": -1108.9869384765625, + "loss": 0.3635, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.860678672790527, + "rewards/margins": 2.5468215942382812, + "rewards/rejected": -7.407499313354492, + "step": 359 + }, + { + "epoch": 0.47024246877296105, + "grad_norm": 28.415307220726664, + "learning_rate": 3.187386235166396e-07, + "logits/chosen": -0.900255560874939, + "logits/rejected": -0.8746569156646729, + "logps/chosen": -831.3404541015625, + "logps/rejected": -973.556396484375, + "loss": 0.4562, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.972175598144531, + "rewards/margins": 1.6384382247924805, + "rewards/rejected": -6.6106133460998535, + "step": 360 + }, + { + "epoch": 0.47154869785288595, + "grad_norm": 22.30729021657225, + "learning_rate": 3.176403427518143e-07, + "logits/chosen": -0.8422956466674805, + "logits/rejected": -0.8382501602172852, + "logps/chosen": -790.753662109375, + "logps/rejected": -1003.9214477539062, + "loss": 0.3137, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.53585147857666, + "rewards/margins": 2.458928346633911, + "rewards/rejected": -6.99478006362915, + "step": 361 + }, + { + "epoch": 0.47285492693281084, + "grad_norm": 20.89001631156089, + "learning_rate": 3.165406516337057e-07, + "logits/chosen": -0.9484958052635193, + "logits/rejected": -0.8832159042358398, + "logps/chosen": -944.7320556640625, + "logps/rejected": -1150.0098876953125, + "loss": 0.3452, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.379519462585449, + "rewards/margins": 2.542079448699951, + "rewards/rejected": -7.9215989112854, + "step": 362 + }, + { + "epoch": 0.47416115601273573, + "grad_norm": 25.18282457138755, + "learning_rate": 3.154395730917213e-07, + "logits/chosen": -0.9980475902557373, + "logits/rejected": -0.9696142077445984, + "logps/chosen": -971.07275390625, + "logps/rejected": -1192.8839111328125, + "loss": 0.3473, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.382442951202393, + "rewards/margins": 2.5971856117248535, + "rewards/rejected": -7.979628562927246, + "step": 363 + }, + { + "epoch": 0.4754673850926606, + "grad_norm": 21.129084566980914, + "learning_rate": 3.143371300841973e-07, + "logits/chosen": -0.8516480922698975, + "logits/rejected": -0.8902812004089355, + "logps/chosen": -815.789306640625, + "logps/rejected": -1200.0732421875, + "loss": 0.3557, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.755250930786133, + "rewards/margins": 3.439861297607422, + "rewards/rejected": -8.195111274719238, + "step": 364 + }, + { + "epoch": 0.4767736141725855, + "grad_norm": 33.46170211823593, + "learning_rate": 3.1323334559792015e-07, + "logits/chosen": -0.909258246421814, + "logits/rejected": -0.9520514011383057, + "logps/chosen": -816.0494995117188, + "logps/rejected": -1048.5968017578125, + "loss": 0.361, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.6202073097229, + "rewards/margins": 2.1964542865753174, + "rewards/rejected": -6.816661834716797, + "step": 365 + }, + { + "epoch": 0.4780798432525104, + "grad_norm": 26.43165680790808, + "learning_rate": 3.1212824264764727e-07, + "logits/chosen": -0.7765522003173828, + "logits/rejected": -0.7952960133552551, + "logps/chosen": -838.6836547851562, + "logps/rejected": -1241.4041748046875, + "loss": 0.3913, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.901317119598389, + "rewards/margins": 3.617424726486206, + "rewards/rejected": -8.5187406539917, + "step": 366 + }, + { + "epoch": 0.4793860723324353, + "grad_norm": 19.76142450798905, + "learning_rate": 3.1102184427562696e-07, + "logits/chosen": -0.8111026883125305, + "logits/rejected": -0.8469929695129395, + "logps/chosen": -913.451416015625, + "logps/rejected": -1214.3255615234375, + "loss": 0.3589, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.217056751251221, + "rewards/margins": 2.5425302982330322, + "rewards/rejected": -7.759586334228516, + "step": 367 + }, + { + "epoch": 0.4806923014123602, + "grad_norm": 28.26438879788685, + "learning_rate": 3.0991417355111807e-07, + "logits/chosen": -0.9394615888595581, + "logits/rejected": -0.9448978900909424, + "logps/chosen": -881.9717407226562, + "logps/rejected": -1111.5242919921875, + "loss": 0.3877, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.882044792175293, + "rewards/margins": 2.1486172676086426, + "rewards/rejected": -7.030661582946777, + "step": 368 + }, + { + "epoch": 0.4819985304922851, + "grad_norm": 18.563211767851524, + "learning_rate": 3.088052535699089e-07, + "logits/chosen": -0.9249133467674255, + "logits/rejected": -0.9536056518554688, + "logps/chosen": -917.371337890625, + "logps/rejected": -1216.37060546875, + "loss": 0.3908, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.822209358215332, + "rewards/margins": 2.9570131301879883, + "rewards/rejected": -7.77922248840332, + "step": 369 + }, + { + "epoch": 0.48330475957221, + "grad_norm": 44.299895456640826, + "learning_rate": 3.07695107453836e-07, + "logits/chosen": -0.9447470307350159, + "logits/rejected": -0.8442981243133545, + "logps/chosen": -790.659423828125, + "logps/rejected": -901.15234375, + "loss": 0.3445, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.9590797424316406, + "rewards/margins": 1.7083748579025269, + "rewards/rejected": -5.667453765869141, + "step": 370 + }, + { + "epoch": 0.48461098865213487, + "grad_norm": 25.707606867503262, + "learning_rate": 3.0658375835030144e-07, + "logits/chosen": -0.8817845582962036, + "logits/rejected": -0.9164577722549438, + "logps/chosen": -795.1084594726562, + "logps/rejected": -1039.523681640625, + "loss": 0.3583, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.440952301025391, + "rewards/margins": 2.2085859775543213, + "rewards/rejected": -6.649538993835449, + "step": 371 + }, + { + "epoch": 0.48591721773205976, + "grad_norm": 16.939141948308613, + "learning_rate": 3.0547122943179067e-07, + "logits/chosen": -0.8850041031837463, + "logits/rejected": -0.8968605399131775, + "logps/chosen": -694.5146484375, + "logps/rejected": -841.6837158203125, + "loss": 0.4063, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.9476029872894287, + "rewards/margins": 1.4921026229858398, + "rewards/rejected": -5.439705848693848, + "step": 372 + }, + { + "epoch": 0.48722344681198465, + "grad_norm": 16.412461798089808, + "learning_rate": 3.0435754389538925e-07, + "logits/chosen": -0.8374086618423462, + "logits/rejected": -0.8305915594100952, + "logps/chosen": -679.0845336914062, + "logps/rejected": -899.1082763671875, + "loss": 0.3347, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.579893112182617, + "rewards/margins": 2.188535213470459, + "rewards/rejected": -5.768428325653076, + "step": 373 + }, + { + "epoch": 0.48852967589190954, + "grad_norm": 23.798205961863953, + "learning_rate": 3.03242724962299e-07, + "logits/chosen": -0.936191201210022, + "logits/rejected": -0.8969914317131042, + "logps/chosen": -755.7578125, + "logps/rejected": -921.0574340820312, + "loss": 0.3774, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.9115138053894043, + "rewards/margins": 1.8655014038085938, + "rewards/rejected": -5.777015209197998, + "step": 374 + }, + { + "epoch": 0.48983590497183443, + "grad_norm": 12.937570248162015, + "learning_rate": 3.0212679587735396e-07, + "logits/chosen": -0.7778648138046265, + "logits/rejected": -0.8440149426460266, + "logps/chosen": -737.39208984375, + "logps/rejected": -1115.466064453125, + "loss": 0.3147, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.710268020629883, + "rewards/margins": 2.5861876010894775, + "rewards/rejected": -6.296454906463623, + "step": 375 + }, + { + "epoch": 0.4911421340517593, + "grad_norm": 20.473240179875067, + "learning_rate": 3.0100977990853565e-07, + "logits/chosen": -0.7876812815666199, + "logits/rejected": -0.8208199143409729, + "logps/chosen": -701.5858154296875, + "logps/rejected": -897.10205078125, + "loss": 0.3963, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.745605707168579, + "rewards/margins": 1.8409405946731567, + "rewards/rejected": -5.586545944213867, + "step": 376 + }, + { + "epoch": 0.4924483631316842, + "grad_norm": 14.20579737409417, + "learning_rate": 2.998917003464882e-07, + "logits/chosen": -0.8073393106460571, + "logits/rejected": -0.8390824794769287, + "logps/chosen": -748.408203125, + "logps/rejected": -955.57470703125, + "loss": 0.364, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.998650074005127, + "rewards/margins": 1.9089974164962769, + "rewards/rejected": -5.907647132873535, + "step": 377 + }, + { + "epoch": 0.4937545922116091, + "grad_norm": 16.24545057635828, + "learning_rate": 2.987725805040321e-07, + "logits/chosen": -0.8982130289077759, + "logits/rejected": -0.8425903916358948, + "logps/chosen": -711.0843505859375, + "logps/rejected": -820.8436889648438, + "loss": 0.409, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7347919940948486, + "rewards/margins": 1.427825927734375, + "rewards/rejected": -5.162618160247803, + "step": 378 + }, + { + "epoch": 0.495060821291534, + "grad_norm": 18.894573729579022, + "learning_rate": 2.976524437156787e-07, + "logits/chosen": -0.9128229022026062, + "logits/rejected": -0.9626132249832153, + "logps/chosen": -747.045166015625, + "logps/rejected": -967.6761474609375, + "loss": 0.336, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7548229694366455, + "rewards/margins": 1.727982521057129, + "rewards/rejected": -5.482805252075195, + "step": 379 + }, + { + "epoch": 0.4963670503714589, + "grad_norm": 16.23155664279542, + "learning_rate": 2.9653131333714354e-07, + "logits/chosen": -0.774066686630249, + "logits/rejected": -0.7619376182556152, + "logps/chosen": -673.14794921875, + "logps/rejected": -844.1553955078125, + "loss": 0.3158, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.664004325866699, + "rewards/margins": 1.75593900680542, + "rewards/rejected": -5.419942855834961, + "step": 380 + }, + { + "epoch": 0.4976732794513838, + "grad_norm": 16.135457912708656, + "learning_rate": 2.954092127448591e-07, + "logits/chosen": -0.8101003766059875, + "logits/rejected": -0.8355172276496887, + "logps/chosen": -614.1039428710938, + "logps/rejected": -779.637939453125, + "loss": 0.4207, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.3870277404785156, + "rewards/margins": 1.5078352689743042, + "rewards/rejected": -4.894863128662109, + "step": 381 + }, + { + "epoch": 0.4989795085313087, + "grad_norm": 20.89194150920151, + "learning_rate": 2.9428616533548766e-07, + "logits/chosen": -0.9312489032745361, + "logits/rejected": -0.9005659818649292, + "logps/chosen": -723.7083740234375, + "logps/rejected": -968.6890258789062, + "loss": 0.3026, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.5352654457092285, + "rewards/margins": 2.6465823650360107, + "rewards/rejected": -6.18184757232666, + "step": 382 + }, + { + "epoch": 0.5002857376112335, + "grad_norm": 15.514193847292193, + "learning_rate": 2.931621945254334e-07, + "logits/chosen": -0.8936735391616821, + "logits/rejected": -0.9682626724243164, + "logps/chosen": -715.4161376953125, + "logps/rejected": -1014.7509155273438, + "loss": 0.3123, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6884326934814453, + "rewards/margins": 1.9404077529907227, + "rewards/rejected": -5.628840446472168, + "step": 383 + }, + { + "epoch": 0.5015919666911585, + "grad_norm": 25.2144093190015, + "learning_rate": 2.9203732375035387e-07, + "logits/chosen": -0.9673444032669067, + "logits/rejected": -0.9933240413665771, + "logps/chosen": -827.9849243164062, + "logps/rejected": -1104.3861083984375, + "loss": 0.4075, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.7136616706848145, + "rewards/margins": 2.6191678047180176, + "rewards/rejected": -7.332829475402832, + "step": 384 + }, + { + "epoch": 0.5028981957710833, + "grad_norm": 17.30717183828124, + "learning_rate": 2.90911576464672e-07, + "logits/chosen": -0.8663851022720337, + "logits/rejected": -0.9097596406936646, + "logps/chosen": -770.527099609375, + "logps/rejected": -1005.6298217773438, + "loss": 0.3697, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.207431316375732, + "rewards/margins": 2.028050184249878, + "rewards/rejected": -6.235481262207031, + "step": 385 + }, + { + "epoch": 0.5042044248510082, + "grad_norm": 19.970645583944684, + "learning_rate": 2.8978497614108635e-07, + "logits/chosen": -0.8607802391052246, + "logits/rejected": -0.8875527381896973, + "logps/chosen": -808.6739501953125, + "logps/rejected": -1079.2705078125, + "loss": 0.3704, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.122775077819824, + "rewards/margins": 2.69840145111084, + "rewards/rejected": -6.821176528930664, + "step": 386 + }, + { + "epoch": 0.5055106539309331, + "grad_norm": 18.538857537509557, + "learning_rate": 2.8865754627008205e-07, + "logits/chosen": -0.9041465520858765, + "logits/rejected": -0.8960049748420715, + "logps/chosen": -820.70458984375, + "logps/rejected": -1124.7659912109375, + "loss": 0.3332, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.593035697937012, + "rewards/margins": 2.8649890422821045, + "rewards/rejected": -7.458024978637695, + "step": 387 + }, + { + "epoch": 0.506816883010858, + "grad_norm": 17.991315804239036, + "learning_rate": 2.8752931035944083e-07, + "logits/chosen": -0.8732789754867554, + "logits/rejected": -0.9428113698959351, + "logps/chosen": -741.2675170898438, + "logps/rejected": -1010.6340942382812, + "loss": 0.3097, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.332613468170166, + "rewards/margins": 2.5484251976013184, + "rewards/rejected": -6.881038665771484, + "step": 388 + }, + { + "epoch": 0.5081231120907829, + "grad_norm": 27.817321746324055, + "learning_rate": 2.8640029193375125e-07, + "logits/chosen": -0.8342300653457642, + "logits/rejected": -0.8032709956169128, + "logps/chosen": -769.5841064453125, + "logps/rejected": -971.3031005859375, + "loss": 0.4094, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.432335376739502, + "rewards/margins": 2.4039621353149414, + "rewards/rejected": -6.836297512054443, + "step": 389 + }, + { + "epoch": 0.5094293411707078, + "grad_norm": 26.19481211137744, + "learning_rate": 2.852705145339176e-07, + "logits/chosen": -1.0258805751800537, + "logits/rejected": -0.9810967445373535, + "logps/chosen": -829.2476806640625, + "logps/rejected": -1077.290771484375, + "loss": 0.3769, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.776191711425781, + "rewards/margins": 2.748802661895752, + "rewards/rejected": -7.524994850158691, + "step": 390 + }, + { + "epoch": 0.5107355702506327, + "grad_norm": 19.01867691799226, + "learning_rate": 2.8414000171666945e-07, + "logits/chosen": -0.8570470809936523, + "logits/rejected": -0.8079515099525452, + "logps/chosen": -811.5380859375, + "logps/rejected": -984.9368896484375, + "loss": 0.3427, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.842589855194092, + "rewards/margins": 2.0718741416931152, + "rewards/rejected": -6.914463996887207, + "step": 391 + }, + { + "epoch": 0.5120417993305576, + "grad_norm": 23.18645994627528, + "learning_rate": 2.830087770540705e-07, + "logits/chosen": -0.8229571580886841, + "logits/rejected": -0.8696286678314209, + "logps/chosen": -805.5714111328125, + "logps/rejected": -1093.4774169921875, + "loss": 0.3441, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.443207740783691, + "rewards/margins": 2.2599196434020996, + "rewards/rejected": -6.703127861022949, + "step": 392 + }, + { + "epoch": 0.5133480284104824, + "grad_norm": 28.281638147055716, + "learning_rate": 2.81876864133027e-07, + "logits/chosen": -0.817804753780365, + "logits/rejected": -0.8166165351867676, + "logps/chosen": -820.2814331054688, + "logps/rejected": -1031.4049072265625, + "loss": 0.3534, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.919102191925049, + "rewards/margins": 1.825037956237793, + "rewards/rejected": -6.744140625, + "step": 393 + }, + { + "epoch": 0.5146542574904074, + "grad_norm": 24.167048796517168, + "learning_rate": 2.807442865547957e-07, + "logits/chosen": -0.9945539832115173, + "logits/rejected": -0.9743906855583191, + "logps/chosen": -915.3056030273438, + "logps/rejected": -1188.7843017578125, + "loss": 0.3163, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.132131576538086, + "rewards/margins": 2.8624014854431152, + "rewards/rejected": -7.994532108306885, + "step": 394 + }, + { + "epoch": 0.5159604865703322, + "grad_norm": 28.18102447139193, + "learning_rate": 2.796110679344921e-07, + "logits/chosen": -0.8455891013145447, + "logits/rejected": -0.7778101563453674, + "logps/chosen": -829.8065795898438, + "logps/rejected": -882.5882568359375, + "loss": 0.4049, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.8128581047058105, + "rewards/margins": 1.0256637334823608, + "rewards/rejected": -5.838521957397461, + "step": 395 + }, + { + "epoch": 0.5172667156502572, + "grad_norm": 14.11300629365015, + "learning_rate": 2.7847723190059794e-07, + "logits/chosen": -0.7391270399093628, + "logits/rejected": -0.7817518711090088, + "logps/chosen": -692.3377075195312, + "logps/rejected": -997.4937744140625, + "loss": 0.3341, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.137124061584473, + "rewards/margins": 2.135505199432373, + "rewards/rejected": -6.2726287841796875, + "step": 396 + }, + { + "epoch": 0.518572944730182, + "grad_norm": 22.34972235720674, + "learning_rate": 2.7734280209446865e-07, + "logits/chosen": -0.8998773694038391, + "logits/rejected": -0.9383985996246338, + "logps/chosen": -799.13623046875, + "logps/rejected": -1060.0535888671875, + "loss": 0.308, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.834539413452148, + "rewards/margins": 2.1194262504577637, + "rewards/rejected": -6.95396614074707, + "step": 397 + }, + { + "epoch": 0.519879173810107, + "grad_norm": 17.128724945944594, + "learning_rate": 2.762078021698398e-07, + "logits/chosen": -0.9537370800971985, + "logits/rejected": -0.9668135643005371, + "logps/chosen": -790.4427490234375, + "logps/rejected": -1048.340576171875, + "loss": 0.3238, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.3831071853637695, + "rewards/margins": 2.7197885513305664, + "rewards/rejected": -7.102895736694336, + "step": 398 + }, + { + "epoch": 0.5211854028900318, + "grad_norm": 29.151857918658536, + "learning_rate": 2.7507225579233486e-07, + "logits/chosen": -0.9291197061538696, + "logits/rejected": -0.90833580493927, + "logps/chosen": -777.3556518554688, + "logps/rejected": -924.982421875, + "loss": 0.3756, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.550766944885254, + "rewards/margins": 1.6807010173797607, + "rewards/rejected": -6.2314677238464355, + "step": 399 + }, + { + "epoch": 0.5224916319699567, + "grad_norm": 21.66068267812314, + "learning_rate": 2.7393618663897107e-07, + "logits/chosen": -0.9538843035697937, + "logits/rejected": -0.9232152700424194, + "logps/chosen": -845.7114868164062, + "logps/rejected": -1059.5697021484375, + "loss": 0.3737, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.844598770141602, + "rewards/margins": 2.347217082977295, + "rewards/rejected": -7.1918158531188965, + "step": 400 + }, + { + "epoch": 0.5224916319699567, + "eval_logits/chosen": -0.7711201906204224, + "eval_logits/rejected": -0.7776599526405334, + "eval_logps/chosen": -779.8291015625, + "eval_logps/rejected": -1017.5, + "eval_loss": 0.3653486669063568, + "eval_rewards/accuracies": 0.8759999871253967, + "eval_rewards/chosen": -4.357980251312256, + "eval_rewards/margins": 2.3157310485839844, + "eval_rewards/rejected": -6.673711776733398, + "eval_runtime": 306.4511, + "eval_samples_per_second": 6.526, + "eval_steps_per_second": 0.408, + "step": 400 + }, + { + "epoch": 0.5237978610498816, + "grad_norm": 18.39108085144253, + "learning_rate": 2.7279961839766587e-07, + "logits/chosen": -0.9713696837425232, + "logits/rejected": -0.9175607562065125, + "logps/chosen": -842.3927001953125, + "logps/rejected": -956.6342163085938, + "loss": 0.3457, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.3701958656311035, + "rewards/margins": 1.6710914373397827, + "rewards/rejected": -6.041287422180176, + "step": 401 + }, + { + "epoch": 0.5251040901298065, + "grad_norm": 15.697766908506615, + "learning_rate": 2.716625747667432e-07, + "logits/chosen": -0.8418725728988647, + "logits/rejected": -0.8784732222557068, + "logps/chosen": -694.2025756835938, + "logps/rejected": -951.430908203125, + "loss": 0.2994, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.085196495056152, + "rewards/margins": 2.302374839782715, + "rewards/rejected": -6.387571334838867, + "step": 402 + }, + { + "epoch": 0.5264103192097314, + "grad_norm": 24.469256220077725, + "learning_rate": 2.7052507945443923e-07, + "logits/chosen": -0.9080750942230225, + "logits/rejected": -0.8578637838363647, + "logps/chosen": -704.4551391601562, + "logps/rejected": -935.6524658203125, + "loss": 0.3242, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.9254515171051025, + "rewards/margins": 2.3511059284210205, + "rewards/rejected": -6.276557922363281, + "step": 403 + }, + { + "epoch": 0.5277165482896563, + "grad_norm": 20.419010003114707, + "learning_rate": 2.69387156178408e-07, + "logits/chosen": -1.0022379159927368, + "logits/rejected": -0.9890075922012329, + "logps/chosen": -783.5850219726562, + "logps/rejected": -908.2481689453125, + "loss": 0.4126, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.0194783210754395, + "rewards/margins": 1.3325276374816895, + "rewards/rejected": -5.352005958557129, + "step": 404 + }, + { + "epoch": 0.5290227773695811, + "grad_norm": 16.07011993586235, + "learning_rate": 2.682488286652269e-07, + "logits/chosen": -0.8653430342674255, + "logits/rejected": -0.9178695678710938, + "logps/chosen": -677.1983642578125, + "logps/rejected": -872.1953735351562, + "loss": 0.4308, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.747790575027466, + "rewards/margins": 1.7173616886138916, + "rewards/rejected": -5.465152263641357, + "step": 405 + }, + { + "epoch": 0.5303290064495061, + "grad_norm": 21.462739199113052, + "learning_rate": 2.6711012064990194e-07, + "logits/chosen": -0.8753893375396729, + "logits/rejected": -0.9124100804328918, + "logps/chosen": -756.1655883789062, + "logps/rejected": -949.3858032226562, + "loss": 0.3056, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.986269950866699, + "rewards/margins": 1.6225194931030273, + "rewards/rejected": -5.608789443969727, + "step": 406 + }, + { + "epoch": 0.5316352355294309, + "grad_norm": 18.071302323354917, + "learning_rate": 2.6597105587537304e-07, + "logits/chosen": -0.9895930290222168, + "logits/rejected": -1.024172067642212, + "logps/chosen": -860.986083984375, + "logps/rejected": -996.1123657226562, + "loss": 0.3608, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.353708267211914, + "rewards/margins": 1.0356533527374268, + "rewards/rejected": -5.389361381530762, + "step": 407 + }, + { + "epoch": 0.5329414646093559, + "grad_norm": 23.905515055395856, + "learning_rate": 2.648316580920187e-07, + "logits/chosen": -0.8011449575424194, + "logits/rejected": -0.775729775428772, + "logps/chosen": -715.1135864257812, + "logps/rejected": -877.1203002929688, + "loss": 0.4277, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.1487812995910645, + "rewards/margins": 1.5813078880310059, + "rewards/rejected": -5.730088710784912, + "step": 408 + }, + { + "epoch": 0.5342476936892807, + "grad_norm": 20.577720180588294, + "learning_rate": 2.6369195105716084e-07, + "logits/chosen": -0.8263465762138367, + "logits/rejected": -0.897908091545105, + "logps/chosen": -739.7882080078125, + "logps/rejected": -1073.1529541015625, + "loss": 0.3552, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.197357654571533, + "rewards/margins": 2.525136947631836, + "rewards/rejected": -6.722494125366211, + "step": 409 + }, + { + "epoch": 0.5355539227692057, + "grad_norm": 20.93012715133535, + "learning_rate": 2.625519585345699e-07, + "logits/chosen": -0.9467390179634094, + "logits/rejected": -0.9878973364830017, + "logps/chosen": -832.529296875, + "logps/rejected": -966.189453125, + "loss": 0.3034, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.822567462921143, + "rewards/margins": 1.0942929983139038, + "rewards/rejected": -5.916861534118652, + "step": 410 + }, + { + "epoch": 0.5368601518491305, + "grad_norm": 15.932987548690928, + "learning_rate": 2.6141170429396845e-07, + "logits/chosen": -1.0242911577224731, + "logits/rejected": -1.0077592134475708, + "logps/chosen": -784.8360595703125, + "logps/rejected": -987.2750244140625, + "loss": 0.3454, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.951958417892456, + "rewards/margins": 2.033290386199951, + "rewards/rejected": -5.985249042510986, + "step": 411 + }, + { + "epoch": 0.5381663809290554, + "grad_norm": 18.290713742749784, + "learning_rate": 2.602712121105363e-07, + "logits/chosen": -1.0145286321640015, + "logits/rejected": -0.9495306611061096, + "logps/chosen": -766.391845703125, + "logps/rejected": -897.2517700195312, + "loss": 0.3763, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.971484899520874, + "rewards/margins": 1.6331852674484253, + "rewards/rejected": -5.60467004776001, + "step": 412 + }, + { + "epoch": 0.5394726100089803, + "grad_norm": 17.085604641508972, + "learning_rate": 2.5913050576441473e-07, + "logits/chosen": -0.9171661138534546, + "logits/rejected": -0.9550764560699463, + "logps/chosen": -799.6805419921875, + "logps/rejected": -1040.7950439453125, + "loss": 0.3239, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.382745742797852, + "rewards/margins": 1.7179137468338013, + "rewards/rejected": -6.100659370422363, + "step": 413 + }, + { + "epoch": 0.5407788390889052, + "grad_norm": 20.64630955775629, + "learning_rate": 2.5798960904021014e-07, + "logits/chosen": -0.7455999851226807, + "logits/rejected": -0.7679321765899658, + "logps/chosen": -785.1378784179688, + "logps/rejected": -1103.63818359375, + "loss": 0.3481, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.42498254776001, + "rewards/margins": 2.7151589393615723, + "rewards/rejected": -7.140141487121582, + "step": 414 + }, + { + "epoch": 0.5420850681688301, + "grad_norm": 16.728318148970214, + "learning_rate": 2.568485457264987e-07, + "logits/chosen": -0.9213064908981323, + "logits/rejected": -0.9041643738746643, + "logps/chosen": -843.6577758789062, + "logps/rejected": -1016.4732666015625, + "loss": 0.3542, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.517221450805664, + "rewards/margins": 1.9062294960021973, + "rewards/rejected": -6.4234514236450195, + "step": 415 + }, + { + "epoch": 0.543391297248755, + "grad_norm": 22.588461926828998, + "learning_rate": 2.5570733961533004e-07, + "logits/chosen": -0.8473490476608276, + "logits/rejected": -0.8507324457168579, + "logps/chosen": -790.0494995117188, + "logps/rejected": -1003.7606201171875, + "loss": 0.4098, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.537339210510254, + "rewards/margins": 1.8845421075820923, + "rewards/rejected": -6.421881198883057, + "step": 416 + }, + { + "epoch": 0.5446975263286798, + "grad_norm": 19.870408634037258, + "learning_rate": 2.545660145017312e-07, + "logits/chosen": -0.7989124655723572, + "logits/rejected": -0.8354920744895935, + "logps/chosen": -711.0654907226562, + "logps/rejected": -959.1341552734375, + "loss": 0.39, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.016317367553711, + "rewards/margins": 2.0849642753601074, + "rewards/rejected": -6.101282119750977, + "step": 417 + }, + { + "epoch": 0.5460037554086048, + "grad_norm": 24.60218776058898, + "learning_rate": 2.5342459418321057e-07, + "logits/chosen": -0.894109845161438, + "logits/rejected": -0.8482675552368164, + "logps/chosen": -770.8828125, + "logps/rejected": -894.0445556640625, + "loss": 0.4073, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.591938018798828, + "rewards/margins": 1.472078561782837, + "rewards/rejected": -6.064016819000244, + "step": 418 + }, + { + "epoch": 0.5473099844885296, + "grad_norm": 25.871862402700973, + "learning_rate": 2.5228310245926143e-07, + "logits/chosen": -0.9959041476249695, + "logits/rejected": -0.9126294255256653, + "logps/chosen": -855.5810546875, + "logps/rejected": -1101.9541015625, + "loss": 0.3782, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.857247352600098, + "rewards/margins": 2.5603063106536865, + "rewards/rejected": -7.417553424835205, + "step": 419 + }, + { + "epoch": 0.5486162135684546, + "grad_norm": 21.195374508414663, + "learning_rate": 2.511415631308664e-07, + "logits/chosen": -0.9022479057312012, + "logits/rejected": -0.9067992568016052, + "logps/chosen": -878.5272827148438, + "logps/rejected": -1105.0299072265625, + "loss": 0.3276, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.165306091308594, + "rewards/margins": 2.0585012435913086, + "rewards/rejected": -7.223807334899902, + "step": 420 + }, + { + "epoch": 0.5499224426483794, + "grad_norm": 25.41666561923603, + "learning_rate": 2.5e-07, + "logits/chosen": -0.9776140451431274, + "logits/rejected": -0.9376563429832458, + "logps/chosen": -813.5545654296875, + "logps/rejected": -894.2927856445312, + "loss": 0.4017, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.246693134307861, + "rewards/margins": 0.8625474572181702, + "rewards/rejected": -5.1092400550842285, + "step": 421 + }, + { + "epoch": 0.5512286717283044, + "grad_norm": 14.958726659038993, + "learning_rate": 2.4885843686913364e-07, + "logits/chosen": -0.8434813022613525, + "logits/rejected": -0.8806734681129456, + "logps/chosen": -736.0880126953125, + "logps/rejected": -961.4481811523438, + "loss": 0.3531, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.191902160644531, + "rewards/margins": 1.9953703880310059, + "rewards/rejected": -6.187272071838379, + "step": 422 + }, + { + "epoch": 0.5525349008082292, + "grad_norm": 21.495028951673262, + "learning_rate": 2.4771689754073855e-07, + "logits/chosen": -0.8508874773979187, + "logits/rejected": -0.8240814208984375, + "logps/chosen": -768.284912109375, + "logps/rejected": -955.7354736328125, + "loss": 0.3225, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.972433567047119, + "rewards/margins": 2.121138095855713, + "rewards/rejected": -6.09357213973999, + "step": 423 + }, + { + "epoch": 0.5538411298881541, + "grad_norm": 15.722999973999624, + "learning_rate": 2.4657540581678946e-07, + "logits/chosen": -0.9035851955413818, + "logits/rejected": -0.9613392949104309, + "logps/chosen": -798.9974365234375, + "logps/rejected": -1164.1221923828125, + "loss": 0.3057, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.214138984680176, + "rewards/margins": 3.4577198028564453, + "rewards/rejected": -7.671858787536621, + "step": 424 + }, + { + "epoch": 0.555147358968079, + "grad_norm": 18.247343995966652, + "learning_rate": 2.4543398549826877e-07, + "logits/chosen": -0.978046178817749, + "logits/rejected": -0.9603185653686523, + "logps/chosen": -746.07861328125, + "logps/rejected": -954.4952392578125, + "loss": 0.3439, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.9832568168640137, + "rewards/margins": 2.3160653114318848, + "rewards/rejected": -6.29932165145874, + "step": 425 + }, + { + "epoch": 0.5564535880480039, + "grad_norm": 21.983144847488614, + "learning_rate": 2.4429266038467e-07, + "logits/chosen": -0.8131458163261414, + "logits/rejected": -0.8295997977256775, + "logps/chosen": -758.4473266601562, + "logps/rejected": -982.2353515625, + "loss": 0.3388, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.1805033683776855, + "rewards/margins": 1.6914095878601074, + "rewards/rejected": -5.871912956237793, + "step": 426 + }, + { + "epoch": 0.5577598171279288, + "grad_norm": 18.619526651872878, + "learning_rate": 2.4315145427350126e-07, + "logits/chosen": -0.8365015387535095, + "logits/rejected": -0.8766813278198242, + "logps/chosen": -728.0346069335938, + "logps/rejected": -953.6134033203125, + "loss": 0.3586, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.375938415527344, + "rewards/margins": 1.970406413078308, + "rewards/rejected": -6.346344470977783, + "step": 427 + }, + { + "epoch": 0.5590660462078537, + "grad_norm": 20.25987698353873, + "learning_rate": 2.4201039095978983e-07, + "logits/chosen": -0.9001325368881226, + "logits/rejected": -0.8716564178466797, + "logps/chosen": -811.824462890625, + "logps/rejected": -1029.6591796875, + "loss": 0.3611, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.3254170417785645, + "rewards/margins": 2.2741880416870117, + "rewards/rejected": -6.599605560302734, + "step": 428 + }, + { + "epoch": 0.5603722752877786, + "grad_norm": 18.373974900181594, + "learning_rate": 2.4086949423558525e-07, + "logits/chosen": -1.0318374633789062, + "logits/rejected": -1.025630235671997, + "logps/chosen": -753.2796630859375, + "logps/rejected": -957.3681640625, + "loss": 0.3139, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.9497110843658447, + "rewards/margins": 2.0519957542419434, + "rewards/rejected": -6.001706600189209, + "step": 429 + }, + { + "epoch": 0.5616785043677035, + "grad_norm": 17.74458197915287, + "learning_rate": 2.3972878788946367e-07, + "logits/chosen": -0.7888402342796326, + "logits/rejected": -0.7724015712738037, + "logps/chosen": -822.6279907226562, + "logps/rejected": -1166.1044921875, + "loss": 0.2831, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.497809410095215, + "rewards/margins": 3.3227591514587402, + "rewards/rejected": -7.820569038391113, + "step": 430 + }, + { + "epoch": 0.5629847334476283, + "grad_norm": 15.667460951436432, + "learning_rate": 2.3858829570603153e-07, + "logits/chosen": -0.9419072270393372, + "logits/rejected": -0.9077606797218323, + "logps/chosen": -782.8739624023438, + "logps/rejected": -954.84619140625, + "loss": 0.3131, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.551786422729492, + "rewards/margins": 1.868566632270813, + "rewards/rejected": -6.420353412628174, + "step": 431 + }, + { + "epoch": 0.5642909625275533, + "grad_norm": 16.228404418354415, + "learning_rate": 2.3744804146543003e-07, + "logits/chosen": -1.0330311059951782, + "logits/rejected": -1.0249035358428955, + "logps/chosen": -830.9697265625, + "logps/rejected": -999.312255859375, + "loss": 0.3204, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.5674214363098145, + "rewards/margins": 1.7816904783248901, + "rewards/rejected": -6.349112033843994, + "step": 432 + }, + { + "epoch": 0.5655971916074781, + "grad_norm": 33.93589918126513, + "learning_rate": 2.3630804894283906e-07, + "logits/chosen": -0.8753880858421326, + "logits/rejected": -0.8946934342384338, + "logps/chosen": -800.2118530273438, + "logps/rejected": -994.9881591796875, + "loss": 0.4265, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.7165446281433105, + "rewards/margins": 1.7866311073303223, + "rewards/rejected": -6.503175735473633, + "step": 433 + }, + { + "epoch": 0.5669034206874031, + "grad_norm": 21.60718764822029, + "learning_rate": 2.3516834190798128e-07, + "logits/chosen": -0.8376666903495789, + "logits/rejected": -0.863196849822998, + "logps/chosen": -910.37451171875, + "logps/rejected": -1207.0146484375, + "loss": 0.3037, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.631561279296875, + "rewards/margins": 2.607675790786743, + "rewards/rejected": -8.239237785339355, + "step": 434 + }, + { + "epoch": 0.5682096497673279, + "grad_norm": 17.78741572283716, + "learning_rate": 2.3402894412462691e-07, + "logits/chosen": -0.8809553980827332, + "logits/rejected": -0.8793525099754333, + "logps/chosen": -816.9341430664062, + "logps/rejected": -1160.713623046875, + "loss": 0.3206, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.884814262390137, + "rewards/margins": 2.8916025161743164, + "rewards/rejected": -7.776415824890137, + "step": 435 + }, + { + "epoch": 0.5695158788472529, + "grad_norm": 19.299157629600934, + "learning_rate": 2.3288987935009804e-07, + "logits/chosen": -0.81032395362854, + "logits/rejected": -0.8216681480407715, + "logps/chosen": -795.78271484375, + "logps/rejected": -1083.7606201171875, + "loss": 0.3267, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.85698127746582, + "rewards/margins": 2.928353786468506, + "rewards/rejected": -7.785335540771484, + "step": 436 + }, + { + "epoch": 0.5708221079271777, + "grad_norm": 18.934945354797385, + "learning_rate": 2.317511713347731e-07, + "logits/chosen": -0.8548052310943604, + "logits/rejected": -0.8772594332695007, + "logps/chosen": -776.690673828125, + "logps/rejected": -983.8834838867188, + "loss": 0.3146, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.52797794342041, + "rewards/margins": 1.9513096809387207, + "rewards/rejected": -6.479287147521973, + "step": 437 + }, + { + "epoch": 0.5721283370071026, + "grad_norm": 27.50365285840327, + "learning_rate": 2.3061284382159193e-07, + "logits/chosen": -0.8831881880760193, + "logits/rejected": -0.9110695719718933, + "logps/chosen": -759.2625122070312, + "logps/rejected": -981.3511962890625, + "loss": 0.397, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.39530086517334, + "rewards/margins": 2.060976982116699, + "rewards/rejected": -6.456278324127197, + "step": 438 + }, + { + "epoch": 0.5734345660870275, + "grad_norm": 28.807216329420136, + "learning_rate": 2.2947492054556072e-07, + "logits/chosen": -0.9537849426269531, + "logits/rejected": -0.9700733423233032, + "logps/chosen": -827.2046508789062, + "logps/rejected": -1069.7030029296875, + "loss": 0.3407, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.6851348876953125, + "rewards/margins": 1.8534389734268188, + "rewards/rejected": -6.538573741912842, + "step": 439 + }, + { + "epoch": 0.5747407951669524, + "grad_norm": 22.99328848252123, + "learning_rate": 2.2833742523325675e-07, + "logits/chosen": -0.9458773732185364, + "logits/rejected": -0.9196736812591553, + "logps/chosen": -804.439697265625, + "logps/rejected": -1033.7572021484375, + "loss": 0.3554, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.986883640289307, + "rewards/margins": 2.2680232524871826, + "rewards/rejected": -7.254906177520752, + "step": 440 + }, + { + "epoch": 0.5760470242468773, + "grad_norm": 15.202510696978031, + "learning_rate": 2.272003816023341e-07, + "logits/chosen": -0.9241797924041748, + "logits/rejected": -0.9261179566383362, + "logps/chosen": -833.6920776367188, + "logps/rejected": -1051.131591796875, + "loss": 0.2392, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.833259582519531, + "rewards/margins": 2.2068347930908203, + "rewards/rejected": -7.040094375610352, + "step": 441 + }, + { + "epoch": 0.5773532533268022, + "grad_norm": 26.410873459357198, + "learning_rate": 2.2606381336102894e-07, + "logits/chosen": -1.0969460010528564, + "logits/rejected": -1.0607943534851074, + "logps/chosen": -842.22314453125, + "logps/rejected": -1066.9891357421875, + "loss": 0.4235, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.723199367523193, + "rewards/margins": 2.193436861038208, + "rewards/rejected": -6.9166364669799805, + "step": 442 + }, + { + "epoch": 0.578659482406727, + "grad_norm": 28.247981746947612, + "learning_rate": 2.2492774420766517e-07, + "logits/chosen": -0.9036411643028259, + "logits/rejected": -0.915302038192749, + "logps/chosen": -822.6942749023438, + "logps/rejected": -1086.462890625, + "loss": 0.3518, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.846108436584473, + "rewards/margins": 2.3862531185150146, + "rewards/rejected": -7.23236083984375, + "step": 443 + }, + { + "epoch": 0.579965711486652, + "grad_norm": 21.11892539199568, + "learning_rate": 2.2379219783016026e-07, + "logits/chosen": -0.9433536529541016, + "logits/rejected": -0.9617864489555359, + "logps/chosen": -900.6919555664062, + "logps/rejected": -1149.1103515625, + "loss": 0.318, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.675385475158691, + "rewards/margins": 2.2155497074127197, + "rewards/rejected": -6.890934944152832, + "step": 444 + }, + { + "epoch": 0.5812719405665768, + "grad_norm": 15.321566164302686, + "learning_rate": 2.2265719790553146e-07, + "logits/chosen": -0.9592891931533813, + "logits/rejected": -0.9572802782058716, + "logps/chosen": -730.5135498046875, + "logps/rejected": -936.27880859375, + "loss": 0.3303, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.321778297424316, + "rewards/margins": 1.9940975904464722, + "rewards/rejected": -6.31587553024292, + "step": 445 + }, + { + "epoch": 0.5825781696465018, + "grad_norm": 14.856682076736497, + "learning_rate": 2.2152276809940204e-07, + "logits/chosen": -0.9369142055511475, + "logits/rejected": -0.9434508681297302, + "logps/chosen": -796.3789672851562, + "logps/rejected": -976.5616455078125, + "loss": 0.3146, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.9632458686828613, + "rewards/margins": 1.5912312269210815, + "rewards/rejected": -5.554476737976074, + "step": 446 + }, + { + "epoch": 0.5838843987264266, + "grad_norm": 17.00151895193613, + "learning_rate": 2.2038893206550796e-07, + "logits/chosen": -1.1397333145141602, + "logits/rejected": -1.1071242094039917, + "logps/chosen": -875.6116943359375, + "logps/rejected": -1100.5426025390625, + "loss": 0.3171, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.550100803375244, + "rewards/margins": 2.633068323135376, + "rewards/rejected": -7.183169364929199, + "step": 447 + }, + { + "epoch": 0.5851906278063516, + "grad_norm": 17.642907713457248, + "learning_rate": 2.192557134452044e-07, + "logits/chosen": -1.0641119480133057, + "logits/rejected": -0.9393628835678101, + "logps/chosen": -889.4263916015625, + "logps/rejected": -1003.4805297851562, + "loss": 0.364, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.028580665588379, + "rewards/margins": 1.7515006065368652, + "rewards/rejected": -6.780081748962402, + "step": 448 + }, + { + "epoch": 0.5864968568862764, + "grad_norm": 33.31921181235088, + "learning_rate": 2.1812313586697307e-07, + "logits/chosen": -0.9102932214736938, + "logits/rejected": -0.9287086725234985, + "logps/chosen": -752.0089721679688, + "logps/rejected": -919.4461669921875, + "loss": 0.3462, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.703210830688477, + "rewards/margins": 1.3443269729614258, + "rewards/rejected": -6.047537803649902, + "step": 449 + }, + { + "epoch": 0.5878030859662013, + "grad_norm": 17.480381281796603, + "learning_rate": 2.1699122294592955e-07, + "logits/chosen": -0.8606734275817871, + "logits/rejected": -0.896245539188385, + "logps/chosen": -832.9607543945312, + "logps/rejected": -1162.478515625, + "loss": 0.2817, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.795577049255371, + "rewards/margins": 3.199331521987915, + "rewards/rejected": -7.994908332824707, + "step": 450 + }, + { + "epoch": 0.5891093150461262, + "grad_norm": 17.202137644316416, + "learning_rate": 2.1585999828333064e-07, + "logits/chosen": -0.9806517362594604, + "logits/rejected": -0.9195981621742249, + "logps/chosen": -829.970947265625, + "logps/rejected": -1054.5245361328125, + "loss": 0.2772, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.403877258300781, + "rewards/margins": 2.6019461154937744, + "rewards/rejected": -7.005823135375977, + "step": 451 + }, + { + "epoch": 0.5904155441260511, + "grad_norm": 19.779239028056754, + "learning_rate": 2.147294854660825e-07, + "logits/chosen": -0.904513955116272, + "logits/rejected": -0.8860921859741211, + "logps/chosen": -786.3958129882812, + "logps/rejected": -968.3399658203125, + "loss": 0.3156, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.6048383712768555, + "rewards/margins": 2.0750350952148438, + "rewards/rejected": -6.679873943328857, + "step": 452 + }, + { + "epoch": 0.591721773205976, + "grad_norm": 20.448314043899785, + "learning_rate": 2.1359970806624884e-07, + "logits/chosen": -0.8353624939918518, + "logits/rejected": -0.8903397917747498, + "logps/chosen": -724.5891723632812, + "logps/rejected": -1040.308349609375, + "loss": 0.274, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.321156024932861, + "rewards/margins": 2.924544334411621, + "rewards/rejected": -7.245700836181641, + "step": 453 + }, + { + "epoch": 0.5930280022859009, + "grad_norm": 26.60414849811314, + "learning_rate": 2.1247068964055917e-07, + "logits/chosen": -0.8759950399398804, + "logits/rejected": -0.901042103767395, + "logps/chosen": -852.2246704101562, + "logps/rejected": -1044.4117431640625, + "loss": 0.416, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.1181817054748535, + "rewards/margins": 1.7556366920471191, + "rewards/rejected": -6.873818874359131, + "step": 454 + }, + { + "epoch": 0.5943342313658257, + "grad_norm": 38.18759204125818, + "learning_rate": 2.1134245372991798e-07, + "logits/chosen": -0.9119591116905212, + "logits/rejected": -0.8581464290618896, + "logps/chosen": -774.8963012695312, + "logps/rejected": -992.7687377929688, + "loss": 0.4143, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.636556625366211, + "rewards/margins": 2.3142166137695312, + "rewards/rejected": -6.9507737159729, + "step": 455 + }, + { + "epoch": 0.5956404604457507, + "grad_norm": 28.856531286525918, + "learning_rate": 2.1021502385891368e-07, + "logits/chosen": -0.7651792764663696, + "logits/rejected": -0.7757113575935364, + "logps/chosen": -770.0438232421875, + "logps/rejected": -974.64404296875, + "loss": 0.3727, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.925751209259033, + "rewards/margins": 2.0303800106048584, + "rewards/rejected": -6.956131458282471, + "step": 456 + }, + { + "epoch": 0.5969466895256755, + "grad_norm": 16.780867878500203, + "learning_rate": 2.09088423535328e-07, + "logits/chosen": -0.9524345993995667, + "logits/rejected": -0.9039627313613892, + "logps/chosen": -815.7782592773438, + "logps/rejected": -1012.5498046875, + "loss": 0.369, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.924071311950684, + "rewards/margins": 2.095405340194702, + "rewards/rejected": -7.019476413726807, + "step": 457 + }, + { + "epoch": 0.5982529186056005, + "grad_norm": 35.404568396113156, + "learning_rate": 2.0796267624964608e-07, + "logits/chosen": -0.914667010307312, + "logits/rejected": -0.8787985444068909, + "logps/chosen": -872.1204223632812, + "logps/rejected": -1020.7347412109375, + "loss": 0.3364, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.007497310638428, + "rewards/margins": 1.6205580234527588, + "rewards/rejected": -6.628055572509766, + "step": 458 + }, + { + "epoch": 0.5995591476855253, + "grad_norm": 20.457701722422097, + "learning_rate": 2.0683780547456664e-07, + "logits/chosen": -0.8531094193458557, + "logits/rejected": -0.8431046009063721, + "logps/chosen": -772.6988525390625, + "logps/rejected": -952.2398071289062, + "loss": 0.3301, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.533632278442383, + "rewards/margins": 1.8784475326538086, + "rewards/rejected": -6.41208028793335, + "step": 459 + }, + { + "epoch": 0.6008653767654503, + "grad_norm": 15.471602821349602, + "learning_rate": 2.0571383466451237e-07, + "logits/chosen": -0.8806161284446716, + "logits/rejected": -0.9237450361251831, + "logps/chosen": -768.0919189453125, + "logps/rejected": -962.9945678710938, + "loss": 0.3217, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.675704002380371, + "rewards/margins": 1.6774792671203613, + "rewards/rejected": -6.353183269500732, + "step": 460 + }, + { + "epoch": 0.6021716058453751, + "grad_norm": 16.21404011539825, + "learning_rate": 2.0459078725514089e-07, + "logits/chosen": -1.0201655626296997, + "logits/rejected": -1.0077934265136719, + "logps/chosen": -864.165283203125, + "logps/rejected": -1056.125244140625, + "loss": 0.3366, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.843500137329102, + "rewards/margins": 1.9824084043502808, + "rewards/rejected": -6.825908660888672, + "step": 461 + }, + { + "epoch": 0.6034778349253, + "grad_norm": 21.870842872086143, + "learning_rate": 2.0346868666285644e-07, + "logits/chosen": -0.8475763201713562, + "logits/rejected": -0.806708574295044, + "logps/chosen": -788.37109375, + "logps/rejected": -972.1688842773438, + "loss": 0.283, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.456578254699707, + "rewards/margins": 2.1057913303375244, + "rewards/rejected": -6.5623698234558105, + "step": 462 + }, + { + "epoch": 0.6047840640052249, + "grad_norm": 38.35605889815296, + "learning_rate": 2.023475562843213e-07, + "logits/chosen": -0.853971004486084, + "logits/rejected": -0.918674647808075, + "logps/chosen": -816.5822143554688, + "logps/rejected": -1115.367431640625, + "loss": 0.3566, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.566136837005615, + "rewards/margins": 2.0404365062713623, + "rewards/rejected": -6.606573104858398, + "step": 463 + }, + { + "epoch": 0.6060902930851498, + "grad_norm": 15.736017698404957, + "learning_rate": 2.0122741949596793e-07, + "logits/chosen": -0.8547173738479614, + "logits/rejected": -0.894489586353302, + "logps/chosen": -788.0221557617188, + "logps/rejected": -1043.7552490234375, + "loss": 0.3182, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.1634907722473145, + "rewards/margins": 2.7182223796844482, + "rewards/rejected": -6.881713390350342, + "step": 464 + }, + { + "epoch": 0.6073965221650747, + "grad_norm": 16.34767928777031, + "learning_rate": 2.0010829965351184e-07, + "logits/chosen": -0.8808002471923828, + "logits/rejected": -0.8989526033401489, + "logps/chosen": -765.3001098632812, + "logps/rejected": -985.0426635742188, + "loss": 0.3329, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.9959161281585693, + "rewards/margins": 2.0970518589019775, + "rewards/rejected": -6.092967987060547, + "step": 465 + }, + { + "epoch": 0.6087027512449996, + "grad_norm": 24.681668419314594, + "learning_rate": 1.9899022009146435e-07, + "logits/chosen": -1.0274349451065063, + "logits/rejected": -1.0152531862258911, + "logps/chosen": -913.58251953125, + "logps/rejected": -1045.136962890625, + "loss": 0.3916, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.3043341636657715, + "rewards/margins": 1.5377391576766968, + "rewards/rejected": -6.842073440551758, + "step": 466 + }, + { + "epoch": 0.6100089803249245, + "grad_norm": 20.13545605217525, + "learning_rate": 1.9787320412264607e-07, + "logits/chosen": -0.9427694082260132, + "logits/rejected": -0.903557300567627, + "logps/chosen": -828.9415283203125, + "logps/rejected": -958.75048828125, + "loss": 0.3119, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.989696979522705, + "rewards/margins": 1.3408408164978027, + "rewards/rejected": -6.330538272857666, + "step": 467 + }, + { + "epoch": 0.6113152094048494, + "grad_norm": 20.553589332643803, + "learning_rate": 1.96757275037701e-07, + "logits/chosen": -0.9104101061820984, + "logits/rejected": -0.9407252073287964, + "logps/chosen": -752.98291015625, + "logps/rejected": -1006.4957275390625, + "loss": 0.4053, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.013631343841553, + "rewards/margins": 2.0689241886138916, + "rewards/rejected": -6.082555770874023, + "step": 468 + }, + { + "epoch": 0.6126214384847742, + "grad_norm": 15.86015294512485, + "learning_rate": 1.9564245610461078e-07, + "logits/chosen": -0.9154040217399597, + "logits/rejected": -0.97034752368927, + "logps/chosen": -686.4542236328125, + "logps/rejected": -934.6543579101562, + "loss": 0.3401, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.8044779300689697, + "rewards/margins": 2.325495481491089, + "rewards/rejected": -6.129973411560059, + "step": 469 + }, + { + "epoch": 0.6139276675646992, + "grad_norm": 16.003289738862154, + "learning_rate": 1.945287705682093e-07, + "logits/chosen": -0.8327289819717407, + "logits/rejected": -0.8527270555496216, + "logps/chosen": -717.9485473632812, + "logps/rejected": -982.47216796875, + "loss": 0.2847, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.025069236755371, + "rewards/margins": 2.376330614089966, + "rewards/rejected": -6.401400089263916, + "step": 470 + }, + { + "epoch": 0.615233896644624, + "grad_norm": 18.621148191091866, + "learning_rate": 1.9341624164969859e-07, + "logits/chosen": -1.0173578262329102, + "logits/rejected": -1.0285704135894775, + "logps/chosen": -774.8818359375, + "logps/rejected": -1054.2027587890625, + "loss": 0.3245, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.084187984466553, + "rewards/margins": 2.5124804973602295, + "rewards/rejected": -6.596668720245361, + "step": 471 + }, + { + "epoch": 0.616540125724549, + "grad_norm": 16.118635035544298, + "learning_rate": 1.92304892546164e-07, + "logits/chosen": -0.8497686386108398, + "logits/rejected": -0.862455427646637, + "logps/chosen": -812.705810546875, + "logps/rejected": -1088.639892578125, + "loss": 0.2711, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.906893730163574, + "rewards/margins": 2.828660249710083, + "rewards/rejected": -7.735554218292236, + "step": 472 + }, + { + "epoch": 0.6178463548044738, + "grad_norm": 16.48765253489488, + "learning_rate": 1.9119474643009108e-07, + "logits/chosen": -0.9804749488830566, + "logits/rejected": -0.9375219345092773, + "logps/chosen": -757.4735107421875, + "logps/rejected": -914.5074462890625, + "loss": 0.301, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.477548122406006, + "rewards/margins": 1.8111082315444946, + "rewards/rejected": -6.288656234741211, + "step": 473 + }, + { + "epoch": 0.6191525838843988, + "grad_norm": 15.195437398662849, + "learning_rate": 1.9008582644888196e-07, + "logits/chosen": -0.6733070611953735, + "logits/rejected": -0.7402501106262207, + "logps/chosen": -763.7427978515625, + "logps/rejected": -1086.78857421875, + "loss": 0.2907, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.562100410461426, + "rewards/margins": 2.552955389022827, + "rewards/rejected": -7.115055561065674, + "step": 474 + }, + { + "epoch": 0.6204588129643236, + "grad_norm": 20.023388007330663, + "learning_rate": 1.8897815572437301e-07, + "logits/chosen": -0.8706028461456299, + "logits/rejected": -0.9461374878883362, + "logps/chosen": -732.7521362304688, + "logps/rejected": -1005.2787475585938, + "loss": 0.4367, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.528969764709473, + "rewards/margins": 2.5225186347961426, + "rewards/rejected": -7.051488876342773, + "step": 475 + }, + { + "epoch": 0.6217650420442485, + "grad_norm": 20.49346284139183, + "learning_rate": 1.8787175735235273e-07, + "logits/chosen": -0.8300421237945557, + "logits/rejected": -0.9250699281692505, + "logps/chosen": -778.4098510742188, + "logps/rejected": -1078.0272216796875, + "loss": 0.3242, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.306589603424072, + "rewards/margins": 2.6981916427612305, + "rewards/rejected": -7.004781246185303, + "step": 476 + }, + { + "epoch": 0.6230712711241734, + "grad_norm": 18.35973579335231, + "learning_rate": 1.8676665440207977e-07, + "logits/chosen": -0.8798630237579346, + "logits/rejected": -0.9428200721740723, + "logps/chosen": -847.861083984375, + "logps/rejected": -1115.469970703125, + "loss": 0.3065, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.708725452423096, + "rewards/margins": 2.0331783294677734, + "rewards/rejected": -6.741903781890869, + "step": 477 + }, + { + "epoch": 0.6243775002040983, + "grad_norm": 21.457791070514528, + "learning_rate": 1.8566286991580267e-07, + "logits/chosen": -1.001512050628662, + "logits/rejected": -0.9693081974983215, + "logps/chosen": -736.879638671875, + "logps/rejected": -926.0786743164062, + "loss": 0.3584, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.08529806137085, + "rewards/margins": 2.071559429168701, + "rewards/rejected": -6.156857013702393, + "step": 478 + }, + { + "epoch": 0.6256837292840232, + "grad_norm": 17.65870882480035, + "learning_rate": 1.8456042690827866e-07, + "logits/chosen": -0.9303890466690063, + "logits/rejected": -0.9561671018600464, + "logps/chosen": -872.1222534179688, + "logps/rejected": -1080.7010498046875, + "loss": 0.3102, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.10383415222168, + "rewards/margins": 2.146256685256958, + "rewards/rejected": -7.2500901222229, + "step": 479 + }, + { + "epoch": 0.6269899583639481, + "grad_norm": 21.848843169526116, + "learning_rate": 1.834593483662942e-07, + "logits/chosen": -1.0235671997070312, + "logits/rejected": -1.0022693872451782, + "logps/chosen": -783.5675048828125, + "logps/rejected": -991.603271484375, + "loss": 0.3159, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.475238800048828, + "rewards/margins": 2.0906448364257812, + "rewards/rejected": -6.565884113311768, + "step": 480 + }, + { + "epoch": 0.6282961874438729, + "grad_norm": 20.285807323673964, + "learning_rate": 1.823596572481856e-07, + "logits/chosen": -0.8799965977668762, + "logits/rejected": -0.865379273891449, + "logps/chosen": -883.8758544921875, + "logps/rejected": -1119.204345703125, + "loss": 0.3373, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.130545139312744, + "rewards/margins": 2.4770002365112305, + "rewards/rejected": -7.607545375823975, + "step": 481 + }, + { + "epoch": 0.6296024165237979, + "grad_norm": 23.36396355797146, + "learning_rate": 1.8126137648336042e-07, + "logits/chosen": -0.882436215877533, + "logits/rejected": -0.9245233535766602, + "logps/chosen": -784.2247924804688, + "logps/rejected": -1008.7647705078125, + "loss": 0.3069, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.217325687408447, + "rewards/margins": 2.155444622039795, + "rewards/rejected": -6.372769832611084, + "step": 482 + }, + { + "epoch": 0.6309086456037227, + "grad_norm": 16.44635250770276, + "learning_rate": 1.8016452897181899e-07, + "logits/chosen": -0.9179688692092896, + "logits/rejected": -0.9435247778892517, + "logps/chosen": -887.08056640625, + "logps/rejected": -1131.097412109375, + "loss": 0.2691, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.979192733764648, + "rewards/margins": 2.308243751525879, + "rewards/rejected": -7.2874369621276855, + "step": 483 + }, + { + "epoch": 0.6322148746836477, + "grad_norm": 26.002503785556364, + "learning_rate": 1.7906913758367743e-07, + "logits/chosen": -0.8960538506507874, + "logits/rejected": -0.9162447452545166, + "logps/chosen": -847.0087280273438, + "logps/rejected": -1033.681396484375, + "loss": 0.3612, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.641334533691406, + "rewards/margins": 2.035878896713257, + "rewards/rejected": -6.677213668823242, + "step": 484 + }, + { + "epoch": 0.6335211037635725, + "grad_norm": 47.32281221829127, + "learning_rate": 1.779752251586906e-07, + "logits/chosen": -0.9276028871536255, + "logits/rejected": -0.912039041519165, + "logps/chosen": -842.8237915039062, + "logps/rejected": -1087.3580322265625, + "loss": 0.3949, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.849575996398926, + "rewards/margins": 2.352696657180786, + "rewards/rejected": -7.202272415161133, + "step": 485 + }, + { + "epoch": 0.6348273328434975, + "grad_norm": 28.467391326181026, + "learning_rate": 1.7688281450577565e-07, + "logits/chosen": -0.9715847969055176, + "logits/rejected": -0.99041748046875, + "logps/chosen": -854.5145874023438, + "logps/rejected": -1045.529541015625, + "loss": 0.3979, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.9397478103637695, + "rewards/margins": 1.85337233543396, + "rewards/rejected": -6.79311990737915, + "step": 486 + }, + { + "epoch": 0.6361335619234223, + "grad_norm": 17.19454286875019, + "learning_rate": 1.7579192840253676e-07, + "logits/chosen": -0.8678796291351318, + "logits/rejected": -0.9223478436470032, + "logps/chosen": -828.3589477539062, + "logps/rejected": -1285.54638671875, + "loss": 0.2783, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.885687828063965, + "rewards/margins": 4.284566879272461, + "rewards/rejected": -9.17025375366211, + "step": 487 + }, + { + "epoch": 0.6374397910033472, + "grad_norm": 18.646551972970194, + "learning_rate": 1.7470258959478997e-07, + "logits/chosen": -0.9070701599121094, + "logits/rejected": -0.9281150102615356, + "logps/chosen": -845.6627807617188, + "logps/rejected": -1099.25244140625, + "loss": 0.2406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.664644718170166, + "rewards/margins": 2.3845531940460205, + "rewards/rejected": -7.049198150634766, + "step": 488 + }, + { + "epoch": 0.6387460200832721, + "grad_norm": 28.390001327673815, + "learning_rate": 1.7361482079608912e-07, + "logits/chosen": -0.8741916418075562, + "logits/rejected": -0.9074406623840332, + "logps/chosen": -759.44873046875, + "logps/rejected": -991.1700439453125, + "loss": 0.3513, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.629438877105713, + "rewards/margins": 2.0953075885772705, + "rewards/rejected": -6.724746227264404, + "step": 489 + }, + { + "epoch": 0.640052249163197, + "grad_norm": 19.202983618249952, + "learning_rate": 1.7252864468725217e-07, + "logits/chosen": -0.9950762987136841, + "logits/rejected": -0.9689397811889648, + "logps/chosen": -817.3816528320312, + "logps/rejected": -1081.819091796875, + "loss": 0.3163, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.948513507843018, + "rewards/margins": 2.7807250022888184, + "rewards/rejected": -7.729238033294678, + "step": 490 + }, + { + "epoch": 0.6413584782431219, + "grad_norm": 47.54669959337329, + "learning_rate": 1.7144408391588812e-07, + "logits/chosen": -0.9382389783859253, + "logits/rejected": -0.9015699625015259, + "logps/chosen": -884.1466674804688, + "logps/rejected": -1108.498291015625, + "loss": 0.4699, + "rewards/accuracies": 0.96875, + "rewards/chosen": -5.405143737792969, + "rewards/margins": 2.269254684448242, + "rewards/rejected": -7.674398422241211, + "step": 491 + }, + { + "epoch": 0.6426647073230468, + "grad_norm": 24.49787042714506, + "learning_rate": 1.7036116109592503e-07, + "logits/chosen": -0.9624199867248535, + "logits/rejected": -0.9387903213500977, + "logps/chosen": -866.46923828125, + "logps/rejected": -1160.3365478515625, + "loss": 0.4061, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.7858428955078125, + "rewards/margins": 3.046083927154541, + "rewards/rejected": -7.8319268226623535, + "step": 492 + }, + { + "epoch": 0.6439709364029716, + "grad_norm": 23.6266600811225, + "learning_rate": 1.692798988071385e-07, + "logits/chosen": -0.9590986967086792, + "logits/rejected": -0.9157624244689941, + "logps/chosen": -707.189697265625, + "logps/rejected": -848.1437377929688, + "loss": 0.3058, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.7680490016937256, + "rewards/margins": 1.6261415481567383, + "rewards/rejected": -5.394190788269043, + "step": 493 + }, + { + "epoch": 0.6452771654828966, + "grad_norm": 32.38766640788159, + "learning_rate": 1.6820031959468058e-07, + "logits/chosen": -0.8452016711235046, + "logits/rejected": -0.8521511554718018, + "logps/chosen": -761.6751708984375, + "logps/rejected": -1109.984375, + "loss": 0.3427, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.9624176025390625, + "rewards/margins": 3.2722549438476562, + "rewards/rejected": -8.234672546386719, + "step": 494 + }, + { + "epoch": 0.6465833945628214, + "grad_norm": 27.974709752171293, + "learning_rate": 1.6712244596861005e-07, + "logits/chosen": -0.9747927784919739, + "logits/rejected": -0.9465031027793884, + "logps/chosen": -825.25439453125, + "logps/rejected": -1000.7984008789062, + "loss": 0.3166, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.469265937805176, + "rewards/margins": 2.1552627086639404, + "rewards/rejected": -6.624528884887695, + "step": 495 + }, + { + "epoch": 0.6478896236427464, + "grad_norm": 20.810107529075697, + "learning_rate": 1.6604630040342287e-07, + "logits/chosen": -0.8734793663024902, + "logits/rejected": -0.8113576173782349, + "logps/chosen": -733.5122680664062, + "logps/rejected": -941.5518188476562, + "loss": 0.2349, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.06757116317749, + "rewards/margins": 2.2041683197021484, + "rewards/rejected": -6.271739482879639, + "step": 496 + }, + { + "epoch": 0.6491958527226712, + "grad_norm": 26.424860817633128, + "learning_rate": 1.6497190533758347e-07, + "logits/chosen": -0.8922603130340576, + "logits/rejected": -0.9254794120788574, + "logps/chosen": -755.695068359375, + "logps/rejected": -1069.649658203125, + "loss": 0.3252, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.36348819732666, + "rewards/margins": 2.7446978092193604, + "rewards/rejected": -7.108185768127441, + "step": 497 + }, + { + "epoch": 0.6505020818025962, + "grad_norm": 33.756785960357675, + "learning_rate": 1.6389928317305714e-07, + "logits/chosen": -0.9473923444747925, + "logits/rejected": -0.9284683465957642, + "logps/chosen": -743.8952026367188, + "logps/rejected": -949.4517211914062, + "loss": 0.4077, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.9511754512786865, + "rewards/margins": 2.0509424209594727, + "rewards/rejected": -6.00211763381958, + "step": 498 + }, + { + "epoch": 0.651808310882521, + "grad_norm": 18.712023115705826, + "learning_rate": 1.6282845627484286e-07, + "logits/chosen": -0.9681456089019775, + "logits/rejected": -1.0133285522460938, + "logps/chosen": -914.3257446289062, + "logps/rejected": -1230.6312255859375, + "loss": 0.3075, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.505870819091797, + "rewards/margins": 3.0627996921539307, + "rewards/rejected": -8.568670272827148, + "step": 499 + }, + { + "epoch": 0.653114539962446, + "grad_norm": 21.626442285320586, + "learning_rate": 1.6175944697050676e-07, + "logits/chosen": -0.7312138080596924, + "logits/rejected": -0.8125101327896118, + "logps/chosen": -764.114013671875, + "logps/rejected": -1055.6309814453125, + "loss": 0.2611, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.200470924377441, + "rewards/margins": 2.1423423290252686, + "rewards/rejected": -7.342813014984131, + "step": 500 + }, + { + "epoch": 0.653114539962446, + "eval_logits/chosen": -0.8074228167533875, + "eval_logits/rejected": -0.8136730790138245, + "eval_logps/chosen": -834.2015380859375, + "eval_logps/rejected": -1117.25146484375, + "eval_loss": 0.3457300364971161, + "eval_rewards/accuracies": 0.8859999775886536, + "eval_rewards/chosen": -4.901705265045166, + "eval_rewards/margins": 2.7695229053497314, + "eval_rewards/rejected": -7.67122745513916, + "eval_runtime": 303.8262, + "eval_samples_per_second": 6.583, + "eval_steps_per_second": 0.411, + "step": 500 + }, + { + "epoch": 0.6544207690423708, + "grad_norm": 35.83155605863081, + "learning_rate": 1.606922775497168e-07, + "logits/chosen": -0.8929469585418701, + "logits/rejected": -0.9043774604797363, + "logps/chosen": -784.7332763671875, + "logps/rejected": -1039.4635009765625, + "loss": 0.3203, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.490346431732178, + "rewards/margins": 2.162130117416382, + "rewards/rejected": -6.6524763107299805, + "step": 501 + }, + { + "epoch": 0.6557269981222957, + "grad_norm": 24.012110512122, + "learning_rate": 1.5962697026377808e-07, + "logits/chosen": -0.8780226707458496, + "logits/rejected": -0.9489363431930542, + "logps/chosen": -765.6043701171875, + "logps/rejected": -1039.3092041015625, + "loss": 0.3271, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.720160484313965, + "rewards/margins": 2.347363233566284, + "rewards/rejected": -7.06752347946167, + "step": 502 + }, + { + "epoch": 0.6570332272022206, + "grad_norm": 24.277681500510212, + "learning_rate": 1.5856354732516863e-07, + "logits/chosen": -0.9098899960517883, + "logits/rejected": -0.926539957523346, + "logps/chosen": -867.1011962890625, + "logps/rejected": -1170.328369140625, + "loss": 0.3981, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.973918914794922, + "rewards/margins": 2.8096470832824707, + "rewards/rejected": -7.783565044403076, + "step": 503 + }, + { + "epoch": 0.6583394562821455, + "grad_norm": 26.363019101162333, + "learning_rate": 1.575020309070763e-07, + "logits/chosen": -1.0532864332199097, + "logits/rejected": -1.0039781332015991, + "logps/chosen": -903.8301391601562, + "logps/rejected": -1048.2803955078125, + "loss": 0.404, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.257813930511475, + "rewards/margins": 1.9001696109771729, + "rewards/rejected": -7.157983779907227, + "step": 504 + }, + { + "epoch": 0.6596456853620704, + "grad_norm": 34.051380385152065, + "learning_rate": 1.564424431429367e-07, + "logits/chosen": -0.9329712390899658, + "logits/rejected": -0.9821687340736389, + "logps/chosen": -780.6395263671875, + "logps/rejected": -1030.6109619140625, + "loss": 0.4478, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.551027297973633, + "rewards/margins": 2.0627126693725586, + "rewards/rejected": -6.613739490509033, + "step": 505 + }, + { + "epoch": 0.6609519144419953, + "grad_norm": 18.91185817042615, + "learning_rate": 1.553848061259715e-07, + "logits/chosen": -0.9905709624290466, + "logits/rejected": -0.9948655366897583, + "logps/chosen": -721.0912475585938, + "logps/rejected": -946.108642578125, + "loss": 0.3625, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.270671367645264, + "rewards/margins": 2.0259008407592773, + "rewards/rejected": -6.296573162078857, + "step": 506 + }, + { + "epoch": 0.6622581435219201, + "grad_norm": 21.4133638867673, + "learning_rate": 1.5432914190872756e-07, + "logits/chosen": -0.9507661461830139, + "logits/rejected": -0.9466673135757446, + "logps/chosen": -741.1168823242188, + "logps/rejected": -1028.7220458984375, + "loss": 0.3489, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.295059680938721, + "rewards/margins": 2.8194098472595215, + "rewards/rejected": -7.114469528198242, + "step": 507 + }, + { + "epoch": 0.6635643726018451, + "grad_norm": 16.850885406295667, + "learning_rate": 1.5327547250261764e-07, + "logits/chosen": -1.0218459367752075, + "logits/rejected": -1.0179591178894043, + "logps/chosen": -858.17333984375, + "logps/rejected": -1121.5169677734375, + "loss": 0.3068, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.725369930267334, + "rewards/margins": 2.5118465423583984, + "rewards/rejected": -7.237216472625732, + "step": 508 + }, + { + "epoch": 0.6648706016817699, + "grad_norm": 18.433618884607107, + "learning_rate": 1.5222381987746102e-07, + "logits/chosen": -0.9506387114524841, + "logits/rejected": -1.002458095550537, + "logps/chosen": -795.8396606445312, + "logps/rejected": -1047.6082763671875, + "loss": 0.3417, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.511249542236328, + "rewards/margins": 2.4017834663391113, + "rewards/rejected": -6.9130330085754395, + "step": 509 + }, + { + "epoch": 0.6661768307616949, + "grad_norm": 22.548196729659644, + "learning_rate": 1.5117420596102548e-07, + "logits/chosen": -0.8478186130523682, + "logits/rejected": -0.9220375418663025, + "logps/chosen": -742.0947265625, + "logps/rejected": -1046.636474609375, + "loss": 0.2665, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.394111633300781, + "rewards/margins": 2.5048062801361084, + "rewards/rejected": -6.8989176750183105, + "step": 510 + }, + { + "epoch": 0.6674830598416197, + "grad_norm": 16.478239174352726, + "learning_rate": 1.501266526385702e-07, + "logits/chosen": -1.0730950832366943, + "logits/rejected": -1.092372179031372, + "logps/chosen": -767.8289184570312, + "logps/rejected": -985.9404296875, + "loss": 0.3245, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.549228668212891, + "rewards/margins": 2.1183018684387207, + "rewards/rejected": -6.667530059814453, + "step": 511 + }, + { + "epoch": 0.6687892889215447, + "grad_norm": 23.638655193847455, + "learning_rate": 1.490811817523896e-07, + "logits/chosen": -0.9937188029289246, + "logits/rejected": -1.0091791152954102, + "logps/chosen": -847.7296142578125, + "logps/rejected": -1097.6788330078125, + "loss": 0.3033, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.684628963470459, + "rewards/margins": 2.2222049236297607, + "rewards/rejected": -6.906834125518799, + "step": 512 + }, + { + "epoch": 0.6700955180014695, + "grad_norm": 25.469595664561382, + "learning_rate": 1.4803781510135722e-07, + "logits/chosen": -0.8402674198150635, + "logits/rejected": -0.8698500394821167, + "logps/chosen": -856.8892822265625, + "logps/rejected": -1082.5703125, + "loss": 0.3428, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.897946357727051, + "rewards/margins": 2.5066497325897217, + "rewards/rejected": -7.404595851898193, + "step": 513 + }, + { + "epoch": 0.6714017470813944, + "grad_norm": 23.476193663048107, + "learning_rate": 1.4699657444047213e-07, + "logits/chosen": -0.9435025453567505, + "logits/rejected": -0.8921396732330322, + "logps/chosen": -774.4256591796875, + "logps/rejected": -1036.7841796875, + "loss": 0.3501, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.357728004455566, + "rewards/margins": 2.8723790645599365, + "rewards/rejected": -7.230106353759766, + "step": 514 + }, + { + "epoch": 0.6727079761613193, + "grad_norm": 18.52575101879441, + "learning_rate": 1.4595748148040465e-07, + "logits/chosen": -0.9316481947898865, + "logits/rejected": -0.871412456035614, + "logps/chosen": -797.5765380859375, + "logps/rejected": -989.5557861328125, + "loss": 0.3503, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.4651994705200195, + "rewards/margins": 2.085799217224121, + "rewards/rejected": -6.550998210906982, + "step": 515 + }, + { + "epoch": 0.6740142052412442, + "grad_norm": 23.95953824389779, + "learning_rate": 1.4492055788704394e-07, + "logits/chosen": -0.8964416980743408, + "logits/rejected": -0.9257286190986633, + "logps/chosen": -900.4140625, + "logps/rejected": -1141.0982666015625, + "loss": 0.2902, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.290503025054932, + "rewards/margins": 2.3838093280792236, + "rewards/rejected": -7.674312591552734, + "step": 516 + }, + { + "epoch": 0.6753204343211691, + "grad_norm": 20.124074172997577, + "learning_rate": 1.4388582528104627e-07, + "logits/chosen": -0.8705368041992188, + "logits/rejected": -0.8657253980636597, + "logps/chosen": -774.8510131835938, + "logps/rejected": -1063.6641845703125, + "loss": 0.3582, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.379948616027832, + "rewards/margins": 2.763211250305176, + "rewards/rejected": -7.143159866333008, + "step": 517 + }, + { + "epoch": 0.676626663401094, + "grad_norm": 23.85216054356757, + "learning_rate": 1.4285330523738385e-07, + "logits/chosen": -0.8885217308998108, + "logits/rejected": -0.9661125540733337, + "logps/chosen": -780.2984619140625, + "logps/rejected": -1220.000732421875, + "loss": 0.2674, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.0980353355407715, + "rewards/margins": 3.5056042671203613, + "rewards/rejected": -7.603639125823975, + "step": 518 + }, + { + "epoch": 0.6779328924810188, + "grad_norm": 23.081844702273408, + "learning_rate": 1.4182301928489554e-07, + "logits/chosen": -0.9551963210105896, + "logits/rejected": -1.0183593034744263, + "logps/chosen": -836.30126953125, + "logps/rejected": -1116.5960693359375, + "loss": 0.2732, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.809637069702148, + "rewards/margins": 2.4394960403442383, + "rewards/rejected": -7.2491326332092285, + "step": 519 + }, + { + "epoch": 0.6792391215609438, + "grad_norm": 22.095622056118994, + "learning_rate": 1.4079498890583762e-07, + "logits/chosen": -0.9430717825889587, + "logits/rejected": -0.9482989311218262, + "logps/chosen": -835.6499633789062, + "logps/rejected": -1055.31396484375, + "loss": 0.3049, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.793646812438965, + "rewards/margins": 2.296147346496582, + "rewards/rejected": -7.089794158935547, + "step": 520 + }, + { + "epoch": 0.6805453506408686, + "grad_norm": 20.717133805005673, + "learning_rate": 1.3976923553543585e-07, + "logits/chosen": -1.1042985916137695, + "logits/rejected": -1.0969711542129517, + "logps/chosen": -912.5108642578125, + "logps/rejected": -1019.5659790039062, + "loss": 0.4057, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.131659507751465, + "rewards/margins": 1.0571504831314087, + "rewards/rejected": -6.188809871673584, + "step": 521 + }, + { + "epoch": 0.6818515797207936, + "grad_norm": 18.213109701212694, + "learning_rate": 1.387457805614387e-07, + "logits/chosen": -0.8132377862930298, + "logits/rejected": -0.8710845708847046, + "logps/chosen": -770.2957763671875, + "logps/rejected": -1096.7030029296875, + "loss": 0.3482, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.406842231750488, + "rewards/margins": 2.6769254207611084, + "rewards/rejected": -7.083767890930176, + "step": 522 + }, + { + "epoch": 0.6831578088007184, + "grad_norm": 18.85695986546673, + "learning_rate": 1.3772464532367123e-07, + "logits/chosen": -1.0416616201400757, + "logits/rejected": -0.9788161516189575, + "logps/chosen": -803.7244873046875, + "logps/rejected": -996.6585083007812, + "loss": 0.2956, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.829257965087891, + "rewards/margins": 2.251364231109619, + "rewards/rejected": -7.08062219619751, + "step": 523 + }, + { + "epoch": 0.6844640378806434, + "grad_norm": 25.509597048268557, + "learning_rate": 1.3670585111359034e-07, + "logits/chosen": -1.050762414932251, + "logits/rejected": -1.0215251445770264, + "logps/chosen": -797.5689086914062, + "logps/rejected": -912.2422485351562, + "loss": 0.3768, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.425357818603516, + "rewards/margins": 1.3879318237304688, + "rewards/rejected": -5.813289642333984, + "step": 524 + }, + { + "epoch": 0.6857702669605682, + "grad_norm": 31.852282652245847, + "learning_rate": 1.3568941917384036e-07, + "logits/chosen": -0.9219133853912354, + "logits/rejected": -0.9097141027450562, + "logps/chosen": -750.3033447265625, + "logps/rejected": -955.424560546875, + "loss": 0.3827, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.9384779930114746, + "rewards/margins": 1.7265440225601196, + "rewards/rejected": -5.665022373199463, + "step": 525 + }, + { + "epoch": 0.6870764960404931, + "grad_norm": 27.83642396092612, + "learning_rate": 1.3467537069781083e-07, + "logits/chosen": -0.8348858952522278, + "logits/rejected": -0.912026584148407, + "logps/chosen": -755.4243774414062, + "logps/rejected": -1065.1910400390625, + "loss": 0.3305, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.685622692108154, + "rewards/margins": 2.498351812362671, + "rewards/rejected": -7.183974266052246, + "step": 526 + }, + { + "epoch": 0.688382725120418, + "grad_norm": 20.154859857966876, + "learning_rate": 1.3366372682919413e-07, + "logits/chosen": -0.9479906558990479, + "logits/rejected": -1.0296324491500854, + "logps/chosen": -764.8931274414062, + "logps/rejected": -997.9901123046875, + "loss": 0.3715, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.2235212326049805, + "rewards/margins": 1.9003543853759766, + "rewards/rejected": -6.123875617980957, + "step": 527 + }, + { + "epoch": 0.6896889542003429, + "grad_norm": 43.509262990869395, + "learning_rate": 1.3265450866154465e-07, + "logits/chosen": -0.8818020820617676, + "logits/rejected": -0.8966426849365234, + "logps/chosen": -793.8504028320312, + "logps/rejected": -989.6928100585938, + "loss": 0.2804, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.28352165222168, + "rewards/margins": 2.0015485286712646, + "rewards/rejected": -6.285069465637207, + "step": 528 + }, + { + "epoch": 0.6909951832802678, + "grad_norm": 22.370295801986263, + "learning_rate": 1.3164773723783916e-07, + "logits/chosen": -0.8573867082595825, + "logits/rejected": -0.9925711154937744, + "logps/chosen": -731.441650390625, + "logps/rejected": -1192.9609375, + "loss": 0.2617, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.299685955047607, + "rewards/margins": 3.543790578842163, + "rewards/rejected": -7.843476295471191, + "step": 529 + }, + { + "epoch": 0.6923014123601927, + "grad_norm": 30.337274628575507, + "learning_rate": 1.3064343355003773e-07, + "logits/chosen": -1.0423648357391357, + "logits/rejected": -1.0021618604660034, + "logps/chosen": -769.501220703125, + "logps/rejected": -996.4588623046875, + "loss": 0.2833, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.6555285453796387, + "rewards/margins": 2.304563045501709, + "rewards/rejected": -5.960091590881348, + "step": 530 + }, + { + "epoch": 0.6936076414401176, + "grad_norm": 21.726072264869213, + "learning_rate": 1.2964161853864652e-07, + "logits/chosen": -0.9621329307556152, + "logits/rejected": -0.9972319006919861, + "logps/chosen": -807.4346923828125, + "logps/rejected": -1011.5498046875, + "loss": 0.3271, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.329762935638428, + "rewards/margins": 2.0207858085632324, + "rewards/rejected": -6.350549221038818, + "step": 531 + }, + { + "epoch": 0.6949138705200425, + "grad_norm": 16.228236624062486, + "learning_rate": 1.2864231309228055e-07, + "logits/chosen": -0.9178920388221741, + "logits/rejected": -0.9607722759246826, + "logps/chosen": -800.65185546875, + "logps/rejected": -1050.902587890625, + "loss": 0.371, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.402060031890869, + "rewards/margins": 2.257364511489868, + "rewards/rejected": -6.659424304962158, + "step": 532 + }, + { + "epoch": 0.6962200995999673, + "grad_norm": 17.11412863341009, + "learning_rate": 1.2764553804722867e-07, + "logits/chosen": -0.8288196325302124, + "logits/rejected": -0.7849943041801453, + "logps/chosen": -822.9666748046875, + "logps/rejected": -1016.3614501953125, + "loss": 0.3288, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.726102828979492, + "rewards/margins": 2.1954498291015625, + "rewards/rejected": -6.9215521812438965, + "step": 533 + }, + { + "epoch": 0.6975263286798923, + "grad_norm": 17.168886769292737, + "learning_rate": 1.2665131418701896e-07, + "logits/chosen": -0.9636877179145813, + "logits/rejected": -0.8900684118270874, + "logps/chosen": -831.2034912109375, + "logps/rejected": -1021.8273315429688, + "loss": 0.3186, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.344829559326172, + "rewards/margins": 2.0918314456939697, + "rewards/rejected": -6.436661720275879, + "step": 534 + }, + { + "epoch": 0.6988325577598171, + "grad_norm": 21.860374335135038, + "learning_rate": 1.2565966224198518e-07, + "logits/chosen": -0.9835441708564758, + "logits/rejected": -1.0193074941635132, + "logps/chosen": -773.3865356445312, + "logps/rejected": -947.406494140625, + "loss": 0.2801, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.3468403816223145, + "rewards/margins": 1.6079427003860474, + "rewards/rejected": -5.954782485961914, + "step": 535 + }, + { + "epoch": 0.7001387868397421, + "grad_norm": 18.01057131898774, + "learning_rate": 1.246706028888348e-07, + "logits/chosen": -0.9090981483459473, + "logits/rejected": -0.958797812461853, + "logps/chosen": -801.5009155273438, + "logps/rejected": -1012.5660400390625, + "loss": 0.341, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.3776373863220215, + "rewards/margins": 1.7712655067443848, + "rewards/rejected": -6.1489033699035645, + "step": 536 + }, + { + "epoch": 0.7014450159196669, + "grad_norm": 25.088999958240304, + "learning_rate": 1.2368415675021768e-07, + "logits/chosen": -1.0301973819732666, + "logits/rejected": -1.0415173768997192, + "logps/chosen": -846.6824340820312, + "logps/rejected": -1005.0170288085938, + "loss": 0.3147, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.839428424835205, + "rewards/margins": 1.6613373756408691, + "rewards/rejected": -6.500766277313232, + "step": 537 + }, + { + "epoch": 0.7027512449995919, + "grad_norm": 24.708438696063062, + "learning_rate": 1.2270034439429623e-07, + "logits/chosen": -0.9158374667167664, + "logits/rejected": -0.9857248067855835, + "logps/chosen": -898.869384765625, + "logps/rejected": -1202.664306640625, + "loss": 0.3081, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.038189888000488, + "rewards/margins": 2.465287923812866, + "rewards/rejected": -7.503477573394775, + "step": 538 + }, + { + "epoch": 0.7040574740795167, + "grad_norm": 23.937697291164447, + "learning_rate": 1.2171918633431622e-07, + "logits/chosen": -0.9632992148399353, + "logits/rejected": -0.9589728713035583, + "logps/chosen": -894.701904296875, + "logps/rejected": -1180.599609375, + "loss": 0.2895, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.9801764488220215, + "rewards/margins": 2.669917345046997, + "rewards/rejected": -7.650093078613281, + "step": 539 + }, + { + "epoch": 0.7053637031594416, + "grad_norm": 26.25664570501424, + "learning_rate": 1.2074070302817959e-07, + "logits/chosen": -1.0312018394470215, + "logits/rejected": -1.0125337839126587, + "logps/chosen": -776.10400390625, + "logps/rejected": -995.2510986328125, + "loss": 0.2686, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.570556163787842, + "rewards/margins": 2.495591402053833, + "rewards/rejected": -7.066147804260254, + "step": 540 + }, + { + "epoch": 0.7066699322393665, + "grad_norm": 24.597234782614173, + "learning_rate": 1.1976491487801746e-07, + "logits/chosen": -0.7603006362915039, + "logits/rejected": -0.8024336099624634, + "logps/chosen": -772.5233154296875, + "logps/rejected": -1199.2117919921875, + "loss": 0.3024, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.72584867477417, + "rewards/margins": 3.7751638889312744, + "rewards/rejected": -8.501012802124023, + "step": 541 + }, + { + "epoch": 0.7079761613192914, + "grad_norm": 35.3757532271724, + "learning_rate": 1.1879184222976488e-07, + "logits/chosen": -0.9676728248596191, + "logits/rejected": -0.9333958625793457, + "logps/chosen": -862.1279296875, + "logps/rejected": -1095.59912109375, + "loss": 0.3705, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.918542861938477, + "rewards/margins": 2.3512566089630127, + "rewards/rejected": -7.26979923248291, + "step": 542 + }, + { + "epoch": 0.7092823903992163, + "grad_norm": 24.764732292716715, + "learning_rate": 1.1782150537273664e-07, + "logits/chosen": -1.0130747556686401, + "logits/rejected": -0.9589080810546875, + "logps/chosen": -920.9356689453125, + "logps/rejected": -1052.5150146484375, + "loss": 0.3492, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.504761695861816, + "rewards/margins": 1.7927732467651367, + "rewards/rejected": -7.297534465789795, + "step": 543 + }, + { + "epoch": 0.7105886194791412, + "grad_norm": 21.974575796244796, + "learning_rate": 1.168539245392042e-07, + "logits/chosen": -0.9473114609718323, + "logits/rejected": -0.8918843865394592, + "logps/chosen": -876.5565795898438, + "logps/rejected": -1087.864990234375, + "loss": 0.3665, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.10919189453125, + "rewards/margins": 2.627614736557007, + "rewards/rejected": -7.736806869506836, + "step": 544 + }, + { + "epoch": 0.711894848559066, + "grad_norm": 17.97992792815349, + "learning_rate": 1.1588911990397362e-07, + "logits/chosen": -0.9870251417160034, + "logits/rejected": -1.0186010599136353, + "logps/chosen": -835.75830078125, + "logps/rejected": -1067.68505859375, + "loss": 0.2964, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.034979820251465, + "rewards/margins": 2.080564498901367, + "rewards/rejected": -7.11554479598999, + "step": 545 + }, + { + "epoch": 0.713201077638991, + "grad_norm": 29.36410303717562, + "learning_rate": 1.1492711158396523e-07, + "logits/chosen": -0.9277634024620056, + "logits/rejected": -0.9040770530700684, + "logps/chosen": -778.7317504882812, + "logps/rejected": -1031.9451904296875, + "loss": 0.389, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.545460224151611, + "rewards/margins": 2.5905895233154297, + "rewards/rejected": -7.136049270629883, + "step": 546 + }, + { + "epoch": 0.7145073067189158, + "grad_norm": 17.973634079399982, + "learning_rate": 1.1396791963779409e-07, + "logits/chosen": -0.8808873891830444, + "logits/rejected": -0.9185099601745605, + "logps/chosen": -859.2599487304688, + "logps/rejected": -1208.873046875, + "loss": 0.2681, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.937643051147461, + "rewards/margins": 2.997467041015625, + "rewards/rejected": -7.935110092163086, + "step": 547 + }, + { + "epoch": 0.7158135357988408, + "grad_norm": 25.10421767660218, + "learning_rate": 1.1301156406535156e-07, + "logits/chosen": -0.8444209694862366, + "logits/rejected": -0.9109209775924683, + "logps/chosen": -869.1829833984375, + "logps/rejected": -1528.3863525390625, + "loss": 0.2826, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.416234016418457, + "rewards/margins": 5.921084403991699, + "rewards/rejected": -11.337318420410156, + "step": 548 + }, + { + "epoch": 0.7171197648787656, + "grad_norm": 26.637099440513637, + "learning_rate": 1.120580648073885e-07, + "logits/chosen": -1.0137161016464233, + "logits/rejected": -0.9916458129882812, + "logps/chosen": -847.381591796875, + "logps/rejected": -1100.040283203125, + "loss": 0.3916, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.878036975860596, + "rewards/margins": 2.2340753078460693, + "rewards/rejected": -7.112112045288086, + "step": 549 + }, + { + "epoch": 0.7184259939586906, + "grad_norm": 17.61880269068818, + "learning_rate": 1.1110744174509951e-07, + "logits/chosen": -0.897437572479248, + "logits/rejected": -0.8928896188735962, + "logps/chosen": -867.4434204101562, + "logps/rejected": -1089.23193359375, + "loss": 0.3098, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.121048450469971, + "rewards/margins": 2.1236259937286377, + "rewards/rejected": -7.2446746826171875, + "step": 550 + }, + { + "epoch": 0.7197322230386154, + "grad_norm": 15.95782153940809, + "learning_rate": 1.1015971469970795e-07, + "logits/chosen": -0.9469540119171143, + "logits/rejected": -0.9789649844169617, + "logps/chosen": -829.0692138671875, + "logps/rejected": -1165.4215087890625, + "loss": 0.2877, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.792716026306152, + "rewards/margins": 3.257359027862549, + "rewards/rejected": -8.05007553100586, + "step": 551 + }, + { + "epoch": 0.7210384521185403, + "grad_norm": 23.122374666528035, + "learning_rate": 1.0921490343205333e-07, + "logits/chosen": -1.0128257274627686, + "logits/rejected": -1.0176877975463867, + "logps/chosen": -916.9769287109375, + "logps/rejected": -1195.42578125, + "loss": 0.3506, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.4742326736450195, + "rewards/margins": 2.6680378913879395, + "rewards/rejected": -8.1422700881958, + "step": 552 + }, + { + "epoch": 0.7223446811984652, + "grad_norm": 26.270798765214032, + "learning_rate": 1.0827302764217886e-07, + "logits/chosen": -0.9295449256896973, + "logits/rejected": -0.9562668800354004, + "logps/chosen": -891.7932739257812, + "logps/rejected": -1106.447509765625, + "loss": 0.3752, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.4281229972839355, + "rewards/margins": 2.2159817218780518, + "rewards/rejected": -7.644104957580566, + "step": 553 + }, + { + "epoch": 0.7236509102783901, + "grad_norm": 23.32404619948412, + "learning_rate": 1.0733410696892072e-07, + "logits/chosen": -0.8652491569519043, + "logits/rejected": -0.8973567485809326, + "logps/chosen": -811.0621948242188, + "logps/rejected": -1088.7923583984375, + "loss": 0.3382, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.997215270996094, + "rewards/margins": 2.428532838821411, + "rewards/rejected": -7.425748348236084, + "step": 554 + }, + { + "epoch": 0.724957139358315, + "grad_norm": 21.973497977464827, + "learning_rate": 1.063981609894987e-07, + "logits/chosen": -0.8850119709968567, + "logits/rejected": -0.8317074775695801, + "logps/chosen": -895.1964721679688, + "logps/rejected": -1150.310791015625, + "loss": 0.3526, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.471782207489014, + "rewards/margins": 2.4418041706085205, + "rewards/rejected": -7.913586616516113, + "step": 555 + }, + { + "epoch": 0.7262633684382399, + "grad_norm": 35.91689983958402, + "learning_rate": 1.0546520921910784e-07, + "logits/chosen": -0.9072127342224121, + "logits/rejected": -0.8466538190841675, + "logps/chosen": -877.4606323242188, + "logps/rejected": -1109.957763671875, + "loss": 0.4265, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.126779079437256, + "rewards/margins": 2.529775381088257, + "rewards/rejected": -7.656554698944092, + "step": 556 + }, + { + "epoch": 0.7275695975181647, + "grad_norm": 23.511498465014164, + "learning_rate": 1.0453527111051183e-07, + "logits/chosen": -0.9427922964096069, + "logits/rejected": -0.939167320728302, + "logps/chosen": -806.1412353515625, + "logps/rejected": -1016.7940063476562, + "loss": 0.3023, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.077123165130615, + "rewards/margins": 2.114588499069214, + "rewards/rejected": -7.191711902618408, + "step": 557 + }, + { + "epoch": 0.7288758265980897, + "grad_norm": 24.44477376498831, + "learning_rate": 1.0360836605363679e-07, + "logits/chosen": -0.9482549428939819, + "logits/rejected": -0.9721497893333435, + "logps/chosen": -800.2005615234375, + "logps/rejected": -1005.4768676757812, + "loss": 0.355, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.509579181671143, + "rewards/margins": 1.8559819459915161, + "rewards/rejected": -6.365560531616211, + "step": 558 + }, + { + "epoch": 0.7301820556780145, + "grad_norm": 21.167159308599608, + "learning_rate": 1.0268451337516773e-07, + "logits/chosen": -1.0456604957580566, + "logits/rejected": -1.012890338897705, + "logps/chosen": -903.2613525390625, + "logps/rejected": -1080.0423583984375, + "loss": 0.3808, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.362766265869141, + "rewards/margins": 2.1328072547912598, + "rewards/rejected": -7.495573997497559, + "step": 559 + }, + { + "epoch": 0.7314882847579395, + "grad_norm": 39.859419738396554, + "learning_rate": 1.0176373233814509e-07, + "logits/chosen": -0.9403766393661499, + "logits/rejected": -0.9335325360298157, + "logps/chosen": -843.5707397460938, + "logps/rejected": -1124.05322265625, + "loss": 0.3127, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.752773761749268, + "rewards/margins": 2.5625851154327393, + "rewards/rejected": -7.315358638763428, + "step": 560 + }, + { + "epoch": 0.7327945138378643, + "grad_norm": 29.279533102730955, + "learning_rate": 1.0084604214156322e-07, + "logits/chosen": -0.967137336730957, + "logits/rejected": -0.9893122315406799, + "logps/chosen": -775.4454345703125, + "logps/rejected": -981.1903686523438, + "loss": 0.2862, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.9357616901397705, + "rewards/margins": 2.174323558807373, + "rewards/rejected": -6.1100850105285645, + "step": 561 + }, + { + "epoch": 0.7341007429177893, + "grad_norm": 20.628888387982226, + "learning_rate": 9.99314619199701e-08, + "logits/chosen": -0.8735795617103577, + "logits/rejected": -0.9275645017623901, + "logps/chosen": -814.8197021484375, + "logps/rejected": -1053.7369384765625, + "loss": 0.3197, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.9540815353393555, + "rewards/margins": 2.201838493347168, + "rewards/rejected": -7.155920028686523, + "step": 562 + }, + { + "epoch": 0.7354069719977141, + "grad_norm": 28.19914485268819, + "learning_rate": 9.902001074306834e-08, + "logits/chosen": -0.9230560064315796, + "logits/rejected": -0.9692875146865845, + "logps/chosen": -799.16162109375, + "logps/rejected": -1046.1767578125, + "loss": 0.3496, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.5525031089782715, + "rewards/margins": 2.366610527038574, + "rewards/rejected": -6.919114112854004, + "step": 563 + }, + { + "epoch": 0.736713201077639, + "grad_norm": 18.395440299034046, + "learning_rate": 9.811170761531739e-08, + "logits/chosen": -0.857429027557373, + "logits/rejected": -0.9134597182273865, + "logps/chosen": -754.7802734375, + "logps/rejected": -1091.92919921875, + "loss": 0.2861, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.486932754516602, + "rewards/margins": 3.0629043579101562, + "rewards/rejected": -7.549837112426758, + "step": 564 + }, + { + "epoch": 0.7380194301575639, + "grad_norm": 16.713239429608148, + "learning_rate": 9.720657147553767e-08, + "logits/chosen": -0.874890923500061, + "logits/rejected": -0.8897607922554016, + "logps/chosen": -796.7207641601562, + "logps/rejected": -1120.3206787109375, + "loss": 0.2626, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.593537330627441, + "rewards/margins": 2.665463447570801, + "rewards/rejected": -7.259000778198242, + "step": 565 + }, + { + "epoch": 0.7393256592374887, + "grad_norm": 20.5336851623424, + "learning_rate": 9.630462119651537e-08, + "logits/chosen": -0.8550716638565063, + "logits/rejected": -0.8601816892623901, + "logps/chosen": -947.548828125, + "logps/rejected": -1085.7291259765625, + "loss": 0.3776, + "rewards/accuracies": 0.65625, + "rewards/chosen": -5.3931732177734375, + "rewards/margins": 1.4523639678955078, + "rewards/rejected": -6.845536708831787, + "step": 566 + }, + { + "epoch": 0.7406318883174137, + "grad_norm": 20.707084475190243, + "learning_rate": 9.5405875584609e-08, + "logits/chosen": -0.922339677810669, + "logits/rejected": -0.9448087811470032, + "logps/chosen": -823.510498046875, + "logps/rejected": -1043.0206298828125, + "loss": 0.2913, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.648580551147461, + "rewards/margins": 2.157121419906616, + "rewards/rejected": -6.805701732635498, + "step": 567 + }, + { + "epoch": 0.7419381173973385, + "grad_norm": 16.852464603891658, + "learning_rate": 9.451035337935731e-08, + "logits/chosen": -0.8741306066513062, + "logits/rejected": -0.9167773723602295, + "logps/chosen": -726.5533447265625, + "logps/rejected": -1080.0703125, + "loss": 0.2901, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.9515931606292725, + "rewards/margins": 3.231985092163086, + "rewards/rejected": -7.183577537536621, + "step": 568 + }, + { + "epoch": 0.7432443464772635, + "grad_norm": 17.208831278317064, + "learning_rate": 9.36180732530886e-08, + "logits/chosen": -0.9364518523216248, + "logits/rejected": -0.9597791433334351, + "logps/chosen": -785.6038818359375, + "logps/rejected": -1091.4915771484375, + "loss": 0.3465, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.260547161102295, + "rewards/margins": 2.8101701736450195, + "rewards/rejected": -7.070717811584473, + "step": 569 + }, + { + "epoch": 0.7445505755571883, + "grad_norm": 20.57132249050924, + "learning_rate": 9.272905381053131e-08, + "logits/chosen": -0.9936679601669312, + "logits/rejected": -0.9452012777328491, + "logps/chosen": -783.473388671875, + "logps/rejected": -977.3974609375, + "loss": 0.4034, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.772213935852051, + "rewards/margins": 2.122070550918579, + "rewards/rejected": -6.894284248352051, + "step": 570 + }, + { + "epoch": 0.7458568046371132, + "grad_norm": 24.488955872960492, + "learning_rate": 9.184331358842592e-08, + "logits/chosen": -1.1170083284378052, + "logits/rejected": -1.1208350658416748, + "logps/chosen": -913.275146484375, + "logps/rejected": -1134.02734375, + "loss": 0.3542, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.806280612945557, + "rewards/margins": 2.3573532104492188, + "rewards/rejected": -7.163634300231934, + "step": 571 + }, + { + "epoch": 0.7471630337170381, + "grad_norm": 24.774291129440638, + "learning_rate": 9.096087105513894e-08, + "logits/chosen": -0.9611457586288452, + "logits/rejected": -0.9745159149169922, + "logps/chosen": -850.2299194335938, + "logps/rejected": -1079.2684326171875, + "loss": 0.3096, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.939201354980469, + "rewards/margins": 1.692257046699524, + "rewards/rejected": -6.631458282470703, + "step": 572 + }, + { + "epoch": 0.748469262796963, + "grad_norm": 21.002407149688135, + "learning_rate": 9.008174461027723e-08, + "logits/chosen": -0.8872397541999817, + "logits/rejected": -0.9736205339431763, + "logps/chosen": -820.9461669921875, + "logps/rejected": -1208.108642578125, + "loss": 0.3514, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.033115863800049, + "rewards/margins": 3.3874659538269043, + "rewards/rejected": -8.420580863952637, + "step": 573 + }, + { + "epoch": 0.7497754918768879, + "grad_norm": 32.13967455164451, + "learning_rate": 8.920595258430486e-08, + "logits/chosen": -0.825206458568573, + "logits/rejected": -0.8276383876800537, + "logps/chosen": -889.4954833984375, + "logps/rejected": -1139.07666015625, + "loss": 0.38, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.5783257484436035, + "rewards/margins": 2.063443183898926, + "rewards/rejected": -7.641768932342529, + "step": 574 + }, + { + "epoch": 0.7510817209568128, + "grad_norm": 28.60867511694671, + "learning_rate": 8.833351323816063e-08, + "logits/chosen": -0.9238741397857666, + "logits/rejected": -0.9454509019851685, + "logps/chosen": -878.4554443359375, + "logps/rejected": -1141.947265625, + "loss": 0.3943, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.250237941741943, + "rewards/margins": 2.2067952156066895, + "rewards/rejected": -7.457033157348633, + "step": 575 + }, + { + "epoch": 0.7523879500367376, + "grad_norm": 16.26873259584617, + "learning_rate": 8.746444476287737e-08, + "logits/chosen": -1.0511510372161865, + "logits/rejected": -1.057821273803711, + "logps/chosen": -857.7843627929688, + "logps/rejected": -1114.47607421875, + "loss": 0.3172, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.964174747467041, + "rewards/margins": 2.3061084747314453, + "rewards/rejected": -7.2702836990356445, + "step": 576 + }, + { + "epoch": 0.7536941791166626, + "grad_norm": 19.308146548606004, + "learning_rate": 8.659876527920276e-08, + "logits/chosen": -0.9624135494232178, + "logits/rejected": -0.9660477638244629, + "logps/chosen": -785.8853759765625, + "logps/rejected": -972.9590454101562, + "loss": 0.3061, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.485295295715332, + "rewards/margins": 2.169145107269287, + "rewards/rejected": -6.654440402984619, + "step": 577 + }, + { + "epoch": 0.7550004081965874, + "grad_norm": 21.61560102039662, + "learning_rate": 8.573649283722115e-08, + "logits/chosen": -0.7940883040428162, + "logits/rejected": -0.8307343125343323, + "logps/chosen": -768.4230346679688, + "logps/rejected": -1035.4759521484375, + "loss": 0.3014, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.916931629180908, + "rewards/margins": 2.457148551940918, + "rewards/rejected": -7.374079704284668, + "step": 578 + }, + { + "epoch": 0.7563066372765124, + "grad_norm": 22.88690338836935, + "learning_rate": 8.487764541597764e-08, + "logits/chosen": -0.9109131693840027, + "logits/rejected": -0.9031786918640137, + "logps/chosen": -765.204833984375, + "logps/rejected": -1008.220703125, + "loss": 0.3464, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.222220420837402, + "rewards/margins": 2.112513303756714, + "rewards/rejected": -6.3347344398498535, + "step": 579 + }, + { + "epoch": 0.7576128663564372, + "grad_norm": 21.862872793760197, + "learning_rate": 8.402224092310297e-08, + "logits/chosen": -0.9250282049179077, + "logits/rejected": -0.9852439165115356, + "logps/chosen": -871.0116577148438, + "logps/rejected": -1154.585693359375, + "loss": 0.3678, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.070359230041504, + "rewards/margins": 2.512484312057495, + "rewards/rejected": -7.58284330368042, + "step": 580 + }, + { + "epoch": 0.7589190954363622, + "grad_norm": 19.942880125656473, + "learning_rate": 8.317029719444016e-08, + "logits/chosen": -0.9003323316574097, + "logits/rejected": -0.8879726529121399, + "logps/chosen": -813.396484375, + "logps/rejected": -1111.6331787109375, + "loss": 0.3085, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.536358833312988, + "rewards/margins": 2.7713277339935303, + "rewards/rejected": -7.307686805725098, + "step": 581 + }, + { + "epoch": 0.760225324516287, + "grad_norm": 45.19702149073493, + "learning_rate": 8.232183199367265e-08, + "logits/chosen": -0.8801093101501465, + "logits/rejected": -0.9021286964416504, + "logps/chosen": -898.3988037109375, + "logps/rejected": -1257.00830078125, + "loss": 0.3687, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.8340349197387695, + "rewards/margins": 2.4283087253570557, + "rewards/rejected": -8.262344360351562, + "step": 582 + }, + { + "epoch": 0.7615315535962119, + "grad_norm": 20.347902273024776, + "learning_rate": 8.147686301195383e-08, + "logits/chosen": -0.8726892471313477, + "logits/rejected": -0.9775661826133728, + "logps/chosen": -773.6263427734375, + "logps/rejected": -1147.288330078125, + "loss": 0.2819, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.852556228637695, + "rewards/margins": 2.879547357559204, + "rewards/rejected": -7.73210334777832, + "step": 583 + }, + { + "epoch": 0.7628377826761368, + "grad_norm": 30.931119659844114, + "learning_rate": 8.063540786753842e-08, + "logits/chosen": -0.9059603214263916, + "logits/rejected": -0.936095654964447, + "logps/chosen": -769.556884765625, + "logps/rejected": -1065.5628662109375, + "loss": 0.3303, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.389358997344971, + "rewards/margins": 3.1140201091766357, + "rewards/rejected": -7.5033793449401855, + "step": 584 + }, + { + "epoch": 0.7641440117560617, + "grad_norm": 60.40616710629875, + "learning_rate": 7.979748410541451e-08, + "logits/chosen": -0.9225594997406006, + "logits/rejected": -0.9047824144363403, + "logps/chosen": -866.5232543945312, + "logps/rejected": -1200.9637451171875, + "loss": 0.4029, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.22835636138916, + "rewards/margins": 3.382688283920288, + "rewards/rejected": -8.611045837402344, + "step": 585 + }, + { + "epoch": 0.7654502408359866, + "grad_norm": 31.555998473913466, + "learning_rate": 7.896310919693858e-08, + "logits/chosen": -0.8222339749336243, + "logits/rejected": -0.8743284344673157, + "logps/chosen": -867.50048828125, + "logps/rejected": -1188.538818359375, + "loss": 0.3465, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.528315544128418, + "rewards/margins": 2.8255791664123535, + "rewards/rejected": -8.35389518737793, + "step": 586 + }, + { + "epoch": 0.7667564699159115, + "grad_norm": 21.69936273672222, + "learning_rate": 7.813230053947054e-08, + "logits/chosen": -0.9629799127578735, + "logits/rejected": -0.9869503378868103, + "logps/chosen": -853.8582763671875, + "logps/rejected": -1051.514892578125, + "loss": 0.3534, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.540807247161865, + "rewards/margins": 1.6317949295043945, + "rewards/rejected": -7.172601699829102, + "step": 587 + }, + { + "epoch": 0.7680626989958363, + "grad_norm": 24.484278864691312, + "learning_rate": 7.730507545601131e-08, + "logits/chosen": -0.8617520332336426, + "logits/rejected": -0.9498992562294006, + "logps/chosen": -748.60546875, + "logps/rejected": -1134.1103515625, + "loss": 0.3334, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.2913970947265625, + "rewards/margins": 3.1425302028656006, + "rewards/rejected": -7.433926582336426, + "step": 588 + }, + { + "epoch": 0.7693689280757613, + "grad_norm": 27.511342623503438, + "learning_rate": 7.648145119484151e-08, + "logits/chosen": -1.0093884468078613, + "logits/rejected": -1.006633996963501, + "logps/chosen": -758.9545288085938, + "logps/rejected": -935.381103515625, + "loss": 0.3744, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.51336145401001, + "rewards/margins": 1.7640907764434814, + "rewards/rejected": -6.27745246887207, + "step": 589 + }, + { + "epoch": 0.7706751571556861, + "grad_norm": 22.414541733494126, + "learning_rate": 7.566144492916191e-08, + "logits/chosen": -0.8036502599716187, + "logits/rejected": -0.8703159093856812, + "logps/chosen": -794.802001953125, + "logps/rejected": -1297.468994140625, + "loss": 0.2328, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.608358860015869, + "rewards/margins": 4.21331262588501, + "rewards/rejected": -8.821670532226562, + "step": 590 + }, + { + "epoch": 0.7719813862356111, + "grad_norm": 26.029107586759444, + "learning_rate": 7.484507375673505e-08, + "logits/chosen": -0.9289277791976929, + "logits/rejected": -0.8871047496795654, + "logps/chosen": -740.1527709960938, + "logps/rejected": -1045.8631591796875, + "loss": 0.2845, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.9896395206451416, + "rewards/margins": 2.939906120300293, + "rewards/rejected": -6.9295454025268555, + "step": 591 + }, + { + "epoch": 0.7732876153155359, + "grad_norm": 30.580890149968727, + "learning_rate": 7.40323546995292e-08, + "logits/chosen": -0.9810222387313843, + "logits/rejected": -0.9534710645675659, + "logps/chosen": -907.60498046875, + "logps/rejected": -1219.4244384765625, + "loss": 0.2711, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.280550003051758, + "rewards/margins": 3.0338191986083984, + "rewards/rejected": -8.314369201660156, + "step": 592 + }, + { + "epoch": 0.7745938443954609, + "grad_norm": 25.625976246905026, + "learning_rate": 7.322330470336313e-08, + "logits/chosen": -0.8802429437637329, + "logits/rejected": -0.8881117105484009, + "logps/chosen": -817.2022705078125, + "logps/rejected": -982.9677124023438, + "loss": 0.3506, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.862236022949219, + "rewards/margins": 1.5472270250320435, + "rewards/rejected": -6.409462928771973, + "step": 593 + }, + { + "epoch": 0.7759000734753857, + "grad_norm": 20.818249995545727, + "learning_rate": 7.241794063755291e-08, + "logits/chosen": -0.9595385193824768, + "logits/rejected": -1.021976113319397, + "logps/chosen": -852.221923828125, + "logps/rejected": -1156.3397216796875, + "loss": 0.3032, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.065010070800781, + "rewards/margins": 2.4911441802978516, + "rewards/rejected": -7.556154251098633, + "step": 594 + }, + { + "epoch": 0.7772063025553106, + "grad_norm": 26.077317678524693, + "learning_rate": 7.161627929456004e-08, + "logits/chosen": -0.8960578441619873, + "logits/rejected": -0.826008677482605, + "logps/chosen": -792.5365600585938, + "logps/rejected": -893.3995361328125, + "loss": 0.3794, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.322532653808594, + "rewards/margins": 1.6258131265640259, + "rewards/rejected": -5.94834566116333, + "step": 595 + }, + { + "epoch": 0.7785125316352355, + "grad_norm": 27.584632338786395, + "learning_rate": 7.081833738964149e-08, + "logits/chosen": -1.0183488130569458, + "logits/rejected": -0.9943074584007263, + "logps/chosen": -820.5167846679688, + "logps/rejected": -1048.6409912109375, + "loss": 0.3894, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.430448055267334, + "rewards/margins": 2.2347586154937744, + "rewards/rejected": -6.665206432342529, + "step": 596 + }, + { + "epoch": 0.7798187607151604, + "grad_norm": 24.080235978815235, + "learning_rate": 7.002413156050108e-08, + "logits/chosen": -0.8766260743141174, + "logits/rejected": -0.8280268311500549, + "logps/chosen": -820.6423950195312, + "logps/rejected": -977.9401245117188, + "loss": 0.3307, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.771195411682129, + "rewards/margins": 1.9175888299942017, + "rewards/rejected": -6.688784599304199, + "step": 597 + }, + { + "epoch": 0.7811249897950853, + "grad_norm": 17.451309577317936, + "learning_rate": 6.923367836694236e-08, + "logits/chosen": -1.0711160898208618, + "logits/rejected": -1.0157512426376343, + "logps/chosen": -793.7659301757812, + "logps/rejected": -992.5198364257812, + "loss": 0.3032, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.318778991699219, + "rewards/margins": 2.1037962436676025, + "rewards/rejected": -6.422574996948242, + "step": 598 + }, + { + "epoch": 0.7824312188750102, + "grad_norm": 32.13868420831174, + "learning_rate": 6.844699429052375e-08, + "logits/chosen": -0.9602109789848328, + "logits/rejected": -0.9311934113502502, + "logps/chosen": -840.842041015625, + "logps/rejected": -1053.7744140625, + "loss": 0.3717, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.749244213104248, + "rewards/margins": 2.2002172470092773, + "rewards/rejected": -6.949460983276367, + "step": 599 + }, + { + "epoch": 0.783737447954935, + "grad_norm": 19.259699976993907, + "learning_rate": 6.766409573421466e-08, + "logits/chosen": -0.9458956718444824, + "logits/rejected": -1.0178898572921753, + "logps/chosen": -805.6896362304688, + "logps/rejected": -1252.30908203125, + "loss": 0.3342, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.890609264373779, + "rewards/margins": 4.166367053985596, + "rewards/rejected": -9.056976318359375, + "step": 600 + }, + { + "epoch": 0.783737447954935, + "eval_logits/chosen": -0.799916684627533, + "eval_logits/rejected": -0.8080865740776062, + "eval_logps/chosen": -814.440185546875, + "eval_logps/rejected": -1083.55029296875, + "eval_loss": 0.3353707194328308, + "eval_rewards/accuracies": 0.8920000195503235, + "eval_rewards/chosen": -4.7040910720825195, + "eval_rewards/margins": 2.6301238536834717, + "eval_rewards/rejected": -7.334214210510254, + "eval_runtime": 305.2507, + "eval_samples_per_second": 6.552, + "eval_steps_per_second": 0.409, + "step": 600 + }, + { + "epoch": 0.78504367703486, + "grad_norm": 20.361644602615105, + "learning_rate": 6.688499902205345e-08, + "logits/chosen": -0.8707839846611023, + "logits/rejected": -0.9645876884460449, + "logps/chosen": -891.9090576171875, + "logps/rejected": -1203.7598876953125, + "loss": 0.2809, + "rewards/accuracies": 0.96875, + "rewards/chosen": -5.256412982940674, + "rewards/margins": 2.483121395111084, + "rewards/rejected": -7.739534378051758, + "step": 601 + }, + { + "epoch": 0.7863499061147848, + "grad_norm": 20.043709061394022, + "learning_rate": 6.610972039880704e-08, + "logits/chosen": -0.8824882507324219, + "logits/rejected": -0.9258497953414917, + "logps/chosen": -727.640625, + "logps/rejected": -986.2440185546875, + "loss": 0.2183, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.061704158782959, + "rewards/margins": 2.5300819873809814, + "rewards/rejected": -6.591785907745361, + "step": 602 + }, + { + "epoch": 0.7876561351947098, + "grad_norm": 25.028173721925533, + "learning_rate": 6.533827602963244e-08, + "logits/chosen": -0.9209650754928589, + "logits/rejected": -0.9454638361930847, + "logps/chosen": -781.026611328125, + "logps/rejected": -1081.3594970703125, + "loss": 0.3306, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.376533508300781, + "rewards/margins": 2.9207139015197754, + "rewards/rejected": -7.297247409820557, + "step": 603 + }, + { + "epoch": 0.7889623642746346, + "grad_norm": 22.348702273934453, + "learning_rate": 6.45706819997392e-08, + "logits/chosen": -1.0147395133972168, + "logits/rejected": -1.038183331489563, + "logps/chosen": -824.8892822265625, + "logps/rejected": -1079.4228515625, + "loss": 0.3183, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.653099060058594, + "rewards/margins": 2.434006452560425, + "rewards/rejected": -7.087105751037598, + "step": 604 + }, + { + "epoch": 0.7902685933545596, + "grad_norm": 27.467778442681738, + "learning_rate": 6.380695431405453e-08, + "logits/chosen": -0.8424959778785706, + "logits/rejected": -0.9284894466400146, + "logps/chosen": -802.2348022460938, + "logps/rejected": -1163.3359375, + "loss": 0.3341, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.737122058868408, + "rewards/margins": 2.743659734725952, + "rewards/rejected": -7.480782508850098, + "step": 605 + }, + { + "epoch": 0.7915748224344844, + "grad_norm": 19.34862528757651, + "learning_rate": 6.304710889688944e-08, + "logits/chosen": -0.9255674481391907, + "logits/rejected": -0.9599024057388306, + "logps/chosen": -789.0675048828125, + "logps/rejected": -1047.88232421875, + "loss": 0.2828, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.71975040435791, + "rewards/margins": 2.2590696811676025, + "rewards/rejected": -6.978819847106934, + "step": 606 + }, + { + "epoch": 0.7928810515144094, + "grad_norm": 63.39258546983016, + "learning_rate": 6.229116159160652e-08, + "logits/chosen": -1.0031625032424927, + "logits/rejected": -1.0137825012207031, + "logps/chosen": -832.7035522460938, + "logps/rejected": -1037.726806640625, + "loss": 0.3436, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.836849212646484, + "rewards/margins": 2.296910285949707, + "rewards/rejected": -7.133759498596191, + "step": 607 + }, + { + "epoch": 0.7941872805943342, + "grad_norm": 18.22971639721024, + "learning_rate": 6.153912816028976e-08, + "logits/chosen": -0.919977068901062, + "logits/rejected": -0.923467218875885, + "logps/chosen": -871.0953369140625, + "logps/rejected": -1129.403564453125, + "loss": 0.2677, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.000254154205322, + "rewards/margins": 2.288395881652832, + "rewards/rejected": -7.288650035858154, + "step": 608 + }, + { + "epoch": 0.7954935096742591, + "grad_norm": 26.306570079281883, + "learning_rate": 6.079102428341587e-08, + "logits/chosen": -0.9508645534515381, + "logits/rejected": -0.955719530582428, + "logps/chosen": -792.3866577148438, + "logps/rejected": -1104.510009765625, + "loss": 0.2807, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.013969898223877, + "rewards/margins": 3.101562261581421, + "rewards/rejected": -7.115531921386719, + "step": 609 + }, + { + "epoch": 0.796799738754184, + "grad_norm": 27.27759663326369, + "learning_rate": 6.004686555952742e-08, + "logits/chosen": -0.9942679405212402, + "logits/rejected": -0.9757542014122009, + "logps/chosen": -863.2615966796875, + "logps/rejected": -1148.544677734375, + "loss": 0.3943, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.1844658851623535, + "rewards/margins": 1.9732271432876587, + "rewards/rejected": -7.157692909240723, + "step": 610 + }, + { + "epoch": 0.7981059678341089, + "grad_norm": 22.309371027174624, + "learning_rate": 5.9306667504907234e-08, + "logits/chosen": -0.966468095779419, + "logits/rejected": -0.9042306542396545, + "logps/chosen": -819.4505615234375, + "logps/rejected": -979.9266967773438, + "loss": 0.3763, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.6890740394592285, + "rewards/margins": 2.081890344619751, + "rewards/rejected": -6.770963668823242, + "step": 611 + }, + { + "epoch": 0.7994121969140338, + "grad_norm": 35.47329473307664, + "learning_rate": 5.857044555325535e-08, + "logits/chosen": -1.0889376401901245, + "logits/rejected": -1.0171784162521362, + "logps/chosen": -853.3372802734375, + "logps/rejected": -974.7836303710938, + "loss": 0.3427, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.811555862426758, + "rewards/margins": 1.7559360265731812, + "rewards/rejected": -6.56749153137207, + "step": 612 + }, + { + "epoch": 0.8007184259939587, + "grad_norm": 36.349287307385815, + "learning_rate": 5.7838215055366954e-08, + "logits/chosen": -0.9624323844909668, + "logits/rejected": -0.9520595669746399, + "logps/chosen": -828.0532836914062, + "logps/rejected": -1140.3701171875, + "loss": 0.3107, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.769121170043945, + "rewards/margins": 2.8237016201019287, + "rewards/rejected": -7.592822551727295, + "step": 613 + }, + { + "epoch": 0.8020246550738835, + "grad_norm": 35.09922416585137, + "learning_rate": 5.710999127881233e-08, + "logits/chosen": -1.011865496635437, + "logits/rejected": -1.0389556884765625, + "logps/chosen": -782.23486328125, + "logps/rejected": -1016.12646484375, + "loss": 0.3195, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.666965007781982, + "rewards/margins": 2.2044434547424316, + "rewards/rejected": -6.871408462524414, + "step": 614 + }, + { + "epoch": 0.8033308841538085, + "grad_norm": 19.12905924237332, + "learning_rate": 5.6385789407618593e-08, + "logits/chosen": -0.9514075517654419, + "logits/rejected": -0.9145126938819885, + "logps/chosen": -811.7752685546875, + "logps/rejected": -1043.508544921875, + "loss": 0.338, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.971982955932617, + "rewards/margins": 2.395125150680542, + "rewards/rejected": -7.367108345031738, + "step": 615 + }, + { + "epoch": 0.8046371132337333, + "grad_norm": 60.821253048729666, + "learning_rate": 5.5665624541952865e-08, + "logits/chosen": -1.0484111309051514, + "logits/rejected": -1.0743966102600098, + "logps/chosen": -804.5001831054688, + "logps/rejected": -1006.0529174804688, + "loss": 0.3242, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.520447254180908, + "rewards/margins": 2.239243507385254, + "rewards/rejected": -6.759690284729004, + "step": 616 + }, + { + "epoch": 0.8059433423136583, + "grad_norm": 59.38789752379059, + "learning_rate": 5.494951169780776e-08, + "logits/chosen": -0.8696390390396118, + "logits/rejected": -0.8830188512802124, + "logps/chosen": -848.6278686523438, + "logps/rejected": -1117.7974853515625, + "loss": 0.3619, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.382977485656738, + "rewards/margins": 2.234811544418335, + "rewards/rejected": -7.617789268493652, + "step": 617 + }, + { + "epoch": 0.8072495713935831, + "grad_norm": 20.31680378636864, + "learning_rate": 5.4237465806688004e-08, + "logits/chosen": -0.9961526989936829, + "logits/rejected": -0.9020803570747375, + "logps/chosen": -790.9029541015625, + "logps/rejected": -987.162353515625, + "loss": 0.2983, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.762269973754883, + "rewards/margins": 2.2328810691833496, + "rewards/rejected": -6.995151519775391, + "step": 618 + }, + { + "epoch": 0.8085558004735081, + "grad_norm": 38.36725235605651, + "learning_rate": 5.3529501715299266e-08, + "logits/chosen": -0.9393517971038818, + "logits/rejected": -1.0322364568710327, + "logps/chosen": -831.10302734375, + "logps/rejected": -1079.7054443359375, + "loss": 0.3344, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.986616134643555, + "rewards/margins": 2.054384469985962, + "rewards/rejected": -7.041000843048096, + "step": 619 + }, + { + "epoch": 0.8098620295534329, + "grad_norm": 23.713937668491724, + "learning_rate": 5.2825634185238583e-08, + "logits/chosen": -0.9017822742462158, + "logits/rejected": -0.9380840063095093, + "logps/chosen": -789.52734375, + "logps/rejected": -969.8795166015625, + "loss": 0.3775, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.724421501159668, + "rewards/margins": 1.923351764678955, + "rewards/rejected": -6.647772789001465, + "step": 620 + }, + { + "epoch": 0.8111682586333578, + "grad_norm": 20.719618911102806, + "learning_rate": 5.212587789268649e-08, + "logits/chosen": -0.9668422937393188, + "logits/rejected": -0.997907280921936, + "logps/chosen": -849.0941772460938, + "logps/rejected": -1135.4027099609375, + "loss": 0.3439, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.6942877769470215, + "rewards/margins": 2.6681554317474365, + "rewards/rejected": -7.362443923950195, + "step": 621 + }, + { + "epoch": 0.8124744877132827, + "grad_norm": 28.78342856793349, + "learning_rate": 5.1430247428101067e-08, + "logits/chosen": -0.8712970018386841, + "logits/rejected": -0.8977018594741821, + "logps/chosen": -872.6751098632812, + "logps/rejected": -1110.70751953125, + "loss": 0.283, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.823261260986328, + "rewards/margins": 2.413942337036133, + "rewards/rejected": -7.237203121185303, + "step": 622 + }, + { + "epoch": 0.8137807167932076, + "grad_norm": 20.643675704776587, + "learning_rate": 5.0738757295913674e-08, + "logits/chosen": -1.0791032314300537, + "logits/rejected": -0.9943168759346008, + "logps/chosen": -833.9264526367188, + "logps/rejected": -957.189453125, + "loss": 0.3851, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.792407512664795, + "rewards/margins": 1.5870347023010254, + "rewards/rejected": -6.37944221496582, + "step": 623 + }, + { + "epoch": 0.8150869458731325, + "grad_norm": 18.136247401431493, + "learning_rate": 5.005142191422665e-08, + "logits/chosen": -0.9701358675956726, + "logits/rejected": -0.9419609308242798, + "logps/chosen": -746.4559326171875, + "logps/rejected": -925.2149658203125, + "loss": 0.4097, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.279734134674072, + "rewards/margins": 1.9801875352859497, + "rewards/rejected": -6.259922027587891, + "step": 624 + }, + { + "epoch": 0.8163931749530574, + "grad_norm": 26.072306298901207, + "learning_rate": 4.936825561451235e-08, + "logits/chosen": -1.0418720245361328, + "logits/rejected": -0.9948733448982239, + "logps/chosen": -899.941650390625, + "logps/rejected": -1055.2972412109375, + "loss": 0.341, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.106154441833496, + "rewards/margins": 1.7844371795654297, + "rewards/rejected": -6.890592575073242, + "step": 625 + }, + { + "epoch": 0.8176994040329822, + "grad_norm": 23.034148794273474, + "learning_rate": 4.868927264131476e-08, + "logits/chosen": -0.9666779637336731, + "logits/rejected": -1.0086778402328491, + "logps/chosen": -884.3038940429688, + "logps/rejected": -1230.3251953125, + "loss": 0.3529, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.3153486251831055, + "rewards/margins": 3.0668115615844727, + "rewards/rejected": -8.382160186767578, + "step": 626 + }, + { + "epoch": 0.8190056331129072, + "grad_norm": 23.650355623633093, + "learning_rate": 4.801448715195227e-08, + "logits/chosen": -1.0483986139297485, + "logits/rejected": -1.0794347524642944, + "logps/chosen": -864.2808227539062, + "logps/rejected": -1053.04150390625, + "loss": 0.3211, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.786685943603516, + "rewards/margins": 2.0903913974761963, + "rewards/rejected": -6.877077102661133, + "step": 627 + }, + { + "epoch": 0.820311862192832, + "grad_norm": 17.274048500959722, + "learning_rate": 4.734391321622242e-08, + "logits/chosen": -0.9791491627693176, + "logits/rejected": -0.9082823395729065, + "logps/chosen": -833.59619140625, + "logps/rejected": -984.154052734375, + "loss": 0.3178, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.538018226623535, + "rewards/margins": 2.0235354900360107, + "rewards/rejected": -6.561553001403809, + "step": 628 + }, + { + "epoch": 0.821618091272757, + "grad_norm": 35.535109704827164, + "learning_rate": 4.667756481610866e-08, + "logits/chosen": -0.9152229428291321, + "logits/rejected": -0.9227945804595947, + "logps/chosen": -828.603271484375, + "logps/rejected": -1044.041259765625, + "loss": 0.365, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.549627780914307, + "rewards/margins": 1.8636504411697388, + "rewards/rejected": -6.413278579711914, + "step": 629 + }, + { + "epoch": 0.8229243203526818, + "grad_norm": 30.88845897386843, + "learning_rate": 4.60154558454888e-08, + "logits/chosen": -1.064465045928955, + "logits/rejected": -1.0457439422607422, + "logps/chosen": -889.4046020507812, + "logps/rejected": -1094.5487060546875, + "loss": 0.424, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.0392560958862305, + "rewards/margins": 2.392564535140991, + "rewards/rejected": -7.431820869445801, + "step": 630 + }, + { + "epoch": 0.8242305494326068, + "grad_norm": 20.901329900516245, + "learning_rate": 4.535760010984513e-08, + "logits/chosen": -0.8499188423156738, + "logits/rejected": -0.8458962440490723, + "logps/chosen": -831.1834716796875, + "logps/rejected": -1015.3555908203125, + "loss": 0.2817, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.353586196899414, + "rewards/margins": 1.9744594097137451, + "rewards/rejected": -7.328044891357422, + "step": 631 + }, + { + "epoch": 0.8255367785125316, + "grad_norm": 31.389449614869186, + "learning_rate": 4.470401132597687e-08, + "logits/chosen": -0.8895751237869263, + "logits/rejected": -0.90300053358078, + "logps/chosen": -820.342529296875, + "logps/rejected": -1040.498779296875, + "loss": 0.3141, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.610342025756836, + "rewards/margins": 2.0794625282287598, + "rewards/rejected": -6.689804553985596, + "step": 632 + }, + { + "epoch": 0.8268430075924565, + "grad_norm": 22.440107167681898, + "learning_rate": 4.405470312171392e-08, + "logits/chosen": -0.9385513663291931, + "logits/rejected": -0.9701187610626221, + "logps/chosen": -816.0090942382812, + "logps/rejected": -1153.3282470703125, + "loss": 0.3333, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.639499664306641, + "rewards/margins": 3.054454803466797, + "rewards/rejected": -7.693953514099121, + "step": 633 + }, + { + "epoch": 0.8281492366723814, + "grad_norm": 22.836475349274135, + "learning_rate": 4.340968903563283e-08, + "logits/chosen": -1.0385801792144775, + "logits/rejected": -0.9340227842330933, + "logps/chosen": -811.0709228515625, + "logps/rejected": -988.4837036132812, + "loss": 0.2933, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.569432258605957, + "rewards/margins": 2.175413131713867, + "rewards/rejected": -6.744844913482666, + "step": 634 + }, + { + "epoch": 0.8294554657523063, + "grad_norm": 27.201658907499592, + "learning_rate": 4.2768982516774495e-08, + "logits/chosen": -0.9834165573120117, + "logits/rejected": -1.0088675022125244, + "logps/chosen": -799.6683959960938, + "logps/rejected": -950.1758422851562, + "loss": 0.346, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.314089775085449, + "rewards/margins": 1.5654332637786865, + "rewards/rejected": -5.879523277282715, + "step": 635 + }, + { + "epoch": 0.8307616948322312, + "grad_norm": 21.147760460354423, + "learning_rate": 4.213259692436366e-08, + "logits/chosen": -0.9662840366363525, + "logits/rejected": -1.0308414697647095, + "logps/chosen": -726.1981201171875, + "logps/rejected": -1077.3818359375, + "loss": 0.3096, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9752042293548584, + "rewards/margins": 2.9876835346221924, + "rewards/rejected": -6.962887287139893, + "step": 636 + }, + { + "epoch": 0.8320679239121561, + "grad_norm": 21.42406296057284, + "learning_rate": 4.1500545527530544e-08, + "logits/chosen": -0.9212138652801514, + "logits/rejected": -0.926445722579956, + "logps/chosen": -825.0010986328125, + "logps/rejected": -1020.178955078125, + "loss": 0.3342, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.832255840301514, + "rewards/margins": 2.0423848628997803, + "rewards/rejected": -6.874640941619873, + "step": 637 + }, + { + "epoch": 0.833374152992081, + "grad_norm": 22.437840301602577, + "learning_rate": 4.087284150503381e-08, + "logits/chosen": -0.8695635795593262, + "logits/rejected": -0.929308295249939, + "logps/chosen": -847.9984130859375, + "logps/rejected": -1145.2864990234375, + "loss": 0.3356, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.7246928215026855, + "rewards/margins": 2.388370990753174, + "rewards/rejected": -7.113063812255859, + "step": 638 + }, + { + "epoch": 0.8346803820720059, + "grad_norm": 30.94820697370598, + "learning_rate": 4.024949794498622e-08, + "logits/chosen": -1.0347599983215332, + "logits/rejected": -1.0474956035614014, + "logps/chosen": -764.6439208984375, + "logps/rejected": -1023.9473876953125, + "loss": 0.3739, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.080129623413086, + "rewards/margins": 2.6597049236297607, + "rewards/rejected": -6.739834308624268, + "step": 639 + }, + { + "epoch": 0.8359866111519307, + "grad_norm": 25.92041492366139, + "learning_rate": 3.963052784458146e-08, + "logits/chosen": -0.9137096405029297, + "logits/rejected": -0.9526919722557068, + "logps/chosen": -831.0147705078125, + "logps/rejected": -1173.9879150390625, + "loss": 0.2769, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.643778324127197, + "rewards/margins": 3.2461047172546387, + "rewards/rejected": -7.889883518218994, + "step": 640 + }, + { + "epoch": 0.8372928402318557, + "grad_norm": 19.424694402625263, + "learning_rate": 3.901594410982326e-08, + "logits/chosen": -0.9314712285995483, + "logits/rejected": -1.0186083316802979, + "logps/chosen": -792.0285034179688, + "logps/rejected": -1059.8748779296875, + "loss": 0.3201, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.710291862487793, + "rewards/margins": 2.3436036109924316, + "rewards/rejected": -7.053895950317383, + "step": 641 + }, + { + "epoch": 0.8385990693117805, + "grad_norm": 49.750792586062126, + "learning_rate": 3.8405759555256156e-08, + "logits/chosen": -1.0113201141357422, + "logits/rejected": -1.0383992195129395, + "logps/chosen": -765.2586059570312, + "logps/rejected": -982.585205078125, + "loss": 0.3595, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.2787184715271, + "rewards/margins": 1.8915104866027832, + "rewards/rejected": -6.170229434967041, + "step": 642 + }, + { + "epoch": 0.8399052983917055, + "grad_norm": 32.60901590818127, + "learning_rate": 3.779998690369857e-08, + "logits/chosen": -0.9662805795669556, + "logits/rejected": -1.001565933227539, + "logps/chosen": -731.9835815429688, + "logps/rejected": -1034.92578125, + "loss": 0.3098, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.9542222023010254, + "rewards/margins": 2.9862449169158936, + "rewards/rejected": -6.940467357635498, + "step": 643 + }, + { + "epoch": 0.8412115274716303, + "grad_norm": 32.001776906392735, + "learning_rate": 3.719863878597704e-08, + "logits/chosen": -0.9169551134109497, + "logits/rejected": -0.909413754940033, + "logps/chosen": -867.5076293945312, + "logps/rejected": -965.96044921875, + "loss": 0.4021, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.9577765464782715, + "rewards/margins": 1.3180768489837646, + "rewards/rejected": -6.275854110717773, + "step": 644 + }, + { + "epoch": 0.8425177565515553, + "grad_norm": 26.775446426020242, + "learning_rate": 3.660172774066339e-08, + "logits/chosen": -0.9407652020454407, + "logits/rejected": -0.8637470006942749, + "logps/chosen": -742.7183837890625, + "logps/rejected": -966.02978515625, + "loss": 0.3015, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.942478895187378, + "rewards/margins": 2.3000540733337402, + "rewards/rejected": -6.242532730102539, + "step": 645 + }, + { + "epoch": 0.8438239856314801, + "grad_norm": 18.452462326552563, + "learning_rate": 3.600926621381306e-08, + "logits/chosen": -0.9551993608474731, + "logits/rejected": -0.9735285639762878, + "logps/chosen": -796.7625732421875, + "logps/rejected": -1045.31005859375, + "loss": 0.2917, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.553375720977783, + "rewards/margins": 2.676002025604248, + "rewards/rejected": -7.229377746582031, + "step": 646 + }, + { + "epoch": 0.845130214711405, + "grad_norm": 21.2291726685947, + "learning_rate": 3.54212665587055e-08, + "logits/chosen": -0.9161040782928467, + "logits/rejected": -0.8548312783241272, + "logps/chosen": -755.5303955078125, + "logps/rejected": -881.5941772460938, + "loss": 0.3122, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.344512462615967, + "rewards/margins": 1.6735119819641113, + "rewards/rejected": -6.018024444580078, + "step": 647 + }, + { + "epoch": 0.8464364437913299, + "grad_norm": 17.077302518351114, + "learning_rate": 3.4837741035586816e-08, + "logits/chosen": -0.9503481388092041, + "logits/rejected": -1.0166294574737549, + "logps/chosen": -806.5877685546875, + "logps/rejected": -1030.3157958984375, + "loss": 0.3193, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.656474590301514, + "rewards/margins": 2.0568296909332275, + "rewards/rejected": -6.71330451965332, + "step": 648 + }, + { + "epoch": 0.8477426728712548, + "grad_norm": 21.127833153046492, + "learning_rate": 3.425870181141394e-08, + "logits/chosen": -0.8827044367790222, + "logits/rejected": -0.9482054114341736, + "logps/chosen": -739.4212036132812, + "logps/rejected": -993.3704833984375, + "loss": 0.3141, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.291741371154785, + "rewards/margins": 2.3153553009033203, + "rewards/rejected": -6.607097625732422, + "step": 649 + }, + { + "epoch": 0.8490489019511797, + "grad_norm": 18.112637343103085, + "learning_rate": 3.3684160959600917e-08, + "logits/chosen": -0.9638092517852783, + "logits/rejected": -0.9645333290100098, + "logps/chosen": -846.0302734375, + "logps/rejected": -1121.314453125, + "loss": 0.3411, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.2316718101501465, + "rewards/margins": 2.5505781173706055, + "rewards/rejected": -7.782249927520752, + "step": 650 + }, + { + "epoch": 0.8503551310311046, + "grad_norm": 25.52167601536166, + "learning_rate": 3.311413045976741e-08, + "logits/chosen": -0.8650178909301758, + "logits/rejected": -0.8249406218528748, + "logps/chosen": -850.0011596679688, + "logps/rejected": -1077.047119140625, + "loss": 0.2862, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.9619140625, + "rewards/margins": 2.3785924911499023, + "rewards/rejected": -7.340506553649902, + "step": 651 + }, + { + "epoch": 0.8516613601110294, + "grad_norm": 16.842773477083057, + "learning_rate": 3.2548622197488744e-08, + "logits/chosen": -0.9444965124130249, + "logits/rejected": -0.9097779393196106, + "logps/chosen": -765.1162109375, + "logps/rejected": -952.4530029296875, + "loss": 0.3226, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.136512756347656, + "rewards/margins": 2.2481629848480225, + "rewards/rejected": -6.3846755027771, + "step": 652 + }, + { + "epoch": 0.8529675891909544, + "grad_norm": 15.833969137148738, + "learning_rate": 3.198764796404807e-08, + "logits/chosen": -0.8982550501823425, + "logits/rejected": -0.8870823383331299, + "logps/chosen": -783.821044921875, + "logps/rejected": -1065.482421875, + "loss": 0.272, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.595544338226318, + "rewards/margins": 2.925309181213379, + "rewards/rejected": -7.520853042602539, + "step": 653 + }, + { + "epoch": 0.8542738182708792, + "grad_norm": 20.813983014515298, + "learning_rate": 3.1431219456190563e-08, + "logits/chosen": -0.7716184854507446, + "logits/rejected": -0.7652902603149414, + "logps/chosen": -788.2040405273438, + "logps/rejected": -1052.087158203125, + "loss": 0.3365, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.614079475402832, + "rewards/margins": 2.3715004920959473, + "rewards/rejected": -6.985579967498779, + "step": 654 + }, + { + "epoch": 0.8555800473508042, + "grad_norm": 33.85912168424529, + "learning_rate": 3.0879348275879484e-08, + "logits/chosen": -0.9596065282821655, + "logits/rejected": -0.9730892777442932, + "logps/chosen": -832.7069091796875, + "logps/rejected": -1055.71533203125, + "loss": 0.4184, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.167130947113037, + "rewards/margins": 2.034320116043091, + "rewards/rejected": -7.201451301574707, + "step": 655 + }, + { + "epoch": 0.856886276430729, + "grad_norm": 25.840604770233004, + "learning_rate": 3.033204593005439e-08, + "logits/chosen": -0.9529532194137573, + "logits/rejected": -0.9935981631278992, + "logps/chosen": -806.2481079101562, + "logps/rejected": -1091.0167236328125, + "loss": 0.3256, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.538727760314941, + "rewards/margins": 2.7176947593688965, + "rewards/rejected": -7.256422519683838, + "step": 656 + }, + { + "epoch": 0.858192505510654, + "grad_norm": 29.406599320225745, + "learning_rate": 2.9789323830390927e-08, + "logits/chosen": -1.0245752334594727, + "logits/rejected": -0.9978625774383545, + "logps/chosen": -851.211669921875, + "logps/rejected": -1119.0037841796875, + "loss": 0.3483, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.464425086975098, + "rewards/margins": 2.873805522918701, + "rewards/rejected": -7.338231086730957, + "step": 657 + }, + { + "epoch": 0.8594987345905788, + "grad_norm": 19.04536365055149, + "learning_rate": 2.925119329306333e-08, + "logits/chosen": -0.8678416013717651, + "logits/rejected": -0.8142892122268677, + "logps/chosen": -808.4974365234375, + "logps/rejected": -921.2274780273438, + "loss": 0.3633, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.781768798828125, + "rewards/margins": 1.3113124370574951, + "rewards/rejected": -6.093081474304199, + "step": 658 + }, + { + "epoch": 0.8608049636705037, + "grad_norm": 17.60296500618362, + "learning_rate": 2.871766553850796e-08, + "logits/chosen": -0.917082667350769, + "logits/rejected": -0.9683120846748352, + "logps/chosen": -802.0975952148438, + "logps/rejected": -1129.573486328125, + "loss": 0.2501, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.332143306732178, + "rewards/margins": 2.751530885696411, + "rewards/rejected": -7.083674907684326, + "step": 659 + }, + { + "epoch": 0.8621111927504286, + "grad_norm": 21.15842626873544, + "learning_rate": 2.818875169118981e-08, + "logits/chosen": -0.9736424684524536, + "logits/rejected": -0.971178412437439, + "logps/chosen": -825.02001953125, + "logps/rejected": -1002.52783203125, + "loss": 0.2667, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.003799915313721, + "rewards/margins": 1.8433326482772827, + "rewards/rejected": -6.847132205963135, + "step": 660 + }, + { + "epoch": 0.8634174218303535, + "grad_norm": 23.135441218561276, + "learning_rate": 2.766446277937029e-08, + "logits/chosen": -1.1651058197021484, + "logits/rejected": -1.1745052337646484, + "logps/chosen": -888.7207641601562, + "logps/rejected": -1079.0030517578125, + "loss": 0.3053, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.9126296043396, + "rewards/margins": 2.0017378330230713, + "rewards/rejected": -6.914368152618408, + "step": 661 + }, + { + "epoch": 0.8647236509102784, + "grad_norm": 21.245688923606288, + "learning_rate": 2.7144809734877316e-08, + "logits/chosen": -0.9561842679977417, + "logits/rejected": -0.9894252419471741, + "logps/chosen": -791.2134399414062, + "logps/rejected": -1127.950439453125, + "loss": 0.2715, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.601735591888428, + "rewards/margins": 3.0200610160827637, + "rewards/rejected": -7.621796607971191, + "step": 662 + }, + { + "epoch": 0.8660298799902033, + "grad_norm": 28.174818636350068, + "learning_rate": 2.6629803392877486e-08, + "logits/chosen": -0.9367572069168091, + "logits/rejected": -1.0345512628555298, + "logps/chosen": -845.8182983398438, + "logps/rejected": -1210.5560302734375, + "loss": 0.318, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.806478500366211, + "rewards/margins": 3.0188956260681152, + "rewards/rejected": -7.825375080108643, + "step": 663 + }, + { + "epoch": 0.8673361090701281, + "grad_norm": 21.922625343186045, + "learning_rate": 2.6119454491649845e-08, + "logits/chosen": -0.8922625780105591, + "logits/rejected": -0.9068939685821533, + "logps/chosen": -841.5750122070312, + "logps/rejected": -1101.30810546875, + "loss": 0.3871, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.900118827819824, + "rewards/margins": 2.467489719390869, + "rewards/rejected": -7.367608070373535, + "step": 664 + }, + { + "epoch": 0.8686423381500531, + "grad_norm": 31.04807995918663, + "learning_rate": 2.5613773672362476e-08, + "logits/chosen": -1.0141932964324951, + "logits/rejected": -1.0032148361206055, + "logps/chosen": -839.9802856445312, + "logps/rejected": -1058.4979248046875, + "loss": 0.3468, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.662171363830566, + "rewards/margins": 2.167586326599121, + "rewards/rejected": -6.8297576904296875, + "step": 665 + }, + { + "epoch": 0.8699485672299779, + "grad_norm": 28.627499460485247, + "learning_rate": 2.5112771478850186e-08, + "logits/chosen": -1.0315594673156738, + "logits/rejected": -1.0283108949661255, + "logps/chosen": -772.7771606445312, + "logps/rejected": -975.7742919921875, + "loss": 0.3636, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.486318111419678, + "rewards/margins": 1.9306285381317139, + "rewards/rejected": -6.4169464111328125, + "step": 666 + }, + { + "epoch": 0.8712547963099029, + "grad_norm": 27.501666238776274, + "learning_rate": 2.46164583573949e-08, + "logits/chosen": -0.9367161989212036, + "logits/rejected": -0.9585106372833252, + "logps/chosen": -824.9557495117188, + "logps/rejected": -1110.76123046875, + "loss": 0.3143, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.000235557556152, + "rewards/margins": 2.6299326419830322, + "rewards/rejected": -7.630168437957764, + "step": 667 + }, + { + "epoch": 0.8725610253898277, + "grad_norm": 31.364682261474258, + "learning_rate": 2.412484465650774e-08, + "logits/chosen": -0.882140576839447, + "logits/rejected": -0.9227953553199768, + "logps/chosen": -795.4391479492188, + "logps/rejected": -1053.7271728515625, + "loss": 0.4831, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.803238868713379, + "rewards/margins": 2.2624800205230713, + "rewards/rejected": -7.065718650817871, + "step": 668 + }, + { + "epoch": 0.8738672544697527, + "grad_norm": 23.304091321744853, + "learning_rate": 2.3637940626713342e-08, + "logits/chosen": -1.0385204553604126, + "logits/rejected": -1.0561144351959229, + "logps/chosen": -851.8370361328125, + "logps/rejected": -1017.2864990234375, + "loss": 0.3164, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.111514568328857, + "rewards/margins": 1.816351056098938, + "rewards/rejected": -6.927865505218506, + "step": 669 + }, + { + "epoch": 0.8751734835496775, + "grad_norm": 23.269790245201648, + "learning_rate": 2.315575642033604e-08, + "logits/chosen": -1.024946928024292, + "logits/rejected": -1.0573128461837769, + "logps/chosen": -793.3582763671875, + "logps/rejected": -1110.23876953125, + "loss": 0.2668, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.664315223693848, + "rewards/margins": 3.121136426925659, + "rewards/rejected": -7.785451889038086, + "step": 670 + }, + { + "epoch": 0.8764797126296024, + "grad_norm": 30.247532810977688, + "learning_rate": 2.2678302091288155e-08, + "logits/chosen": -1.0933949947357178, + "logits/rejected": -1.0754190683364868, + "logps/chosen": -805.4337768554688, + "logps/rejected": -1031.197021484375, + "loss": 0.3132, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.566261291503906, + "rewards/margins": 2.2215776443481445, + "rewards/rejected": -6.787838935852051, + "step": 671 + }, + { + "epoch": 0.8777859417095273, + "grad_norm": 107.71053234226156, + "learning_rate": 2.2205587594860463e-08, + "logits/chosen": -0.8631647229194641, + "logits/rejected": -0.8830554485321045, + "logps/chosen": -810.8700561523438, + "logps/rejected": -1053.4078369140625, + "loss": 0.3621, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.119283676147461, + "rewards/margins": 2.2090401649475098, + "rewards/rejected": -7.328324317932129, + "step": 672 + }, + { + "epoch": 0.8790921707894522, + "grad_norm": 30.991850825647404, + "learning_rate": 2.1737622787514593e-08, + "logits/chosen": -0.813284695148468, + "logits/rejected": -0.8452885150909424, + "logps/chosen": -796.1976318359375, + "logps/rejected": -997.8446044921875, + "loss": 0.4298, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.063453674316406, + "rewards/margins": 1.775707721710205, + "rewards/rejected": -6.8391618728637695, + "step": 673 + }, + { + "epoch": 0.8803983998693771, + "grad_norm": 22.463846231847736, + "learning_rate": 2.1274417426677514e-08, + "logits/chosen": -0.8230746984481812, + "logits/rejected": -0.8600046038627625, + "logps/chosen": -820.2351684570312, + "logps/rejected": -1084.288818359375, + "loss": 0.3613, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.950501918792725, + "rewards/margins": 2.2537648677825928, + "rewards/rejected": -7.204266548156738, + "step": 674 + }, + { + "epoch": 0.881704628949302, + "grad_norm": 21.4399416595423, + "learning_rate": 2.081598117053801e-08, + "logits/chosen": -0.9868246912956238, + "logits/rejected": -1.0269767045974731, + "logps/chosen": -844.5371704101562, + "logps/rejected": -1028.14501953125, + "loss": 0.3512, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.722963809967041, + "rewards/margins": 1.6241395473480225, + "rewards/rejected": -6.347103595733643, + "step": 675 + }, + { + "epoch": 0.8830108580292269, + "grad_norm": 23.258634574631902, + "learning_rate": 2.0362323577845424e-08, + "logits/chosen": -0.9350968599319458, + "logits/rejected": -0.9319708347320557, + "logps/chosen": -831.6373291015625, + "logps/rejected": -1064.093505859375, + "loss": 0.3229, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.654766082763672, + "rewards/margins": 2.459113836288452, + "rewards/rejected": -7.113880157470703, + "step": 676 + }, + { + "epoch": 0.8843170871091518, + "grad_norm": 17.971275755186788, + "learning_rate": 1.991345410771017e-08, + "logits/chosen": -0.9765560626983643, + "logits/rejected": -1.001975655555725, + "logps/chosen": -934.716796875, + "logps/rejected": -1197.37109375, + "loss": 0.3025, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.539895057678223, + "rewards/margins": 2.42952823638916, + "rewards/rejected": -7.969422817230225, + "step": 677 + }, + { + "epoch": 0.8856233161890766, + "grad_norm": 28.491347745271142, + "learning_rate": 1.9469382119406714e-08, + "logits/chosen": -0.9596845507621765, + "logits/rejected": -0.8702284097671509, + "logps/chosen": -802.1336669921875, + "logps/rejected": -1096.5390625, + "loss": 0.2762, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.722399711608887, + "rewards/margins": 3.0645108222961426, + "rewards/rejected": -7.7869110107421875, + "step": 678 + }, + { + "epoch": 0.8869295452690016, + "grad_norm": 32.96394591000798, + "learning_rate": 1.9030116872178314e-08, + "logits/chosen": -0.9617183804512024, + "logits/rejected": -1.048295021057129, + "logps/chosen": -864.1490478515625, + "logps/rejected": -1119.6983642578125, + "loss": 0.4001, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.045253753662109, + "rewards/margins": 1.980230689048767, + "rewards/rejected": -7.025485038757324, + "step": 679 + }, + { + "epoch": 0.8882357743489264, + "grad_norm": 21.241499611321817, + "learning_rate": 1.8595667525043963e-08, + "logits/chosen": -0.9886503219604492, + "logits/rejected": -0.9136042594909668, + "logps/chosen": -812.7432861328125, + "logps/rejected": -1024.8572998046875, + "loss": 0.3109, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.500929832458496, + "rewards/margins": 2.2964494228363037, + "rewards/rejected": -6.797379016876221, + "step": 680 + }, + { + "epoch": 0.8895420034288514, + "grad_norm": 23.122708193972542, + "learning_rate": 1.816604313660741e-08, + "logits/chosen": -1.0124708414077759, + "logits/rejected": -1.0051053762435913, + "logps/chosen": -804.603271484375, + "logps/rejected": -1008.0371704101562, + "loss": 0.3708, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.575305461883545, + "rewards/margins": 2.1409244537353516, + "rewards/rejected": -6.716229438781738, + "step": 681 + }, + { + "epoch": 0.8908482325087762, + "grad_norm": 20.451259814838977, + "learning_rate": 1.7741252664868312e-08, + "logits/chosen": -0.8169146180152893, + "logits/rejected": -0.8272385597229004, + "logps/chosen": -828.5379638671875, + "logps/rejected": -1150.41943359375, + "loss": 0.34, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.057079315185547, + "rewards/margins": 2.8700027465820312, + "rewards/rejected": -7.927082061767578, + "step": 682 + }, + { + "epoch": 0.8921544615887012, + "grad_norm": 30.63702941806789, + "learning_rate": 1.7321304967035487e-08, + "logits/chosen": -0.9371048808097839, + "logits/rejected": -0.8836950063705444, + "logps/chosen": -878.0043334960938, + "logps/rejected": -1135.9581298828125, + "loss": 0.325, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.758838653564453, + "rewards/margins": 2.4666881561279297, + "rewards/rejected": -7.225526809692383, + "step": 683 + }, + { + "epoch": 0.893460690668626, + "grad_norm": 19.813773854758352, + "learning_rate": 1.6906208799342014e-08, + "logits/chosen": -0.9349536895751953, + "logits/rejected": -0.9600227475166321, + "logps/chosen": -775.5115966796875, + "logps/rejected": -1027.392578125, + "loss": 0.3199, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.448516368865967, + "rewards/margins": 2.41306734085083, + "rewards/rejected": -6.861583709716797, + "step": 684 + }, + { + "epoch": 0.8947669197485509, + "grad_norm": 29.17870420577452, + "learning_rate": 1.649597281686302e-08, + "logits/chosen": -0.8184996247291565, + "logits/rejected": -0.9192086458206177, + "logps/chosen": -812.7239990234375, + "logps/rejected": -1163.47412109375, + "loss": 0.3899, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.799856662750244, + "rewards/margins": 2.99330997467041, + "rewards/rejected": -7.793166637420654, + "step": 685 + }, + { + "epoch": 0.8960731488284758, + "grad_norm": 20.560672395597233, + "learning_rate": 1.6090605573334915e-08, + "logits/chosen": -0.8407419919967651, + "logits/rejected": -0.9089653491973877, + "logps/chosen": -818.4083862304688, + "logps/rejected": -1127.5633544921875, + "loss": 0.2907, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.807866096496582, + "rewards/margins": 2.694227457046509, + "rewards/rejected": -7.502093315124512, + "step": 686 + }, + { + "epoch": 0.8973793779084007, + "grad_norm": 16.610603081015654, + "learning_rate": 1.569011552097718e-08, + "logits/chosen": -0.8607504963874817, + "logits/rejected": -0.88968825340271, + "logps/chosen": -860.8826293945312, + "logps/rejected": -1173.7161865234375, + "loss": 0.2899, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.202151775360107, + "rewards/margins": 2.443506956100464, + "rewards/rejected": -7.645658493041992, + "step": 687 + }, + { + "epoch": 0.8986856069883256, + "grad_norm": 30.987286097239437, + "learning_rate": 1.5294511010316145e-08, + "logits/chosen": -0.8558975458145142, + "logits/rejected": -0.8481131792068481, + "logps/chosen": -832.5034790039062, + "logps/rejected": -972.0008544921875, + "loss": 0.4205, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.9938507080078125, + "rewards/margins": 1.4612228870391846, + "rewards/rejected": -6.455073356628418, + "step": 688 + }, + { + "epoch": 0.8999918360682505, + "grad_norm": 32.486020006493476, + "learning_rate": 1.4903800290010815e-08, + "logits/chosen": -0.9570955038070679, + "logits/rejected": -0.9459174871444702, + "logps/chosen": -849.517333984375, + "logps/rejected": -1078.22314453125, + "loss": 0.4155, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.753889083862305, + "rewards/margins": 2.205451488494873, + "rewards/rejected": -6.959341049194336, + "step": 689 + }, + { + "epoch": 0.9012980651481753, + "grad_norm": 25.45059241483055, + "learning_rate": 1.4517991506680761e-08, + "logits/chosen": -0.9805968999862671, + "logits/rejected": -1.0026856660842896, + "logps/chosen": -796.7140502929688, + "logps/rejected": -1002.83447265625, + "loss": 0.3279, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.365528583526611, + "rewards/margins": 2.011324167251587, + "rewards/rejected": -6.376852512359619, + "step": 690 + }, + { + "epoch": 0.9026042942281003, + "grad_norm": 20.08399102951379, + "learning_rate": 1.4137092704736564e-08, + "logits/chosen": -1.0272362232208252, + "logits/rejected": -1.046217679977417, + "logps/chosen": -896.72314453125, + "logps/rejected": -1169.2354736328125, + "loss": 0.2868, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.155872344970703, + "rewards/margins": 2.6793265342712402, + "rewards/rejected": -7.835198402404785, + "step": 691 + }, + { + "epoch": 0.9039105233080251, + "grad_norm": 19.737152866013854, + "learning_rate": 1.3761111826211813e-08, + "logits/chosen": -0.884811282157898, + "logits/rejected": -0.93790203332901, + "logps/chosen": -825.8671875, + "logps/rejected": -1213.501220703125, + "loss": 0.3309, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.409741401672363, + "rewards/margins": 3.5079073905944824, + "rewards/rejected": -7.917649269104004, + "step": 692 + }, + { + "epoch": 0.9052167523879501, + "grad_norm": 24.701952490736144, + "learning_rate": 1.3390056710597647e-08, + "logits/chosen": -1.0386333465576172, + "logits/rejected": -1.0525237321853638, + "logps/chosen": -774.3814697265625, + "logps/rejected": -959.9278564453125, + "loss": 0.3527, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.583800315856934, + "rewards/margins": 1.6484012603759766, + "rewards/rejected": -6.23220157623291, + "step": 693 + }, + { + "epoch": 0.9065229814678749, + "grad_norm": 25.601346480134133, + "learning_rate": 1.302393509467925e-08, + "logits/chosen": -1.007061243057251, + "logits/rejected": -1.0328550338745117, + "logps/chosen": -828.13525390625, + "logps/rejected": -1115.3458251953125, + "loss": 0.3754, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.077081680297852, + "rewards/margins": 2.570059299468994, + "rewards/rejected": -7.6471405029296875, + "step": 694 + }, + { + "epoch": 0.9078292105477999, + "grad_norm": 22.751192382217827, + "learning_rate": 1.2662754612374482e-08, + "logits/chosen": -1.1416237354278564, + "logits/rejected": -1.1116670370101929, + "logps/chosen": -862.865966796875, + "logps/rejected": -1017.9955444335938, + "loss": 0.3109, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.581521511077881, + "rewards/margins": 1.9770212173461914, + "rewards/rejected": -6.5585432052612305, + "step": 695 + }, + { + "epoch": 0.9091354396277247, + "grad_norm": 29.147918711392688, + "learning_rate": 1.2306522794574864e-08, + "logits/chosen": -0.9129850268363953, + "logits/rejected": -0.9268524646759033, + "logps/chosen": -758.1630249023438, + "logps/rejected": -1032.794677734375, + "loss": 0.3501, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.635600566864014, + "rewards/margins": 2.712730646133423, + "rewards/rejected": -7.348330974578857, + "step": 696 + }, + { + "epoch": 0.9104416687076496, + "grad_norm": 18.5578048726823, + "learning_rate": 1.195524706898826e-08, + "logits/chosen": -0.959962010383606, + "logits/rejected": -0.9326778650283813, + "logps/chosen": -780.856689453125, + "logps/rejected": -976.901611328125, + "loss": 0.2673, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.898529529571533, + "rewards/margins": 2.022247076034546, + "rewards/rejected": -6.9207763671875, + "step": 697 + }, + { + "epoch": 0.9117478977875745, + "grad_norm": 19.744948826838282, + "learning_rate": 1.1608934759984424e-08, + "logits/chosen": -0.9286206960678101, + "logits/rejected": -1.01764976978302, + "logps/chosen": -881.43896484375, + "logps/rejected": -1238.6363525390625, + "loss": 0.3231, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.401637554168701, + "rewards/margins": 2.623457908630371, + "rewards/rejected": -8.025094985961914, + "step": 698 + }, + { + "epoch": 0.9130541268674994, + "grad_norm": 32.792459109040685, + "learning_rate": 1.1267593088441884e-08, + "logits/chosen": -0.8476910591125488, + "logits/rejected": -0.8974804282188416, + "logps/chosen": -830.3240356445312, + "logps/rejected": -1020.6620483398438, + "loss": 0.3228, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.793473243713379, + "rewards/margins": 2.098773956298828, + "rewards/rejected": -6.892247200012207, + "step": 699 + }, + { + "epoch": 0.9143603559474243, + "grad_norm": 16.358095159369643, + "learning_rate": 1.0931229171597583e-08, + "logits/chosen": -0.9544919729232788, + "logits/rejected": -0.9292432069778442, + "logps/chosen": -915.648681640625, + "logps/rejected": -1094.630126953125, + "loss": 0.3251, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.481724262237549, + "rewards/margins": 2.09407377243042, + "rewards/rejected": -7.575798034667969, + "step": 700 + }, + { + "epoch": 0.9143603559474243, + "eval_logits/chosen": -0.8042228817939758, + "eval_logits/rejected": -0.8119060397148132, + "eval_logps/chosen": -827.6954345703125, + "eval_logps/rejected": -1104.072998046875, + "eval_loss": 0.33346831798553467, + "eval_rewards/accuracies": 0.8880000114440918, + "eval_rewards/chosen": -4.836644649505615, + "eval_rewards/margins": 2.7027976512908936, + "eval_rewards/rejected": -7.53944206237793, + "eval_runtime": 304.3609, + "eval_samples_per_second": 6.571, + "eval_steps_per_second": 0.411, + "step": 700 + }, + { + "epoch": 0.9156665850273492, + "grad_norm": 45.78976016140901, + "learning_rate": 1.0599850022898537e-08, + "logits/chosen": -0.7225776314735413, + "logits/rejected": -0.6860483884811401, + "logps/chosen": -801.6976318359375, + "logps/rejected": -1225.3739013671875, + "loss": 0.2817, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.951022624969482, + "rewards/margins": 3.52390193939209, + "rewards/rejected": -8.47492504119873, + "step": 701 + }, + { + "epoch": 0.916972814107274, + "grad_norm": 25.229183577535213, + "learning_rate": 1.0273462551855295e-08, + "logits/chosen": -0.9126315712928772, + "logits/rejected": -0.9276062250137329, + "logps/chosen": -742.0897216796875, + "logps/rejected": -975.28125, + "loss": 0.365, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.212459564208984, + "rewards/margins": 2.135211229324341, + "rewards/rejected": -6.347670555114746, + "step": 702 + }, + { + "epoch": 0.918279043187199, + "grad_norm": 18.00129692277072, + "learning_rate": 9.952073563898322e-09, + "logits/chosen": -0.9886517524719238, + "logits/rejected": -1.0185874700546265, + "logps/chosen": -852.1192626953125, + "logps/rejected": -1145.8603515625, + "loss": 0.3242, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.017488479614258, + "rewards/margins": 2.6713109016418457, + "rewards/rejected": -7.688799858093262, + "step": 703 + }, + { + "epoch": 0.9195852722671238, + "grad_norm": 31.544292267398593, + "learning_rate": 9.635689760235682e-09, + "logits/chosen": -0.9542191624641418, + "logits/rejected": -0.9516808986663818, + "logps/chosen": -867.5074462890625, + "logps/rejected": -1126.504638671875, + "loss": 0.4693, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.200738430023193, + "rewards/margins": 2.536679983139038, + "rewards/rejected": -7.7374186515808105, + "step": 704 + }, + { + "epoch": 0.9208915013470488, + "grad_norm": 46.72515936026558, + "learning_rate": 9.324317737713555e-09, + "logits/chosen": -0.9351953864097595, + "logits/rejected": -0.9409655928611755, + "logps/chosen": -792.6817016601562, + "logps/rejected": -1014.02685546875, + "loss": 0.3214, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.412503242492676, + "rewards/margins": 2.4800729751586914, + "rewards/rejected": -6.892575740814209, + "step": 705 + }, + { + "epoch": 0.9221977304269736, + "grad_norm": 23.61367143969372, + "learning_rate": 9.017963988678601e-09, + "logits/chosen": -1.03610360622406, + "logits/rejected": -0.990273654460907, + "logps/chosen": -841.663818359375, + "logps/rejected": -1107.4656982421875, + "loss": 0.3304, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.593728065490723, + "rewards/margins": 2.6971254348754883, + "rewards/rejected": -7.290853500366211, + "step": 706 + }, + { + "epoch": 0.9235039595068986, + "grad_norm": 23.281354565557432, + "learning_rate": 8.716634900842651e-09, + "logits/chosen": -0.9835007786750793, + "logits/rejected": -0.9869141578674316, + "logps/chosen": -792.2289428710938, + "logps/rejected": -1047.859375, + "loss": 0.3497, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.580854892730713, + "rewards/margins": 2.6027719974517822, + "rewards/rejected": -7.183626174926758, + "step": 707 + }, + { + "epoch": 0.9248101885868234, + "grad_norm": 24.611145520672583, + "learning_rate": 8.420336757149454e-09, + "logits/chosen": -1.1103384494781494, + "logits/rejected": -1.1257877349853516, + "logps/chosen": -846.7141723632812, + "logps/rejected": -1046.1112060546875, + "loss": 0.397, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.437961101531982, + "rewards/margins": 2.0100278854370117, + "rewards/rejected": -6.447989463806152, + "step": 708 + }, + { + "epoch": 0.9261164176667483, + "grad_norm": 29.316192799394898, + "learning_rate": 8.129075735643698e-09, + "logits/chosen": -0.9531034827232361, + "logits/rejected": -0.8710059523582458, + "logps/chosen": -958.839599609375, + "logps/rejected": -1179.770751953125, + "loss": 0.3341, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.782157897949219, + "rewards/margins": 2.180623769760132, + "rewards/rejected": -7.96278190612793, + "step": 709 + }, + { + "epoch": 0.9274226467466732, + "grad_norm": 67.50239470619628, + "learning_rate": 7.842857909342165e-09, + "logits/chosen": -0.8792818188667297, + "logits/rejected": -0.8596967458724976, + "logps/chosen": -762.2716064453125, + "logps/rejected": -981.783447265625, + "loss": 0.3203, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.564942359924316, + "rewards/margins": 2.2731096744537354, + "rewards/rejected": -6.838052272796631, + "step": 710 + }, + { + "epoch": 0.9287288758265981, + "grad_norm": 22.773314772539816, + "learning_rate": 7.561689246107145e-09, + "logits/chosen": -0.9148231744766235, + "logits/rejected": -0.9793925881385803, + "logps/chosen": -750.6596069335938, + "logps/rejected": -1137.5262451171875, + "loss": 0.3189, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.167259216308594, + "rewards/margins": 3.1219406127929688, + "rewards/rejected": -7.2891998291015625, + "step": 711 + }, + { + "epoch": 0.930035104906523, + "grad_norm": 22.30274318697143, + "learning_rate": 7.2855756085219714e-09, + "logits/chosen": -0.8467492461204529, + "logits/rejected": -0.8688889741897583, + "logps/chosen": -749.0939331054688, + "logps/rejected": -948.42626953125, + "loss": 0.3543, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.155250072479248, + "rewards/margins": 2.0990848541259766, + "rewards/rejected": -6.254334449768066, + "step": 712 + }, + { + "epoch": 0.9313413339864479, + "grad_norm": 22.012504399894368, + "learning_rate": 7.014522753768848e-09, + "logits/chosen": -0.977868914604187, + "logits/rejected": -1.0833169221878052, + "logps/chosen": -883.92529296875, + "logps/rejected": -1315.0213623046875, + "loss": 0.2862, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.060840606689453, + "rewards/margins": 3.245311975479126, + "rewards/rejected": -8.306153297424316, + "step": 713 + }, + { + "epoch": 0.9326475630663728, + "grad_norm": 18.190681387753177, + "learning_rate": 6.7485363335087475e-09, + "logits/chosen": -0.8860818147659302, + "logits/rejected": -0.9461392164230347, + "logps/chosen": -795.0383911132812, + "logps/rejected": -1020.0634765625, + "loss": 0.3417, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.095912933349609, + "rewards/margins": 2.0163002014160156, + "rewards/rejected": -7.112213134765625, + "step": 714 + }, + { + "epoch": 0.9339537921462977, + "grad_norm": 26.781074891011134, + "learning_rate": 6.4876218937634786e-09, + "logits/chosen": -0.7991393208503723, + "logits/rejected": -0.7921952605247498, + "logps/chosen": -789.0760498046875, + "logps/rejected": -1029.6826171875, + "loss": 0.3409, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.609364032745361, + "rewards/margins": 2.325976610183716, + "rewards/rejected": -6.93533992767334, + "step": 715 + }, + { + "epoch": 0.9352600212262225, + "grad_norm": 23.807291079941198, + "learning_rate": 6.231784874800306e-09, + "logits/chosen": -0.9109543561935425, + "logits/rejected": -0.9539563655853271, + "logps/chosen": -866.422607421875, + "logps/rejected": -1126.484130859375, + "loss": 0.3146, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.959641456604004, + "rewards/margins": 2.240410566329956, + "rewards/rejected": -7.200052261352539, + "step": 716 + }, + { + "epoch": 0.9365662503061475, + "grad_norm": 26.79365637604462, + "learning_rate": 5.981030611018234e-09, + "logits/chosen": -0.8974629640579224, + "logits/rejected": -0.9451881647109985, + "logps/chosen": -748.3617553710938, + "logps/rejected": -979.998291015625, + "loss": 0.2984, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.526296615600586, + "rewards/margins": 1.7230241298675537, + "rewards/rejected": -6.2493205070495605, + "step": 717 + }, + { + "epoch": 0.9378724793860723, + "grad_norm": 24.570663697903854, + "learning_rate": 5.735364330836906e-09, + "logits/chosen": -0.8672972321510315, + "logits/rejected": -0.9144647717475891, + "logps/chosen": -848.5457763671875, + "logps/rejected": -1264.940185546875, + "loss": 0.2165, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.055003643035889, + "rewards/margins": 3.4090349674224854, + "rewards/rejected": -8.464038848876953, + "step": 718 + }, + { + "epoch": 0.9391787084659973, + "grad_norm": 15.934773179952451, + "learning_rate": 5.494791156587686e-09, + "logits/chosen": -1.0396963357925415, + "logits/rejected": -1.020418405532837, + "logps/chosen": -749.0263671875, + "logps/rejected": -883.5203857421875, + "loss": 0.339, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.272754669189453, + "rewards/margins": 1.393311858177185, + "rewards/rejected": -5.666066646575928, + "step": 719 + }, + { + "epoch": 0.9404849375459221, + "grad_norm": 20.2536773624075, + "learning_rate": 5.259316104406636e-09, + "logits/chosen": -0.9552453756332397, + "logits/rejected": -0.940865159034729, + "logps/chosen": -834.688232421875, + "logps/rejected": -1037.806396484375, + "loss": 0.3436, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.794745445251465, + "rewards/margins": 2.015627384185791, + "rewards/rejected": -6.810372829437256, + "step": 720 + }, + { + "epoch": 0.941791166625847, + "grad_norm": 53.84085219336435, + "learning_rate": 5.028944084130155e-09, + "logits/chosen": -0.914104163646698, + "logits/rejected": -0.9181921482086182, + "logps/chosen": -845.41455078125, + "logps/rejected": -1107.7197265625, + "loss": 0.3146, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.820213794708252, + "rewards/margins": 2.7967965602874756, + "rewards/rejected": -7.61700963973999, + "step": 721 + }, + { + "epoch": 0.9430973957057719, + "grad_norm": 20.32358933591622, + "learning_rate": 4.803679899192392e-09, + "logits/chosen": -1.017437219619751, + "logits/rejected": -0.9803633093833923, + "logps/chosen": -998.7100830078125, + "logps/rejected": -1228.1575927734375, + "loss": 0.3148, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.515434741973877, + "rewards/margins": 2.422970771789551, + "rewards/rejected": -7.938405513763428, + "step": 722 + }, + { + "epoch": 0.9444036247856968, + "grad_norm": 26.226455627687997, + "learning_rate": 4.5835282465252476e-09, + "logits/chosen": -0.8738875985145569, + "logits/rejected": -0.9390866756439209, + "logps/chosen": -794.534423828125, + "logps/rejected": -1080.243896484375, + "loss": 0.2523, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.412847518920898, + "rewards/margins": 2.8086698055267334, + "rewards/rejected": -7.221517562866211, + "step": 723 + }, + { + "epoch": 0.9457098538656217, + "grad_norm": 33.10596601901346, + "learning_rate": 4.368493716460392e-09, + "logits/chosen": -0.9799577593803406, + "logits/rejected": -0.9760726094245911, + "logps/chosen": -836.212158203125, + "logps/rejected": -1077.455322265625, + "loss": 0.3313, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.865520477294922, + "rewards/margins": 2.487382411956787, + "rewards/rejected": -7.352903366088867, + "step": 724 + }, + { + "epoch": 0.9470160829455466, + "grad_norm": 21.851274725696168, + "learning_rate": 4.158580792633482e-09, + "logits/chosen": -1.0691559314727783, + "logits/rejected": -0.9570724964141846, + "logps/chosen": -888.2042846679688, + "logps/rejected": -1064.596923828125, + "loss": 0.3111, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.672724723815918, + "rewards/margins": 2.358022451400757, + "rewards/rejected": -7.030746936798096, + "step": 725 + }, + { + "epoch": 0.9483223120254715, + "grad_norm": 38.81636628127586, + "learning_rate": 3.953793851890791e-09, + "logits/chosen": -0.9587030410766602, + "logits/rejected": -0.9567082524299622, + "logps/chosen": -809.8760986328125, + "logps/rejected": -1075.88916015625, + "loss": 0.2958, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.480008602142334, + "rewards/margins": 2.473860740661621, + "rewards/rejected": -6.953869819641113, + "step": 726 + }, + { + "epoch": 0.9496285411053964, + "grad_norm": 24.374217026089934, + "learning_rate": 3.754137164197923e-09, + "logits/chosen": -0.9956728219985962, + "logits/rejected": -0.962954044342041, + "logps/chosen": -934.9635009765625, + "logps/rejected": -1057.8211669921875, + "loss": 0.3842, + "rewards/accuracies": 0.78125, + "rewards/chosen": -5.475031852722168, + "rewards/margins": 1.5260260105133057, + "rewards/rejected": -7.0010576248168945, + "step": 727 + }, + { + "epoch": 0.9509347701853212, + "grad_norm": 21.760779525811405, + "learning_rate": 3.559614892550661e-09, + "logits/chosen": -0.7743152976036072, + "logits/rejected": -0.794499397277832, + "logps/chosen": -774.2794799804688, + "logps/rejected": -1053.2183837890625, + "loss": 0.3013, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.855001926422119, + "rewards/margins": 2.7619917392730713, + "rewards/rejected": -7.6169939041137695, + "step": 728 + }, + { + "epoch": 0.9522409992652462, + "grad_norm": 27.51247655254993, + "learning_rate": 3.370231092888365e-09, + "logits/chosen": -0.9370498657226562, + "logits/rejected": -0.9787413477897644, + "logps/chosen": -823.4442749023438, + "logps/rejected": -1096.9632568359375, + "loss": 0.313, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.691824913024902, + "rewards/margins": 2.6661367416381836, + "rewards/rejected": -7.357962131500244, + "step": 729 + }, + { + "epoch": 0.953547228345171, + "grad_norm": 22.64921257852498, + "learning_rate": 3.185989714009185e-09, + "logits/chosen": -0.8871971368789673, + "logits/rejected": -0.9147964715957642, + "logps/chosen": -774.44921875, + "logps/rejected": -1189.8350830078125, + "loss": 0.2955, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.431604385375977, + "rewards/margins": 3.59488582611084, + "rewards/rejected": -8.026490211486816, + "step": 730 + }, + { + "epoch": 0.954853457425096, + "grad_norm": 38.98489185084576, + "learning_rate": 3.0068945974878744e-09, + "logits/chosen": -0.9947443604469299, + "logits/rejected": -0.9405471682548523, + "logps/chosen": -875.498291015625, + "logps/rejected": -1069.519287109375, + "loss": 0.2867, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.040494918823242, + "rewards/margins": 2.050215005874634, + "rewards/rejected": -7.090709686279297, + "step": 731 + }, + { + "epoch": 0.9561596865050208, + "grad_norm": 24.62642376244049, + "learning_rate": 2.8329494775956862e-09, + "logits/chosen": -1.1279411315917969, + "logits/rejected": -1.147200584411621, + "logps/chosen": -838.68359375, + "logps/rejected": -1042.97412109375, + "loss": 0.2784, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.729671478271484, + "rewards/margins": 2.1291439533233643, + "rewards/rejected": -6.858816146850586, + "step": 732 + }, + { + "epoch": 0.9574659155849458, + "grad_norm": 28.94900881890447, + "learning_rate": 2.664157981222437e-09, + "logits/chosen": -0.9128333330154419, + "logits/rejected": -0.9717162847518921, + "logps/chosen": -858.6765747070312, + "logps/rejected": -1108.43115234375, + "loss": 0.3688, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.196346759796143, + "rewards/margins": 2.261138916015625, + "rewards/rejected": -7.457486152648926, + "step": 733 + }, + { + "epoch": 0.9587721446648706, + "grad_norm": 29.009131812279637, + "learning_rate": 2.5005236278009546e-09, + "logits/chosen": -0.973136305809021, + "logits/rejected": -0.9964556694030762, + "logps/chosen": -806.8096923828125, + "logps/rejected": -1122.0406494140625, + "loss": 0.3474, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.701882839202881, + "rewards/margins": 2.8338756561279297, + "rewards/rejected": -7.5357584953308105, + "step": 734 + }, + { + "epoch": 0.9600783737447955, + "grad_norm": 19.15685039048966, + "learning_rate": 2.342049829233611e-09, + "logits/chosen": -0.8886818885803223, + "logits/rejected": -0.9232064485549927, + "logps/chosen": -770.3201293945312, + "logps/rejected": -1065.265869140625, + "loss": 0.2658, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.26518440246582, + "rewards/margins": 3.126837730407715, + "rewards/rejected": -7.392023086547852, + "step": 735 + }, + { + "epoch": 0.9613846028247204, + "grad_norm": 29.54789803184001, + "learning_rate": 2.188739889821267e-09, + "logits/chosen": -0.9659155607223511, + "logits/rejected": -0.9455364942550659, + "logps/chosen": -881.7896118164062, + "logps/rejected": -1096.7828369140625, + "loss": 0.3206, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.22705078125, + "rewards/margins": 2.3798322677612305, + "rewards/rejected": -7.606882095336914, + "step": 736 + }, + { + "epoch": 0.9626908319046453, + "grad_norm": 28.345744890500455, + "learning_rate": 2.0405970061943003e-09, + "logits/chosen": -0.852986752986908, + "logits/rejected": -0.8692293167114258, + "logps/chosen": -796.1908569335938, + "logps/rejected": -921.0655517578125, + "loss": 0.3987, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.667433738708496, + "rewards/margins": 1.1772699356079102, + "rewards/rejected": -5.844703674316406, + "step": 737 + }, + { + "epoch": 0.9639970609845702, + "grad_norm": 22.909021102233012, + "learning_rate": 1.897624267246073e-09, + "logits/chosen": -0.9738621115684509, + "logits/rejected": -1.0005155801773071, + "logps/chosen": -783.131103515625, + "logps/rejected": -1031.7144775390625, + "loss": 0.2402, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.542294502258301, + "rewards/margins": 2.2111833095550537, + "rewards/rejected": -6.753478050231934, + "step": 738 + }, + { + "epoch": 0.9653032900644951, + "grad_norm": 30.905899100885105, + "learning_rate": 1.7598246540683481e-09, + "logits/chosen": -0.9035788774490356, + "logits/rejected": -0.891940712928772, + "logps/chosen": -839.0357666015625, + "logps/rejected": -1132.1015625, + "loss": 0.3277, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.749287128448486, + "rewards/margins": 3.2719829082489014, + "rewards/rejected": -8.021270751953125, + "step": 739 + }, + { + "epoch": 0.96660951914442, + "grad_norm": 20.962666279736965, + "learning_rate": 1.6272010398893088e-09, + "logits/chosen": -1.1005297899246216, + "logits/rejected": -1.0309395790100098, + "logps/chosen": -900.8092041015625, + "logps/rejected": -1103.6551513671875, + "loss": 0.2369, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.278766632080078, + "rewards/margins": 2.1895079612731934, + "rewards/rejected": -7.4682745933532715, + "step": 740 + }, + { + "epoch": 0.9679157482243449, + "grad_norm": 29.86451006227898, + "learning_rate": 1.4997561900135236e-09, + "logits/chosen": -1.0119831562042236, + "logits/rejected": -1.0092520713806152, + "logps/chosen": -885.3237915039062, + "logps/rejected": -1074.3250732421875, + "loss": 0.3015, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.477841377258301, + "rewards/margins": 1.7145339250564575, + "rewards/rejected": -7.192375183105469, + "step": 741 + }, + { + "epoch": 0.9692219773042697, + "grad_norm": 22.580987418682184, + "learning_rate": 1.377492761764354e-09, + "logits/chosen": -0.9602839946746826, + "logits/rejected": -0.9752902388572693, + "logps/chosen": -784.1845703125, + "logps/rejected": -1084.4151611328125, + "loss": 0.2918, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.722711086273193, + "rewards/margins": 2.7062840461730957, + "rewards/rejected": -7.428995132446289, + "step": 742 + }, + { + "epoch": 0.9705282063841947, + "grad_norm": 26.643058025664857, + "learning_rate": 1.2604133044284982e-09, + "logits/chosen": -0.8779647946357727, + "logits/rejected": -0.9308136701583862, + "logps/chosen": -818.1796264648438, + "logps/rejected": -1074.2091064453125, + "loss": 0.3351, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.8115034103393555, + "rewards/margins": 2.1788394451141357, + "rewards/rejected": -6.990341663360596, + "step": 743 + }, + { + "epoch": 0.9718344354641195, + "grad_norm": 23.47737766636236, + "learning_rate": 1.148520259202923e-09, + "logits/chosen": -1.0406291484832764, + "logits/rejected": -1.0501799583435059, + "logps/chosen": -862.27978515625, + "logps/rejected": -1064.9189453125, + "loss": 0.3525, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.038253307342529, + "rewards/margins": 2.2377071380615234, + "rewards/rejected": -7.275960922241211, + "step": 744 + }, + { + "epoch": 0.9731406645440445, + "grad_norm": 18.114965193565368, + "learning_rate": 1.0418159591438214e-09, + "logits/chosen": -0.9138543009757996, + "logits/rejected": -0.9569115042686462, + "logps/chosen": -823.882568359375, + "logps/rejected": -1185.687744140625, + "loss": 0.3084, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.7956647872924805, + "rewards/margins": 3.1947028636932373, + "rewards/rejected": -7.990367412567139, + "step": 745 + }, + { + "epoch": 0.9744468936239693, + "grad_norm": 18.817714111289238, + "learning_rate": 9.403026291181505e-10, + "logits/chosen": -0.9524965286254883, + "logits/rejected": -0.9789271354675293, + "logps/chosen": -869.812744140625, + "logps/rejected": -1176.66943359375, + "loss": 0.3283, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.999002933502197, + "rewards/margins": 2.8612561225891113, + "rewards/rejected": -7.860259056091309, + "step": 746 + }, + { + "epoch": 0.9757531227038942, + "grad_norm": 24.90036779280411, + "learning_rate": 8.439823857570305e-10, + "logits/chosen": -0.8803153038024902, + "logits/rejected": -0.9198551774024963, + "logps/chosen": -919.7659912109375, + "logps/rejected": -1150.556884765625, + "loss": 0.39, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.701601028442383, + "rewards/margins": 1.7264282703399658, + "rewards/rejected": -7.4280290603637695, + "step": 747 + }, + { + "epoch": 0.9770593517838191, + "grad_norm": 21.430458788071867, + "learning_rate": 7.52857237411808e-10, + "logits/chosen": -0.9212626814842224, + "logits/rejected": -0.9755610227584839, + "logps/chosen": -859.5005493164062, + "logps/rejected": -1160.1336669921875, + "loss": 0.298, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.9637250900268555, + "rewards/margins": 2.754920482635498, + "rewards/rejected": -7.718645095825195, + "step": 748 + }, + { + "epoch": 0.978365580863744, + "grad_norm": 24.23282484362412, + "learning_rate": 6.66929084112089e-10, + "logits/chosen": -1.0224872827529907, + "logits/rejected": -1.0391262769699097, + "logps/chosen": -782.97998046875, + "logps/rejected": -1147.94775390625, + "loss": 0.2762, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.704994201660156, + "rewards/margins": 3.344956874847412, + "rewards/rejected": -8.04995059967041, + "step": 749 + }, + { + "epoch": 0.9796718099436689, + "grad_norm": 46.74074760052989, + "learning_rate": 5.861997175260758e-10, + "logits/chosen": -0.9382998943328857, + "logits/rejected": -0.8692302703857422, + "logps/chosen": -863.0648193359375, + "logps/rejected": -1199.2513427734375, + "loss": 0.3445, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.992151260375977, + "rewards/margins": 3.3170485496520996, + "rewards/rejected": -8.309200286865234, + "step": 750 + }, + { + "epoch": 0.9809780390235938, + "grad_norm": 45.54321160333492, + "learning_rate": 5.106708209232647e-10, + "logits/chosen": -0.9324420094490051, + "logits/rejected": -0.957527220249176, + "logps/chosen": -840.7574462890625, + "logps/rejected": -1093.603271484375, + "loss": 0.3618, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.038885593414307, + "rewards/margins": 2.237790107727051, + "rewards/rejected": -7.276675224304199, + "step": 751 + }, + { + "epoch": 0.9822842681035187, + "grad_norm": 22.458925326174544, + "learning_rate": 4.4034396913941727e-10, + "logits/chosen": -0.9433171153068542, + "logits/rejected": -1.005143404006958, + "logps/chosen": -894.079833984375, + "logps/rejected": -1195.688720703125, + "loss": 0.3837, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.210080146789551, + "rewards/margins": 2.7397055625915527, + "rewards/rejected": -7.949785232543945, + "step": 752 + }, + { + "epoch": 0.9835904971834436, + "grad_norm": 22.408106908118054, + "learning_rate": 3.7522062854355997e-10, + "logits/chosen": -0.9450298547744751, + "logits/rejected": -1.0143907070159912, + "logps/chosen": -807.2323608398438, + "logps/rejected": -1332.6236572265625, + "loss": 0.2876, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.483209133148193, + "rewards/margins": 4.754749298095703, + "rewards/rejected": -9.237958908081055, + "step": 753 + }, + { + "epoch": 0.9848967262633684, + "grad_norm": 19.418290612472866, + "learning_rate": 3.1530215700756313e-10, + "logits/chosen": -1.102285623550415, + "logits/rejected": -1.0857857465744019, + "logps/chosen": -883.404052734375, + "logps/rejected": -1100.610595703125, + "loss": 0.3042, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.034350872039795, + "rewards/margins": 2.195341110229492, + "rewards/rejected": -7.229691982269287, + "step": 754 + }, + { + "epoch": 0.9862029553432933, + "grad_norm": 35.85551364174609, + "learning_rate": 2.605898038777199e-10, + "logits/chosen": -0.9625508189201355, + "logits/rejected": -0.9833151698112488, + "logps/chosen": -790.8330078125, + "logps/rejected": -1025.868896484375, + "loss": 0.3716, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.699563503265381, + "rewards/margins": 2.502751350402832, + "rewards/rejected": -7.202314376831055, + "step": 755 + }, + { + "epoch": 0.9875091844232182, + "grad_norm": 28.53621501039099, + "learning_rate": 2.110847099488222e-10, + "logits/chosen": -0.94759202003479, + "logits/rejected": -0.9781441688537598, + "logps/chosen": -839.1832275390625, + "logps/rejected": -1023.45361328125, + "loss": 0.2928, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.758579254150391, + "rewards/margins": 1.8572137355804443, + "rewards/rejected": -6.615793228149414, + "step": 756 + }, + { + "epoch": 0.9888154135031431, + "grad_norm": 21.22918791897412, + "learning_rate": 1.6678790744015236e-10, + "logits/chosen": -0.9919801354408264, + "logits/rejected": -0.9901013374328613, + "logps/chosen": -787.056396484375, + "logps/rejected": -934.1174926757812, + "loss": 0.3382, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.492656230926514, + "rewards/margins": 1.5403560400009155, + "rewards/rejected": -6.033012390136719, + "step": 757 + }, + { + "epoch": 0.990121642583068, + "grad_norm": 16.615289037512497, + "learning_rate": 1.277003199742499e-10, + "logits/chosen": -0.981942892074585, + "logits/rejected": -0.9660012125968933, + "logps/chosen": -772.347900390625, + "logps/rejected": -923.9476928710938, + "loss": 0.2967, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.184555530548096, + "rewards/margins": 2.0594732761383057, + "rewards/rejected": -6.2440290451049805, + "step": 758 + }, + { + "epoch": 0.9914278716629928, + "grad_norm": 19.158852761980633, + "learning_rate": 9.382276255742727e-11, + "logits/chosen": -1.071539044380188, + "logits/rejected": -1.0275521278381348, + "logps/chosen": -838.9002075195312, + "logps/rejected": -1043.4322509765625, + "loss": 0.2753, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.9658560752868652, + "rewards/margins": 2.511625289916992, + "rewards/rejected": -6.477481842041016, + "step": 759 + }, + { + "epoch": 0.9927341007429178, + "grad_norm": 22.168120567752247, + "learning_rate": 6.515594156286663e-11, + "logits/chosen": -0.9493943452835083, + "logits/rejected": -0.910268247127533, + "logps/chosen": -899.511962890625, + "logps/rejected": -1148.454345703125, + "loss": 0.293, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.537384510040283, + "rewards/margins": 2.493018388748169, + "rewards/rejected": -8.030403137207031, + "step": 760 + }, + { + "epoch": 0.9940403298228426, + "grad_norm": 24.026839554437526, + "learning_rate": 4.170045471588168e-11, + "logits/chosen": -0.9243339896202087, + "logits/rejected": -0.9904308319091797, + "logps/chosen": -838.7510986328125, + "logps/rejected": -1014.429931640625, + "loss": 0.3245, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.799351692199707, + "rewards/margins": 1.7270526885986328, + "rewards/rejected": -6.52640438079834, + "step": 761 + }, + { + "epoch": 0.9953465589027676, + "grad_norm": 28.08092475083908, + "learning_rate": 2.3456791081455375e-11, + "logits/chosen": -0.8584170341491699, + "logits/rejected": -0.915625810623169, + "logps/chosen": -841.3638916015625, + "logps/rejected": -1255.8558349609375, + "loss": 0.3535, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.8940863609313965, + "rewards/margins": 3.722672939300537, + "rewards/rejected": -8.61676025390625, + "step": 762 + }, + { + "epoch": 0.9966527879826924, + "grad_norm": 28.280188569328004, + "learning_rate": 1.0425331054025876e-11, + "logits/chosen": -0.9674179553985596, + "logits/rejected": -0.9638044834136963, + "logps/chosen": -806.9545288085938, + "logps/rejected": -1109.9842529296875, + "loss": 0.3833, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.446547985076904, + "rewards/margins": 2.917111873626709, + "rewards/rejected": -7.363659858703613, + "step": 763 + }, + { + "epoch": 0.9979590170626174, + "grad_norm": 22.616993906433937, + "learning_rate": 2.6063463495762384e-12, + "logits/chosen": -0.9143103361129761, + "logits/rejected": -0.9458924531936646, + "logps/chosen": -783.2197875976562, + "logps/rejected": -1015.7607421875, + "loss": 0.3291, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.563847541809082, + "rewards/margins": 2.473099708557129, + "rewards/rejected": -7.036947250366211, + "step": 764 + }, + { + "epoch": 0.9992652461425422, + "grad_norm": 15.41969150521287, + "learning_rate": 0.0, + "logits/chosen": -0.9288902282714844, + "logits/rejected": -0.9434102773666382, + "logps/chosen": -739.760986328125, + "logps/rejected": -1050.915771484375, + "loss": 0.2618, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.296210289001465, + "rewards/margins": 2.6045422554016113, + "rewards/rejected": -6.900752544403076, + "step": 765 + }, + { + "epoch": 0.9992652461425422, + "step": 765, + "total_flos": 0.0, + "train_loss": 0.41019999388775796, + "train_runtime": 41758.0505, + "train_samples_per_second": 2.347, + "train_steps_per_second": 0.018 + } + ], + "logging_steps": 1, + "max_steps": 765, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}