aaaalongaa's picture
Model save
54ba22e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997382884061764,
"eval_steps": 100,
"global_step": 1910,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005234231876472127,
"grad_norm": 57.10669050657411,
"learning_rate": 2.094240837696335e-08,
"logits/chosen": -0.9002814292907715,
"logits/rejected": -0.9369659423828125,
"logps/chosen": -1.2799328565597534,
"logps/rejected": -1.224565863609314,
"loss": 1.3867,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.002279318869113922,
"rewards/margins": 0.01926611177623272,
"rewards/rejected": -0.02154543064534664,
"step": 10
},
{
"epoch": 0.010468463752944255,
"grad_norm": 205.46189868419341,
"learning_rate": 4.18848167539267e-08,
"logits/chosen": -0.9780851602554321,
"logits/rejected": -0.9534770250320435,
"logps/chosen": -1.2819010019302368,
"logps/rejected": -1.324920415878296,
"loss": 1.3854,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.018570536747574806,
"rewards/margins": 0.012391218915581703,
"rewards/rejected": 0.006179317831993103,
"step": 20
},
{
"epoch": 0.015702695629416383,
"grad_norm": 424.88597498836,
"learning_rate": 6.282722513089005e-08,
"logits/chosen": -0.9846030473709106,
"logits/rejected": -1.001585841178894,
"logps/chosen": -1.3621526956558228,
"logps/rejected": -1.4822871685028076,
"loss": 1.3862,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.006618998944759369,
"rewards/margins": 0.005857313051819801,
"rewards/rejected": 0.0007616858929395676,
"step": 30
},
{
"epoch": 0.02093692750588851,
"grad_norm": 111.87617735661307,
"learning_rate": 8.37696335078534e-08,
"logits/chosen": -0.9275991320610046,
"logits/rejected": -1.0113765001296997,
"logps/chosen": -1.279705286026001,
"logps/rejected": -1.6348555088043213,
"loss": 1.3855,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.006214768625795841,
"rewards/margins": -0.005492387805134058,
"rewards/rejected": -0.0007223807042464614,
"step": 40
},
{
"epoch": 0.02617115938236064,
"grad_norm": 73.6668063718139,
"learning_rate": 1.0471204188481675e-07,
"logits/chosen": -0.8899806141853333,
"logits/rejected": -0.9631911516189575,
"logps/chosen": -1.2982203960418701,
"logps/rejected": -1.5544034242630005,
"loss": 1.3849,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.00998431071639061,
"rewards/margins": 0.011886270716786385,
"rewards/rejected": -0.0019019600003957748,
"step": 50
},
{
"epoch": 0.031405391258832765,
"grad_norm": 116.30393708461156,
"learning_rate": 1.256544502617801e-07,
"logits/chosen": -0.9441679120063782,
"logits/rejected": -0.965167224407196,
"logps/chosen": -1.3546206951141357,
"logps/rejected": -1.517407774925232,
"loss": 1.3838,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.027701353654265404,
"rewards/margins": 0.03188776224851608,
"rewards/rejected": -0.004186409059911966,
"step": 60
},
{
"epoch": 0.036639623135304895,
"grad_norm": 2187.360162764326,
"learning_rate": 1.4659685863874346e-07,
"logits/chosen": -0.8137859106063843,
"logits/rejected": -0.9285731315612793,
"logps/chosen": -1.251236081123352,
"logps/rejected": -1.5110037326812744,
"loss": 1.3836,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.01799890398979187,
"rewards/margins": 0.022558193653821945,
"rewards/rejected": -0.040557097643613815,
"step": 70
},
{
"epoch": 0.04187385501177702,
"grad_norm": 63.631796686270974,
"learning_rate": 1.675392670157068e-07,
"logits/chosen": -0.9329401850700378,
"logits/rejected": -0.9414553642272949,
"logps/chosen": -1.2446186542510986,
"logps/rejected": -1.329633355140686,
"loss": 1.3799,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.034679971635341644,
"rewards/margins": 0.031586576253175735,
"rewards/rejected": -0.06626654416322708,
"step": 80
},
{
"epoch": 0.04710808688824915,
"grad_norm": 70.57364025424204,
"learning_rate": 1.8848167539267015e-07,
"logits/chosen": -0.975296676158905,
"logits/rejected": -0.975991427898407,
"logps/chosen": -1.411871314048767,
"logps/rejected": -1.4917197227478027,
"loss": 1.3838,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.03906696289777756,
"rewards/margins": 0.0577811673283577,
"rewards/rejected": -0.09684813022613525,
"step": 90
},
{
"epoch": 0.05234231876472128,
"grad_norm": 12165.55527932566,
"learning_rate": 2.094240837696335e-07,
"logits/chosen": -0.9536153078079224,
"logits/rejected": -1.0614324808120728,
"logps/chosen": -1.387503743171692,
"logps/rejected": -1.6376310586929321,
"loss": 1.3855,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.07099077105522156,
"rewards/margins": 0.1324595957994461,
"rewards/rejected": -0.20345036685466766,
"step": 100
},
{
"epoch": 0.05234231876472128,
"eval_logits/chosen": -1.0141676664352417,
"eval_logits/rejected": -1.0460058450698853,
"eval_logps/chosen": -1.2700459957122803,
"eval_logps/rejected": -1.4935321807861328,
"eval_loss": 1.3827202320098877,
"eval_rewards/accuracies": 0.6448412537574768,
"eval_rewards/chosen": 0.01650167442858219,
"eval_rewards/margins": 0.07835288345813751,
"eval_rewards/rejected": -0.06185121089220047,
"eval_runtime": 264.6269,
"eval_samples_per_second": 7.558,
"eval_steps_per_second": 0.238,
"step": 100
},
{
"epoch": 0.05757655064119341,
"grad_norm": 191.0826517024898,
"learning_rate": 2.3036649214659686e-07,
"logits/chosen": -0.8402193188667297,
"logits/rejected": -0.9604326486587524,
"logps/chosen": -1.3015209436416626,
"logps/rejected": -1.4067150354385376,
"loss": 1.3805,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.010574941523373127,
"rewards/margins": 0.0737369954586029,
"rewards/rejected": -0.06316206604242325,
"step": 110
},
{
"epoch": 0.06281078251766553,
"grad_norm": 105.26964983422378,
"learning_rate": 2.513089005235602e-07,
"logits/chosen": -0.8950595855712891,
"logits/rejected": -0.9546734690666199,
"logps/chosen": -1.2874231338500977,
"logps/rejected": -1.275660753250122,
"loss": 1.4167,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.02625320851802826,
"rewards/margins": 0.05933423712849617,
"rewards/rejected": -0.03308102488517761,
"step": 120
},
{
"epoch": 0.06804501439413765,
"grad_norm": 98.23082684726371,
"learning_rate": 2.7225130890052355e-07,
"logits/chosen": -0.8970452547073364,
"logits/rejected": -0.9690724611282349,
"logps/chosen": -1.2717323303222656,
"logps/rejected": -1.5841439962387085,
"loss": 1.3711,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.016156885772943497,
"rewards/margins": 0.16786156594753265,
"rewards/rejected": -0.15170469880104065,
"step": 130
},
{
"epoch": 0.07327924627060979,
"grad_norm": 105.57197537487045,
"learning_rate": 2.931937172774869e-07,
"logits/chosen": -1.0700782537460327,
"logits/rejected": -1.0342130661010742,
"logps/chosen": -1.399418830871582,
"logps/rejected": -1.4947900772094727,
"loss": 1.3747,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.007722643204033375,
"rewards/margins": 0.1320706158876419,
"rewards/rejected": -0.1397932469844818,
"step": 140
},
{
"epoch": 0.07851347814708191,
"grad_norm": 92.95205585650491,
"learning_rate": 3.1413612565445027e-07,
"logits/chosen": -0.8677465319633484,
"logits/rejected": -0.9042151570320129,
"logps/chosen": -1.2097841501235962,
"logps/rejected": -1.4593037366867065,
"loss": 1.3624,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.037995148450136185,
"rewards/margins": 0.11828543990850449,
"rewards/rejected": -0.15628059208393097,
"step": 150
},
{
"epoch": 0.08374771002355404,
"grad_norm": 60.18689578296943,
"learning_rate": 3.350785340314136e-07,
"logits/chosen": -0.8861294984817505,
"logits/rejected": -0.954685389995575,
"logps/chosen": -1.2184758186340332,
"logps/rejected": -1.4243767261505127,
"loss": 1.3659,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.012581342831254005,
"rewards/margins": 0.19156894087791443,
"rewards/rejected": -0.2041502743959427,
"step": 160
},
{
"epoch": 0.08898194190002617,
"grad_norm": 162.74421304335266,
"learning_rate": 3.56020942408377e-07,
"logits/chosen": -0.9357515573501587,
"logits/rejected": -0.9549610018730164,
"logps/chosen": -1.3333146572113037,
"logps/rejected": -1.5601656436920166,
"loss": 1.3878,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.03334398195147514,
"rewards/margins": 0.2373563051223755,
"rewards/rejected": -0.20401231944561005,
"step": 170
},
{
"epoch": 0.0942161737764983,
"grad_norm": 303.36378167257243,
"learning_rate": 3.769633507853403e-07,
"logits/chosen": -1.0010288953781128,
"logits/rejected": -1.0621322393417358,
"logps/chosen": -1.3325862884521484,
"logps/rejected": -1.6137030124664307,
"loss": 1.3917,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.10068617761135101,
"rewards/margins": 0.23769037425518036,
"rewards/rejected": -0.13700421154499054,
"step": 180
},
{
"epoch": 0.09945040565297043,
"grad_norm": 677.0727123683995,
"learning_rate": 3.9790575916230365e-07,
"logits/chosen": -0.9208686947822571,
"logits/rejected": -1.0320428609848022,
"logps/chosen": -1.3446584939956665,
"logps/rejected": -1.5973131656646729,
"loss": 1.3741,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.060563139617443085,
"rewards/margins": 0.27492621541023254,
"rewards/rejected": -0.21436305344104767,
"step": 190
},
{
"epoch": 0.10468463752944256,
"grad_norm": 362.00771368392924,
"learning_rate": 3.9997294651491985e-07,
"logits/chosen": -0.8778219223022461,
"logits/rejected": -0.8942776918411255,
"logps/chosen": -1.2816615104675293,
"logps/rejected": -1.3702523708343506,
"loss": 1.4322,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.046035002917051315,
"rewards/margins": 0.17283782362937927,
"rewards/rejected": -0.12680283188819885,
"step": 200
},
{
"epoch": 0.10468463752944256,
"eval_logits/chosen": -1.00635826587677,
"eval_logits/rejected": -1.0356963872909546,
"eval_logps/chosen": -1.2760363817214966,
"eval_logps/rejected": -1.509326696395874,
"eval_loss": 1.3940050601959229,
"eval_rewards/accuracies": 0.7003968358039856,
"eval_rewards/chosen": -0.073353111743927,
"eval_rewards/margins": 0.2254164069890976,
"eval_rewards/rejected": -0.2987695634365082,
"eval_runtime": 263.6711,
"eval_samples_per_second": 7.585,
"eval_steps_per_second": 0.239,
"step": 200
},
{
"epoch": 0.10991886940591468,
"grad_norm": 777.6134451437069,
"learning_rate": 3.9987943769122714e-07,
"logits/chosen": -0.9393421411514282,
"logits/rejected": -0.9751921892166138,
"logps/chosen": -1.2527530193328857,
"logps/rejected": -1.512480616569519,
"loss": 1.4172,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.14303788542747498,
"rewards/margins": 0.22242093086242676,
"rewards/rejected": -0.36545881628990173,
"step": 210
},
{
"epoch": 0.11515310128238682,
"grad_norm": 325.44094651913406,
"learning_rate": 3.997191707590292e-07,
"logits/chosen": -0.9741231799125671,
"logits/rejected": -0.9654073715209961,
"logps/chosen": -1.4455256462097168,
"logps/rejected": -1.4085519313812256,
"loss": 1.3963,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.06883502006530762,
"rewards/margins": 0.3231067359447479,
"rewards/rejected": -0.2542716860771179,
"step": 220
},
{
"epoch": 0.12038733315885894,
"grad_norm": 619.1952484265515,
"learning_rate": 3.9949219924617967e-07,
"logits/chosen": -0.9289252161979675,
"logits/rejected": -0.9729310870170593,
"logps/chosen": -1.2874866724014282,
"logps/rejected": -1.5892341136932373,
"loss": 1.4192,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.08541570603847504,
"rewards/margins": 0.2191784679889679,
"rewards/rejected": -0.3045941889286041,
"step": 230
},
{
"epoch": 0.12562156503533106,
"grad_norm": 361.72441654521475,
"learning_rate": 3.9919859895932e-07,
"logits/chosen": -0.9237004518508911,
"logits/rejected": -1.0107048749923706,
"logps/chosen": -1.3676064014434814,
"logps/rejected": -1.7672977447509766,
"loss": 1.4555,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.15573230385780334,
"rewards/margins": 0.4071730673313141,
"rewards/rejected": -0.25144073367118835,
"step": 240
},
{
"epoch": 0.13085579691180318,
"grad_norm": 107.37570680791188,
"learning_rate": 3.988384679585609e-07,
"logits/chosen": -0.9424523115158081,
"logits/rejected": -0.9789683222770691,
"logps/chosen": -1.3719688653945923,
"logps/rejected": -1.476319670677185,
"loss": 1.4147,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.12189966440200806,
"rewards/margins": 0.20880040526390076,
"rewards/rejected": -0.0869007408618927,
"step": 250
},
{
"epoch": 0.1360900287882753,
"grad_norm": 95.00301311323727,
"learning_rate": 3.9841192652473133e-07,
"logits/chosen": -0.9747894406318665,
"logits/rejected": -1.0530925989151,
"logps/chosen": -1.266790509223938,
"logps/rejected": -1.4995120763778687,
"loss": 1.43,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.008727884851396084,
"rewards/margins": 0.2547326385974884,
"rewards/rejected": -0.24600473046302795,
"step": 260
},
{
"epoch": 0.14132426066474746,
"grad_norm": 297.54395880852655,
"learning_rate": 3.979191171192052e-07,
"logits/chosen": -0.9566612243652344,
"logits/rejected": -0.978779673576355,
"logps/chosen": -1.352007508277893,
"logps/rejected": -1.5359153747558594,
"loss": 1.4378,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.09225025027990341,
"rewards/margins": 0.26082533597946167,
"rewards/rejected": -0.16857507824897766,
"step": 270
},
{
"epoch": 0.14655849254121958,
"grad_norm": 152.14462193643473,
"learning_rate": 3.973602043363207e-07,
"logits/chosen": -1.0354989767074585,
"logits/rejected": -1.0463378429412842,
"logps/chosen": -1.391145944595337,
"logps/rejected": -1.4438835382461548,
"loss": 1.4181,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.12363862991333008,
"rewards/margins": 0.4137144684791565,
"rewards/rejected": -0.2900758385658264,
"step": 280
},
{
"epoch": 0.1517927244176917,
"grad_norm": 150.78955202749123,
"learning_rate": 3.9673537484840704e-07,
"logits/chosen": -0.956143856048584,
"logits/rejected": -1.0358890295028687,
"logps/chosen": -1.398080587387085,
"logps/rejected": -1.4760212898254395,
"loss": 1.4314,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.0525251105427742,
"rewards/margins": 0.2129652500152588,
"rewards/rejected": -0.2654903531074524,
"step": 290
},
{
"epoch": 0.15702695629416383,
"grad_norm": 175.4681648073153,
"learning_rate": 3.960448373434375e-07,
"logits/chosen": -0.9618331789970398,
"logits/rejected": -1.0131045579910278,
"logps/chosen": -1.219518780708313,
"logps/rejected": -1.6010315418243408,
"loss": 1.4033,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.10120991617441177,
"rewards/margins": 0.5264266729354858,
"rewards/rejected": -0.42521676421165466,
"step": 300
},
{
"epoch": 0.15702695629416383,
"eval_logits/chosen": -1.0152956247329712,
"eval_logits/rejected": -1.0432747602462769,
"eval_logps/chosen": -1.2684417963027954,
"eval_logps/rejected": -1.5052709579467773,
"eval_loss": 1.407413363456726,
"eval_rewards/accuracies": 0.7103174328804016,
"eval_rewards/chosen": 0.04056532308459282,
"eval_rewards/margins": 0.27849799394607544,
"eval_rewards/rejected": -0.23793263733386993,
"eval_runtime": 264.1297,
"eval_samples_per_second": 7.572,
"eval_steps_per_second": 0.239,
"step": 300
},
{
"epoch": 0.16226118817063595,
"grad_norm": 249.4339583636602,
"learning_rate": 3.9528882245532945e-07,
"logits/chosen": -0.9384096264839172,
"logits/rejected": -0.9961991310119629,
"logps/chosen": -1.3366284370422363,
"logps/rejected": -1.5762803554534912,
"loss": 1.4132,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.06849001348018646,
"rewards/margins": 0.22860321402549744,
"rewards/rejected": -0.16011318564414978,
"step": 310
},
{
"epoch": 0.16749542004710807,
"grad_norm": 100.66710833696462,
"learning_rate": 3.9446758268691394e-07,
"logits/chosen": -0.9083954095840454,
"logits/rejected": -1.0078755617141724,
"logps/chosen": -1.3704053163528442,
"logps/rejected": -1.5062158107757568,
"loss": 1.4061,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.11326809227466583,
"rewards/margins": 0.3582002520561218,
"rewards/rejected": -0.2449321746826172,
"step": 320
},
{
"epoch": 0.17272965192358022,
"grad_norm": 93.62113614305316,
"learning_rate": 3.935813923256026e-07,
"logits/chosen": -0.9598302841186523,
"logits/rejected": -1.1092036962509155,
"logps/chosen": -1.197092890739441,
"logps/rejected": -1.5259929895401,
"loss": 1.4512,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.29086190462112427,
"rewards/margins": 0.46881595253944397,
"rewards/rejected": -0.1779540628194809,
"step": 330
},
{
"epoch": 0.17796388380005235,
"grad_norm": 1259.8554969874265,
"learning_rate": 3.9263054735177724e-07,
"logits/chosen": -0.9662519693374634,
"logits/rejected": -0.9433367848396301,
"logps/chosen": -1.3681367635726929,
"logps/rejected": -1.514211654663086,
"loss": 1.4474,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.054476749151945114,
"rewards/margins": 0.17750175297260284,
"rewards/rejected": -0.23197850584983826,
"step": 340
},
{
"epoch": 0.18319811567652447,
"grad_norm": 157.20223175549822,
"learning_rate": 3.916153653399351e-07,
"logits/chosen": -0.9618955850601196,
"logits/rejected": -0.965499758720398,
"logps/chosen": -1.2811925411224365,
"logps/rejected": -1.559472680091858,
"loss": 1.4054,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.09336249530315399,
"rewards/margins": 0.42978915572166443,
"rewards/rejected": -0.5231517553329468,
"step": 350
},
{
"epoch": 0.1884323475529966,
"grad_norm": 1304.5710427907864,
"learning_rate": 3.9053618535262144e-07,
"logits/chosen": -0.9978113174438477,
"logits/rejected": -1.0920830965042114,
"logps/chosen": -1.295592188835144,
"logps/rejected": -1.5009433031082153,
"loss": 1.4453,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.11287063360214233,
"rewards/margins": 0.2577170133590698,
"rewards/rejected": -0.14484639465808868,
"step": 360
},
{
"epoch": 0.19366657942946872,
"grad_norm": 418.65708160055283,
"learning_rate": 3.893933678271856e-07,
"logits/chosen": -1.0005239248275757,
"logits/rejected": -1.0983339548110962,
"logps/chosen": -1.2382128238677979,
"logps/rejected": -1.5115060806274414,
"loss": 1.4038,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.09283250570297241,
"rewards/margins": 0.5117446780204773,
"rewards/rejected": -0.4189121723175049,
"step": 370
},
{
"epoch": 0.19890081130594087,
"grad_norm": 193.74911912257855,
"learning_rate": 3.881872944553976e-07,
"logits/chosen": -1.0515316724777222,
"logits/rejected": -1.0202170610427856,
"logps/chosen": -1.5801050662994385,
"logps/rejected": -1.6806576251983643,
"loss": 1.4325,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.18667231500148773,
"rewards/margins": 0.5009941458702087,
"rewards/rejected": -0.31432193517684937,
"step": 380
},
{
"epoch": 0.204135043182413,
"grad_norm": 164.76136107173468,
"learning_rate": 3.869183680559662e-07,
"logits/chosen": -0.9525713920593262,
"logits/rejected": -1.025831699371338,
"logps/chosen": -1.4257429838180542,
"logps/rejected": -1.517421841621399,
"loss": 1.465,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.1658320128917694,
"rewards/margins": 0.2568032145500183,
"rewards/rejected": -0.09097117930650711,
"step": 390
},
{
"epoch": 0.2093692750588851,
"grad_norm": 132.36602040912516,
"learning_rate": 3.8558701244000107e-07,
"logits/chosen": -1.0163413286209106,
"logits/rejected": -1.12177312374115,
"logps/chosen": -1.2447032928466797,
"logps/rejected": -1.5292352437973022,
"loss": 1.4072,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.006441372446715832,
"rewards/margins": 0.40031346678733826,
"rewards/rejected": -0.40675482153892517,
"step": 400
},
{
"epoch": 0.2093692750588851,
"eval_logits/chosen": -1.0346137285232544,
"eval_logits/rejected": -1.0629608631134033,
"eval_logps/chosen": -1.2605527639389038,
"eval_logps/rejected": -1.4991511106491089,
"eval_loss": 1.4382668733596802,
"eval_rewards/accuracies": 0.7222222089767456,
"eval_rewards/chosen": 0.15890030562877655,
"eval_rewards/margins": 0.3050336241722107,
"eval_rewards/rejected": -0.14613336324691772,
"eval_runtime": 269.1306,
"eval_samples_per_second": 7.431,
"eval_steps_per_second": 0.234,
"step": 400
},
{
"epoch": 0.21460350693535724,
"grad_norm": 277.3029855825092,
"learning_rate": 3.841936722694628e-07,
"logits/chosen": -0.9349578619003296,
"logits/rejected": -1.000216007232666,
"logps/chosen": -1.3645018339157104,
"logps/rejected": -1.5343906879425049,
"loss": 1.4552,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.2753569781780243,
"rewards/margins": 0.29162487387657166,
"rewards/rejected": -0.01626790128648281,
"step": 410
},
{
"epoch": 0.21983773881182936,
"grad_norm": 323.6630242480252,
"learning_rate": 3.8273881290864986e-07,
"logits/chosen": -0.9792436361312866,
"logits/rejected": -1.0122894048690796,
"logps/chosen": -1.2483856678009033,
"logps/rejected": -1.410635232925415,
"loss": 1.4225,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.008017847314476967,
"rewards/margins": 0.28319111466407776,
"rewards/rejected": -0.2912089228630066,
"step": 420
},
{
"epoch": 0.22507197068830148,
"grad_norm": 86.80607700867975,
"learning_rate": 3.812229202687705e-07,
"logits/chosen": -0.9529207944869995,
"logits/rejected": -1.045686960220337,
"logps/chosen": -1.2583253383636475,
"logps/rejected": -1.5180001258850098,
"loss": 1.4296,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.12871792912483215,
"rewards/margins": 0.5155263543128967,
"rewards/rejected": -0.3868083655834198,
"step": 430
},
{
"epoch": 0.23030620256477363,
"grad_norm": 915.1354551049579,
"learning_rate": 3.796465006456523e-07,
"logits/chosen": -1.0024458169937134,
"logits/rejected": -1.068861961364746,
"logps/chosen": -1.277616024017334,
"logps/rejected": -1.4234261512756348,
"loss": 1.4226,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.05915598198771477,
"rewards/margins": 0.5244419574737549,
"rewards/rejected": -0.4652860760688782,
"step": 440
},
{
"epoch": 0.23554043444124576,
"grad_norm": 218.08119707377622,
"learning_rate": 3.7801008055064363e-07,
"logits/chosen": -0.9519055485725403,
"logits/rejected": -0.963157057762146,
"logps/chosen": -1.3588078022003174,
"logps/rejected": -1.5339311361312866,
"loss": 1.3993,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.24848918616771698,
"rewards/margins": 0.4637584686279297,
"rewards/rejected": -0.2152692824602127,
"step": 450
},
{
"epoch": 0.24077466631771788,
"grad_norm": 99.08327892137484,
"learning_rate": 3.7631420653476316e-07,
"logits/chosen": -0.9202947616577148,
"logits/rejected": -1.0152450799942017,
"logps/chosen": -1.2867461442947388,
"logps/rejected": -1.4801595211029053,
"loss": 1.4462,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.1470358669757843,
"rewards/margins": 0.279130756855011,
"rewards/rejected": -0.4261665940284729,
"step": 460
},
{
"epoch": 0.24600889819419,
"grad_norm": 228.85559080922738,
"learning_rate": 3.74559445006156e-07,
"logits/chosen": -0.9630579948425293,
"logits/rejected": -0.9651592373847961,
"logps/chosen": -1.393817663192749,
"logps/rejected": -1.5304372310638428,
"loss": 1.3859,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.1274794042110443,
"rewards/margins": 0.317279577255249,
"rewards/rejected": -0.1898001730442047,
"step": 470
},
{
"epoch": 0.2512431300706621,
"grad_norm": 202.93750821371427,
"learning_rate": 3.727463820409182e-07,
"logits/chosen": -0.9812225103378296,
"logits/rejected": -1.044806718826294,
"logps/chosen": -1.4120748043060303,
"logps/rejected": -1.6228997707366943,
"loss": 1.4094,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.017949607223272324,
"rewards/margins": 0.22776713967323303,
"rewards/rejected": -0.2098175287246704,
"step": 480
},
{
"epoch": 0.2564773619471343,
"grad_norm": 166.1560506458322,
"learning_rate": 3.7087562318735215e-07,
"logits/chosen": -0.9065971374511719,
"logits/rejected": -0.9788573384284973,
"logps/chosen": -1.269921064376831,
"logps/rejected": -1.3954648971557617,
"loss": 1.4468,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.12124637514352798,
"rewards/margins": 0.16167902946472168,
"rewards/rejected": -0.04043266549706459,
"step": 490
},
{
"epoch": 0.26171159382360637,
"grad_norm": 198.70593003367702,
"learning_rate": 3.6894779326371806e-07,
"logits/chosen": -0.9271315336227417,
"logits/rejected": -0.9706541299819946,
"logps/chosen": -1.3781875371932983,
"logps/rejected": -1.5902111530303955,
"loss": 1.4887,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.14317944645881653,
"rewards/margins": 0.4533039927482605,
"rewards/rejected": -0.3101245164871216,
"step": 500
},
{
"epoch": 0.26171159382360637,
"eval_logits/chosen": -1.058478593826294,
"eval_logits/rejected": -1.0874103307724,
"eval_logps/chosen": -1.2464978694915771,
"eval_logps/rejected": -1.4891070127487183,
"eval_loss": 1.5037912130355835,
"eval_rewards/accuracies": 0.704365074634552,
"eval_rewards/chosen": 0.3697235584259033,
"eval_rewards/margins": 0.36519646644592285,
"eval_rewards/rejected": 0.00452705891802907,
"eval_runtime": 265.1463,
"eval_samples_per_second": 7.543,
"eval_steps_per_second": 0.238,
"step": 500
},
{
"epoch": 0.2669458257000785,
"grad_norm": 262.1381649197113,
"learning_rate": 3.669635361495502e-07,
"logits/chosen": -1.0345722436904907,
"logits/rejected": -1.1255292892456055,
"logps/chosen": -1.2064478397369385,
"logps/rejected": -1.574259877204895,
"loss": 1.4465,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.039454132318496704,
"rewards/margins": 0.6132072806358337,
"rewards/rejected": -0.5737532377243042,
"step": 510
},
{
"epoch": 0.2721800575765506,
"grad_norm": 89.6307110887034,
"learning_rate": 3.6492351457060587e-07,
"logits/chosen": -0.9336854219436646,
"logits/rejected": -0.965878963470459,
"logps/chosen": -1.1347582340240479,
"logps/rejected": -1.5124342441558838,
"loss": 1.4152,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.026898790150880814,
"rewards/margins": 0.5120420455932617,
"rewards/rejected": -0.5389407873153687,
"step": 520
},
{
"epoch": 0.27741428945302277,
"grad_norm": 1388.4179719079036,
"learning_rate": 3.6282840987752065e-07,
"logits/chosen": -1.0223872661590576,
"logits/rejected": -1.0572293996810913,
"logps/chosen": -1.3843199014663696,
"logps/rejected": -1.625836968421936,
"loss": 1.475,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.04534434527158737,
"rewards/margins": 0.48546886444091797,
"rewards/rejected": -0.4401245713233948,
"step": 530
},
{
"epoch": 0.2826485213294949,
"grad_norm": 195.53670379895084,
"learning_rate": 3.606789218182429e-07,
"logits/chosen": -1.0764001607894897,
"logits/rejected": -1.1225221157073975,
"logps/chosen": -1.5096803903579712,
"logps/rejected": -1.679340124130249,
"loss": 1.431,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.07530850172042847,
"rewards/margins": 0.32085931301116943,
"rewards/rejected": -0.3961678147315979,
"step": 540
},
{
"epoch": 0.287882753205967,
"grad_norm": 127.17060631430301,
"learning_rate": 3.584757683043235e-07,
"logits/chosen": -0.9253309965133667,
"logits/rejected": -0.9592300653457642,
"logps/chosen": -1.2050528526306152,
"logps/rejected": -1.4214028120040894,
"loss": 1.4257,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.09262965619564056,
"rewards/margins": 0.39863070845603943,
"rewards/rejected": -0.30600103735923767,
"step": 550
},
{
"epoch": 0.29311698508243916,
"grad_norm": 119.21787519480398,
"learning_rate": 3.5621968517113905e-07,
"logits/chosen": -1.011732816696167,
"logits/rejected": -1.074186086654663,
"logps/chosen": -1.367538571357727,
"logps/rejected": -1.4972589015960693,
"loss": 1.4233,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.16185715794563293,
"rewards/margins": 0.49168023467063904,
"rewards/rejected": -0.3298230767250061,
"step": 560
},
{
"epoch": 0.29835121695891126,
"grad_norm": 131.29687077414553,
"learning_rate": 3.5391142593212927e-07,
"logits/chosen": -0.9979642629623413,
"logits/rejected": -1.066590666770935,
"logps/chosen": -1.3047949075698853,
"logps/rejected": -1.441133737564087,
"loss": 1.4295,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.03215108811855316,
"rewards/margins": 0.302137553691864,
"rewards/rejected": -0.26998645067214966,
"step": 570
},
{
"epoch": 0.3035854488353834,
"grad_norm": 794.2192746772438,
"learning_rate": 3.515517615271293e-07,
"logits/chosen": -0.9547045826911926,
"logits/rejected": -1.0005202293395996,
"logps/chosen": -1.1787316799163818,
"logps/rejected": -1.4146497249603271,
"loss": 1.4159,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.23103637993335724,
"rewards/margins": 0.48520785570144653,
"rewards/rejected": -0.2541714906692505,
"step": 580
},
{
"epoch": 0.30881968071185556,
"grad_norm": 114.48798800720019,
"learning_rate": 3.4914148006488197e-07,
"logits/chosen": -0.9228054881095886,
"logits/rejected": -0.9928166270256042,
"logps/chosen": -1.314188003540039,
"logps/rejected": -1.7351295948028564,
"loss": 1.4898,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.02696867287158966,
"rewards/margins": -0.19175395369529724,
"rewards/rejected": 0.1647852510213852,
"step": 590
},
{
"epoch": 0.31405391258832765,
"grad_norm": 128.958271520903,
"learning_rate": 3.466813865598163e-07,
"logits/chosen": -0.9575554132461548,
"logits/rejected": -1.0143183469772339,
"logps/chosen": -1.3230020999908447,
"logps/rejected": -1.4859802722930908,
"loss": 1.4435,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.171253964304924,
"rewards/margins": 0.0992809310555458,
"rewards/rejected": 0.07197302579879761,
"step": 600
},
{
"epoch": 0.31405391258832765,
"eval_logits/chosen": -1.0527211427688599,
"eval_logits/rejected": -1.0806084871292114,
"eval_logps/chosen": -1.2730886936187744,
"eval_logps/rejected": -1.5170137882232666,
"eval_loss": 1.4242910146713257,
"eval_rewards/accuracies": 0.72817462682724,
"eval_rewards/chosen": -0.029136493802070618,
"eval_rewards/margins": 0.3849383294582367,
"eval_rewards/rejected": -0.4140748083591461,
"eval_runtime": 266.4809,
"eval_samples_per_second": 7.505,
"eval_steps_per_second": 0.236,
"step": 600
},
{
"epoch": 0.3192881444647998,
"grad_norm": 221.61630470246598,
"learning_rate": 3.4417230266317886e-07,
"logits/chosen": -0.9701067805290222,
"logits/rejected": -1.0397741794586182,
"logps/chosen": -1.228003740310669,
"logps/rejected": -1.4498927593231201,
"loss": 1.4383,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.038508955389261246,
"rewards/margins": 0.4932937026023865,
"rewards/rejected": -0.5318026542663574,
"step": 610
},
{
"epoch": 0.3245223763412719,
"grad_norm": 233.4642222088906,
"learning_rate": 3.41615066388609e-07,
"logits/chosen": -0.9511051177978516,
"logits/rejected": -1.0637654066085815,
"logps/chosen": -1.2569276094436646,
"logps/rejected": -1.5339549779891968,
"loss": 1.4488,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.14977118372917175,
"rewards/margins": 0.43153300881385803,
"rewards/rejected": -0.2817618250846863,
"step": 620
},
{
"epoch": 0.32975660821774405,
"grad_norm": 340.4994744679967,
"learning_rate": 3.390105318322492e-07,
"logits/chosen": -1.0287960767745972,
"logits/rejected": -1.0511208772659302,
"logps/chosen": -1.3860498666763306,
"logps/rejected": -1.5678269863128662,
"loss": 1.414,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.0474742166697979,
"rewards/margins": 0.32056504487991333,
"rewards/rejected": -0.36803925037384033,
"step": 630
},
{
"epoch": 0.33499084009421615,
"grad_norm": 96.74481880578419,
"learning_rate": 3.3635956888748385e-07,
"logits/chosen": -0.988872230052948,
"logits/rejected": -1.0875409841537476,
"logps/chosen": -1.3919366598129272,
"logps/rejected": -1.6267982721328735,
"loss": 1.4391,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.08910200744867325,
"rewards/margins": 0.730194628238678,
"rewards/rejected": -0.6410925984382629,
"step": 640
},
{
"epoch": 0.3402250719706883,
"grad_norm": 106.35251243553931,
"learning_rate": 3.336630629544019e-07,
"logits/chosen": -0.9367281794548035,
"logits/rejected": -0.9902782440185547,
"logps/chosen": -1.3781791925430298,
"logps/rejected": -1.411145567893982,
"loss": 1.4227,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.07712526619434357,
"rewards/margins": 0.3716704249382019,
"rewards/rejected": -0.29454511404037476,
"step": 650
},
{
"epoch": 0.34545930384716045,
"grad_norm": 36.1355257607673,
"learning_rate": 3.3092191464408037e-07,
"logits/chosen": -0.9591676592826843,
"logits/rejected": -1.0145364999771118,
"logps/chosen": -1.2052505016326904,
"logps/rejected": -1.494354009628296,
"loss": 1.3849,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.10244836658239365,
"rewards/margins": 0.40993595123291016,
"rewards/rejected": -0.3074875473976135,
"step": 660
},
{
"epoch": 0.35069353572363254,
"grad_norm": 605.954181910212,
"learning_rate": 3.281370394777878e-07,
"logits/chosen": -0.916668713092804,
"logits/rejected": -0.9587345123291016,
"logps/chosen": -1.2746045589447021,
"logps/rejected": -1.5481473207473755,
"loss": 1.4423,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.026984428986907005,
"rewards/margins": 0.21059663593769073,
"rewards/rejected": -0.18361221253871918,
"step": 670
},
{
"epoch": 0.3559277676001047,
"grad_norm": 165.59804456052152,
"learning_rate": 3.2530936758120725e-07,
"logits/chosen": -0.8825721740722656,
"logits/rejected": -0.9537725448608398,
"logps/chosen": -1.3265260457992554,
"logps/rejected": -1.4398428201675415,
"loss": 1.4304,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.1924048364162445,
"rewards/margins": 0.5167059898376465,
"rewards/rejected": -0.3243011236190796,
"step": 680
},
{
"epoch": 0.3611619994765768,
"grad_norm": 149.3409994530682,
"learning_rate": 3.224398433737821e-07,
"logits/chosen": -0.9888921976089478,
"logits/rejected": -1.0671908855438232,
"logps/chosen": -1.249463438987732,
"logps/rejected": -1.6025193929672241,
"loss": 1.4495,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.06590111553668976,
"rewards/margins": 0.3165399134159088,
"rewards/rejected": -0.25063878297805786,
"step": 690
},
{
"epoch": 0.36639623135304894,
"grad_norm": 39.76420565368717,
"learning_rate": 3.195294252532876e-07,
"logits/chosen": -0.9943802952766418,
"logits/rejected": -1.0306392908096313,
"logps/chosen": -1.3779702186584473,
"logps/rejected": -1.5936956405639648,
"loss": 1.3754,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.06695530563592911,
"rewards/margins": 0.3752886950969696,
"rewards/rejected": -0.4422439932823181,
"step": 700
},
{
"epoch": 0.36639623135304894,
"eval_logits/chosen": -1.0582585334777832,
"eval_logits/rejected": -1.0870941877365112,
"eval_logps/chosen": -1.2654900550842285,
"eval_logps/rejected": -1.511101484298706,
"eval_loss": 1.4125109910964966,
"eval_rewards/accuracies": 0.7440476417541504,
"eval_rewards/chosen": 0.08484180271625519,
"eval_rewards/margins": 0.4102318584918976,
"eval_rewards/rejected": -0.3253900408744812,
"eval_runtime": 263.46,
"eval_samples_per_second": 7.591,
"eval_steps_per_second": 0.239,
"step": 700
},
{
"epoch": 0.3716304632295211,
"grad_norm": 64.77721558069398,
"learning_rate": 3.165790852757337e-07,
"logits/chosen": -1.0020430088043213,
"logits/rejected": -1.0420172214508057,
"logps/chosen": -1.356451392173767,
"logps/rejected": -1.4674928188323975,
"loss": 1.3739,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 0.20629188418388367,
"rewards/margins": 0.4772399067878723,
"rewards/rejected": -0.27094796299934387,
"step": 710
},
{
"epoch": 0.3768646951059932,
"grad_norm": 1320.2371661660036,
"learning_rate": 3.135898088307064e-07,
"logits/chosen": -0.9764865636825562,
"logits/rejected": -0.9581249952316284,
"logps/chosen": -1.360630750656128,
"logps/rejected": -1.4430283308029175,
"loss": 1.4128,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.09529178589582443,
"rewards/margins": 0.38423871994018555,
"rewards/rejected": -0.2889469563961029,
"step": 720
},
{
"epoch": 0.38209892698246534,
"grad_norm": 271.7523715869803,
"learning_rate": 3.1056259431225556e-07,
"logits/chosen": -1.0364816188812256,
"logits/rejected": -1.0083050727844238,
"logps/chosen": -1.3824083805084229,
"logps/rejected": -1.5307719707489014,
"loss": 1.4463,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.052348293364048004,
"rewards/margins": 0.6019971966743469,
"rewards/rejected": -0.5496489405632019,
"step": 730
},
{
"epoch": 0.38733315885893743,
"grad_norm": 80.88419376322062,
"learning_rate": 3.074984527854392e-07,
"logits/chosen": -0.9187393188476562,
"logits/rejected": -0.9706932902336121,
"logps/chosen": -1.3978350162506104,
"logps/rejected": -1.6177310943603516,
"loss": 1.4341,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.2012447863817215,
"rewards/margins": 0.3044939339160919,
"rewards/rejected": -0.10324916988611221,
"step": 740
},
{
"epoch": 0.3925673907354096,
"grad_norm": 90.01276163951067,
"learning_rate": 3.043984076486364e-07,
"logits/chosen": -1.0107219219207764,
"logits/rejected": -0.9986340403556824,
"logps/chosen": -1.273086428642273,
"logps/rejected": -1.4610965251922607,
"loss": 1.513,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.2087746411561966,
"rewards/margins": 0.4228358864784241,
"rewards/rejected": -0.21406126022338867,
"step": 750
},
{
"epoch": 0.39780162261188173,
"grad_norm": 2597.0418080116397,
"learning_rate": 3.0126349429174023e-07,
"logits/chosen": -1.0616979598999023,
"logits/rejected": -1.0581673383712769,
"logps/chosen": -1.5037667751312256,
"logps/rejected": -1.6452052593231201,
"loss": 1.5244,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.08855433762073517,
"rewards/margins": 0.4076352119445801,
"rewards/rejected": -0.49618959426879883,
"step": 760
},
{
"epoch": 0.40303585448835383,
"grad_norm": 162.29078450187296,
"learning_rate": 2.9809475975034583e-07,
"logits/chosen": -0.9451099634170532,
"logits/rejected": -0.9363230466842651,
"logps/chosen": -1.2786595821380615,
"logps/rejected": -1.4589165449142456,
"loss": 1.4723,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.0040801106952130795,
"rewards/margins": 0.1179959774017334,
"rewards/rejected": -0.11391584575176239,
"step": 770
},
{
"epoch": 0.408270086364826,
"grad_norm": 110.30330182049372,
"learning_rate": 2.948932623560495e-07,
"logits/chosen": -0.9574085474014282,
"logits/rejected": -0.9982038736343384,
"logps/chosen": -1.3518567085266113,
"logps/rejected": -1.5835750102996826,
"loss": 1.4165,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.035115428268909454,
"rewards/margins": 0.5685119032859802,
"rewards/rejected": -0.6036273837089539,
"step": 780
},
{
"epoch": 0.4135043182412981,
"grad_norm": 60.51171656648762,
"learning_rate": 2.916600713829742e-07,
"logits/chosen": -1.0429871082305908,
"logits/rejected": -1.0647838115692139,
"logps/chosen": -1.4226644039154053,
"logps/rejected": -1.5080631971359253,
"loss": 1.4199,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.059725116938352585,
"rewards/margins": 0.1583862155675888,
"rewards/rejected": -0.09866108745336533,
"step": 790
},
{
"epoch": 0.4187385501177702,
"grad_norm": 290.41341566251265,
"learning_rate": 2.8839626669064067e-07,
"logits/chosen": -0.9505065679550171,
"logits/rejected": -1.0394840240478516,
"logps/chosen": -1.3534128665924072,
"logps/rejected": -1.6843515634536743,
"loss": 1.4283,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.08606131374835968,
"rewards/margins": 0.5727912187576294,
"rewards/rejected": -0.4867299497127533,
"step": 800
},
{
"epoch": 0.4187385501177702,
"eval_logits/chosen": -1.0662418603897095,
"eval_logits/rejected": -1.0950087308883667,
"eval_logps/chosen": -1.2765921354293823,
"eval_logps/rejected": -1.5239523649215698,
"eval_loss": 1.4428484439849854,
"eval_rewards/accuracies": 0.726190447807312,
"eval_rewards/chosen": -0.08169105648994446,
"eval_rewards/margins": 0.4364626109600067,
"eval_rewards/rejected": -0.5181536674499512,
"eval_runtime": 264.4094,
"eval_samples_per_second": 7.564,
"eval_steps_per_second": 0.238,
"step": 800
},
{
"epoch": 0.4239727819942423,
"grad_norm": 136.61664077267568,
"learning_rate": 2.8510293836330317e-07,
"logits/chosen": -0.9569026827812195,
"logits/rejected": -1.0604639053344727,
"logps/chosen": -1.1840898990631104,
"logps/rejected": -1.5309747457504272,
"loss": 1.4795,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.0819721594452858,
"rewards/margins": 0.2086147964000702,
"rewards/rejected": -0.2905869781970978,
"step": 810
},
{
"epoch": 0.42920701387071447,
"grad_norm": 157.90172498376174,
"learning_rate": 2.8178118634587043e-07,
"logits/chosen": -1.030447244644165,
"logits/rejected": -1.0620393753051758,
"logps/chosen": -1.3706046342849731,
"logps/rejected": -1.6218140125274658,
"loss": 1.4986,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.20854957401752472,
"rewards/margins": 0.1404011845588684,
"rewards/rejected": 0.06814839690923691,
"step": 820
},
{
"epoch": 0.4344412457471866,
"grad_norm": 145.68423411902808,
"learning_rate": 2.7843212007653255e-07,
"logits/chosen": -0.9203750491142273,
"logits/rejected": -0.9779027104377747,
"logps/chosen": -1.2530797719955444,
"logps/rejected": -1.425672173500061,
"loss": 1.4582,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.06837911903858185,
"rewards/margins": 0.17998042702674866,
"rewards/rejected": -0.1116013303399086,
"step": 830
},
{
"epoch": 0.4396754776236587,
"grad_norm": 68.18620996905915,
"learning_rate": 2.750568581162179e-07,
"logits/chosen": -0.9847718477249146,
"logits/rejected": -1.110650658607483,
"logps/chosen": -1.2794532775878906,
"logps/rejected": -1.498471975326538,
"loss": 1.4343,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.10923127084970474,
"rewards/margins": 0.44351276755332947,
"rewards/rejected": -0.3342815339565277,
"step": 840
},
{
"epoch": 0.44490970950013087,
"grad_norm": 242.48291318354657,
"learning_rate": 2.7165652777500305e-07,
"logits/chosen": -1.017841100692749,
"logits/rejected": -1.0413384437561035,
"logps/chosen": -1.3309098482131958,
"logps/rejected": -1.5017926692962646,
"loss": 1.44,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.07142224162817001,
"rewards/margins": 0.2956584095954895,
"rewards/rejected": -0.3670806884765625,
"step": 850
},
{
"epoch": 0.45014394137660296,
"grad_norm": 109.43096475182443,
"learning_rate": 2.682322647355999e-07,
"logits/chosen": -0.998466968536377,
"logits/rejected": -1.0665570497512817,
"logps/chosen": -1.2548444271087646,
"logps/rejected": -1.7455116510391235,
"loss": 1.4151,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.12004120647907257,
"rewards/margins": 0.5363563299179077,
"rewards/rejected": -0.6563974618911743,
"step": 860
},
{
"epoch": 0.4553781732530751,
"grad_norm": 124.43882606046388,
"learning_rate": 2.6478521267404725e-07,
"logits/chosen": -0.911393940448761,
"logits/rejected": -0.9683974385261536,
"logps/chosen": -1.299457311630249,
"logps/rejected": -1.5499870777130127,
"loss": 1.4252,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.025322074070572853,
"rewards/margins": 0.29149001836776733,
"rewards/rejected": -0.26616793870925903,
"step": 870
},
{
"epoch": 0.46061240512954726,
"grad_norm": 78.64863181721877,
"learning_rate": 2.613165228777323e-07,
"logits/chosen": -1.0315337181091309,
"logits/rejected": -1.059273600578308,
"logps/chosen": -1.3746845722198486,
"logps/rejected": -1.4652302265167236,
"loss": 1.386,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.06165723875164986,
"rewards/margins": 0.20029735565185547,
"rewards/rejected": -0.13864010572433472,
"step": 880
},
{
"epoch": 0.46584663700601936,
"grad_norm": 72.31012788663783,
"learning_rate": 2.578273538608695e-07,
"logits/chosen": -1.0157678127288818,
"logits/rejected": -0.9929217100143433,
"logps/chosen": -1.3486502170562744,
"logps/rejected": -1.329793930053711,
"loss": 1.422,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.019388969987630844,
"rewards/margins": 0.2866728901863098,
"rewards/rejected": -0.30606183409690857,
"step": 890
},
{
"epoch": 0.4710808688824915,
"grad_norm": 168.71230864025074,
"learning_rate": 2.5431887097756707e-07,
"logits/chosen": -0.9708254933357239,
"logits/rejected": -1.0406501293182373,
"logps/chosen": -1.3819133043289185,
"logps/rejected": -1.7021602392196655,
"loss": 1.4394,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.25760143995285034,
"rewards/margins": 0.4438048303127289,
"rewards/rejected": -0.18620333075523376,
"step": 900
},
{
"epoch": 0.4710808688824915,
"eval_logits/chosen": -1.0399378538131714,
"eval_logits/rejected": -1.067466139793396,
"eval_logps/chosen": -1.25888991355896,
"eval_logps/rejected": -1.5051480531692505,
"eval_loss": 1.4415701627731323,
"eval_rewards/accuracies": 0.7420634627342224,
"eval_rewards/chosen": 0.18384411931037903,
"eval_rewards/margins": 0.419933021068573,
"eval_rewards/rejected": -0.23608890175819397,
"eval_runtime": 265.0578,
"eval_samples_per_second": 7.546,
"eval_steps_per_second": 0.238,
"step": 900
},
{
"epoch": 0.4763151007589636,
"grad_norm": 107.01879296821734,
"learning_rate": 2.507922460326075e-07,
"logits/chosen": -0.9058791995048523,
"logits/rejected": -0.931871235370636,
"logps/chosen": -1.2475355863571167,
"logps/rejected": -1.5984818935394287,
"loss": 1.5115,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.04505655914545059,
"rewards/margins": 0.017883723601698875,
"rewards/rejected": 0.027172747999429703,
"step": 910
},
{
"epoch": 0.48154933263543576,
"grad_norm": 72.74505143056861,
"learning_rate": 2.4724865689007444e-07,
"logits/chosen": -0.9704592823982239,
"logits/rejected": -1.0217134952545166,
"logps/chosen": -1.2369471788406372,
"logps/rejected": -1.5074516534805298,
"loss": 1.433,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.1680927574634552,
"rewards/margins": 0.4736298620700836,
"rewards/rejected": -0.3055371642112732,
"step": 920
},
{
"epoch": 0.48678356451190785,
"grad_norm": 114.65949067860906,
"learning_rate": 2.436892870799559e-07,
"logits/chosen": -0.9633470773696899,
"logits/rejected": -0.9601320028305054,
"logps/chosen": -1.268119215965271,
"logps/rejected": -1.3131425380706787,
"loss": 1.3956,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.11231119930744171,
"rewards/margins": 0.2431112825870514,
"rewards/rejected": -0.13080011308193207,
"step": 930
},
{
"epoch": 0.49201779638838,
"grad_norm": 101.41199727533493,
"learning_rate": 2.4011532540285447e-07,
"logits/chosen": -1.023418664932251,
"logits/rejected": -1.0831656455993652,
"logps/chosen": -1.4656355381011963,
"logps/rejected": -1.5011208057403564,
"loss": 1.4291,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.01922314427793026,
"rewards/margins": 0.3083404004573822,
"rewards/rejected": -0.2891172766685486,
"step": 940
},
{
"epoch": 0.49725202826485215,
"grad_norm": 89.65607411754935,
"learning_rate": 2.3652796553293793e-07,
"logits/chosen": -0.9452352523803711,
"logits/rejected": -1.002318024635315,
"logps/chosen": -1.3159103393554688,
"logps/rejected": -1.7574964761734009,
"loss": 1.4089,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.028190921992063522,
"rewards/margins": 0.4952470362186432,
"rewards/rejected": -0.4670560956001282,
"step": 950
},
{
"epoch": 0.5024862601413242,
"grad_norm": 382.85539202651387,
"learning_rate": 2.3292840561926163e-07,
"logits/chosen": -0.9121773838996887,
"logits/rejected": -0.9947258830070496,
"logps/chosen": -1.3057327270507812,
"logps/rejected": -1.6355533599853516,
"loss": 1.4497,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.17302857339382172,
"rewards/margins": 0.6395506858825684,
"rewards/rejected": -0.46652212738990784,
"step": 960
},
{
"epoch": 0.5077204920177963,
"grad_norm": 104.65497018703128,
"learning_rate": 2.2931784788559626e-07,
"logits/chosen": -1.0272817611694336,
"logits/rejected": -1.0622152090072632,
"logps/chosen": -1.3166601657867432,
"logps/rejected": -1.7344980239868164,
"loss": 1.4224,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.005081593990325928,
"rewards/margins": 0.5101627111434937,
"rewards/rejected": -0.5152442455291748,
"step": 970
},
{
"epoch": 0.5129547238942685,
"grad_norm": 203.5247230819424,
"learning_rate": 2.2569749822889524e-07,
"logits/chosen": -0.9486488103866577,
"logits/rejected": -1.042937994003296,
"logps/chosen": -1.2337818145751953,
"logps/rejected": -1.4374665021896362,
"loss": 1.4641,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.06478907912969589,
"rewards/margins": 0.2325349748134613,
"rewards/rejected": -0.1677459180355072,
"step": 980
},
{
"epoch": 0.5181889557707406,
"grad_norm": 133.03376715975915,
"learning_rate": 2.220685658165347e-07,
"logits/chosen": -0.963657021522522,
"logits/rejected": -1.0291000604629517,
"logps/chosen": -1.1978986263275146,
"logps/rejected": -1.4739136695861816,
"loss": 1.3969,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.08724164962768555,
"rewards/margins": 0.43057960271835327,
"rewards/rejected": -0.3433380126953125,
"step": 990
},
{
"epoch": 0.5234231876472127,
"grad_norm": 58.99833399504203,
"learning_rate": 2.1843226268246133e-07,
"logits/chosen": -0.9091464281082153,
"logits/rejected": -0.9449722170829773,
"logps/chosen": -1.191294550895691,
"logps/rejected": -1.4181480407714844,
"loss": 1.3847,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.02543627843260765,
"rewards/margins": 0.339578777551651,
"rewards/rejected": -0.31414246559143066,
"step": 1000
},
{
"epoch": 0.5234231876472127,
"eval_logits/chosen": -1.0453258752822876,
"eval_logits/rejected": -1.0717276334762573,
"eval_logps/chosen": -1.263582706451416,
"eval_logps/rejected": -1.5113520622253418,
"eval_loss": 1.41469407081604,
"eval_rewards/accuracies": 0.7440476417541504,
"eval_rewards/chosen": 0.11345059424638748,
"eval_rewards/margins": 0.44259801506996155,
"eval_rewards/rejected": -0.32914745807647705,
"eval_runtime": 263.0472,
"eval_samples_per_second": 7.603,
"eval_steps_per_second": 0.24,
"step": 1000
},
{
"epoch": 0.528657419523685,
"grad_norm": 175.82571938691842,
"learning_rate": 2.1478980332238308e-07,
"logits/chosen": -1.0793265104293823,
"logits/rejected": -1.0983974933624268,
"logps/chosen": -1.3436429500579834,
"logps/rejected": -1.510811686515808,
"loss": 1.4792,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.012947812676429749,
"rewards/margins": 0.24009516835212708,
"rewards/rejected": -0.22714734077453613,
"step": 1010
},
{
"epoch": 0.533891651400157,
"grad_norm": 173.3542667625851,
"learning_rate": 2.1114240428813748e-07,
"logits/chosen": -1.024076223373413,
"logits/rejected": -1.10614013671875,
"logps/chosen": -1.3571223020553589,
"logps/rejected": -1.4241881370544434,
"loss": 1.3932,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.04782426357269287,
"rewards/margins": 0.3030202388763428,
"rewards/rejected": -0.2551959455013275,
"step": 1020
},
{
"epoch": 0.5391258832766291,
"grad_norm": 170.6198273419075,
"learning_rate": 2.074912837813728e-07,
"logits/chosen": -1.0137916803359985,
"logits/rejected": -1.0654577016830444,
"logps/chosen": -1.3849023580551147,
"logps/rejected": -1.4623501300811768,
"loss": 1.3837,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.2025812566280365,
"rewards/margins": 0.5163997411727905,
"rewards/rejected": -0.31381842494010925,
"step": 1030
},
{
"epoch": 0.5443601151531012,
"grad_norm": 169.2969270431579,
"learning_rate": 2.0383766124667928e-07,
"logits/chosen": -1.017490267753601,
"logits/rejected": -0.980577826499939,
"logps/chosen": -1.4890670776367188,
"logps/rejected": -1.468117594718933,
"loss": 1.4238,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.07746653258800507,
"rewards/margins": 0.3936583995819092,
"rewards/rejected": -0.3161918818950653,
"step": 1040
},
{
"epoch": 0.5495943470295734,
"grad_norm": 124.53171403879527,
"learning_rate": 2.001827569643039e-07,
"logits/chosen": -0.9910699725151062,
"logits/rejected": -1.0238853693008423,
"logps/chosen": -1.331132173538208,
"logps/rejected": -1.5334141254425049,
"loss": 1.4448,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.05304880812764168,
"rewards/margins": 0.41380447149276733,
"rewards/rejected": -0.36075565218925476,
"step": 1050
},
{
"epoch": 0.5548285789060455,
"grad_norm": 199.67308013058255,
"learning_rate": 1.9652779164258702e-07,
"logits/chosen": -1.0257099866867065,
"logits/rejected": -1.0186676979064941,
"logps/chosen": -1.3845534324645996,
"logps/rejected": -1.4948935508728027,
"loss": 1.3531,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.1899380087852478,
"rewards/margins": 0.5664941072463989,
"rewards/rejected": -0.37655606865882874,
"step": 1060
},
{
"epoch": 0.5600628107825176,
"grad_norm": 142.96112752249948,
"learning_rate": 1.928739860102556e-07,
"logits/chosen": -1.033825159072876,
"logits/rejected": -1.0962693691253662,
"logps/chosen": -1.3696187734603882,
"logps/rejected": -1.540083408355713,
"loss": 1.389,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.26318567991256714,
"rewards/margins": 0.5921273827552795,
"rewards/rejected": -0.3289416432380676,
"step": 1070
},
{
"epoch": 0.5652970426589898,
"grad_norm": 89.70184971339678,
"learning_rate": 1.8922256040870999e-07,
"logits/chosen": -0.981185793876648,
"logits/rejected": -1.0736793279647827,
"logps/chosen": -1.3201899528503418,
"logps/rejected": -1.4359652996063232,
"loss": 1.4041,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.05177130550146103,
"rewards/margins": 0.3131803870201111,
"rewards/rejected": -0.26140910387039185,
"step": 1080
},
{
"epoch": 0.5705312745354619,
"grad_norm": 196.20946597505844,
"learning_rate": 1.8557473438443928e-07,
"logits/chosen": -0.8916726112365723,
"logits/rejected": -0.9935215711593628,
"logps/chosen": -1.2833975553512573,
"logps/rejected": -1.6577835083007812,
"loss": 1.4167,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.01740972325205803,
"rewards/margins": 0.34133031964302063,
"rewards/rejected": -0.3239206075668335,
"step": 1090
},
{
"epoch": 0.575765506411934,
"grad_norm": 143.17390397505113,
"learning_rate": 1.819317262817032e-07,
"logits/chosen": -0.9497382044792175,
"logits/rejected": -1.0630282163619995,
"logps/chosen": -1.2121965885162354,
"logps/rejected": -1.4094746112823486,
"loss": 1.4128,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.16705064475536346,
"rewards/margins": 0.38941216468811035,
"rewards/rejected": -0.22236153483390808,
"step": 1100
},
{
"epoch": 0.575765506411934,
"eval_logits/chosen": -1.0532737970352173,
"eval_logits/rejected": -1.0803874731063843,
"eval_logps/chosen": -1.25905179977417,
"eval_logps/rejected": -1.5050337314605713,
"eval_loss": 1.4049805402755737,
"eval_rewards/accuracies": 0.7341269850730896,
"eval_rewards/chosen": 0.1814154088497162,
"eval_rewards/margins": 0.415788859128952,
"eval_rewards/rejected": -0.23437345027923584,
"eval_runtime": 264.5999,
"eval_samples_per_second": 7.559,
"eval_steps_per_second": 0.238,
"step": 1100
},
{
"epoch": 0.5809997382884062,
"grad_norm": 252.28706540848293,
"learning_rate": 1.7829475283561475e-07,
"logits/chosen": -1.101175308227539,
"logits/rejected": -1.095895767211914,
"logps/chosen": -1.4018386602401733,
"logps/rejected": -1.5792086124420166,
"loss": 1.3957,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.19977723062038422,
"rewards/margins": 0.4205760359764099,
"rewards/rejected": -0.2207988053560257,
"step": 1110
},
{
"epoch": 0.5862339701648783,
"grad_norm": 87.90505007580005,
"learning_rate": 1.7466502876576064e-07,
"logits/chosen": -1.0454351902008057,
"logits/rejected": -1.1120624542236328,
"logps/chosen": -1.349515676498413,
"logps/rejected": -1.5461986064910889,
"loss": 1.369,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.16857576370239258,
"rewards/margins": 0.5384188890457153,
"rewards/rejected": -0.369843065738678,
"step": 1120
},
{
"epoch": 0.5914682020413504,
"grad_norm": 144.50955438385066,
"learning_rate": 1.7104376637049473e-07,
"logits/chosen": -1.0022087097167969,
"logits/rejected": -0.9819334149360657,
"logps/chosen": -1.2998772859573364,
"logps/rejected": -1.5478651523590088,
"loss": 1.3787,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.08974303305149078,
"rewards/margins": 0.4815033972263336,
"rewards/rejected": -0.5712464451789856,
"step": 1130
},
{
"epoch": 0.5967024339178225,
"grad_norm": 442.03933803012757,
"learning_rate": 1.6743217512204052e-07,
"logits/chosen": -0.9362661242485046,
"logits/rejected": -1.0156729221343994,
"logps/chosen": -1.2051947116851807,
"logps/rejected": -1.3971805572509766,
"loss": 1.4071,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.11900795996189117,
"rewards/margins": 0.34501004219055176,
"rewards/rejected": -0.2260020673274994,
"step": 1140
},
{
"epoch": 0.6019366657942947,
"grad_norm": 109.24460216819094,
"learning_rate": 1.6383146126253681e-07,
"logits/chosen": -1.0144506692886353,
"logits/rejected": -1.0540077686309814,
"logps/chosen": -1.3877553939819336,
"logps/rejected": -1.6135002374649048,
"loss": 1.3792,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.021001553162932396,
"rewards/margins": 0.2840999960899353,
"rewards/rejected": -0.26309841871261597,
"step": 1150
},
{
"epoch": 0.6071708976707668,
"grad_norm": 278.3453630329118,
"learning_rate": 1.60242827401163e-07,
"logits/chosen": -1.0601884126663208,
"logits/rejected": -1.0491106510162354,
"logps/chosen": -1.3804155588150024,
"logps/rejected": -1.478729486465454,
"loss": 1.4281,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.07954071462154388,
"rewards/margins": 0.320735901594162,
"rewards/rejected": -0.2411951720714569,
"step": 1160
},
{
"epoch": 0.6124051295472389,
"grad_norm": 191.92209965945904,
"learning_rate": 1.5666747211247708e-07,
"logits/chosen": -0.9765886068344116,
"logits/rejected": -1.0292718410491943,
"logps/chosen": -1.3733490705490112,
"logps/rejected": -1.5234708786010742,
"loss": 1.4206,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.011603465303778648,
"rewards/margins": 0.39544984698295593,
"rewards/rejected": -0.38384637236595154,
"step": 1170
},
{
"epoch": 0.6176393614237111,
"grad_norm": 508.0527712691584,
"learning_rate": 1.5310658953610188e-07,
"logits/chosen": -0.8951209783554077,
"logits/rejected": -0.9667248725891113,
"logps/chosen": -1.3316763639450073,
"logps/rejected": -1.4773919582366943,
"loss": 1.3855,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.10079389810562134,
"rewards/margins": 0.26270270347595215,
"rewards/rejected": -0.161908820271492,
"step": 1180
},
{
"epoch": 0.6228735933001832,
"grad_norm": 122.17829279586468,
"learning_rate": 1.4956136897789153e-07,
"logits/chosen": -1.054325819015503,
"logits/rejected": -1.1099879741668701,
"logps/chosen": -1.3022596836090088,
"logps/rejected": -1.4433825016021729,
"loss": 1.3658,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.12086385488510132,
"rewards/margins": 0.40718135237693787,
"rewards/rejected": -0.28631752729415894,
"step": 1190
},
{
"epoch": 0.6281078251766553,
"grad_norm": 529.7173443798092,
"learning_rate": 1.4603299451271378e-07,
"logits/chosen": -0.945719838142395,
"logits/rejected": -0.953068733215332,
"logps/chosen": -1.3291584253311157,
"logps/rejected": -1.4614441394805908,
"loss": 1.4134,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.010808942839503288,
"rewards/margins": 0.28808557987213135,
"rewards/rejected": -0.2772766053676605,
"step": 1200
},
{
"epoch": 0.6281078251766553,
"eval_logits/chosen": -1.0341262817382812,
"eval_logits/rejected": -1.0602811574935913,
"eval_logps/chosen": -1.267260193824768,
"eval_logps/rejected": -1.5152653455734253,
"eval_loss": 1.3929948806762695,
"eval_rewards/accuracies": 0.738095223903656,
"eval_rewards/chosen": 0.0582895502448082,
"eval_rewards/margins": 0.4461366534233093,
"eval_rewards/rejected": -0.3878471553325653,
"eval_runtime": 268.653,
"eval_samples_per_second": 7.445,
"eval_steps_per_second": 0.235,
"step": 1200
},
{
"epoch": 0.6333420570531274,
"grad_norm": 103.87458468332127,
"learning_rate": 1.4252264458897765e-07,
"logits/chosen": -1.0020216703414917,
"logits/rejected": -1.003204345703125,
"logps/chosen": -1.3405869007110596,
"logps/rejected": -1.4487977027893066,
"loss": 1.4199,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.15262803435325623,
"rewards/margins": 0.41794830560684204,
"rewards/rejected": -0.26532021164894104,
"step": 1210
},
{
"epoch": 0.6385762889295996,
"grad_norm": 126.9034714680501,
"learning_rate": 1.390314916350422e-07,
"logits/chosen": -1.0031870603561401,
"logits/rejected": -1.025270700454712,
"logps/chosen": -1.4444220066070557,
"logps/rejected": -1.5348130464553833,
"loss": 1.3962,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.06764872372150421,
"rewards/margins": 0.563138484954834,
"rewards/rejected": -0.6307872533798218,
"step": 1220
},
{
"epoch": 0.6438105208060717,
"grad_norm": 276.6912026252489,
"learning_rate": 1.3556070166763415e-07,
"logits/chosen": -0.9344841241836548,
"logits/rejected": -1.0246646404266357,
"logps/chosen": -1.3276522159576416,
"logps/rejected": -1.4863455295562744,
"loss": 1.4041,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.14306631684303284,
"rewards/margins": 0.48536840081214905,
"rewards/rejected": -0.3423021733760834,
"step": 1230
},
{
"epoch": 0.6490447526825438,
"grad_norm": 332.7158437087833,
"learning_rate": 1.321114339024084e-07,
"logits/chosen": -0.9632574915885925,
"logits/rejected": -1.0173401832580566,
"logps/chosen": -1.2917425632476807,
"logps/rejected": -1.5060694217681885,
"loss": 1.3616,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.10876176506280899,
"rewards/margins": 0.5349315404891968,
"rewards/rejected": -0.4261697232723236,
"step": 1240
},
{
"epoch": 0.654278984559016,
"grad_norm": 97.38808549564327,
"learning_rate": 1.2868484036677894e-07,
"logits/chosen": -1.0494537353515625,
"logits/rejected": -1.0573270320892334,
"logps/chosen": -1.312703013420105,
"logps/rejected": -1.3993957042694092,
"loss": 1.419,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.14204749464988708,
"rewards/margins": 0.43673020601272583,
"rewards/rejected": -0.29468271136283875,
"step": 1250
},
{
"epoch": 0.6595132164354881,
"grad_norm": 76.07649121820532,
"learning_rate": 1.2528206551515154e-07,
"logits/chosen": -1.0372177362442017,
"logits/rejected": -1.07349693775177,
"logps/chosen": -1.5161725282669067,
"logps/rejected": -1.4368443489074707,
"loss": 1.3813,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.14961332082748413,
"rewards/margins": 0.501705527305603,
"rewards/rejected": -0.35209211707115173,
"step": 1260
},
{
"epoch": 0.6647474483119602,
"grad_norm": 68.86700640108432,
"learning_rate": 1.2190424584668462e-07,
"logits/chosen": -0.9774061441421509,
"logits/rejected": -1.022447943687439,
"logps/chosen": -1.3168667554855347,
"logps/rejected": -1.5211305618286133,
"loss": 1.36,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.06608399748802185,
"rewards/margins": 0.4469234347343445,
"rewards/rejected": -0.38083943724632263,
"step": 1270
},
{
"epoch": 0.6699816801884323,
"grad_norm": 114.4568415859116,
"learning_rate": 1.185525095257085e-07,
"logits/chosen": -1.0163053274154663,
"logits/rejected": -1.0360127687454224,
"logps/chosen": -1.306158423423767,
"logps/rejected": -1.535194993019104,
"loss": 1.4034,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.10055939853191376,
"rewards/margins": 0.4471976161003113,
"rewards/rejected": -0.34663820266723633,
"step": 1280
},
{
"epoch": 0.6752159120649045,
"grad_norm": 54.90434249157224,
"learning_rate": 1.1522797600492707e-07,
"logits/chosen": -0.9692333936691284,
"logits/rejected": -1.0122019052505493,
"logps/chosen": -1.2191261053085327,
"logps/rejected": -1.5494043827056885,
"loss": 1.3791,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.07996504008769989,
"rewards/margins": 0.43224668502807617,
"rewards/rejected": -0.3522816598415375,
"step": 1290
},
{
"epoch": 0.6804501439413766,
"grad_norm": 108.18282033188903,
"learning_rate": 1.1193175565153017e-07,
"logits/chosen": -0.9120146036148071,
"logits/rejected": -0.9983510971069336,
"logps/chosen": -1.3106328248977661,
"logps/rejected": -1.4724805355072021,
"loss": 1.3657,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.011367501690983772,
"rewards/margins": 0.22781343758106232,
"rewards/rejected": -0.2164459526538849,
"step": 1300
},
{
"epoch": 0.6804501439413766,
"eval_logits/chosen": -1.0687713623046875,
"eval_logits/rejected": -1.0977264642715454,
"eval_logps/chosen": -1.2662289142608643,
"eval_logps/rejected": -1.5134109258651733,
"eval_loss": 1.3927397727966309,
"eval_rewards/accuracies": 0.7361111044883728,
"eval_rewards/chosen": 0.07375912368297577,
"eval_rewards/margins": 0.4337916970252991,
"eval_rewards/rejected": -0.3600325584411621,
"eval_runtime": 263.1627,
"eval_samples_per_second": 7.6,
"eval_steps_per_second": 0.239,
"step": 1300
},
{
"epoch": 0.6856843758178487,
"grad_norm": 156.5057273737964,
"learning_rate": 1.0866494937633952e-07,
"logits/chosen": -0.9764137268066406,
"logits/rejected": -1.0053216218948364,
"logps/chosen": -1.370274305343628,
"logps/rejected": -1.550768494606018,
"loss": 1.3795,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.14111948013305664,
"rewards/margins": 0.499011367559433,
"rewards/rejected": -0.35789191722869873,
"step": 1310
},
{
"epoch": 0.6909186076943209,
"grad_norm": 183.97189217976666,
"learning_rate": 1.0542864826611373e-07,
"logits/chosen": -0.9627429246902466,
"logits/rejected": -0.980324387550354,
"logps/chosen": -1.3693149089813232,
"logps/rejected": -1.5022780895233154,
"loss": 1.3761,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.18044526875019073,
"rewards/margins": 0.5759294033050537,
"rewards/rejected": -0.39548414945602417,
"step": 1320
},
{
"epoch": 0.696152839570793,
"grad_norm": 123.36310936062844,
"learning_rate": 1.0222393321913405e-07,
"logits/chosen": -0.9242043495178223,
"logits/rejected": -0.9338275194168091,
"logps/chosen": -1.349094271659851,
"logps/rejected": -1.579219102859497,
"loss": 1.3722,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.06291097402572632,
"rewards/margins": 0.4077533781528473,
"rewards/rejected": -0.4706643223762512,
"step": 1330
},
{
"epoch": 0.7013870714472651,
"grad_norm": 166.04682689360567,
"learning_rate": 9.905187458419342e-08,
"logits/chosen": -0.852869987487793,
"logits/rejected": -0.9787343144416809,
"logps/chosen": -1.232936143875122,
"logps/rejected": -1.542320966720581,
"loss": 1.4051,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.18137158453464508,
"rewards/margins": 0.23298697173595428,
"rewards/rejected": -0.051615405827760696,
"step": 1340
},
{
"epoch": 0.7066213033237373,
"grad_norm": 95.47739919094735,
"learning_rate": 9.591353180310812e-08,
"logits/chosen": -0.9522558450698853,
"logits/rejected": -1.0096293687820435,
"logps/chosen": -1.3504010438919067,
"logps/rejected": -1.3914040327072144,
"loss": 1.4006,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.19880017638206482,
"rewards/margins": 0.5202890038490295,
"rewards/rejected": -0.32148876786231995,
"step": 1350
},
{
"epoch": 0.7118555352002094,
"grad_norm": 128.51259333091,
"learning_rate": 9.280995305687343e-08,
"logits/chosen": -0.9708206057548523,
"logits/rejected": -1.021451711654663,
"logps/chosen": -1.3550317287445068,
"logps/rejected": -1.5246636867523193,
"loss": 1.4348,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.06314614415168762,
"rewards/margins": 0.2832590341567993,
"rewards/rejected": -0.22011291980743408,
"step": 1360
},
{
"epoch": 0.7170897670766815,
"grad_norm": 94.91641424091182,
"learning_rate": 8.974217491557916e-08,
"logits/chosen": -1.0391184091567993,
"logits/rejected": -1.0677263736724854,
"logps/chosen": -1.3149784803390503,
"logps/rejected": -1.5607545375823975,
"loss": 1.3456,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.1072906106710434,
"rewards/margins": 0.558443546295166,
"rewards/rejected": -0.4511529505252838,
"step": 1370
},
{
"epoch": 0.7223239989531536,
"grad_norm": 288.16991161199616,
"learning_rate": 8.6711221992204e-08,
"logits/chosen": -0.9859504699707031,
"logits/rejected": -1.0607242584228516,
"logps/chosen": -1.2985832691192627,
"logps/rejected": -1.4298102855682373,
"loss": 1.3638,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.14652974903583527,
"rewards/margins": 0.583271861076355,
"rewards/rejected": -0.4367421269416809,
"step": 1380
},
{
"epoch": 0.7275582308296258,
"grad_norm": 147.08196145784484,
"learning_rate": 8.371810660040286e-08,
"logits/chosen": -0.9275096654891968,
"logits/rejected": -1.0270216464996338,
"logps/chosen": -1.4230481386184692,
"logps/rejected": -1.7390727996826172,
"loss": 1.4108,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.008330064825713634,
"rewards/margins": 0.352591872215271,
"rewards/rejected": -0.3609219491481781,
"step": 1390
},
{
"epoch": 0.7327924627060979,
"grad_norm": 44.78410651121718,
"learning_rate": 8.076382841640277e-08,
"logits/chosen": -1.0107920169830322,
"logits/rejected": -1.0693024396896362,
"logps/chosen": -1.3128068447113037,
"logps/rejected": -1.4866211414337158,
"loss": 1.3569,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.04387504979968071,
"rewards/margins": 0.42374491691589355,
"rewards/rejected": -0.37986987829208374,
"step": 1400
},
{
"epoch": 0.7327924627060979,
"eval_logits/chosen": -1.0332118272781372,
"eval_logits/rejected": -1.0589492321014404,
"eval_logps/chosen": -1.2619318962097168,
"eval_logps/rejected": -1.5095207691192627,
"eval_loss": 1.401206135749817,
"eval_rewards/accuracies": 0.738095223903656,
"eval_rewards/chosen": 0.13821402192115784,
"eval_rewards/margins": 0.4398939311504364,
"eval_rewards/rejected": -0.30167993903160095,
"eval_runtime": 269.235,
"eval_samples_per_second": 7.428,
"eval_steps_per_second": 0.234,
"step": 1400
},
{
"epoch": 0.73802669458257,
"grad_norm": 105.62122860453555,
"learning_rate": 7.784937414511845e-08,
"logits/chosen": -1.001814365386963,
"logits/rejected": -1.0847572088241577,
"logps/chosen": -1.3243391513824463,
"logps/rejected": -1.6870197057724,
"loss": 1.4171,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.1925734281539917,
"rewards/margins": 0.6328744292259216,
"rewards/rejected": -0.4403010904788971,
"step": 1410
},
{
"epoch": 0.7432609264590422,
"grad_norm": 51.61690842370248,
"learning_rate": 7.497571719060176e-08,
"logits/chosen": -0.9514597654342651,
"logits/rejected": -1.020627737045288,
"logps/chosen": -1.2951841354370117,
"logps/rejected": -1.4787520170211792,
"loss": 1.3945,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.0273969117552042,
"rewards/margins": 0.40641704201698303,
"rewards/rejected": -0.37902015447616577,
"step": 1420
},
{
"epoch": 0.7484951583355143,
"grad_norm": 155.75874519048327,
"learning_rate": 7.214381733093155e-08,
"logits/chosen": -0.98955237865448,
"logits/rejected": -1.0787885189056396,
"logps/chosen": -1.2784019708633423,
"logps/rejected": -1.4226205348968506,
"loss": 1.4072,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.33527031540870667,
"rewards/margins": 0.6042425632476807,
"rewards/rejected": -0.2689722776412964,
"step": 1430
},
{
"epoch": 0.7537293902119864,
"grad_norm": 48.85149863967685,
"learning_rate": 6.935462039765676e-08,
"logits/chosen": -0.9959059953689575,
"logits/rejected": -1.0643514394760132,
"logps/chosen": -1.2606055736541748,
"logps/rejected": -1.371964454650879,
"loss": 1.3895,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.2062605619430542,
"rewards/margins": 0.3812524676322937,
"rewards/rejected": -0.17499187588691711,
"step": 1440
},
{
"epoch": 0.7589636220884585,
"grad_norm": 120.78360298158358,
"learning_rate": 6.660905795989545e-08,
"logits/chosen": -1.0321362018585205,
"logits/rejected": -1.0730319023132324,
"logps/chosen": -1.4630558490753174,
"logps/rejected": -1.6992202997207642,
"loss": 1.3719,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.18504472076892853,
"rewards/margins": 0.6389643549919128,
"rewards/rejected": -0.4539197087287903,
"step": 1450
},
{
"epoch": 0.7641978539649307,
"grad_norm": 111.414446510109,
"learning_rate": 6.39080470131989e-08,
"logits/chosen": -0.9712077975273132,
"logits/rejected": -1.0402915477752686,
"logps/chosen": -1.3074638843536377,
"logps/rejected": -1.4820187091827393,
"loss": 1.3939,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.018328525125980377,
"rewards/margins": 0.3735812306404114,
"rewards/rejected": -0.3552526831626892,
"step": 1460
},
{
"epoch": 0.7694320858414028,
"grad_norm": 421.6141362022388,
"learning_rate": 6.125248967328198e-08,
"logits/chosen": -0.9822176694869995,
"logits/rejected": -1.059078335762024,
"logps/chosen": -1.365307092666626,
"logps/rejected": -1.535718560218811,
"loss": 1.3764,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.1095791831612587,
"rewards/margins": 0.26515552401542664,
"rewards/rejected": -0.15557633340358734,
"step": 1470
},
{
"epoch": 0.7746663177178749,
"grad_norm": 93.39309030522578,
"learning_rate": 5.8643272874724504e-08,
"logits/chosen": -0.9487727284431458,
"logits/rejected": -1.0384645462036133,
"logps/chosen": -1.2958605289459229,
"logps/rejected": -1.4537372589111328,
"loss": 1.3613,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.055318187922239304,
"rewards/margins": 0.5227149724960327,
"rewards/rejected": -0.4673967957496643,
"step": 1480
},
{
"epoch": 0.7799005495943471,
"grad_norm": 162.31192195360663,
"learning_rate": 5.608126807474145e-08,
"logits/chosen": -0.9301995038986206,
"logits/rejected": -0.9804097414016724,
"logps/chosen": -1.415575623512268,
"logps/rejected": -1.5492489337921143,
"loss": 1.4203,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.015231065452098846,
"rewards/margins": 0.22754482924938202,
"rewards/rejected": -0.24277588725090027,
"step": 1490
},
{
"epoch": 0.7851347814708192,
"grad_norm": 136.2356214233426,
"learning_rate": 5.356733096212422e-08,
"logits/chosen": -1.0096968412399292,
"logits/rejected": -1.0772438049316406,
"logps/chosen": -1.5461193323135376,
"logps/rejected": -1.8158565759658813,
"loss": 1.4025,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.11093674600124359,
"rewards/margins": 0.6151713132858276,
"rewards/rejected": -0.5042346119880676,
"step": 1500
},
{
"epoch": 0.7851347814708192,
"eval_logits/chosen": -1.050403118133545,
"eval_logits/rejected": -1.0775498151779175,
"eval_logps/chosen": -1.2663925886154175,
"eval_logps/rejected": -1.513696312904358,
"eval_loss": 1.3905304670333862,
"eval_rewards/accuracies": 0.7460317611694336,
"eval_rewards/chosen": 0.07130717486143112,
"eval_rewards/margins": 0.4356210231781006,
"eval_rewards/rejected": -0.36431384086608887,
"eval_runtime": 266.5396,
"eval_samples_per_second": 7.504,
"eval_steps_per_second": 0.236,
"step": 1500
},
{
"epoch": 0.7903690133472913,
"grad_norm": 130.03419321106188,
"learning_rate": 5.1102301171446824e-08,
"logits/chosen": -1.013892412185669,
"logits/rejected": -1.0700061321258545,
"logps/chosen": -1.3864901065826416,
"logps/rejected": -1.503999948501587,
"loss": 1.3938,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.011791354976594448,
"rewards/margins": 0.44438084959983826,
"rewards/rejected": -0.45617228746414185,
"step": 1510
},
{
"epoch": 0.7956032452237635,
"grad_norm": 221.10773316045186,
"learning_rate": 4.8687002002635204e-08,
"logits/chosen": -0.9744777679443359,
"logits/rejected": -1.0147373676300049,
"logps/chosen": -1.3872696161270142,
"logps/rejected": -1.5349359512329102,
"loss": 1.3795,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.1630631983280182,
"rewards/margins": 0.39758509397506714,
"rewards/rejected": -0.23452191054821014,
"step": 1520
},
{
"epoch": 0.8008374771002356,
"grad_norm": 38.83280676048386,
"learning_rate": 4.632224014599151e-08,
"logits/chosen": -1.0282185077667236,
"logits/rejected": -1.0352448225021362,
"logps/chosen": -1.3574353456497192,
"logps/rejected": -1.47411048412323,
"loss": 1.3567,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.10859794914722443,
"rewards/margins": 0.5248547792434692,
"rewards/rejected": -0.4162568151950836,
"step": 1530
},
{
"epoch": 0.8060717089767077,
"grad_norm": 66.73720875935125,
"learning_rate": 4.400880541276608e-08,
"logits/chosen": -1.018883228302002,
"logits/rejected": -0.9882569313049316,
"logps/chosen": -1.3592469692230225,
"logps/rejected": -1.4263975620269775,
"loss": 1.38,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.13372251391410828,
"rewards/margins": 0.3992050290107727,
"rewards/rejected": -0.26548251509666443,
"step": 1540
},
{
"epoch": 0.8113059408531798,
"grad_norm": 126.07795147064398,
"learning_rate": 4.1747470471367066e-08,
"logits/chosen": -0.9707099795341492,
"logits/rejected": -1.043423056602478,
"logps/chosen": -1.257214903831482,
"logps/rejected": -1.4531395435333252,
"loss": 1.3925,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.08239827305078506,
"rewards/margins": 0.43017762899398804,
"rewards/rejected": -0.3477793335914612,
"step": 1550
},
{
"epoch": 0.816540172729652,
"grad_norm": 239.23092115114255,
"learning_rate": 3.953899058929542e-08,
"logits/chosen": -0.9740544557571411,
"logits/rejected": -1.002071738243103,
"logps/chosen": -1.36293625831604,
"logps/rejected": -1.4228624105453491,
"loss": 1.3994,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.1495702564716339,
"rewards/margins": 0.3382664620876312,
"rewards/rejected": -0.18869620561599731,
"step": 1560
},
{
"epoch": 0.821774404606124,
"grad_norm": 271.7775399259313,
"learning_rate": 3.738410338089149e-08,
"logits/chosen": -1.0329101085662842,
"logits/rejected": -1.0802018642425537,
"logps/chosen": -1.3035242557525635,
"logps/rejected": -1.4981635808944702,
"loss": 1.4136,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.19920700788497925,
"rewards/margins": 0.6002500653266907,
"rewards/rejected": -0.40104299783706665,
"step": 1570
},
{
"epoch": 0.8270086364825961,
"grad_norm": 112.4116696366104,
"learning_rate": 3.528352856097816e-08,
"logits/chosen": -1.06103515625,
"logits/rejected": -1.077307105064392,
"logps/chosen": -1.341233730316162,
"logps/rejected": -1.4782545566558838,
"loss": 1.3651,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.10534757375717163,
"rewards/margins": 0.4684361517429352,
"rewards/rejected": -0.36308857798576355,
"step": 1580
},
{
"epoch": 0.8322428683590684,
"grad_norm": 112.88065021967937,
"learning_rate": 3.323796770448157e-08,
"logits/chosen": -0.891954243183136,
"logits/rejected": -1.009387731552124,
"logps/chosen": -1.2871875762939453,
"logps/rejected": -1.667616844177246,
"loss": 1.3924,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.18675394356250763,
"rewards/margins": 0.7343874573707581,
"rewards/rejected": -0.5476335287094116,
"step": 1590
},
{
"epoch": 0.8374771002355405,
"grad_norm": 274.35787721935316,
"learning_rate": 3.1248104012111085e-08,
"logits/chosen": -1.0275261402130127,
"logits/rejected": -1.0268418788909912,
"logps/chosen": -1.4677902460098267,
"logps/rejected": -1.5933417081832886,
"loss": 1.4056,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0058928607031702995,
"rewards/margins": 0.3813323378562927,
"rewards/rejected": -0.3872251808643341,
"step": 1600
},
{
"epoch": 0.8374771002355405,
"eval_logits/chosen": -1.0522223711013794,
"eval_logits/rejected": -1.0792322158813477,
"eval_logps/chosen": -1.262178659439087,
"eval_logps/rejected": -1.5092347860336304,
"eval_loss": 1.3950434923171997,
"eval_rewards/accuracies": 0.7341269850730896,
"eval_rewards/chosen": 0.13451193273067474,
"eval_rewards/margins": 0.4319010376930237,
"eval_rewards/rejected": -0.2973891496658325,
"eval_runtime": 263.6942,
"eval_samples_per_second": 7.585,
"eval_steps_per_second": 0.239,
"step": 1600
},
{
"epoch": 0.8427113321120125,
"grad_norm": 138.0710133394622,
"learning_rate": 2.931460208217562e-08,
"logits/chosen": -0.9226272702217102,
"logits/rejected": -1.032504677772522,
"logps/chosen": -1.2919390201568604,
"logps/rejected": -1.6714973449707031,
"loss": 1.3897,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.11851558834314346,
"rewards/margins": 0.38856449723243713,
"rewards/rejected": -0.27004891633987427,
"step": 1610
},
{
"epoch": 0.8479455639884846,
"grad_norm": 374.96362049867355,
"learning_rate": 2.743810768861341e-08,
"logits/chosen": -0.9746836423873901,
"logits/rejected": -1.0413103103637695,
"logps/chosen": -1.255078673362732,
"logps/rejected": -1.5265228748321533,
"loss": 1.3898,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.0367145761847496,
"rewards/margins": 0.3037044405937195,
"rewards/rejected": -0.26698988676071167,
"step": 1620
},
{
"epoch": 0.8531797958649568,
"grad_norm": 1561.6988123064714,
"learning_rate": 2.5619247565308444e-08,
"logits/chosen": -1.0172195434570312,
"logits/rejected": -1.0282325744628906,
"logps/chosen": -1.3624012470245361,
"logps/rejected": -1.428806185722351,
"loss": 1.4127,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.02823961339890957,
"rewards/margins": 0.31183555722236633,
"rewards/rejected": -0.34007519483566284,
"step": 1630
},
{
"epoch": 0.8584140277414289,
"grad_norm": 432.62083647251535,
"learning_rate": 2.3858629196766845e-08,
"logits/chosen": -0.9970697164535522,
"logits/rejected": -1.0459386110305786,
"logps/chosen": -1.39687979221344,
"logps/rejected": -1.453002691268921,
"loss": 1.382,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.012905001640319824,
"rewards/margins": 0.27340295910835266,
"rewards/rejected": -0.2863079607486725,
"step": 1640
},
{
"epoch": 0.863648259617901,
"grad_norm": 107.41089429393227,
"learning_rate": 2.2156840615221563e-08,
"logits/chosen": -1.0009286403656006,
"logits/rejected": -1.008098840713501,
"logps/chosen": -1.3218780755996704,
"logps/rejected": -1.461322546005249,
"loss": 1.4188,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.14685675501823425,
"rewards/margins": 0.3876083493232727,
"rewards/rejected": -0.24075157940387726,
"step": 1650
},
{
"epoch": 0.8688824914943732,
"grad_norm": 139.7533229551783,
"learning_rate": 2.0514450204234724e-08,
"logits/chosen": -1.0012562274932861,
"logits/rejected": -1.0368671417236328,
"logps/chosen": -1.3768261671066284,
"logps/rejected": -1.5115500688552856,
"loss": 1.3544,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.0796336680650711,
"rewards/margins": 0.47576791048049927,
"rewards/rejected": -0.39613422751426697,
"step": 1660
},
{
"epoch": 0.8741167233708453,
"grad_norm": 100.1098611400833,
"learning_rate": 1.8932006508861865e-08,
"logits/chosen": -0.9033932685852051,
"logits/rejected": -1.0143510103225708,
"logps/chosen": -1.2409793138504028,
"logps/rejected": -1.513612985610962,
"loss": 1.3784,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.053215838968753815,
"rewards/margins": 0.46576422452926636,
"rewards/rejected": -0.41254839301109314,
"step": 1670
},
{
"epoch": 0.8793509552473174,
"grad_norm": 164.31866508664524,
"learning_rate": 1.7410038052442633e-08,
"logits/chosen": -0.9056793451309204,
"logits/rejected": -0.9717051386833191,
"logps/chosen": -1.3354060649871826,
"logps/rejected": -1.5312366485595703,
"loss": 1.3525,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.1547504961490631,
"rewards/margins": 0.549881637096405,
"rewards/rejected": -0.3951311409473419,
"step": 1680
},
{
"epoch": 0.8845851871237895,
"grad_norm": 781.1957844328009,
"learning_rate": 1.5949053160077974e-08,
"logits/chosen": -1.0040967464447021,
"logits/rejected": -1.042490839958191,
"logps/chosen": -1.3877414464950562,
"logps/rejected": -1.6622329950332642,
"loss": 1.4005,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.12258515506982803,
"rewards/margins": 0.5700903534889221,
"rewards/rejected": -0.4475051760673523,
"step": 1690
},
{
"epoch": 0.8898194190002617,
"grad_norm": 156.43790654355806,
"learning_rate": 1.4549539788853981e-08,
"logits/chosen": -0.9561022520065308,
"logits/rejected": -1.0446237325668335,
"logps/chosen": -1.3862628936767578,
"logps/rejected": -1.4567193984985352,
"loss": 1.3963,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.06578576564788818,
"rewards/margins": 0.31272581219673157,
"rewards/rejected": -0.246940016746521,
"step": 1700
},
{
"epoch": 0.8898194190002617,
"eval_logits/chosen": -1.0522090196609497,
"eval_logits/rejected": -1.0791518688201904,
"eval_logps/chosen": -1.266129732131958,
"eval_logps/rejected": -1.5129594802856445,
"eval_loss": 1.3915550708770752,
"eval_rewards/accuracies": 0.7400793433189392,
"eval_rewards/chosen": 0.07524589449167252,
"eval_rewards/margins": 0.42850303649902344,
"eval_rewards/rejected": -0.3532571494579315,
"eval_runtime": 270.4262,
"eval_samples_per_second": 7.396,
"eval_steps_per_second": 0.233,
"step": 1700
},
{
"epoch": 0.8950536508767338,
"grad_norm": 133.3233493091381,
"learning_rate": 1.3211965364867906e-08,
"logits/chosen": -0.9714654684066772,
"logits/rejected": -1.0295698642730713,
"logps/chosen": -1.28077232837677,
"logps/rejected": -1.6242740154266357,
"loss": 1.3713,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.012086862698197365,
"rewards/margins": 0.3643137514591217,
"rewards/rejected": -0.3522268831729889,
"step": 1710
},
{
"epoch": 0.9002878827532059,
"grad_norm": 162.10930120150212,
"learning_rate": 1.1936776627111789e-08,
"logits/chosen": -0.9902065396308899,
"logits/rejected": -1.016322135925293,
"logps/chosen": -1.3354121446609497,
"logps/rejected": -1.4340362548828125,
"loss": 1.3901,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.08343851566314697,
"rewards/margins": 0.25945615768432617,
"rewards/rejected": -0.1760176420211792,
"step": 1720
},
{
"epoch": 0.9055221146296781,
"grad_norm": 79.88141762385948,
"learning_rate": 1.072439947826531e-08,
"logits/chosen": -0.9664213061332703,
"logits/rejected": -1.0111840963363647,
"logps/chosen": -1.2742303609848022,
"logps/rejected": -1.4425808191299438,
"loss": 1.3581,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.03463122993707657,
"rewards/margins": 0.39741796255111694,
"rewards/rejected": -0.3627867102622986,
"step": 1730
},
{
"epoch": 0.9107563465061502,
"grad_norm": 98.56849565071181,
"learning_rate": 9.575238842447686e-09,
"logits/chosen": -1.0055774450302124,
"logits/rejected": -1.024350881576538,
"logps/chosen": -1.4313799142837524,
"logps/rejected": -1.6486520767211914,
"loss": 1.3729,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.043267179280519485,
"rewards/margins": 0.36729955673217773,
"rewards/rejected": -0.4105667173862457,
"step": 1740
},
{
"epoch": 0.9159905783826223,
"grad_norm": 50.17096951913472,
"learning_rate": 8.489678529976242e-09,
"logits/chosen": -0.9870138168334961,
"logits/rejected": -1.058259129524231,
"logps/chosen": -1.3640022277832031,
"logps/rejected": -1.4488258361816406,
"loss": 1.378,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.029376666992902756,
"rewards/margins": 0.24675174057483673,
"rewards/rejected": -0.2173750400543213,
"step": 1750
},
{
"epoch": 0.9212248102590945,
"grad_norm": 120.75312862313243,
"learning_rate": 7.468081109177027e-09,
"logits/chosen": -1.0199341773986816,
"logits/rejected": -1.0588958263397217,
"logps/chosen": -1.4964534044265747,
"logps/rejected": -1.6024501323699951,
"loss": 1.3929,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.12446703016757965,
"rewards/margins": 0.37628185749053955,
"rewards/rejected": -0.2518148422241211,
"step": 1760
},
{
"epoch": 0.9264590421355666,
"grad_norm": 61.49650646596077,
"learning_rate": 6.5107877852898176e-09,
"logits/chosen": -0.9167430996894836,
"logits/rejected": -0.9556129574775696,
"logps/chosen": -1.3390752077102661,
"logps/rejected": -1.6009235382080078,
"loss": 1.423,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.4433160424232483,
"rewards/margins": 0.9942270517349243,
"rewards/rejected": -0.5509108901023865,
"step": 1770
},
{
"epoch": 0.9316932740120387,
"grad_norm": 71.97694388450074,
"learning_rate": 5.6181182865083996e-09,
"logits/chosen": -0.9885386228561401,
"logits/rejected": -1.063110113143921,
"logps/chosen": -1.2272593975067139,
"logps/rejected": -1.5328699350357056,
"loss": 1.3663,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.1304434984922409,
"rewards/margins": 0.5252709984779358,
"rewards/rejected": -0.3948274552822113,
"step": 1780
},
{
"epoch": 0.9369275058885108,
"grad_norm": 119.59418512940066,
"learning_rate": 4.790370757193906e-09,
"logits/chosen": -0.9046109914779663,
"logits/rejected": -0.9860088229179382,
"logps/chosen": -1.3347175121307373,
"logps/rejected": -1.5166120529174805,
"loss": 1.3675,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.14059893786907196,
"rewards/margins": 0.5057962536811829,
"rewards/rejected": -0.36519724130630493,
"step": 1790
},
{
"epoch": 0.942161737764983,
"grad_norm": 171.46537540400269,
"learning_rate": 4.0278216582971145e-09,
"logits/chosen": -0.9255016446113586,
"logits/rejected": -0.9924243092536926,
"logps/chosen": -1.1826350688934326,
"logps/rejected": -1.5568852424621582,
"loss": 1.3775,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.08452494442462921,
"rewards/margins": 0.3724609911441803,
"rewards/rejected": -0.28793609142303467,
"step": 1800
},
{
"epoch": 0.942161737764983,
"eval_logits/chosen": -1.0464354753494263,
"eval_logits/rejected": -1.0728554725646973,
"eval_logps/chosen": -1.2658635377883911,
"eval_logps/rejected": -1.5124911069869995,
"eval_loss": 1.3899798393249512,
"eval_rewards/accuracies": 0.7400793433189392,
"eval_rewards/chosen": 0.07923853397369385,
"eval_rewards/margins": 0.42547234892845154,
"eval_rewards/rejected": -0.3462338149547577,
"eval_runtime": 265.6785,
"eval_samples_per_second": 7.528,
"eval_steps_per_second": 0.237,
"step": 1800
},
{
"epoch": 0.9473959696414551,
"grad_norm": 162.20555081217532,
"learning_rate": 3.3307256750225944e-09,
"logits/chosen": -0.9436967968940735,
"logits/rejected": -1.0339114665985107,
"logps/chosen": -1.2900078296661377,
"logps/rejected": -1.6287893056869507,
"loss": 1.3833,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.13000617921352386,
"rewards/margins": 0.5302383303642273,
"rewards/rejected": -0.4002321660518646,
"step": 1810
},
{
"epoch": 0.9526302015179272,
"grad_norm": 69.377718963086,
"learning_rate": 2.6993156317660636e-09,
"logits/chosen": -0.9864269495010376,
"logits/rejected": -1.118485450744629,
"logps/chosen": -1.2873159646987915,
"logps/rejected": -1.5646283626556396,
"loss": 1.3589,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.2326822727918625,
"rewards/margins": 0.4942099452018738,
"rewards/rejected": -0.2615277171134949,
"step": 1820
},
{
"epoch": 0.9578644333943994,
"grad_norm": 263.95656912812524,
"learning_rate": 2.1338024143528142e-09,
"logits/chosen": -0.9538179636001587,
"logits/rejected": -1.0052978992462158,
"logps/chosen": -1.3281259536743164,
"logps/rejected": -1.455165147781372,
"loss": 1.3882,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.04303558170795441,
"rewards/margins": 0.2979881465435028,
"rewards/rejected": -0.2549525201320648,
"step": 1830
},
{
"epoch": 0.9630986652708715,
"grad_norm": 39.18625236358022,
"learning_rate": 1.6343748996036077e-09,
"logits/chosen": -0.9945961833000183,
"logits/rejected": -1.0426194667816162,
"logps/chosen": -1.3158668279647827,
"logps/rejected": -1.5129916667938232,
"loss": 1.3731,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.048399608582258224,
"rewards/margins": 0.4530261158943176,
"rewards/rejected": -0.4046264588832855,
"step": 1840
},
{
"epoch": 0.9683328971473436,
"grad_norm": 1286.9737737396774,
"learning_rate": 1.2011998922513367e-09,
"logits/chosen": -0.908880889415741,
"logits/rejected": -1.0028820037841797,
"logps/chosen": -1.2711695432662964,
"logps/rejected": -1.590257167816162,
"loss": 1.3902,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.03833279758691788,
"rewards/margins": 0.42264699935913086,
"rewards/rejected": -0.3843142092227936,
"step": 1850
},
{
"epoch": 0.9735671290238157,
"grad_norm": 637.3699428656323,
"learning_rate": 8.34422069229701e-10,
"logits/chosen": -0.9810758829116821,
"logits/rejected": -1.034802794456482,
"logps/chosen": -1.3561222553253174,
"logps/rejected": -1.622603416442871,
"loss": 1.3511,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.09475534409284592,
"rewards/margins": 0.5301617383956909,
"rewards/rejected": -0.4354063868522644,
"step": 1860
},
{
"epoch": 0.9788013609002879,
"grad_norm": 140.92434283058824,
"learning_rate": 5.341639313521052e-10,
"logits/chosen": -0.9513614773750305,
"logits/rejected": -0.9637888073921204,
"logps/chosen": -1.2574306726455688,
"logps/rejected": -1.5153371095657349,
"loss": 1.4036,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.3424448072910309,
"rewards/margins": 0.3421035706996918,
"rewards/rejected": 0.00034122465876862407,
"step": 1870
},
{
"epoch": 0.98403559277676,
"grad_norm": 36.89568780813949,
"learning_rate": 3.005257623974966e-10,
"logits/chosen": -0.9985333681106567,
"logits/rejected": -1.0075907707214355,
"logps/chosen": -1.2374012470245361,
"logps/rejected": -1.4116681814193726,
"loss": 1.367,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.05069660022854805,
"rewards/margins": 0.28373846411705017,
"rewards/rejected": -0.2330418825149536,
"step": 1880
},
{
"epoch": 0.9892698246532321,
"grad_norm": 73.10899677501034,
"learning_rate": 1.3358559561642556e-10,
"logits/chosen": -0.9993526339530945,
"logits/rejected": -1.0922927856445312,
"logps/chosen": -1.3391480445861816,
"logps/rejected": -1.56029212474823,
"loss": 1.3802,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.056907374411821365,
"rewards/margins": 0.32270345091819763,
"rewards/rejected": -0.265796035528183,
"step": 1890
},
{
"epoch": 0.9945040565297043,
"grad_norm": 121.55231563020173,
"learning_rate": 3.339918766844807e-11,
"logits/chosen": -0.980857253074646,
"logits/rejected": -1.0078576803207397,
"logps/chosen": -1.4365278482437134,
"logps/rejected": -1.5911202430725098,
"loss": 1.3827,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.15399186313152313,
"rewards/margins": 0.4474136233329773,
"rewards/rejected": -0.29342177510261536,
"step": 1900
},
{
"epoch": 0.9945040565297043,
"eval_logits/chosen": -1.0477176904678345,
"eval_logits/rejected": -1.074251651763916,
"eval_logps/chosen": -1.266432285308838,
"eval_logps/rejected": -1.5128096342086792,
"eval_loss": 1.3904120922088623,
"eval_rewards/accuracies": 0.7400793433189392,
"eval_rewards/chosen": 0.07070931047201157,
"eval_rewards/margins": 0.421721875667572,
"eval_rewards/rejected": -0.35101258754730225,
"eval_runtime": 263.0385,
"eval_samples_per_second": 7.603,
"eval_steps_per_second": 0.24,
"step": 1900
},
{
"epoch": 0.9997382884061764,
"grad_norm": 1926.9365225866572,
"learning_rate": 0.0,
"logits/chosen": -1.0182803869247437,
"logits/rejected": -0.9923262596130371,
"logps/chosen": -1.3955562114715576,
"logps/rejected": -1.4240152835845947,
"loss": 1.3808,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.04296259954571724,
"rewards/margins": 0.4077285826206207,
"rewards/rejected": -0.3647659718990326,
"step": 1910
},
{
"epoch": 0.9997382884061764,
"step": 1910,
"total_flos": 0.0,
"train_loss": 1.4068586224660824,
"train_runtime": 26391.8957,
"train_samples_per_second": 2.316,
"train_steps_per_second": 0.072
}
],
"logging_steps": 10,
"max_steps": 1910,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}