PEFT · Safetensors · llama-factory · lora · Generated from Trainer
Qwen2.5-32B-simpo-LoRA / trainer_state.json
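The JSON below is the raw Hugging Face Trainer state for this run. Each entry in log_history records one logged training step with its loss, gradient norm, learning rate, and the SimPO preference metrics (logps/chosen, logps/rejected, rewards/chosen, rewards/rejected, rewards/accuracies); rewards/margins is rewards/chosen minus rewards/rejected. A minimal sketch for inspecting the curves, assuming the file has been downloaded locally as trainer_state.json (the key names match the log entries below):

# Sketch: load trainer_state.json and plot loss and reward margins over steps.
import json
import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only entries that carry per-step training metrics.
entries = [e for e in state["log_history"]
           if "loss" in e and "rewards/margins" in e]
steps = [e["step"] for e in entries]
loss = [e["loss"] for e in entries]
margins = [e["rewards/margins"] for e in entries]

fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
ax1.plot(steps, loss)
ax1.set_ylabel("loss")
ax2.plot(steps, margins)
ax2.set_ylabel("rewards/margins")
ax2.set_xlabel("step")
plt.tight_layout()
plt.show()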
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998441639395356,
"eval_steps": 500,
"global_step": 401,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0024933769674302633,
"grad_norm": 133.0,
"learning_rate": 4.375e-08,
"logits/chosen": 0.9333375096321106,
"logits/rejected": 0.8665135502815247,
"logps/chosen": -1.574099063873291,
"logps/rejected": -1.2997534275054932,
"loss": 6.6643,
"rewards/accuracies": 0.46875,
"rewards/chosen": -15.74099063873291,
"rewards/margins": -2.7434558868408203,
"rewards/rejected": -12.997533798217773,
"step": 1
},
{
"epoch": 0.004986753934860527,
"grad_norm": 78.5,
"learning_rate": 8.75e-08,
"logits/chosen": 1.007162094116211,
"logits/rejected": 0.9319976568222046,
"logps/chosen": -1.5873029232025146,
"logps/rejected": -1.1813093423843384,
"loss": 6.8875,
"rewards/accuracies": 0.5,
"rewards/chosen": -15.873027801513672,
"rewards/margins": -4.059934616088867,
"rewards/rejected": -11.813094139099121,
"step": 2
},
{
"epoch": 0.0074801309022907905,
"grad_norm": 145.0,
"learning_rate": 1.3125e-07,
"logits/chosen": 1.015642523765564,
"logits/rejected": 0.8658874034881592,
"logps/chosen": -2.187445640563965,
"logps/rejected": -1.3217400312423706,
"loss": 10.8975,
"rewards/accuracies": 0.34375,
"rewards/chosen": -21.87445831298828,
"rewards/margins": -8.657057762145996,
"rewards/rejected": -13.217399597167969,
"step": 3
},
{
"epoch": 0.009973507869721053,
"grad_norm": 80.5,
"learning_rate": 1.75e-07,
"logits/chosen": 1.0409551858901978,
"logits/rejected": 0.9476256966590881,
"logps/chosen": -1.4537204504013062,
"logps/rejected": -1.1356033086776733,
"loss": 5.5006,
"rewards/accuracies": 0.5625,
"rewards/chosen": -14.537205696105957,
"rewards/margins": -3.181171417236328,
"rewards/rejected": -11.356032371520996,
"step": 4
},
{
"epoch": 0.012466884837151316,
"grad_norm": 58.75,
"learning_rate": 2.1875e-07,
"logits/chosen": 0.9139229655265808,
"logits/rejected": 1.0454109907150269,
"logps/chosen": -1.550325632095337,
"logps/rejected": -1.160091757774353,
"loss": 6.5305,
"rewards/accuracies": 0.40625,
"rewards/chosen": -15.503257751464844,
"rewards/margins": -3.902338981628418,
"rewards/rejected": -11.60091781616211,
"step": 5
},
{
"epoch": 0.014960261804581581,
"grad_norm": 123.5,
"learning_rate": 2.625e-07,
"logits/chosen": 0.9995524883270264,
"logits/rejected": 0.9891590476036072,
"logps/chosen": -2.1153974533081055,
"logps/rejected": -1.3793702125549316,
"loss": 10.0926,
"rewards/accuracies": 0.46875,
"rewards/chosen": -21.153976440429688,
"rewards/margins": -7.360274314880371,
"rewards/rejected": -13.793702125549316,
"step": 6
},
{
"epoch": 0.017453638772011844,
"grad_norm": 122.5,
"learning_rate": 3.0625e-07,
"logits/chosen": 0.9477364420890808,
"logits/rejected": 0.8998125791549683,
"logps/chosen": -2.1086533069610596,
"logps/rejected": -1.1814693212509155,
"loss": 10.9058,
"rewards/accuracies": 0.3125,
"rewards/chosen": -21.086532592773438,
"rewards/margins": -9.271841049194336,
"rewards/rejected": -11.814691543579102,
"step": 7
},
{
"epoch": 0.019947015739442107,
"grad_norm": 117.5,
"learning_rate": 3.5e-07,
"logits/chosen": 0.9074363112449646,
"logits/rejected": 1.0108835697174072,
"logps/chosen": -2.171971082687378,
"logps/rejected": -1.2922351360321045,
"loss": 11.804,
"rewards/accuracies": 0.375,
"rewards/chosen": -21.719709396362305,
"rewards/margins": -8.797359466552734,
"rewards/rejected": -12.922350883483887,
"step": 8
},
{
"epoch": 0.02244039270687237,
"grad_norm": 92.0,
"learning_rate": 3.9375e-07,
"logits/chosen": 0.9447617530822754,
"logits/rejected": 0.8549212217330933,
"logps/chosen": -2.003368616104126,
"logps/rejected": -1.2754697799682617,
"loss": 9.4959,
"rewards/accuracies": 0.40625,
"rewards/chosen": -20.033687591552734,
"rewards/margins": -7.278989791870117,
"rewards/rejected": -12.754697799682617,
"step": 9
},
{
"epoch": 0.024933769674302633,
"grad_norm": 98.5,
"learning_rate": 4.375e-07,
"logits/chosen": 1.009035587310791,
"logits/rejected": 0.895173192024231,
"logps/chosen": -1.9958442449569702,
"logps/rejected": -1.3750892877578735,
"loss": 8.6131,
"rewards/accuracies": 0.4375,
"rewards/chosen": -19.95844078063965,
"rewards/margins": -6.207549095153809,
"rewards/rejected": -13.750892639160156,
"step": 10
},
{
"epoch": 0.027427146641732895,
"grad_norm": 110.0,
"learning_rate": 4.812499999999999e-07,
"logits/chosen": 0.9430880546569824,
"logits/rejected": 0.9480469226837158,
"logps/chosen": -2.0413639545440674,
"logps/rejected": -1.3464946746826172,
"loss": 9.6928,
"rewards/accuracies": 0.40625,
"rewards/chosen": -20.413639068603516,
"rewards/margins": -6.948694229125977,
"rewards/rejected": -13.464945793151855,
"step": 11
},
{
"epoch": 0.029920523609163162,
"grad_norm": 152.0,
"learning_rate": 5.25e-07,
"logits/chosen": 0.9941633343696594,
"logits/rejected": 0.7915381193161011,
"logps/chosen": -2.5496878623962402,
"logps/rejected": -1.5264402627944946,
"loss": 12.4115,
"rewards/accuracies": 0.34375,
"rewards/chosen": -25.496877670288086,
"rewards/margins": -10.232475280761719,
"rewards/rejected": -15.26440143585205,
"step": 12
},
{
"epoch": 0.03241390057659342,
"grad_norm": 78.5,
"learning_rate": 5.6875e-07,
"logits/chosen": 0.8952471017837524,
"logits/rejected": 0.8926589488983154,
"logps/chosen": -1.597143530845642,
"logps/rejected": -1.355407476425171,
"loss": 6.8413,
"rewards/accuracies": 0.59375,
"rewards/chosen": -15.971436500549316,
"rewards/margins": -2.41736102104187,
"rewards/rejected": -13.554075241088867,
"step": 13
},
{
"epoch": 0.03490727754402369,
"grad_norm": 128.0,
"learning_rate": 6.125e-07,
"logits/chosen": 1.050255537033081,
"logits/rejected": 0.8761364221572876,
"logps/chosen": -1.834416151046753,
"logps/rejected": -1.298659324645996,
"loss": 7.9545,
"rewards/accuracies": 0.46875,
"rewards/chosen": -18.344160079956055,
"rewards/margins": -5.357568740844727,
"rewards/rejected": -12.986591339111328,
"step": 14
},
{
"epoch": 0.03740065451145395,
"grad_norm": 133.0,
"learning_rate": 6.5625e-07,
"logits/chosen": 1.0313760042190552,
"logits/rejected": 0.914068341255188,
"logps/chosen": -2.0879173278808594,
"logps/rejected": -1.2106117010116577,
"loss": 10.301,
"rewards/accuracies": 0.34375,
"rewards/chosen": -20.879173278808594,
"rewards/margins": -8.773056030273438,
"rewards/rejected": -12.10611629486084,
"step": 15
},
{
"epoch": 0.039894031478884213,
"grad_norm": 56.5,
"learning_rate": 7e-07,
"logits/chosen": 0.9663585424423218,
"logits/rejected": 0.9516808986663818,
"logps/chosen": -1.7078903913497925,
"logps/rejected": -1.2222994565963745,
"loss": 7.0132,
"rewards/accuracies": 0.4375,
"rewards/chosen": -17.078907012939453,
"rewards/margins": -4.8559112548828125,
"rewards/rejected": -12.222993850708008,
"step": 16
},
{
"epoch": 0.04238740844631448,
"grad_norm": 96.5,
"learning_rate": 6.999883476391534e-07,
"logits/chosen": 1.0192354917526245,
"logits/rejected": 0.9732477068901062,
"logps/chosen": -1.774751901626587,
"logps/rejected": -1.0946956872940063,
"loss": 8.648,
"rewards/accuracies": 0.40625,
"rewards/chosen": -17.747520446777344,
"rewards/margins": -6.800562858581543,
"rewards/rejected": -10.946956634521484,
"step": 17
},
{
"epoch": 0.04488078541374474,
"grad_norm": 121.0,
"learning_rate": 6.999533913324853e-07,
"logits/chosen": 0.981746256351471,
"logits/rejected": 0.9062566757202148,
"logps/chosen": -2.0760321617126465,
"logps/rejected": -2.2810633182525635,
"loss": 10.1792,
"rewards/accuracies": 0.3125,
"rewards/chosen": -20.76032066345215,
"rewards/margins": 2.050312042236328,
"rewards/rejected": -22.810632705688477,
"step": 18
},
{
"epoch": 0.047374162381175006,
"grad_norm": 67.0,
"learning_rate": 6.998951334075586e-07,
"logits/chosen": 1.0017695426940918,
"logits/rejected": 0.9386453032493591,
"logps/chosen": -1.5593485832214355,
"logps/rejected": -1.3584306240081787,
"loss": 5.5006,
"rewards/accuracies": 0.625,
"rewards/chosen": -15.593484878540039,
"rewards/margins": -2.0091779232025146,
"rewards/rejected": -13.584305763244629,
"step": 19
},
{
"epoch": 0.049867539348605265,
"grad_norm": 83.5,
"learning_rate": 6.998135777434723e-07,
"logits/chosen": 0.9819589853286743,
"logits/rejected": 0.9480808973312378,
"logps/chosen": -1.7974135875701904,
"logps/rejected": -1.2320420742034912,
"loss": 8.011,
"rewards/accuracies": 0.46875,
"rewards/chosen": -17.974136352539062,
"rewards/margins": -5.653716564178467,
"rewards/rejected": -12.32042121887207,
"step": 20
},
{
"epoch": 0.05236091631603553,
"grad_norm": 123.0,
"learning_rate": 6.99708729770604e-07,
"logits/chosen": 0.9151750802993774,
"logits/rejected": 0.9027111530303955,
"logps/chosen": -1.9205522537231445,
"logps/rejected": -1.6000399589538574,
"loss": 8.9355,
"rewards/accuracies": 0.40625,
"rewards/chosen": -19.205522537231445,
"rewards/margins": -3.205122232437134,
"rewards/rejected": -16.00040054321289,
"step": 21
},
{
"epoch": 0.05485429328346579,
"grad_norm": 53.25,
"learning_rate": 6.995805964702472e-07,
"logits/chosen": 0.9063746333122253,
"logits/rejected": 0.9599690437316895,
"logps/chosen": -1.5286774635314941,
"logps/rejected": -1.182100772857666,
"loss": 6.2756,
"rewards/accuracies": 0.34375,
"rewards/chosen": -15.286775588989258,
"rewards/margins": -3.4657678604125977,
"rewards/rejected": -11.821005821228027,
"step": 22
},
{
"epoch": 0.05734767025089606,
"grad_norm": 115.5,
"learning_rate": 6.994291863741474e-07,
"logits/chosen": 0.9865818619728088,
"logits/rejected": 0.8803253173828125,
"logps/chosen": -1.8937522172927856,
"logps/rejected": -1.1672096252441406,
"loss": 9.051,
"rewards/accuracies": 0.4375,
"rewards/chosen": -18.937522888183594,
"rewards/margins": -7.265425205230713,
"rewards/rejected": -11.672097206115723,
"step": 23
},
{
"epoch": 0.059841047218326324,
"grad_norm": 120.0,
"learning_rate": 6.992545095639337e-07,
"logits/chosen": 0.8972434997558594,
"logits/rejected": 0.8747442960739136,
"logps/chosen": -2.372899055480957,
"logps/rejected": -1.4153152704238892,
"loss": 11.7809,
"rewards/accuracies": 0.3125,
"rewards/chosen": -23.728988647460938,
"rewards/margins": -9.575835227966309,
"rewards/rejected": -14.153154373168945,
"step": 24
},
{
"epoch": 0.06233442418575658,
"grad_norm": 58.5,
"learning_rate": 6.990565776704475e-07,
"logits/chosen": 0.9191975593566895,
"logits/rejected": 0.908176839351654,
"logps/chosen": -1.6683969497680664,
"logps/rejected": -1.231262445449829,
"loss": 7.8375,
"rewards/accuracies": 0.4375,
"rewards/chosen": -16.683971405029297,
"rewards/margins": -4.371344566345215,
"rewards/rejected": -12.31262493133545,
"step": 25
},
{
"epoch": 0.06482780115318684,
"grad_norm": 120.5,
"learning_rate": 6.988354038729676e-07,
"logits/chosen": 0.9013136625289917,
"logits/rejected": 0.7893968820571899,
"logps/chosen": -2.127075433731079,
"logps/rejected": -1.3035414218902588,
"loss": 10.6297,
"rewards/accuracies": 0.34375,
"rewards/chosen": -21.270755767822266,
"rewards/margins": -8.23534107208252,
"rewards/rejected": -13.03541374206543,
"step": 26
},
{
"epoch": 0.06732117812061711,
"grad_norm": 82.0,
"learning_rate": 6.985910028983336e-07,
"logits/chosen": 0.9725473523139954,
"logits/rejected": 0.9624121189117432,
"logps/chosen": -2.005342483520508,
"logps/rejected": -1.2962756156921387,
"loss": 8.3075,
"rewards/accuracies": 0.1875,
"rewards/chosen": -20.053424835205078,
"rewards/margins": -7.09066915512085,
"rewards/rejected": -12.962756156921387,
"step": 27
},
{
"epoch": 0.06981455508804738,
"grad_norm": 52.25,
"learning_rate": 6.983233910199648e-07,
"logits/chosen": 0.8846550583839417,
"logits/rejected": 0.9423845410346985,
"logps/chosen": -1.6742780208587646,
"logps/rejected": -1.188732385635376,
"loss": 7.3535,
"rewards/accuracies": 0.40625,
"rewards/chosen": -16.742778778076172,
"rewards/margins": -4.8554558753967285,
"rewards/rejected": -11.887323379516602,
"step": 28
},
{
"epoch": 0.07230793205547764,
"grad_norm": 74.0,
"learning_rate": 6.98032586056776e-07,
"logits/chosen": 0.9702792167663574,
"logits/rejected": 0.8728958368301392,
"logps/chosen": -1.8141976594924927,
"logps/rejected": -1.3005653619766235,
"loss": 7.5819,
"rewards/accuracies": 0.40625,
"rewards/chosen": -18.14197540283203,
"rewards/margins": -5.136322975158691,
"rewards/rejected": -13.005653381347656,
"step": 29
},
{
"epoch": 0.0748013090229079,
"grad_norm": 115.5,
"learning_rate": 6.977186073719925e-07,
"logits/chosen": 0.855915904045105,
"logits/rejected": 0.7963756918907166,
"logps/chosen": -1.9207674264907837,
"logps/rejected": -1.16620934009552,
"loss": 9.6435,
"rewards/accuracies": 0.34375,
"rewards/chosen": -19.20767593383789,
"rewards/margins": -7.545581340789795,
"rewards/rejected": -11.662094116210938,
"step": 30
},
{
"epoch": 0.07729468599033816,
"grad_norm": 32.5,
"learning_rate": 6.973814758718596e-07,
"logits/chosen": 0.9370359182357788,
"logits/rejected": 0.896599531173706,
"logps/chosen": -1.3457211256027222,
"logps/rejected": -1.0361064672470093,
"loss": 4.8073,
"rewards/accuracies": 0.46875,
"rewards/chosen": -13.457212448120117,
"rewards/margins": -3.0961475372314453,
"rewards/rejected": -10.361063957214355,
"step": 31
},
{
"epoch": 0.07978806295776843,
"grad_norm": 68.0,
"learning_rate": 6.97021214004251e-07,
"logits/chosen": 0.8998004198074341,
"logits/rejected": 0.9092382192611694,
"logps/chosen": -1.5766998529434204,
"logps/rejected": -1.1409169435501099,
"loss": 6.4345,
"rewards/accuracies": 0.4375,
"rewards/chosen": -15.766998291015625,
"rewards/margins": -4.357827663421631,
"rewards/rejected": -11.409171104431152,
"step": 32
},
{
"epoch": 0.0822814399251987,
"grad_norm": 76.0,
"learning_rate": 6.96637845757174e-07,
"logits/chosen": 0.8456138372421265,
"logits/rejected": 0.9118346571922302,
"logps/chosen": -2.059769868850708,
"logps/rejected": -1.3298571109771729,
"loss": 9.2994,
"rewards/accuracies": 0.34375,
"rewards/chosen": -20.597698211669922,
"rewards/margins": -7.299127101898193,
"rewards/rejected": -13.29857063293457,
"step": 33
},
{
"epoch": 0.08477481689262896,
"grad_norm": 50.0,
"learning_rate": 6.962313966571722e-07,
"logits/chosen": 0.8960351347923279,
"logits/rejected": 0.8999559879302979,
"logps/chosen": -1.4601362943649292,
"logps/rejected": -1.4213840961456299,
"loss": 4.6079,
"rewards/accuracies": 0.46875,
"rewards/chosen": -14.601361274719238,
"rewards/margins": -0.3875225782394409,
"rewards/rejected": -14.21384048461914,
"step": 34
},
{
"epoch": 0.08726819386005921,
"grad_norm": 47.0,
"learning_rate": 6.958018937676262e-07,
"logits/chosen": 0.9134461879730225,
"logits/rejected": 0.8920255899429321,
"logps/chosen": -1.46458101272583,
"logps/rejected": -1.2648781538009644,
"loss": 5.2894,
"rewards/accuracies": 0.4375,
"rewards/chosen": -14.6458101272583,
"rewards/margins": -1.99702787399292,
"rewards/rejected": -12.648781776428223,
"step": 35
},
{
"epoch": 0.08976157082748948,
"grad_norm": 86.5,
"learning_rate": 6.953493656869511e-07,
"logits/chosen": 0.9218010902404785,
"logits/rejected": 0.7793766260147095,
"logps/chosen": -1.7202703952789307,
"logps/rejected": -1.3021219968795776,
"loss": 6.3929,
"rewards/accuracies": 0.40625,
"rewards/chosen": -17.20270538330078,
"rewards/margins": -4.181485176086426,
"rewards/rejected": -13.021220207214355,
"step": 36
},
{
"epoch": 0.09225494779491974,
"grad_norm": 55.0,
"learning_rate": 6.948738425466925e-07,
"logits/chosen": 0.9479645490646362,
"logits/rejected": 0.8090993762016296,
"logps/chosen": -1.6109609603881836,
"logps/rejected": -1.395875334739685,
"loss": 5.8692,
"rewards/accuracies": 0.53125,
"rewards/chosen": -16.10961151123047,
"rewards/margins": -2.1508564949035645,
"rewards/rejected": -13.95875358581543,
"step": 37
},
{
"epoch": 0.09474832476235001,
"grad_norm": 32.75,
"learning_rate": 6.943753560095204e-07,
"logits/chosen": 1.020020604133606,
"logits/rejected": 0.9307425618171692,
"logps/chosen": -1.447858452796936,
"logps/rejected": -1.0777596235275269,
"loss": 5.3116,
"rewards/accuracies": 0.375,
"rewards/chosen": -14.478584289550781,
"rewards/margins": -3.700988292694092,
"rewards/rejected": -10.777596473693848,
"step": 38
},
{
"epoch": 0.09724170172978028,
"grad_norm": 45.0,
"learning_rate": 6.938539392671203e-07,
"logits/chosen": 0.939849317073822,
"logits/rejected": 0.9025396108627319,
"logps/chosen": -1.6659669876098633,
"logps/rejected": -1.1725908517837524,
"loss": 7.0117,
"rewards/accuracies": 0.3125,
"rewards/chosen": -16.659669876098633,
"rewards/margins": -4.933763027191162,
"rewards/rejected": -11.725908279418945,
"step": 39
},
{
"epoch": 0.09973507869721053,
"grad_norm": 74.5,
"learning_rate": 6.933096270379841e-07,
"logits/chosen": 0.996893584728241,
"logits/rejected": 0.912053108215332,
"logps/chosen": -1.2696326971054077,
"logps/rejected": -1.1286218166351318,
"loss": 4.2095,
"rewards/accuracies": 0.5625,
"rewards/chosen": -12.69632625579834,
"rewards/margins": -1.410109281539917,
"rewards/rejected": -11.286218643188477,
"step": 40
},
{
"epoch": 0.1022284556646408,
"grad_norm": 47.0,
"learning_rate": 6.927424555650974e-07,
"logits/chosen": 0.9594122171401978,
"logits/rejected": 0.8550945520401001,
"logps/chosen": -1.5375633239746094,
"logps/rejected": -1.2417051792144775,
"loss": 5.0733,
"rewards/accuracies": 0.375,
"rewards/chosen": -15.375633239746094,
"rewards/margins": -2.9585819244384766,
"rewards/rejected": -12.4170503616333,
"step": 41
},
{
"epoch": 0.10472183263207106,
"grad_norm": 44.0,
"learning_rate": 6.921524626135268e-07,
"logits/chosen": 0.8996063470840454,
"logits/rejected": 0.9653378129005432,
"logps/chosen": -1.763725996017456,
"logps/rejected": -1.0993306636810303,
"loss": 8.0476,
"rewards/accuracies": 0.21875,
"rewards/chosen": -17.637258529663086,
"rewards/margins": -6.643953323364258,
"rewards/rejected": -10.993307113647461,
"step": 42
},
{
"epoch": 0.10721520959950133,
"grad_norm": 42.25,
"learning_rate": 6.915396874679055e-07,
"logits/chosen": 1.0091477632522583,
"logits/rejected": 0.9392642974853516,
"logps/chosen": -1.2002838850021362,
"logps/rejected": -1.0848746299743652,
"loss": 3.0284,
"rewards/accuracies": 0.5,
"rewards/chosen": -12.002839088439941,
"rewards/margins": -1.1540918350219727,
"rewards/rejected": -10.848746299743652,
"step": 43
},
{
"epoch": 0.10970858656693158,
"grad_norm": 40.25,
"learning_rate": 6.909041709298168e-07,
"logits/chosen": 0.8822853565216064,
"logits/rejected": 0.8290736079216003,
"logps/chosen": -1.4588274955749512,
"logps/rejected": -1.2779145240783691,
"loss": 4.9219,
"rewards/accuracies": 0.34375,
"rewards/chosen": -14.588274955749512,
"rewards/margins": -1.8091294765472412,
"rewards/rejected": -12.779145240783691,
"step": 44
},
{
"epoch": 0.11220196353436185,
"grad_norm": 56.75,
"learning_rate": 6.902459553150779e-07,
"logits/chosen": 0.9077208638191223,
"logits/rejected": 0.7896067500114441,
"logps/chosen": -1.4615594148635864,
"logps/rejected": -1.2456190586090088,
"loss": 5.1754,
"rewards/accuracies": 0.53125,
"rewards/chosen": -14.615594863891602,
"rewards/margins": -2.1594033241271973,
"rewards/rejected": -12.456191062927246,
"step": 45
},
{
"epoch": 0.11469534050179211,
"grad_norm": 67.0,
"learning_rate": 6.895650844509226e-07,
"logits/chosen": 0.9100595116615295,
"logits/rejected": 0.7619892358779907,
"logps/chosen": -1.6750259399414062,
"logps/rejected": -1.2229750156402588,
"loss": 6.2716,
"rewards/accuracies": 0.34375,
"rewards/chosen": -16.75025749206543,
"rewards/margins": -4.5205078125,
"rewards/rejected": -12.229750633239746,
"step": 46
},
{
"epoch": 0.11718871746922238,
"grad_norm": 70.0,
"learning_rate": 6.88861603673082e-07,
"logits/chosen": 0.8918619751930237,
"logits/rejected": 0.9012125134468079,
"logps/chosen": -1.64901602268219,
"logps/rejected": -1.265884518623352,
"loss": 6.6894,
"rewards/accuracies": 0.53125,
"rewards/chosen": -16.490161895751953,
"rewards/margins": -3.831315279006958,
"rewards/rejected": -12.658845901489258,
"step": 47
},
{
"epoch": 0.11968209443665265,
"grad_norm": 51.0,
"learning_rate": 6.88135559822767e-07,
"logits/chosen": 0.8720345497131348,
"logits/rejected": 0.7581244707107544,
"logps/chosen": -1.872530221939087,
"logps/rejected": -1.4055814743041992,
"loss": 6.6813,
"rewards/accuracies": 0.3125,
"rewards/chosen": -18.72530174255371,
"rewards/margins": -4.669487476348877,
"rewards/rejected": -14.055814743041992,
"step": 48
},
{
"epoch": 0.1221754714040829,
"grad_norm": 52.75,
"learning_rate": 6.873870012435486e-07,
"logits/chosen": 0.8616499900817871,
"logits/rejected": 0.7673721313476562,
"logps/chosen": -1.3419944047927856,
"logps/rejected": -1.2395105361938477,
"loss": 3.3706,
"rewards/accuracies": 0.5,
"rewards/chosen": -13.419943809509277,
"rewards/margins": -1.0248385667800903,
"rewards/rejected": -12.395105361938477,
"step": 49
},
{
"epoch": 0.12466884837151317,
"grad_norm": 56.25,
"learning_rate": 6.866159777781393e-07,
"logits/chosen": 0.8702710866928101,
"logits/rejected": 0.7436060309410095,
"logps/chosen": -1.6595778465270996,
"logps/rejected": -1.1382110118865967,
"loss": 6.7412,
"rewards/accuracies": 0.34375,
"rewards/chosen": -16.595779418945312,
"rewards/margins": -5.213669300079346,
"rewards/rejected": -11.382110595703125,
"step": 50
},
{
"epoch": 0.12716222533894342,
"grad_norm": 50.75,
"learning_rate": 6.858225407650741e-07,
"logits/chosen": 0.7868949174880981,
"logits/rejected": 0.8334120512008667,
"logps/chosen": -1.7013866901397705,
"logps/rejected": -1.3084328174591064,
"loss": 6.2144,
"rewards/accuracies": 0.34375,
"rewards/chosen": -17.013864517211914,
"rewards/margins": -3.929537296295166,
"rewards/rejected": -13.084327697753906,
"step": 51
},
{
"epoch": 0.12965560230637369,
"grad_norm": 65.0,
"learning_rate": 6.850067430352923e-07,
"logits/chosen": 0.8779257535934448,
"logits/rejected": 0.7302612066268921,
"logps/chosen": -1.9540197849273682,
"logps/rejected": -1.4614614248275757,
"loss": 6.5912,
"rewards/accuracies": 0.3125,
"rewards/chosen": -19.540199279785156,
"rewards/margins": -4.925583362579346,
"rewards/rejected": -14.61461353302002,
"step": 52
},
{
"epoch": 0.13214897927380395,
"grad_norm": 79.0,
"learning_rate": 6.8416863890862e-07,
"logits/chosen": 0.91861492395401,
"logits/rejected": 0.8185287714004517,
"logps/chosen": -1.457578182220459,
"logps/rejected": -1.275127649307251,
"loss": 5.29,
"rewards/accuracies": 0.40625,
"rewards/chosen": -14.575782775878906,
"rewards/margins": -1.8245068788528442,
"rewards/rejected": -12.751276016235352,
"step": 53
},
{
"epoch": 0.13464235624123422,
"grad_norm": 25.625,
"learning_rate": 6.833082841901524e-07,
"logits/chosen": 0.8008706569671631,
"logits/rejected": 0.7791386246681213,
"logps/chosen": -1.2828067541122437,
"logps/rejected": -1.14899480342865,
"loss": 3.6665,
"rewards/accuracies": 0.5625,
"rewards/chosen": -12.828067779541016,
"rewards/margins": -1.3381190299987793,
"rewards/rejected": -11.489947319030762,
"step": 54
},
{
"epoch": 0.13713573320866448,
"grad_norm": 37.5,
"learning_rate": 6.82425736166539e-07,
"logits/chosen": 0.8428397178649902,
"logits/rejected": 0.819983720779419,
"logps/chosen": -1.5656299591064453,
"logps/rejected": -1.6019946336746216,
"loss": 5.8169,
"rewards/accuracies": 0.3125,
"rewards/chosen": -15.656298637390137,
"rewards/margins": 0.3636472821235657,
"rewards/rejected": -16.019947052001953,
"step": 55
},
{
"epoch": 0.13962911017609475,
"grad_norm": 43.75,
"learning_rate": 6.815210536021685e-07,
"logits/chosen": 0.7473218441009521,
"logits/rejected": 0.7424555420875549,
"logps/chosen": -1.4687354564666748,
"logps/rejected": -1.2596654891967773,
"loss": 5.3807,
"rewards/accuracies": 0.46875,
"rewards/chosen": -14.687355041503906,
"rewards/margins": -2.090701103210449,
"rewards/rejected": -12.596653938293457,
"step": 56
},
{
"epoch": 0.14212248714352502,
"grad_norm": 33.5,
"learning_rate": 6.805942967352563e-07,
"logits/chosen": 0.8693878650665283,
"logits/rejected": 0.8091084361076355,
"logps/chosen": -1.4544310569763184,
"logps/rejected": -1.1222821474075317,
"loss": 5.2342,
"rewards/accuracies": 0.3125,
"rewards/chosen": -14.544310569763184,
"rewards/margins": -3.3214893341064453,
"rewards/rejected": -11.222820281982422,
"step": 57
},
{
"epoch": 0.14461586411095528,
"grad_norm": 60.75,
"learning_rate": 6.796455272738337e-07,
"logits/chosen": 0.8443146347999573,
"logits/rejected": 0.7834912538528442,
"logps/chosen": -1.630685806274414,
"logps/rejected": -2.097377061843872,
"loss": 5.0217,
"rewards/accuracies": 0.28125,
"rewards/chosen": -16.306856155395508,
"rewards/margins": 4.6669135093688965,
"rewards/rejected": -20.973772048950195,
"step": 58
},
{
"epoch": 0.14710924107838555,
"grad_norm": 34.0,
"learning_rate": 6.78674808391638e-07,
"logits/chosen": 0.7124283313751221,
"logits/rejected": 0.7266104221343994,
"logps/chosen": -1.5309463739395142,
"logps/rejected": -1.204602599143982,
"loss": 4.9925,
"rewards/accuracies": 0.3125,
"rewards/chosen": -15.309463500976562,
"rewards/margins": -3.263436794281006,
"rewards/rejected": -12.046026229858398,
"step": 59
},
{
"epoch": 0.1496026180458158,
"grad_norm": 31.25,
"learning_rate": 6.776822047239079e-07,
"logits/chosen": 0.810710608959198,
"logits/rejected": 0.7433085441589355,
"logps/chosen": -1.3407872915267944,
"logps/rejected": -1.1019080877304077,
"loss": 4.0638,
"rewards/accuracies": 0.4375,
"rewards/chosen": -13.407873153686523,
"rewards/margins": -2.3887932300567627,
"rewards/rejected": -11.019081115722656,
"step": 60
},
{
"epoch": 0.15209599501324605,
"grad_norm": 35.75,
"learning_rate": 6.766677823630784e-07,
"logits/chosen": 0.9204759001731873,
"logits/rejected": 0.8126802444458008,
"logps/chosen": -1.3521380424499512,
"logps/rejected": -1.230940341949463,
"loss": 3.1759,
"rewards/accuracies": 0.46875,
"rewards/chosen": -13.521379470825195,
"rewards/margins": -1.2119766473770142,
"rewards/rejected": -12.309402465820312,
"step": 61
},
{
"epoch": 0.15458937198067632,
"grad_norm": 74.5,
"learning_rate": 6.756316088543799e-07,
"logits/chosen": 0.8732976317405701,
"logits/rejected": 0.7553092837333679,
"logps/chosen": -1.6522544622421265,
"logps/rejected": -1.304805874824524,
"loss": 5.2966,
"rewards/accuracies": 0.3125,
"rewards/chosen": -16.522544860839844,
"rewards/margins": -3.474484920501709,
"rewards/rejected": -13.048059463500977,
"step": 62
},
{
"epoch": 0.1570827489481066,
"grad_norm": 32.25,
"learning_rate": 6.74573753191342e-07,
"logits/chosen": 0.8279662728309631,
"logits/rejected": 0.7906845808029175,
"logps/chosen": -1.3082658052444458,
"logps/rejected": -1.215187907218933,
"loss": 3.2516,
"rewards/accuracies": 0.40625,
"rewards/chosen": -13.082658767700195,
"rewards/margins": -0.9307788610458374,
"rewards/rejected": -12.15187931060791,
"step": 63
},
{
"epoch": 0.15957612591553685,
"grad_norm": 30.0,
"learning_rate": 6.734942858111986e-07,
"logits/chosen": 0.8267450332641602,
"logits/rejected": 0.7294779419898987,
"logps/chosen": -1.272381067276001,
"logps/rejected": -1.2460516691207886,
"loss": 3.548,
"rewards/accuracies": 0.59375,
"rewards/chosen": -12.723810195922852,
"rewards/margins": -0.26329320669174194,
"rewards/rejected": -12.460516929626465,
"step": 64
},
{
"epoch": 0.16206950288296712,
"grad_norm": 50.75,
"learning_rate": 6.723932785901975e-07,
"logits/chosen": 0.9013331532478333,
"logits/rejected": 0.8166715502738953,
"logps/chosen": -1.563563346862793,
"logps/rejected": -1.2308857440948486,
"loss": 4.8669,
"rewards/accuracies": 0.34375,
"rewards/chosen": -15.63563346862793,
"rewards/margins": -3.3267745971679688,
"rewards/rejected": -12.308857917785645,
"step": 65
},
{
"epoch": 0.1645628798503974,
"grad_norm": 18.75,
"learning_rate": 6.712708048388158e-07,
"logits/chosen": 0.833111047744751,
"logits/rejected": 0.7176869511604309,
"logps/chosen": -1.2247819900512695,
"logps/rejected": -1.460402488708496,
"loss": 2.2855,
"rewards/accuracies": 0.6875,
"rewards/chosen": -12.247820854187012,
"rewards/margins": 2.3562047481536865,
"rewards/rejected": -14.604024887084961,
"step": 66
},
{
"epoch": 0.16705625681782765,
"grad_norm": 41.75,
"learning_rate": 6.701269392968773e-07,
"logits/chosen": 0.8795142769813538,
"logits/rejected": 0.7385881543159485,
"logps/chosen": -1.5149438381195068,
"logps/rejected": -1.4080404043197632,
"loss": 3.8984,
"rewards/accuracies": 0.53125,
"rewards/chosen": -15.14943790435791,
"rewards/margins": -1.0690345764160156,
"rewards/rejected": -14.080402374267578,
"step": 67
},
{
"epoch": 0.16954963378525792,
"grad_norm": 37.0,
"learning_rate": 6.689617581285765e-07,
"logits/chosen": 0.8711040616035461,
"logits/rejected": 0.6986596584320068,
"logps/chosen": -1.6020938158035278,
"logps/rejected": -1.350814938545227,
"loss": 4.9801,
"rewards/accuracies": 0.40625,
"rewards/chosen": -16.02094078063965,
"rewards/margins": -2.5127904415130615,
"rewards/rejected": -13.508148193359375,
"step": 68
},
{
"epoch": 0.17204301075268819,
"grad_norm": 45.25,
"learning_rate": 6.677753389174075e-07,
"logits/chosen": 0.9395517706871033,
"logits/rejected": 0.7759240865707397,
"logps/chosen": -1.5319843292236328,
"logps/rejected": -1.3424811363220215,
"loss": 5.1017,
"rewards/accuracies": 0.5625,
"rewards/chosen": -15.319843292236328,
"rewards/margins": -1.8950309753417969,
"rewards/rejected": -13.424812316894531,
"step": 69
},
{
"epoch": 0.17453638772011842,
"grad_norm": 35.0,
"learning_rate": 6.665677606609973e-07,
"logits/chosen": 0.8715901374816895,
"logits/rejected": 0.8017496466636658,
"logps/chosen": -1.5180387496948242,
"logps/rejected": -1.3085150718688965,
"loss": 4.5983,
"rewards/accuracies": 0.4375,
"rewards/chosen": -15.180386543273926,
"rewards/margins": -2.095235824584961,
"rewards/rejected": -13.085149765014648,
"step": 70
},
{
"epoch": 0.1770297646875487,
"grad_norm": 42.75,
"learning_rate": 6.653391037658466e-07,
"logits/chosen": 0.8521101474761963,
"logits/rejected": 0.7697039246559143,
"logps/chosen": -1.5679848194122314,
"logps/rejected": -1.3208036422729492,
"loss": 4.539,
"rewards/accuracies": 0.375,
"rewards/chosen": -15.679848670959473,
"rewards/margins": -2.471813201904297,
"rewards/rejected": -13.208035469055176,
"step": 71
},
{
"epoch": 0.17952314165497896,
"grad_norm": 46.5,
"learning_rate": 6.640894500419754e-07,
"logits/chosen": 0.9186801314353943,
"logits/rejected": 0.7545082569122314,
"logps/chosen": -1.5185034275054932,
"logps/rejected": -1.2024480104446411,
"loss": 5.072,
"rewards/accuracies": 0.3125,
"rewards/chosen": -15.185033798217773,
"rewards/margins": -3.160555601119995,
"rewards/rejected": -12.024478912353516,
"step": 72
},
{
"epoch": 0.18201651862240922,
"grad_norm": 23.375,
"learning_rate": 6.628188826974758e-07,
"logits/chosen": 0.8491867780685425,
"logits/rejected": 0.7822642922401428,
"logps/chosen": -1.1384882926940918,
"logps/rejected": -1.1687763929367065,
"loss": 2.5731,
"rewards/accuracies": 0.5,
"rewards/chosen": -11.384883880615234,
"rewards/margins": 0.3028792440891266,
"rewards/rejected": -11.687764167785645,
"step": 73
},
{
"epoch": 0.1845098955898395,
"grad_norm": 33.0,
"learning_rate": 6.615274863329715e-07,
"logits/chosen": 0.9214451909065247,
"logits/rejected": 0.75420743227005,
"logps/chosen": -1.5405924320220947,
"logps/rejected": -1.6105780601501465,
"loss": 2.1388,
"rewards/accuracies": 0.53125,
"rewards/chosen": -15.405925750732422,
"rewards/margins": 0.6998560428619385,
"rewards/rejected": -16.10578155517578,
"step": 74
},
{
"epoch": 0.18700327255726976,
"grad_norm": 39.5,
"learning_rate": 6.602153469359852e-07,
"logits/chosen": 0.905957043170929,
"logits/rejected": 0.7297846078872681,
"logps/chosen": -1.4268043041229248,
"logps/rejected": -1.3724501132965088,
"loss": 2.9419,
"rewards/accuracies": 0.4375,
"rewards/chosen": -14.268043518066406,
"rewards/margins": -0.5435430407524109,
"rewards/rejected": -13.72450065612793,
"step": 75
},
{
"epoch": 0.18949664952470002,
"grad_norm": 34.5,
"learning_rate": 6.588825518752124e-07,
"logits/chosen": 0.9336991310119629,
"logits/rejected": 0.7428035736083984,
"logps/chosen": -1.4046045541763306,
"logps/rejected": -1.1234869956970215,
"loss": 4.3453,
"rewards/accuracies": 0.34375,
"rewards/chosen": -14.046045303344727,
"rewards/margins": -2.8111753463745117,
"rewards/rejected": -11.234869003295898,
"step": 76
},
{
"epoch": 0.1919900264921303,
"grad_norm": 40.75,
"learning_rate": 6.575291898947046e-07,
"logits/chosen": 0.8886721134185791,
"logits/rejected": 0.6734512448310852,
"logps/chosen": -1.4164619445800781,
"logps/rejected": -1.3936028480529785,
"loss": 3.5175,
"rewards/accuracies": 0.5,
"rewards/chosen": -14.164620399475098,
"rewards/margins": -0.22859010100364685,
"rewards/rejected": -13.936028480529785,
"step": 77
},
{
"epoch": 0.19448340345956056,
"grad_norm": 41.75,
"learning_rate": 6.561553511079596e-07,
"logits/chosen": 0.829595148563385,
"logits/rejected": 0.6838914155960083,
"logps/chosen": -1.5907689332962036,
"logps/rejected": -1.4232121706008911,
"loss": 4.2889,
"rewards/accuracies": 0.375,
"rewards/chosen": -15.907690048217773,
"rewards/margins": -1.675569772720337,
"rewards/rejected": -14.232120513916016,
"step": 78
},
{
"epoch": 0.1969767804269908,
"grad_norm": 34.5,
"learning_rate": 6.54761126991922e-07,
"logits/chosen": 0.9110915660858154,
"logits/rejected": 0.6820324063301086,
"logps/chosen": -1.547090768814087,
"logps/rejected": -1.3237884044647217,
"loss": 4.8103,
"rewards/accuracies": 0.40625,
"rewards/chosen": -15.470909118652344,
"rewards/margins": -2.2330236434936523,
"rewards/rejected": -13.237884521484375,
"step": 79
},
{
"epoch": 0.19947015739442106,
"grad_norm": 43.75,
"learning_rate": 6.533466103808918e-07,
"logits/chosen": 0.8135228157043457,
"logits/rejected": 0.7062645554542542,
"logps/chosen": -1.5417137145996094,
"logps/rejected": -1.3554742336273193,
"loss": 5.119,
"rewards/accuracies": 0.46875,
"rewards/chosen": -15.417137145996094,
"rewards/margins": -1.8623945713043213,
"rewards/rejected": -13.554742813110352,
"step": 80
},
{
"epoch": 0.20196353436185133,
"grad_norm": 64.5,
"learning_rate": 6.519118954603431e-07,
"logits/chosen": 0.818507194519043,
"logits/rejected": 0.7929250001907349,
"logps/chosen": -1.6561720371246338,
"logps/rejected": -1.3400187492370605,
"loss": 5.316,
"rewards/accuracies": 0.3125,
"rewards/chosen": -16.56171989440918,
"rewards/margins": -3.161534309387207,
"rewards/rejected": -13.400186538696289,
"step": 81
},
{
"epoch": 0.2044569113292816,
"grad_norm": 22.75,
"learning_rate": 6.504570777606531e-07,
"logits/chosen": 0.8459409475326538,
"logits/rejected": 0.7011772990226746,
"logps/chosen": -1.3367918729782104,
"logps/rejected": -1.2118648290634155,
"loss": 3.5423,
"rewards/accuracies": 0.53125,
"rewards/chosen": -13.367918014526367,
"rewards/margins": -1.249271273612976,
"rewards/rejected": -12.118647575378418,
"step": 82
},
{
"epoch": 0.20695028829671186,
"grad_norm": 25.875,
"learning_rate": 6.489822541507404e-07,
"logits/chosen": 0.8798666596412659,
"logits/rejected": 0.7069228887557983,
"logps/chosen": -1.1269609928131104,
"logps/rejected": -1.1012687683105469,
"loss": 2.5165,
"rewards/accuracies": 0.53125,
"rewards/chosen": -11.269609451293945,
"rewards/margins": -0.25692227482795715,
"rewards/rejected": -11.012688636779785,
"step": 83
},
{
"epoch": 0.20944366526414213,
"grad_norm": 30.875,
"learning_rate": 6.474875228316158e-07,
"logits/chosen": 0.9361159801483154,
"logits/rejected": 0.8077545762062073,
"logps/chosen": -1.4038376808166504,
"logps/rejected": -1.357134222984314,
"loss": 3.4071,
"rewards/accuracies": 0.46875,
"rewards/chosen": -14.038376808166504,
"rewards/margins": -0.4670344293117523,
"rewards/rejected": -13.571342468261719,
"step": 84
},
{
"epoch": 0.2119370422315724,
"grad_norm": 20.625,
"learning_rate": 6.459729833298434e-07,
"logits/chosen": 0.7581954002380371,
"logits/rejected": 0.7710189819335938,
"logps/chosen": -1.2664942741394043,
"logps/rejected": -1.2973535060882568,
"loss": 3.0325,
"rewards/accuracies": 0.59375,
"rewards/chosen": -12.66494369506836,
"rewards/margins": 0.3085915148258209,
"rewards/rejected": -12.973533630371094,
"step": 85
},
{
"epoch": 0.21443041919900266,
"grad_norm": 38.25,
"learning_rate": 6.444387364909134e-07,
"logits/chosen": 0.8360967636108398,
"logits/rejected": 0.7465887069702148,
"logps/chosen": -1.4347429275512695,
"logps/rejected": -1.4330288171768188,
"loss": 3.0653,
"rewards/accuracies": 0.5,
"rewards/chosen": -14.347427368164062,
"rewards/margins": -0.01714131236076355,
"rewards/rejected": -14.33028793334961,
"step": 86
},
{
"epoch": 0.21692379616643293,
"grad_norm": 25.375,
"learning_rate": 6.428848844725274e-07,
"logits/chosen": 0.7691155672073364,
"logits/rejected": 0.6017144322395325,
"logps/chosen": -1.2951093912124634,
"logps/rejected": -1.3574622869491577,
"loss": 2.8385,
"rewards/accuracies": 0.5625,
"rewards/chosen": -12.951093673706055,
"rewards/margins": 0.623528778553009,
"rewards/rejected": -13.574623107910156,
"step": 87
},
{
"epoch": 0.21941717313386316,
"grad_norm": 48.0,
"learning_rate": 6.413115307377965e-07,
"logits/chosen": 0.8395971059799194,
"logits/rejected": 0.6882689595222473,
"logps/chosen": -1.4701257944107056,
"logps/rejected": -1.4139220714569092,
"loss": 3.306,
"rewards/accuracies": 0.4375,
"rewards/chosen": -14.701258659362793,
"rewards/margins": -0.5620384216308594,
"rewards/rejected": -14.1392183303833,
"step": 88
},
{
"epoch": 0.22191055010129343,
"grad_norm": 31.625,
"learning_rate": 6.397187800483519e-07,
"logits/chosen": 0.8466267585754395,
"logits/rejected": 0.6940711140632629,
"logps/chosen": -1.4214099645614624,
"logps/rejected": -1.3584471940994263,
"loss": 2.831,
"rewards/accuracies": 0.59375,
"rewards/chosen": -14.214098930358887,
"rewards/margins": -0.6296274662017822,
"rewards/rejected": -13.58447265625,
"step": 89
},
{
"epoch": 0.2244039270687237,
"grad_norm": 33.5,
"learning_rate": 6.381067384573693e-07,
"logits/chosen": 0.8270119428634644,
"logits/rejected": 0.65580815076828,
"logps/chosen": -1.4739896059036255,
"logps/rejected": -1.2760796546936035,
"loss": 3.8132,
"rewards/accuracies": 0.40625,
"rewards/chosen": -14.739895820617676,
"rewards/margins": -1.9790987968444824,
"rewards/rejected": -12.760797500610352,
"step": 90
},
{
"epoch": 0.22689730403615396,
"grad_norm": 27.375,
"learning_rate": 6.364755133025077e-07,
"logits/chosen": 0.8560658693313599,
"logits/rejected": 0.6389474868774414,
"logps/chosen": -1.2929621934890747,
"logps/rejected": -1.887449860572815,
"loss": 2.5581,
"rewards/accuracies": 0.59375,
"rewards/chosen": -12.929622650146484,
"rewards/margins": 5.944877624511719,
"rewards/rejected": -18.87449836730957,
"step": 91
},
{
"epoch": 0.22939068100358423,
"grad_norm": 51.25,
"learning_rate": 6.348252131987621e-07,
"logits/chosen": 0.9491753578186035,
"logits/rejected": 0.5920038819313049,
"logps/chosen": -1.7384018898010254,
"logps/rejected": -1.4728763103485107,
"loss": 4.2467,
"rewards/accuracies": 0.4375,
"rewards/chosen": -17.38401985168457,
"rewards/margins": -2.6552560329437256,
"rewards/rejected": -14.728763580322266,
"step": 92
},
{
"epoch": 0.2318840579710145,
"grad_norm": 41.75,
"learning_rate": 6.331559480312316e-07,
"logits/chosen": 0.8945069313049316,
"logits/rejected": 0.6501726508140564,
"logps/chosen": -1.6423015594482422,
"logps/rejected": -1.5272799730300903,
"loss": 3.7742,
"rewards/accuracies": 0.46875,
"rewards/chosen": -16.423015594482422,
"rewards/margins": -1.150214433670044,
"rewards/rejected": -15.272799491882324,
"step": 93
},
{
"epoch": 0.23437743493844476,
"grad_norm": 27.875,
"learning_rate": 6.314678289478021e-07,
"logits/chosen": 0.868090033531189,
"logits/rejected": 0.7094947695732117,
"logps/chosen": -1.3805748224258423,
"logps/rejected": -1.3794375658035278,
"loss": 2.4021,
"rewards/accuracies": 0.5,
"rewards/chosen": -13.805749893188477,
"rewards/margins": -0.011373043060302734,
"rewards/rejected": -13.794376373291016,
"step": 94
},
{
"epoch": 0.23687081190587503,
"grad_norm": 30.625,
"learning_rate": 6.297609683517465e-07,
"logits/chosen": 0.9310474395751953,
"logits/rejected": 0.7228609323501587,
"logps/chosen": -1.339646339416504,
"logps/rejected": -1.4403069019317627,
"loss": 2.1866,
"rewards/accuracies": 0.65625,
"rewards/chosen": -13.396462440490723,
"rewards/margins": 1.0066064596176147,
"rewards/rejected": -14.403070449829102,
"step": 95
},
{
"epoch": 0.2393641888733053,
"grad_norm": 27.875,
"learning_rate": 6.280354798942394e-07,
"logits/chosen": 0.8475272059440613,
"logits/rejected": 0.7736526727676392,
"logps/chosen": -1.3078885078430176,
"logps/rejected": -1.3328254222869873,
"loss": 2.2729,
"rewards/accuracies": 0.5625,
"rewards/chosen": -13.078886985778809,
"rewards/margins": 0.24936795234680176,
"rewards/rejected": -13.328254699707031,
"step": 96
},
{
"epoch": 0.24185756584073553,
"grad_norm": 23.125,
"learning_rate": 6.262914784667902e-07,
"logits/chosen": 0.8516014814376831,
"logits/rejected": 0.6699912548065186,
"logps/chosen": -1.2598787546157837,
"logps/rejected": -1.2950444221496582,
"loss": 3.0957,
"rewards/accuracies": 0.625,
"rewards/chosen": -12.598786354064941,
"rewards/margins": 0.35165655612945557,
"rewards/rejected": -12.950444221496582,
"step": 97
},
{
"epoch": 0.2443509428081658,
"grad_norm": 37.75,
"learning_rate": 6.245290801935929e-07,
"logits/chosen": 0.8076661229133606,
"logits/rejected": 0.6437760591506958,
"logps/chosen": -1.4585728645324707,
"logps/rejected": -1.3383572101593018,
"loss": 4.0758,
"rewards/accuracies": 0.40625,
"rewards/chosen": -14.58572769165039,
"rewards/margins": -1.2021559476852417,
"rewards/rejected": -13.38357162475586,
"step": 98
},
{
"epoch": 0.24684431977559607,
"grad_norm": 22.875,
"learning_rate": 6.227484024237941e-07,
"logits/chosen": 0.8829818367958069,
"logits/rejected": 0.6302488446235657,
"logps/chosen": -1.3252794742584229,
"logps/rejected": -1.350675344467163,
"loss": 2.156,
"rewards/accuracies": 0.59375,
"rewards/chosen": -13.252795219421387,
"rewards/margins": 0.2539580166339874,
"rewards/rejected": -13.506753921508789,
"step": 99
},
{
"epoch": 0.24933769674302633,
"grad_norm": 42.25,
"learning_rate": 6.209495637236789e-07,
"logits/chosen": 0.7620182037353516,
"logits/rejected": 0.7404342889785767,
"logps/chosen": -1.7423349618911743,
"logps/rejected": -1.6396631002426147,
"loss": 4.6426,
"rewards/accuracies": 0.53125,
"rewards/chosen": -17.423349380493164,
"rewards/margins": -1.026718258857727,
"rewards/rejected": -16.396631240844727,
"step": 100
},
{
"epoch": 0.2518310737104566,
"grad_norm": 44.25,
"learning_rate": 6.191326838687767e-07,
"logits/chosen": 0.8130788803100586,
"logits/rejected": 0.6447663307189941,
"logps/chosen": -1.6222593784332275,
"logps/rejected": -1.539342999458313,
"loss": 3.8946,
"rewards/accuracies": 0.59375,
"rewards/chosen": -16.222591400146484,
"rewards/margins": -0.8291639089584351,
"rewards/rejected": -15.393428802490234,
"step": 101
},
{
"epoch": 0.25432445067788684,
"grad_norm": 23.25,
"learning_rate": 6.172978838358858e-07,
"logits/chosen": 0.8688798546791077,
"logits/rejected": 0.7208373546600342,
"logps/chosen": -1.2495828866958618,
"logps/rejected": -1.16335129737854,
"loss": 3.3786,
"rewards/accuracies": 0.53125,
"rewards/chosen": -12.495828628540039,
"rewards/margins": -0.8623146414756775,
"rewards/rejected": -11.633513450622559,
"step": 102
},
{
"epoch": 0.25681782764531713,
"grad_norm": 28.625,
"learning_rate": 6.154452857950179e-07,
"logits/chosen": 0.867901086807251,
"logits/rejected": 0.6571163535118103,
"logps/chosen": -1.4274240732192993,
"logps/rejected": -1.2099568843841553,
"loss": 4.0273,
"rewards/accuracies": 0.40625,
"rewards/chosen": -14.274239540100098,
"rewards/margins": -2.174670934677124,
"rewards/rejected": -12.099568367004395,
"step": 103
},
{
"epoch": 0.25931120461274737,
"grad_norm": 17.5,
"learning_rate": 6.135750131012639e-07,
"logits/chosen": 0.8423357009887695,
"logits/rejected": 0.7953418493270874,
"logps/chosen": -1.1816288232803345,
"logps/rejected": -1.4284311532974243,
"loss": 1.5765,
"rewards/accuracies": 0.84375,
"rewards/chosen": -11.816287994384766,
"rewards/margins": 2.468022346496582,
"rewards/rejected": -14.284311294555664,
"step": 104
},
{
"epoch": 0.26180458158017766,
"grad_norm": 48.0,
"learning_rate": 6.116871902865795e-07,
"logits/chosen": 0.7953894138336182,
"logits/rejected": 0.6910791993141174,
"logps/chosen": -1.4984780550003052,
"logps/rejected": -1.359675645828247,
"loss": 4.2421,
"rewards/accuracies": 0.46875,
"rewards/chosen": -14.984780311584473,
"rewards/margins": -1.3880234956741333,
"rewards/rejected": -13.596756935119629,
"step": 105
},
{
"epoch": 0.2642979585476079,
"grad_norm": 14.5,
"learning_rate": 6.097819430514944e-07,
"logits/chosen": 0.8314008712768555,
"logits/rejected": 0.6421066522598267,
"logps/chosen": -1.1923877000808716,
"logps/rejected": -1.403716802597046,
"loss": 1.3615,
"rewards/accuracies": 0.6875,
"rewards/chosen": -11.92387580871582,
"rewards/margins": 2.1132919788360596,
"rewards/rejected": -14.037168502807617,
"step": 106
},
{
"epoch": 0.2667913355150382,
"grad_norm": 41.25,
"learning_rate": 6.078593982567416e-07,
"logits/chosen": 0.9006607532501221,
"logits/rejected": 0.7951247096061707,
"logps/chosen": -1.5271791219711304,
"logps/rejected": -1.3939223289489746,
"loss": 3.7453,
"rewards/accuracies": 0.5,
"rewards/chosen": -15.271790504455566,
"rewards/margins": -1.332566738128662,
"rewards/rejected": -13.939225196838379,
"step": 107
},
{
"epoch": 0.26928471248246844,
"grad_norm": 47.25,
"learning_rate": 6.059196839148109e-07,
"logits/chosen": 0.7548659443855286,
"logits/rejected": 0.6844202280044556,
"logps/chosen": -1.4953826665878296,
"logps/rejected": -1.211591362953186,
"loss": 5.099,
"rewards/accuracies": 0.375,
"rewards/chosen": -14.953826904296875,
"rewards/margins": -2.8379130363464355,
"rewards/rejected": -12.115914344787598,
"step": 108
},
{
"epoch": 0.27177808944989873,
"grad_norm": 26.75,
"learning_rate": 6.039629291814247e-07,
"logits/chosen": 0.7883430123329163,
"logits/rejected": 0.6593764424324036,
"logps/chosen": -1.4129087924957275,
"logps/rejected": -1.6393111944198608,
"loss": 2.0234,
"rewards/accuracies": 0.71875,
"rewards/chosen": -14.12908935546875,
"rewards/margins": 2.2640252113342285,
"rewards/rejected": -16.39311408996582,
"step": 109
},
{
"epoch": 0.27427146641732897,
"grad_norm": 47.25,
"learning_rate": 6.019892643469387e-07,
"logits/chosen": 0.8495079874992371,
"logits/rejected": 0.7186658978462219,
"logps/chosen": -1.4737249612808228,
"logps/rejected": -1.3164616823196411,
"loss": 3.8864,
"rewards/accuracies": 0.4375,
"rewards/chosen": -14.737249374389648,
"rewards/margins": -1.5726318359375,
"rewards/rejected": -13.164617538452148,
"step": 110
},
{
"epoch": 0.2767648433847592,
"grad_norm": 60.5,
"learning_rate": 5.999988208276662e-07,
"logits/chosen": 0.8825462460517883,
"logits/rejected": 0.6535596251487732,
"logps/chosen": -1.5816519260406494,
"logps/rejected": -1.498726725578308,
"loss": 3.1086,
"rewards/accuracies": 0.40625,
"rewards/chosen": -15.816520690917969,
"rewards/margins": -0.8292534351348877,
"rewards/rejected": -14.987266540527344,
"step": 111
},
{
"epoch": 0.2792582203521895,
"grad_norm": 54.0,
"learning_rate": 5.979917311571282e-07,
"logits/chosen": 0.8688668012619019,
"logits/rejected": 0.5492098927497864,
"logps/chosen": -1.4838958978652954,
"logps/rejected": -1.6213023662567139,
"loss": 2.3478,
"rewards/accuracies": 0.4375,
"rewards/chosen": -14.838959693908691,
"rewards/margins": 1.3740637302398682,
"rewards/rejected": -16.213022232055664,
"step": 112
},
{
"epoch": 0.28175159731961974,
"grad_norm": 46.5,
"learning_rate": 5.959681289772278e-07,
"logits/chosen": 0.842609703540802,
"logits/rejected": 0.6387814283370972,
"logps/chosen": -1.5294029712677002,
"logps/rejected": -1.7203947305679321,
"loss": 2.5737,
"rewards/accuracies": 0.5625,
"rewards/chosen": -15.294027328491211,
"rewards/margins": 1.9099199771881104,
"rewards/rejected": -17.203948974609375,
"step": 113
},
{
"epoch": 0.28424497428705003,
"grad_norm": 22.875,
"learning_rate": 5.939281490293527e-07,
"logits/chosen": 0.7885753512382507,
"logits/rejected": 0.7003703713417053,
"logps/chosen": -1.6169934272766113,
"logps/rejected": -1.616774082183838,
"loss": 3.128,
"rewards/accuracies": 0.53125,
"rewards/chosen": -16.169931411743164,
"rewards/margins": -0.002192378044128418,
"rewards/rejected": -16.167739868164062,
"step": 114
},
{
"epoch": 0.2867383512544803,
"grad_norm": 125.5,
"learning_rate": 5.918719271454026e-07,
"logits/chosen": 0.8902820944786072,
"logits/rejected": 0.6495590806007385,
"logps/chosen": -1.7944972515106201,
"logps/rejected": -1.6713979244232178,
"loss": 3.4653,
"rewards/accuracies": 0.40625,
"rewards/chosen": -17.94497299194336,
"rewards/margins": -1.2309918403625488,
"rewards/rejected": -16.71398162841797,
"step": 115
},
{
"epoch": 0.28923172822191057,
"grad_norm": 12.5,
"learning_rate": 5.897996002387454e-07,
"logits/chosen": 0.9350267648696899,
"logits/rejected": 0.7698911428451538,
"logps/chosen": -1.3168952465057373,
"logps/rejected": -1.5409971475601196,
"loss": 2.018,
"rewards/accuracies": 0.71875,
"rewards/chosen": -13.168952941894531,
"rewards/margins": 2.241018533706665,
"rewards/rejected": -15.409971237182617,
"step": 116
},
{
"epoch": 0.2917251051893408,
"grad_norm": 35.25,
"learning_rate": 5.877113062951007e-07,
"logits/chosen": 0.9151044487953186,
"logits/rejected": 0.7181938886642456,
"logps/chosen": -1.3629308938980103,
"logps/rejected": -2.3142240047454834,
"loss": 2.7597,
"rewards/accuracies": 0.59375,
"rewards/chosen": -13.62930965423584,
"rewards/margins": 9.512930870056152,
"rewards/rejected": -23.142240524291992,
"step": 117
},
{
"epoch": 0.2942184821567711,
"grad_norm": 19.75,
"learning_rate": 5.856071843633516e-07,
"logits/chosen": 0.8448264598846436,
"logits/rejected": 0.6548407077789307,
"logps/chosen": -1.355668544769287,
"logps/rejected": -1.4345015287399292,
"loss": 2.6585,
"rewards/accuracies": 0.59375,
"rewards/chosen": -13.556684494018555,
"rewards/margins": 0.7883304953575134,
"rewards/rejected": -14.345015525817871,
"step": 118
},
{
"epoch": 0.29671185912420134,
"grad_norm": 52.0,
"learning_rate": 5.834873745462869e-07,
"logits/chosen": 0.9469012022018433,
"logits/rejected": 0.6909551620483398,
"logps/chosen": -1.5371216535568237,
"logps/rejected": -1.9698981046676636,
"loss": 1.7712,
"rewards/accuracies": 0.71875,
"rewards/chosen": -15.3712158203125,
"rewards/margins": 4.327763557434082,
"rewards/rejected": -19.698978424072266,
"step": 119
},
{
"epoch": 0.2992052360916316,
"grad_norm": 38.75,
"learning_rate": 5.813520179912718e-07,
"logits/chosen": 0.8846210241317749,
"logits/rejected": 0.6549557447433472,
"logps/chosen": -1.5691332817077637,
"logps/rejected": -1.859965443611145,
"loss": 1.9083,
"rewards/accuracies": 0.46875,
"rewards/chosen": -15.691333770751953,
"rewards/margins": 2.9083199501037598,
"rewards/rejected": -18.599653244018555,
"step": 120
},
{
"epoch": 0.30169861305906187,
"grad_norm": 52.75,
"learning_rate": 5.792012568808498e-07,
"logits/chosen": 0.9424107074737549,
"logits/rejected": 0.638304591178894,
"logps/chosen": -1.7227492332458496,
"logps/rejected": -1.9438109397888184,
"loss": 2.7587,
"rewards/accuracies": 0.5625,
"rewards/chosen": -17.227493286132812,
"rewards/margins": 2.2106146812438965,
"rewards/rejected": -19.438106536865234,
"step": 121
},
{
"epoch": 0.3041919900264921,
"grad_norm": 30.125,
"learning_rate": 5.770352344232754e-07,
"logits/chosen": 0.9350774884223938,
"logits/rejected": 0.7812179327011108,
"logps/chosen": -1.4625705480575562,
"logps/rejected": -1.6399712562561035,
"loss": 2.1925,
"rewards/accuracies": 0.53125,
"rewards/chosen": -14.62570571899414,
"rewards/margins": 1.7740064859390259,
"rewards/rejected": -16.39971351623535,
"step": 122
},
{
"epoch": 0.3066853669939224,
"grad_norm": 38.75,
"learning_rate": 5.748540948429791e-07,
"logits/chosen": 0.8861021995544434,
"logits/rejected": 0.5621581077575684,
"logps/chosen": -1.7297865152359009,
"logps/rejected": -2.025303840637207,
"loss": 2.116,
"rewards/accuracies": 0.625,
"rewards/chosen": -17.297866821289062,
"rewards/margins": 2.955172538757324,
"rewards/rejected": -20.25303840637207,
"step": 123
},
{
"epoch": 0.30917874396135264,
"grad_norm": 40.0,
"learning_rate": 5.726579833709629e-07,
"logits/chosen": 0.8791552782058716,
"logits/rejected": 0.7237104773521423,
"logps/chosen": -1.5754930973052979,
"logps/rejected": -1.760854959487915,
"loss": 1.9028,
"rewards/accuracies": 0.53125,
"rewards/chosen": -15.75493049621582,
"rewards/margins": 1.853618860244751,
"rewards/rejected": -17.608549118041992,
"step": 124
},
{
"epoch": 0.31167212092878294,
"grad_norm": 52.25,
"learning_rate": 5.704470462351321e-07,
"logits/chosen": 0.8605432510375977,
"logits/rejected": 0.6145266890525818,
"logps/chosen": -1.4967951774597168,
"logps/rejected": -1.6985702514648438,
"loss": 2.7418,
"rewards/accuracies": 0.5625,
"rewards/chosen": -14.9679536819458,
"rewards/margins": 2.0177483558654785,
"rewards/rejected": -16.985700607299805,
"step": 125
},
{
"epoch": 0.3141654978962132,
"grad_norm": 9.25,
"learning_rate": 5.682214306505567e-07,
"logits/chosen": 0.89193195104599,
"logits/rejected": 0.7236483097076416,
"logps/chosen": -1.4118638038635254,
"logps/rejected": -1.9812034368515015,
"loss": 1.6725,
"rewards/accuracies": 0.75,
"rewards/chosen": -14.118638038635254,
"rewards/margins": 5.693397521972656,
"rewards/rejected": -19.812034606933594,
"step": 126
},
{
"epoch": 0.31665887486364347,
"grad_norm": 19.0,
"learning_rate": 5.659812848096706e-07,
"logits/chosen": 0.7631481289863586,
"logits/rejected": 0.6791519522666931,
"logps/chosen": -1.5167012214660645,
"logps/rejected": -1.6185718774795532,
"loss": 3.444,
"rewards/accuracies": 0.5625,
"rewards/chosen": -15.167011260986328,
"rewards/margins": 1.0187066793441772,
"rewards/rejected": -16.185718536376953,
"step": 127
},
{
"epoch": 0.3191522518310737,
"grad_norm": 55.75,
"learning_rate": 5.637267578724034e-07,
"logits/chosen": 0.847726047039032,
"logits/rejected": 0.693824291229248,
"logps/chosen": -1.5810160636901855,
"logps/rejected": -1.9167366027832031,
"loss": 2.9597,
"rewards/accuracies": 0.46875,
"rewards/chosen": -15.810161590576172,
"rewards/margins": 3.357205867767334,
"rewards/rejected": -19.16736602783203,
"step": 128
},
{
"epoch": 0.32164562879850395,
"grad_norm": 72.5,
"learning_rate": 5.614579999562487e-07,
"logits/chosen": 0.878848135471344,
"logits/rejected": 0.7662035822868347,
"logps/chosen": -1.6665140390396118,
"logps/rejected": -1.7739882469177246,
"loss": 3.1744,
"rewards/accuracies": 0.46875,
"rewards/chosen": -16.66514015197754,
"rewards/margins": 1.0747425556182861,
"rewards/rejected": -17.73988151550293,
"step": 129
},
{
"epoch": 0.32413900576593424,
"grad_norm": 61.5,
"learning_rate": 5.591751621262691e-07,
"logits/chosen": 0.8593266010284424,
"logits/rejected": 0.7886440753936768,
"logps/chosen": -1.1743977069854736,
"logps/rejected": -1.3935869932174683,
"loss": 1.9932,
"rewards/accuracies": 0.5625,
"rewards/chosen": -11.743976593017578,
"rewards/margins": 2.1918928623199463,
"rewards/rejected": -13.935870170593262,
"step": 130
},
{
"epoch": 0.3266323827333645,
"grad_norm": 23.75,
"learning_rate": 5.568783963850368e-07,
"logits/chosen": 0.9685453176498413,
"logits/rejected": 0.7054411768913269,
"logps/chosen": -1.598836898803711,
"logps/rejected": -1.8996286392211914,
"loss": 2.1934,
"rewards/accuracies": 0.5625,
"rewards/chosen": -15.988369941711426,
"rewards/margins": 3.0079164505004883,
"rewards/rejected": -18.996286392211914,
"step": 131
},
{
"epoch": 0.3291257597007948,
"grad_norm": 22.0,
"learning_rate": 5.545678556625129e-07,
"logits/chosen": 0.8639561533927917,
"logits/rejected": 0.6618623733520508,
"logps/chosen": -1.7690205574035645,
"logps/rejected": -2.254978895187378,
"loss": 1.9177,
"rewards/accuracies": 0.625,
"rewards/chosen": -17.69020652770996,
"rewards/margins": 4.859582901000977,
"rewards/rejected": -22.549787521362305,
"step": 132
},
{
"epoch": 0.331619136668225,
"grad_norm": 27.5,
"learning_rate": 5.522436938058645e-07,
"logits/chosen": 0.8631035089492798,
"logits/rejected": 0.7001104950904846,
"logps/chosen": -1.5964336395263672,
"logps/rejected": -2.130333185195923,
"loss": 1.625,
"rewards/accuracies": 0.75,
"rewards/chosen": -15.964335441589355,
"rewards/margins": 5.338994979858398,
"rewards/rejected": -21.303333282470703,
"step": 133
},
{
"epoch": 0.3341125136356553,
"grad_norm": 59.0,
"learning_rate": 5.49906065569221e-07,
"logits/chosen": 0.733770489692688,
"logits/rejected": 0.5061658620834351,
"logps/chosen": -1.5350837707519531,
"logps/rejected": -1.8332159519195557,
"loss": 2.7709,
"rewards/accuracies": 0.5625,
"rewards/chosen": -15.350838661193848,
"rewards/margins": 2.981321334838867,
"rewards/rejected": -18.3321590423584,
"step": 134
},
{
"epoch": 0.33660589060308554,
"grad_norm": 13.1875,
"learning_rate": 5.475551266033692e-07,
"logits/chosen": 0.9098625183105469,
"logits/rejected": 0.7151045203208923,
"logps/chosen": -1.388254165649414,
"logps/rejected": -1.944246530532837,
"loss": 1.4884,
"rewards/accuracies": 0.75,
"rewards/chosen": -13.88254165649414,
"rewards/margins": 5.559926509857178,
"rewards/rejected": -19.442468643188477,
"step": 135
},
{
"epoch": 0.33909926757051584,
"grad_norm": 36.75,
"learning_rate": 5.451910334453903e-07,
"logits/chosen": 0.9809038639068604,
"logits/rejected": 0.6819513440132141,
"logps/chosen": -1.6769332885742188,
"logps/rejected": -2.2928450107574463,
"loss": 1.2734,
"rewards/accuracies": 0.78125,
"rewards/chosen": -16.769332885742188,
"rewards/margins": 6.159116268157959,
"rewards/rejected": -22.928447723388672,
"step": 136
},
{
"epoch": 0.3415926445379461,
"grad_norm": 111.5,
"learning_rate": 5.428139435082358e-07,
"logits/chosen": 0.9270225763320923,
"logits/rejected": 0.6331555843353271,
"logps/chosen": -1.6441551446914673,
"logps/rejected": -1.7793883085250854,
"loss": 2.9125,
"rewards/accuracies": 0.46875,
"rewards/chosen": -16.441551208496094,
"rewards/margins": 1.3523308038711548,
"rewards/rejected": -17.793882369995117,
"step": 137
},
{
"epoch": 0.34408602150537637,
"grad_norm": 19.25,
"learning_rate": 5.404240150702472e-07,
"logits/chosen": 0.9672467708587646,
"logits/rejected": 0.8573353886604309,
"logps/chosen": -1.3790785074234009,
"logps/rejected": -1.8634607791900635,
"loss": 1.7018,
"rewards/accuracies": 0.65625,
"rewards/chosen": -13.79078483581543,
"rewards/margins": 4.8438215255737305,
"rewards/rejected": -18.634607315063477,
"step": 138
},
{
"epoch": 0.3465793984728066,
"grad_norm": 83.0,
"learning_rate": 5.38021407264616e-07,
"logits/chosen": 0.8024469614028931,
"logits/rejected": 0.5433262586593628,
"logps/chosen": -1.3546580076217651,
"logps/rejected": -1.5655174255371094,
"loss": 2.6886,
"rewards/accuracies": 0.65625,
"rewards/chosen": -13.546581268310547,
"rewards/margins": 2.108593702316284,
"rewards/rejected": -15.655172348022461,
"step": 139
},
{
"epoch": 0.34907277544023685,
"grad_norm": 56.5,
"learning_rate": 5.356062800687886e-07,
"logits/chosen": 0.7994624972343445,
"logits/rejected": 0.6035336256027222,
"logps/chosen": -1.2650129795074463,
"logps/rejected": -1.3674687147140503,
"loss": 2.4405,
"rewards/accuracies": 0.59375,
"rewards/chosen": -12.650128364562988,
"rewards/margins": 1.0245567560195923,
"rewards/rejected": -13.67468547821045,
"step": 140
},
{
"epoch": 0.35156615240766714,
"grad_norm": 60.75,
"learning_rate": 5.331787942938142e-07,
"logits/chosen": 1.0324114561080933,
"logits/rejected": 0.7126603126525879,
"logps/chosen": -1.5447206497192383,
"logps/rejected": -1.9410955905914307,
"loss": 1.5742,
"rewards/accuracies": 0.625,
"rewards/chosen": -15.447206497192383,
"rewards/margins": 3.963749885559082,
"rewards/rejected": -19.41095733642578,
"step": 141
},
{
"epoch": 0.3540595293750974,
"grad_norm": 14.4375,
"learning_rate": 5.307391115736366e-07,
"logits/chosen": 0.7712888717651367,
"logits/rejected": 0.5555048584938049,
"logps/chosen": -1.2323440313339233,
"logps/rejected": -1.6426218748092651,
"loss": 1.5398,
"rewards/accuracies": 0.6875,
"rewards/chosen": -12.32343864440918,
"rewards/margins": 4.102778434753418,
"rewards/rejected": -16.42621612548828,
"step": 142
},
{
"epoch": 0.3565529063425277,
"grad_norm": 42.25,
"learning_rate": 5.282873943543326e-07,
"logits/chosen": 0.8940728306770325,
"logits/rejected": 0.7413418292999268,
"logps/chosen": -1.296794056892395,
"logps/rejected": -1.8393501043319702,
"loss": 1.7974,
"rewards/accuracies": 0.59375,
"rewards/chosen": -12.967940330505371,
"rewards/margins": 5.425559997558594,
"rewards/rejected": -18.39349937438965,
"step": 143
},
{
"epoch": 0.3590462833099579,
"grad_norm": 31.75,
"learning_rate": 5.258238058832948e-07,
"logits/chosen": 0.9329725503921509,
"logits/rejected": 0.5702534914016724,
"logps/chosen": -1.3792263269424438,
"logps/rejected": -1.757681965827942,
"loss": 2.1616,
"rewards/accuracies": 0.65625,
"rewards/chosen": -13.792261123657227,
"rewards/margins": 3.784557342529297,
"rewards/rejected": -17.576818466186523,
"step": 144
},
{
"epoch": 0.3615396602773882,
"grad_norm": 78.5,
"learning_rate": 5.233485101983624e-07,
"logits/chosen": 0.9451256990432739,
"logits/rejected": 0.8186403512954712,
"logps/chosen": -1.5383343696594238,
"logps/rejected": -2.494551181793213,
"loss": 1.4328,
"rewards/accuracies": 0.6875,
"rewards/chosen": -15.383341789245605,
"rewards/margins": 9.562170028686523,
"rewards/rejected": -24.945512771606445,
"step": 145
},
{
"epoch": 0.36403303724481845,
"grad_norm": 58.5,
"learning_rate": 5.208616721168984e-07,
"logits/chosen": 0.9742121696472168,
"logits/rejected": 0.7483265995979309,
"logps/chosen": -1.6329911947250366,
"logps/rejected": -2.0833840370178223,
"loss": 1.8646,
"rewards/accuracies": 0.6875,
"rewards/chosen": -16.329910278320312,
"rewards/margins": 4.503929138183594,
"rewards/rejected": -20.833839416503906,
"step": 146
},
{
"epoch": 0.36652641421224874,
"grad_norm": 29.5,
"learning_rate": 5.183634572248153e-07,
"logits/chosen": 0.8174174427986145,
"logits/rejected": 0.7698001265525818,
"logps/chosen": -1.255910038948059,
"logps/rejected": -1.4298808574676514,
"loss": 2.2763,
"rewards/accuracies": 0.5,
"rewards/chosen": -12.559102058410645,
"rewards/margins": 1.739708423614502,
"rewards/rejected": -14.298810005187988,
"step": 147
},
{
"epoch": 0.369019791179679,
"grad_norm": 161.0,
"learning_rate": 5.158540318655495e-07,
"logits/chosen": 1.1192365884780884,
"logits/rejected": 0.7937313914299011,
"logps/chosen": -1.7974631786346436,
"logps/rejected": -2.402998924255371,
"loss": 2.2646,
"rewards/accuracies": 0.5625,
"rewards/chosen": -17.974632263183594,
"rewards/margins": 6.055357933044434,
"rewards/rejected": -24.02998924255371,
"step": 148
},
{
"epoch": 0.3715131681471092,
"grad_norm": 13.4375,
"learning_rate": 5.133335631289858e-07,
"logits/chosen": 1.004485011100769,
"logits/rejected": 0.6550527215003967,
"logps/chosen": -1.4417423009872437,
"logps/rejected": -2.1560442447662354,
"loss": 1.3901,
"rewards/accuracies": 0.65625,
"rewards/chosen": -14.4174222946167,
"rewards/margins": 7.1430182456970215,
"rewards/rejected": -21.560441970825195,
"step": 149
},
{
"epoch": 0.3740065451145395,
"grad_norm": 29.75,
"learning_rate": 5.10802218840331e-07,
"logits/chosen": 0.8932673335075378,
"logits/rejected": 0.695792019367218,
"logps/chosen": -1.3724555969238281,
"logps/rejected": -1.7769482135772705,
"loss": 1.7406,
"rewards/accuracies": 0.65625,
"rewards/chosen": -13.724554061889648,
"rewards/margins": 4.044928073883057,
"rewards/rejected": -17.76948356628418,
"step": 150
},
{
"epoch": 0.37649992208196975,
"grad_norm": 38.25,
"learning_rate": 5.0826016754894e-07,
"logits/chosen": 0.9987000823020935,
"logits/rejected": 0.6120975017547607,
"logps/chosen": -1.7447395324707031,
"logps/rejected": -2.424745559692383,
"loss": 2.0385,
"rewards/accuracies": 0.625,
"rewards/chosen": -17.4473934173584,
"rewards/margins": 6.800059795379639,
"rewards/rejected": -24.247455596923828,
"step": 151
},
{
"epoch": 0.37899329904940005,
"grad_norm": 43.5,
"learning_rate": 5.057075785170923e-07,
"logits/chosen": 0.7949992418289185,
"logits/rejected": 0.735917866230011,
"logps/chosen": -1.4737513065338135,
"logps/rejected": -1.7997541427612305,
"loss": 2.4462,
"rewards/accuracies": 0.4375,
"rewards/chosen": -14.737512588500977,
"rewards/margins": 3.2600276470184326,
"rewards/rejected": -17.997541427612305,
"step": 152
},
{
"epoch": 0.3814866760168303,
"grad_norm": 34.75,
"learning_rate": 5.031446217087223e-07,
"logits/chosen": 0.7635215520858765,
"logits/rejected": 0.6593471765518188,
"logps/chosen": -1.4680148363113403,
"logps/rejected": -1.8192330598831177,
"loss": 2.3192,
"rewards/accuracies": 0.5625,
"rewards/chosen": -14.680147171020508,
"rewards/margins": 3.5121822357177734,
"rewards/rejected": -18.19232940673828,
"step": 153
},
{
"epoch": 0.3839800529842606,
"grad_norm": 18.625,
"learning_rate": 5.005714677781016e-07,
"logits/chosen": 0.8512160778045654,
"logits/rejected": 0.638878583908081,
"logps/chosen": -1.239166259765625,
"logps/rejected": -1.7152905464172363,
"loss": 1.1124,
"rewards/accuracies": 0.625,
"rewards/chosen": -12.39166259765625,
"rewards/margins": 4.761242866516113,
"rewards/rejected": -17.15290641784668,
"step": 154
},
{
"epoch": 0.3864734299516908,
"grad_norm": 16.375,
"learning_rate": 4.979882880584766e-07,
"logits/chosen": 0.9124481678009033,
"logits/rejected": 0.7296810150146484,
"logps/chosen": -1.7560640573501587,
"logps/rejected": -2.781906843185425,
"loss": 1.6899,
"rewards/accuracies": 0.71875,
"rewards/chosen": -17.56064224243164,
"rewards/margins": 10.258424758911133,
"rewards/rejected": -27.81906509399414,
"step": 155
},
{
"epoch": 0.3889668069191211,
"grad_norm": 30.125,
"learning_rate": 4.953952545506602e-07,
"logits/chosen": 0.8763688802719116,
"logits/rejected": 0.7317189574241638,
"logps/chosen": -1.6232566833496094,
"logps/rejected": -2.2681305408477783,
"loss": 1.9121,
"rewards/accuracies": 0.59375,
"rewards/chosen": -16.232566833496094,
"rewards/margins": 6.448739051818848,
"rewards/rejected": -22.681304931640625,
"step": 156
},
{
"epoch": 0.39146018388655135,
"grad_norm": 23.125,
"learning_rate": 4.927925399115788e-07,
"logits/chosen": 0.8235619068145752,
"logits/rejected": 0.7919750213623047,
"logps/chosen": -1.391683578491211,
"logps/rejected": -1.6939644813537598,
"loss": 2.2898,
"rewards/accuracies": 0.625,
"rewards/chosen": -13.916834831237793,
"rewards/margins": 3.0228097438812256,
"rewards/rejected": -16.939645767211914,
"step": 157
},
{
"epoch": 0.3939535608539816,
"grad_norm": 58.0,
"learning_rate": 4.901803174427757e-07,
"logits/chosen": 0.890289306640625,
"logits/rejected": 0.6626406311988831,
"logps/chosen": -1.6668946743011475,
"logps/rejected": -2.7818055152893066,
"loss": 1.1016,
"rewards/accuracies": 0.625,
"rewards/chosen": -16.668947219848633,
"rewards/margins": 11.14910888671875,
"rewards/rejected": -27.81805419921875,
"step": 158
},
{
"epoch": 0.3964469378214119,
"grad_norm": 50.25,
"learning_rate": 4.875587610788733e-07,
"logits/chosen": 0.7171937227249146,
"logits/rejected": 0.6810190677642822,
"logps/chosen": -1.645186424255371,
"logps/rejected": -2.06756854057312,
"loss": 2.5663,
"rewards/accuracies": 0.5625,
"rewards/chosen": -16.45186424255371,
"rewards/margins": 4.22382116317749,
"rewards/rejected": -20.67568588256836,
"step": 159
},
{
"epoch": 0.3989403147888421,
"grad_norm": 19.375,
"learning_rate": 4.849280453759897e-07,
"logits/chosen": 0.9262104630470276,
"logits/rejected": 0.7050573229789734,
"logps/chosen": -1.6274131536483765,
"logps/rejected": -2.1605324745178223,
"loss": 1.2244,
"rewards/accuracies": 0.6875,
"rewards/chosen": -16.274131774902344,
"rewards/margins": 5.331192970275879,
"rewards/rejected": -21.60532569885254,
"step": 160
},
{
"epoch": 0.4014336917562724,
"grad_norm": 83.0,
"learning_rate": 4.822883455001173e-07,
"logits/chosen": 0.9184644818305969,
"logits/rejected": 0.8644086122512817,
"logps/chosen": -1.5301023721694946,
"logps/rejected": -1.876584768295288,
"loss": 2.0259,
"rewards/accuracies": 0.625,
"rewards/chosen": -15.30102252960205,
"rewards/margins": 3.464823007583618,
"rewards/rejected": -18.76584815979004,
"step": 161
},
{
"epoch": 0.40392706872370265,
"grad_norm": 24.5,
"learning_rate": 4.796398372154588e-07,
"logits/chosen": 1.0671634674072266,
"logits/rejected": 0.8774153590202332,
"logps/chosen": -1.6217372417449951,
"logps/rejected": -2.3855130672454834,
"loss": 1.4698,
"rewards/accuracies": 0.65625,
"rewards/chosen": -16.21737289428711,
"rewards/margins": 7.637757301330566,
"rewards/rejected": -23.85512924194336,
"step": 162
},
{
"epoch": 0.40642044569113295,
"grad_norm": 44.0,
"learning_rate": 4.769826968727243e-07,
"logits/chosen": 0.80574631690979,
"logits/rejected": 0.6158944964408875,
"logps/chosen": -1.5703632831573486,
"logps/rejected": -2.269869327545166,
"loss": 1.3586,
"rewards/accuracies": 0.625,
"rewards/chosen": -15.703633308410645,
"rewards/margins": 6.995059967041016,
"rewards/rejected": -22.698694229125977,
"step": 163
},
{
"epoch": 0.4089138226585632,
"grad_norm": 27.5,
"learning_rate": 4.743171013973885e-07,
"logits/chosen": 0.935499370098114,
"logits/rejected": 0.7237244844436646,
"logps/chosen": -1.7726106643676758,
"logps/rejected": -2.6084468364715576,
"loss": 1.447,
"rewards/accuracies": 0.6875,
"rewards/chosen": -17.72610855102539,
"rewards/margins": 8.358359336853027,
"rewards/rejected": -26.08446502685547,
"step": 164
},
{
"epoch": 0.4114071996259935,
"grad_norm": 30.0,
"learning_rate": 4.716432282779106e-07,
"logits/chosen": 0.9203133583068848,
"logits/rejected": 0.7862353920936584,
"logps/chosen": -1.4431755542755127,
"logps/rejected": -2.1590194702148438,
"loss": 1.4126,
"rewards/accuracies": 0.71875,
"rewards/chosen": -14.431756019592285,
"rewards/margins": 7.158439636230469,
"rewards/rejected": -21.590194702148438,
"step": 165
},
{
"epoch": 0.4139005765934237,
"grad_norm": 100.5,
"learning_rate": 4.6896125555391575e-07,
"logits/chosen": 0.9510793685913086,
"logits/rejected": 0.7097218036651611,
"logps/chosen": -1.377150535583496,
"logps/rejected": -1.8154629468917847,
"loss": 1.436,
"rewards/accuracies": 0.625,
"rewards/chosen": -13.771505355834961,
"rewards/margins": 4.383124351501465,
"rewards/rejected": -18.15462875366211,
"step": 166
},
{
"epoch": 0.41639395356085396,
"grad_norm": 40.5,
"learning_rate": 4.662713618043413e-07,
"logits/chosen": 0.9421004056930542,
"logits/rejected": 0.6513608694076538,
"logps/chosen": -1.4433151483535767,
"logps/rejected": -1.7160542011260986,
"loss": 1.3431,
"rewards/accuracies": 0.65625,
"rewards/chosen": -14.433152198791504,
"rewards/margins": 2.7273917198181152,
"rewards/rejected": -17.16054344177246,
"step": 167
},
{
"epoch": 0.41888733052828425,
"grad_norm": 78.5,
"learning_rate": 4.635737261355447e-07,
"logits/chosen": 0.8841539621353149,
"logits/rejected": 0.7275552153587341,
"logps/chosen": -1.617548942565918,
"logps/rejected": -2.5178287029266357,
"loss": 1.7514,
"rewards/accuracies": 0.53125,
"rewards/chosen": -16.17548942565918,
"rewards/margins": 9.002798080444336,
"rewards/rejected": -25.178287506103516,
"step": 168
},
{
"epoch": 0.4213807074957145,
"grad_norm": 61.5,
"learning_rate": 4.608685281693789e-07,
"logits/chosen": 0.795113205909729,
"logits/rejected": 0.7205825448036194,
"logps/chosen": -1.5723981857299805,
"logps/rejected": -1.8851563930511475,
"loss": 2.6762,
"rewards/accuracies": 0.53125,
"rewards/chosen": -15.723981857299805,
"rewards/margins": 3.1275830268859863,
"rewards/rejected": -18.851564407348633,
"step": 169
},
{
"epoch": 0.4238740844631448,
"grad_norm": 40.5,
"learning_rate": 4.581559480312316e-07,
"logits/chosen": 0.9474557042121887,
"logits/rejected": 0.7945749759674072,
"logps/chosen": -1.8188387155532837,
"logps/rejected": -2.6367805004119873,
"loss": 1.3681,
"rewards/accuracies": 0.71875,
"rewards/chosen": -18.18838882446289,
"rewards/margins": 8.179415702819824,
"rewards/rejected": -26.36780548095703,
"step": 170
},
{
"epoch": 0.426367461430575,
"grad_norm": 33.25,
"learning_rate": 4.5543616633803197e-07,
"logits/chosen": 0.7378120422363281,
"logits/rejected": 0.7000318169593811,
"logps/chosen": -1.4147385358810425,
"logps/rejected": -1.8963797092437744,
"loss": 1.899,
"rewards/accuracies": 0.53125,
"rewards/chosen": -14.147384643554688,
"rewards/margins": 4.816410541534424,
"rewards/rejected": -18.963794708251953,
"step": 171
},
{
"epoch": 0.4288608383980053,
"grad_norm": 35.25,
"learning_rate": 4.527093641862241e-07,
"logits/chosen": 0.9072024822235107,
"logits/rejected": 0.7587930560112,
"logps/chosen": -1.2699742317199707,
"logps/rejected": -1.702739953994751,
"loss": 1.4364,
"rewards/accuracies": 0.65625,
"rewards/chosen": -12.699743270874023,
"rewards/margins": 4.327658176422119,
"rewards/rejected": -17.027400970458984,
"step": 172
},
{
"epoch": 0.43135421536543556,
"grad_norm": 25.75,
"learning_rate": 4.499757231397087e-07,
"logits/chosen": 0.8443821668624878,
"logits/rejected": 0.6597446203231812,
"logps/chosen": -1.509061336517334,
"logps/rejected": -2.0712409019470215,
"loss": 1.2708,
"rewards/accuracies": 0.65625,
"rewards/chosen": -15.09061336517334,
"rewards/margins": 5.6217942237854,
"rewards/rejected": -20.7124080657959,
"step": 173
},
{
"epoch": 0.43384759233286585,
"grad_norm": 23.375,
"learning_rate": 4.4723542521775385e-07,
"logits/chosen": 1.0543487071990967,
"logits/rejected": 0.5649646520614624,
"logps/chosen": -1.4722059965133667,
"logps/rejected": -2.189124584197998,
"loss": 0.8446,
"rewards/accuracies": 0.75,
"rewards/chosen": -14.72205924987793,
"rewards/margins": 7.169185638427734,
"rewards/rejected": -21.891244888305664,
"step": 174
},
{
"epoch": 0.4363409693002961,
"grad_norm": 54.75,
"learning_rate": 4.444886528828749e-07,
"logits/chosen": 0.9907981157302856,
"logits/rejected": 0.7723469138145447,
"logps/chosen": -1.8176202774047852,
"logps/rejected": -2.3857648372650146,
"loss": 1.7344,
"rewards/accuracies": 0.59375,
"rewards/chosen": -18.17620277404785,
"rewards/margins": 5.6814446449279785,
"rewards/rejected": -23.857648849487305,
"step": 175
},
{
"epoch": 0.4388343462677263,
"grad_norm": 31.5,
"learning_rate": 4.417355890286857e-07,
"logits/chosen": 0.9411242008209229,
"logits/rejected": 0.7533101439476013,
"logps/chosen": -1.6791445016860962,
"logps/rejected": -2.381438732147217,
"loss": 1.8322,
"rewards/accuracies": 0.6875,
"rewards/chosen": -16.791446685791016,
"rewards/margins": 7.022940635681152,
"rewards/rejected": -23.81438446044922,
"step": 176
},
{
"epoch": 0.4413277232351566,
"grad_norm": 51.25,
"learning_rate": 4.389764169677205e-07,
"logits/chosen": 0.862296462059021,
"logits/rejected": 0.7431577444076538,
"logps/chosen": -1.3871877193450928,
"logps/rejected": -1.9420627355575562,
"loss": 1.2998,
"rewards/accuracies": 0.65625,
"rewards/chosen": -13.87187671661377,
"rewards/margins": 5.548751354217529,
"rewards/rejected": -19.420629501342773,
"step": 177
},
{
"epoch": 0.44382110020258686,
"grad_norm": 41.0,
"learning_rate": 4.3621132041922745e-07,
"logits/chosen": 0.8196381330490112,
"logits/rejected": 0.735532820224762,
"logps/chosen": -1.3557261228561401,
"logps/rejected": -2.2253174781799316,
"loss": 1.2671,
"rewards/accuracies": 0.625,
"rewards/chosen": -13.557262420654297,
"rewards/margins": 8.695913314819336,
"rewards/rejected": -22.253175735473633,
"step": 178
},
{
"epoch": 0.44631447717001715,
"grad_norm": 28.75,
"learning_rate": 4.334404834969368e-07,
"logits/chosen": 1.0182719230651855,
"logits/rejected": 0.8464354872703552,
"logps/chosen": -1.3779345750808716,
"logps/rejected": -1.8232632875442505,
"loss": 1.3614,
"rewards/accuracies": 0.59375,
"rewards/chosen": -13.779345512390137,
"rewards/margins": 4.4532856941223145,
"rewards/rejected": -18.23263168334961,
"step": 179
},
{
"epoch": 0.4488078541374474,
"grad_norm": 18.375,
"learning_rate": 4.306640906968011e-07,
"logits/chosen": 0.927130401134491,
"logits/rejected": 0.7001396417617798,
"logps/chosen": -1.3739848136901855,
"logps/rejected": -2.3601279258728027,
"loss": 0.5738,
"rewards/accuracies": 0.875,
"rewards/chosen": -13.739850044250488,
"rewards/margins": 9.861430168151855,
"rewards/rejected": -23.601280212402344,
"step": 180
},
{
"epoch": 0.4513012311048777,
"grad_norm": 46.25,
"learning_rate": 4.2788232688471e-07,
"logits/chosen": 0.858923077583313,
"logits/rejected": 0.7578305006027222,
"logps/chosen": -1.2482776641845703,
"logps/rejected": -1.7487417459487915,
"loss": 1.0749,
"rewards/accuracies": 0.6875,
"rewards/chosen": -12.482775688171387,
"rewards/margins": 5.004642486572266,
"rewards/rejected": -17.487417221069336,
"step": 181
},
{
"epoch": 0.4537946080723079,
"grad_norm": 83.0,
"learning_rate": 4.2509537728418233e-07,
"logits/chosen": 0.8518757224082947,
"logits/rejected": 0.7721596360206604,
"logps/chosen": -1.3393375873565674,
"logps/rejected": -1.7837915420532227,
"loss": 1.2853,
"rewards/accuracies": 0.6875,
"rewards/chosen": -13.393375396728516,
"rewards/margins": 4.444540977478027,
"rewards/rejected": -17.83791732788086,
"step": 182
},
{
"epoch": 0.4562879850397382,
"grad_norm": 56.0,
"learning_rate": 4.223034274640317e-07,
"logits/chosen": 0.9242639541625977,
"logits/rejected": 0.7321256995201111,
"logps/chosen": -1.6946762800216675,
"logps/rejected": -2.9650654792785645,
"loss": 1.0034,
"rewards/accuracies": 0.75,
"rewards/chosen": -16.94676399230957,
"rewards/margins": 12.703892707824707,
"rewards/rejected": -29.65065574645996,
"step": 183
},
{
"epoch": 0.45878136200716846,
"grad_norm": 38.75,
"learning_rate": 4.195066633260109e-07,
"logits/chosen": 0.8796188831329346,
"logits/rejected": 0.6841633319854736,
"logps/chosen": -1.3098094463348389,
"logps/rejected": -1.709314227104187,
"loss": 1.0082,
"rewards/accuracies": 0.75,
"rewards/chosen": -13.09809398651123,
"rewards/margins": 3.995047092437744,
"rewards/rejected": -17.0931396484375,
"step": 184
},
{
"epoch": 0.4612747389745987,
"grad_norm": 49.75,
"learning_rate": 4.1670527109243414e-07,
"logits/chosen": 0.8437327146530151,
"logits/rejected": 0.7233911156654358,
"logps/chosen": -1.552445888519287,
"logps/rejected": -2.1319706439971924,
"loss": 1.2603,
"rewards/accuracies": 0.71875,
"rewards/chosen": -15.524458885192871,
"rewards/margins": 5.795248031616211,
"rewards/rejected": -21.3197078704834,
"step": 185
},
{
"epoch": 0.463768115942029,
"grad_norm": 45.5,
"learning_rate": 4.138994372937766e-07,
"logits/chosen": 0.9246405363082886,
"logits/rejected": 0.7257117629051208,
"logps/chosen": -1.5023407936096191,
"logps/rejected": -2.219346046447754,
"loss": 1.2028,
"rewards/accuracies": 0.6875,
"rewards/chosen": -15.023408889770508,
"rewards/margins": 7.170053958892822,
"rewards/rejected": -22.193462371826172,
"step": 186
},
{
"epoch": 0.46626149290945923,
"grad_norm": 74.0,
"learning_rate": 4.110893487562548e-07,
"logits/chosen": 0.7957507371902466,
"logits/rejected": 0.7296849489212036,
"logps/chosen": -1.3323700428009033,
"logps/rejected": -2.052271842956543,
"loss": 0.6391,
"rewards/accuracies": 0.71875,
"rewards/chosen": -13.323701858520508,
"rewards/margins": 7.199017524719238,
"rewards/rejected": -20.52271842956543,
"step": 187
},
{
"epoch": 0.4687548698768895,
"grad_norm": 56.25,
"learning_rate": 4.082751925893869e-07,
"logits/chosen": 0.8817852735519409,
"logits/rejected": 0.7720733880996704,
"logps/chosen": -1.196410059928894,
"logps/rejected": -1.500748634338379,
"loss": 0.9032,
"rewards/accuracies": 0.71875,
"rewards/chosen": -11.96410083770752,
"rewards/margins": 3.0433857440948486,
"rewards/rejected": -15.007488250732422,
"step": 188
},
{
"epoch": 0.47124824684431976,
"grad_norm": 41.0,
"learning_rate": 4.054571561735334e-07,
"logits/chosen": 0.9272749423980713,
"logits/rejected": 0.6019188761711121,
"logps/chosen": -1.804772138595581,
"logps/rejected": -2.7550244331359863,
"loss": 0.5996,
"rewards/accuracies": 0.71875,
"rewards/chosen": -18.04772186279297,
"rewards/margins": 9.502524375915527,
"rewards/rejected": -27.550243377685547,
"step": 189
},
{
"epoch": 0.47374162381175006,
"grad_norm": 15.6875,
"learning_rate": 4.026354271474214e-07,
"logits/chosen": 0.9149619340896606,
"logits/rejected": 0.6641325950622559,
"logps/chosen": -1.705862283706665,
"logps/rejected": -3.0835325717926025,
"loss": 1.0064,
"rewards/accuracies": 0.71875,
"rewards/chosen": -17.058624267578125,
"rewards/margins": 13.776700019836426,
"rewards/rejected": -30.835325241088867,
"step": 190
},
{
"epoch": 0.4762350007791803,
"grad_norm": 23.0,
"learning_rate": 3.998101933956498e-07,
"logits/chosen": 0.8473320007324219,
"logits/rejected": 0.7866963744163513,
"logps/chosen": -1.4231493473052979,
"logps/rejected": -2.1398186683654785,
"loss": 0.7511,
"rewards/accuracies": 0.75,
"rewards/chosen": -14.23149299621582,
"rewards/margins": 7.1666951179504395,
"rewards/rejected": -21.39818572998047,
"step": 191
},
{
"epoch": 0.4787283777466106,
"grad_norm": 39.5,
"learning_rate": 3.969816430361794e-07,
"logits/chosen": 0.8237781524658203,
"logits/rejected": 0.7161869406700134,
"logps/chosen": -1.8508167266845703,
"logps/rejected": -2.9980063438415527,
"loss": 0.7146,
"rewards/accuracies": 0.75,
"rewards/chosen": -18.508167266845703,
"rewards/margins": 11.471895217895508,
"rewards/rejected": -29.980064392089844,
"step": 192
},
{
"epoch": 0.48122175471404083,
"grad_norm": 75.0,
"learning_rate": 3.9414996440780724e-07,
"logits/chosen": 0.9529024958610535,
"logits/rejected": 0.8278242349624634,
"logps/chosen": -1.8834271430969238,
"logps/rejected": -2.4769580364227295,
"loss": 1.046,
"rewards/accuracies": 0.71875,
"rewards/chosen": -18.834270477294922,
"rewards/margins": 5.935309410095215,
"rewards/rejected": -24.769580841064453,
"step": 193
},
{
"epoch": 0.48371513168147107,
"grad_norm": 62.0,
"learning_rate": 3.913153460576256e-07,
"logits/chosen": 0.916070818901062,
"logits/rejected": 0.6884597539901733,
"logps/chosen": -1.893513560295105,
"logps/rejected": -2.9602129459381104,
"loss": 1.0144,
"rewards/accuracies": 0.75,
"rewards/chosen": -18.935134887695312,
"rewards/margins": 10.66699504852295,
"rewards/rejected": -29.602130889892578,
"step": 194
},
{
"epoch": 0.48620850864890136,
"grad_norm": 18.125,
"learning_rate": 3.8847797672846825e-07,
"logits/chosen": 0.9603822231292725,
"logits/rejected": 0.6512764692306519,
"logps/chosen": -1.7449413537979126,
"logps/rejected": -2.7406604290008545,
"loss": 0.3528,
"rewards/accuracies": 0.875,
"rewards/chosen": -17.44941520690918,
"rewards/margins": 9.957185745239258,
"rewards/rejected": -27.40660285949707,
"step": 195
},
{
"epoch": 0.4887018856163316,
"grad_norm": 10.75,
"learning_rate": 3.8563804534634246e-07,
"logits/chosen": 0.9687063694000244,
"logits/rejected": 0.8893125057220459,
"logps/chosen": -1.372796654701233,
"logps/rejected": -2.2664504051208496,
"loss": 0.533,
"rewards/accuracies": 0.8125,
"rewards/chosen": -13.727968215942383,
"rewards/margins": 8.93653678894043,
"rewards/rejected": -22.664501190185547,
"step": 196
},
{
"epoch": 0.4911952625837619,
"grad_norm": 48.75,
"learning_rate": 3.827957410078494e-07,
"logits/chosen": 0.8412132859230042,
"logits/rejected": 0.7297399044036865,
"logps/chosen": -2.030590772628784,
"logps/rejected": -3.408313035964966,
"loss": 0.7652,
"rewards/accuracies": 0.75,
"rewards/chosen": -20.305906295776367,
"rewards/margins": 13.777222633361816,
"rewards/rejected": -34.0831298828125,
"step": 197
},
{
"epoch": 0.49368863955119213,
"grad_norm": 25.0,
"learning_rate": 3.799512529675939e-07,
"logits/chosen": 0.8365733623504639,
"logits/rejected": 0.7946135997772217,
"logps/chosen": -1.8182940483093262,
"logps/rejected": -2.8680472373962402,
"loss": 0.6624,
"rewards/accuracies": 0.875,
"rewards/chosen": -18.182941436767578,
"rewards/margins": 10.497532844543457,
"rewards/rejected": -28.680471420288086,
"step": 198
},
{
"epoch": 0.4961820165186224,
"grad_norm": 49.25,
"learning_rate": 3.7710477062558195e-07,
"logits/chosen": 0.8030841946601868,
"logits/rejected": 0.6840673685073853,
"logps/chosen": -1.7462419271469116,
"logps/rejected": -2.6651408672332764,
"loss": 0.9539,
"rewards/accuracies": 0.65625,
"rewards/chosen": -17.462419509887695,
"rewards/margins": 9.188987731933594,
"rewards/rejected": -26.651405334472656,
"step": 199
},
{
"epoch": 0.49867539348605266,
"grad_norm": 37.25,
"learning_rate": 3.742564835146099e-07,
"logits/chosen": 0.940216064453125,
"logits/rejected": 0.7382882833480835,
"logps/chosen": -1.5715973377227783,
"logps/rejected": -2.2499136924743652,
"loss": 0.5224,
"rewards/accuracies": 0.84375,
"rewards/chosen": -15.715973854064941,
"rewards/margins": 6.7831621170043945,
"rewards/rejected": -22.499134063720703,
"step": 200
},
{
"epoch": 0.501168770453483,
"grad_norm": 53.5,
"learning_rate": 3.71406581287645e-07,
"logits/chosen": 0.8017429113388062,
"logits/rejected": 0.7019472122192383,
"logps/chosen": -1.5708627700805664,
"logps/rejected": -2.446748733520508,
"loss": 0.6145,
"rewards/accuracies": 0.78125,
"rewards/chosen": -15.70862865447998,
"rewards/margins": 8.758858680725098,
"rewards/rejected": -24.467487335205078,
"step": 201
},
{
"epoch": 0.5036621474209132,
"grad_norm": 24.5,
"learning_rate": 3.6855525370519617e-07,
"logits/chosen": 0.9191329479217529,
"logits/rejected": 0.7709681987762451,
"logps/chosen": -1.2503210306167603,
"logps/rejected": -1.8521945476531982,
"loss": 0.5184,
"rewards/accuracies": 0.78125,
"rewards/chosen": -12.50321102142334,
"rewards/margins": 6.018735408782959,
"rewards/rejected": -18.52194595336914,
"step": 202
},
{
"epoch": 0.5061555243883434,
"grad_norm": 17.375,
"learning_rate": 3.6570269062268025e-07,
"logits/chosen": 0.7203347682952881,
"logits/rejected": 0.7312765717506409,
"logps/chosen": -1.9442358016967773,
"logps/rejected": -3.1624157428741455,
"loss": 0.6896,
"rewards/accuracies": 0.8125,
"rewards/chosen": -19.44235610961914,
"rewards/margins": 12.18179988861084,
"rewards/rejected": -31.624156951904297,
"step": 203
},
{
"epoch": 0.5086489013557737,
"grad_norm": 14.125,
"learning_rate": 3.6284908197777915e-07,
"logits/chosen": 0.7811324596405029,
"logits/rejected": 0.7788522839546204,
"logps/chosen": -1.5343513488769531,
"logps/rejected": -2.655756711959839,
"loss": 0.4037,
"rewards/accuracies": 0.84375,
"rewards/chosen": -15.343514442443848,
"rewards/margins": 11.214055061340332,
"rewards/rejected": -26.557571411132812,
"step": 204
},
{
"epoch": 0.511142278323204,
"grad_norm": 9.75,
"learning_rate": 3.599946177777936e-07,
"logits/chosen": 0.9504005908966064,
"logits/rejected": 0.8734852075576782,
"logps/chosen": -1.6099568605422974,
"logps/rejected": -2.527747869491577,
"loss": 0.4034,
"rewards/accuracies": 0.84375,
"rewards/chosen": -16.099567413330078,
"rewards/margins": 9.177909851074219,
"rewards/rejected": -25.277477264404297,
"step": 205
},
{
"epoch": 0.5136356552906343,
"grad_norm": 9.0,
"learning_rate": 3.571394880869919e-07,
"logits/chosen": 1.0245471000671387,
"logits/rejected": 0.8590348958969116,
"logps/chosen": -1.5646387338638306,
"logps/rejected": -2.8300962448120117,
"loss": 0.633,
"rewards/accuracies": 0.78125,
"rewards/chosen": -15.64638614654541,
"rewards/margins": 12.65457534790039,
"rewards/rejected": -28.300960540771484,
"step": 206
},
{
"epoch": 0.5161290322580645,
"grad_norm": 11.9375,
"learning_rate": 3.5428388301395325e-07,
"logits/chosen": 0.9345250129699707,
"logits/rejected": 0.8547608852386475,
"logps/chosen": -1.4048100709915161,
"logps/rejected": -2.2128334045410156,
"loss": 0.5043,
"rewards/accuracies": 0.78125,
"rewards/chosen": -14.048102378845215,
"rewards/margins": 8.080232620239258,
"rewards/rejected": -22.128334045410156,
"step": 207
},
{
"epoch": 0.5186224092254947,
"grad_norm": 12.375,
"learning_rate": 3.514279926989105e-07,
"logits/chosen": 0.9688948392868042,
"logits/rejected": 0.7790014743804932,
"logps/chosen": -2.1355183124542236,
"logps/rejected": -3.5980281829833984,
"loss": 0.5437,
"rewards/accuracies": 0.84375,
"rewards/chosen": -21.35518455505371,
"rewards/margins": 14.625100135803223,
"rewards/rejected": -35.98028564453125,
"step": 208
},
{
"epoch": 0.5211157861929251,
"grad_norm": 8.5,
"learning_rate": 3.485720073010896e-07,
"logits/chosen": 0.9008550643920898,
"logits/rejected": 0.8881810307502747,
"logps/chosen": -1.938570261001587,
"logps/rejected": -3.0095808506011963,
"loss": 0.513,
"rewards/accuracies": 0.78125,
"rewards/chosen": -19.38570213317871,
"rewards/margins": 10.710105895996094,
"rewards/rejected": -30.095808029174805,
"step": 209
},
{
"epoch": 0.5236091631603553,
"grad_norm": 24.5,
"learning_rate": 3.457161169860469e-07,
"logits/chosen": 0.9138238430023193,
"logits/rejected": 0.6945370435714722,
"logps/chosen": -1.7868579626083374,
"logps/rejected": -3.283820390701294,
"loss": 0.5155,
"rewards/accuracies": 0.84375,
"rewards/chosen": -17.868579864501953,
"rewards/margins": 14.969620704650879,
"rewards/rejected": -32.83820343017578,
"step": 210
},
{
"epoch": 0.5261025401277856,
"grad_norm": 5.09375,
"learning_rate": 3.428605119130082e-07,
"logits/chosen": 0.8236789703369141,
"logits/rejected": 0.8235811591148376,
"logps/chosen": -1.940079689025879,
"logps/rejected": -3.2608189582824707,
"loss": 0.2783,
"rewards/accuracies": 0.875,
"rewards/chosen": -19.400798797607422,
"rewards/margins": 13.207389831542969,
"rewards/rejected": -32.60818862915039,
"step": 211
},
{
"epoch": 0.5285959170952158,
"grad_norm": 45.75,
"learning_rate": 3.4000538222220635e-07,
"logits/chosen": 0.9403684139251709,
"logits/rejected": 0.8005753755569458,
"logps/chosen": -1.6408517360687256,
"logps/rejected": -2.543820858001709,
"loss": 0.4837,
"rewards/accuracies": 0.84375,
"rewards/chosen": -16.408517837524414,
"rewards/margins": 9.029691696166992,
"rewards/rejected": -25.438209533691406,
"step": 212
},
{
"epoch": 0.531089294062646,
"grad_norm": 8.625,
"learning_rate": 3.37150918022221e-07,
"logits/chosen": 0.8799944519996643,
"logits/rejected": 0.7955228090286255,
"logps/chosen": -1.9793920516967773,
"logps/rejected": -3.432222843170166,
"loss": 0.216,
"rewards/accuracies": 0.875,
"rewards/chosen": -19.79391860961914,
"rewards/margins": 14.528306007385254,
"rewards/rejected": -34.32222366333008,
"step": 213
},
{
"epoch": 0.5335826710300764,
"grad_norm": 7.96875,
"learning_rate": 3.342973093773199e-07,
"logits/chosen": 0.9032948017120361,
"logits/rejected": 0.8324267268180847,
"logps/chosen": -1.3809956312179565,
"logps/rejected": -2.4675354957580566,
"loss": 0.3747,
"rewards/accuracies": 0.90625,
"rewards/chosen": -13.809956550598145,
"rewards/margins": 10.865400314331055,
"rewards/rejected": -24.675355911254883,
"step": 214
},
{
"epoch": 0.5360760479975066,
"grad_norm": 5.875,
"learning_rate": 3.314447462948038e-07,
"logits/chosen": 0.8150188326835632,
"logits/rejected": 0.7412484884262085,
"logps/chosen": -1.7949788570404053,
"logps/rejected": -3.1454625129699707,
"loss": 0.5245,
"rewards/accuracies": 0.875,
"rewards/chosen": -17.94978904724121,
"rewards/margins": 13.504838943481445,
"rewards/rejected": -31.454627990722656,
"step": 215
},
{
"epoch": 0.5385694249649369,
"grad_norm": 6.46875,
"learning_rate": 3.285934187123551e-07,
"logits/chosen": 0.9771428108215332,
"logits/rejected": 0.7578872442245483,
"logps/chosen": -1.5606952905654907,
"logps/rejected": -2.352637529373169,
"loss": 0.9085,
"rewards/accuracies": 0.71875,
"rewards/chosen": -15.606952667236328,
"rewards/margins": 7.9194231033325195,
"rewards/rejected": -23.526376724243164,
"step": 216
},
{
"epoch": 0.5410628019323671,
"grad_norm": 8.8125,
"learning_rate": 3.2574351648539017e-07,
"logits/chosen": 0.8879974484443665,
"logits/rejected": 0.7966049909591675,
"logps/chosen": -1.748363733291626,
"logps/rejected": -2.8166420459747314,
"loss": 0.7279,
"rewards/accuracies": 0.78125,
"rewards/chosen": -17.4836368560791,
"rewards/margins": 10.682784080505371,
"rewards/rejected": -28.16642189025879,
"step": 217
},
{
"epoch": 0.5435561788997975,
"grad_norm": 12.875,
"learning_rate": 3.228952293744181e-07,
"logits/chosen": 0.9608930349349976,
"logits/rejected": 0.7884482741355896,
"logps/chosen": -1.9304460287094116,
"logps/rejected": -3.123302459716797,
"loss": 0.6031,
"rewards/accuracies": 0.8125,
"rewards/chosen": -19.304460525512695,
"rewards/margins": 11.928570747375488,
"rewards/rejected": -31.233030319213867,
"step": 218
},
{
"epoch": 0.5460495558672277,
"grad_norm": 13.6875,
"learning_rate": 3.200487470324062e-07,
"logits/chosen": 0.9692325592041016,
"logits/rejected": 0.8839060068130493,
"logps/chosen": -1.7750276327133179,
"logps/rejected": -3.3542592525482178,
"loss": 0.4743,
"rewards/accuracies": 0.8125,
"rewards/chosen": -17.750276565551758,
"rewards/margins": 15.792311668395996,
"rewards/rejected": -33.54258728027344,
"step": 219
},
{
"epoch": 0.5485429328346579,
"grad_norm": 15.6875,
"learning_rate": 3.172042589921506e-07,
"logits/chosen": 0.9265443086624146,
"logits/rejected": 0.8019118309020996,
"logps/chosen": -1.751523494720459,
"logps/rejected": -2.899667501449585,
"loss": 0.6666,
"rewards/accuracies": 0.71875,
"rewards/chosen": -17.515233993530273,
"rewards/margins": 11.481440544128418,
"rewards/rejected": -28.996675491333008,
"step": 220
},
{
"epoch": 0.5510363098020882,
"grad_norm": 27.375,
"learning_rate": 3.1436195465365767e-07,
"logits/chosen": 0.8846260905265808,
"logits/rejected": 0.8169682621955872,
"logps/chosen": -1.4631338119506836,
"logps/rejected": -2.289689064025879,
"loss": 0.5698,
"rewards/accuracies": 0.84375,
"rewards/chosen": -14.631338119506836,
"rewards/margins": 8.265554428100586,
"rewards/rejected": -22.896892547607422,
"step": 221
},
{
"epoch": 0.5535296867695184,
"grad_norm": 13.875,
"learning_rate": 3.115220232715318e-07,
"logits/chosen": 0.8759758472442627,
"logits/rejected": 0.8523691296577454,
"logps/chosen": -1.8312069177627563,
"logps/rejected": -3.232063055038452,
"loss": 0.645,
"rewards/accuracies": 0.78125,
"rewards/chosen": -18.312068939208984,
"rewards/margins": 14.008562088012695,
"rewards/rejected": -32.32063293457031,
"step": 222
},
{
"epoch": 0.5560230637369488,
"grad_norm": 5.84375,
"learning_rate": 3.086846539423744e-07,
"logits/chosen": 0.8589959740638733,
"logits/rejected": 0.7877765893936157,
"logps/chosen": -1.3714457750320435,
"logps/rejected": -2.5221285820007324,
"loss": 0.584,
"rewards/accuracies": 0.875,
"rewards/chosen": -13.714457511901855,
"rewards/margins": 11.506828308105469,
"rewards/rejected": -25.221284866333008,
"step": 223
},
{
"epoch": 0.558516440704379,
"grad_norm": 12.75,
"learning_rate": 3.0585003559219284e-07,
"logits/chosen": 0.7336137294769287,
"logits/rejected": 0.8082336187362671,
"logps/chosen": -2.2451255321502686,
"logps/rejected": -4.2201972007751465,
"loss": 0.6203,
"rewards/accuracies": 0.8125,
"rewards/chosen": -22.451255798339844,
"rewards/margins": 19.750713348388672,
"rewards/rejected": -42.20196533203125,
"step": 224
},
{
"epoch": 0.5610098176718092,
"grad_norm": 50.0,
"learning_rate": 3.030183569638207e-07,
"logits/chosen": 0.7706287503242493,
"logits/rejected": 0.7501264810562134,
"logps/chosen": -1.5991909503936768,
"logps/rejected": -2.917886972427368,
"loss": 0.2763,
"rewards/accuracies": 0.875,
"rewards/chosen": -15.99190902709961,
"rewards/margins": 13.186960220336914,
"rewards/rejected": -29.178869247436523,
"step": 225
},
{
"epoch": 0.5635031946392395,
"grad_norm": 1.609375,
"learning_rate": 3.001898066043502e-07,
"logits/chosen": 0.9699455499649048,
"logits/rejected": 0.8485396504402161,
"logps/chosen": -2.2904715538024902,
"logps/rejected": -4.465200424194336,
"loss": 0.0169,
"rewards/accuracies": 1.0,
"rewards/chosen": -22.90471649169922,
"rewards/margins": 21.74728775024414,
"rewards/rejected": -44.652008056640625,
"step": 226
},
{
"epoch": 0.5659965716066698,
"grad_norm": 5.8125,
"learning_rate": 2.973645728525786e-07,
"logits/chosen": 0.8099995851516724,
"logits/rejected": 0.6409150958061218,
"logps/chosen": -1.6182489395141602,
"logps/rejected": -2.964346408843994,
"loss": 0.3709,
"rewards/accuracies": 0.875,
"rewards/chosen": -16.1824893951416,
"rewards/margins": 13.460972785949707,
"rewards/rejected": -29.643461227416992,
"step": 227
},
{
"epoch": 0.5684899485741001,
"grad_norm": 33.5,
"learning_rate": 2.9454284382646654e-07,
"logits/chosen": 0.8826979398727417,
"logits/rejected": 0.7478980422019958,
"logps/chosen": -1.6775342226028442,
"logps/rejected": -3.144256830215454,
"loss": 0.7344,
"rewards/accuracies": 0.84375,
"rewards/chosen": -16.77534294128418,
"rewards/margins": 14.667223930358887,
"rewards/rejected": -31.44256591796875,
"step": 228
},
{
"epoch": 0.5709833255415303,
"grad_norm": 8.0625,
"learning_rate": 2.917248074106132e-07,
"logits/chosen": 0.7391412854194641,
"logits/rejected": 0.7339631915092468,
"logps/chosen": -1.612046241760254,
"logps/rejected": -2.4834823608398438,
"loss": 0.46,
"rewards/accuracies": 0.84375,
"rewards/chosen": -16.120464324951172,
"rewards/margins": 8.714361190795898,
"rewards/rejected": -24.834821701049805,
"step": 229
},
{
"epoch": 0.5734767025089605,
"grad_norm": 10.5625,
"learning_rate": 2.889106512437452e-07,
"logits/chosen": 0.7340772151947021,
"logits/rejected": 0.8868482708930969,
"logps/chosen": -1.7300639152526855,
"logps/rejected": -2.9352312088012695,
"loss": 0.454,
"rewards/accuracies": 0.8125,
"rewards/chosen": -17.30063819885254,
"rewards/margins": 12.051673889160156,
"rewards/rejected": -29.352313995361328,
"step": 230
},
{
"epoch": 0.5759700794763908,
"grad_norm": 9.8125,
"learning_rate": 2.8610056270622344e-07,
"logits/chosen": 0.9421735405921936,
"logits/rejected": 0.7073564529418945,
"logps/chosen": -1.7943646907806396,
"logps/rejected": -3.0728399753570557,
"loss": 0.4413,
"rewards/accuracies": 0.8125,
"rewards/chosen": -17.943645477294922,
"rewards/margins": 12.784753799438477,
"rewards/rejected": -30.7283992767334,
"step": 231
},
{
"epoch": 0.5784634564438211,
"grad_norm": 14.4375,
"learning_rate": 2.8329472890756593e-07,
"logits/chosen": 0.8662580251693726,
"logits/rejected": 0.844997227191925,
"logps/chosen": -1.5802185535430908,
"logps/rejected": -2.640042304992676,
"loss": 0.8093,
"rewards/accuracies": 0.8125,
"rewards/chosen": -15.802186965942383,
"rewards/margins": 10.598236083984375,
"rewards/rejected": -26.40042495727539,
"step": 232
},
{
"epoch": 0.5809568334112514,
"grad_norm": 37.25,
"learning_rate": 2.8049333667398917e-07,
"logits/chosen": 0.9215195775032043,
"logits/rejected": 0.8155514597892761,
"logps/chosen": -2.146245241165161,
"logps/rejected": -3.7037928104400635,
"loss": 0.6597,
"rewards/accuracies": 0.875,
"rewards/chosen": -21.462451934814453,
"rewards/margins": 15.575474739074707,
"rewards/rejected": -37.037925720214844,
"step": 233
},
{
"epoch": 0.5834502103786816,
"grad_norm": 10.375,
"learning_rate": 2.776965725359684e-07,
"logits/chosen": 0.8086358308792114,
"logits/rejected": 0.7704899907112122,
"logps/chosen": -1.5882647037506104,
"logps/rejected": -2.976109743118286,
"loss": 0.6318,
"rewards/accuracies": 0.84375,
"rewards/chosen": -15.882646560668945,
"rewards/margins": 13.878451347351074,
"rewards/rejected": -29.761098861694336,
"step": 234
},
{
"epoch": 0.5859435873461118,
"grad_norm": 9.9375,
"learning_rate": 2.7490462271581774e-07,
"logits/chosen": 0.9086362719535828,
"logits/rejected": 0.8243853449821472,
"logps/chosen": -1.8874664306640625,
"logps/rejected": -3.1539669036865234,
"loss": 0.7813,
"rewards/accuracies": 0.8125,
"rewards/chosen": -18.874664306640625,
"rewards/margins": 12.665003776550293,
"rewards/rejected": -31.539669036865234,
"step": 235
},
{
"epoch": 0.5884369643135422,
"grad_norm": 9.375,
"learning_rate": 2.7211767311529e-07,
"logits/chosen": 0.8527828454971313,
"logits/rejected": 0.8761582374572754,
"logps/chosen": -1.5832545757293701,
"logps/rejected": -2.610931396484375,
"loss": 0.7349,
"rewards/accuracies": 0.75,
"rewards/chosen": -15.832547187805176,
"rewards/margins": 10.276766777038574,
"rewards/rejected": -26.10931396484375,
"step": 236
},
{
"epoch": 0.5909303412809724,
"grad_norm": 8.3125,
"learning_rate": 2.6933590930319903e-07,
"logits/chosen": 0.7043416500091553,
"logits/rejected": 0.7324919104576111,
"logps/chosen": -1.7684717178344727,
"logps/rejected": -3.180450916290283,
"loss": 0.4904,
"rewards/accuracies": 0.75,
"rewards/chosen": -17.68471908569336,
"rewards/margins": 14.11978816986084,
"rewards/rejected": -31.804506301879883,
"step": 237
},
{
"epoch": 0.5934237182484027,
"grad_norm": 7.78125,
"learning_rate": 2.665595165030632e-07,
"logits/chosen": 0.7452791929244995,
"logits/rejected": 0.7571016550064087,
"logps/chosen": -1.6962933540344238,
"logps/rejected": -4.225780963897705,
"loss": 0.0719,
"rewards/accuracies": 0.96875,
"rewards/chosen": -16.962934494018555,
"rewards/margins": 25.29487419128418,
"rewards/rejected": -42.257808685302734,
"step": 238
},
{
"epoch": 0.5959170952158329,
"grad_norm": 11.375,
"learning_rate": 2.637886795807726e-07,
"logits/chosen": 0.81926429271698,
"logits/rejected": 0.8182339072227478,
"logps/chosen": -1.742372751235962,
"logps/rejected": -3.2170917987823486,
"loss": 0.3052,
"rewards/accuracies": 0.875,
"rewards/chosen": -17.42372703552246,
"rewards/margins": 14.747193336486816,
"rewards/rejected": -32.17091751098633,
"step": 239
},
{
"epoch": 0.5984104721832632,
"grad_norm": 7.34375,
"learning_rate": 2.6102358303227965e-07,
"logits/chosen": 0.8492619395256042,
"logits/rejected": 0.8256470561027527,
"logps/chosen": -1.6656391620635986,
"logps/rejected": -3.1693525314331055,
"loss": 0.713,
"rewards/accuracies": 0.78125,
"rewards/chosen": -16.656391143798828,
"rewards/margins": 15.03713607788086,
"rewards/rejected": -31.693523406982422,
"step": 240
},
{
"epoch": 0.6009038491506935,
"grad_norm": 12.375,
"learning_rate": 2.5826441097131433e-07,
"logits/chosen": 0.7694429755210876,
"logits/rejected": 0.7062366008758545,
"logps/chosen": -1.8840895891189575,
"logps/rejected": -3.3917837142944336,
"loss": 0.4917,
"rewards/accuracies": 0.90625,
"rewards/chosen": -18.84089469909668,
"rewards/margins": 15.07693862915039,
"rewards/rejected": -33.9178352355957,
"step": 241
},
{
"epoch": 0.6033972261181237,
"grad_norm": 3.140625,
"learning_rate": 2.555113471171251e-07,
"logits/chosen": 0.7511383295059204,
"logits/rejected": 0.8419240713119507,
"logps/chosen": -1.99148428440094,
"logps/rejected": -3.644866943359375,
"loss": 0.1984,
"rewards/accuracies": 0.9375,
"rewards/chosen": -19.914844512939453,
"rewards/margins": 16.533824920654297,
"rewards/rejected": -36.44866943359375,
"step": 242
},
{
"epoch": 0.605890603085554,
"grad_norm": 34.0,
"learning_rate": 2.527645747822462e-07,
"logits/chosen": 0.7965211272239685,
"logits/rejected": 0.7004488706588745,
"logps/chosen": -1.90436851978302,
"logps/rejected": -3.2234106063842773,
"loss": 0.3643,
"rewards/accuracies": 0.90625,
"rewards/chosen": -19.043685913085938,
"rewards/margins": 13.19041919708252,
"rewards/rejected": -32.23410415649414,
"step": 243
},
{
"epoch": 0.6083839800529842,
"grad_norm": 9.125,
"learning_rate": 2.5002427686029125e-07,
"logits/chosen": 0.9241939783096313,
"logits/rejected": 0.8476071953773499,
"logps/chosen": -1.663865566253662,
"logps/rejected": -2.624908685684204,
"loss": 0.5361,
"rewards/accuracies": 0.75,
"rewards/chosen": -16.638656616210938,
"rewards/margins": 9.610431671142578,
"rewards/rejected": -26.249088287353516,
"step": 244
},
{
"epoch": 0.6108773570204146,
"grad_norm": 9.9375,
"learning_rate": 2.472906358137759e-07,
"logits/chosen": 0.7417331337928772,
"logits/rejected": 0.67648845911026,
"logps/chosen": -1.45391845703125,
"logps/rejected": -2.7644288539886475,
"loss": 0.451,
"rewards/accuracies": 0.84375,
"rewards/chosen": -14.539186477661133,
"rewards/margins": 13.105106353759766,
"rewards/rejected": -27.644290924072266,
"step": 245
},
{
"epoch": 0.6133707339878448,
"grad_norm": 13.9375,
"learning_rate": 2.445638336619681e-07,
"logits/chosen": 0.8194867968559265,
"logits/rejected": 0.7898424863815308,
"logps/chosen": -1.7679089307785034,
"logps/rejected": -3.1125354766845703,
"loss": 0.5163,
"rewards/accuracies": 0.8125,
"rewards/chosen": -17.679088592529297,
"rewards/margins": 13.446268081665039,
"rewards/rejected": -31.12535858154297,
"step": 246
},
{
"epoch": 0.615864110955275,
"grad_norm": 9.8125,
"learning_rate": 2.418440519687684e-07,
"logits/chosen": 0.9577868580818176,
"logits/rejected": 0.7647145986557007,
"logps/chosen": -1.611385703086853,
"logps/rejected": -2.760087728500366,
"loss": 0.6587,
"rewards/accuracies": 0.71875,
"rewards/chosen": -16.11385726928711,
"rewards/margins": 11.487018585205078,
"rewards/rejected": -27.600875854492188,
"step": 247
},
{
"epoch": 0.6183574879227053,
"grad_norm": 60.75,
"learning_rate": 2.391314718306212e-07,
"logits/chosen": 0.8142352104187012,
"logits/rejected": 0.7828744053840637,
"logps/chosen": -1.1173535585403442,
"logps/rejected": -1.752410650253296,
"loss": 0.6811,
"rewards/accuracies": 0.8125,
"rewards/chosen": -11.173534393310547,
"rewards/margins": 6.350571632385254,
"rewards/rejected": -17.52410888671875,
"step": 248
},
{
"epoch": 0.6208508648901355,
"grad_norm": 12.375,
"learning_rate": 2.3642627386445537e-07,
"logits/chosen": 0.8487910628318787,
"logits/rejected": 0.8330541849136353,
"logps/chosen": -1.5378450155258179,
"logps/rejected": -2.359829902648926,
"loss": 0.8157,
"rewards/accuracies": 0.75,
"rewards/chosen": -15.378450393676758,
"rewards/margins": 8.219846725463867,
"rewards/rejected": -23.598297119140625,
"step": 249
},
{
"epoch": 0.6233442418575659,
"grad_norm": 6.25,
"learning_rate": 2.3372863819565868e-07,
"logits/chosen": 0.8798298239707947,
"logits/rejected": 0.7591216564178467,
"logps/chosen": -1.6283077001571655,
"logps/rejected": -3.044259548187256,
"loss": 0.4055,
"rewards/accuracies": 0.8125,
"rewards/chosen": -16.2830753326416,
"rewards/margins": 14.159520149230957,
"rewards/rejected": -30.442594528198242,
"step": 250
},
{
"epoch": 0.6258376188249961,
"grad_norm": 6.90625,
"learning_rate": 2.310387444460842e-07,
"logits/chosen": 0.8435265421867371,
"logits/rejected": 0.6582808494567871,
"logps/chosen": -1.9542193412780762,
"logps/rejected": -3.527535915374756,
"loss": 0.2511,
"rewards/accuracies": 0.875,
"rewards/chosen": -19.542194366455078,
"rewards/margins": 15.733165740966797,
"rewards/rejected": -35.275360107421875,
"step": 251
},
{
"epoch": 0.6283309957924264,
"grad_norm": 12.5625,
"learning_rate": 2.2835677172208942e-07,
"logits/chosen": 0.9236465692520142,
"logits/rejected": 0.7837573885917664,
"logps/chosen": -1.5361783504486084,
"logps/rejected": -2.8093583583831787,
"loss": 0.4637,
"rewards/accuracies": 0.84375,
"rewards/chosen": -15.361783981323242,
"rewards/margins": 12.731797218322754,
"rewards/rejected": -28.093584060668945,
"step": 252
},
{
"epoch": 0.6308243727598566,
"grad_norm": 7.25,
"learning_rate": 2.2568289860261148e-07,
"logits/chosen": 0.8141547441482544,
"logits/rejected": 0.734942615032196,
"logps/chosen": -1.593569040298462,
"logps/rejected": -3.0460009574890137,
"loss": 0.4794,
"rewards/accuracies": 0.8125,
"rewards/chosen": -15.935691833496094,
"rewards/margins": 14.52431869506836,
"rewards/rejected": -30.46000862121582,
"step": 253
},
{
"epoch": 0.6333177497272869,
"grad_norm": 19.875,
"learning_rate": 2.2301730312727568e-07,
"logits/chosen": 0.8214707374572754,
"logits/rejected": 0.7384806871414185,
"logps/chosen": -1.9334094524383545,
"logps/rejected": -3.117767095565796,
"loss": 0.5317,
"rewards/accuracies": 0.84375,
"rewards/chosen": -19.334095001220703,
"rewards/margins": 11.843574523925781,
"rewards/rejected": -31.177671432495117,
"step": 254
},
{
"epoch": 0.6358111266947172,
"grad_norm": 4.375,
"learning_rate": 2.203601627845411e-07,
"logits/chosen": 0.9514889717102051,
"logits/rejected": 0.8385657072067261,
"logps/chosen": -2.093611001968384,
"logps/rejected": -4.076337814331055,
"loss": 0.1654,
"rewards/accuracies": 0.90625,
"rewards/chosen": -20.936111450195312,
"rewards/margins": 19.8272705078125,
"rewards/rejected": -40.76338195800781,
"step": 255
},
{
"epoch": 0.6383045036621474,
"grad_norm": 32.5,
"learning_rate": 2.1771165449988274e-07,
"logits/chosen": 1.076192855834961,
"logits/rejected": 0.8149666786193848,
"logps/chosen": -1.584862232208252,
"logps/rejected": -2.5359246730804443,
"loss": 0.4198,
"rewards/accuracies": 0.84375,
"rewards/chosen": -15.84862232208252,
"rewards/margins": 9.510624885559082,
"rewards/rejected": -25.3592472076416,
"step": 256
},
{
"epoch": 0.6407978806295777,
"grad_norm": 10.9375,
"learning_rate": 2.1507195462401042e-07,
"logits/chosen": 0.8264479041099548,
"logits/rejected": 0.8565191626548767,
"logps/chosen": -1.590366244316101,
"logps/rejected": -2.96756911277771,
"loss": 0.7281,
"rewards/accuracies": 0.65625,
"rewards/chosen": -15.903663635253906,
"rewards/margins": 13.772027969360352,
"rewards/rejected": -29.675691604614258,
"step": 257
},
{
"epoch": 0.6432912575970079,
"grad_norm": 18.75,
"learning_rate": 2.1244123892112674e-07,
"logits/chosen": 0.8875083923339844,
"logits/rejected": 0.8365639448165894,
"logps/chosen": -1.9993164539337158,
"logps/rejected": -4.3548431396484375,
"loss": 0.4588,
"rewards/accuracies": 0.84375,
"rewards/chosen": -19.993162155151367,
"rewards/margins": 23.555269241333008,
"rewards/rejected": -43.548431396484375,
"step": 258
},
{
"epoch": 0.6457846345644382,
"grad_norm": 5.09375,
"learning_rate": 2.0981968255722427e-07,
"logits/chosen": 0.9401863217353821,
"logits/rejected": 0.8267409801483154,
"logps/chosen": -1.508123755455017,
"logps/rejected": -2.8934311866760254,
"loss": 0.2964,
"rewards/accuracies": 0.84375,
"rewards/chosen": -15.08123779296875,
"rewards/margins": 13.853076934814453,
"rewards/rejected": -28.93431282043457,
"step": 259
},
{
"epoch": 0.6482780115318685,
"grad_norm": 10.125,
"learning_rate": 2.072074600884213e-07,
"logits/chosen": 0.7929245233535767,
"logits/rejected": 0.7758727669715881,
"logps/chosen": -1.806505560874939,
"logps/rejected": -3.316180944442749,
"loss": 0.6586,
"rewards/accuracies": 0.78125,
"rewards/chosen": -18.0650577545166,
"rewards/margins": 15.09675407409668,
"rewards/rejected": -33.16181182861328,
"step": 260
},
{
"epoch": 0.6507713884992987,
"grad_norm": 6.96875,
"learning_rate": 2.0460474544933978e-07,
"logits/chosen": 0.7232526540756226,
"logits/rejected": 0.7585304975509644,
"logps/chosen": -1.4770225286483765,
"logps/rejected": -2.5309412479400635,
"loss": 0.423,
"rewards/accuracies": 0.875,
"rewards/chosen": -14.770224571228027,
"rewards/margins": 10.539185523986816,
"rewards/rejected": -25.309412002563477,
"step": 261
},
{
"epoch": 0.653264765466729,
"grad_norm": 5.1875,
"learning_rate": 2.020117119415233e-07,
"logits/chosen": 0.7610968351364136,
"logits/rejected": 0.6675768494606018,
"logps/chosen": -1.518571376800537,
"logps/rejected": -2.640080690383911,
"loss": 0.3495,
"rewards/accuracies": 0.875,
"rewards/chosen": -15.185713768005371,
"rewards/margins": 11.215094566345215,
"rewards/rejected": -26.400808334350586,
"step": 262
},
{
"epoch": 0.6557581424341593,
"grad_norm": 20.125,
"learning_rate": 1.9942853222189841e-07,
"logits/chosen": 0.8614793419837952,
"logits/rejected": 0.7701671719551086,
"logps/chosen": -1.5696934461593628,
"logps/rejected": -2.8778579235076904,
"loss": 0.6096,
"rewards/accuracies": 0.875,
"rewards/chosen": -15.69693374633789,
"rewards/margins": 13.081643104553223,
"rewards/rejected": -28.778575897216797,
"step": 263
},
{
"epoch": 0.6582515194015895,
"grad_norm": 12.375,
"learning_rate": 1.968553782912778e-07,
"logits/chosen": 0.8768056631088257,
"logits/rejected": 0.8102119565010071,
"logps/chosen": -1.6998172998428345,
"logps/rejected": -2.9253602027893066,
"loss": 0.625,
"rewards/accuracies": 0.75,
"rewards/chosen": -16.998172760009766,
"rewards/margins": 12.255431175231934,
"rewards/rejected": -29.253602981567383,
"step": 264
},
{
"epoch": 0.6607448963690198,
"grad_norm": 29.0,
"learning_rate": 1.942924214829077e-07,
"logits/chosen": 0.9345517158508301,
"logits/rejected": 0.7886137962341309,
"logps/chosen": -1.9977731704711914,
"logps/rejected": -3.9683516025543213,
"loss": 0.5431,
"rewards/accuracies": 0.8125,
"rewards/chosen": -19.97772979736328,
"rewards/margins": 19.705781936645508,
"rewards/rejected": -39.68351364135742,
"step": 265
},
{
"epoch": 0.66323827333645,
"grad_norm": 4.125,
"learning_rate": 1.9173983245106005e-07,
"logits/chosen": 0.9463739395141602,
"logits/rejected": 0.8353683948516846,
"logps/chosen": -1.8554211854934692,
"logps/rejected": -3.5142271518707275,
"loss": 0.2197,
"rewards/accuracies": 0.90625,
"rewards/chosen": -18.554210662841797,
"rewards/margins": 16.58806037902832,
"rewards/rejected": -35.14227294921875,
"step": 266
},
{
"epoch": 0.6657316503038803,
"grad_norm": 31.25,
"learning_rate": 1.891977811596689e-07,
"logits/chosen": 1.0108263492584229,
"logits/rejected": 0.723067581653595,
"logps/chosen": -1.615850567817688,
"logps/rejected": -2.9190895557403564,
"loss": 0.7786,
"rewards/accuracies": 0.78125,
"rewards/chosen": -16.158506393432617,
"rewards/margins": 13.032387733459473,
"rewards/rejected": -29.19089698791504,
"step": 267
},
{
"epoch": 0.6682250272713106,
"grad_norm": 3.640625,
"learning_rate": 1.8666643687101418e-07,
"logits/chosen": 0.922001302242279,
"logits/rejected": 0.8183608651161194,
"logps/chosen": -1.845801830291748,
"logps/rejected": -3.9838411808013916,
"loss": 0.2435,
"rewards/accuracies": 0.90625,
"rewards/chosen": -18.458017349243164,
"rewards/margins": 21.380395889282227,
"rewards/rejected": -39.83841323852539,
"step": 268
},
{
"epoch": 0.6707184042387409,
"grad_norm": 9.625,
"learning_rate": 1.8414596813445047e-07,
"logits/chosen": 0.9229664206504822,
"logits/rejected": 0.8024593591690063,
"logps/chosen": -1.5461751222610474,
"logps/rejected": -2.713073968887329,
"loss": 0.4835,
"rewards/accuracies": 0.78125,
"rewards/chosen": -15.461751937866211,
"rewards/margins": 11.668989181518555,
"rewards/rejected": -27.130741119384766,
"step": 269
},
{
"epoch": 0.6732117812061711,
"grad_norm": 4.78125,
"learning_rate": 1.8163654277518476e-07,
"logits/chosen": 0.8847929835319519,
"logits/rejected": 0.7221932411193848,
"logps/chosen": -1.56783127784729,
"logps/rejected": -2.7949106693267822,
"loss": 0.357,
"rewards/accuracies": 0.875,
"rewards/chosen": -15.678312301635742,
"rewards/margins": 12.270795822143555,
"rewards/rejected": -27.949108123779297,
"step": 270
},
{
"epoch": 0.6757051581736013,
"grad_norm": 5.1875,
"learning_rate": 1.7913832788310162e-07,
"logits/chosen": 0.9237401485443115,
"logits/rejected": 0.8515968322753906,
"logps/chosen": -1.6207122802734375,
"logps/rejected": -2.975242853164673,
"loss": 0.3603,
"rewards/accuracies": 0.875,
"rewards/chosen": -16.207122802734375,
"rewards/margins": 13.54530143737793,
"rewards/rejected": -29.752422332763672,
"step": 271
},
{
"epoch": 0.6781985351410317,
"grad_norm": 24.375,
"learning_rate": 1.7665148980163747e-07,
"logits/chosen": 0.9174185991287231,
"logits/rejected": 0.8517237901687622,
"logps/chosen": -1.9268598556518555,
"logps/rejected": -3.653189182281494,
"loss": 0.5412,
"rewards/accuracies": 0.78125,
"rewards/chosen": -19.268598556518555,
"rewards/margins": 17.263296127319336,
"rewards/rejected": -36.531890869140625,
"step": 272
},
{
"epoch": 0.6806919121084619,
"grad_norm": 20.125,
"learning_rate": 1.741761941167051e-07,
"logits/chosen": 0.8469513654708862,
"logits/rejected": 0.7570927739143372,
"logps/chosen": -1.7269822359085083,
"logps/rejected": -3.0970540046691895,
"loss": 0.4379,
"rewards/accuracies": 0.84375,
"rewards/chosen": -17.26982307434082,
"rewards/margins": 13.700716018676758,
"rewards/rejected": -30.970539093017578,
"step": 273
},
{
"epoch": 0.6831852890758922,
"grad_norm": 7.65625,
"learning_rate": 1.7171260564566735e-07,
"logits/chosen": 0.853800892829895,
"logits/rejected": 0.6823726892471313,
"logps/chosen": -1.6823272705078125,
"logps/rejected": -3.1744813919067383,
"loss": 0.4328,
"rewards/accuracies": 0.84375,
"rewards/chosen": -16.823274612426758,
"rewards/margins": 14.921540260314941,
"rewards/rejected": -31.744813919067383,
"step": 274
},
{
"epoch": 0.6856786660433224,
"grad_norm": 7.5625,
"learning_rate": 1.6926088842636336e-07,
"logits/chosen": 0.8564770817756653,
"logits/rejected": 0.7224562168121338,
"logps/chosen": -1.7104108333587646,
"logps/rejected": -3.030578851699829,
"loss": 0.34,
"rewards/accuracies": 0.8125,
"rewards/chosen": -17.104108810424805,
"rewards/margins": 13.201680183410645,
"rewards/rejected": -30.3057918548584,
"step": 275
},
{
"epoch": 0.6881720430107527,
"grad_norm": 5.375,
"learning_rate": 1.6682120570618583e-07,
"logits/chosen": 0.9256489276885986,
"logits/rejected": 0.8403459787368774,
"logps/chosen": -1.7236558198928833,
"logps/rejected": -3.5449249744415283,
"loss": 0.2784,
"rewards/accuracies": 0.90625,
"rewards/chosen": -17.23655891418457,
"rewards/margins": 18.212690353393555,
"rewards/rejected": -35.449249267578125,
"step": 276
},
{
"epoch": 0.690665419978183,
"grad_norm": 6.75,
"learning_rate": 1.6439371993121142e-07,
"logits/chosen": 1.0069345235824585,
"logits/rejected": 0.8647799491882324,
"logps/chosen": -1.7778537273406982,
"logps/rejected": -3.3906145095825195,
"loss": 0.4693,
"rewards/accuracies": 0.84375,
"rewards/chosen": -17.77853775024414,
"rewards/margins": 16.127605438232422,
"rewards/rejected": -33.90614318847656,
"step": 277
},
{
"epoch": 0.6931587969456132,
"grad_norm": 7.15625,
"learning_rate": 1.61978592735384e-07,
"logits/chosen": 0.7570043802261353,
"logits/rejected": 0.7392297387123108,
"logps/chosen": -1.772491455078125,
"logps/rejected": -3.0083491802215576,
"loss": 0.3305,
"rewards/accuracies": 0.78125,
"rewards/chosen": -17.72491455078125,
"rewards/margins": 12.358576774597168,
"rewards/rejected": -30.0834903717041,
"step": 278
},
{
"epoch": 0.6956521739130435,
"grad_norm": 6.6875,
"learning_rate": 1.595759849297528e-07,
"logits/chosen": 0.9452332258224487,
"logits/rejected": 0.8405147790908813,
"logps/chosen": -1.5206505060195923,
"logps/rejected": -2.9708354473114014,
"loss": 0.7058,
"rewards/accuracies": 0.6875,
"rewards/chosen": -15.206504821777344,
"rewards/margins": 14.501848220825195,
"rewards/rejected": -29.708354949951172,
"step": 279
},
{
"epoch": 0.6981455508804737,
"grad_norm": 5.78125,
"learning_rate": 1.5718605649176415e-07,
"logits/chosen": 0.9056351780891418,
"logits/rejected": 0.759840190410614,
"logps/chosen": -1.3903148174285889,
"logps/rejected": -2.4290876388549805,
"loss": 0.4354,
"rewards/accuracies": 0.84375,
"rewards/chosen": -13.90314769744873,
"rewards/margins": 10.387725830078125,
"rewards/rejected": -24.290874481201172,
"step": 280
},
{
"epoch": 0.700638927847904,
"grad_norm": 6.0625,
"learning_rate": 1.5480896655460975e-07,
"logits/chosen": 0.8469112515449524,
"logits/rejected": 0.7188205718994141,
"logps/chosen": -1.4428998231887817,
"logps/rejected": -3.476562261581421,
"loss": 0.4048,
"rewards/accuracies": 0.8125,
"rewards/chosen": -14.428997993469238,
"rewards/margins": 20.336626052856445,
"rewards/rejected": -34.765625,
"step": 281
},
{
"epoch": 0.7031323048153343,
"grad_norm": 6.78125,
"learning_rate": 1.5244487339663086e-07,
"logits/chosen": 0.9786227941513062,
"logits/rejected": 0.9008299112319946,
"logps/chosen": -2.115980386734009,
"logps/rejected": -3.8103702068328857,
"loss": 0.3597,
"rewards/accuracies": 0.90625,
"rewards/chosen": -21.159805297851562,
"rewards/margins": 16.943897247314453,
"rewards/rejected": -38.103702545166016,
"step": 282
},
{
"epoch": 0.7056256817827645,
"grad_norm": 9.3125,
"learning_rate": 1.5009393443077906e-07,
"logits/chosen": 0.9762454032897949,
"logits/rejected": 0.8306148648262024,
"logps/chosen": -1.981116771697998,
"logps/rejected": -3.2973973751068115,
"loss": 0.4867,
"rewards/accuracies": 0.8125,
"rewards/chosen": -19.811168670654297,
"rewards/margins": 13.16280460357666,
"rewards/rejected": -32.973976135253906,
"step": 283
},
{
"epoch": 0.7081190587501948,
"grad_norm": 22.625,
"learning_rate": 1.477563061941355e-07,
"logits/chosen": 1.017063856124878,
"logits/rejected": 0.7016565799713135,
"logps/chosen": -1.3156154155731201,
"logps/rejected": -2.303849458694458,
"loss": 0.6619,
"rewards/accuracies": 0.8125,
"rewards/chosen": -13.15615463256836,
"rewards/margins": 9.882339477539062,
"rewards/rejected": -23.038494110107422,
"step": 284
},
{
"epoch": 0.7106124357176251,
"grad_norm": 12.9375,
"learning_rate": 1.4543214433748714e-07,
"logits/chosen": 1.039493203163147,
"logits/rejected": 0.8438839912414551,
"logps/chosen": -1.7385656833648682,
"logps/rejected": -3.1600584983825684,
"loss": 0.4472,
"rewards/accuracies": 0.84375,
"rewards/chosen": -17.385656356811523,
"rewards/margins": 14.21492862701416,
"rewards/rejected": -31.6005859375,
"step": 285
},
{
"epoch": 0.7131058126850554,
"grad_norm": 6.125,
"learning_rate": 1.4312160361496325e-07,
"logits/chosen": 0.880534291267395,
"logits/rejected": 0.8419840335845947,
"logps/chosen": -1.7119375467300415,
"logps/rejected": -3.064938545227051,
"loss": 0.5029,
"rewards/accuracies": 0.84375,
"rewards/chosen": -17.11937713623047,
"rewards/margins": 13.530012130737305,
"rewards/rejected": -30.64938735961914,
"step": 286
},
{
"epoch": 0.7155991896524856,
"grad_norm": 8.8125,
"learning_rate": 1.4082483787373093e-07,
"logits/chosen": 0.8826863765716553,
"logits/rejected": 0.8228853940963745,
"logps/chosen": -1.5570282936096191,
"logps/rejected": -2.6813974380493164,
"loss": 0.8264,
"rewards/accuracies": 0.75,
"rewards/chosen": -15.570282936096191,
"rewards/margins": 11.243692398071289,
"rewards/rejected": -26.813976287841797,
"step": 287
},
{
"epoch": 0.7180925666199158,
"grad_norm": 7.6875,
"learning_rate": 1.3854200004375123e-07,
"logits/chosen": 0.752357542514801,
"logits/rejected": 0.7416955828666687,
"logps/chosen": -1.8245292901992798,
"logps/rejected": -3.410393714904785,
"loss": 0.2918,
"rewards/accuracies": 0.90625,
"rewards/chosen": -18.24529266357422,
"rewards/margins": 15.858641624450684,
"rewards/rejected": -34.10393142700195,
"step": 288
},
{
"epoch": 0.7205859435873461,
"grad_norm": 6.375,
"learning_rate": 1.3627324212759662e-07,
"logits/chosen": 0.9414355754852295,
"logits/rejected": 0.7949234843254089,
"logps/chosen": -1.5395060777664185,
"logps/rejected": -2.7976861000061035,
"loss": 0.5103,
"rewards/accuracies": 0.875,
"rewards/chosen": -15.395059585571289,
"rewards/margins": 12.581799507141113,
"rewards/rejected": -27.97686195373535,
"step": 289
},
{
"epoch": 0.7230793205547764,
"grad_norm": 13.375,
"learning_rate": 1.3401871519032942e-07,
"logits/chosen": 0.7719554305076599,
"logits/rejected": 0.8289276957511902,
"logps/chosen": -1.5537012815475464,
"logps/rejected": -2.931002140045166,
"loss": 0.4564,
"rewards/accuracies": 0.78125,
"rewards/chosen": -15.537013053894043,
"rewards/margins": 13.7730073928833,
"rewards/rejected": -29.310020446777344,
"step": 290
},
{
"epoch": 0.7255726975222067,
"grad_norm": 12.0,
"learning_rate": 1.317785693494433e-07,
"logits/chosen": 0.906543493270874,
"logits/rejected": 0.8372653126716614,
"logps/chosen": -1.877508282661438,
"logps/rejected": -3.658639669418335,
"loss": 0.5423,
"rewards/accuracies": 0.78125,
"rewards/chosen": -18.775081634521484,
"rewards/margins": 17.811315536499023,
"rewards/rejected": -36.58639907836914,
"step": 291
},
{
"epoch": 0.7280660744896369,
"grad_norm": 20.25,
"learning_rate": 1.2955295376486793e-07,
"logits/chosen": 0.9387526512145996,
"logits/rejected": 0.8902648687362671,
"logps/chosen": -1.6775869131088257,
"logps/rejected": -3.076120138168335,
"loss": 0.8689,
"rewards/accuracies": 0.75,
"rewards/chosen": -16.775869369506836,
"rewards/margins": 13.985333442687988,
"rewards/rejected": -30.76120376586914,
"step": 292
},
{
"epoch": 0.7305594514570671,
"grad_norm": 14.0625,
"learning_rate": 1.273420166290371e-07,
"logits/chosen": 0.771159827709198,
"logits/rejected": 0.7604851126670837,
"logps/chosen": -1.4995477199554443,
"logps/rejected": -2.7952613830566406,
"loss": 0.5616,
"rewards/accuracies": 0.78125,
"rewards/chosen": -14.995477676391602,
"rewards/margins": 12.957136154174805,
"rewards/rejected": -27.952613830566406,
"step": 293
},
{
"epoch": 0.7330528284244975,
"grad_norm": 5.625,
"learning_rate": 1.2514590515702093e-07,
"logits/chosen": 0.9259358048439026,
"logits/rejected": 0.8557572364807129,
"logps/chosen": -1.718395471572876,
"logps/rejected": -3.260573387145996,
"loss": 0.4787,
"rewards/accuracies": 0.84375,
"rewards/chosen": -17.183956146240234,
"rewards/margins": 15.421775817871094,
"rewards/rejected": -32.60573196411133,
"step": 294
},
{
"epoch": 0.7355462053919277,
"grad_norm": 3.984375,
"learning_rate": 1.2296476557672452e-07,
"logits/chosen": 0.9226200580596924,
"logits/rejected": 0.7464591264724731,
"logps/chosen": -1.7733253240585327,
"logps/rejected": -3.0839881896972656,
"loss": 0.4696,
"rewards/accuracies": 0.875,
"rewards/chosen": -17.733253479003906,
"rewards/margins": 13.10662841796875,
"rewards/rejected": -30.839881896972656,
"step": 295
},
{
"epoch": 0.738039582359358,
"grad_norm": 12.8125,
"learning_rate": 1.2079874311915026e-07,
"logits/chosen": 0.9862551689147949,
"logits/rejected": 0.8426701426506042,
"logps/chosen": -1.5399045944213867,
"logps/rejected": -2.9282631874084473,
"loss": 0.5123,
"rewards/accuracies": 0.78125,
"rewards/chosen": -15.399044036865234,
"rewards/margins": 13.883587837219238,
"rewards/rejected": -29.282634735107422,
"step": 296
},
{
"epoch": 0.7405329593267882,
"grad_norm": 11.3125,
"learning_rate": 1.1864798200872824e-07,
"logits/chosen": 0.9428563714027405,
"logits/rejected": 0.7972367405891418,
"logps/chosen": -1.6001325845718384,
"logps/rejected": -3.5637686252593994,
"loss": 0.2544,
"rewards/accuracies": 0.90625,
"rewards/chosen": -16.001325607299805,
"rewards/margins": 19.63636016845703,
"rewards/rejected": -35.6376838684082,
"step": 297
},
{
"epoch": 0.7430263362942184,
"grad_norm": 27.75,
"learning_rate": 1.1651262545371318e-07,
"logits/chosen": 0.8185573816299438,
"logits/rejected": 0.8264700174331665,
"logps/chosen": -1.9273895025253296,
"logps/rejected": -3.5442724227905273,
"loss": 0.3429,
"rewards/accuracies": 0.90625,
"rewards/chosen": -19.273895263671875,
"rewards/margins": 16.16883087158203,
"rewards/rejected": -35.442726135253906,
"step": 298
},
{
"epoch": 0.7455197132616488,
"grad_norm": 10.25,
"learning_rate": 1.1439281563664836e-07,
"logits/chosen": 0.8733742833137512,
"logits/rejected": 0.8228683471679688,
"logps/chosen": -2.0226247310638428,
"logps/rejected": -3.6978607177734375,
"loss": 0.2416,
"rewards/accuracies": 0.9375,
"rewards/chosen": -20.226245880126953,
"rewards/margins": 16.75235939025879,
"rewards/rejected": -36.978607177734375,
"step": 299
},
{
"epoch": 0.748013090229079,
"grad_norm": 23.75,
"learning_rate": 1.1228869370489933e-07,
"logits/chosen": 0.8455230593681335,
"logits/rejected": 0.726607620716095,
"logps/chosen": -1.7042028903961182,
"logps/rejected": -2.9396235942840576,
"loss": 0.6624,
"rewards/accuracies": 0.8125,
"rewards/chosen": -17.042028427124023,
"rewards/margins": 12.354209899902344,
"rewards/rejected": -29.396238327026367,
"step": 300
},
{
"epoch": 0.7505064671965093,
"grad_norm": 16.625,
"learning_rate": 1.1020039976125454e-07,
"logits/chosen": 0.862872838973999,
"logits/rejected": 0.7240791320800781,
"logps/chosen": -1.6873464584350586,
"logps/rejected": -3.173642635345459,
"loss": 0.4094,
"rewards/accuracies": 0.84375,
"rewards/chosen": -16.873464584350586,
"rewards/margins": 14.862963676452637,
"rewards/rejected": -31.73642921447754,
"step": 301
},
{
"epoch": 0.7529998441639395,
"grad_norm": 20.625,
"learning_rate": 1.0812807285459737e-07,
"logits/chosen": 0.8827072978019714,
"logits/rejected": 0.7801661491394043,
"logps/chosen": -1.760999321937561,
"logps/rejected": -3.0102696418762207,
"loss": 0.1915,
"rewards/accuracies": 0.9375,
"rewards/chosen": -17.60999298095703,
"rewards/margins": 12.492703437805176,
"rewards/rejected": -30.10269546508789,
"step": 302
},
{
"epoch": 0.7554932211313699,
"grad_norm": 9.875,
"learning_rate": 1.0607185097064733e-07,
"logits/chosen": 0.9539688229560852,
"logits/rejected": 0.8203067183494568,
"logps/chosen": -1.5383775234222412,
"logps/rejected": -2.5994794368743896,
"loss": 0.6321,
"rewards/accuracies": 0.78125,
"rewards/chosen": -15.38377571105957,
"rewards/margins": 10.61102294921875,
"rewards/rejected": -25.994796752929688,
"step": 303
},
{
"epoch": 0.7579865980988001,
"grad_norm": 10.375,
"learning_rate": 1.0403187102277212e-07,
"logits/chosen": 0.9740419387817383,
"logits/rejected": 0.7236615419387817,
"logps/chosen": -1.680724024772644,
"logps/rejected": -3.213937759399414,
"loss": 0.5017,
"rewards/accuracies": 0.75,
"rewards/chosen": -16.807239532470703,
"rewards/margins": 15.332136154174805,
"rewards/rejected": -32.13937759399414,
"step": 304
},
{
"epoch": 0.7604799750662303,
"grad_norm": 8.5,
"learning_rate": 1.020082688428718e-07,
"logits/chosen": 0.7849897146224976,
"logits/rejected": 0.7147915959358215,
"logps/chosen": -1.7177461385726929,
"logps/rejected": -3.196275234222412,
"loss": 0.4481,
"rewards/accuracies": 0.84375,
"rewards/chosen": -17.17746353149414,
"rewards/margins": 14.785287857055664,
"rewards/rejected": -31.962751388549805,
"step": 305
},
{
"epoch": 0.7629733520336606,
"grad_norm": 6.84375,
"learning_rate": 1.0000117917233373e-07,
"logits/chosen": 0.7844271659851074,
"logits/rejected": 0.795640230178833,
"logps/chosen": -1.8986274003982544,
"logps/rejected": -3.815453052520752,
"loss": 0.2918,
"rewards/accuracies": 0.875,
"rewards/chosen": -18.98627471923828,
"rewards/margins": 19.168254852294922,
"rewards/rejected": -38.1545295715332,
"step": 306
},
{
"epoch": 0.7654667290010908,
"grad_norm": 22.375,
"learning_rate": 9.801073565306134e-08,
"logits/chosen": 0.915310800075531,
"logits/rejected": 0.8614601492881775,
"logps/chosen": -1.577059030532837,
"logps/rejected": -2.661583185195923,
"loss": 0.7258,
"rewards/accuracies": 0.71875,
"rewards/chosen": -15.770591735839844,
"rewards/margins": 10.84524154663086,
"rewards/rejected": -26.61583137512207,
"step": 307
},
{
"epoch": 0.7679601059685212,
"grad_norm": 11.5,
"learning_rate": 9.603707081857533e-08,
"logits/chosen": 0.8341223001480103,
"logits/rejected": 0.7446467876434326,
"logps/chosen": -2.0905356407165527,
"logps/rejected": -3.864170551300049,
"loss": 0.2911,
"rewards/accuracies": 0.875,
"rewards/chosen": -20.90535545349121,
"rewards/margins": 17.73634910583496,
"rewards/rejected": -38.64170837402344,
"step": 308
},
{
"epoch": 0.7704534829359514,
"grad_norm": 9.4375,
"learning_rate": 9.40803160851891e-08,
"logits/chosen": 0.9718061685562134,
"logits/rejected": 0.9494335651397705,
"logps/chosen": -1.6537656784057617,
"logps/rejected": -3.119168758392334,
"loss": 0.9953,
"rewards/accuracies": 0.78125,
"rewards/chosen": -16.537656784057617,
"rewards/margins": 14.654030799865723,
"rewards/rejected": -31.191692352294922,
"step": 309
},
{
"epoch": 0.7729468599033816,
"grad_norm": 6.1875,
"learning_rate": 9.214060174325823e-08,
"logits/chosen": 0.7993795871734619,
"logits/rejected": 0.7918787002563477,
"logps/chosen": -1.9169942140579224,
"logps/rejected": -3.608771800994873,
"loss": 0.4286,
"rewards/accuracies": 0.9375,
"rewards/chosen": -19.16994285583496,
"rewards/margins": 16.91777229309082,
"rewards/rejected": -36.08771514892578,
"step": 310
},
{
"epoch": 0.7754402368708119,
"grad_norm": 76.0,
"learning_rate": 9.021805694850552e-08,
"logits/chosen": 0.7791964411735535,
"logits/rejected": 0.6525046229362488,
"logps/chosen": -1.878448724746704,
"logps/rejected": -3.2219111919403076,
"loss": 0.3889,
"rewards/accuracies": 0.96875,
"rewards/chosen": -18.784488677978516,
"rewards/margins": 13.434623718261719,
"rewards/rejected": -32.21910858154297,
"step": 311
},
{
"epoch": 0.7779336138382422,
"grad_norm": 4.09375,
"learning_rate": 8.831280971342049e-08,
"logits/chosen": 0.8384397625923157,
"logits/rejected": 0.8411078453063965,
"logps/chosen": -1.9580962657928467,
"logps/rejected": -3.684387445449829,
"loss": 0.4734,
"rewards/accuracies": 0.875,
"rewards/chosen": -19.580963134765625,
"rewards/margins": 17.26291275024414,
"rewards/rejected": -36.8438720703125,
"step": 312
},
{
"epoch": 0.7804269908056725,
"grad_norm": 7.46875,
"learning_rate": 8.642498689873619e-08,
"logits/chosen": 0.9194357395172119,
"logits/rejected": 0.7971946597099304,
"logps/chosen": -1.6777794361114502,
"logps/rejected": -2.920085906982422,
"loss": 0.6,
"rewards/accuracies": 0.84375,
"rewards/chosen": -16.777795791625977,
"rewards/margins": 12.423064231872559,
"rewards/rejected": -29.20086097717285,
"step": 313
},
{
"epoch": 0.7829203677731027,
"grad_norm": 25.0,
"learning_rate": 8.45547142049821e-08,
"logits/chosen": 0.8890621066093445,
"logits/rejected": 0.6691703796386719,
"logps/chosen": -1.6438565254211426,
"logps/rejected": -3.2583494186401367,
"loss": 0.2676,
"rewards/accuracies": 0.875,
"rewards/chosen": -16.43856430053711,
"rewards/margins": 16.14493179321289,
"rewards/rejected": -32.58349609375,
"step": 314
},
{
"epoch": 0.7854137447405329,
"grad_norm": 36.75,
"learning_rate": 8.270211616411413e-08,
"logits/chosen": 0.8961160182952881,
"logits/rejected": 0.7380497455596924,
"logps/chosen": -1.8019180297851562,
"logps/rejected": -3.853311061859131,
"loss": 0.4376,
"rewards/accuracies": 0.84375,
"rewards/chosen": -18.019180297851562,
"rewards/margins": 20.513931274414062,
"rewards/rejected": -38.533111572265625,
"step": 315
},
{
"epoch": 0.7879071217079632,
"grad_norm": 4.9375,
"learning_rate": 8.086731613122324e-08,
"logits/chosen": 0.8375217914581299,
"logits/rejected": 0.706248939037323,
"logps/chosen": -1.8641583919525146,
"logps/rejected": -3.3603389263153076,
"loss": 0.203,
"rewards/accuracies": 0.90625,
"rewards/chosen": -18.641584396362305,
"rewards/margins": 14.961803436279297,
"rewards/rejected": -33.60338592529297,
"step": 316
},
{
"epoch": 0.7904004986753935,
"grad_norm": 3.71875,
"learning_rate": 7.905043627632113e-08,
"logits/chosen": 0.7290425300598145,
"logits/rejected": 0.7092160582542419,
"logps/chosen": -1.6382381916046143,
"logps/rejected": -3.2959959506988525,
"loss": 0.2101,
"rewards/accuracies": 0.875,
"rewards/chosen": -16.382381439208984,
"rewards/margins": 16.577579498291016,
"rewards/rejected": -32.9599609375,
"step": 317
},
{
"epoch": 0.7928938756428238,
"grad_norm": 6.71875,
"learning_rate": 7.725159757620596e-08,
"logits/chosen": 0.9056103825569153,
"logits/rejected": 0.8917890787124634,
"logps/chosen": -1.4776190519332886,
"logps/rejected": -2.642958641052246,
"loss": 0.46,
"rewards/accuracies": 0.84375,
"rewards/chosen": -14.776190757751465,
"rewards/margins": 11.653392791748047,
"rewards/rejected": -26.429582595825195,
"step": 318
},
{
"epoch": 0.795387252610254,
"grad_norm": 7.28125,
"learning_rate": 7.547091980640708e-08,
"logits/chosen": 0.7614390850067139,
"logits/rejected": 0.7574427127838135,
"logps/chosen": -1.3012231588363647,
"logps/rejected": -2.685375928878784,
"loss": 0.4226,
"rewards/accuracies": 0.84375,
"rewards/chosen": -13.012231826782227,
"rewards/margins": 13.841525077819824,
"rewards/rejected": -26.853755950927734,
"step": 319
},
{
"epoch": 0.7978806295776842,
"grad_norm": 18.625,
"learning_rate": 7.370852153320973e-08,
"logits/chosen": 0.9617218971252441,
"logits/rejected": 0.7541022896766663,
"logps/chosen": -1.5465266704559326,
"logps/rejected": -2.6077213287353516,
"loss": 0.6237,
"rewards/accuracies": 0.8125,
"rewards/chosen": -15.465266227722168,
"rewards/margins": 10.611949920654297,
"rewards/rejected": -26.07721710205078,
"step": 320
},
{
"epoch": 0.8003740065451146,
"grad_norm": 4.96875,
"learning_rate": 7.196452010576056e-08,
"logits/chosen": 0.8094066381454468,
"logits/rejected": 0.7924161553382874,
"logps/chosen": -2.0370168685913086,
"logps/rejected": -3.8806991577148438,
"loss": 0.2498,
"rewards/accuracies": 0.90625,
"rewards/chosen": -20.370168685913086,
"rewards/margins": 18.436824798583984,
"rewards/rejected": -38.80699157714844,
"step": 321
},
{
"epoch": 0.8028673835125448,
"grad_norm": 11.625,
"learning_rate": 7.023903164825346e-08,
"logits/chosen": 0.9718628525733948,
"logits/rejected": 0.8176442384719849,
"logps/chosen": -2.1258928775787354,
"logps/rejected": -4.15927267074585,
"loss": 0.6349,
"rewards/accuracies": 0.875,
"rewards/chosen": -21.258926391601562,
"rewards/margins": 20.333797454833984,
"rewards/rejected": -41.59272384643555,
"step": 322
},
{
"epoch": 0.8053607604799751,
"grad_norm": 5.78125,
"learning_rate": 6.853217105219782e-08,
"logits/chosen": 0.7881964445114136,
"logits/rejected": 0.6961764693260193,
"logps/chosen": -1.541295051574707,
"logps/rejected": -2.8516957759857178,
"loss": 0.2766,
"rewards/accuracies": 0.90625,
"rewards/chosen": -15.412951469421387,
"rewards/margins": 13.104007720947266,
"rewards/rejected": -28.516956329345703,
"step": 323
},
{
"epoch": 0.8078541374474053,
"grad_norm": 6.96875,
"learning_rate": 6.684405196876843e-08,
"logits/chosen": 0.9054229259490967,
"logits/rejected": 0.799680233001709,
"logps/chosen": -1.280112624168396,
"logps/rejected": -2.1591079235076904,
"loss": 0.6875,
"rewards/accuracies": 0.78125,
"rewards/chosen": -12.801126480102539,
"rewards/margins": 8.789952278137207,
"rewards/rejected": -21.591079711914062,
"step": 324
},
{
"epoch": 0.8103475144148355,
"grad_norm": 11.875,
"learning_rate": 6.517478680123776e-08,
"logits/chosen": 0.8642288446426392,
"logits/rejected": 0.825298547744751,
"logps/chosen": -1.4944114685058594,
"logps/rejected": -2.5201451778411865,
"loss": 0.7636,
"rewards/accuracies": 0.71875,
"rewards/chosen": -14.944114685058594,
"rewards/margins": 10.257339477539062,
"rewards/rejected": -25.201452255249023,
"step": 325
},
{
"epoch": 0.8128408913822659,
"grad_norm": 7.59375,
"learning_rate": 6.352448669749224e-08,
"logits/chosen": 0.9343512654304504,
"logits/rejected": 0.8261175155639648,
"logps/chosen": -2.1113977432250977,
"logps/rejected": -4.173766136169434,
"loss": 0.3753,
"rewards/accuracies": 0.875,
"rewards/chosen": -21.11397933959961,
"rewards/margins": 20.623685836791992,
"rewards/rejected": -41.73766326904297,
"step": 326
},
{
"epoch": 0.8153342683496961,
"grad_norm": 5.25,
"learning_rate": 6.189326154263068e-08,
"logits/chosen": 0.7759539484977722,
"logits/rejected": 0.7987840175628662,
"logps/chosen": -1.8643192052841187,
"logps/rejected": -3.6068685054779053,
"loss": 0.4069,
"rewards/accuracies": 0.84375,
"rewards/chosen": -18.643192291259766,
"rewards/margins": 17.425495147705078,
"rewards/rejected": -36.068687438964844,
"step": 327
},
{
"epoch": 0.8178276453171264,
"grad_norm": 22.875,
"learning_rate": 6.028121995164812e-08,
"logits/chosen": 0.8969675302505493,
"logits/rejected": 0.7524930238723755,
"logps/chosen": -1.4999438524246216,
"logps/rejected": -2.7047371864318848,
"loss": 0.6928,
"rewards/accuracies": 0.78125,
"rewards/chosen": -14.999438285827637,
"rewards/margins": 12.047935485839844,
"rewards/rejected": -27.04737091064453,
"step": 328
},
{
"epoch": 0.8203210222845566,
"grad_norm": 5.0,
"learning_rate": 5.868846926220346e-08,
"logits/chosen": 0.9210751056671143,
"logits/rejected": 0.8755130767822266,
"logps/chosen": -2.071080446243286,
"logps/rejected": -4.128433704376221,
"loss": 0.3026,
"rewards/accuracies": 0.875,
"rewards/chosen": -20.710805892944336,
"rewards/margins": 20.573535919189453,
"rewards/rejected": -41.284339904785156,
"step": 329
},
{
"epoch": 0.822814399251987,
"grad_norm": 6.375,
"learning_rate": 5.7115115527472575e-08,
"logits/chosen": 0.746177077293396,
"logits/rejected": 0.7545452117919922,
"logps/chosen": -1.6824297904968262,
"logps/rejected": -2.987593650817871,
"loss": 0.3245,
"rewards/accuracies": 0.875,
"rewards/chosen": -16.824296951293945,
"rewards/margins": 13.051637649536133,
"rewards/rejected": -29.875934600830078,
"step": 330
},
{
"epoch": 0.8253077762194172,
"grad_norm": 9.1875,
"learning_rate": 5.556126350908654e-08,
"logits/chosen": 0.8064150810241699,
"logits/rejected": 0.7365544438362122,
"logps/chosen": -1.7546963691711426,
"logps/rejected": -3.0438730716705322,
"loss": 0.4564,
"rewards/accuracies": 0.84375,
"rewards/chosen": -17.54696273803711,
"rewards/margins": 12.891766548156738,
"rewards/rejected": -30.438732147216797,
"step": 331
},
{
"epoch": 0.8278011531868474,
"grad_norm": 40.75,
"learning_rate": 5.402701667015655e-08,
"logits/chosen": 0.8081064820289612,
"logits/rejected": 0.8690564632415771,
"logps/chosen": -2.0253567695617676,
"logps/rejected": -3.4268417358398438,
"loss": 0.591,
"rewards/accuracies": 0.75,
"rewards/chosen": -20.253568649291992,
"rewards/margins": 14.014848709106445,
"rewards/rejected": -34.26841735839844,
"step": 332
},
{
"epoch": 0.8302945301542777,
"grad_norm": 5.90625,
"learning_rate": 5.2512477168384125e-08,
"logits/chosen": 0.7250826954841614,
"logits/rejected": 0.6569004654884338,
"logps/chosen": -1.7179774045944214,
"logps/rejected": -3.111191987991333,
"loss": 0.4474,
"rewards/accuracies": 0.875,
"rewards/chosen": -17.179773330688477,
"rewards/margins": 13.932147979736328,
"rewards/rejected": -31.111919403076172,
"step": 333
},
{
"epoch": 0.8327879071217079,
"grad_norm": 5.5625,
"learning_rate": 5.101774584925959e-08,
"logits/chosen": 0.7951087355613708,
"logits/rejected": 0.7745989561080933,
"logps/chosen": -1.7096567153930664,
"logps/rejected": -3.168893814086914,
"loss": 0.3182,
"rewards/accuracies": 0.84375,
"rewards/chosen": -17.096567153930664,
"rewards/margins": 14.592374801635742,
"rewards/rejected": -31.688940048217773,
"step": 334
},
{
"epoch": 0.8352812840891383,
"grad_norm": 11.8125,
"learning_rate": 4.9542922239346865e-08,
"logits/chosen": 0.9192527532577515,
"logits/rejected": 0.8001135587692261,
"logps/chosen": -1.9547454118728638,
"logps/rejected": -3.6320178508758545,
"loss": 0.2258,
"rewards/accuracies": 0.875,
"rewards/chosen": -19.547454833984375,
"rewards/margins": 16.772724151611328,
"rewards/rejected": -36.3201789855957,
"step": 335
},
{
"epoch": 0.8377746610565685,
"grad_norm": 9.1875,
"learning_rate": 4.8088104539656715e-08,
"logits/chosen": 0.7919576168060303,
"logits/rejected": 0.8408181667327881,
"logps/chosen": -1.7132326364517212,
"logps/rejected": -3.2617900371551514,
"loss": 0.6176,
"rewards/accuracies": 0.78125,
"rewards/chosen": -17.132326126098633,
"rewards/margins": 15.485575675964355,
"rewards/rejected": -32.61790084838867,
"step": 336
},
{
"epoch": 0.8402680380239987,
"grad_norm": 10.8125,
"learning_rate": 4.665338961910819e-08,
"logits/chosen": 0.9263704419136047,
"logits/rejected": 0.9631155729293823,
"logps/chosen": -1.8150866031646729,
"logps/rejected": -3.4056591987609863,
"loss": 0.3432,
"rewards/accuracies": 0.8125,
"rewards/chosen": -18.15086555480957,
"rewards/margins": 15.905729293823242,
"rewards/rejected": -34.05659103393555,
"step": 337
},
{
"epoch": 0.842761414991429,
"grad_norm": 14.8125,
"learning_rate": 4.5238873008078036e-08,
"logits/chosen": 0.92448890209198,
"logits/rejected": 0.9032832980155945,
"logps/chosen": -1.8629943132400513,
"logps/rejected": -3.826144218444824,
"loss": 0.4574,
"rewards/accuracies": 0.84375,
"rewards/chosen": -18.62994384765625,
"rewards/margins": 19.631500244140625,
"rewards/rejected": -38.261444091796875,
"step": 338
},
{
"epoch": 0.8452547919588593,
"grad_norm": 5.9375,
"learning_rate": 4.38446488920405e-08,
"logits/chosen": 0.7789740562438965,
"logits/rejected": 0.7178948521614075,
"logps/chosen": -1.713463544845581,
"logps/rejected": -3.125110149383545,
"loss": 0.1509,
"rewards/accuracies": 0.96875,
"rewards/chosen": -17.13463592529297,
"rewards/margins": 14.11646556854248,
"rewards/rejected": -31.251100540161133,
"step": 339
},
{
"epoch": 0.8477481689262896,
"grad_norm": 11.5,
"learning_rate": 4.247081010529546e-08,
"logits/chosen": 0.7394505739212036,
"logits/rejected": 0.7483058571815491,
"logps/chosen": -1.7241424322128296,
"logps/rejected": -2.999985456466675,
"loss": 0.9044,
"rewards/accuracies": 0.71875,
"rewards/chosen": -17.241424560546875,
"rewards/margins": 12.758427619934082,
"rewards/rejected": -29.999855041503906,
"step": 340
},
{
"epoch": 0.8502415458937198,
"grad_norm": 5.34375,
"learning_rate": 4.1117448124787594e-08,
"logits/chosen": 0.8453940153121948,
"logits/rejected": 0.7934137582778931,
"logps/chosen": -1.7638615369796753,
"logps/rejected": -3.4477648735046387,
"loss": 0.3345,
"rewards/accuracies": 0.875,
"rewards/chosen": -17.63861656188965,
"rewards/margins": 16.839033126831055,
"rewards/rejected": -34.4776496887207,
"step": 341
},
{
"epoch": 0.85273492286115,
"grad_norm": 8.9375,
"learning_rate": 3.9784653064014826e-08,
"logits/chosen": 0.9908114671707153,
"logits/rejected": 0.7525830268859863,
"logps/chosen": -1.7432222366333008,
"logps/rejected": -3.2565882205963135,
"loss": 0.7245,
"rewards/accuracies": 0.71875,
"rewards/chosen": -17.432220458984375,
"rewards/margins": 15.133658409118652,
"rewards/rejected": -32.565879821777344,
"step": 342
},
{
"epoch": 0.8552282998285803,
"grad_norm": 6.96875,
"learning_rate": 3.8472513667028556e-08,
"logits/chosen": 0.9397574067115784,
"logits/rejected": 0.7737162709236145,
"logps/chosen": -1.6045491695404053,
"logps/rejected": -2.63539981842041,
"loss": 0.4466,
"rewards/accuracies": 0.875,
"rewards/chosen": -16.04549217224121,
"rewards/margins": 10.30850601196289,
"rewards/rejected": -26.35399627685547,
"step": 343
},
{
"epoch": 0.8577216767960106,
"grad_norm": 4.8125,
"learning_rate": 3.7181117302524304e-08,
"logits/chosen": 1.0774602890014648,
"logits/rejected": 0.7839712500572205,
"logps/chosen": -1.9381461143493652,
"logps/rejected": -3.418266773223877,
"loss": 0.4184,
"rewards/accuracies": 0.875,
"rewards/chosen": -19.381460189819336,
"rewards/margins": 14.801210403442383,
"rewards/rejected": -34.18267059326172,
"step": 344
},
{
"epoch": 0.8602150537634409,
"grad_norm": 11.5,
"learning_rate": 3.591054995802462e-08,
"logits/chosen": 0.8521052598953247,
"logits/rejected": 0.8215041756629944,
"logps/chosen": -1.5074630975723267,
"logps/rejected": -2.6447994709014893,
"loss": 0.7877,
"rewards/accuracies": 0.78125,
"rewards/chosen": -15.074629783630371,
"rewards/margins": 11.373364448547363,
"rewards/rejected": -26.447994232177734,
"step": 345
},
{
"epoch": 0.8627084307308711,
"grad_norm": 10.3125,
"learning_rate": 3.466089623415333e-08,
"logits/chosen": 0.8286025524139404,
"logits/rejected": 0.7513220310211182,
"logps/chosen": -2.0260732173919678,
"logps/rejected": -3.53950572013855,
"loss": 0.5153,
"rewards/accuracies": 0.8125,
"rewards/chosen": -20.260732650756836,
"rewards/margins": 15.134326934814453,
"rewards/rejected": -35.39506149291992,
"step": 346
},
{
"epoch": 0.8652018076983014,
"grad_norm": 4.40625,
"learning_rate": 3.3432239339002654e-08,
"logits/chosen": 0.6205800175666809,
"logits/rejected": 0.8249342441558838,
"logps/chosen": -1.9891235828399658,
"logps/rejected": -3.901944398880005,
"loss": 0.3429,
"rewards/accuracies": 0.875,
"rewards/chosen": -19.891237258911133,
"rewards/margins": 19.12820816040039,
"rewards/rejected": -39.01944351196289,
"step": 347
},
{
"epoch": 0.8676951846657317,
"grad_norm": 6.34375,
"learning_rate": 3.222466108259252e-08,
"logits/chosen": 0.9737166166305542,
"logits/rejected": 0.8952223658561707,
"logps/chosen": -1.9070756435394287,
"logps/rejected": -3.703439712524414,
"loss": 0.3237,
"rewards/accuracies": 0.90625,
"rewards/chosen": -19.07075309753418,
"rewards/margins": 17.96364402770996,
"rewards/rejected": -37.03439712524414,
"step": 348
},
{
"epoch": 0.8701885616331619,
"grad_norm": 6.65625,
"learning_rate": 3.10382418714235e-08,
"logits/chosen": 0.9149331450462341,
"logits/rejected": 0.8159484267234802,
"logps/chosen": -1.6051839590072632,
"logps/rejected": -2.925374984741211,
"loss": 0.5309,
"rewards/accuracies": 0.84375,
"rewards/chosen": -16.051841735839844,
"rewards/margins": 13.201909065246582,
"rewards/rejected": -29.25374984741211,
"step": 349
},
{
"epoch": 0.8726819386005922,
"grad_norm": 6.65625,
"learning_rate": 2.9873060703122815e-08,
"logits/chosen": 0.9303115606307983,
"logits/rejected": 0.8033692836761475,
"logps/chosen": -2.0454282760620117,
"logps/rejected": -3.6858325004577637,
"loss": 0.3837,
"rewards/accuracies": 0.84375,
"rewards/chosen": -20.454280853271484,
"rewards/margins": 16.40404510498047,
"rewards/rejected": -36.85832595825195,
"step": 350
},
{
"epoch": 0.8751753155680224,
"grad_norm": 8.0,
"learning_rate": 2.8729195161184243e-08,
"logits/chosen": 0.7548041939735413,
"logits/rejected": 0.8524357080459595,
"logps/chosen": -1.8255373239517212,
"logps/rejected": -3.674532890319824,
"loss": 0.5723,
"rewards/accuracies": 0.84375,
"rewards/chosen": -18.255373001098633,
"rewards/margins": 18.489957809448242,
"rewards/rejected": -36.745330810546875,
"step": 351
},
{
"epoch": 0.8776686925354527,
"grad_norm": 15.8125,
"learning_rate": 2.7606721409802498e-08,
"logits/chosen": 0.9838480353355408,
"logits/rejected": 0.8637805581092834,
"logps/chosen": -1.717268705368042,
"logps/rejected": -2.822934627532959,
"loss": 0.7475,
"rewards/accuracies": 0.75,
"rewards/chosen": -17.17268943786621,
"rewards/margins": 11.056660652160645,
"rewards/rejected": -28.229345321655273,
"step": 352
},
{
"epoch": 0.880162069502883,
"grad_norm": 6.34375,
"learning_rate": 2.650571418880144e-08,
"logits/chosen": 0.8108838796615601,
"logits/rejected": 0.793830394744873,
"logps/chosen": -1.8453660011291504,
"logps/rejected": -3.455012083053589,
"loss": 0.3673,
"rewards/accuracies": 0.875,
"rewards/chosen": -18.453659057617188,
"rewards/margins": 16.096466064453125,
"rewards/rejected": -34.55012512207031,
"step": 353
},
{
"epoch": 0.8826554464703132,
"grad_norm": 5.15625,
"learning_rate": 2.5426246808657902e-08,
"logits/chosen": 0.7718413472175598,
"logits/rejected": 0.7736707925796509,
"logps/chosen": -1.9948774576187134,
"logps/rejected": -3.8196072578430176,
"loss": 0.2475,
"rewards/accuracies": 0.90625,
"rewards/chosen": -19.948774337768555,
"rewards/margins": 18.247299194335938,
"rewards/rejected": -38.19607162475586,
"step": 354
},
{
"epoch": 0.8851488234377435,
"grad_norm": 8.25,
"learning_rate": 2.4368391145620064e-08,
"logits/chosen": 0.8589321374893188,
"logits/rejected": 0.7836854457855225,
"logps/chosen": -1.6310110092163086,
"logps/rejected": -2.939948320388794,
"loss": 0.2476,
"rewards/accuracies": 0.875,
"rewards/chosen": -16.31011199951172,
"rewards/margins": 13.089373588562012,
"rewards/rejected": -29.39948272705078,
"step": 355
},
{
"epoch": 0.8876422004051737,
"grad_norm": 5.53125,
"learning_rate": 2.3332217636921637e-08,
"logits/chosen": 0.9285929203033447,
"logits/rejected": 0.8569374084472656,
"logps/chosen": -1.9147915840148926,
"logps/rejected": -3.808230400085449,
"loss": 0.3004,
"rewards/accuracies": 0.875,
"rewards/chosen": -19.147912979125977,
"rewards/margins": 18.934389114379883,
"rewards/rejected": -38.08230209350586,
"step": 356
},
{
"epoch": 0.8901355773726041,
"grad_norm": 10.6875,
"learning_rate": 2.2317795276091977e-08,
"logits/chosen": 0.8501561880111694,
"logits/rejected": 0.8791577219963074,
"logps/chosen": -1.746106505393982,
"logps/rejected": -3.179072856903076,
"loss": 0.8857,
"rewards/accuracies": 0.71875,
"rewards/chosen": -17.4610652923584,
"rewards/margins": 14.329660415649414,
"rewards/rejected": -31.79072380065918,
"step": 357
},
{
"epoch": 0.8926289543400343,
"grad_norm": 9.1875,
"learning_rate": 2.1325191608361908e-08,
"logits/chosen": 0.8351438641548157,
"logits/rejected": 0.8163132667541504,
"logps/chosen": -1.5546523332595825,
"logps/rejected": -2.714953660964966,
"loss": 0.4302,
"rewards/accuracies": 0.84375,
"rewards/chosen": -15.546524047851562,
"rewards/margins": 11.603012084960938,
"rewards/rejected": -27.1495361328125,
"step": 358
},
{
"epoch": 0.8951223313074645,
"grad_norm": 12.625,
"learning_rate": 2.035447272616638e-08,
"logits/chosen": 0.8828765153884888,
"logits/rejected": 0.7569836378097534,
"logps/chosen": -1.804396629333496,
"logps/rejected": -3.248098134994507,
"loss": 0.4005,
"rewards/accuracies": 0.78125,
"rewards/chosen": -18.043964385986328,
"rewards/margins": 14.437012672424316,
"rewards/rejected": -32.480979919433594,
"step": 359
},
{
"epoch": 0.8976157082748948,
"grad_norm": 4.96875,
"learning_rate": 1.9405703264743645e-08,
"logits/chosen": 0.8172731995582581,
"logits/rejected": 0.7823519706726074,
"logps/chosen": -1.4623509645462036,
"logps/rejected": -2.556082010269165,
"loss": 0.3153,
"rewards/accuracies": 0.84375,
"rewards/chosen": -14.62350845336914,
"rewards/margins": 10.937310218811035,
"rewards/rejected": -25.56081771850586,
"step": 360
},
{
"epoch": 0.900109085242325,
"grad_norm": 10.4375,
"learning_rate": 1.8478946397831535e-08,
"logits/chosen": 0.8463267683982849,
"logits/rejected": 0.8544177412986755,
"logps/chosen": -1.8382923603057861,
"logps/rejected": -3.775862693786621,
"loss": 0.405,
"rewards/accuracies": 0.90625,
"rewards/chosen": -18.382923126220703,
"rewards/margins": 19.375703811645508,
"rewards/rejected": -37.75862503051758,
"step": 361
},
{
"epoch": 0.9026024622097554,
"grad_norm": 8.5,
"learning_rate": 1.7574263833461018e-08,
"logits/chosen": 0.85582435131073,
"logits/rejected": 0.750614583492279,
"logps/chosen": -1.5502458810806274,
"logps/rejected": -2.6339385509490967,
"loss": 0.4212,
"rewards/accuracies": 0.78125,
"rewards/chosen": -15.502457618713379,
"rewards/margins": 10.836931228637695,
"rewards/rejected": -26.33938980102539,
"step": 362
},
{
"epoch": 0.9050958391771856,
"grad_norm": 8.25,
"learning_rate": 1.6691715809847622e-08,
"logits/chosen": 1.001466989517212,
"logits/rejected": 0.8945422768592834,
"logps/chosen": -1.4286935329437256,
"logps/rejected": -2.590465545654297,
"loss": 0.6361,
"rewards/accuracies": 0.71875,
"rewards/chosen": -14.28693675994873,
"rewards/margins": 11.617722511291504,
"rewards/rejected": -25.90465545654297,
"step": 363
},
{
"epoch": 0.9075892161446159,
"grad_norm": 27.25,
"learning_rate": 1.5831361091380085e-08,
"logits/chosen": 1.0156899690628052,
"logits/rejected": 0.9483416080474854,
"logps/chosen": -2.2000174522399902,
"logps/rejected": -3.7587168216705322,
"loss": 0.7706,
"rewards/accuracies": 0.65625,
"rewards/chosen": -22.000173568725586,
"rewards/margins": 15.586994171142578,
"rewards/rejected": -37.5871696472168,
"step": 364
},
{
"epoch": 0.9100825931120461,
"grad_norm": 11.9375,
"learning_rate": 1.4993256964707667e-08,
"logits/chosen": 0.9330320358276367,
"logits/rejected": 0.7909821271896362,
"logps/chosen": -1.7297008037567139,
"logps/rejected": -3.1835391521453857,
"loss": 0.6938,
"rewards/accuracies": 0.71875,
"rewards/chosen": -17.297008514404297,
"rewards/margins": 14.538382530212402,
"rewards/rejected": -31.835391998291016,
"step": 365
},
{
"epoch": 0.9125759700794764,
"grad_norm": 150.0,
"learning_rate": 1.4177459234925959e-08,
"logits/chosen": 1.0243542194366455,
"logits/rejected": 0.8424570560455322,
"logps/chosen": -1.55518639087677,
"logps/rejected": -2.423957109451294,
"loss": 0.74,
"rewards/accuracies": 0.625,
"rewards/chosen": -15.551864624023438,
"rewards/margins": 8.687705993652344,
"rewards/rejected": -24.23957061767578,
"step": 366
},
{
"epoch": 0.9150693470469067,
"grad_norm": 3.8125,
"learning_rate": 1.3384022221860707e-08,
"logits/chosen": 0.7477589845657349,
"logits/rejected": 0.7240265011787415,
"logps/chosen": -1.9058078527450562,
"logps/rejected": -4.386825084686279,
"loss": 0.1929,
"rewards/accuracies": 0.9375,
"rewards/chosen": -19.05807876586914,
"rewards/margins": 24.810171127319336,
"rewards/rejected": -43.868247985839844,
"step": 367
},
{
"epoch": 0.9175627240143369,
"grad_norm": 11.3125,
"learning_rate": 1.2612998756451366e-08,
"logits/chosen": 0.8763638734817505,
"logits/rejected": 0.8146540522575378,
"logps/chosen": -1.8290818929672241,
"logps/rejected": -3.116455554962158,
"loss": 0.6755,
"rewards/accuracies": 0.71875,
"rewards/chosen": -18.290821075439453,
"rewards/margins": 12.873735427856445,
"rewards/rejected": -31.164554595947266,
"step": 368
},
{
"epoch": 0.9200561009817672,
"grad_norm": 45.0,
"learning_rate": 1.1864440177232976e-08,
"logits/chosen": 0.8767358660697937,
"logits/rejected": 0.7798057794570923,
"logps/chosen": -1.9157465696334839,
"logps/rejected": -4.174086570739746,
"loss": 0.3354,
"rewards/accuracies": 0.875,
"rewards/chosen": -19.1574649810791,
"rewards/margins": 22.58340072631836,
"rewards/rejected": -41.74085998535156,
"step": 369
},
{
"epoch": 0.9225494779491974,
"grad_norm": 7.6875,
"learning_rate": 1.1138396326917977e-08,
"logits/chosen": 0.9398146867752075,
"logits/rejected": 0.9706467390060425,
"logps/chosen": -2.1085903644561768,
"logps/rejected": -3.980117082595825,
"loss": 0.438,
"rewards/accuracies": 0.90625,
"rewards/chosen": -21.085905075073242,
"rewards/margins": 18.71526336669922,
"rewards/rejected": -39.80117416381836,
"step": 370
},
{
"epoch": 0.9250428549166277,
"grad_norm": 6.59375,
"learning_rate": 1.0434915549077461e-08,
"logits/chosen": 0.9370230436325073,
"logits/rejected": 0.6799491047859192,
"logps/chosen": -2.0628821849823,
"logps/rejected": -4.021778106689453,
"loss": 0.2515,
"rewards/accuracies": 0.875,
"rewards/chosen": -20.628820419311523,
"rewards/margins": 19.588960647583008,
"rewards/rejected": -40.21778106689453,
"step": 371
},
{
"epoch": 0.927536231884058,
"grad_norm": 4.65625,
"learning_rate": 9.754044684922053e-09,
"logits/chosen": 0.9780002236366272,
"logits/rejected": 0.8844839334487915,
"logps/chosen": -2.1081106662750244,
"logps/rejected": -3.9642934799194336,
"loss": 0.2881,
"rewards/accuracies": 0.875,
"rewards/chosen": -21.08110809326172,
"rewards/margins": 18.56182861328125,
"rewards/rejected": -39.64293670654297,
"step": 372
},
{
"epoch": 0.9300296088514882,
"grad_norm": 38.0,
"learning_rate": 9.095829070183286e-09,
"logits/chosen": 0.8360333442687988,
"logits/rejected": 0.7579271793365479,
"logps/chosen": -1.7824090719223022,
"logps/rejected": -3.0244534015655518,
"loss": 0.8543,
"rewards/accuracies": 0.78125,
"rewards/chosen": -17.82408905029297,
"rewards/margins": 12.42044448852539,
"rewards/rejected": -30.24453353881836,
"step": 373
},
{
"epoch": 0.9325229858189185,
"grad_norm": 9.625,
"learning_rate": 8.460312532094555e-09,
"logits/chosen": 0.85768723487854,
"logits/rejected": 0.8098315000534058,
"logps/chosen": -1.6384191513061523,
"logps/rejected": -2.991260051727295,
"loss": 0.3512,
"rewards/accuracies": 0.875,
"rewards/chosen": -16.38418960571289,
"rewards/margins": 13.528410911560059,
"rewards/rejected": -29.9126033782959,
"step": 374
},
{
"epoch": 0.9350163627863488,
"grad_norm": 164.0,
"learning_rate": 7.847537386473157e-09,
"logits/chosen": 0.7866430878639221,
"logits/rejected": 0.8129922747612,
"logps/chosen": -1.9942247867584229,
"logps/rejected": -3.5987548828125,
"loss": 0.4176,
"rewards/accuracies": 0.84375,
"rewards/chosen": -19.942249298095703,
"rewards/margins": 16.04530143737793,
"rewards/rejected": -35.987548828125,
"step": 375
},
{
"epoch": 0.937509739753779,
"grad_norm": 7.46875,
"learning_rate": 7.257544434902646e-09,
"logits/chosen": 0.7069447040557861,
"logits/rejected": 0.7614144086837769,
"logps/chosen": -1.3751400709152222,
"logps/rejected": -2.30409574508667,
"loss": 0.4299,
"rewards/accuracies": 0.78125,
"rewards/chosen": -13.751401901245117,
"rewards/margins": 9.289555549621582,
"rewards/rejected": -23.040958404541016,
"step": 376
},
{
"epoch": 0.9400031167212093,
"grad_norm": 11.75,
"learning_rate": 6.690372962015922e-09,
"logits/chosen": 0.7851680517196655,
"logits/rejected": 0.7357572913169861,
"logps/chosen": -1.5484297275543213,
"logps/rejected": -2.7063143253326416,
"loss": 0.835,
"rewards/accuracies": 0.78125,
"rewards/chosen": -15.484295845031738,
"rewards/margins": 11.578847885131836,
"rewards/rejected": -27.063142776489258,
"step": 377
},
{
"epoch": 0.9424964936886395,
"grad_norm": 17.625,
"learning_rate": 6.146060732879643e-09,
"logits/chosen": 0.9812300801277161,
"logits/rejected": 0.9034566879272461,
"logps/chosen": -1.6774706840515137,
"logps/rejected": -2.880139112472534,
"loss": 0.711,
"rewards/accuracies": 0.6875,
"rewards/chosen": -16.774707794189453,
"rewards/margins": 12.026679992675781,
"rewards/rejected": -28.801387786865234,
"step": 378
},
{
"epoch": 0.9449898706560698,
"grad_norm": 8.6875,
"learning_rate": 5.624643990479616e-09,
"logits/chosen": 0.8070354461669922,
"logits/rejected": 0.8890936374664307,
"logps/chosen": -1.5846179723739624,
"logps/rejected": -2.867779016494751,
"loss": 0.9569,
"rewards/accuracies": 0.75,
"rewards/chosen": -15.846179962158203,
"rewards/margins": 12.831609725952148,
"rewards/rejected": -28.67778778076172,
"step": 379
},
{
"epoch": 0.9474832476235001,
"grad_norm": 12.0,
"learning_rate": 5.126157453307456e-09,
"logits/chosen": 0.9138520359992981,
"logits/rejected": 0.8435475826263428,
"logps/chosen": -1.5853928327560425,
"logps/rejected": -3.102871894836426,
"loss": 0.3206,
"rewards/accuracies": 0.90625,
"rewards/chosen": -15.853928565979004,
"rewards/margins": 15.174790382385254,
"rewards/rejected": -31.028718948364258,
"step": 380
},
{
"epoch": 0.9499766245909304,
"grad_norm": 5.6875,
"learning_rate": 4.6506343130488956e-09,
"logits/chosen": 0.7391858100891113,
"logits/rejected": 0.7543048858642578,
"logps/chosen": -2.2143211364746094,
"logps/rejected": -4.464791774749756,
"loss": 0.2126,
"rewards/accuracies": 0.875,
"rewards/chosen": -22.14321517944336,
"rewards/margins": 22.50470542907715,
"rewards/rejected": -44.647911071777344,
"step": 381
},
{
"epoch": 0.9524700015583606,
"grad_norm": 28.375,
"learning_rate": 4.198106232373788e-09,
"logits/chosen": 0.8407728672027588,
"logits/rejected": 0.7748773694038391,
"logps/chosen": -1.5643658638000488,
"logps/rejected": -2.904953956604004,
"loss": 0.6156,
"rewards/accuracies": 0.84375,
"rewards/chosen": -15.643659591674805,
"rewards/margins": 13.405879020690918,
"rewards/rejected": -29.049535751342773,
"step": 382
},
{
"epoch": 0.9549633785257908,
"grad_norm": 5.5,
"learning_rate": 3.768603342827719e-09,
"logits/chosen": 0.7649537324905396,
"logits/rejected": 0.9055894613265991,
"logps/chosen": -2.0387561321258545,
"logps/rejected": -3.687187671661377,
"loss": 0.2725,
"rewards/accuracies": 0.875,
"rewards/chosen": -20.387561798095703,
"rewards/margins": 16.484315872192383,
"rewards/rejected": -36.87187957763672,
"step": 383
},
{
"epoch": 0.9574567554932212,
"grad_norm": 9.625,
"learning_rate": 3.3621542428259764e-09,
"logits/chosen": 0.7531914710998535,
"logits/rejected": 0.7303828001022339,
"logps/chosen": -1.985339641571045,
"logps/rejected": -3.589801549911499,
"loss": 0.2374,
"rewards/accuracies": 0.90625,
"rewards/chosen": -19.853397369384766,
"rewards/margins": 16.044618606567383,
"rewards/rejected": -35.898014068603516,
"step": 384
},
{
"epoch": 0.9599501324606514,
"grad_norm": 13.1875,
"learning_rate": 2.978785995748928e-09,
"logits/chosen": 0.8882652521133423,
"logits/rejected": 0.7716068029403687,
"logps/chosen": -1.404790997505188,
"logps/rejected": -2.1909685134887695,
"loss": 1.0033,
"rewards/accuracies": 0.71875,
"rewards/chosen": -14.047908782958984,
"rewards/margins": 7.8617753982543945,
"rewards/rejected": -21.909685134887695,
"step": 385
},
{
"epoch": 0.9624435094280817,
"grad_norm": 13.8125,
"learning_rate": 2.618524128140309e-09,
"logits/chosen": 0.8234500885009766,
"logits/rejected": 0.826442301273346,
"logps/chosen": -1.7514184713363647,
"logps/rejected": -3.4685299396514893,
"loss": 0.4523,
"rewards/accuracies": 0.875,
"rewards/chosen": -17.51418685913086,
"rewards/margins": 17.171112060546875,
"rewards/rejected": -34.685298919677734,
"step": 386
},
{
"epoch": 0.9649368863955119,
"grad_norm": 19.0,
"learning_rate": 2.2813926280074225e-09,
"logits/chosen": 0.9154322147369385,
"logits/rejected": 0.7984371781349182,
"logps/chosen": -1.6743313074111938,
"logps/rejected": -2.603154182434082,
"loss": 0.6813,
"rewards/accuracies": 0.75,
"rewards/chosen": -16.74331283569336,
"rewards/margins": 9.288228988647461,
"rewards/rejected": -26.03154182434082,
"step": 387
},
{
"epoch": 0.9674302633629421,
"grad_norm": 9.6875,
"learning_rate": 1.9674139432240056e-09,
"logits/chosen": 0.7447303533554077,
"logits/rejected": 0.6416030526161194,
"logps/chosen": -1.896831750869751,
"logps/rejected": -3.2503926753997803,
"loss": 0.2141,
"rewards/accuracies": 0.90625,
"rewards/chosen": -18.96831703186035,
"rewards/margins": 13.535609245300293,
"rewards/rejected": -32.50392532348633,
"step": 388
},
{
"epoch": 0.9699236403303725,
"grad_norm": 3.5,
"learning_rate": 1.6766089800352934e-09,
"logits/chosen": 0.8328643441200256,
"logits/rejected": 0.8856253623962402,
"logps/chosen": -2.1792397499084473,
"logps/rejected": -4.523174285888672,
"loss": 0.2019,
"rewards/accuracies": 0.9375,
"rewards/chosen": -21.792396545410156,
"rewards/margins": 23.43934440612793,
"rewards/rejected": -45.23174285888672,
"step": 389
},
{
"epoch": 0.9724170172978027,
"grad_norm": 8.25,
"learning_rate": 1.408997101666326e-09,
"logits/chosen": 0.8109475374221802,
"logits/rejected": 0.9047868251800537,
"logps/chosen": -2.0790956020355225,
"logps/rejected": -4.099780559539795,
"loss": 0.1869,
"rewards/accuracies": 0.90625,
"rewards/chosen": -20.790956497192383,
"rewards/margins": 20.206846237182617,
"rewards/rejected": -40.997806549072266,
"step": 390
},
{
"epoch": 0.974910394265233,
"grad_norm": 5.375,
"learning_rate": 1.1645961270323746e-09,
"logits/chosen": 0.8082598447799683,
"logits/rejected": 0.7497880458831787,
"logps/chosen": -1.4980417490005493,
"logps/rejected": -3.6578102111816406,
"loss": 0.3026,
"rewards/accuracies": 0.875,
"rewards/chosen": -14.98041820526123,
"rewards/margins": 21.59768295288086,
"rewards/rejected": -36.578102111816406,
"step": 391
},
{
"epoch": 0.9774037712326632,
"grad_norm": 14.0625,
"learning_rate": 9.434223295524958e-10,
"logits/chosen": 0.822067141532898,
"logits/rejected": 0.8411962985992432,
"logps/chosen": -1.5677428245544434,
"logps/rejected": -2.5952415466308594,
"loss": 0.421,
"rewards/accuracies": 0.9375,
"rewards/chosen": -15.677427291870117,
"rewards/margins": 10.274986267089844,
"rewards/rejected": -25.95241355895996,
"step": 392
},
{
"epoch": 0.9798971482000935,
"grad_norm": 41.0,
"learning_rate": 7.454904360661762e-10,
"logits/chosen": 0.7836760878562927,
"logits/rejected": 0.7335962653160095,
"logps/chosen": -1.7851057052612305,
"logps/rejected": -3.307037830352783,
"loss": 0.6381,
"rewards/accuracies": 0.78125,
"rewards/chosen": -17.851055145263672,
"rewards/margins": 15.219318389892578,
"rewards/rejected": -33.07037353515625,
"step": 393
},
{
"epoch": 0.9823905251675238,
"grad_norm": 7.6875,
"learning_rate": 5.708136258525231e-10,
"logits/chosen": 1.0210167169570923,
"logits/rejected": 0.8218429088592529,
"logps/chosen": -1.745602011680603,
"logps/rejected": -3.2325327396392822,
"loss": 0.4579,
"rewards/accuracies": 0.84375,
"rewards/chosen": -17.45602035522461,
"rewards/margins": 14.869308471679688,
"rewards/rejected": -32.3253288269043,
"step": 394
},
{
"epoch": 0.984883902134954,
"grad_norm": 8.8125,
"learning_rate": 4.194035297527765e-10,
"logits/chosen": 0.9661321640014648,
"logits/rejected": 0.8232787847518921,
"logps/chosen": -1.6603198051452637,
"logps/rejected": -3.042840003967285,
"loss": 0.5841,
"rewards/accuracies": 0.8125,
"rewards/chosen": -16.603199005126953,
"rewards/margins": 13.825201034545898,
"rewards/rejected": -30.42839813232422,
"step": 395
},
{
"epoch": 0.9873772791023843,
"grad_norm": 9.875,
"learning_rate": 2.912702293959901e-10,
"logits/chosen": 0.9294121861457825,
"logits/rejected": 0.7912936210632324,
"logps/chosen": -1.8600192070007324,
"logps/rejected": -3.488579750061035,
"loss": 0.3076,
"rewards/accuracies": 0.90625,
"rewards/chosen": -18.600191116333008,
"rewards/margins": 16.28560447692871,
"rewards/rejected": -34.885799407958984,
"step": 396
},
{
"epoch": 0.9898706560698145,
"grad_norm": 6.28125,
"learning_rate": 1.8642225652760746e-10,
"logits/chosen": 1.0502732992172241,
"logits/rejected": 0.7951204180717468,
"logps/chosen": -1.755967617034912,
"logps/rejected": -3.1554126739501953,
"loss": 0.4734,
"rewards/accuracies": 0.84375,
"rewards/chosen": -17.559675216674805,
"rewards/margins": 13.994451522827148,
"rewards/rejected": -31.554126739501953,
"step": 397
},
{
"epoch": 0.9923640330372449,
"grad_norm": 6.15625,
"learning_rate": 1.0486659244136054e-10,
"logits/chosen": 0.8521815538406372,
"logits/rejected": 0.8380050659179688,
"logps/chosen": -1.743971586227417,
"logps/rejected": -3.0718655586242676,
"loss": 0.2066,
"rewards/accuracies": 0.875,
"rewards/chosen": -17.439714431762695,
"rewards/margins": 13.27894115447998,
"rewards/rejected": -30.71865463256836,
"step": 398
},
{
"epoch": 0.9948574100046751,
"grad_norm": 8.0625,
"learning_rate": 4.6608667514608234e-11,
"logits/chosen": 0.8208640217781067,
"logits/rejected": 0.7938324213027954,
"logps/chosen": -1.5473930835723877,
"logps/rejected": -2.6816587448120117,
"loss": 0.3252,
"rewards/accuracies": 0.875,
"rewards/chosen": -15.473930358886719,
"rewards/margins": 11.342655181884766,
"rewards/rejected": -26.816585540771484,
"step": 399
},
{
"epoch": 0.9973507869721053,
"grad_norm": 4.71875,
"learning_rate": 1.1652360846531317e-11,
"logits/chosen": 1.0007308721542358,
"logits/rejected": 0.8393873572349548,
"logps/chosen": -2.072484016418457,
"logps/rejected": -4.150708198547363,
"loss": 0.3247,
"rewards/accuracies": 0.84375,
"rewards/chosen": -20.724838256835938,
"rewards/margins": 20.782241821289062,
"rewards/rejected": -41.507083892822266,
"step": 400
},
{
"epoch": 0.9998441639395356,
"grad_norm": 9.0,
"learning_rate": 0.0,
"logits/chosen": 0.8794471025466919,
"logits/rejected": 0.8441964983940125,
"logps/chosen": -1.8147742748260498,
"logps/rejected": -3.1960084438323975,
"loss": 0.3852,
"rewards/accuracies": 0.875,
"rewards/chosen": -18.147741317749023,
"rewards/margins": 13.812341690063477,
"rewards/rejected": -31.9600830078125,
"step": 401
},
{
"epoch": 0.9998441639395356,
"step": 401,
"total_flos": 5.68672318443248e+18,
"train_loss": 2.14272621887599,
"train_runtime": 89392.3847,
"train_samples_per_second": 0.144,
"train_steps_per_second": 0.004
}
],
"logging_steps": 1,
"max_steps": 401,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 110,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.68672318443248e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}