diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,2945 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.3129888447236826, + "eval_steps": 10, + "global_step": 320, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008206180279523016, + "grad_norm": 72.29982535271158, + "learning_rate": 2.040816326530612e-08, + "logits/chosen": -2.1581597328186035, + "logits/rejected": -2.159653902053833, + "logps/chosen": -25.12261962890625, + "logps/rejected": -43.09302520751953, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.016412360559046033, + "grad_norm": 96.10955860968517, + "learning_rate": 4.081632653061224e-08, + "logits/chosen": -2.0733723640441895, + "logits/rejected": -2.0702476501464844, + "logps/chosen": -24.637685775756836, + "logps/rejected": -41.43503189086914, + "loss": 0.6963, + "rewards/accuracies": 0.453125, + "rewards/chosen": 0.008512299507856369, + "rewards/margins": -0.013849787414073944, + "rewards/rejected": 0.022362088784575462, + "step": 4 + }, + { + "epoch": 0.02461854083856905, + "grad_norm": 68.94505414065074, + "learning_rate": 6.122448979591837e-08, + "logits/chosen": -2.118948459625244, + "logits/rejected": -2.109750270843506, + "logps/chosen": -20.342554092407227, + "logps/rejected": -29.361812591552734, + "loss": 0.6878, + "rewards/accuracies": 0.484375, + "rewards/chosen": 0.028929362073540688, + "rewards/margins": 0.029945004731416702, + "rewards/rejected": -0.0010156440548598766, + "step": 6 + }, + { + "epoch": 0.032824721118092065, + "grad_norm": 77.97306429217313, + "learning_rate": 8.163265306122448e-08, + "logits/chosen": -2.0574400424957275, + "logits/rejected": -2.0550637245178223, + "logps/chosen": -27.057085037231445, + "logps/rejected": -39.5283088684082, + "loss": 0.7013, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.014948742464184761, + "rewards/margins": -0.01355709508061409, + "rewards/rejected": -0.0013916483148932457, + "step": 8 + }, + { + "epoch": 0.04103090139761508, + "grad_norm": 67.40204277563684, + "learning_rate": 1.0204081632653061e-07, + "logits/chosen": -2.0784454345703125, + "logits/rejected": -2.072957992553711, + "logps/chosen": -25.560945510864258, + "logps/rejected": -31.469083786010742, + "loss": 0.6968, + "rewards/accuracies": 0.453125, + "rewards/chosen": -0.004186006262898445, + "rewards/margins": 0.010740835219621658, + "rewards/rejected": -0.014926840551197529, + "step": 10 + }, + { + "epoch": 0.04103090139761508, + "eval_logits/chosen": -2.0721328258514404, + "eval_logits/rejected": -2.069610834121704, + "eval_logps/chosen": -24.65077018737793, + "eval_logps/rejected": -31.145498275756836, + "eval_loss": 0.6956828236579895, + "eval_rewards/accuracies": 0.40668201446533203, + "eval_rewards/chosen": -0.005178123712539673, + "eval_rewards/margins": -0.0014642790192738175, + "eval_rewards/rejected": -0.003713843412697315, + "eval_runtime": 391.6625, + "eval_samples_per_second": 4.427, + "eval_steps_per_second": 1.108, + "step": 10 + }, + { + "epoch": 0.0492370816771381, + "grad_norm": 75.23897935562478, + "learning_rate": 1.2244897959183673e-07, + "logits/chosen": -2.105541229248047, + "logits/rejected": -2.110215187072754, + "logps/chosen": -25.037534713745117, + "logps/rejected": -51.45681381225586, + "loss": 0.691, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.02754499390721321, + "rewards/margins": -0.003392305225133896, + "rewards/rejected": -0.024152684956789017, + "step": 12 + }, + { + "epoch": 0.057443261956661114, + "grad_norm": 62.15854308442268, + "learning_rate": 1.4285714285714285e-07, + "logits/chosen": -2.0920844078063965, + "logits/rejected": -2.0932013988494873, + "logps/chosen": -30.580596923828125, + "logps/rejected": -43.68434143066406, + "loss": 0.693, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.03211332485079765, + "rewards/margins": 0.012334156781435013, + "rewards/rejected": -0.044447485357522964, + "step": 14 + }, + { + "epoch": 0.06564944223618413, + "grad_norm": 70.35846634784929, + "learning_rate": 1.6326530612244896e-07, + "logits/chosen": -2.058406114578247, + "logits/rejected": -2.0612599849700928, + "logps/chosen": -24.555591583251953, + "logps/rejected": -47.54515838623047, + "loss": 0.6833, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.06411759555339813, + "rewards/margins": 0.02048414759337902, + "rewards/rejected": -0.0846017450094223, + "step": 16 + }, + { + "epoch": 0.07385562251570714, + "grad_norm": 53.416196200679686, + "learning_rate": 1.836734693877551e-07, + "logits/chosen": -2.0910069942474365, + "logits/rejected": -2.0841658115386963, + "logps/chosen": -24.07425308227539, + "logps/rejected": -23.863197326660156, + "loss": 0.6845, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.09408427029848099, + "rewards/margins": 0.020395735278725624, + "rewards/rejected": -0.11447998881340027, + "step": 18 + }, + { + "epoch": 0.08206180279523016, + "grad_norm": 81.67543770250495, + "learning_rate": 2.0408163265306121e-07, + "logits/chosen": -2.0422399044036865, + "logits/rejected": -2.0412869453430176, + "logps/chosen": -23.748153686523438, + "logps/rejected": -41.940765380859375, + "loss": 0.6754, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.08865831792354584, + "rewards/margins": 0.042564887553453445, + "rewards/rejected": -0.13122320175170898, + "step": 20 + }, + { + "epoch": 0.08206180279523016, + "eval_logits/chosen": -2.073920726776123, + "eval_logits/rejected": -2.071418285369873, + "eval_logps/chosen": -24.912931442260742, + "eval_logps/rejected": -31.505783081054688, + "eval_loss": 0.6749772429466248, + "eval_rewards/accuracies": 0.559907853603363, + "eval_rewards/chosen": -0.13625794649124146, + "eval_rewards/margins": 0.04759809002280235, + "eval_rewards/rejected": -0.1838560253381729, + "eval_runtime": 388.9647, + "eval_samples_per_second": 4.458, + "eval_steps_per_second": 1.116, + "step": 20 + }, + { + "epoch": 0.09026798307475317, + "grad_norm": 58.1111337202034, + "learning_rate": 2.2448979591836733e-07, + "logits/chosen": -2.141969680786133, + "logits/rejected": -2.139296054840088, + "logps/chosen": -28.556930541992188, + "logps/rejected": -33.20477294921875, + "loss": 0.6809, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1592944860458374, + "rewards/margins": 0.038385238498449326, + "rewards/rejected": -0.19767972826957703, + "step": 22 + }, + { + "epoch": 0.0984741633542762, + "grad_norm": 59.55094100468827, + "learning_rate": 2.4489795918367347e-07, + "logits/chosen": -2.0954370498657227, + "logits/rejected": -2.0884788036346436, + "logps/chosen": -24.139230728149414, + "logps/rejected": -28.622846603393555, + "loss": 0.6507, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.26595112681388855, + "rewards/margins": 0.08658240735530853, + "rewards/rejected": -0.3525335192680359, + "step": 24 + }, + { + "epoch": 0.1066803436337992, + "grad_norm": 57.19794639459796, + "learning_rate": 2.653061224489796e-07, + "logits/chosen": -2.137075662612915, + "logits/rejected": -2.142050266265869, + "logps/chosen": -29.15459442138672, + "logps/rejected": -42.837894439697266, + "loss": 0.6393, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.3813813328742981, + "rewards/margins": 0.10138601809740067, + "rewards/rejected": -0.48276740312576294, + "step": 26 + }, + { + "epoch": 0.11488652391332223, + "grad_norm": 49.34854220939512, + "learning_rate": 2.857142857142857e-07, + "logits/chosen": -2.078376531600952, + "logits/rejected": -2.0755605697631836, + "logps/chosen": -25.93030548095703, + "logps/rejected": -38.2554817199707, + "loss": 0.6617, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.49894633889198303, + "rewards/margins": 0.13289065659046173, + "rewards/rejected": -0.631837010383606, + "step": 28 + }, + { + "epoch": 0.12309270419284524, + "grad_norm": 57.358296790419175, + "learning_rate": 3.0612244897959183e-07, + "logits/chosen": -2.1170706748962402, + "logits/rejected": -2.123375415802002, + "logps/chosen": -21.049701690673828, + "logps/rejected": -53.651187896728516, + "loss": 0.6385, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.6054391264915466, + "rewards/margins": 0.20123842358589172, + "rewards/rejected": -0.806677520275116, + "step": 30 + }, + { + "epoch": 0.12309270419284524, + "eval_logits/chosen": -2.072676420211792, + "eval_logits/rejected": -2.0702009201049805, + "eval_logps/chosen": -25.986181259155273, + "eval_logps/rejected": -32.866546630859375, + "eval_loss": 0.6259192824363708, + "eval_rewards/accuracies": 0.5967742204666138, + "eval_rewards/chosen": -0.672883927822113, + "eval_rewards/margins": 0.1913554072380066, + "eval_rewards/rejected": -0.8642393946647644, + "eval_runtime": 383.4842, + "eval_samples_per_second": 4.522, + "eval_steps_per_second": 1.132, + "step": 30 + }, + { + "epoch": 0.13129888447236826, + "grad_norm": 51.59090790007893, + "learning_rate": 3.265306122448979e-07, + "logits/chosen": -2.065584182739258, + "logits/rejected": -2.059347629547119, + "logps/chosen": -21.224966049194336, + "logps/rejected": -31.50775718688965, + "loss": 0.6031, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.6435321569442749, + "rewards/margins": 0.2527439296245575, + "rewards/rejected": -0.8962759971618652, + "step": 32 + }, + { + "epoch": 0.13950506475189126, + "grad_norm": 48.81049837247277, + "learning_rate": 3.4693877551020406e-07, + "logits/chosen": -2.044010639190674, + "logits/rejected": -2.04213285446167, + "logps/chosen": -21.99913787841797, + "logps/rejected": -41.94310760498047, + "loss": 0.5998, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8395999670028687, + "rewards/margins": 0.3455100953578949, + "rewards/rejected": -1.1851099729537964, + "step": 34 + }, + { + "epoch": 0.14771124503141428, + "grad_norm": 51.08931493056027, + "learning_rate": 3.673469387755102e-07, + "logits/chosen": -2.0850181579589844, + "logits/rejected": -2.0847809314727783, + "logps/chosen": -31.404415130615234, + "logps/rejected": -38.55030059814453, + "loss": 0.5904, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2278621196746826, + "rewards/margins": 0.26516327261924744, + "rewards/rejected": -1.493025541305542, + "step": 36 + }, + { + "epoch": 0.1559174253109373, + "grad_norm": 59.35651114893968, + "learning_rate": 3.877551020408163e-07, + "logits/chosen": -2.114025831222534, + "logits/rejected": -2.1199615001678467, + "logps/chosen": -24.901527404785156, + "logps/rejected": -50.99317932128906, + "loss": 0.5923, + "rewards/accuracies": 0.640625, + "rewards/chosen": -1.1870838403701782, + "rewards/margins": 0.41857296228408813, + "rewards/rejected": -1.6056568622589111, + "step": 38 + }, + { + "epoch": 0.16412360559046033, + "grad_norm": 46.92283583915801, + "learning_rate": 4.0816326530612243e-07, + "logits/chosen": -2.109259843826294, + "logits/rejected": -2.1076252460479736, + "logps/chosen": -27.03592300415039, + "logps/rejected": -42.91762161254883, + "loss": 0.5115, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.1980061531066895, + "rewards/margins": 0.5265197157859802, + "rewards/rejected": -1.7245259284973145, + "step": 40 + }, + { + "epoch": 0.16412360559046033, + "eval_logits/chosen": -2.0738561153411865, + "eval_logits/rejected": -2.0714197158813477, + "eval_logps/chosen": -27.073177337646484, + "eval_logps/rejected": -34.44426345825195, + "eval_loss": 0.5755711197853088, + "eval_rewards/accuracies": 0.6682027578353882, + "eval_rewards/chosen": -1.2163803577423096, + "eval_rewards/margins": 0.43671703338623047, + "eval_rewards/rejected": -1.6530975103378296, + "eval_runtime": 383.5104, + "eval_samples_per_second": 4.521, + "eval_steps_per_second": 1.132, + "step": 40 + }, + { + "epoch": 0.17232978586998332, + "grad_norm": 53.83140343188913, + "learning_rate": 4.285714285714285e-07, + "logits/chosen": -2.0352187156677246, + "logits/rejected": -2.037541627883911, + "logps/chosen": -32.00979232788086, + "logps/rejected": -34.965049743652344, + "loss": 0.5958, + "rewards/accuracies": 0.609375, + "rewards/chosen": -1.6347800493240356, + "rewards/margins": 0.25150731205940247, + "rewards/rejected": -1.8862874507904053, + "step": 42 + }, + { + "epoch": 0.18053596614950634, + "grad_norm": 47.972074857166355, + "learning_rate": 4.4897959183673465e-07, + "logits/chosen": -2.075744390487671, + "logits/rejected": -2.080559253692627, + "logps/chosen": -27.046289443969727, + "logps/rejected": -43.155860900878906, + "loss": 0.5475, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4715561866760254, + "rewards/margins": 0.3980383574962616, + "rewards/rejected": -1.8695944547653198, + "step": 44 + }, + { + "epoch": 0.18874214642902937, + "grad_norm": 49.55763090785352, + "learning_rate": 4.693877551020408e-07, + "logits/chosen": -2.0551533699035645, + "logits/rejected": -2.0602352619171143, + "logps/chosen": -33.108123779296875, + "logps/rejected": -42.747528076171875, + "loss": 0.5183, + "rewards/accuracies": 0.640625, + "rewards/chosen": -1.2049014568328857, + "rewards/margins": 0.6831133961677551, + "rewards/rejected": -1.888014793395996, + "step": 46 + }, + { + "epoch": 0.1969483267085524, + "grad_norm": 36.14217009270528, + "learning_rate": 4.897959183673469e-07, + "logits/chosen": -2.178698778152466, + "logits/rejected": -2.178926467895508, + "logps/chosen": -23.19949722290039, + "logps/rejected": -47.667236328125, + "loss": 0.4744, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.3382686376571655, + "rewards/margins": 0.744179368019104, + "rewards/rejected": -2.0824477672576904, + "step": 48 + }, + { + "epoch": 0.20515450698807539, + "grad_norm": 44.92936419641385, + "learning_rate": 4.999935398141225e-07, + "logits/chosen": -2.0898728370666504, + "logits/rejected": -2.086491107940674, + "logps/chosen": -28.257888793945312, + "logps/rejected": -45.15388870239258, + "loss": 0.493, + "rewards/accuracies": 0.796875, + "rewards/chosen": -1.5506025552749634, + "rewards/margins": 0.6771446466445923, + "rewards/rejected": -2.2277474403381348, + "step": 50 + }, + { + "epoch": 0.20515450698807539, + "eval_logits/chosen": -2.0701448917388916, + "eval_logits/rejected": -2.067728042602539, + "eval_logps/chosen": -26.948427200317383, + "eval_logps/rejected": -35.04188919067383, + "eval_loss": 0.49889206886291504, + "eval_rewards/accuracies": 0.7396313548088074, + "eval_rewards/chosen": -1.154005527496338, + "eval_rewards/margins": 0.797903835773468, + "eval_rewards/rejected": -1.9519096612930298, + "eval_runtime": 383.3286, + "eval_samples_per_second": 4.524, + "eval_steps_per_second": 1.132, + "step": 50 + }, + { + "epoch": 0.2133606872675984, + "grad_norm": 39.90879769142747, + "learning_rate": 4.999418603303176e-07, + "logits/chosen": -2.044602394104004, + "logits/rejected": -2.043879985809326, + "logps/chosen": -20.71453094482422, + "logps/rejected": -39.783756256103516, + "loss": 0.4715, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9839560985565186, + "rewards/margins": 0.9620079398155212, + "rewards/rejected": -1.9459640979766846, + "step": 52 + }, + { + "epoch": 0.22156686754712143, + "grad_norm": 37.33476295758473, + "learning_rate": 4.998385120460602e-07, + "logits/chosen": -2.080956220626831, + "logits/rejected": -2.0807886123657227, + "logps/chosen": -26.196861267089844, + "logps/rejected": -39.63607406616211, + "loss": 0.51, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.140933871269226, + "rewards/margins": 0.8466982841491699, + "rewards/rejected": -1.9876320362091064, + "step": 54 + }, + { + "epoch": 0.22977304782664446, + "grad_norm": 45.971175914221135, + "learning_rate": 4.996835163258461e-07, + "logits/chosen": -2.098681688308716, + "logits/rejected": -2.0977959632873535, + "logps/chosen": -27.12312126159668, + "logps/rejected": -44.47397994995117, + "loss": 0.5346, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.1570792198181152, + "rewards/margins": 0.7375643253326416, + "rewards/rejected": -1.8946435451507568, + "step": 56 + }, + { + "epoch": 0.23797922810616745, + "grad_norm": 37.460312981010176, + "learning_rate": 4.994769052108987e-07, + "logits/chosen": -2.0555946826934814, + "logits/rejected": -2.0529699325561523, + "logps/chosen": -23.68335723876953, + "logps/rejected": -39.27627944946289, + "loss": 0.3812, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.066185712814331, + "rewards/margins": 1.065974235534668, + "rewards/rejected": -2.13215970993042, + "step": 58 + }, + { + "epoch": 0.24618540838569047, + "grad_norm": 34.303283256725926, + "learning_rate": 4.992187214125447e-07, + "logits/chosen": -2.10715651512146, + "logits/rejected": -2.1082205772399902, + "logps/chosen": -23.896142959594727, + "logps/rejected": -50.47355270385742, + "loss": 0.3707, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.9780565500259399, + "rewards/margins": 1.4628773927688599, + "rewards/rejected": -2.4409339427948, + "step": 60 + }, + { + "epoch": 0.24618540838569047, + "eval_logits/chosen": -2.0698177814483643, + "eval_logits/rejected": -2.0673775672912598, + "eval_logps/chosen": -26.22644805908203, + "eval_logps/rejected": -34.93669128417969, + "eval_loss": 0.44575488567352295, + "eval_rewards/accuracies": 0.7695852518081665, + "eval_rewards/chosen": -0.7930165529251099, + "eval_rewards/margins": 1.106292963027954, + "eval_rewards/rejected": -1.8993093967437744, + "eval_runtime": 383.2498, + "eval_samples_per_second": 4.524, + "eval_steps_per_second": 1.132, + "step": 60 + }, + { + "epoch": 0.2543915886652135, + "grad_norm": 34.73750630342744, + "learning_rate": 4.98909018303385e-07, + "logits/chosen": -2.096700429916382, + "logits/rejected": -2.0969526767730713, + "logps/chosen": -28.021564483642578, + "logps/rejected": -44.515228271484375, + "loss": 0.4361, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.700480580329895, + "rewards/margins": 1.3456040620803833, + "rewards/rejected": -2.046084403991699, + "step": 62 + }, + { + "epoch": 0.2625977689447365, + "grad_norm": 32.81062577153495, + "learning_rate": 4.985478599062611e-07, + "logits/chosen": -2.087684154510498, + "logits/rejected": -2.088871717453003, + "logps/chosen": -26.94491958618164, + "logps/rejected": -34.76374435424805, + "loss": 0.4479, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.528798520565033, + "rewards/margins": 0.9001777172088623, + "rewards/rejected": -1.42897629737854, + "step": 64 + }, + { + "epoch": 0.27080394922425954, + "grad_norm": 29.052321436937024, + "learning_rate": 4.981353208810206e-07, + "logits/chosen": -2.0988636016845703, + "logits/rejected": -2.104152202606201, + "logps/chosen": -20.718381881713867, + "logps/rejected": -51.553409576416016, + "loss": 0.4029, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.7514103055000305, + "rewards/margins": 1.5007989406585693, + "rewards/rejected": -2.252209186553955, + "step": 66 + }, + { + "epoch": 0.2790101295037825, + "grad_norm": 37.90898283460309, + "learning_rate": 4.976714865090826e-07, + "logits/chosen": -2.0126729011535645, + "logits/rejected": -2.007772445678711, + "logps/chosen": -27.219005584716797, + "logps/rejected": -26.84417152404785, + "loss": 0.45, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7038444876670837, + "rewards/margins": 0.9012500047683716, + "rewards/rejected": -1.6050945520401, + "step": 68 + }, + { + "epoch": 0.28721630978330553, + "grad_norm": 31.01097368955644, + "learning_rate": 4.971564526758087e-07, + "logits/chosen": -2.0806331634521484, + "logits/rejected": -2.090015411376953, + "logps/chosen": -26.418888092041016, + "logps/rejected": -61.58074951171875, + "loss": 0.3921, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.6639503240585327, + "rewards/margins": 2.018562078475952, + "rewards/rejected": -2.6825122833251953, + "step": 70 + }, + { + "epoch": 0.28721630978330553, + "eval_logits/chosen": -2.0629022121429443, + "eval_logits/rejected": -2.0604288578033447, + "eval_logps/chosen": -25.214706420898438, + "eval_logps/rejected": -34.3575553894043, + "eval_loss": 0.4037153124809265, + "eval_rewards/accuracies": 0.7776497602462769, + "eval_rewards/chosen": -0.2871449589729309, + "eval_rewards/margins": 1.322598934173584, + "eval_rewards/rejected": -1.6097438335418701, + "eval_runtime": 383.2946, + "eval_samples_per_second": 4.524, + "eval_steps_per_second": 1.132, + "step": 70 + }, + { + "epoch": 0.29542249006282856, + "grad_norm": 35.944172042258, + "learning_rate": 4.965903258506806e-07, + "logits/chosen": -1.982763409614563, + "logits/rejected": -1.9825458526611328, + "logps/chosen": -26.120790481567383, + "logps/rejected": -47.724666595458984, + "loss": 0.3965, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.5441683530807495, + "rewards/margins": 1.4394950866699219, + "rewards/rejected": -1.9836633205413818, + "step": 72 + }, + { + "epoch": 0.3036286703423516, + "grad_norm": 30.076414397329636, + "learning_rate": 4.959732230652907e-07, + "logits/chosen": -2.0936832427978516, + "logits/rejected": -2.0875070095062256, + "logps/chosen": -21.951000213623047, + "logps/rejected": -35.42924499511719, + "loss": 0.3786, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.6129394769668579, + "rewards/margins": 1.359676718711853, + "rewards/rejected": -1.9726160764694214, + "step": 74 + }, + { + "epoch": 0.3118348506218746, + "grad_norm": 31.92687509033969, + "learning_rate": 4.953052718891494e-07, + "logits/chosen": -2.0927772521972656, + "logits/rejected": -2.0937466621398926, + "logps/chosen": -24.76601791381836, + "logps/rejected": -40.51527786254883, + "loss": 0.3999, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.39051324129104614, + "rewards/margins": 1.7075785398483276, + "rewards/rejected": -2.0980916023254395, + "step": 76 + }, + { + "epoch": 0.3200410309013976, + "grad_norm": 39.97329898507408, + "learning_rate": 4.945866104033126e-07, + "logits/chosen": -2.07487416267395, + "logits/rejected": -2.0698864459991455, + "logps/chosen": -28.91614532470703, + "logps/rejected": -32.64835739135742, + "loss": 0.3902, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8651710152626038, + "rewards/margins": 1.3195717334747314, + "rewards/rejected": -2.1847424507141113, + "step": 78 + }, + { + "epoch": 0.32824721118092065, + "grad_norm": 35.23935142184965, + "learning_rate": 4.938173871718379e-07, + "logits/chosen": -2.1075708866119385, + "logits/rejected": -2.108260154724121, + "logps/chosen": -30.674692153930664, + "logps/rejected": -45.75242233276367, + "loss": 0.4197, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.8922288417816162, + "rewards/margins": 1.6468966007232666, + "rewards/rejected": -2.5391252040863037, + "step": 80 + }, + { + "epoch": 0.32824721118092065, + "eval_logits/chosen": -2.0568766593933105, + "eval_logits/rejected": -2.0545151233673096, + "eval_logps/chosen": -26.29727554321289, + "eval_logps/rejected": -36.041969299316406, + "eval_loss": 0.3652815520763397, + "eval_rewards/accuracies": 0.7937787771224976, + "eval_rewards/chosen": -0.8284297585487366, + "eval_rewards/margins": 1.6235177516937256, + "eval_rewards/rejected": -2.4519472122192383, + "eval_runtime": 383.0876, + "eval_samples_per_second": 4.526, + "eval_steps_per_second": 1.133, + "step": 80 + }, + { + "epoch": 0.3364533914604437, + "grad_norm": 27.98699458231919, + "learning_rate": 4.929977612110723e-07, + "logits/chosen": -2.0166802406311035, + "logits/rejected": -2.009007215499878, + "logps/chosen": -27.796003341674805, + "logps/rejected": -37.83144760131836, + "loss": 0.3548, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.278414249420166, + "rewards/margins": 1.387697696685791, + "rewards/rejected": -2.666111946105957, + "step": 82 + }, + { + "epoch": 0.34465957173996664, + "grad_norm": 29.909892482214758, + "learning_rate": 4.921279019567806e-07, + "logits/chosen": -2.060328483581543, + "logits/rejected": -2.0567514896392822, + "logps/chosen": -22.3341064453125, + "logps/rejected": -27.066940307617188, + "loss": 0.3357, + "rewards/accuracies": 0.828125, + "rewards/chosen": -1.3317604064941406, + "rewards/margins": 1.3481371402740479, + "rewards/rejected": -2.6798975467681885, + "step": 84 + }, + { + "epoch": 0.35286575201948966, + "grad_norm": 28.894675574314675, + "learning_rate": 4.912079892291184e-07, + "logits/chosen": -2.0049288272857666, + "logits/rejected": -2.0062108039855957, + "logps/chosen": -30.222366333007812, + "logps/rejected": -37.637451171875, + "loss": 0.3523, + "rewards/accuracies": 0.890625, + "rewards/chosen": -1.1566193103790283, + "rewards/margins": 1.645595669746399, + "rewards/rejected": -2.8022148609161377, + "step": 86 + }, + { + "epoch": 0.3610719322990127, + "grad_norm": 33.499254927144385, + "learning_rate": 4.902382131954594e-07, + "logits/chosen": -2.0440762042999268, + "logits/rejected": -2.041065216064453, + "logps/chosen": -28.95600700378418, + "logps/rejected": -32.45362091064453, + "loss": 0.3969, + "rewards/accuracies": 0.859375, + "rewards/chosen": -1.4144505262374878, + "rewards/margins": 1.526777982711792, + "rewards/rejected": -2.9412283897399902, + "step": 88 + }, + { + "epoch": 0.3692781125785357, + "grad_norm": 36.672895433055324, + "learning_rate": 4.892187743310834e-07, + "logits/chosen": -2.1121721267700195, + "logits/rejected": -2.1142220497131348, + "logps/chosen": -33.17682647705078, + "logps/rejected": -55.22991180419922, + "loss": 0.3216, + "rewards/accuracies": 0.859375, + "rewards/chosen": -0.8327025175094604, + "rewards/margins": 2.4360129833221436, + "rewards/rejected": -3.2687156200408936, + "step": 90 + }, + { + "epoch": 0.3692781125785357, + "eval_logits/chosen": -2.052072048187256, + "eval_logits/rejected": -2.0496795177459717, + "eval_logps/chosen": -25.994354248046875, + "eval_logps/rejected": -35.990543365478516, + "eval_loss": 0.33759820461273193, + "eval_rewards/accuracies": 0.8029953837394714, + "eval_rewards/chosen": -0.6769699454307556, + "eval_rewards/margins": 1.7492659091949463, + "eval_rewards/rejected": -2.426236152648926, + "eval_runtime": 382.9061, + "eval_samples_per_second": 4.529, + "eval_steps_per_second": 1.133, + "step": 90 + }, + { + "epoch": 0.37748429285805873, + "grad_norm": 27.350771381278054, + "learning_rate": 4.881498833777333e-07, + "logits/chosen": -2.1142969131469727, + "logits/rejected": -2.1127796173095703, + "logps/chosen": -27.94344711303711, + "logps/rejected": -36.454994201660156, + "loss": 0.3337, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.4834941327571869, + "rewards/margins": 1.6755785942077637, + "rewards/rejected": -2.1590728759765625, + "step": 92 + }, + { + "epoch": 0.38569047313758176, + "grad_norm": 21.635416471698683, + "learning_rate": 4.870317613000496e-07, + "logits/chosen": -2.016702890396118, + "logits/rejected": -2.017988681793213, + "logps/chosen": -29.998384475708008, + "logps/rejected": -37.61553192138672, + "loss": 0.3289, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.3838534653186798, + "rewards/margins": 1.9279775619506836, + "rewards/rejected": -2.311830997467041, + "step": 94 + }, + { + "epoch": 0.3938966534171048, + "grad_norm": 32.26464274668422, + "learning_rate": 4.858646392398927e-07, + "logits/chosen": -2.1012330055236816, + "logits/rejected": -2.1029739379882812, + "logps/chosen": -24.2839298248291, + "logps/rejected": -52.811363220214844, + "loss": 0.3288, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.1970413625240326, + "rewards/margins": 2.4278342723846436, + "rewards/rejected": -2.624875545501709, + "step": 96 + }, + { + "epoch": 0.40210283369662775, + "grad_norm": 24.11098061020347, + "learning_rate": 4.846487584685594e-07, + "logits/chosen": -2.0802366733551025, + "logits/rejected": -2.073582649230957, + "logps/chosen": -24.776100158691406, + "logps/rejected": -30.90226936340332, + "loss": 0.3069, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.5730211734771729, + "rewards/margins": 1.4600439071655273, + "rewards/rejected": -2.0330650806427, + "step": 98 + }, + { + "epoch": 0.41030901397615077, + "grad_norm": 24.386233800584957, + "learning_rate": 4.833843703369075e-07, + "logits/chosen": -2.056903839111328, + "logits/rejected": -2.0560696125030518, + "logps/chosen": -25.943012237548828, + "logps/rejected": -47.84954071044922, + "loss": 0.2756, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.26147836446762085, + "rewards/margins": 2.6134910583496094, + "rewards/rejected": -2.874969482421875, + "step": 100 + }, + { + "epoch": 0.41030901397615077, + "eval_logits/chosen": -2.048205852508545, + "eval_logits/rejected": -2.0459136962890625, + "eval_logps/chosen": -25.047216415405273, + "eval_logps/rejected": -35.27001953125, + "eval_loss": 0.3195771872997284, + "eval_rewards/accuracies": 0.804147481918335, + "eval_rewards/chosen": -0.20340144634246826, + "eval_rewards/margins": 1.8625727891921997, + "eval_rewards/rejected": -2.065974235534668, + "eval_runtime": 383.1037, + "eval_samples_per_second": 4.526, + "eval_steps_per_second": 1.133, + "step": 100 + }, + { + "epoch": 0.4185151942556738, + "grad_norm": 20.26348597387277, + "learning_rate": 4.82071736223395e-07, + "logits/chosen": -2.0507657527923584, + "logits/rejected": -2.044565200805664, + "logps/chosen": -23.989782333374023, + "logps/rejected": -31.029855728149414, + "loss": 0.2811, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2594943940639496, + "rewards/margins": 1.7268887758255005, + "rewards/rejected": -1.9863829612731934, + "step": 102 + }, + { + "epoch": 0.4267213745351968, + "grad_norm": 19.535827187227056, + "learning_rate": 4.807111274800475e-07, + "logits/chosen": -2.0995054244995117, + "logits/rejected": -2.091494083404541, + "logps/chosen": -20.125661849975586, + "logps/rejected": -26.041656494140625, + "loss": 0.2717, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.21148215234279633, + "rewards/margins": 1.814781665802002, + "rewards/rejected": -2.02626371383667, + "step": 104 + }, + { + "epoch": 0.43492755481471984, + "grad_norm": 33.73160324782582, + "learning_rate": 4.793028253763632e-07, + "logits/chosen": -2.0714871883392334, + "logits/rejected": -2.073652744293213, + "logps/chosen": -29.02633285522461, + "logps/rejected": -45.91259765625, + "loss": 0.3285, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.4463399350643158, + "rewards/margins": 2.3804402351379395, + "rewards/rejected": -2.826780080795288, + "step": 106 + }, + { + "epoch": 0.44313373509424286, + "grad_norm": 27.500884484426393, + "learning_rate": 4.778471210411683e-07, + "logits/chosen": -2.1328721046447754, + "logits/rejected": -2.128079414367676, + "logps/chosen": -26.338151931762695, + "logps/rejected": -39.00925064086914, + "loss": 0.2889, + "rewards/accuracies": 0.859375, + "rewards/chosen": -0.20531219244003296, + "rewards/margins": 2.1164348125457764, + "rewards/rejected": -2.321747064590454, + "step": 108 + }, + { + "epoch": 0.4513399153737659, + "grad_norm": 19.395398725174257, + "learning_rate": 4.763443154024334e-07, + "logits/chosen": -2.010795831680298, + "logits/rejected": -2.004256010055542, + "logps/chosen": -21.2674560546875, + "logps/rejected": -29.27115249633789, + "loss": 0.2402, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.35923194885253906, + "rewards/margins": 2.286337375640869, + "rewards/rejected": -2.645569324493408, + "step": 110 + }, + { + "epoch": 0.4513399153737659, + "eval_logits/chosen": -2.047652244567871, + "eval_logits/rejected": -2.0454225540161133, + "eval_logps/chosen": -25.388172149658203, + "eval_logps/rejected": -36.19011306762695, + "eval_loss": 0.29904791712760925, + "eval_rewards/accuracies": 0.8064516186714172, + "eval_rewards/chosen": -0.37387850880622864, + "eval_rewards/margins": 2.152141571044922, + "eval_rewards/rejected": -2.526020050048828, + "eval_runtime": 382.9313, + "eval_samples_per_second": 4.528, + "eval_steps_per_second": 1.133, + "step": 110 + }, + { + "epoch": 0.4595460956532889, + "grad_norm": 19.214229030497684, + "learning_rate": 4.74794719125065e-07, + "logits/chosen": -2.0643627643585205, + "logits/rejected": -2.0607378482818604, + "logps/chosen": -28.60050392150879, + "logps/rejected": -30.703428268432617, + "loss": 0.2887, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.28185003995895386, + "rewards/margins": 1.8889483213424683, + "rewards/rejected": -2.1707985401153564, + "step": 112 + }, + { + "epoch": 0.4677522759328119, + "grad_norm": 26.967072996600148, + "learning_rate": 4.731986525466836e-07, + "logits/chosen": -2.0277886390686035, + "logits/rejected": -2.0253236293792725, + "logps/chosen": -28.50118064880371, + "logps/rejected": -39.31520462036133, + "loss": 0.2738, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.7399206757545471, + "rewards/margins": 2.0028390884399414, + "rewards/rejected": -2.7427597045898438, + "step": 114 + }, + { + "epoch": 0.4759584562123349, + "grad_norm": 23.78521498468923, + "learning_rate": 4.7155644561140293e-07, + "logits/chosen": -2.0169668197631836, + "logits/rejected": -2.0144312381744385, + "logps/chosen": -17.960556030273438, + "logps/rejected": -44.15462875366211, + "loss": 0.308, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.5678286552429199, + "rewards/margins": 2.510011672973633, + "rewards/rejected": -3.077840805053711, + "step": 116 + }, + { + "epoch": 0.4841646364918579, + "grad_norm": 24.124459622722974, + "learning_rate": 4.698684378016222e-07, + "logits/chosen": -1.9945940971374512, + "logits/rejected": -1.9939186573028564, + "logps/chosen": -27.118621826171875, + "logps/rejected": -47.901485443115234, + "loss": 0.3078, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.848014235496521, + "rewards/margins": 2.41487455368042, + "rewards/rejected": -3.2628891468048096, + "step": 118 + }, + { + "epoch": 0.49237081677138095, + "grad_norm": 21.029226154824478, + "learning_rate": 4.681349780678478e-07, + "logits/chosen": -2.069658041000366, + "logits/rejected": -2.066652774810791, + "logps/chosen": -19.98298454284668, + "logps/rejected": -35.198486328125, + "loss": 0.2684, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.4263419508934021, + "rewards/margins": 2.379479169845581, + "rewards/rejected": -2.805821418762207, + "step": 120 + }, + { + "epoch": 0.49237081677138095, + "eval_logits/chosen": -2.0455994606018066, + "eval_logits/rejected": -2.0433595180511475, + "eval_logps/chosen": -25.59235954284668, + "eval_logps/rejected": -36.88100051879883, + "eval_loss": 0.2864527106285095, + "eval_rewards/accuracies": 0.8122119903564453, + "eval_rewards/chosen": -0.47597208619117737, + "eval_rewards/margins": 2.3954925537109375, + "eval_rewards/rejected": -2.871464729309082, + "eval_runtime": 382.8408, + "eval_samples_per_second": 4.529, + "eval_steps_per_second": 1.134, + "step": 120 + }, + { + "epoch": 0.500576997050904, + "grad_norm": 22.882841908986034, + "learning_rate": 4.6635642475655643e-07, + "logits/chosen": -2.0903265476226807, + "logits/rejected": -2.091627836227417, + "logps/chosen": -27.477466583251953, + "logps/rejected": -39.44534683227539, + "loss": 0.3338, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.455081045627594, + "rewards/margins": 2.4550111293792725, + "rewards/rejected": -2.9100921154022217, + "step": 122 + }, + { + "epoch": 0.508783177330427, + "grad_norm": 25.08346996002156, + "learning_rate": 4.6453314553611724e-07, + "logits/chosen": -2.058607816696167, + "logits/rejected": -2.0590226650238037, + "logps/chosen": -26.310922622680664, + "logps/rejected": -41.810604095458984, + "loss": 0.3438, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.7647377848625183, + "rewards/margins": 2.2874107360839844, + "rewards/rejected": -3.0521485805511475, + "step": 124 + }, + { + "epoch": 0.51698935760995, + "grad_norm": 19.245560055238066, + "learning_rate": 4.626655173207856e-07, + "logits/chosen": -2.0874435901641846, + "logits/rejected": -2.0808544158935547, + "logps/chosen": -34.59534454345703, + "logps/rejected": -37.33736801147461, + "loss": 0.2242, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.46927499771118164, + "rewards/margins": 2.6450273990631104, + "rewards/rejected": -3.114302396774292, + "step": 126 + }, + { + "epoch": 0.525195537889473, + "grad_norm": 26.78927560536762, + "learning_rate": 4.607539261927868e-07, + "logits/chosen": -2.038646936416626, + "logits/rejected": -2.036555290222168, + "logps/chosen": -25.151229858398438, + "logps/rejected": -41.385738372802734, + "loss": 0.258, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.7142379879951477, + "rewards/margins": 2.4793665409088135, + "rewards/rejected": -3.1936047077178955, + "step": 128 + }, + { + "epoch": 0.5334017181689961, + "grad_norm": 28.211129877400257, + "learning_rate": 4.587987673225031e-07, + "logits/chosen": -2.0317800045013428, + "logits/rejected": -2.029090642929077, + "logps/chosen": -23.62847137451172, + "logps/rejected": -41.40718460083008, + "loss": 0.314, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7062588930130005, + "rewards/margins": 2.260392904281616, + "rewards/rejected": -2.9666519165039062, + "step": 130 + }, + { + "epoch": 0.5334017181689961, + "eval_logits/chosen": -2.046064615249634, + "eval_logits/rejected": -2.0438313484191895, + "eval_logps/chosen": -24.87871742248535, + "eval_logps/rejected": -36.286964416503906, + "eval_loss": 0.2757984399795532, + "eval_rewards/accuracies": 0.8225806355476379, + "eval_rewards/chosen": -0.11915161460638046, + "eval_rewards/margins": 2.4552958011627197, + "eval_rewards/rejected": -2.5744473934173584, + "eval_runtime": 382.9112, + "eval_samples_per_second": 4.528, + "eval_steps_per_second": 1.133, + "step": 130 + }, + { + "epoch": 0.5416078984485191, + "grad_norm": 19.634258213686227, + "learning_rate": 4.568004448867836e-07, + "logits/chosen": -1.9866001605987549, + "logits/rejected": -1.983536720275879, + "logps/chosen": -31.62383460998535, + "logps/rejected": -39.09525680541992, + "loss": 0.2353, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.31665635108947754, + "rewards/margins": 2.6990866661071777, + "rewards/rejected": -3.0157430171966553, + "step": 132 + }, + { + "epoch": 0.5498140787280421, + "grad_norm": 18.53508760026935, + "learning_rate": 4.547593719853908e-07, + "logits/chosen": -2.117701530456543, + "logits/rejected": -2.1072885990142822, + "logps/chosen": -18.488718032836914, + "logps/rejected": -28.42927360534668, + "loss": 0.2533, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.2543734908103943, + "rewards/margins": 1.9832816123962402, + "rewards/rejected": -2.2376551628112793, + "step": 134 + }, + { + "epoch": 0.558020259007565, + "grad_norm": 16.806992595132815, + "learning_rate": 4.526759705556037e-07, + "logits/chosen": -2.1042609214782715, + "logits/rejected": -2.106780767440796, + "logps/chosen": -25.879810333251953, + "logps/rejected": -48.42536163330078, + "loss": 0.1949, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.10162217915058136, + "rewards/margins": 3.190650224685669, + "rewards/rejected": -3.0890283584594727, + "step": 136 + }, + { + "epoch": 0.566226439287088, + "grad_norm": 29.13623442121838, + "learning_rate": 4.5055067128499336e-07, + "logits/chosen": -2.1067519187927246, + "logits/rejected": -2.0996322631835938, + "logps/chosen": -24.035615921020508, + "logps/rejected": -33.068504333496094, + "loss": 0.3053, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.14914892613887787, + "rewards/margins": 2.2678146362304688, + "rewards/rejected": -2.4169633388519287, + "step": 138 + }, + { + "epoch": 0.5744326195666111, + "grad_norm": 14.892524709426025, + "learning_rate": 4.483839135223899e-07, + "logits/chosen": -2.0495522022247314, + "logits/rejected": -2.049654960632324, + "logps/chosen": -22.117572784423828, + "logps/rejected": -45.10372543334961, + "loss": 0.213, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1762857437133789, + "rewards/margins": 3.072883129119873, + "rewards/rejected": -3.249169111251831, + "step": 140 + }, + { + "epoch": 0.5744326195666111, + "eval_logits/chosen": -2.043985605239868, + "eval_logits/rejected": -2.041775703430176, + "eval_logps/chosen": -24.56585693359375, + "eval_logps/rejected": -36.1696662902832, + "eval_loss": 0.26956623792648315, + "eval_rewards/accuracies": 0.8214285969734192, + "eval_rewards/chosen": 0.03727945312857628, + "eval_rewards/margins": 2.55307674407959, + "eval_rewards/rejected": -2.5157971382141113, + "eval_runtime": 382.8923, + "eval_samples_per_second": 4.529, + "eval_steps_per_second": 1.133, + "step": 140 + }, + { + "epoch": 0.5826387998461341, + "grad_norm": 26.214083971395798, + "learning_rate": 4.461761451870586e-07, + "logits/chosen": -2.0746335983276367, + "logits/rejected": -2.0725696086883545, + "logps/chosen": -28.98033332824707, + "logps/rejected": -33.45741653442383, + "loss": 0.2844, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.3142589330673218, + "rewards/margins": 2.325613498687744, + "rewards/rejected": -2.6398725509643555, + "step": 142 + }, + { + "epoch": 0.5908449801256571, + "grad_norm": 19.801373156192714, + "learning_rate": 4.4392782267610495e-07, + "logits/chosen": -2.026793956756592, + "logits/rejected": -2.02146315574646, + "logps/chosen": -27.090116500854492, + "logps/rejected": -36.820072174072266, + "loss": 0.2405, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.05273272842168808, + "rewards/margins": 2.6351218223571777, + "rewards/rejected": -2.5823891162872314, + "step": 144 + }, + { + "epoch": 0.5990511604051801, + "grad_norm": 16.912833507875153, + "learning_rate": 4.416394107701263e-07, + "logits/chosen": -2.0721688270568848, + "logits/rejected": -2.0710277557373047, + "logps/chosen": -29.055322647094727, + "logps/rejected": -35.73569107055664, + "loss": 0.22, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04920143634080887, + "rewards/margins": 2.790663957595825, + "rewards/rejected": -2.7414627075195312, + "step": 146 + }, + { + "epoch": 0.6072573406847032, + "grad_norm": 18.48986000965074, + "learning_rate": 4.393113825371312e-07, + "logits/chosen": -2.088228225708008, + "logits/rejected": -2.0804646015167236, + "logps/chosen": -22.193986892700195, + "logps/rejected": -28.600019454956055, + "loss": 0.2584, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.23887695372104645, + "rewards/margins": 2.352029323577881, + "rewards/rejected": -2.5909059047698975, + "step": 148 + }, + { + "epoch": 0.6154635209642262, + "grad_norm": 20.195764906566346, + "learning_rate": 4.3694421923474523e-07, + "logits/chosen": -2.0307724475860596, + "logits/rejected": -2.0267581939697266, + "logps/chosen": -25.377315521240234, + "logps/rejected": -26.69452667236328, + "loss": 0.2466, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.02581462636590004, + "rewards/margins": 2.615370512008667, + "rewards/rejected": -2.5895559787750244, + "step": 150 + }, + { + "epoch": 0.6154635209642262, + "eval_logits/chosen": -2.0484068393707275, + "eval_logits/rejected": -2.046243906021118, + "eval_logps/chosen": -24.641366958618164, + "eval_logps/rejected": -36.52920150756836, + "eval_loss": 0.2627074420452118, + "eval_rewards/accuracies": 0.8179723620414734, + "eval_rewards/chosen": -0.00047618892858736217, + "eval_rewards/margins": 2.6950886249542236, + "eval_rewards/rejected": -2.6955649852752686, + "eval_runtime": 383.0874, + "eval_samples_per_second": 4.526, + "eval_steps_per_second": 1.133, + "step": 150 + }, + { + "epoch": 0.6236697012437492, + "grad_norm": 19.84271445630352, + "learning_rate": 4.3453841021072367e-07, + "logits/chosen": -2.0682671070098877, + "logits/rejected": -2.0678606033325195, + "logps/chosen": -29.11172866821289, + "logps/rejected": -43.93864059448242, + "loss": 0.2393, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.21894294023513794, + "rewards/margins": 2.8336007595062256, + "rewards/rejected": -3.0525436401367188, + "step": 152 + }, + { + "epoch": 0.6318758815232722, + "grad_norm": 23.803208123467336, + "learning_rate": 4.32094452801792e-07, + "logits/chosen": -2.120655059814453, + "logits/rejected": -2.113632917404175, + "logps/chosen": -23.940746307373047, + "logps/rejected": -40.54450988769531, + "loss": 0.2101, + "rewards/accuracies": 0.890625, + "rewards/chosen": 0.22472774982452393, + "rewards/margins": 3.2544140815734863, + "rewards/rejected": -3.029686450958252, + "step": 154 + }, + { + "epoch": 0.6400820618027953, + "grad_norm": 27.831688353015224, + "learning_rate": 4.29612852230835e-07, + "logits/chosen": -2.1510727405548096, + "logits/rejected": -2.1424570083618164, + "logps/chosen": -29.35157012939453, + "logps/rejected": -39.825775146484375, + "loss": 0.3136, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.25963878631591797, + "rewards/margins": 2.4727556705474854, + "rewards/rejected": -2.7323946952819824, + "step": 156 + }, + { + "epoch": 0.6482882420823183, + "grad_norm": 16.129032160167743, + "learning_rate": 4.270941215024551e-07, + "logits/chosen": -2.0342090129852295, + "logits/rejected": -2.030592918395996, + "logps/chosen": -25.4481201171875, + "logps/rejected": -30.034128189086914, + "loss": 0.2383, + "rewards/accuracies": 0.859375, + "rewards/chosen": -0.3485499918460846, + "rewards/margins": 2.4843502044677734, + "rewards/rejected": -2.832900047302246, + "step": 158 + }, + { + "epoch": 0.6564944223618413, + "grad_norm": 22.363661877277583, + "learning_rate": 4.2453878129692257e-07, + "logits/chosen": -2.0489790439605713, + "logits/rejected": -2.0505762100219727, + "logps/chosen": -20.61872673034668, + "logps/rejected": -39.31790542602539, + "loss": 0.236, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.36551517248153687, + "rewards/margins": 2.786931037902832, + "rewards/rejected": -3.1524462699890137, + "step": 160 + }, + { + "epoch": 0.6564944223618413, + "eval_logits/chosen": -2.048050880432129, + "eval_logits/rejected": -2.0458953380584717, + "eval_logps/chosen": -25.27681541442871, + "eval_logps/rejected": -37.62437438964844, + "eval_loss": 0.2534183859825134, + "eval_rewards/accuracies": 0.8260368704795837, + "eval_rewards/chosen": -0.31819990277290344, + "eval_rewards/margins": 2.9249520301818848, + "eval_rewards/rejected": -3.243151903152466, + "eval_runtime": 383.1245, + "eval_samples_per_second": 4.526, + "eval_steps_per_second": 1.133, + "step": 160 + }, + { + "epoch": 0.6647006026413643, + "grad_norm": 16.96720912691383, + "learning_rate": 4.2194735986253894e-07, + "logits/chosen": -2.0488674640655518, + "logits/rejected": -2.04327130317688, + "logps/chosen": -29.53526496887207, + "logps/rejected": -29.405216217041016, + "loss": 0.2198, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.7364621162414551, + "rewards/margins": 2.2279775142669678, + "rewards/rejected": -2.9644393920898438, + "step": 162 + }, + { + "epoch": 0.6729067829208873, + "grad_norm": 21.306381619736708, + "learning_rate": 4.193203929064353e-07, + "logits/chosen": -2.098163366317749, + "logits/rejected": -2.09775710105896, + "logps/chosen": -27.31165313720703, + "logps/rejected": -49.05406951904297, + "loss": 0.2626, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6689335107803345, + "rewards/margins": 3.3616225719451904, + "rewards/rejected": -4.030555725097656, + "step": 164 + }, + { + "epoch": 0.6811129632004103, + "grad_norm": 16.744872149563246, + "learning_rate": 4.1665842348382974e-07, + "logits/chosen": -2.076477289199829, + "logits/rejected": -2.078810453414917, + "logps/chosen": -21.170326232910156, + "logps/rejected": -47.55445098876953, + "loss": 0.2274, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.5238662958145142, + "rewards/margins": 3.466320514678955, + "rewards/rejected": -3.9901866912841797, + "step": 166 + }, + { + "epoch": 0.6893191434799333, + "grad_norm": 14.185305520407544, + "learning_rate": 4.139620018857648e-07, + "logits/chosen": -2.003122091293335, + "logits/rejected": -2.000248908996582, + "logps/chosen": -22.870935440063477, + "logps/rejected": -39.74565505981445, + "loss": 0.2638, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.6974708437919617, + "rewards/margins": 2.8220083713531494, + "rewards/rejected": -3.519479513168335, + "step": 168 + }, + { + "epoch": 0.6975253237594563, + "grad_norm": 18.022997094570332, + "learning_rate": 4.1123168552534983e-07, + "logits/chosen": -2.025099277496338, + "logits/rejected": -2.0219833850860596, + "logps/chosen": -28.584171295166016, + "logps/rejected": -38.055633544921875, + "loss": 0.2503, + "rewards/accuracies": 0.859375, + "rewards/chosen": -0.9480264186859131, + "rewards/margins": 2.6562623977661133, + "rewards/rejected": -3.6042890548706055, + "step": 170 + }, + { + "epoch": 0.6975253237594563, + "eval_logits/chosen": -2.047826051712036, + "eval_logits/rejected": -2.0456759929656982, + "eval_logps/chosen": -25.28836441040039, + "eval_logps/rejected": -37.87636184692383, + "eval_loss": 0.2495637983083725, + "eval_rewards/accuracies": 0.8271889686584473, + "eval_rewards/chosen": -0.32397639751434326, + "eval_rewards/margins": 3.0451700687408447, + "eval_rewards/rejected": -3.3691465854644775, + "eval_runtime": 383.2321, + "eval_samples_per_second": 4.525, + "eval_steps_per_second": 1.132, + "step": 170 + }, + { + "epoch": 0.7057315040389793, + "grad_norm": 30.201529803311132, + "learning_rate": 4.084680388225302e-07, + "logits/chosen": -2.0860209465026855, + "logits/rejected": -2.0855419635772705, + "logps/chosen": -25.88998031616211, + "logps/rejected": -35.96123504638672, + "loss": 0.2004, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12960225343704224, + "rewards/margins": 3.401216745376587, + "rewards/rejected": -3.5308187007904053, + "step": 172 + }, + { + "epoch": 0.7139376843185024, + "grad_norm": 13.521536443635728, + "learning_rate": 4.0567163308740925e-07, + "logits/chosen": -2.061061143875122, + "logits/rejected": -2.0628788471221924, + "logps/chosen": -28.78243637084961, + "logps/rejected": -46.390567779541016, + "loss": 0.1995, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46569639444351196, + "rewards/margins": 3.489924430847168, + "rewards/rejected": -3.9556210041046143, + "step": 174 + }, + { + "epoch": 0.7221438645980254, + "grad_norm": 12.241119865111436, + "learning_rate": 4.028430464021445e-07, + "logits/chosen": -2.0593080520629883, + "logits/rejected": -2.0476720333099365, + "logps/chosen": -25.62759780883789, + "logps/rejected": -28.63144302368164, + "loss": 0.2311, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.2619730532169342, + "rewards/margins": 2.6522488594055176, + "rewards/rejected": -2.91422176361084, + "step": 176 + }, + { + "epoch": 0.7303500448775484, + "grad_norm": 19.69681362187802, + "learning_rate": 3.9998286350144517e-07, + "logits/chosen": -2.127859592437744, + "logits/rejected": -2.124366283416748, + "logps/chosen": -27.991050720214844, + "logps/rejected": -32.49789810180664, + "loss": 0.1931, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4937363862991333, + "rewards/margins": 2.826824188232422, + "rewards/rejected": -3.3205604553222656, + "step": 178 + }, + { + "epoch": 0.7385562251570714, + "grad_norm": 18.465447655230893, + "learning_rate": 3.970916756516936e-07, + "logits/chosen": -2.0612165927886963, + "logits/rejected": -2.0634703636169434, + "logps/chosen": -31.92318344116211, + "logps/rejected": -47.82712173461914, + "loss": 0.1869, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8367382287979126, + "rewards/margins": 3.4318110942840576, + "rewards/rejected": -4.26854944229126, + "step": 180 + }, + { + "epoch": 0.7385562251570714, + "eval_logits/chosen": -2.047463893890381, + "eval_logits/rejected": -2.0453338623046875, + "eval_logps/chosen": -25.764009475708008, + "eval_logps/rejected": -38.632667541503906, + "eval_loss": 0.24548786878585815, + "eval_rewards/accuracies": 0.8248847723007202, + "eval_rewards/chosen": -0.5617985725402832, + "eval_rewards/margins": 3.185500383377075, + "eval_rewards/rejected": -3.7472991943359375, + "eval_runtime": 383.1735, + "eval_samples_per_second": 4.525, + "eval_steps_per_second": 1.133, + "step": 180 + }, + { + "epoch": 0.7467624054365944, + "grad_norm": 28.11697685801471, + "learning_rate": 3.941700805287168e-07, + "logits/chosen": -2.100497245788574, + "logits/rejected": -2.0966544151306152, + "logps/chosen": -22.02035140991211, + "logps/rejected": -34.0717658996582, + "loss": 0.2677, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.21246086061000824, + "rewards/margins": 2.8322882652282715, + "rewards/rejected": -3.0447492599487305, + "step": 182 + }, + { + "epoch": 0.7549685857161175, + "grad_norm": 21.185462762042597, + "learning_rate": 3.912186820942329e-07, + "logits/chosen": -2.07576847076416, + "logits/rejected": -2.076803684234619, + "logps/chosen": -28.578994750976562, + "logps/rejected": -53.69257354736328, + "loss": 0.2491, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.507973849773407, + "rewards/margins": 4.095354080200195, + "rewards/rejected": -4.603327751159668, + "step": 184 + }, + { + "epoch": 0.7631747659956405, + "grad_norm": 15.467370455908538, + "learning_rate": 3.8823809047099844e-07, + "logits/chosen": -2.091827630996704, + "logits/rejected": -2.0913453102111816, + "logps/chosen": -22.403928756713867, + "logps/rejected": -48.54361343383789, + "loss": 0.214, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5807015895843506, + "rewards/margins": 3.9955556392669678, + "rewards/rejected": -4.57625675201416, + "step": 186 + }, + { + "epoch": 0.7713809462751635, + "grad_norm": 21.449176063404654, + "learning_rate": 3.8522892181668145e-07, + "logits/chosen": -2.073751449584961, + "logits/rejected": -2.0747601985931396, + "logps/chosen": -25.900894165039062, + "logps/rejected": -46.20444107055664, + "loss": 0.2199, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7566344141960144, + "rewards/margins": 3.398766040802002, + "rewards/rejected": -4.15540075302124, + "step": 188 + }, + { + "epoch": 0.7795871265546865, + "grad_norm": 17.55165479512368, + "learning_rate": 3.821917981964873e-07, + "logits/chosen": -2.0060808658599854, + "logits/rejected": -2.0017528533935547, + "logps/chosen": -22.145849227905273, + "logps/rejected": -36.78887939453125, + "loss": 0.2346, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.17031463980674744, + "rewards/margins": 2.9387521743774414, + "rewards/rejected": -3.1090667247772217, + "step": 190 + }, + { + "epoch": 0.7795871265546865, + "eval_logits/chosen": -2.0468969345092773, + "eval_logits/rejected": -2.044738292694092, + "eval_logps/chosen": -24.78700065612793, + "eval_logps/rejected": -37.58818054199219, + "eval_loss": 0.2414015829563141, + "eval_rewards/accuracies": 0.8329492807388306, + "eval_rewards/chosen": -0.07329300791025162, + "eval_rewards/margins": 3.1517624855041504, + "eval_rewards/rejected": -3.225055456161499, + "eval_runtime": 383.2411, + "eval_samples_per_second": 4.525, + "eval_steps_per_second": 1.132, + "step": 190 + }, + { + "epoch": 0.7877933068342096, + "grad_norm": 17.368985354953686, + "learning_rate": 3.79127347454564e-07, + "logits/chosen": -2.0460448265075684, + "logits/rejected": -2.0370802879333496, + "logps/chosen": -26.563547134399414, + "logps/rejected": -38.11757278442383, + "loss": 0.2211, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37561261653900146, + "rewards/margins": 2.6861519813537598, + "rewards/rejected": -3.0617644786834717, + "step": 192 + }, + { + "epoch": 0.7959994871137326, + "grad_norm": 24.935734742960335, + "learning_rate": 3.760362030842113e-07, + "logits/chosen": -2.0645647048950195, + "logits/rejected": -2.0566651821136475, + "logps/chosen": -25.446151733398438, + "logps/rejected": -29.0418758392334, + "loss": 0.2469, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27131855487823486, + "rewards/margins": 2.571014165878296, + "rewards/rejected": -2.842332124710083, + "step": 194 + }, + { + "epoch": 0.8042056673932555, + "grad_norm": 30.86901898716295, + "learning_rate": 3.7291900409692346e-07, + "logits/chosen": -2.1032328605651855, + "logits/rejected": -2.1033365726470947, + "logps/chosen": -25.201629638671875, + "logps/rejected": -39.323097229003906, + "loss": 0.2264, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.08912299573421478, + "rewards/margins": 3.298488140106201, + "rewards/rejected": -3.387611150741577, + "step": 196 + }, + { + "epoch": 0.8124118476727785, + "grad_norm": 21.758022770529394, + "learning_rate": 3.6977639489029056e-07, + "logits/chosen": -2.0734946727752686, + "logits/rejected": -2.0731263160705566, + "logps/chosen": -23.40597915649414, + "logps/rejected": -54.18486022949219, + "loss": 0.2027, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06441287696361542, + "rewards/margins": 3.9613521099090576, + "rewards/rejected": -3.896939277648926, + "step": 198 + }, + { + "epoch": 0.8206180279523015, + "grad_norm": 24.294166941995968, + "learning_rate": 3.666090251147864e-07, + "logits/chosen": -2.0556201934814453, + "logits/rejected": -2.056826114654541, + "logps/chosen": -22.517078399658203, + "logps/rejected": -50.51237487792969, + "loss": 0.2229, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.15975169837474823, + "rewards/margins": 4.061436176300049, + "rewards/rejected": -4.221188068389893, + "step": 200 + }, + { + "epoch": 0.8206180279523015, + "eval_logits/chosen": -2.0442752838134766, + "eval_logits/rejected": -2.0421106815338135, + "eval_logps/chosen": -24.409955978393555, + "eval_logps/rejected": -37.24555206298828, + "eval_loss": 0.23997244238853455, + "eval_rewards/accuracies": 0.8341013789176941, + "eval_rewards/chosen": 0.1152293011546135, + "eval_rewards/margins": 3.168970823287964, + "eval_rewards/rejected": -3.053741216659546, + "eval_runtime": 383.2231, + "eval_samples_per_second": 4.525, + "eval_steps_per_second": 1.132, + "step": 200 + }, + { + "epoch": 0.8288242082318246, + "grad_norm": 16.34097836225302, + "learning_rate": 3.6341754953947074e-07, + "logits/chosen": -1.9979509115219116, + "logits/rejected": -1.9910781383514404, + "logps/chosen": -19.41321563720703, + "logps/rejected": -31.22482681274414, + "loss": 0.1755, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.12338539958000183, + "rewards/margins": 3.176830291748047, + "rewards/rejected": -3.0534448623657227, + "step": 202 + }, + { + "epoch": 0.8370303885113476, + "grad_norm": 12.694366662014156, + "learning_rate": 3.6020262791663334e-07, + "logits/chosen": -2.0971240997314453, + "logits/rejected": -2.098155975341797, + "logps/chosen": -28.991458892822266, + "logps/rejected": -45.771728515625, + "loss": 0.1939, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.10578002035617828, + "rewards/margins": 3.6688270568847656, + "rewards/rejected": -3.563046932220459, + "step": 204 + }, + { + "epoch": 0.8452365687908706, + "grad_norm": 14.104950698059591, + "learning_rate": 3.569649248454077e-07, + "logits/chosen": -2.0265486240386963, + "logits/rejected": -2.023569107055664, + "logps/chosen": -19.130268096923828, + "logps/rejected": -45.600608825683594, + "loss": 0.1853, + "rewards/accuracies": 0.921875, + "rewards/chosen": 0.10110447555780411, + "rewards/margins": 3.614145040512085, + "rewards/rejected": -3.513040542602539, + "step": 206 + }, + { + "epoch": 0.8534427490703936, + "grad_norm": 17.487336280708007, + "learning_rate": 3.53705109634383e-07, + "logits/chosen": -2.0617876052856445, + "logits/rejected": -2.059702157974243, + "logps/chosen": -22.653236389160156, + "logps/rejected": -33.451725006103516, + "loss": 0.2036, + "rewards/accuracies": 0.859375, + "rewards/chosen": -0.21188408136367798, + "rewards/margins": 2.7427585124969482, + "rewards/rejected": -2.9546422958374023, + "step": 208 + }, + { + "epoch": 0.8616489293499167, + "grad_norm": 9.723870774955257, + "learning_rate": 3.5042385616324236e-07, + "logits/chosen": -2.030812978744507, + "logits/rejected": -2.0279781818389893, + "logps/chosen": -24.24312973022461, + "logps/rejected": -41.24238204956055, + "loss": 0.1987, + "rewards/accuracies": 0.859375, + "rewards/chosen": -0.33003199100494385, + "rewards/margins": 3.4097635746002197, + "rewards/rejected": -3.739795684814453, + "step": 210 + }, + { + "epoch": 0.8616489293499167, + "eval_logits/chosen": -2.044862985610962, + "eval_logits/rejected": -2.042698621749878, + "eval_logps/chosen": -24.65873146057129, + "eval_logps/rejected": -37.76519012451172, + "eval_loss": 0.23567576706409454, + "eval_rewards/accuracies": 0.8375576138496399, + "eval_rewards/chosen": -0.009159128181636333, + "eval_rewards/margins": 3.304400682449341, + "eval_rewards/rejected": -3.3135595321655273, + "eval_runtime": 383.2234, + "eval_samples_per_second": 4.525, + "eval_steps_per_second": 1.132, + "step": 210 + }, + { + "epoch": 0.8698551096294397, + "grad_norm": 21.492517186963823, + "learning_rate": 3.471218427434564e-07, + "logits/chosen": -2.080448865890503, + "logits/rejected": -2.077627182006836, + "logps/chosen": -28.046445846557617, + "logps/rejected": -39.041099548339844, + "loss": 0.2058, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.2396007478237152, + "rewards/margins": 3.291498899459839, + "rewards/rejected": -3.531099557876587, + "step": 212 + }, + { + "epoch": 0.8780612899089627, + "grad_norm": 13.459743982109678, + "learning_rate": 3.4379975197806025e-07, + "logits/chosen": -2.0183067321777344, + "logits/rejected": -2.0159990787506104, + "logps/chosen": -30.474184036254883, + "logps/rejected": -43.47400665283203, + "loss": 0.1623, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.49888333678245544, + "rewards/margins": 3.724670886993408, + "rewards/rejected": -4.2235541343688965, + "step": 214 + }, + { + "epoch": 0.8862674701884857, + "grad_norm": 21.39701909954645, + "learning_rate": 3.404582706205438e-07, + "logits/chosen": -2.0548624992370605, + "logits/rejected": -2.0510313510894775, + "logps/chosen": -25.710582733154297, + "logps/rejected": -47.18820571899414, + "loss": 0.2419, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.6691687107086182, + "rewards/margins": 3.2068655490875244, + "rewards/rejected": -3.8760342597961426, + "step": 216 + }, + { + "epoch": 0.8944736504680088, + "grad_norm": 15.742259895322631, + "learning_rate": 3.370980894328836e-07, + "logits/chosen": -2.0632827281951904, + "logits/rejected": -2.0607898235321045, + "logps/chosen": -29.016246795654297, + "logps/rejected": -29.224483489990234, + "loss": 0.231, + "rewards/accuracies": 0.859375, + "rewards/chosen": -0.02165369875729084, + "rewards/margins": 2.8292150497436523, + "rewards/rejected": -2.8508687019348145, + "step": 218 + }, + { + "epoch": 0.9026798307475318, + "grad_norm": 15.763481233552632, + "learning_rate": 3.337199030427465e-07, + "logits/chosen": -2.0306642055511475, + "logits/rejected": -2.0277583599090576, + "logps/chosen": -24.812223434448242, + "logps/rejected": -42.606937408447266, + "loss": 0.2242, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.5038759112358093, + "rewards/margins": 3.393310070037842, + "rewards/rejected": -3.897186040878296, + "step": 220 + }, + { + "epoch": 0.9026798307475318, + "eval_logits/chosen": -2.0421266555786133, + "eval_logits/rejected": -2.0399889945983887, + "eval_logps/chosen": -24.929790496826172, + "eval_logps/rejected": -38.282798767089844, + "eval_loss": 0.23096635937690735, + "eval_rewards/accuracies": 0.8398617506027222, + "eval_rewards/chosen": -0.144687682390213, + "eval_rewards/margins": 3.4276747703552246, + "eval_rewards/rejected": -3.5723624229431152, + "eval_runtime": 383.2137, + "eval_samples_per_second": 4.525, + "eval_steps_per_second": 1.133, + "step": 220 + }, + { + "epoch": 0.9108860110270548, + "grad_norm": 21.58130767879211, + "learning_rate": 3.303244097998937e-07, + "logits/chosen": -2.059748888015747, + "logits/rejected": -2.055283546447754, + "logps/chosen": -28.7065372467041, + "logps/rejected": -32.10883331298828, + "loss": 0.2483, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.3254169821739197, + "rewards/margins": 2.9877233505249023, + "rewards/rejected": -3.313140392303467, + "step": 222 + }, + { + "epoch": 0.9190921913065778, + "grad_norm": 22.43367663307141, + "learning_rate": 3.2691231163181577e-07, + "logits/chosen": -2.03068470954895, + "logits/rejected": -2.029240608215332, + "logps/chosen": -30.380455017089844, + "logps/rejected": -51.02494430541992, + "loss": 0.2287, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.5234890580177307, + "rewards/margins": 3.933694362640381, + "rewards/rejected": -4.457183361053467, + "step": 224 + }, + { + "epoch": 0.9272983715861007, + "grad_norm": 18.763839267544366, + "learning_rate": 3.2348431389862775e-07, + "logits/chosen": -2.0352532863616943, + "logits/rejected": -2.0309362411499023, + "logps/chosen": -26.602890014648438, + "logps/rejected": -50.593746185302734, + "loss": 0.2013, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.3150665760040283, + "rewards/margins": 4.31168270111084, + "rewards/rejected": -4.626749515533447, + "step": 226 + }, + { + "epoch": 0.9355045518656238, + "grad_norm": 12.546473609091086, + "learning_rate": 3.2004112524725485e-07, + "logits/chosen": -2.102660894393921, + "logits/rejected": -2.098243474960327, + "logps/chosen": -18.035892486572266, + "logps/rejected": -29.32090187072754, + "loss": 0.22, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.6185750365257263, + "rewards/margins": 2.496063232421875, + "rewards/rejected": -3.114638328552246, + "step": 228 + }, + { + "epoch": 0.9437107321451468, + "grad_norm": 15.34640763010342, + "learning_rate": 3.16583457464939e-07, + "logits/chosen": -2.1066555976867676, + "logits/rejected": -2.102388381958008, + "logps/chosen": -22.569570541381836, + "logps/rejected": -35.02971267700195, + "loss": 0.2468, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.08270835876464844, + "rewards/margins": 3.306253433227539, + "rewards/rejected": -3.2235450744628906, + "step": 230 + }, + { + "epoch": 0.9437107321451468, + "eval_logits/chosen": -2.0403828620910645, + "eval_logits/rejected": -2.0383219718933105, + "eval_logps/chosen": -25.11418342590332, + "eval_logps/rejected": -38.591522216796875, + "eval_loss": 0.22682493925094604, + "eval_rewards/accuracies": 0.8364055156707764, + "eval_rewards/chosen": -0.23688402771949768, + "eval_rewards/margins": 3.4898440837860107, + "eval_rewards/rejected": -3.7267279624938965, + "eval_runtime": 383.2213, + "eval_samples_per_second": 4.525, + "eval_steps_per_second": 1.133, + "step": 230 + }, + { + "epoch": 0.9519169124246698, + "grad_norm": 20.317292755951325, + "learning_rate": 3.1311202533209516e-07, + "logits/chosen": -2.066338300704956, + "logits/rejected": -2.0640790462493896, + "logps/chosen": -18.971635818481445, + "logps/rejected": -37.304718017578125, + "loss": 0.2079, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.2257971614599228, + "rewards/margins": 3.128026008605957, + "rewards/rejected": -3.353823184967041, + "step": 232 + }, + { + "epoch": 0.9601230927041928, + "grad_norm": 21.602603549490688, + "learning_rate": 3.096275464745501e-07, + "logits/chosen": -2.0534098148345947, + "logits/rejected": -2.059464931488037, + "logps/chosen": -27.803630828857422, + "logps/rejected": -51.089698791503906, + "loss": 0.1848, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1761884093284607, + "rewards/margins": 3.8570480346679688, + "rewards/rejected": -4.033236503601074, + "step": 234 + }, + { + "epoch": 0.9683292729837158, + "grad_norm": 25.9781205793325, + "learning_rate": 3.061307412151922e-07, + "logits/chosen": -2.063019037246704, + "logits/rejected": -2.0604867935180664, + "logps/chosen": -30.086076736450195, + "logps/rejected": -38.32453536987305, + "loss": 0.2619, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.35281646251678467, + "rewards/margins": 3.2657690048217773, + "rewards/rejected": -3.6185855865478516, + "step": 236 + }, + { + "epoch": 0.9765354532632389, + "grad_norm": 21.253354056577106, + "learning_rate": 3.0262233242506414e-07, + "logits/chosen": -2.0784614086151123, + "logits/rejected": -2.0770914554595947, + "logps/chosen": -21.169662475585938, + "logps/rejected": -38.09800720214844, + "loss": 0.2181, + "rewards/accuracies": 0.890625, + "rewards/chosen": 0.11809039115905762, + "rewards/margins": 3.477996587753296, + "rewards/rejected": -3.359905958175659, + "step": 238 + }, + { + "epoch": 0.9847416335427619, + "grad_norm": 19.6268618703958, + "learning_rate": 2.9910304537392837e-07, + "logits/chosen": -1.9946519136428833, + "logits/rejected": -1.9943212270736694, + "logps/chosen": -25.262042999267578, + "logps/rejected": -37.486366271972656, + "loss": 0.218, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.47168225049972534, + "rewards/margins": 3.64669132232666, + "rewards/rejected": -4.118373394012451, + "step": 240 + }, + { + "epoch": 0.9847416335427619, + "eval_logits/chosen": -2.043826103210449, + "eval_logits/rejected": -2.0417795181274414, + "eval_logps/chosen": -24.943058013916016, + "eval_logps/rejected": -38.50017166137695, + "eval_loss": 0.22486171126365662, + "eval_rewards/accuracies": 0.8387096524238586, + "eval_rewards/chosen": -0.1513207107782364, + "eval_rewards/margins": 3.5297298431396484, + "eval_rewards/rejected": -3.6810505390167236, + "eval_runtime": 383.2373, + "eval_samples_per_second": 4.525, + "eval_steps_per_second": 1.132, + "step": 240 + }, + { + "epoch": 0.9929478138222849, + "grad_norm": 14.682865028202697, + "learning_rate": 2.955736075803371e-07, + "logits/chosen": -2.0690362453460693, + "logits/rejected": -2.070676326751709, + "logps/chosen": -24.48103904724121, + "logps/rejected": -47.019752502441406, + "loss": 0.1536, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06840521097183228, + "rewards/margins": 4.232391357421875, + "rewards/rejected": -4.163986682891846, + "step": 242 + }, + { + "epoch": 1.001153994101808, + "grad_norm": 23.69289209855517, + "learning_rate": 2.9203474866123756e-07, + "logits/chosen": -2.050558090209961, + "logits/rejected": -2.045598030090332, + "logps/chosen": -26.37921142578125, + "logps/rejected": -40.620906829833984, + "loss": 0.2308, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.41915470361709595, + "rewards/margins": 3.705007314682007, + "rewards/rejected": -4.124162197113037, + "step": 244 + }, + { + "epoch": 1.009360174381331, + "grad_norm": 17.446487056986353, + "learning_rate": 2.884872001811425e-07, + "logits/chosen": -1.9810688495635986, + "logits/rejected": -1.9750444889068604, + "logps/chosen": -21.791378021240234, + "logps/rejected": -34.808170318603516, + "loss": 0.1961, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5856488943099976, + "rewards/margins": 3.6117939949035645, + "rewards/rejected": -4.197443008422852, + "step": 246 + }, + { + "epoch": 1.017566354660854, + "grad_norm": 10.922108855871292, + "learning_rate": 2.849316955008996e-07, + "logits/chosen": -2.0840566158294678, + "logits/rejected": -2.0829105377197266, + "logps/chosen": -20.662445068359375, + "logps/rejected": -45.1578254699707, + "loss": 0.1894, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.044170308858156204, + "rewards/margins": 4.373213768005371, + "rewards/rejected": -4.417383670806885, + "step": 248 + }, + { + "epoch": 1.025772534940377, + "grad_norm": 7.713913555670714, + "learning_rate": 2.8136896962608785e-07, + "logits/chosen": -2.0515904426574707, + "logits/rejected": -2.051260232925415, + "logps/chosen": -25.802305221557617, + "logps/rejected": -45.2052116394043, + "loss": 0.1496, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.37995997071266174, + "rewards/margins": 4.365400314331055, + "rewards/rejected": -4.745360374450684, + "step": 250 + }, + { + "epoch": 1.025772534940377, + "eval_logits/chosen": -2.042832612991333, + "eval_logits/rejected": -2.040806293487549, + "eval_logps/chosen": -25.576231002807617, + "eval_logps/rejected": -39.28297805786133, + "eval_loss": 0.2241121530532837, + "eval_rewards/accuracies": 0.8352534770965576, + "eval_rewards/chosen": -0.4679082930088043, + "eval_rewards/margins": 3.604546546936035, + "eval_rewards/rejected": -4.072454452514648, + "eval_runtime": 383.2982, + "eval_samples_per_second": 4.524, + "eval_steps_per_second": 1.132, + "step": 250 + }, + { + "epoch": 1.0339787152199, + "grad_norm": 13.7751082342933, + "learning_rate": 2.777997590550758e-07, + "logits/chosen": -2.155553102493286, + "logits/rejected": -2.1575121879577637, + "logps/chosen": -27.36958122253418, + "logps/rejected": -42.855403900146484, + "loss": 0.1621, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4879440367221832, + "rewards/margins": 3.895385980606079, + "rewards/rejected": -4.38332986831665, + "step": 252 + }, + { + "epoch": 1.042184895499423, + "grad_norm": 16.01605616077575, + "learning_rate": 2.742248016267692e-07, + "logits/chosen": -2.085308790206909, + "logits/rejected": -2.082420825958252, + "logps/chosen": -27.64177703857422, + "logps/rejected": -43.7150993347168, + "loss": 0.1802, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.7673605680465698, + "rewards/margins": 4.039832592010498, + "rewards/rejected": -4.807192325592041, + "step": 254 + }, + { + "epoch": 1.050391075778946, + "grad_norm": 10.928437797325595, + "learning_rate": 2.706448363680831e-07, + "logits/chosen": -2.0981714725494385, + "logits/rejected": -2.0977556705474854, + "logps/chosen": -25.607622146606445, + "logps/rejected": -46.82646179199219, + "loss": 0.1185, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.45319244265556335, + "rewards/margins": 4.636317253112793, + "rewards/rejected": -5.089509010314941, + "step": 256 + }, + { + "epoch": 1.058597256058469, + "grad_norm": 8.863927238249516, + "learning_rate": 2.6706060334116775e-07, + "logits/chosen": -2.0557029247283936, + "logits/rejected": -2.058215379714966, + "logps/chosen": -24.904468536376953, + "logps/rejected": -52.995845794677734, + "loss": 0.1153, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.11815226078033447, + "rewards/margins": 5.022463321685791, + "rewards/rejected": -5.140615940093994, + "step": 258 + }, + { + "epoch": 1.0668034363379921, + "grad_norm": 6.912079383205965, + "learning_rate": 2.634728434904204e-07, + "logits/chosen": -2.0943830013275146, + "logits/rejected": -2.0922083854675293, + "logps/chosen": -27.497676849365234, + "logps/rejected": -48.15060806274414, + "loss": 0.1458, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.2815995514392853, + "rewards/margins": 4.458585739135742, + "rewards/rejected": -4.740184783935547, + "step": 260 + }, + { + "epoch": 1.0668034363379921, + "eval_logits/chosen": -2.0431203842163086, + "eval_logits/rejected": -2.0411202907562256, + "eval_logps/chosen": -25.558095932006836, + "eval_logps/rejected": -39.460453033447266, + "eval_loss": 0.2211485058069229, + "eval_rewards/accuracies": 0.8364055156707764, + "eval_rewards/chosen": -0.4588410556316376, + "eval_rewards/margins": 3.702350378036499, + "eval_rewards/rejected": -4.161191463470459, + "eval_runtime": 383.5467, + "eval_samples_per_second": 4.521, + "eval_steps_per_second": 1.132, + "step": 260 + }, + { + "epoch": 1.0750096166175152, + "grad_norm": 15.832890184225922, + "learning_rate": 2.5988229848931483e-07, + "logits/chosen": -2.0925068855285645, + "logits/rejected": -2.0973916053771973, + "logps/chosen": -35.409889221191406, + "logps/rejected": -56.27671813964844, + "loss": 0.1349, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4806308150291443, + "rewards/margins": 4.840456008911133, + "rewards/rejected": -5.321086883544922, + "step": 262 + }, + { + "epoch": 1.0832157968970382, + "grad_norm": 10.693776832442103, + "learning_rate": 2.562897105870801e-07, + "logits/chosen": -2.0722289085388184, + "logits/rejected": -2.0738039016723633, + "logps/chosen": -22.13025665283203, + "logps/rejected": -36.53873825073242, + "loss": 0.1738, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.3039347231388092, + "rewards/margins": 3.8571736812591553, + "rewards/rejected": -4.161108016967773, + "step": 264 + }, + { + "epoch": 1.0914219771765612, + "grad_norm": 19.182488661584227, + "learning_rate": 2.5269582245526096e-07, + "logits/chosen": -2.0609676837921143, + "logits/rejected": -2.0527615547180176, + "logps/chosen": -25.158451080322266, + "logps/rejected": -37.255897521972656, + "loss": 0.1578, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35766327381134033, + "rewards/margins": 3.5247013568878174, + "rewards/rejected": -3.882364273071289, + "step": 266 + }, + { + "epoch": 1.0996281574560842, + "grad_norm": 9.44923086833568, + "learning_rate": 2.4910137703418926e-07, + "logits/chosen": -2.0382041931152344, + "logits/rejected": -2.0330657958984375, + "logps/chosen": -27.744306564331055, + "logps/rejected": -37.20608901977539, + "loss": 0.1527, + "rewards/accuracies": 0.890625, + "rewards/chosen": 0.0014753900468349457, + "rewards/margins": 4.225948333740234, + "rewards/rejected": -4.224472522735596, + "step": 268 + }, + { + "epoch": 1.107834337735607, + "grad_norm": 10.39281903644009, + "learning_rate": 2.4550711737940205e-07, + "logits/chosen": -2.0400283336639404, + "logits/rejected": -2.0371220111846924, + "logps/chosen": -26.864028930664062, + "logps/rejected": -52.77448272705078, + "loss": 0.1428, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.7589187026023865, + "rewards/margins": 4.930204391479492, + "rewards/rejected": -5.689122200012207, + "step": 270 + }, + { + "epoch": 1.107834337735607, + "eval_logits/chosen": -2.043691635131836, + "eval_logits/rejected": -2.0416359901428223, + "eval_logps/chosen": -25.027318954467773, + "eval_logps/rejected": -38.982704162597656, + "eval_loss": 0.22408966720104218, + "eval_rewards/accuracies": 0.8364055156707764, + "eval_rewards/chosen": -0.19345209002494812, + "eval_rewards/margins": 3.7288661003112793, + "eval_rewards/rejected": -3.9223177433013916, + "eval_runtime": 383.6577, + "eval_samples_per_second": 4.52, + "eval_steps_per_second": 1.131, + "step": 270 + }, + { + "epoch": 1.11604051801513, + "grad_norm": 6.39182395545924, + "learning_rate": 2.419137865080337e-07, + "logits/chosen": -2.069648027420044, + "logits/rejected": -2.0665249824523926, + "logps/chosen": -22.426170349121094, + "logps/rejected": -28.420143127441406, + "loss": 0.1774, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.1105562299489975, + "rewards/margins": 3.127821683883667, + "rewards/rejected": -3.238377571105957, + "step": 272 + }, + { + "epoch": 1.124246698294653, + "grad_norm": 11.027723946191974, + "learning_rate": 2.383221272452178e-07, + "logits/chosen": -2.0779690742492676, + "logits/rejected": -2.087660312652588, + "logps/chosen": -24.797609329223633, + "logps/rejected": -65.99850463867188, + "loss": 0.1436, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.11033198237419128, + "rewards/margins": 5.425482749938965, + "rewards/rejected": -5.53581428527832, + "step": 274 + }, + { + "epoch": 1.132452878574176, + "grad_norm": 17.805145954377792, + "learning_rate": 2.3473288207052741e-07, + "logits/chosen": -2.08833384513855, + "logits/rejected": -2.0850491523742676, + "logps/chosen": -25.857656478881836, + "logps/rejected": -32.381996154785156, + "loss": 0.1588, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1286620795726776, + "rewards/margins": 3.531426191329956, + "rewards/rejected": -3.660088062286377, + "step": 276 + }, + { + "epoch": 1.1406590588536991, + "grad_norm": 17.682382824702852, + "learning_rate": 2.3114679296448726e-07, + "logits/chosen": -2.131256341934204, + "logits/rejected": -2.125654697418213, + "logps/chosen": -24.452953338623047, + "logps/rejected": -31.313426971435547, + "loss": 0.2115, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.033705681562423706, + "rewards/margins": 3.4805080890655518, + "rewards/rejected": -3.514213800430298, + "step": 278 + }, + { + "epoch": 1.1488652391332221, + "grad_norm": 16.145295646746973, + "learning_rate": 2.2756460125518942e-07, + "logits/chosen": -2.0623483657836914, + "logits/rejected": -2.056093692779541, + "logps/chosen": -21.44537353515625, + "logps/rejected": -35.502716064453125, + "loss": 0.1726, + "rewards/accuracies": 0.859375, + "rewards/chosen": -0.28275609016418457, + "rewards/margins": 3.6864452362060547, + "rewards/rejected": -3.96920108795166, + "step": 280 + }, + { + "epoch": 1.1488652391332221, + "eval_logits/chosen": -2.045440196990967, + "eval_logits/rejected": -2.0434725284576416, + "eval_logps/chosen": -25.390390396118164, + "eval_logps/rejected": -39.54319763183594, + "eval_loss": 0.2192622274160385, + "eval_rewards/accuracies": 0.8352534770965576, + "eval_rewards/chosen": -0.37498798966407776, + "eval_rewards/margins": 3.8275747299194336, + "eval_rewards/rejected": -4.2025628089904785, + "eval_runtime": 383.5119, + "eval_samples_per_second": 4.521, + "eval_steps_per_second": 1.132, + "step": 280 + }, + { + "epoch": 1.1570714194127452, + "grad_norm": 18.875320195670874, + "learning_rate": 2.2398704746504318e-07, + "logits/chosen": -1.9832574129104614, + "logits/rejected": -1.9829756021499634, + "logps/chosen": -31.6018123626709, + "logps/rejected": -41.83821105957031, + "loss": 0.1804, + "rewards/accuracies": 0.859375, + "rewards/chosen": -0.4063544273376465, + "rewards/margins": 4.4296417236328125, + "rewards/rejected": -4.835995674133301, + "step": 282 + }, + { + "epoch": 1.1652775996922682, + "grad_norm": 14.790336639999394, + "learning_rate": 2.20414871157692e-07, + "logits/chosen": -2.0224382877349854, + "logits/rejected": -2.027156352996826, + "logps/chosen": -25.788434982299805, + "logps/rejected": -50.23531723022461, + "loss": 0.1406, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.5073741674423218, + "rewards/margins": 4.580756664276123, + "rewards/rejected": -5.088130474090576, + "step": 284 + }, + { + "epoch": 1.1734837799717912, + "grad_norm": 14.288848495922638, + "learning_rate": 2.1684881078512867e-07, + "logits/chosen": -2.0773634910583496, + "logits/rejected": -2.0686299800872803, + "logps/chosen": -25.211549758911133, + "logps/rejected": -30.360422134399414, + "loss": 0.1824, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.5571584105491638, + "rewards/margins": 3.3241891860961914, + "rewards/rejected": -3.881347417831421, + "step": 286 + }, + { + "epoch": 1.1816899602513142, + "grad_norm": 17.171906139091153, + "learning_rate": 2.1328960353503978e-07, + "logits/chosen": -2.085387945175171, + "logits/rejected": -2.0828468799591064, + "logps/chosen": -23.766998291015625, + "logps/rejected": -33.4593620300293, + "loss": 0.1488, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.5340551137924194, + "rewards/margins": 3.7335057258605957, + "rewards/rejected": -4.267560958862305, + "step": 288 + }, + { + "epoch": 1.1898961405308373, + "grad_norm": 13.043172605616002, + "learning_rate": 2.0973798517841173e-07, + "logits/chosen": -2.018440008163452, + "logits/rejected": -2.0149004459381104, + "logps/chosen": -31.094905853271484, + "logps/rejected": -34.58868408203125, + "loss": 0.1415, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.8485018610954285, + "rewards/margins": 3.8137242794036865, + "rewards/rejected": -4.66222620010376, + "step": 290 + }, + { + "epoch": 1.1898961405308373, + "eval_logits/chosen": -2.0464136600494385, + "eval_logits/rejected": -2.044499158859253, + "eval_logps/chosen": -25.900218963623047, + "eval_logps/rejected": -40.12266159057617, + "eval_loss": 0.21777255833148956, + "eval_rewards/accuracies": 0.8364055156707764, + "eval_rewards/chosen": -0.6299027800559998, + "eval_rewards/margins": 3.8623909950256348, + "eval_rewards/rejected": -4.492293834686279, + "eval_runtime": 383.3374, + "eval_samples_per_second": 4.523, + "eval_steps_per_second": 1.132, + "step": 290 + }, + { + "epoch": 1.1981023208103603, + "grad_norm": 20.638807271142973, + "learning_rate": 2.0619468991743042e-07, + "logits/chosen": -2.061304807662964, + "logits/rejected": -2.0597338676452637, + "logps/chosen": -24.799753189086914, + "logps/rejected": -57.012210845947266, + "loss": 0.1738, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.1362073421478271, + "rewards/margins": 5.481666564941406, + "rewards/rejected": -6.617873191833496, + "step": 292 + }, + { + "epoch": 1.2063085010898833, + "grad_norm": 15.936474764605512, + "learning_rate": 2.026604502337039e-07, + "logits/chosen": -2.047367572784424, + "logits/rejected": -2.044090747833252, + "logps/chosen": -27.630414962768555, + "logps/rejected": -50.361202239990234, + "loss": 0.1453, + "rewards/accuracies": 0.890625, + "rewards/chosen": 0.04436264932155609, + "rewards/margins": 4.908329486846924, + "rewards/rejected": -4.863966941833496, + "step": 294 + }, + { + "epoch": 1.2145146813694063, + "grad_norm": 15.408621452000459, + "learning_rate": 1.9913599673684159e-07, + "logits/chosen": -2.0377581119537354, + "logits/rejected": -2.0361804962158203, + "logps/chosen": -22.501596450805664, + "logps/rejected": -40.86521530151367, + "loss": 0.1384, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.5628668069839478, + "rewards/margins": 3.820146083831787, + "rewards/rejected": -4.383012771606445, + "step": 296 + }, + { + "epoch": 1.2227208616489293, + "grad_norm": 12.285332508425714, + "learning_rate": 1.9562205801342034e-07, + "logits/chosen": -2.061429738998413, + "logits/rejected": -2.0591816902160645, + "logps/chosen": -29.394742965698242, + "logps/rejected": -43.32746505737305, + "loss": 0.1526, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.384346067905426, + "rewards/margins": 4.4763689041137695, + "rewards/rejected": -4.860714912414551, + "step": 298 + }, + { + "epoch": 1.2309270419284524, + "grad_norm": 5.583982281384929, + "learning_rate": 1.9211936047636867e-07, + "logits/chosen": -2.0710768699645996, + "logits/rejected": -2.0669403076171875, + "logps/chosen": -20.023929595947266, + "logps/rejected": -34.831573486328125, + "loss": 0.1595, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.22750067710876465, + "rewards/margins": 4.123189926147461, + "rewards/rejected": -4.350690841674805, + "step": 300 + }, + { + "epoch": 1.2309270419284524, + "eval_logits/chosen": -2.0440096855163574, + "eval_logits/rejected": -2.042032480239868, + "eval_logps/chosen": -24.7576961517334, + "eval_logps/rejected": -38.71461486816406, + "eval_loss": 0.2164347618818283, + "eval_rewards/accuracies": 0.8387096524238586, + "eval_rewards/chosen": -0.058640193194150925, + "eval_rewards/margins": 3.729631185531616, + "eval_rewards/rejected": -3.788270950317383, + "eval_runtime": 383.42, + "eval_samples_per_second": 4.522, + "eval_steps_per_second": 1.132, + "step": 300 + }, + { + "epoch": 1.2391332222079754, + "grad_norm": 11.637212026135893, + "learning_rate": 1.886286282148002e-07, + "logits/chosen": -2.0720925331115723, + "logits/rejected": -2.06878399848938, + "logps/chosen": -21.090686798095703, + "logps/rejected": -28.20726776123047, + "loss": 0.156, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.15111806988716125, + "rewards/margins": 3.5836377143859863, + "rewards/rejected": -3.4325199127197266, + "step": 302 + }, + { + "epoch": 1.2473394024874984, + "grad_norm": 6.71334576605166, + "learning_rate": 1.8515058284432743e-07, + "logits/chosen": -2.1127841472625732, + "logits/rejected": -2.111618757247925, + "logps/chosen": -23.5858097076416, + "logps/rejected": -42.50680923461914, + "loss": 0.1521, + "rewards/accuracies": 0.890625, + "rewards/chosen": -0.2578216791152954, + "rewards/margins": 4.158515453338623, + "rewards/rejected": -4.416337013244629, + "step": 304 + }, + { + "epoch": 1.2555455827670214, + "grad_norm": 16.588064814380605, + "learning_rate": 1.8168594335788728e-07, + "logits/chosen": -2.042862892150879, + "logits/rejected": -2.0439772605895996, + "logps/chosen": -24.17473030090332, + "logps/rejected": -53.81690216064453, + "loss": 0.177, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.44858402013778687, + "rewards/margins": 4.805813789367676, + "rewards/rejected": -5.254397392272949, + "step": 306 + }, + { + "epoch": 1.2637517630465445, + "grad_norm": 6.647490191841678, + "learning_rate": 1.7823542597710832e-07, + "logits/chosen": -2.0013082027435303, + "logits/rejected": -2.004103183746338, + "logps/chosen": -23.131669998168945, + "logps/rejected": -43.91994857788086, + "loss": 0.1689, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.4323474168777466, + "rewards/margins": 4.400308609008789, + "rewards/rejected": -3.967961311340332, + "step": 308 + }, + { + "epoch": 1.2719579433260675, + "grad_norm": 13.202795454945317, + "learning_rate": 1.7479974400425123e-07, + "logits/chosen": -2.046172857284546, + "logits/rejected": -2.0446221828460693, + "logps/chosen": -23.60657501220703, + "logps/rejected": -39.958396911621094, + "loss": 0.1706, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.3769833445549011, + "rewards/margins": 4.0399041175842285, + "rewards/rejected": -3.66292142868042, + "step": 310 + }, + { + "epoch": 1.2719579433260675, + "eval_logits/chosen": -2.043919086456299, + "eval_logits/rejected": -2.04194974899292, + "eval_logps/chosen": -24.46892547607422, + "eval_logps/rejected": -38.424713134765625, + "eval_loss": 0.21722769737243652, + "eval_rewards/accuracies": 0.8375576138496399, + "eval_rewards/chosen": 0.08574579656124115, + "eval_rewards/margins": 3.7290680408477783, + "eval_rewards/rejected": -3.643322467803955, + "eval_runtime": 383.1854, + "eval_samples_per_second": 4.525, + "eval_steps_per_second": 1.133, + "step": 310 + }, + { + "epoch": 1.2801641236055905, + "grad_norm": 10.439998982466557, + "learning_rate": 1.7137960767475263e-07, + "logits/chosen": -2.062668561935425, + "logits/rejected": -2.058307647705078, + "logps/chosen": -20.401418685913086, + "logps/rejected": -39.55351257324219, + "loss": 0.179, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.12827420234680176, + "rewards/margins": 4.090062141418457, + "rewards/rejected": -3.9617881774902344, + "step": 312 + }, + { + "epoch": 1.2883703038851135, + "grad_norm": 7.172405069379902, + "learning_rate": 1.6797572401040244e-07, + "logits/chosen": -2.0692923069000244, + "logits/rejected": -2.067417860031128, + "logps/chosen": -30.879615783691406, + "logps/rejected": -40.37434387207031, + "loss": 0.1199, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.12180179357528687, + "rewards/margins": 4.198694229125977, + "rewards/rejected": -4.076892375946045, + "step": 314 + }, + { + "epoch": 1.2965764841646366, + "grad_norm": 24.11773202321173, + "learning_rate": 1.6458879667318687e-07, + "logits/chosen": -2.064960479736328, + "logits/rejected": -2.0638976097106934, + "logps/chosen": -29.61989974975586, + "logps/rejected": -38.073875427246094, + "loss": 0.1542, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.3140927255153656, + "rewards/margins": 3.569211721420288, + "rewards/rejected": -3.8833043575286865, + "step": 316 + }, + { + "epoch": 1.3047826644441596, + "grad_norm": 15.353619034619824, + "learning_rate": 1.612195258198243e-07, + "logits/chosen": -2.1528408527374268, + "logits/rejected": -2.1589419841766357, + "logps/chosen": -25.104019165039062, + "logps/rejected": -56.826202392578125, + "loss": 0.1502, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.11969700455665588, + "rewards/margins": 5.174037933349609, + "rewards/rejected": -5.054340362548828, + "step": 318 + }, + { + "epoch": 1.3129888447236826, + "grad_norm": 6.821222616952283, + "learning_rate": 1.57868607957027e-07, + "logits/chosen": -2.0985031127929688, + "logits/rejected": -2.0935959815979004, + "logps/chosen": -30.2029972076416, + "logps/rejected": -43.66718292236328, + "loss": 0.1404, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.6867274641990662, + "rewards/margins": 4.196517467498779, + "rewards/rejected": -4.883245468139648, + "step": 320 + }, + { + "epoch": 1.3129888447236826, + "eval_logits/chosen": -2.0446114540100098, + "eval_logits/rejected": -2.042722702026367, + "eval_logps/chosen": -25.383127212524414, + "eval_logps/rejected": -39.56816101074219, + "eval_loss": 0.21271604299545288, + "eval_rewards/accuracies": 0.8375576138496399, + "eval_rewards/chosen": -0.37135595083236694, + "eval_rewards/margins": 3.84369158744812, + "eval_rewards/rejected": -4.215047359466553, + "eval_runtime": 383.232, + "eval_samples_per_second": 4.525, + "eval_steps_per_second": 1.132, + "step": 320 + } + ], + "logging_steps": 2, + "max_steps": 486, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}