{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.28565863792767643, "eval_steps": 1000, "global_step": 1100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 1072.96875, "epoch": 0.0002596896708433422, "grad_norm": 0.11445748805999756, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": 0.2734375111758709, "reward_std": 0.10437176283448935, "rewards/spct_argmax_reward_func": 0.125, "rewards/spct_format_reward_func": 0.1484375, "step": 1 }, { "completion_length": 1276.6875, "epoch": 0.0005193793416866844, "grad_norm": 0.09729848057031631, "kl": 0.0, "learning_rate": 2e-05, "loss": 0.0, "reward": 0.3335937596857548, "reward_std": 0.24471135437488556, "rewards/spct_argmax_reward_func": 0.171875, "rewards/spct_format_reward_func": 0.16171875223517418, "step": 2 }, { "completion_length": 1303.0625, "epoch": 0.0007790690125300266, "grad_norm": 0.10673292726278305, "kl": 0.0019175222550984472, "learning_rate": 4e-05, "loss": 0.0, "reward": 0.2582031302154064, "reward_std": 0.09682012489065528, "rewards/spct_argmax_reward_func": 0.109375, "rewards/spct_format_reward_func": 0.14882813021540642, "step": 3 }, { "completion_length": 1154.859375, "epoch": 0.0010387586833733688, "grad_norm": 0.12231016904115677, "kl": 0.0021098374272696674, "learning_rate": 6e-05, "loss": 0.0, "reward": 0.2691406272351742, "reward_std": 0.16900215297937393, "rewards/spct_argmax_reward_func": 0.125, "rewards/spct_format_reward_func": 0.14414062537252903, "step": 4 }, { "completion_length": 1252.90625, "epoch": 0.001298448354216711, "grad_norm": 0.13428553938865662, "kl": 0.0030718485359102488, "learning_rate": 8e-05, "loss": 0.0, "reward": 0.32343750447034836, "reward_std": 0.2041499074548483, "rewards/spct_argmax_reward_func": 0.15625, "rewards/spct_format_reward_func": 0.16718750074505806, "step": 5 }, { "completion_length": 1136.4375, "epoch": 0.0015581380250600531, "grad_norm": 0.13827918469905853, "kl": 0.018605221062898636, "learning_rate": 0.0001, "loss": 0.0, "reward": 0.4726562798023224, "reward_std": 0.2657049186527729, "rewards/spct_argmax_reward_func": 0.296875, "rewards/spct_format_reward_func": 0.1757812462747097, "step": 6 }, { "completion_length": 1090.359375, "epoch": 0.0018178276959033954, "grad_norm": 0.13010632991790771, "kl": 0.015392395434901118, "learning_rate": 9.999999983349353e-05, "loss": 0.0, "reward": 0.3062500059604645, "reward_std": 0.16459800768643618, "rewards/spct_argmax_reward_func": 0.125, "rewards/spct_format_reward_func": 0.18124999850988388, "step": 7 }, { "completion_length": 991.6875, "epoch": 0.0020775173667467377, "grad_norm": 0.11427158117294312, "kl": 0.07720394432544708, "learning_rate": 9.999999933397414e-05, "loss": 0.0, "reward": 0.45625002682209015, "reward_std": 0.33481547981500626, "rewards/spct_argmax_reward_func": 0.265625, "rewards/spct_format_reward_func": 0.19062500074505806, "step": 8 }, { "completion_length": 782.1875, "epoch": 0.00233720703759008, "grad_norm": 0.15961626172065735, "kl": 0.06521674524992704, "learning_rate": 9.999999850144181e-05, "loss": 0.0, "reward": 0.5203125216066837, "reward_std": 0.2377968579530716, "rewards/spct_argmax_reward_func": 0.328125, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 9 }, { "completion_length": 868.234375, "epoch": 0.002596896708433422, "grad_norm": 0.14769606292247772, "kl": 0.553159249946475, "learning_rate": 9.999999733589656e-05, "loss": 0.0003, "reward": 0.5890625417232513, "reward_std": 0.2666206434369087, "rewards/spct_argmax_reward_func": 0.390625, "rewards/spct_format_reward_func": 0.19843750074505806, "step": 10 }, { "completion_length": 754.28125, "epoch": 0.0028565863792767644, "grad_norm": 0.16362467408180237, "kl": 0.5701745375990868, "learning_rate": 9.999999583733839e-05, "loss": 0.0003, "reward": 0.5140625238418579, "reward_std": 0.26557842642068863, "rewards/spct_argmax_reward_func": 0.328125, "rewards/spct_format_reward_func": 0.18593750149011612, "step": 11 }, { "completion_length": 776.015625, "epoch": 0.0031162760501201063, "grad_norm": 1.6908760070800781, "kl": 10.896993659436703, "learning_rate": 9.999999400576732e-05, "loss": 0.0054, "reward": 0.42460938543081284, "reward_std": 0.2391582392156124, "rewards/spct_argmax_reward_func": 0.234375, "rewards/spct_format_reward_func": 0.19023437798023224, "step": 12 }, { "completion_length": 737.21875, "epoch": 0.0033759657209634485, "grad_norm": 2.1586380004882812, "kl": 8.607151478528976, "learning_rate": 9.999999184118334e-05, "loss": 0.0043, "reward": 0.6054687798023224, "reward_std": 0.23449104744940996, "rewards/spct_argmax_reward_func": 0.40625, "rewards/spct_format_reward_func": 0.19921875, "step": 13 }, { "completion_length": 684.59375, "epoch": 0.003635655391806791, "grad_norm": 0.6662282347679138, "kl": 2.5646320283412933, "learning_rate": 9.999998934358648e-05, "loss": 0.0013, "reward": 0.7765625417232513, "reward_std": 0.20380739867687225, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.19843750074505806, "step": 14 }, { "completion_length": 626.140625, "epoch": 0.003895345062650133, "grad_norm": 0.13170087337493896, "kl": 0.4185841456055641, "learning_rate": 9.999998651297678e-05, "loss": 0.0002, "reward": 0.5585937798023224, "reward_std": 0.19873128086328506, "rewards/spct_argmax_reward_func": 0.375, "rewards/spct_format_reward_func": 0.1835937537252903, "step": 15 }, { "completion_length": 667.5, "epoch": 0.004155034733493475, "grad_norm": 37.868743896484375, "kl": 221.97751937061548, "learning_rate": 9.999998334935422e-05, "loss": 0.111, "reward": 0.7578125447034836, "reward_std": 0.2756754830479622, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.1953125037252903, "step": 16 }, { "completion_length": 686.828125, "epoch": 0.004414724404336818, "grad_norm": 3.6042239665985107, "kl": 28.432168379426003, "learning_rate": 9.999997985271884e-05, "loss": 0.0142, "reward": 0.46210939437150955, "reward_std": 0.1969691440463066, "rewards/spct_argmax_reward_func": 0.265625, "rewards/spct_format_reward_func": 0.19648437201976776, "step": 17 }, { "completion_length": 655.84375, "epoch": 0.00467441407518016, "grad_norm": 0.16615164279937744, "kl": 1.4234853200614452, "learning_rate": 9.999997602307065e-05, "loss": 0.0007, "reward": 0.494531262665987, "reward_std": 0.20726894959807396, "rewards/spct_argmax_reward_func": 0.3125, "rewards/spct_format_reward_func": 0.18203124776482582, "step": 18 }, { "completion_length": 738.53125, "epoch": 0.004934103746023502, "grad_norm": 0.106849804520607, "kl": 0.34067749232053757, "learning_rate": 9.99999718604097e-05, "loss": 0.0002, "reward": 0.7042969167232513, "reward_std": 0.22920003160834312, "rewards/spct_argmax_reward_func": 0.515625, "rewards/spct_format_reward_func": 0.18867187574505806, "step": 19 }, { "completion_length": 724.84375, "epoch": 0.005193793416866844, "grad_norm": 0.09250710904598236, "kl": 0.21815712377429008, "learning_rate": 9.9999967364736e-05, "loss": 0.0001, "reward": 0.6867187917232513, "reward_std": 0.23481567203998566, "rewards/spct_argmax_reward_func": 0.5, "rewards/spct_format_reward_func": 0.18671875447034836, "step": 20 }, { "completion_length": 708.78125, "epoch": 0.005453483087710187, "grad_norm": 0.12125667929649353, "kl": 0.24770721793174744, "learning_rate": 9.999996253604958e-05, "loss": 0.0001, "reward": 0.6792968958616257, "reward_std": 0.35128287971019745, "rewards/spct_argmax_reward_func": 0.5, "rewards/spct_format_reward_func": 0.17929687723517418, "step": 21 }, { "completion_length": 759.078125, "epoch": 0.005713172758553529, "grad_norm": 0.1062643751502037, "kl": 0.5302725434303284, "learning_rate": 9.999995737435048e-05, "loss": 0.0003, "reward": 0.5406250134110451, "reward_std": 0.21039529517292976, "rewards/spct_argmax_reward_func": 0.359375, "rewards/spct_format_reward_func": 0.18125000223517418, "step": 22 }, { "completion_length": 805.078125, "epoch": 0.005972862429396871, "grad_norm": 0.12250348180532455, "kl": 0.2985941097140312, "learning_rate": 9.999995187963873e-05, "loss": 0.0001, "reward": 0.7164062857627869, "reward_std": 0.26870644837617874, "rewards/spct_argmax_reward_func": 0.53125, "rewards/spct_format_reward_func": 0.18515624850988388, "step": 23 }, { "completion_length": 783.03125, "epoch": 0.0062325521002402125, "grad_norm": 0.08847484737634659, "kl": 0.4012532904744148, "learning_rate": 9.999994605191435e-05, "loss": 0.0002, "reward": 0.6339844241738319, "reward_std": 0.2366088479757309, "rewards/spct_argmax_reward_func": 0.453125, "rewards/spct_format_reward_func": 0.18085937574505806, "step": 24 }, { "completion_length": 767.65625, "epoch": 0.006492241771083555, "grad_norm": 0.10108791291713715, "kl": 0.30916163325309753, "learning_rate": 9.999993989117742e-05, "loss": 0.0002, "reward": 0.5886719152331352, "reward_std": 0.16554560512304306, "rewards/spct_argmax_reward_func": 0.40625, "rewards/spct_format_reward_func": 0.18242187798023224, "step": 25 }, { "completion_length": 867.765625, "epoch": 0.006751931441926897, "grad_norm": 0.08807889372110367, "kl": 0.2798624113202095, "learning_rate": 9.999993339742794e-05, "loss": 0.0001, "reward": 0.5742187723517418, "reward_std": 0.1610843874514103, "rewards/spct_argmax_reward_func": 0.375, "rewards/spct_format_reward_func": 0.1992187574505806, "step": 26 }, { "completion_length": 852.21875, "epoch": 0.007011621112770239, "grad_norm": 0.072133868932724, "kl": 0.22349048405885696, "learning_rate": 9.9999926570666e-05, "loss": 0.0001, "reward": 0.7148437947034836, "reward_std": 0.22998128086328506, "rewards/spct_argmax_reward_func": 0.515625, "rewards/spct_format_reward_func": 0.19921875, "step": 27 }, { "completion_length": 840.484375, "epoch": 0.007271310783613582, "grad_norm": 0.09626276046037674, "kl": 0.4776938408613205, "learning_rate": 9.999991941089158e-05, "loss": 0.0002, "reward": 0.7296875417232513, "reward_std": 0.27114178240299225, "rewards/spct_argmax_reward_func": 0.53125, "rewards/spct_format_reward_func": 0.19843750074505806, "step": 28 }, { "completion_length": 858.234375, "epoch": 0.007531000454456924, "grad_norm": 0.1508883535861969, "kl": 1.5821395218372345, "learning_rate": 9.999991191810477e-05, "loss": 0.0008, "reward": 0.7312500402331352, "reward_std": 0.2235843911767006, "rewards/spct_argmax_reward_func": 0.53125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 29 }, { "completion_length": 864.5625, "epoch": 0.007790690125300266, "grad_norm": 4.6276164054870605, "kl": 10.105692274868488, "learning_rate": 9.999990409230562e-05, "loss": 0.0051, "reward": 0.4609375298023224, "reward_std": 0.21294067427515984, "rewards/spct_argmax_reward_func": 0.296875, "rewards/spct_format_reward_func": 0.16406250186264515, "step": 30 }, { "completion_length": 789.625, "epoch": 0.00805037979614361, "grad_norm": 0.20775368809700012, "kl": 0.7179489508271217, "learning_rate": 9.999989593349417e-05, "loss": 0.0004, "reward": 0.7648437917232513, "reward_std": 0.16682089492678642, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.18671875074505806, "step": 31 }, { "completion_length": 869.9375, "epoch": 0.00831006946698695, "grad_norm": 0.06856699287891388, "kl": 0.21709056198596954, "learning_rate": 9.999988744167049e-05, "loss": 0.0001, "reward": 0.8867188096046448, "reward_std": 0.19873128458857536, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19921875, "step": 32 }, { "completion_length": 788.125, "epoch": 0.008569759137830292, "grad_norm": 0.07071693986654282, "kl": 0.2969743236899376, "learning_rate": 9.999987861683459e-05, "loss": 0.0001, "reward": 0.7394531518220901, "reward_std": 0.06418336002388969, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.17695312947034836, "step": 33 }, { "completion_length": 874.328125, "epoch": 0.008829448808673635, "grad_norm": 0.05442342534661293, "kl": 0.24879979342222214, "learning_rate": 9.999986945898661e-05, "loss": 0.0001, "reward": 0.9500000327825546, "reward_std": 0.1721687838435173, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.18437500298023224, "step": 34 }, { "completion_length": 806.109375, "epoch": 0.009089138479516977, "grad_norm": 0.06062344089150429, "kl": 0.2999439351260662, "learning_rate": 9.999985996812654e-05, "loss": 0.0001, "reward": 0.7257812954485416, "reward_std": 0.09531249850988388, "rewards/spct_argmax_reward_func": 0.546875, "rewards/spct_format_reward_func": 0.17890625074505806, "step": 35 }, { "completion_length": 783.65625, "epoch": 0.00934882815036032, "grad_norm": 0.07036161422729492, "kl": 0.2781189791858196, "learning_rate": 9.999985014425448e-05, "loss": 0.0001, "reward": 0.7812500447034836, "reward_std": 0.1875, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 36 }, { "completion_length": 833.859375, "epoch": 0.009608517821203661, "grad_norm": 0.10208093374967575, "kl": 0.43649233877658844, "learning_rate": 9.999983998737047e-05, "loss": 0.0002, "reward": 0.7929687947034836, "reward_std": 0.26123128086328506, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.19921875, "step": 37 }, { "completion_length": 802.65625, "epoch": 0.009868207492047004, "grad_norm": 0.06406177580356598, "kl": 0.28247159719467163, "learning_rate": 9.999982949747459e-05, "loss": 0.0001, "reward": 0.9179688096046448, "reward_std": 0.18906250596046448, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.1992187537252903, "step": 38 }, { "completion_length": 819.96875, "epoch": 0.010127897162890346, "grad_norm": 0.06570809334516525, "kl": 0.29778867214918137, "learning_rate": 9.999981867456694e-05, "loss": 0.0001, "reward": 0.7468750476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.546875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 39 }, { "completion_length": 755.25, "epoch": 0.010387586833733689, "grad_norm": 0.09781163930892944, "kl": 0.4788541868329048, "learning_rate": 9.999980751864753e-05, "loss": 0.0002, "reward": 0.6539062932133675, "reward_std": 0.2628084868192673, "rewards/spct_argmax_reward_func": 0.46875, "rewards/spct_format_reward_func": 0.18515625223517418, "step": 40 }, { "completion_length": 867.140625, "epoch": 0.01064727650457703, "grad_norm": 0.0996704027056694, "kl": 0.3174842745065689, "learning_rate": 9.99997960297165e-05, "loss": 0.0002, "reward": 0.7058594077825546, "reward_std": 0.33427614718675613, "rewards/spct_argmax_reward_func": 0.515625, "rewards/spct_format_reward_func": 0.19023437425494194, "step": 41 }, { "completion_length": 752.828125, "epoch": 0.010906966175420373, "grad_norm": 0.07650017738342285, "kl": 0.40758591890335083, "learning_rate": 9.999978420777388e-05, "loss": 0.0002, "reward": 0.6828125342726707, "reward_std": 0.20060018077492714, "rewards/spct_argmax_reward_func": 0.5, "rewards/spct_format_reward_func": 0.18281250074505806, "step": 42 }, { "completion_length": 698.890625, "epoch": 0.011166655846263715, "grad_norm": 0.0643046572804451, "kl": 0.4105265587568283, "learning_rate": 9.999977205281975e-05, "loss": 0.0002, "reward": 0.48164065182209015, "reward_std": 0.06813925999449566, "rewards/spct_argmax_reward_func": 0.3125, "rewards/spct_format_reward_func": 0.16914062574505806, "step": 43 }, { "completion_length": 766.96875, "epoch": 0.011426345517107058, "grad_norm": 0.06558862328529358, "kl": 0.3538302332162857, "learning_rate": 9.999975956485421e-05, "loss": 0.0002, "reward": 0.8718750476837158, "reward_std": 0.17558756470680237, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 44 }, { "completion_length": 780.90625, "epoch": 0.0116860351879504, "grad_norm": 0.06522978097200394, "kl": 0.3195286840200424, "learning_rate": 9.999974674387735e-05, "loss": 0.0002, "reward": 0.7722656726837158, "reward_std": 0.17262893170118332, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.19414062798023224, "step": 45 }, { "completion_length": 759.734375, "epoch": 0.011945724858793742, "grad_norm": 0.04786170646548271, "kl": 0.4027058333158493, "learning_rate": 9.999973358988924e-05, "loss": 0.0002, "reward": 0.7152344286441803, "reward_std": 0.06811564136296511, "rewards/spct_argmax_reward_func": 0.515625, "rewards/spct_format_reward_func": 0.19960937649011612, "step": 46 }, { "completion_length": 683.90625, "epoch": 0.012205414529637084, "grad_norm": 0.09078361093997955, "kl": 0.4074721783399582, "learning_rate": 9.999972010288996e-05, "loss": 0.0002, "reward": 0.7558594197034836, "reward_std": 0.2641262710094452, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.193359375, "step": 47 }, { "completion_length": 747.96875, "epoch": 0.012465104200480425, "grad_norm": 0.05354730039834976, "kl": 0.3497168868780136, "learning_rate": 9.999970628287961e-05, "loss": 0.0002, "reward": 0.8398438021540642, "reward_std": 0.06823650375008583, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.19921875, "step": 48 }, { "completion_length": 649.484375, "epoch": 0.012724793871323768, "grad_norm": 0.06744598597288132, "kl": 0.43973295390605927, "learning_rate": 9.999969212985828e-05, "loss": 0.0002, "reward": 0.7941406667232513, "reward_std": 0.13466475158929825, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.18476562947034836, "step": 49 }, { "completion_length": 683.796875, "epoch": 0.01298448354216711, "grad_norm": 0.046666670590639114, "kl": 0.46533529460430145, "learning_rate": 9.999967764382608e-05, "loss": 0.0002, "reward": 0.6843750476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.484375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 50 }, { "completion_length": 681.6875, "epoch": 0.013244173213010453, "grad_norm": 0.06718295812606812, "kl": 0.39853690564632416, "learning_rate": 9.999966282478309e-05, "loss": 0.0002, "reward": 0.4628906548023224, "reward_std": 0.13324500620365143, "rewards/spct_argmax_reward_func": 0.265625, "rewards/spct_format_reward_func": 0.197265625, "step": 51 }, { "completion_length": 660.125, "epoch": 0.013503862883853794, "grad_norm": 0.09397949278354645, "kl": 1.128791593015194, "learning_rate": 9.999964767272942e-05, "loss": 0.0006, "reward": 0.6824219152331352, "reward_std": 0.1337406411767006, "rewards/spct_argmax_reward_func": 0.484375, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 52 }, { "completion_length": 645.6875, "epoch": 0.013763552554697137, "grad_norm": 0.05956258997321129, "kl": 0.4937710165977478, "learning_rate": 9.999963218766515e-05, "loss": 0.0002, "reward": 0.7781250476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 53 }, { "completion_length": 624.46875, "epoch": 0.014023242225540479, "grad_norm": 0.0825095921754837, "kl": 0.4978819116950035, "learning_rate": 9.999961636959039e-05, "loss": 0.0002, "reward": 0.6968750357627869, "reward_std": 0.1376468911767006, "rewards/spct_argmax_reward_func": 0.515625, "rewards/spct_format_reward_func": 0.18125000223517418, "step": 54 }, { "completion_length": 642.703125, "epoch": 0.014282931896383822, "grad_norm": 0.0834197849035263, "kl": 0.4602478966116905, "learning_rate": 9.999960021850526e-05, "loss": 0.0002, "reward": 0.7468750476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.546875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 55 }, { "completion_length": 616.546875, "epoch": 0.014542621567227163, "grad_norm": 0.23571237921714783, "kl": 3.8128450512886047, "learning_rate": 9.999958373440985e-05, "loss": 0.0019, "reward": 0.7781250476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 56 }, { "completion_length": 625.609375, "epoch": 0.014802311238070506, "grad_norm": 0.07272698730230331, "kl": 0.6580063551664352, "learning_rate": 9.999956691730428e-05, "loss": 0.0003, "reward": 0.5906250402331352, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.390625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 57 }, { "completion_length": 534.0625, "epoch": 0.015062000908913848, "grad_norm": 0.06800635159015656, "kl": 0.927604928612709, "learning_rate": 9.999954976718869e-05, "loss": 0.0005, "reward": 0.5828125402331352, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.40625, "rewards/spct_format_reward_func": 0.17656250298023224, "step": 58 }, { "completion_length": 607.296875, "epoch": 0.01532169057975719, "grad_norm": 0.07292387634515762, "kl": 0.955911174416542, "learning_rate": 9.999953228406313e-05, "loss": 0.0005, "reward": 0.9656250476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 59 }, { "completion_length": 606.546875, "epoch": 0.015581380250600532, "grad_norm": 0.31434112787246704, "kl": 1.991075798869133, "learning_rate": 9.999951446792776e-05, "loss": 0.001, "reward": 0.8195312917232513, "reward_std": 0.17706794664263725, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.19453125074505806, "step": 60 }, { "completion_length": 603.046875, "epoch": 0.015841069921443875, "grad_norm": 0.0987938642501831, "kl": 0.7118281126022339, "learning_rate": 9.999949631878269e-05, "loss": 0.0004, "reward": 1.011328175663948, "reward_std": 0.1995125412940979, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.19882812350988388, "step": 61 }, { "completion_length": 565.15625, "epoch": 0.01610075959228722, "grad_norm": 0.1125568076968193, "kl": 0.7062619924545288, "learning_rate": 9.999947783662804e-05, "loss": 0.0004, "reward": 0.7523438036441803, "reward_std": 0.20840006321668625, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.18984375149011612, "step": 62 }, { "completion_length": 578.1875, "epoch": 0.016360449263130558, "grad_norm": 0.06757227331399918, "kl": 0.9972572475671768, "learning_rate": 9.999945902146393e-05, "loss": 0.0005, "reward": 0.7000000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.5, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 63 }, { "completion_length": 625.640625, "epoch": 0.0166201389339739, "grad_norm": 0.04911191388964653, "kl": 0.6394816637039185, "learning_rate": 9.99994398732905e-05, "loss": 0.0003, "reward": 0.8250000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 64 }, { "completion_length": 623.15625, "epoch": 0.016879828604817244, "grad_norm": 0.09770926833152771, "kl": 0.7039724141359329, "learning_rate": 9.999942039210786e-05, "loss": 0.0004, "reward": 0.8859375417232513, "reward_std": 0.19928624480962753, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19843749701976776, "step": 65 }, { "completion_length": 620.34375, "epoch": 0.017139518275660584, "grad_norm": 0.07709519565105438, "kl": 0.6009730696678162, "learning_rate": 9.999940057791615e-05, "loss": 0.0003, "reward": 0.9656250476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 66 }, { "completion_length": 655.984375, "epoch": 0.017399207946503927, "grad_norm": 0.07232090830802917, "kl": 0.5944145321846008, "learning_rate": 9.99993804307155e-05, "loss": 0.0003, "reward": 0.9128906726837158, "reward_std": 0.1386675052344799, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.19414062425494194, "step": 67 }, { "completion_length": 621.015625, "epoch": 0.01765889761734727, "grad_norm": 0.07345007359981537, "kl": 0.6855271458625793, "learning_rate": 9.999935995050604e-05, "loss": 0.0003, "reward": 0.8875000476837158, "reward_std": 0.17075317353010178, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 68 }, { "completion_length": 644.1875, "epoch": 0.017918587288190613, "grad_norm": 0.32803875207901, "kl": 4.312994807958603, "learning_rate": 9.999933913728789e-05, "loss": 0.0022, "reward": 0.7316406667232513, "reward_std": 0.13736147433519363, "rewards/spct_argmax_reward_func": 0.546875, "rewards/spct_format_reward_func": 0.18476562574505806, "step": 69 }, { "completion_length": 646.5, "epoch": 0.018178276959033953, "grad_norm": 0.06803528964519501, "kl": 0.6523339152336121, "learning_rate": 9.999931799106123e-05, "loss": 0.0003, "reward": 0.7000000402331352, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.5, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 70 }, { "completion_length": 699.25, "epoch": 0.018437966629877296, "grad_norm": 0.06568802148103714, "kl": 0.5699658095836639, "learning_rate": 9.999929651182618e-05, "loss": 0.0003, "reward": 0.9320312887430191, "reward_std": 0.13139689119998366, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.19765625149011612, "step": 71 }, { "completion_length": 666.703125, "epoch": 0.01869765630072064, "grad_norm": 0.08337704092264175, "kl": 0.5956772342324257, "learning_rate": 9.999927469958288e-05, "loss": 0.0003, "reward": 0.6589844152331352, "reward_std": 0.19311563670635223, "rewards/spct_argmax_reward_func": 0.484375, "rewards/spct_format_reward_func": 0.17460937798023224, "step": 72 }, { "completion_length": 745.1875, "epoch": 0.018957345971563982, "grad_norm": 0.14806516468524933, "kl": 1.225516751408577, "learning_rate": 9.999925255433147e-05, "loss": 0.0006, "reward": 0.7398438006639481, "reward_std": 0.2707531750202179, "rewards/spct_argmax_reward_func": 0.546875, "rewards/spct_format_reward_func": 0.19296875223517418, "step": 73 }, { "completion_length": 721.125, "epoch": 0.019217035642407322, "grad_norm": 0.05549729987978935, "kl": 0.5621629655361176, "learning_rate": 9.99992300760721e-05, "loss": 0.0003, "reward": 0.8562500476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 74 }, { "completion_length": 694.953125, "epoch": 0.019476725313250665, "grad_norm": 0.05842204764485359, "kl": 0.5783704817295074, "learning_rate": 9.999920726480493e-05, "loss": 0.0003, "reward": 0.9812500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 75 }, { "completion_length": 702.1875, "epoch": 0.01973641498409401, "grad_norm": 0.07544619590044022, "kl": 0.5850968360900879, "learning_rate": 9.99991841205301e-05, "loss": 0.0003, "reward": 0.7949219197034836, "reward_std": 0.1962406411767006, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.1855468787252903, "step": 76 }, { "completion_length": 778.484375, "epoch": 0.01999610465493735, "grad_norm": 0.10510718077421188, "kl": 0.5399489849805832, "learning_rate": 9.999916064324778e-05, "loss": 0.0003, "reward": 0.7437500357627869, "reward_std": 0.20468298345804214, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.18125000223517418, "step": 77 }, { "completion_length": 728.65625, "epoch": 0.02025579432578069, "grad_norm": 0.07341739535331726, "kl": 0.5296902135014534, "learning_rate": 9.999913683295812e-05, "loss": 0.0003, "reward": 0.8542969226837158, "reward_std": 0.23564088344573975, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 78 }, { "completion_length": 828.375, "epoch": 0.020515483996624034, "grad_norm": 0.06052689254283905, "kl": 0.5109716355800629, "learning_rate": 9.999911268966126e-05, "loss": 0.0003, "reward": 0.8789062947034836, "reward_std": 0.13437500223517418, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.1914062537252903, "step": 79 }, { "completion_length": 815.984375, "epoch": 0.020775173667467377, "grad_norm": 0.06010577455163002, "kl": 0.5450501814484596, "learning_rate": 9.999908821335737e-05, "loss": 0.0003, "reward": 0.8347656652331352, "reward_std": 0.20004075020551682, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.19414062798023224, "step": 80 }, { "completion_length": 808.578125, "epoch": 0.021034863338310717, "grad_norm": 0.06870109587907791, "kl": 0.6045234799385071, "learning_rate": 9.999906340404664e-05, "loss": 0.0003, "reward": 1.0195312947034836, "reward_std": 0.13692002091556787, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.19140625, "step": 81 }, { "completion_length": 766.859375, "epoch": 0.02129455300915406, "grad_norm": 0.076372429728508, "kl": 0.6563413366675377, "learning_rate": 9.999903826172921e-05, "loss": 0.0003, "reward": 0.7753906697034836, "reward_std": 0.14054187387228012, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.1660156287252903, "step": 82 }, { "completion_length": 780.6875, "epoch": 0.021554242679997403, "grad_norm": 0.08063943684101105, "kl": 0.5885810032486916, "learning_rate": 9.999901278640524e-05, "loss": 0.0003, "reward": 0.9230469167232513, "reward_std": 0.20079510658979416, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.18867187574505806, "step": 83 }, { "completion_length": 839.28125, "epoch": 0.021813932350840746, "grad_norm": 0.07393760234117508, "kl": 0.6549200713634491, "learning_rate": 9.999898697807491e-05, "loss": 0.0003, "reward": 0.6890625283122063, "reward_std": 0.2438020557165146, "rewards/spct_argmax_reward_func": 0.515625, "rewards/spct_format_reward_func": 0.17343749850988388, "step": 84 }, { "completion_length": 835.921875, "epoch": 0.022073622021684086, "grad_norm": 0.06917615234851837, "kl": 0.5512703582644463, "learning_rate": 9.99989608367384e-05, "loss": 0.0003, "reward": 0.7882812917232513, "reward_std": 0.21315234154462814, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.19453125074505806, "step": 85 }, { "completion_length": 822.453125, "epoch": 0.02233331169252743, "grad_norm": 0.05478426069021225, "kl": 0.49331602454185486, "learning_rate": 9.999893436239589e-05, "loss": 0.0002, "reward": 0.8562500476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 86 }, { "completion_length": 799.421875, "epoch": 0.022593001363370772, "grad_norm": 0.03933778405189514, "kl": 0.5346524119377136, "learning_rate": 9.999890755504753e-05, "loss": 0.0003, "reward": 0.9160156697034836, "reward_std": 0.10405313968658447, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.197265625, "step": 87 }, { "completion_length": 764.671875, "epoch": 0.022852691034214116, "grad_norm": 0.06561291962862015, "kl": 0.5521984770894051, "learning_rate": 9.99988804146935e-05, "loss": 0.0003, "reward": 0.9242188036441803, "reward_std": 0.13139688968658447, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.18984375149011612, "step": 88 }, { "completion_length": 751.3125, "epoch": 0.023112380705057455, "grad_norm": 0.070639468729496, "kl": 0.5966121256351471, "learning_rate": 9.999885294133402e-05, "loss": 0.0003, "reward": 0.5210937783122063, "reward_std": 0.16445111483335495, "rewards/spct_argmax_reward_func": 0.34375, "rewards/spct_format_reward_func": 0.17734375223517418, "step": 89 }, { "completion_length": 778.78125, "epoch": 0.0233720703759008, "grad_norm": 0.06609543412923813, "kl": 0.5997314751148224, "learning_rate": 9.999882513496924e-05, "loss": 0.0003, "reward": 0.8066406697034836, "reward_std": 0.20747192203998566, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.197265625, "step": 90 }, { "completion_length": 716.796875, "epoch": 0.02363176004674414, "grad_norm": 0.06265345960855484, "kl": 0.6198612451553345, "learning_rate": 9.999879699559933e-05, "loss": 0.0003, "reward": 0.750000037252903, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 91 }, { "completion_length": 785.203125, "epoch": 0.023891449717587485, "grad_norm": 0.07600079476833344, "kl": 0.5755403712391853, "learning_rate": 9.999876852322451e-05, "loss": 0.0003, "reward": 0.6789062917232513, "reward_std": 0.23746994510293007, "rewards/spct_argmax_reward_func": 0.5, "rewards/spct_format_reward_func": 0.17890625074505806, "step": 92 }, { "completion_length": 721.71875, "epoch": 0.024151139388430824, "grad_norm": 0.06846015900373459, "kl": 0.5166611894965172, "learning_rate": 9.999873971784495e-05, "loss": 0.0003, "reward": 0.6292969137430191, "reward_std": 0.10420003236504272, "rewards/spct_argmax_reward_func": 0.453125, "rewards/spct_format_reward_func": 0.17617187649011612, "step": 93 }, { "completion_length": 799.03125, "epoch": 0.024410829059274167, "grad_norm": 12.334141731262207, "kl": 116.75595131516457, "learning_rate": 9.999871057946086e-05, "loss": 0.0584, "reward": 0.8648437857627869, "reward_std": 0.09531249850988388, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.19296875223517418, "step": 94 }, { "completion_length": 802.953125, "epoch": 0.02467051873011751, "grad_norm": 0.06663916260004044, "kl": 0.5010722801089287, "learning_rate": 9.999868110807242e-05, "loss": 0.0003, "reward": 0.7781250476837158, "reward_std": 0.1923343911767006, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 95 }, { "completion_length": 859.78125, "epoch": 0.02493020840096085, "grad_norm": 0.0020451131276786327, "kl": 0.48983538150787354, "learning_rate": 9.999865130367984e-05, "loss": 0.0002, "reward": 0.9500000476837158, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 96 }, { "completion_length": 781.96875, "epoch": 0.025189898071804193, "grad_norm": 0.037614528089761734, "kl": 0.4586917385458946, "learning_rate": 9.999862116628329e-05, "loss": 0.0002, "reward": 0.9500000476837158, "reward_std": 0.07216878235340118, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 97 }, { "completion_length": 735.171875, "epoch": 0.025449587742647536, "grad_norm": 0.1769014298915863, "kl": 1.6458949893712997, "learning_rate": 9.9998590695883e-05, "loss": 0.0008, "reward": 0.7476562857627869, "reward_std": 0.1996573731303215, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.16953125223517418, "step": 98 }, { "completion_length": 869.859375, "epoch": 0.02570927741349088, "grad_norm": 0.05692866072058678, "kl": 0.48514484614133835, "learning_rate": 9.999855989247917e-05, "loss": 0.0002, "reward": 0.8386719226837158, "reward_std": 0.16982503235340118, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 99 }, { "completion_length": 813.171875, "epoch": 0.02596896708433422, "grad_norm": 0.06350459903478622, "kl": 0.513233371078968, "learning_rate": 9.999852875607198e-05, "loss": 0.0003, "reward": 0.7566406726837158, "reward_std": 0.14004624262452126, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.17851562425494194, "step": 100 }, { "completion_length": 922.828125, "epoch": 0.026228656755177562, "grad_norm": 0.06005660444498062, "kl": 0.4595305323600769, "learning_rate": 9.999849728666168e-05, "loss": 0.0002, "reward": 0.8332031816244125, "reward_std": 0.24326254054903984, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.19257812201976776, "step": 101 }, { "completion_length": 822.53125, "epoch": 0.026488346426020905, "grad_norm": 0.05588352307677269, "kl": 0.4793969690799713, "learning_rate": 9.999846548424842e-05, "loss": 0.0002, "reward": 0.6699219197034836, "reward_std": 0.1962406411767006, "rewards/spct_argmax_reward_func": 0.484375, "rewards/spct_format_reward_func": 0.1855468787252903, "step": 102 }, { "completion_length": 860.5, "epoch": 0.02674803609686425, "grad_norm": 0.0716882199048996, "kl": 0.5229940414428711, "learning_rate": 9.999843334883247e-05, "loss": 0.0003, "reward": 0.7578125447034836, "reward_std": 0.25697851181030273, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.1953125037252903, "step": 103 }, { "completion_length": 823.109375, "epoch": 0.027007725767707588, "grad_norm": 0.07309507578611374, "kl": 0.9778441786766052, "learning_rate": 9.999840088041403e-05, "loss": 0.0005, "reward": 0.9121094197034836, "reward_std": 0.2016262635588646, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.193359375, "step": 104 }, { "completion_length": 806.453125, "epoch": 0.02726741543855093, "grad_norm": 0.06641244143247604, "kl": 0.46868716180324554, "learning_rate": 9.999836807899328e-05, "loss": 0.0002, "reward": 0.7523437887430191, "reward_std": 0.19086672738194466, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.18984374776482582, "step": 105 }, { "completion_length": 793.609375, "epoch": 0.027527105109394275, "grad_norm": 0.06516101956367493, "kl": 0.4655431658029556, "learning_rate": 9.999833494457048e-05, "loss": 0.0002, "reward": 0.7937500476837158, "reward_std": 0.1875, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 106 }, { "completion_length": 797.515625, "epoch": 0.027786794780237618, "grad_norm": 0.06605003029108047, "kl": 0.491636760532856, "learning_rate": 9.999830147714583e-05, "loss": 0.0002, "reward": 0.8257812857627869, "reward_std": 0.2265625, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.18515625223517418, "step": 107 }, { "completion_length": 830.96875, "epoch": 0.028046484451080957, "grad_norm": 0.05309414118528366, "kl": 0.4654521942138672, "learning_rate": 9.999826767671956e-05, "loss": 0.0002, "reward": 0.7167969048023224, "reward_std": 0.1310237138532102, "rewards/spct_argmax_reward_func": 0.53125, "rewards/spct_format_reward_func": 0.1855468712747097, "step": 108 }, { "completion_length": 831.453125, "epoch": 0.0283061741219243, "grad_norm": 0.06958900392055511, "kl": 0.4783630520105362, "learning_rate": 9.99982335432919e-05, "loss": 0.0002, "reward": 0.9296875447034836, "reward_std": 0.26026666164398193, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.1953125, "step": 109 }, { "completion_length": 813.59375, "epoch": 0.028565863792767644, "grad_norm": 0.0530204251408577, "kl": 0.4936111569404602, "learning_rate": 9.999819907686308e-05, "loss": 0.0002, "reward": 0.8320312947034836, "reward_std": 0.10641074180603027, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.1914062537252903, "step": 110 }, { "completion_length": 841.078125, "epoch": 0.028825553463610983, "grad_norm": 0.05390322953462601, "kl": 0.4375975504517555, "learning_rate": 9.999816427743329e-05, "loss": 0.0002, "reward": 0.9812500476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 111 }, { "completion_length": 829.09375, "epoch": 0.029085243134454326, "grad_norm": 0.052566710859537125, "kl": 0.5378180891275406, "learning_rate": 9.999812914500281e-05, "loss": 0.0003, "reward": 0.7976562976837158, "reward_std": 0.06823650002479553, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.18828125298023224, "step": 112 }, { "completion_length": 872.625, "epoch": 0.02934493280529767, "grad_norm": 0.059604618698358536, "kl": 0.4090324714779854, "learning_rate": 9.999809367957186e-05, "loss": 0.0002, "reward": 0.6972656697034836, "reward_std": 0.20862450823187828, "rewards/spct_argmax_reward_func": 0.515625, "rewards/spct_format_reward_func": 0.181640625, "step": 113 }, { "completion_length": 798.75, "epoch": 0.029604622476141013, "grad_norm": 0.04580379277467728, "kl": 0.4265107810497284, "learning_rate": 9.999805788114068e-05, "loss": 0.0002, "reward": 0.8386719226837158, "reward_std": 0.1337406411767006, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 114 }, { "completion_length": 827.71875, "epoch": 0.029864312146984352, "grad_norm": 0.052929025143384933, "kl": 0.49919481575489044, "learning_rate": 9.999802174970948e-05, "loss": 0.0002, "reward": 0.7457031607627869, "reward_std": 0.20393935590982437, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.18320312723517418, "step": 115 }, { "completion_length": 801.921875, "epoch": 0.030124001817827695, "grad_norm": 0.06270354241132736, "kl": 0.6243145391345024, "learning_rate": 9.999798528527853e-05, "loss": 0.0003, "reward": 0.7351562827825546, "reward_std": 0.13073650002479553, "rewards/spct_argmax_reward_func": 0.546875, "rewards/spct_format_reward_func": 0.18828125298023224, "step": 116 }, { "completion_length": 786.890625, "epoch": 0.03038369148867104, "grad_norm": 0.04954802617430687, "kl": 0.5118148326873779, "learning_rate": 9.999794848784808e-05, "loss": 0.0003, "reward": 0.8730469197034836, "reward_std": 0.07864543329924345, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.1855468787252903, "step": 117 }, { "completion_length": 790.046875, "epoch": 0.03064338115951438, "grad_norm": 0.08773286640644073, "kl": 1.1027397438883781, "learning_rate": 9.999791135741832e-05, "loss": 0.0006, "reward": 0.7410156577825546, "reward_std": 0.12696610391139984, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.17851562798023224, "step": 118 }, { "completion_length": 767.28125, "epoch": 0.03090307083035772, "grad_norm": 0.06861092150211334, "kl": 0.46169231086969376, "learning_rate": 9.999787389398957e-05, "loss": 0.0002, "reward": 0.8296875655651093, "reward_std": 0.24121256917715073, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.18906249850988388, "step": 119 }, { "completion_length": 871.296875, "epoch": 0.031162760501201064, "grad_norm": 0.08495073765516281, "kl": 0.4576600342988968, "learning_rate": 9.999783609756203e-05, "loss": 0.0002, "reward": 0.6179687827825546, "reward_std": 0.2934305816888809, "rewards/spct_argmax_reward_func": 0.4375, "rewards/spct_format_reward_func": 0.18046874552965164, "step": 120 }, { "completion_length": 821.9375, "epoch": 0.031422450172044404, "grad_norm": 0.05117729306221008, "kl": 0.48578010499477386, "learning_rate": 9.999779796813598e-05, "loss": 0.0002, "reward": 0.9011719226837158, "reward_std": 0.09765625, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 121 }, { "completion_length": 779.84375, "epoch": 0.03168213984288775, "grad_norm": 0.07895545661449432, "kl": 0.48700205236673355, "learning_rate": 9.999775950571166e-05, "loss": 0.0002, "reward": 0.8292969092726707, "reward_std": 0.19311564118834212, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.18867187574505806, "step": 122 }, { "completion_length": 800.9375, "epoch": 0.03194182951373109, "grad_norm": 0.05691077932715416, "kl": 0.48565659672021866, "learning_rate": 9.999772071028934e-05, "loss": 0.0002, "reward": 0.8125000447034836, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 123 }, { "completion_length": 773.15625, "epoch": 0.03220151918457444, "grad_norm": 0.05078653246164322, "kl": 0.4764946550130844, "learning_rate": 9.999768158186924e-05, "loss": 0.0002, "reward": 0.8507812917232513, "reward_std": 0.10489916428923607, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.19453125074505806, "step": 124 }, { "completion_length": 806.0625, "epoch": 0.03246120885541778, "grad_norm": 0.07685442268848419, "kl": 0.48768435418605804, "learning_rate": 9.999764212045166e-05, "loss": 0.0002, "reward": 0.6843750476837158, "reward_std": 0.30058756470680237, "rewards/spct_argmax_reward_func": 0.484375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 125 }, { "completion_length": 838.484375, "epoch": 0.032720898526261116, "grad_norm": 0.06862366199493408, "kl": 0.4330436885356903, "learning_rate": 9.999760232603685e-05, "loss": 0.0002, "reward": 0.7898437976837158, "reward_std": 0.19291600584983826, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.19609375298023224, "step": 126 }, { "completion_length": 755.765625, "epoch": 0.03298058819710446, "grad_norm": 0.1582578867673874, "kl": 1.9720457717776299, "learning_rate": 9.999756219862508e-05, "loss": 0.001, "reward": 0.6726562902331352, "reward_std": 0.12999390065670013, "rewards/spct_argmax_reward_func": 0.5, "rewards/spct_format_reward_func": 0.17265625298023224, "step": 127 }, { "completion_length": 810.484375, "epoch": 0.0332402778679478, "grad_norm": 0.08499227464199066, "kl": 0.4710027873516083, "learning_rate": 9.999752173821662e-05, "loss": 0.0002, "reward": 0.8000000417232513, "reward_std": 0.22160272300243378, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.19062500074505806, "step": 128 }, { "completion_length": 766.265625, "epoch": 0.03349996753879114, "grad_norm": 0.059582918882369995, "kl": 0.476198174059391, "learning_rate": 9.999748094481171e-05, "loss": 0.0002, "reward": 0.7968750447034836, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 129 }, { "completion_length": 753.8125, "epoch": 0.03375965720963449, "grad_norm": 0.07558225840330124, "kl": 0.5354813188314438, "learning_rate": 9.999743981841065e-05, "loss": 0.0003, "reward": 0.8550781607627869, "reward_std": 0.10390625149011612, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.18320312723517418, "step": 130 }, { "completion_length": 762.0625, "epoch": 0.03401934688047783, "grad_norm": 0.05384114012122154, "kl": 0.4653412103652954, "learning_rate": 9.999739835901372e-05, "loss": 0.0002, "reward": 0.6875000447034836, "reward_std": 0.17075317353010178, "rewards/spct_argmax_reward_func": 0.5, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 131 }, { "completion_length": 828.703125, "epoch": 0.03427903655132117, "grad_norm": 0.07505116611719131, "kl": 0.4770998880267143, "learning_rate": 9.999735656662117e-05, "loss": 0.0002, "reward": 0.6667969152331352, "reward_std": 0.23715942353010178, "rewards/spct_argmax_reward_func": 0.46875, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 132 }, { "completion_length": 824.40625, "epoch": 0.034538726222164515, "grad_norm": 0.11012324690818787, "kl": 0.7648125290870667, "learning_rate": 9.99973144412333e-05, "loss": 0.0004, "reward": 0.8105469197034836, "reward_std": 0.1674552522599697, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.1855468787252903, "step": 133 }, { "completion_length": 824.046875, "epoch": 0.034798415893007854, "grad_norm": 0.0697242021560669, "kl": 0.3974757492542267, "learning_rate": 9.999727198285038e-05, "loss": 0.0002, "reward": 0.7156250476837158, "reward_std": 0.22841878235340118, "rewards/spct_argmax_reward_func": 0.515625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 134 }, { "completion_length": 804.421875, "epoch": 0.0350581055638512, "grad_norm": 0.06208229809999466, "kl": 0.43839501589536667, "learning_rate": 9.99972291914727e-05, "loss": 0.0002, "reward": 0.8691406697034836, "reward_std": 0.12890625, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.1816406287252903, "step": 135 }, { "completion_length": 829.625, "epoch": 0.03531779523469454, "grad_norm": 0.05824675038456917, "kl": 0.4204314649105072, "learning_rate": 9.999718606710053e-05, "loss": 0.0002, "reward": 0.8609375357627869, "reward_std": 0.10562849044799805, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.18906250223517418, "step": 136 }, { "completion_length": 834.9375, "epoch": 0.03557748490553788, "grad_norm": 0.06553288549184799, "kl": 0.3812936842441559, "learning_rate": 9.999714260973416e-05, "loss": 0.0002, "reward": 0.8949219286441803, "reward_std": 0.16497663455083966, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.19179688021540642, "step": 137 }, { "completion_length": 818.0625, "epoch": 0.03583717457638123, "grad_norm": 0.051413893699645996, "kl": 0.4477742612361908, "learning_rate": 9.999709881937392e-05, "loss": 0.0002, "reward": 0.808593787252903, "reward_std": 0.13139688968658447, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.19921875, "step": 138 }, { "completion_length": 754.171875, "epoch": 0.03609686424722457, "grad_norm": 0.03902757167816162, "kl": 0.48028700053691864, "learning_rate": 9.999705469602004e-05, "loss": 0.0002, "reward": 0.640625037252903, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.453125, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 139 }, { "completion_length": 793.484375, "epoch": 0.036356553918067906, "grad_norm": 0.07695918530225754, "kl": 0.5122064501047134, "learning_rate": 9.999701023967285e-05, "loss": 0.0003, "reward": 0.7980469167232513, "reward_std": 0.16670003160834312, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.18867187574505806, "step": 140 }, { "completion_length": 791.796875, "epoch": 0.03661624358891125, "grad_norm": 0.06321793049573898, "kl": 0.5492091104388237, "learning_rate": 9.999696545033263e-05, "loss": 0.0003, "reward": 0.7515625283122063, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.18906250223517418, "step": 141 }, { "completion_length": 765.53125, "epoch": 0.03687593325975459, "grad_norm": 0.062444888055324554, "kl": 0.5162855684757233, "learning_rate": 9.99969203279997e-05, "loss": 0.0003, "reward": 0.7281250357627869, "reward_std": 0.07094283076003194, "rewards/spct_argmax_reward_func": 0.546875, "rewards/spct_format_reward_func": 0.18125000223517418, "step": 142 }, { "completion_length": 741.171875, "epoch": 0.03713562293059793, "grad_norm": 0.07769950479269028, "kl": 0.5571035742759705, "learning_rate": 9.999687487267434e-05, "loss": 0.0003, "reward": 0.7964844107627869, "reward_std": 0.20590942353010178, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.18710937723517418, "step": 143 }, { "completion_length": 730.828125, "epoch": 0.03739531260144128, "grad_norm": 0.07088322192430496, "kl": 0.634597659111023, "learning_rate": 9.999682908435686e-05, "loss": 0.0003, "reward": 0.9074219167232513, "reward_std": 0.12578125001164153, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.18867187574505806, "step": 144 }, { "completion_length": 753.265625, "epoch": 0.03765500227228462, "grad_norm": 0.06480380892753601, "kl": 0.5714936703443527, "learning_rate": 9.999678296304757e-05, "loss": 0.0003, "reward": 0.8281250447034836, "reward_std": 0.20200317353010178, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 145 }, { "completion_length": 732.203125, "epoch": 0.037914691943127965, "grad_norm": 0.07378308475017548, "kl": 0.45551902055740356, "learning_rate": 9.999673650874677e-05, "loss": 0.0002, "reward": 0.7968750447034836, "reward_std": 0.22841878235340118, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 146 }, { "completion_length": 747.796875, "epoch": 0.038174381613971305, "grad_norm": 0.054888952523469925, "kl": 0.6304065138101578, "learning_rate": 9.999668972145478e-05, "loss": 0.0003, "reward": 0.8460938036441803, "reward_std": 0.0376468914328143, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.18984375149011612, "step": 147 }, { "completion_length": 769.953125, "epoch": 0.038434071284814644, "grad_norm": 0.05707801133394241, "kl": 0.5184547752141953, "learning_rate": 9.99966426011719e-05, "loss": 0.0003, "reward": 0.9535156786441803, "reward_std": 0.03203125001164153, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.18789063021540642, "step": 148 }, { "completion_length": 774.109375, "epoch": 0.03869376095565799, "grad_norm": 0.07484991103410721, "kl": 0.5598358511924744, "learning_rate": 9.999659514789843e-05, "loss": 0.0003, "reward": 0.8605469167232513, "reward_std": 0.22890625149011612, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.18867187574505806, "step": 149 }, { "completion_length": 785.09375, "epoch": 0.03895345062650133, "grad_norm": 0.054522328078746796, "kl": 0.62260602414608, "learning_rate": 9.999654736163472e-05, "loss": 0.0003, "reward": 0.8562500402331352, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 150 }, { "completion_length": 767.921875, "epoch": 0.03921314029734467, "grad_norm": 0.049699507653713226, "kl": 0.48233623057603836, "learning_rate": 9.999649924238107e-05, "loss": 0.0002, "reward": 0.9375000447034836, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 151 }, { "completion_length": 844.09375, "epoch": 0.03947282996818802, "grad_norm": 0.08226032555103302, "kl": 1.0967635661363602, "learning_rate": 9.99964507901378e-05, "loss": 0.0005, "reward": 0.8054687827825546, "reward_std": 0.2333979830145836, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.18046875298023224, "step": 152 }, { "completion_length": 802.359375, "epoch": 0.039732519639031356, "grad_norm": 0.06306403875350952, "kl": 0.5643189251422882, "learning_rate": 9.999640200490523e-05, "loss": 0.0003, "reward": 0.9921875447034836, "reward_std": 0.17135105282068253, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.1953125, "step": 153 }, { "completion_length": 807.984375, "epoch": 0.0399922093098747, "grad_norm": 0.07694803178310394, "kl": 1.149218663573265, "learning_rate": 9.99963528866837e-05, "loss": 0.0006, "reward": 0.8640625402331352, "reward_std": 0.22841878235340118, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 154 }, { "completion_length": 731.546875, "epoch": 0.04025189898071804, "grad_norm": 0.05586035177111626, "kl": 0.6040978878736496, "learning_rate": 9.999630343547352e-05, "loss": 0.0003, "reward": 0.8277344107627869, "reward_std": 0.13222210109233856, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.18710937723517418, "step": 155 }, { "completion_length": 818.234375, "epoch": 0.04051158865156138, "grad_norm": 0.07095596194267273, "kl": 0.5160077065229416, "learning_rate": 9.999625365127503e-05, "loss": 0.0003, "reward": 1.0082031786441803, "reward_std": 0.2285509556531906, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.19570312649011612, "step": 156 }, { "completion_length": 752.171875, "epoch": 0.04077127832240473, "grad_norm": 0.05628721043467522, "kl": 0.5651844888925552, "learning_rate": 9.999620353408855e-05, "loss": 0.0003, "reward": 0.8218750506639481, "reward_std": 0.13829772174358368, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.19687500223517418, "step": 157 }, { "completion_length": 729.046875, "epoch": 0.04103096799324807, "grad_norm": 0.0756976380944252, "kl": 0.6211116164922714, "learning_rate": 9.999615308391442e-05, "loss": 0.0003, "reward": 0.8875000476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 158 }, { "completion_length": 747.046875, "epoch": 0.04129065766409141, "grad_norm": 0.07503578811883926, "kl": 0.5346315130591393, "learning_rate": 9.999610230075297e-05, "loss": 0.0003, "reward": 0.8406250476837158, "reward_std": 0.21875, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 159 }, { "completion_length": 716.40625, "epoch": 0.041550347334934755, "grad_norm": 0.08018270134925842, "kl": 0.5643793642520905, "learning_rate": 9.999605118460458e-05, "loss": 0.0003, "reward": 0.7000000476837158, "reward_std": 0.2235843911767006, "rewards/spct_argmax_reward_func": 0.5, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 160 }, { "completion_length": 669.5625, "epoch": 0.041810037005778095, "grad_norm": 0.10831758379936218, "kl": 0.619762971997261, "learning_rate": 9.999599973546952e-05, "loss": 0.0003, "reward": 0.6527344137430191, "reward_std": 0.2741077095270157, "rewards/spct_argmax_reward_func": 0.484375, "rewards/spct_format_reward_func": 0.16835937649011612, "step": 161 }, { "completion_length": 723.1875, "epoch": 0.042069726676621434, "grad_norm": 0.0829310193657875, "kl": 0.6290503889322281, "learning_rate": 9.999594795334819e-05, "loss": 0.0003, "reward": 0.9343750476837158, "reward_std": 0.23808756470680237, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 162 }, { "completion_length": 707.59375, "epoch": 0.04232941634746478, "grad_norm": 0.049417562782764435, "kl": 0.6413908153772354, "learning_rate": 9.99958958382409e-05, "loss": 0.0003, "reward": 1.0281250476837158, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 163 }, { "completion_length": 692.828125, "epoch": 0.04258910601830812, "grad_norm": 0.0013543296372517943, "kl": 0.6252989917993546, "learning_rate": 9.999584339014802e-05, "loss": 0.0003, "reward": 0.8875000476837158, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 164 }, { "completion_length": 688.3125, "epoch": 0.04284879568915147, "grad_norm": 0.05510043352842331, "kl": 0.5380802154541016, "learning_rate": 9.999579060906989e-05, "loss": 0.0003, "reward": 0.8406250476837158, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 165 }, { "completion_length": 656.71875, "epoch": 0.04310848535999481, "grad_norm": 0.0787392109632492, "kl": 0.6420268714427948, "learning_rate": 9.999573749500686e-05, "loss": 0.0003, "reward": 0.8593750447034836, "reward_std": 0.2284187749028206, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 166 }, { "completion_length": 680.328125, "epoch": 0.043368175030838146, "grad_norm": 0.10302329063415527, "kl": 0.6900105327367783, "learning_rate": 9.999568404795929e-05, "loss": 0.0003, "reward": 0.5984375476837158, "reward_std": 0.3247203379869461, "rewards/spct_argmax_reward_func": 0.40625, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 167 }, { "completion_length": 624.09375, "epoch": 0.04362786470168149, "grad_norm": 0.0726478323340416, "kl": 0.6970340609550476, "learning_rate": 9.999563026792754e-05, "loss": 0.0003, "reward": 0.8562500476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 168 }, { "completion_length": 652.046875, "epoch": 0.04388755437252483, "grad_norm": 0.0775001272559166, "kl": 0.669752448797226, "learning_rate": 9.999557615491193e-05, "loss": 0.0003, "reward": 0.8082031756639481, "reward_std": 0.22109375149011612, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.19882812723517418, "step": 169 }, { "completion_length": 610.125, "epoch": 0.04414724404336817, "grad_norm": 0.08282443135976791, "kl": 0.8318617641925812, "learning_rate": 9.999552170891287e-05, "loss": 0.0004, "reward": 0.7667969167232513, "reward_std": 0.16670003160834312, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.18867187574505806, "step": 170 }, { "completion_length": 637.3125, "epoch": 0.04440693371421152, "grad_norm": 0.08534768223762512, "kl": 0.8120494782924652, "learning_rate": 9.99954669299307e-05, "loss": 0.0004, "reward": 0.6843750476837158, "reward_std": 0.2548343911767006, "rewards/spct_argmax_reward_func": 0.484375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 171 }, { "completion_length": 637.03125, "epoch": 0.04466662338505486, "grad_norm": 0.11214221268892288, "kl": 0.821268156170845, "learning_rate": 9.999541181796578e-05, "loss": 0.0004, "reward": 0.9625000357627869, "reward_std": 0.26813211292028427, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.19687500223517418, "step": 172 }, { "completion_length": 659.921875, "epoch": 0.0449263130558982, "grad_norm": 0.07311693578958511, "kl": 0.770146444439888, "learning_rate": 9.999535637301849e-05, "loss": 0.0004, "reward": 0.9343750476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 173 }, { "completion_length": 632.234375, "epoch": 0.045186002726741545, "grad_norm": 0.062140703201293945, "kl": 0.597268134355545, "learning_rate": 9.999530059508919e-05, "loss": 0.0003, "reward": 0.9843750447034836, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 174 }, { "completion_length": 645.3125, "epoch": 0.045445692397584884, "grad_norm": 0.11509417742490768, "kl": 2.16264346241951, "learning_rate": 9.999524448417826e-05, "loss": 0.0011, "reward": 0.9093750417232513, "reward_std": 0.19897300004959106, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.19062500074505806, "step": 175 }, { "completion_length": 650.390625, "epoch": 0.04570538206842823, "grad_norm": 0.07535847276449203, "kl": 0.7875328063964844, "learning_rate": 9.999518804028607e-05, "loss": 0.0004, "reward": 0.8406250476837158, "reward_std": 0.23808756470680237, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 176 }, { "completion_length": 626.78125, "epoch": 0.04596507173927157, "grad_norm": 0.09918079525232315, "kl": 0.7493109107017517, "learning_rate": 9.9995131263413e-05, "loss": 0.0004, "reward": 0.8605469167232513, "reward_std": 0.21953124925494194, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.18867187574505806, "step": 177 }, { "completion_length": 674.09375, "epoch": 0.04622476141011491, "grad_norm": 0.07027146220207214, "kl": 0.7096952944993973, "learning_rate": 9.999507415355941e-05, "loss": 0.0004, "reward": 0.9500000476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 178 }, { "completion_length": 637.21875, "epoch": 0.04648445108095826, "grad_norm": 0.07874693721532822, "kl": 0.820588693022728, "learning_rate": 9.99950167107257e-05, "loss": 0.0004, "reward": 0.6449219137430191, "reward_std": 0.12578125298023224, "rewards/spct_argmax_reward_func": 0.46875, "rewards/spct_format_reward_func": 0.17617187649011612, "step": 179 }, { "completion_length": 678.703125, "epoch": 0.0467441407518016, "grad_norm": 0.059943296015262604, "kl": 0.5854462087154388, "learning_rate": 9.999495893491224e-05, "loss": 0.0003, "reward": 0.9062500447034836, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 180 }, { "completion_length": 757.28125, "epoch": 0.047003830422644936, "grad_norm": 0.044522516429424286, "kl": 0.7136822491884232, "learning_rate": 9.999490082611943e-05, "loss": 0.0004, "reward": 1.0125000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 181 }, { "completion_length": 736.0, "epoch": 0.04726352009348828, "grad_norm": 0.08443983644247055, "kl": 0.8271375596523285, "learning_rate": 9.999484238434764e-05, "loss": 0.0004, "reward": 0.8511719256639481, "reward_std": 0.2914317920804024, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.19492187350988388, "step": 182 }, { "completion_length": 773.53125, "epoch": 0.04752320976433162, "grad_norm": 0.06239347532391548, "kl": 0.6253387331962585, "learning_rate": 9.999478360959729e-05, "loss": 0.0003, "reward": 0.7781250476837158, "reward_std": 0.20200317353010178, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 183 }, { "completion_length": 757.640625, "epoch": 0.04778289943517497, "grad_norm": 0.033641938120126724, "kl": 0.7010242938995361, "learning_rate": 9.999472450186871e-05, "loss": 0.0004, "reward": 1.0125000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 184 }, { "completion_length": 735.609375, "epoch": 0.04804258910601831, "grad_norm": 0.051388222724199295, "kl": 0.7176603227853775, "learning_rate": 9.999466506116236e-05, "loss": 0.0004, "reward": 0.7132812812924385, "reward_std": 0.06944398465566337, "rewards/spct_argmax_reward_func": 0.53125, "rewards/spct_format_reward_func": 0.18203125149011612, "step": 185 }, { "completion_length": 773.3125, "epoch": 0.04830227877686165, "grad_norm": 0.06018305942416191, "kl": 0.7411898374557495, "learning_rate": 9.999460528747859e-05, "loss": 0.0004, "reward": 0.8710937947034836, "reward_std": 0.1655949354171753, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.1835937537252903, "step": 186 }, { "completion_length": 775.859375, "epoch": 0.048561968447704995, "grad_norm": 0.05398816242814064, "kl": 0.734389528632164, "learning_rate": 9.999454518081783e-05, "loss": 0.0004, "reward": 0.9656250476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 187 }, { "completion_length": 835.0, "epoch": 0.048821658118548335, "grad_norm": 0.0642082542181015, "kl": 0.8165473192930222, "learning_rate": 9.999448474118047e-05, "loss": 0.0004, "reward": 0.7761719226837158, "reward_std": 0.23232503235340118, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 188 }, { "completion_length": 731.984375, "epoch": 0.049081347789391674, "grad_norm": 0.07979387044906616, "kl": 0.7055237889289856, "learning_rate": 9.999442396856691e-05, "loss": 0.0004, "reward": 0.6937500387430191, "reward_std": 0.19375000894069672, "rewards/spct_argmax_reward_func": 0.53125, "rewards/spct_format_reward_func": 0.16250000149011612, "step": 189 }, { "completion_length": 738.765625, "epoch": 0.04934103746023502, "grad_norm": 0.07185626029968262, "kl": 0.7648421227931976, "learning_rate": 9.999436286297754e-05, "loss": 0.0004, "reward": 0.7851562947034836, "reward_std": 0.1985843926668167, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.17578125186264515, "step": 190 }, { "completion_length": 857.578125, "epoch": 0.04960072713107836, "grad_norm": 0.07123878598213196, "kl": 0.6871702820062637, "learning_rate": 9.999430142441278e-05, "loss": 0.0003, "reward": 0.7742187902331352, "reward_std": 0.2954293191432953, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.19609375298023224, "step": 191 }, { "completion_length": 775.484375, "epoch": 0.0498604168019217, "grad_norm": 0.07873695343732834, "kl": 0.8528772592544556, "learning_rate": 9.999423965287305e-05, "loss": 0.0004, "reward": 0.8507812917232513, "reward_std": 0.16985099017620087, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.17890625447034836, "step": 192 }, { "completion_length": 849.25, "epoch": 0.05012010647276505, "grad_norm": 0.06090798228979111, "kl": 0.6392538845539093, "learning_rate": 9.999417754835875e-05, "loss": 0.0003, "reward": 0.8484375476837158, "reward_std": 0.2385135516524315, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.19218749925494194, "step": 193 }, { "completion_length": 771.375, "epoch": 0.05037979614360839, "grad_norm": 0.07742670178413391, "kl": 0.7051632702350616, "learning_rate": 9.999411511087029e-05, "loss": 0.0004, "reward": 0.7679688036441803, "reward_std": 0.30215006321668625, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.18984375149011612, "step": 194 }, { "completion_length": 802.734375, "epoch": 0.05063948581445173, "grad_norm": 0.04468304291367531, "kl": 0.6244394928216934, "learning_rate": 9.999405234040808e-05, "loss": 0.0003, "reward": 0.7937500476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 195 }, { "completion_length": 801.671875, "epoch": 0.05089917548529507, "grad_norm": 0.05050879716873169, "kl": 0.623403400182724, "learning_rate": 9.999398923697257e-05, "loss": 0.0003, "reward": 1.0437500476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 196 }, { "completion_length": 806.921875, "epoch": 0.05115886515613841, "grad_norm": 0.046774134039878845, "kl": 0.6281615495681763, "learning_rate": 9.999392580056414e-05, "loss": 0.0003, "reward": 0.8250000476837158, "reward_std": 0.14433756470680237, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 197 }, { "completion_length": 702.65625, "epoch": 0.05141855482698176, "grad_norm": 0.05078766122460365, "kl": 0.7823338955640793, "learning_rate": 9.999386203118325e-05, "loss": 0.0004, "reward": 0.8546875268220901, "reward_std": 0.06430422142148018, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.1671875026077032, "step": 198 }, { "completion_length": 789.3125, "epoch": 0.0516782444978251, "grad_norm": 0.07505106180906296, "kl": 0.6861991137266159, "learning_rate": 9.99937979288303e-05, "loss": 0.0003, "reward": 0.9812500476837158, "reward_std": 0.2235843911767006, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 199 }, { "completion_length": 788.5625, "epoch": 0.05193793416866844, "grad_norm": 0.06323422491550446, "kl": 0.692236602306366, "learning_rate": 9.999373349350574e-05, "loss": 0.0003, "reward": 0.8292969167232513, "reward_std": 0.13061564043164253, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.18867187574505806, "step": 200 }, { "completion_length": 744.765625, "epoch": 0.052197623839511785, "grad_norm": 0.10452952235937119, "kl": 1.0194439738988876, "learning_rate": 9.999366872520996e-05, "loss": 0.0005, "reward": 0.8339844197034836, "reward_std": 0.09841574728488922, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.193359375, "step": 201 }, { "completion_length": 754.765625, "epoch": 0.052457313510355125, "grad_norm": 0.03528793156147003, "kl": 0.5490115284919739, "learning_rate": 9.999360362394341e-05, "loss": 0.0003, "reward": 0.7937500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 202 }, { "completion_length": 770.3125, "epoch": 0.05271700318119847, "grad_norm": 0.048530541360378265, "kl": 0.6200075447559357, "learning_rate": 9.999353818970653e-05, "loss": 0.0003, "reward": 0.9031250476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 203 }, { "completion_length": 713.140625, "epoch": 0.05297669285204181, "grad_norm": 0.06719918549060822, "kl": 0.7403847575187683, "learning_rate": 9.999347242249976e-05, "loss": 0.0004, "reward": 0.9539062976837158, "reward_std": 0.13073650002479553, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.18828125298023224, "step": 204 }, { "completion_length": 748.140625, "epoch": 0.05323638252288515, "grad_norm": 0.3171277344226837, "kl": 1.5397901982069016, "learning_rate": 9.999340632232354e-05, "loss": 0.0008, "reward": 0.9656250476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 205 }, { "completion_length": 739.515625, "epoch": 0.0534960721937285, "grad_norm": 0.0712236613035202, "kl": 0.610686868429184, "learning_rate": 9.999333988917829e-05, "loss": 0.0003, "reward": 0.9792969226837158, "reward_std": 0.1649906411767006, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 206 }, { "completion_length": 742.609375, "epoch": 0.05375576186457184, "grad_norm": 0.07194830477237701, "kl": 0.5228795260190964, "learning_rate": 9.999327312306448e-05, "loss": 0.0003, "reward": 0.8093750476837158, "reward_std": 0.23808756470680237, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 207 }, { "completion_length": 724.921875, "epoch": 0.054015451535415177, "grad_norm": 0.061135295778512955, "kl": 0.5775347054004669, "learning_rate": 9.999320602398252e-05, "loss": 0.0003, "reward": 0.9925781786441803, "reward_std": 0.1708853468298912, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.19570313021540642, "step": 208 }, { "completion_length": 711.890625, "epoch": 0.05427514120625852, "grad_norm": 0.056350208818912506, "kl": 0.5937463343143463, "learning_rate": 9.99931385919329e-05, "loss": 0.0003, "reward": 1.0242187976837158, "reward_std": 0.1015625, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.19609375298023224, "step": 209 }, { "completion_length": 700.765625, "epoch": 0.05453483087710186, "grad_norm": 0.08826225996017456, "kl": 0.5899027734994888, "learning_rate": 9.999307082691602e-05, "loss": 0.0003, "reward": 0.9085938036441803, "reward_std": 0.27090006321668625, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.18984375149011612, "step": 210 }, { "completion_length": 681.296875, "epoch": 0.0547945205479452, "grad_norm": 0.06515847891569138, "kl": 0.6966002583503723, "learning_rate": 9.999300272893234e-05, "loss": 0.0003, "reward": 0.8386719226837158, "reward_std": 0.1962406411767006, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 211 }, { "completion_length": 692.640625, "epoch": 0.05505421021878855, "grad_norm": 0.07763176411390305, "kl": 0.7265910282731056, "learning_rate": 9.999293429798236e-05, "loss": 0.0004, "reward": 0.9179687947034836, "reward_std": 0.162646890967153, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.19921875, "step": 212 }, { "completion_length": 674.234375, "epoch": 0.05531389988963189, "grad_norm": 0.04343431070446968, "kl": 0.6917009800672531, "learning_rate": 9.99928655340665e-05, "loss": 0.0003, "reward": 0.8230469226837158, "reward_std": 0.06640625, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 213 }, { "completion_length": 670.03125, "epoch": 0.055573589560475235, "grad_norm": 0.055666789412498474, "kl": 0.5685510858893394, "learning_rate": 9.999279643718522e-05, "loss": 0.0003, "reward": 0.8093750402331352, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 214 }, { "completion_length": 628.828125, "epoch": 0.055833279231318575, "grad_norm": 0.08412761986255646, "kl": 1.1603120863437653, "learning_rate": 9.999272700733899e-05, "loss": 0.0006, "reward": 0.9812500476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 215 }, { "completion_length": 638.90625, "epoch": 0.056092968902161915, "grad_norm": 0.07507108896970749, "kl": 0.6869435012340546, "learning_rate": 9.999265724452827e-05, "loss": 0.0003, "reward": 0.8718750476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 216 }, { "completion_length": 605.015625, "epoch": 0.05635265857300526, "grad_norm": 0.09483271092176437, "kl": 0.6731300801038742, "learning_rate": 9.99925871487535e-05, "loss": 0.0003, "reward": 0.8902344107627869, "reward_std": 0.2295202650129795, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.18710937723517418, "step": 217 }, { "completion_length": 632.1875, "epoch": 0.0566123482438486, "grad_norm": 0.06050921976566315, "kl": 0.8271876126527786, "learning_rate": 9.999251672001518e-05, "loss": 0.0004, "reward": 0.7625000476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 218 }, { "completion_length": 598.734375, "epoch": 0.05687203791469194, "grad_norm": 0.08328023552894592, "kl": 0.7479172497987747, "learning_rate": 9.999244595831376e-05, "loss": 0.0004, "reward": 0.9949219226837158, "reward_std": 0.1962406411767006, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 219 }, { "completion_length": 619.890625, "epoch": 0.05713172758553529, "grad_norm": 0.04028952866792679, "kl": 0.7615413144230843, "learning_rate": 9.999237486364973e-05, "loss": 0.0004, "reward": 0.8250000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 220 }, { "completion_length": 612.3125, "epoch": 0.05739141725637863, "grad_norm": 0.08469294011592865, "kl": 0.6297488436102867, "learning_rate": 9.999230343602356e-05, "loss": 0.0003, "reward": 0.8804687857627869, "reward_std": 0.16264689108356833, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19296875223517418, "step": 221 }, { "completion_length": 516.765625, "epoch": 0.057651106927221966, "grad_norm": 0.06074606999754906, "kl": 0.5207448601722717, "learning_rate": 9.999223167543569e-05, "loss": 0.0003, "reward": 0.8156250417232513, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.17500000447034836, "step": 222 }, { "completion_length": 567.125, "epoch": 0.05791079659806531, "grad_norm": 0.05657973140478134, "kl": 0.7307711839675903, "learning_rate": 9.999215958188663e-05, "loss": 0.0004, "reward": 0.9968750476837158, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 223 }, { "completion_length": 540.1875, "epoch": 0.05817048626890865, "grad_norm": 0.08788660168647766, "kl": 0.625450111925602, "learning_rate": 9.999208715537687e-05, "loss": 0.0003, "reward": 0.8019531667232513, "reward_std": 0.16305049508810043, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.17695312947034836, "step": 224 }, { "completion_length": 584.609375, "epoch": 0.058430175939752, "grad_norm": 0.10170312225818634, "kl": 1.387826532125473, "learning_rate": 9.999201439590686e-05, "loss": 0.0007, "reward": 0.8347656726837158, "reward_std": 0.24199381470680237, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.19414062798023224, "step": 225 }, { "completion_length": 523.75, "epoch": 0.05868986561059534, "grad_norm": 0.06861460953950882, "kl": 0.7151782959699631, "learning_rate": 9.999194130347711e-05, "loss": 0.0004, "reward": 0.8750000447034836, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 226 }, { "completion_length": 542.265625, "epoch": 0.05894955528143868, "grad_norm": 0.13326475024223328, "kl": 1.3816943764686584, "learning_rate": 9.99918678780881e-05, "loss": 0.0007, "reward": 0.7945312932133675, "reward_std": 0.14054113533347845, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.16953125223517418, "step": 227 }, { "completion_length": 577.515625, "epoch": 0.059209244952282025, "grad_norm": 0.06351920962333679, "kl": 0.8343920409679413, "learning_rate": 9.999179411974033e-05, "loss": 0.0004, "reward": 1.0750000476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 228 }, { "completion_length": 574.640625, "epoch": 0.059468934623125365, "grad_norm": 0.0849146693944931, "kl": 0.6931785643100739, "learning_rate": 9.999172002843427e-05, "loss": 0.0003, "reward": 0.9718750417232513, "reward_std": 0.13647300377488136, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.19062500074505806, "step": 229 }, { "completion_length": 575.28125, "epoch": 0.059728624293968705, "grad_norm": 0.07708153873682022, "kl": 0.8660031259059906, "learning_rate": 9.999164560417041e-05, "loss": 0.0004, "reward": 0.8406250476837158, "reward_std": 0.20200317353010178, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 230 }, { "completion_length": 555.25, "epoch": 0.05998831396481205, "grad_norm": 0.06900390982627869, "kl": 0.7295371741056442, "learning_rate": 9.999157084694927e-05, "loss": 0.0004, "reward": 0.8093750476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 231 }, { "completion_length": 559.953125, "epoch": 0.06024800363565539, "grad_norm": 0.058923959732055664, "kl": 0.6173869520425797, "learning_rate": 9.999149575677134e-05, "loss": 0.0003, "reward": 0.9031250476837158, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 232 }, { "completion_length": 555.34375, "epoch": 0.06050769330649874, "grad_norm": 0.07322978228330612, "kl": 0.715677946805954, "learning_rate": 9.99914203336371e-05, "loss": 0.0004, "reward": 0.8468750417232513, "reward_std": 0.12720970809459686, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.19062500447034836, "step": 233 }, { "completion_length": 532.65625, "epoch": 0.06076738297734208, "grad_norm": 0.08090388029813766, "kl": 0.891457200050354, "learning_rate": 9.999134457754709e-05, "loss": 0.0004, "reward": 0.8792969137430191, "reward_std": 0.10420003236504272, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.17617187649011612, "step": 234 }, { "completion_length": 576.484375, "epoch": 0.06102707264818542, "grad_norm": 0.07297258824110031, "kl": 0.5994470119476318, "learning_rate": 9.999126848850179e-05, "loss": 0.0003, "reward": 0.8250000402331352, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 235 }, { "completion_length": 548.328125, "epoch": 0.06128676231902876, "grad_norm": 0.07117819041013718, "kl": 0.6564696580171585, "learning_rate": 9.99911920665017e-05, "loss": 0.0003, "reward": 0.8875000476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 236 }, { "completion_length": 587.125, "epoch": 0.0615464519898721, "grad_norm": 0.09170150756835938, "kl": 0.8012657314538956, "learning_rate": 9.999111531154735e-05, "loss": 0.0004, "reward": 0.8496094197034836, "reward_std": 0.17521065846085548, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.193359375, "step": 237 }, { "completion_length": 613.8125, "epoch": 0.06180614166071544, "grad_norm": 0.054196301847696304, "kl": 0.6530024856328964, "learning_rate": 9.999103822363922e-05, "loss": 0.0003, "reward": 0.7937500476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 238 }, { "completion_length": 597.203125, "epoch": 0.06206583133155879, "grad_norm": 0.0513160340487957, "kl": 0.6492504477500916, "learning_rate": 9.999096080277787e-05, "loss": 0.0003, "reward": 0.8093750402331352, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 239 }, { "completion_length": 598.203125, "epoch": 0.06232552100240213, "grad_norm": 0.14269216358661652, "kl": 2.3193798065185547, "learning_rate": 9.999088304896378e-05, "loss": 0.0012, "reward": 0.9968750476837158, "reward_std": 0.22841878235340118, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 240 }, { "completion_length": 604.15625, "epoch": 0.06258521067324548, "grad_norm": 0.029394878074526787, "kl": 0.8830621093511581, "learning_rate": 9.999080496219745e-05, "loss": 0.0004, "reward": 0.9656250476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 241 }, { "completion_length": 579.53125, "epoch": 0.06284490034408881, "grad_norm": 0.0667211040854454, "kl": 0.7122501134872437, "learning_rate": 9.999072654247947e-05, "loss": 0.0004, "reward": 0.9218750447034836, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 242 }, { "completion_length": 645.703125, "epoch": 0.06310459001493215, "grad_norm": 1.3710675239562988, "kl": 11.693931594491005, "learning_rate": 9.99906477898103e-05, "loss": 0.0058, "reward": 0.6289062947034836, "reward_std": 0.16705163568258286, "rewards/spct_argmax_reward_func": 0.4375, "rewards/spct_format_reward_func": 0.19140625, "step": 243 }, { "completion_length": 646.796875, "epoch": 0.0633642796857755, "grad_norm": 0.04550695791840553, "kl": 0.6220099627971649, "learning_rate": 9.999056870419048e-05, "loss": 0.0003, "reward": 0.9968750476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 244 }, { "completion_length": 664.609375, "epoch": 0.06362396935661883, "grad_norm": 0.0969640240073204, "kl": 0.5946842432022095, "learning_rate": 9.999048928562055e-05, "loss": 0.0003, "reward": 0.8250000476837158, "reward_std": 0.33183756470680237, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 245 }, { "completion_length": 638.6875, "epoch": 0.06388365902746218, "grad_norm": 0.053894419223070145, "kl": 0.7270700633525848, "learning_rate": 9.999040953410103e-05, "loss": 0.0004, "reward": 0.8718750402331352, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 246 }, { "completion_length": 677.125, "epoch": 0.06414334869830553, "grad_norm": 0.05479460209608078, "kl": 0.6818416267633438, "learning_rate": 9.999032944963245e-05, "loss": 0.0003, "reward": 0.9656250476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 247 }, { "completion_length": 660.375, "epoch": 0.06440303836914887, "grad_norm": 0.6768082976341248, "kl": 8.242286294698715, "learning_rate": 9.999024903221534e-05, "loss": 0.0041, "reward": 0.8875000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 248 }, { "completion_length": 698.28125, "epoch": 0.0646627280399922, "grad_norm": 0.04661981016397476, "kl": 0.7839076668024063, "learning_rate": 9.999016828185024e-05, "loss": 0.0004, "reward": 0.9187500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 249 }, { "completion_length": 657.21875, "epoch": 0.06492241771083555, "grad_norm": 0.08640936017036438, "kl": 0.6810780316591263, "learning_rate": 9.999008719853768e-05, "loss": 0.0003, "reward": 0.8914062976837158, "reward_std": 0.2389896735548973, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.18828125298023224, "step": 250 }, { "completion_length": 678.046875, "epoch": 0.0651821073816789, "grad_norm": 0.04516544193029404, "kl": 0.678842157125473, "learning_rate": 9.999000578227822e-05, "loss": 0.0003, "reward": 0.9812500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 251 }, { "completion_length": 721.421875, "epoch": 0.06544179705252223, "grad_norm": 0.0802932158112526, "kl": 0.691382110118866, "learning_rate": 9.99899240330724e-05, "loss": 0.0003, "reward": 0.7992188036441803, "reward_std": 0.2563968896865845, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.18984375149011612, "step": 252 }, { "completion_length": 697.0625, "epoch": 0.06570148672336558, "grad_norm": 0.04571527987718582, "kl": 0.6991862654685974, "learning_rate": 9.998984195092072e-05, "loss": 0.0003, "reward": 1.0425781607627869, "reward_std": 0.06484375149011612, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.19882812723517418, "step": 253 }, { "completion_length": 727.53125, "epoch": 0.06596117639420893, "grad_norm": 0.058586254715919495, "kl": 0.6111409813165665, "learning_rate": 9.99897595358238e-05, "loss": 0.0003, "reward": 0.8250000476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 254 }, { "completion_length": 752.3125, "epoch": 0.06622086606505226, "grad_norm": 0.09486015141010284, "kl": 0.8274885416030884, "learning_rate": 9.998967678778213e-05, "loss": 0.0004, "reward": 0.9656250476837158, "reward_std": 0.3534187823534012, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 255 }, { "completion_length": 736.171875, "epoch": 0.0664805557358956, "grad_norm": 0.07431607693433762, "kl": 0.7171322405338287, "learning_rate": 9.998959370679629e-05, "loss": 0.0004, "reward": 0.8875000476837158, "reward_std": 0.26933756470680237, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 256 }, { "completion_length": 704.203125, "epoch": 0.06674024540673895, "grad_norm": 0.05953443795442581, "kl": 0.77336585521698, "learning_rate": 9.998951029286681e-05, "loss": 0.0004, "reward": 0.703125037252903, "reward_std": 0.20200317353010178, "rewards/spct_argmax_reward_func": 0.515625, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 257 }, { "completion_length": 712.890625, "epoch": 0.06699993507758228, "grad_norm": 0.07427231222391129, "kl": 0.7629324048757553, "learning_rate": 9.998942654599425e-05, "loss": 0.0004, "reward": 0.9406250417232513, "reward_std": 0.22538860887289047, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.19062500074505806, "step": 258 }, { "completion_length": 712.171875, "epoch": 0.06725962474842563, "grad_norm": 0.06078747287392616, "kl": 0.7185111045837402, "learning_rate": 9.998934246617922e-05, "loss": 0.0004, "reward": 0.8593750447034836, "reward_std": 0.22841878235340118, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 259 }, { "completion_length": 719.265625, "epoch": 0.06751931441926898, "grad_norm": 0.07487153261899948, "kl": 0.8091249316930771, "learning_rate": 9.998925805342221e-05, "loss": 0.0004, "reward": 0.7824219167232513, "reward_std": 0.22436564043164253, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.18867187574505806, "step": 260 }, { "completion_length": 734.6875, "epoch": 0.06777900409011231, "grad_norm": 0.07097689062356949, "kl": 0.838052049279213, "learning_rate": 9.998917330772381e-05, "loss": 0.0004, "reward": 0.8835937976837158, "reward_std": 0.2114282213151455, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19609375298023224, "step": 261 }, { "completion_length": 717.203125, "epoch": 0.06803869376095566, "grad_norm": 0.05735490098595619, "kl": 0.8134017288684845, "learning_rate": 9.99890882290846e-05, "loss": 0.0004, "reward": 1.0906250476837158, "reward_std": 0.15625, "rewards/spct_argmax_reward_func": 0.890625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 262 }, { "completion_length": 772.875, "epoch": 0.068298383431799, "grad_norm": 0.06029437109827995, "kl": 0.7808074802160263, "learning_rate": 9.998900281750511e-05, "loss": 0.0004, "reward": 1.1062500476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.90625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 263 }, { "completion_length": 741.828125, "epoch": 0.06855807310264234, "grad_norm": 0.07436814159154892, "kl": 0.7093585878610611, "learning_rate": 9.998891707298594e-05, "loss": 0.0004, "reward": 0.8625000417232513, "reward_std": 0.22055421769618988, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.19062500074505806, "step": 264 }, { "completion_length": 726.1875, "epoch": 0.06881776277348568, "grad_norm": 0.05976288020610809, "kl": 0.8769553005695343, "learning_rate": 9.998883099552764e-05, "loss": 0.0004, "reward": 0.8816406726837158, "reward_std": 0.1649906411767006, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19414062798023224, "step": 265 }, { "completion_length": 746.875, "epoch": 0.06907745244432903, "grad_norm": 1063.989013671875, "kl": 1880.0750214755535, "learning_rate": 9.998874458513081e-05, "loss": 0.94, "reward": 0.9902344197034836, "reward_std": 0.223207488656044, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.1933593787252903, "step": 266 }, { "completion_length": 728.484375, "epoch": 0.06933714211517236, "grad_norm": 0.07252563536167145, "kl": 0.8970058262348175, "learning_rate": 9.9988657841796e-05, "loss": 0.0004, "reward": 0.9085938036441803, "reward_std": 0.12656249850988388, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.18984375149011612, "step": 267 }, { "completion_length": 723.40625, "epoch": 0.06959683178601571, "grad_norm": 0.07323230803012848, "kl": 0.7718185484409332, "learning_rate": 9.998857076552381e-05, "loss": 0.0004, "reward": 0.730468787252903, "reward_std": 0.2876468896865845, "rewards/spct_argmax_reward_func": 0.53125, "rewards/spct_format_reward_func": 0.1992187537252903, "step": 268 }, { "completion_length": 730.421875, "epoch": 0.06985652145685906, "grad_norm": 0.06795718520879745, "kl": 0.7839552313089371, "learning_rate": 9.99884833563148e-05, "loss": 0.0004, "reward": 0.7753906697034836, "reward_std": 0.1649906411767006, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.1816406287252903, "step": 269 }, { "completion_length": 753.6875, "epoch": 0.0701162111277024, "grad_norm": 0.058331314474344254, "kl": 0.775619238615036, "learning_rate": 9.998839561416958e-05, "loss": 0.0004, "reward": 0.9343750476837158, "reward_std": 0.1923343911767006, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 270 }, { "completion_length": 708.1875, "epoch": 0.07037590079854573, "grad_norm": 0.0793134868144989, "kl": 0.8009070008993149, "learning_rate": 9.998830753908868e-05, "loss": 0.0004, "reward": 0.8542969226837158, "reward_std": 0.30780966579914093, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 271 }, { "completion_length": 713.25, "epoch": 0.07063559046938908, "grad_norm": 0.0927378311753273, "kl": 1.4378084391355515, "learning_rate": 9.998821913107275e-05, "loss": 0.0007, "reward": 0.7859375476837158, "reward_std": 0.1733047254383564, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.19218749925494194, "step": 272 }, { "completion_length": 701.5625, "epoch": 0.07089528014023243, "grad_norm": 0.06706028431653976, "kl": 0.7151275128126144, "learning_rate": 9.998813039012233e-05, "loss": 0.0004, "reward": 0.9277344197034836, "reward_std": 0.28423137217760086, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.1933593787252903, "step": 273 }, { "completion_length": 712.640625, "epoch": 0.07115496981107576, "grad_norm": 0.07530666887760162, "kl": 1.2895262092351913, "learning_rate": 9.998804131623803e-05, "loss": 0.0006, "reward": 0.8164062947034836, "reward_std": 0.23624513298273087, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.1914062537252903, "step": 274 }, { "completion_length": 724.71875, "epoch": 0.07141465948191911, "grad_norm": 0.054714422672986984, "kl": 0.6786859929561615, "learning_rate": 9.998795190942045e-05, "loss": 0.0003, "reward": 1.0437500476837158, "reward_std": 0.1875, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 275 }, { "completion_length": 692.328125, "epoch": 0.07167434915276245, "grad_norm": 0.07061392068862915, "kl": 0.7730856537818909, "learning_rate": 9.998786216967017e-05, "loss": 0.0004, "reward": 0.6980469152331352, "reward_std": 0.2996594160795212, "rewards/spct_argmax_reward_func": 0.5, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 276 }, { "completion_length": 699.921875, "epoch": 0.07193403882360579, "grad_norm": 0.07400109618902206, "kl": 0.8191356509923935, "learning_rate": 9.998777209698782e-05, "loss": 0.0004, "reward": 0.9277344197034836, "reward_std": 0.26933353394269943, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.1933593787252903, "step": 277 }, { "completion_length": 721.21875, "epoch": 0.07219372849444913, "grad_norm": 0.06846731156110764, "kl": 0.6736878603696823, "learning_rate": 9.998768169137395e-05, "loss": 0.0003, "reward": 0.9324219226837158, "reward_std": 0.30449381470680237, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 278 }, { "completion_length": 675.875, "epoch": 0.07245341816529248, "grad_norm": 0.0636291429400444, "kl": 0.8193652927875519, "learning_rate": 9.998759095282921e-05, "loss": 0.0004, "reward": 0.7566406577825546, "reward_std": 0.17319174855947495, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.17851562425494194, "step": 279 }, { "completion_length": 721.0625, "epoch": 0.07271310783613581, "grad_norm": 0.05009232088923454, "kl": 0.6896322518587112, "learning_rate": 9.998749988135418e-05, "loss": 0.0003, "reward": 0.9656250476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 280 }, { "completion_length": 688.234375, "epoch": 0.07297279750697916, "grad_norm": 26840.064453125, "kl": 161757.89654810727, "learning_rate": 9.998740847694948e-05, "loss": 80.879, "reward": 0.8316406756639481, "reward_std": 0.16655313968658447, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.17539062723517418, "step": 281 }, { "completion_length": 744.6875, "epoch": 0.0732324871778225, "grad_norm": 0.04314430430531502, "kl": 0.7204617410898209, "learning_rate": 9.998731673961567e-05, "loss": 0.0004, "reward": 0.7625000402331352, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 282 }, { "completion_length": 754.890625, "epoch": 0.07349217684866584, "grad_norm": 0.04272119328379631, "kl": 0.6794836819171906, "learning_rate": 9.998722466935344e-05, "loss": 0.0003, "reward": 0.7937500476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 283 }, { "completion_length": 723.140625, "epoch": 0.07375186651950918, "grad_norm": 0.0621221587061882, "kl": 0.7606306225061417, "learning_rate": 9.998713226616334e-05, "loss": 0.0004, "reward": 0.8527344092726707, "reward_std": 0.1333629940636456, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.1808593776077032, "step": 284 }, { "completion_length": 683.390625, "epoch": 0.07401155619035253, "grad_norm": 0.07118014991283417, "kl": 0.7309273183345795, "learning_rate": 9.998703953004603e-05, "loss": 0.0004, "reward": 0.8132812976837158, "reward_std": 0.22448650101432577, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.18828125298023224, "step": 285 }, { "completion_length": 657.15625, "epoch": 0.07427124586119586, "grad_norm": 0.05590268224477768, "kl": 0.7915087938308716, "learning_rate": 9.998694646100209e-05, "loss": 0.0004, "reward": 1.0097656846046448, "reward_std": 0.20263753831386566, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.197265625, "step": 286 }, { "completion_length": 696.171875, "epoch": 0.07453093553203921, "grad_norm": 0.058315739035606384, "kl": 0.7787951231002808, "learning_rate": 9.998685305903215e-05, "loss": 0.0004, "reward": 0.8554687947034836, "reward_std": 0.26022375375032425, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.19921875, "step": 287 }, { "completion_length": 678.40625, "epoch": 0.07479062520288256, "grad_norm": 0.06093481555581093, "kl": 0.8292829543352127, "learning_rate": 9.998675932413685e-05, "loss": 0.0004, "reward": 0.7468750402331352, "reward_std": 0.2284187749028206, "rewards/spct_argmax_reward_func": 0.546875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 288 }, { "completion_length": 660.515625, "epoch": 0.0750503148737259, "grad_norm": 0.05056183785200119, "kl": 0.7940274327993393, "learning_rate": 9.99866652563168e-05, "loss": 0.0004, "reward": 0.8406250476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 289 }, { "completion_length": 677.40625, "epoch": 0.07531000454456924, "grad_norm": 0.08312366902828217, "kl": 0.8802220970392227, "learning_rate": 9.998657085557263e-05, "loss": 0.0004, "reward": 0.6843750402331352, "reward_std": 0.36308755725622177, "rewards/spct_argmax_reward_func": 0.484375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 290 }, { "completion_length": 542.0, "epoch": 0.07556969421541258, "grad_norm": 0.06933753937482834, "kl": 0.9232318997383118, "learning_rate": 9.998647612190496e-05, "loss": 0.0005, "reward": 0.8425781577825546, "reward_std": 0.199116492876783, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.17070312425494194, "step": 291 }, { "completion_length": 612.5625, "epoch": 0.07582938388625593, "grad_norm": 0.059684038162231445, "kl": 0.8585595041513443, "learning_rate": 9.998638105531443e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 292 }, { "completion_length": 612.875, "epoch": 0.07608907355709926, "grad_norm": 0.06772821396589279, "kl": 0.8634567856788635, "learning_rate": 9.998628565580168e-05, "loss": 0.0004, "reward": 0.9332031607627869, "reward_std": 0.2932625263929367, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.19882812723517418, "step": 293 }, { "completion_length": 629.03125, "epoch": 0.07634876322794261, "grad_norm": 0.04669720679521561, "kl": 0.8950015902519226, "learning_rate": 9.998618992336733e-05, "loss": 0.0004, "reward": 0.8562500476837158, "reward_std": 0.18042195588350296, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 294 }, { "completion_length": 632.875, "epoch": 0.07660845289878596, "grad_norm": 0.07149702310562134, "kl": 0.9143030643463135, "learning_rate": 9.998609385801203e-05, "loss": 0.0005, "reward": 0.8093750476837158, "reward_std": 0.33667195588350296, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 295 }, { "completion_length": 566.03125, "epoch": 0.07686814256962929, "grad_norm": 0.0837269127368927, "kl": 0.9528571218252182, "learning_rate": 9.998599745973642e-05, "loss": 0.0005, "reward": 0.9363281726837158, "reward_std": 0.23806153237819672, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.18632812798023224, "step": 296 }, { "completion_length": 599.640625, "epoch": 0.07712783224047264, "grad_norm": 0.08985797315835953, "kl": 0.8520537465810776, "learning_rate": 9.998590072854112e-05, "loss": 0.0004, "reward": 0.8250000476837158, "reward_std": 0.40400633960962296, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 297 }, { "completion_length": 629.203125, "epoch": 0.07738752191131598, "grad_norm": 0.08109050989151001, "kl": 0.9534541815519333, "learning_rate": 9.998580366442681e-05, "loss": 0.0005, "reward": 0.8003906607627869, "reward_std": 0.3293469250202179, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.19101562723517418, "step": 298 }, { "completion_length": 594.5625, "epoch": 0.07764721158215931, "grad_norm": 0.07074429839849472, "kl": 1.0628394335508347, "learning_rate": 9.99857062673941e-05, "loss": 0.0005, "reward": 0.9218750447034836, "reward_std": 0.2645031735301018, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 299 }, { "completion_length": 696.796875, "epoch": 0.07790690125300266, "grad_norm": 0.0637141540646553, "kl": 0.8373609036207199, "learning_rate": 9.998560853744365e-05, "loss": 0.0004, "reward": 0.7343750447034836, "reward_std": 0.2909187823534012, "rewards/spct_argmax_reward_func": 0.546875, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 300 }, { "completion_length": 616.171875, "epoch": 0.07816659092384601, "grad_norm": 0.0761275514960289, "kl": 0.9347843676805496, "learning_rate": 9.998551047457614e-05, "loss": 0.0005, "reward": 0.7625000476837158, "reward_std": 0.3582531660795212, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 301 }, { "completion_length": 637.640625, "epoch": 0.07842628059468934, "grad_norm": 0.06598355621099472, "kl": 0.8820950239896774, "learning_rate": 9.998541207879219e-05, "loss": 0.0004, "reward": 0.9968750476837158, "reward_std": 0.27417195588350296, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 302 }, { "completion_length": 628.453125, "epoch": 0.07868597026553269, "grad_norm": 0.07474572956562042, "kl": 0.9306145310401917, "learning_rate": 9.998531335009247e-05, "loss": 0.0005, "reward": 0.8605469167232513, "reward_std": 0.2404313161969185, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.18867187947034836, "step": 303 }, { "completion_length": 655.5625, "epoch": 0.07894565993637603, "grad_norm": 0.06956198811531067, "kl": 0.9186146557331085, "learning_rate": 9.998521428847763e-05, "loss": 0.0005, "reward": 0.7468750476837158, "reward_std": 0.33667195588350296, "rewards/spct_argmax_reward_func": 0.546875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 304 }, { "completion_length": 668.421875, "epoch": 0.07920534960721937, "grad_norm": 0.1058790534734726, "kl": 1.5120913535356522, "learning_rate": 9.998511489394832e-05, "loss": 0.0008, "reward": 0.6886719167232513, "reward_std": 0.273529494414106, "rewards/spct_argmax_reward_func": 0.5, "rewards/spct_format_reward_func": 0.18867187574505806, "step": 305 }, { "completion_length": 641.71875, "epoch": 0.07946503927806271, "grad_norm": 0.06420472264289856, "kl": 1.021563544869423, "learning_rate": 9.998501516650523e-05, "loss": 0.0005, "reward": 0.6738281697034836, "reward_std": 0.21143416315317154, "rewards/spct_argmax_reward_func": 0.484375, "rewards/spct_format_reward_func": 0.1894531212747097, "step": 306 }, { "completion_length": 644.921875, "epoch": 0.07972472894890606, "grad_norm": 0.06478146463632584, "kl": 0.9087735861539841, "learning_rate": 9.9984915106149e-05, "loss": 0.0005, "reward": 0.6976562887430191, "reward_std": 0.27081795036792755, "rewards/spct_argmax_reward_func": 0.515625, "rewards/spct_format_reward_func": 0.18203125149011612, "step": 307 }, { "completion_length": 665.640625, "epoch": 0.0799844186197494, "grad_norm": 0.054687365889549255, "kl": 0.9179468750953674, "learning_rate": 9.99848147128803e-05, "loss": 0.0005, "reward": 0.9656250476837158, "reward_std": 0.20200317353010178, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 308 }, { "completion_length": 720.234375, "epoch": 0.08024410829059274, "grad_norm": 0.06862634420394897, "kl": 0.8547915667295456, "learning_rate": 9.998471398669982e-05, "loss": 0.0004, "reward": 0.8406250476837158, "reward_std": 0.2909187823534012, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 309 }, { "completion_length": 724.78125, "epoch": 0.08050379796143609, "grad_norm": 0.08215752243995667, "kl": 0.8116766065359116, "learning_rate": 9.998461292760821e-05, "loss": 0.0004, "reward": 0.7625000476837158, "reward_std": 0.43042195588350296, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 310 }, { "completion_length": 634.40625, "epoch": 0.08076348763227943, "grad_norm": 0.04632147029042244, "kl": 0.9813335835933685, "learning_rate": 9.998451153560613e-05, "loss": 0.0005, "reward": 0.9343750476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 311 }, { "completion_length": 685.546875, "epoch": 0.08102317730312276, "grad_norm": 0.0599154569208622, "kl": 0.9256118834018707, "learning_rate": 9.99844098106943e-05, "loss": 0.0005, "reward": 0.8406250476837158, "reward_std": 0.1923343911767006, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 312 }, { "completion_length": 559.796875, "epoch": 0.08128286697396611, "grad_norm": 0.09155964851379395, "kl": 0.8925035148859024, "learning_rate": 9.998430775287335e-05, "loss": 0.0004, "reward": 0.7968750447034836, "reward_std": 0.42558756470680237, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 313 }, { "completion_length": 618.875, "epoch": 0.08154255664480946, "grad_norm": 0.07477092742919922, "kl": 0.9469990581274033, "learning_rate": 9.9984205362144e-05, "loss": 0.0005, "reward": 0.8972656726837158, "reward_std": 0.2685019001364708, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.19414062798023224, "step": 314 }, { "completion_length": 628.578125, "epoch": 0.08180224631565279, "grad_norm": 0.08475860208272934, "kl": 0.8998289406299591, "learning_rate": 9.99841026385069e-05, "loss": 0.0004, "reward": 0.8718750476837158, "reward_std": 0.3270031735301018, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 315 }, { "completion_length": 614.5625, "epoch": 0.08206193598649614, "grad_norm": 0.09692920744419098, "kl": 0.9888022243976593, "learning_rate": 9.998399958196274e-05, "loss": 0.0005, "reward": 0.6960937827825546, "reward_std": 0.33036988228559494, "rewards/spct_argmax_reward_func": 0.515625, "rewards/spct_format_reward_func": 0.18046874925494194, "step": 316 }, { "completion_length": 647.734375, "epoch": 0.08232162565733948, "grad_norm": 0.07116212695837021, "kl": 1.080403134226799, "learning_rate": 9.998389619251223e-05, "loss": 0.0005, "reward": 0.8875000476837158, "reward_std": 0.23325317353010178, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 317 }, { "completion_length": 628.375, "epoch": 0.08258131532818282, "grad_norm": 0.0687141865491867, "kl": 0.9129016995429993, "learning_rate": 9.998379247015603e-05, "loss": 0.0005, "reward": 0.9187500476837158, "reward_std": 0.24292195588350296, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 318 }, { "completion_length": 575.953125, "epoch": 0.08284100499902616, "grad_norm": 0.07934101670980453, "kl": 1.1829459369182587, "learning_rate": 9.998368841489485e-05, "loss": 0.0006, "reward": 1.0125000476837158, "reward_std": 0.26933756470680237, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 319 }, { "completion_length": 543.65625, "epoch": 0.08310069466986951, "grad_norm": 0.09901712834835052, "kl": 1.0874472707509995, "learning_rate": 9.998358402672936e-05, "loss": 0.0005, "reward": 0.8488281667232513, "reward_std": 0.31058451533317566, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.19257812574505806, "step": 320 }, { "completion_length": 556.84375, "epoch": 0.08336038434071284, "grad_norm": 0.09588198363780975, "kl": 1.3019382804632187, "learning_rate": 9.998347930566029e-05, "loss": 0.0007, "reward": 0.8718750476837158, "reward_std": 0.30058756470680237, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 321 }, { "completion_length": 504.046875, "epoch": 0.08362007401155619, "grad_norm": 0.06778541207313538, "kl": 0.9739344567060471, "learning_rate": 9.998337425168831e-05, "loss": 0.0005, "reward": 0.9773437976837158, "reward_std": 0.17526372522115707, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.19609375298023224, "step": 322 }, { "completion_length": 493.546875, "epoch": 0.08387976368239954, "grad_norm": 0.06831658631563187, "kl": 1.0333092212677002, "learning_rate": 9.998326886481413e-05, "loss": 0.0005, "reward": 0.8875000476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 323 }, { "completion_length": 488.96875, "epoch": 0.08413945335324287, "grad_norm": 0.0660349503159523, "kl": 1.0204705893993378, "learning_rate": 9.998316314503843e-05, "loss": 0.0005, "reward": 1.0125000476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 324 }, { "completion_length": 444.5625, "epoch": 0.08439914302408622, "grad_norm": 0.5714083313941956, "kl": 6.531572043895721, "learning_rate": 9.998305709236195e-05, "loss": 0.0033, "reward": 0.9031250476837158, "reward_std": 0.1923343911767006, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 325 }, { "completion_length": 453.890625, "epoch": 0.08465883269492956, "grad_norm": 0.07456135749816895, "kl": 0.9686177372932434, "learning_rate": 9.998295070678538e-05, "loss": 0.0005, "reward": 0.9968750476837158, "reward_std": 0.1923343911767006, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 326 }, { "completion_length": 414.90625, "epoch": 0.08491852236577291, "grad_norm": 0.06689681112766266, "kl": 1.0684386491775513, "learning_rate": 9.998284398830944e-05, "loss": 0.0005, "reward": 0.9902344197034836, "reward_std": 0.160707488656044, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.193359375, "step": 327 }, { "completion_length": 444.453125, "epoch": 0.08517821203661624, "grad_norm": 0.06112468242645264, "kl": 1.0534724444150925, "learning_rate": 9.998273693693482e-05, "loss": 0.0005, "reward": 0.9343750476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 328 }, { "completion_length": 404.625, "epoch": 0.08543790170745959, "grad_norm": 0.09798100590705872, "kl": 1.189456284046173, "learning_rate": 9.998262955266225e-05, "loss": 0.0006, "reward": 0.8914062827825546, "reward_std": 0.1729135513305664, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.18828124925494194, "step": 329 }, { "completion_length": 387.546875, "epoch": 0.08569759137830293, "grad_norm": 0.0701475515961647, "kl": 1.1230026185512543, "learning_rate": 9.998252183549241e-05, "loss": 0.0006, "reward": 0.9343750476837158, "reward_std": 0.1923343911767006, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 330 }, { "completion_length": 379.0, "epoch": 0.08595728104914627, "grad_norm": 0.09980615228414536, "kl": 1.821117490530014, "learning_rate": 9.998241378542608e-05, "loss": 0.0009, "reward": 0.7625000476837158, "reward_std": 0.2235843911767006, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 331 }, { "completion_length": 378.234375, "epoch": 0.08621697071998961, "grad_norm": 0.06917643547058105, "kl": 1.075247049331665, "learning_rate": 9.998230540246392e-05, "loss": 0.0005, "reward": 0.8406250476837158, "reward_std": 0.1923343911767006, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 332 }, { "completion_length": 368.34375, "epoch": 0.08647666039083296, "grad_norm": 0.07459413260221481, "kl": 1.6621658504009247, "learning_rate": 9.998219668660669e-05, "loss": 0.0008, "reward": 0.8406250476837158, "reward_std": 0.1923343911767006, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 333 }, { "completion_length": 358.21875, "epoch": 0.08673635006167629, "grad_norm": 0.11326862126588821, "kl": 2.591246098279953, "learning_rate": 9.99820876378551e-05, "loss": 0.0013, "reward": 0.9187500476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 334 }, { "completion_length": 346.609375, "epoch": 0.08699603973251964, "grad_norm": 0.07941832393407822, "kl": 1.1789697706699371, "learning_rate": 9.998197825620987e-05, "loss": 0.0006, "reward": 0.9656250476837158, "reward_std": 0.1923343911767006, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 335 }, { "completion_length": 346.5, "epoch": 0.08725572940336299, "grad_norm": 0.07914634048938751, "kl": 1.2860912084579468, "learning_rate": 9.998186854167173e-05, "loss": 0.0006, "reward": 0.9343750476837158, "reward_std": 0.20200317353010178, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 336 }, { "completion_length": 376.90625, "epoch": 0.08751541907420632, "grad_norm": 0.06736548990011215, "kl": 1.1506821513175964, "learning_rate": 9.998175849424143e-05, "loss": 0.0006, "reward": 0.7781250476837158, "reward_std": 0.20200317353010178, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 337 }, { "completion_length": 377.96875, "epoch": 0.08777510874504967, "grad_norm": 0.054172925651073456, "kl": 1.1317346096038818, "learning_rate": 9.998164811391967e-05, "loss": 0.0006, "reward": 0.7937500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 338 }, { "completion_length": 387.484375, "epoch": 0.08803479841589301, "grad_norm": 0.06626787036657333, "kl": 1.3881450593471527, "learning_rate": 9.998153740070723e-05, "loss": 0.0007, "reward": 0.8875000476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 339 }, { "completion_length": 385.328125, "epoch": 0.08829448808673634, "grad_norm": 0.0666026920080185, "kl": 1.284679263830185, "learning_rate": 9.998142635460481e-05, "loss": 0.0006, "reward": 0.7312500402331352, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.53125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 340 }, { "completion_length": 374.734375, "epoch": 0.08855417775757969, "grad_norm": 0.07157890498638153, "kl": 1.2147049009799957, "learning_rate": 9.998131497561315e-05, "loss": 0.0006, "reward": 0.9187500476837158, "reward_std": 0.2235843911767006, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 341 }, { "completion_length": 342.40625, "epoch": 0.08881386742842304, "grad_norm": 0.06417258828878403, "kl": 1.2262182831764221, "learning_rate": 9.998120326373302e-05, "loss": 0.0006, "reward": 0.9531250447034836, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 342 }, { "completion_length": 389.96875, "epoch": 0.08907355709926637, "grad_norm": 0.05919058993458748, "kl": 1.3023448586463928, "learning_rate": 9.998109121896515e-05, "loss": 0.0007, "reward": 0.7460937947034836, "reward_std": 0.16748128086328506, "rewards/spct_argmax_reward_func": 0.546875, "rewards/spct_format_reward_func": 0.19921875, "step": 343 }, { "completion_length": 395.109375, "epoch": 0.08933324677010972, "grad_norm": 0.07081857323646545, "kl": 1.1932538747787476, "learning_rate": 9.998097884131027e-05, "loss": 0.0006, "reward": 1.0125000476837158, "reward_std": 0.23325317353010178, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 344 }, { "completion_length": 351.03125, "epoch": 0.08959293644095306, "grad_norm": 0.04930201172828674, "kl": 1.2350392043590546, "learning_rate": 9.998086613076914e-05, "loss": 0.0006, "reward": 1.0281250476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 345 }, { "completion_length": 399.484375, "epoch": 0.0898526261117964, "grad_norm": 0.06826100498437881, "kl": 1.206580489873886, "learning_rate": 9.998075308734253e-05, "loss": 0.0006, "reward": 0.8703125417232513, "reward_std": 0.19545939564704895, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.19843750074505806, "step": 346 }, { "completion_length": 350.21875, "epoch": 0.09011231578263974, "grad_norm": 0.07486073672771454, "kl": 1.3126424252986908, "learning_rate": 9.998063971103115e-05, "loss": 0.0007, "reward": 0.930468812584877, "reward_std": 0.23410651832818985, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.19609374925494194, "step": 347 }, { "completion_length": 353.546875, "epoch": 0.09037200545348309, "grad_norm": 0.06245407834649086, "kl": 1.3052745759487152, "learning_rate": 9.998052600183581e-05, "loss": 0.0007, "reward": 1.0125000476837158, "reward_std": 0.20683756470680237, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 348 }, { "completion_length": 342.546875, "epoch": 0.09063169512432644, "grad_norm": 0.04284998029470444, "kl": 1.3482347130775452, "learning_rate": 9.998041195975723e-05, "loss": 0.0007, "reward": 1.0906250476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.890625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 349 }, { "completion_length": 379.390625, "epoch": 0.09089138479516977, "grad_norm": 0.0774451494216919, "kl": 1.231341004371643, "learning_rate": 9.998029758479616e-05, "loss": 0.0006, "reward": 0.8093750476837158, "reward_std": 0.2645031735301018, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 350 }, { "completion_length": 341.0625, "epoch": 0.09115107446601312, "grad_norm": 0.051155686378479004, "kl": 1.253787100315094, "learning_rate": 9.998018287695338e-05, "loss": 0.0006, "reward": 0.9656250476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 351 }, { "completion_length": 351.453125, "epoch": 0.09141076413685646, "grad_norm": 0.08161509037017822, "kl": 1.7663933336734772, "learning_rate": 9.998006783622966e-05, "loss": 0.0009, "reward": 0.7863281667232513, "reward_std": 0.22856227308511734, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.19257812574505806, "step": 352 }, { "completion_length": 376.453125, "epoch": 0.0916704538076998, "grad_norm": 0.08294248580932617, "kl": 1.2230308651924133, "learning_rate": 9.997995246262576e-05, "loss": 0.0006, "reward": 0.9031250476837158, "reward_std": 0.28125, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 353 }, { "completion_length": 368.15625, "epoch": 0.09193014347854314, "grad_norm": 0.0678657665848732, "kl": 1.194478988647461, "learning_rate": 9.997983675614247e-05, "loss": 0.0006, "reward": 0.8562500476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 354 }, { "completion_length": 370.9375, "epoch": 0.09218983314938649, "grad_norm": 0.05393889918923378, "kl": 1.1320741176605225, "learning_rate": 9.997972071678051e-05, "loss": 0.0006, "reward": 1.0437500476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 355 }, { "completion_length": 379.9375, "epoch": 0.09244952282022982, "grad_norm": 0.04812407121062279, "kl": 1.2469850480556488, "learning_rate": 9.997960434454069e-05, "loss": 0.0006, "reward": 1.0593750476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.859375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 356 }, { "completion_length": 369.921875, "epoch": 0.09270921249107317, "grad_norm": 0.06278344988822937, "kl": 1.4400579929351807, "learning_rate": 9.997948763942378e-05, "loss": 0.0007, "reward": 1.0125000476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 357 }, { "completion_length": 434.171875, "epoch": 0.09296890216191651, "grad_norm": 0.04322703182697296, "kl": 1.0849723815917969, "learning_rate": 9.997937060143054e-05, "loss": 0.0005, "reward": 1.0417969226837158, "reward_std": 0.13857503235340118, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 358 }, { "completion_length": 356.234375, "epoch": 0.09322859183275985, "grad_norm": 0.17746952176094055, "kl": 3.0698519945144653, "learning_rate": 9.997925323056178e-05, "loss": 0.0015, "reward": 0.9332031756639481, "reward_std": 0.19364523142576218, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.19882812723517418, "step": 359 }, { "completion_length": 419.84375, "epoch": 0.0934882815036032, "grad_norm": 0.07352716475725174, "kl": 1.105272650718689, "learning_rate": 9.997913552681826e-05, "loss": 0.0006, "reward": 0.8250000476837158, "reward_std": 0.23325317353010178, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 360 }, { "completion_length": 419.65625, "epoch": 0.09374797117444654, "grad_norm": 0.07044453918933868, "kl": 1.1236870884895325, "learning_rate": 9.997901749020076e-05, "loss": 0.0006, "reward": 0.7312500402331352, "reward_std": 0.26933756470680237, "rewards/spct_argmax_reward_func": 0.53125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 361 }, { "completion_length": 379.796875, "epoch": 0.09400766084528987, "grad_norm": 0.06636416912078857, "kl": 1.2463771998882294, "learning_rate": 9.997889912071008e-05, "loss": 0.0006, "reward": 1.0105469226837158, "reward_std": 0.12890625, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 362 }, { "completion_length": 396.765625, "epoch": 0.09426735051613322, "grad_norm": 0.07192157953977585, "kl": 1.2425108850002289, "learning_rate": 9.997878041834701e-05, "loss": 0.0006, "reward": 0.9187500476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 363 }, { "completion_length": 413.203125, "epoch": 0.09452704018697657, "grad_norm": 0.0440351739525795, "kl": 1.0295661985874176, "learning_rate": 9.997866138311231e-05, "loss": 0.0005, "reward": 0.8718750476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 364 }, { "completion_length": 362.3125, "epoch": 0.0947867298578199, "grad_norm": 0.08340172469615936, "kl": 1.2919241189956665, "learning_rate": 9.997854201500682e-05, "loss": 0.0006, "reward": 0.8457031548023224, "reward_std": 0.1612292304635048, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.173828125, "step": 365 }, { "completion_length": 415.734375, "epoch": 0.09504641952866325, "grad_norm": 0.047237545251846313, "kl": 1.157056376338005, "learning_rate": 9.997842231403132e-05, "loss": 0.0006, "reward": 0.9343750476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 366 }, { "completion_length": 391.75, "epoch": 0.09530610919950659, "grad_norm": 0.06725894659757614, "kl": 1.010798156261444, "learning_rate": 9.997830228018657e-05, "loss": 0.0005, "reward": 0.9175781756639481, "reward_std": 0.22592813521623611, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.19882812723517418, "step": 367 }, { "completion_length": 394.71875, "epoch": 0.09556579887034994, "grad_norm": 0.06364858895540237, "kl": 1.0522731840610504, "learning_rate": 9.997818191347342e-05, "loss": 0.0005, "reward": 0.832031287252903, "reward_std": 0.09531249850988388, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.19140625, "step": 368 }, { "completion_length": 409.34375, "epoch": 0.09582548854119327, "grad_norm": 0.061755359172821045, "kl": 1.0958834141492844, "learning_rate": 9.997806121389265e-05, "loss": 0.0005, "reward": 0.9500000476837158, "reward_std": 0.17075317353010178, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 369 }, { "completion_length": 376.03125, "epoch": 0.09608517821203662, "grad_norm": 0.06287287175655365, "kl": 1.0689317882061005, "learning_rate": 9.997794018144505e-05, "loss": 0.0005, "reward": 0.7843750417232513, "reward_std": 0.10038861096836627, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.19062500074505806, "step": 370 }, { "completion_length": 390.875, "epoch": 0.09634486788287996, "grad_norm": 0.05660751089453697, "kl": 0.9867503643035889, "learning_rate": 9.997781881613146e-05, "loss": 0.0005, "reward": 0.9187500476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 371 }, { "completion_length": 406.34375, "epoch": 0.0966045575537233, "grad_norm": 0.06528208404779434, "kl": 1.1439546644687653, "learning_rate": 9.997769711795265e-05, "loss": 0.0006, "reward": 0.8523437976837158, "reward_std": 0.1688968911767006, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.19609375298023224, "step": 372 }, { "completion_length": 403.3125, "epoch": 0.09686424722456664, "grad_norm": 0.0563204251229763, "kl": 1.027309626340866, "learning_rate": 9.997757508690946e-05, "loss": 0.0005, "reward": 0.8718750476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 373 }, { "completion_length": 364.34375, "epoch": 0.09712393689540999, "grad_norm": 0.05896495655179024, "kl": 0.9268849045038223, "learning_rate": 9.997745272300268e-05, "loss": 0.0005, "reward": 0.8582031726837158, "reward_std": 0.1682625338435173, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.18632812798023224, "step": 374 }, { "completion_length": 377.84375, "epoch": 0.09738362656625332, "grad_norm": 0.07101687043905258, "kl": 1.0347915440797806, "learning_rate": 9.997733002623316e-05, "loss": 0.0005, "reward": 1.0125000476837158, "reward_std": 0.20683756470680237, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 375 }, { "completion_length": 375.625, "epoch": 0.09764331623709667, "grad_norm": 0.07674078643321991, "kl": 0.95893394947052, "learning_rate": 9.997720699660167e-05, "loss": 0.0005, "reward": 0.7468750476837158, "reward_std": 0.2645031735301018, "rewards/spct_argmax_reward_func": 0.546875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 376 }, { "completion_length": 314.6875, "epoch": 0.09790300590794002, "grad_norm": 0.06139516830444336, "kl": 1.0032539814710617, "learning_rate": 9.997708363410905e-05, "loss": 0.0005, "reward": 0.9062500447034836, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 377 }, { "completion_length": 356.421875, "epoch": 0.09816269557878335, "grad_norm": 0.06432578712701797, "kl": 0.9575898349285126, "learning_rate": 9.997695993875614e-05, "loss": 0.0005, "reward": 0.8699219226837158, "reward_std": 0.1337406411767006, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 378 }, { "completion_length": 337.890625, "epoch": 0.0984223852496267, "grad_norm": 0.0598599836230278, "kl": 0.8627411127090454, "learning_rate": 9.997683591054374e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.17075317353010178, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 379 }, { "completion_length": 317.859375, "epoch": 0.09868207492047004, "grad_norm": 0.0473361611366272, "kl": 1.0459037125110626, "learning_rate": 9.99767115494727e-05, "loss": 0.0005, "reward": 0.9656250476837158, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 380 }, { "completion_length": 313.125, "epoch": 0.09894176459131337, "grad_norm": 0.05492144078016281, "kl": 1.1516597270965576, "learning_rate": 9.997658685554381e-05, "loss": 0.0006, "reward": 0.8855469226837158, "reward_std": 0.06640625, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 381 }, { "completion_length": 287.640625, "epoch": 0.09920145426215672, "grad_norm": 0.07376208901405334, "kl": 1.4453851878643036, "learning_rate": 9.997646182875794e-05, "loss": 0.0007, "reward": 0.9800781607627869, "reward_std": 0.12734375149011612, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.19882812723517418, "step": 382 }, { "completion_length": 317.578125, "epoch": 0.09946114393300007, "grad_norm": 0.07554496079683304, "kl": 0.9691031277179718, "learning_rate": 9.997633646911591e-05, "loss": 0.0005, "reward": 0.7312500402331352, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.53125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 383 }, { "completion_length": 299.21875, "epoch": 0.0997208336038434, "grad_norm": 0.08817862719297409, "kl": 0.9809227585792542, "learning_rate": 9.997621077661854e-05, "loss": 0.0005, "reward": 0.9656250476837158, "reward_std": 0.15625, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 384 }, { "completion_length": 296.328125, "epoch": 0.09998052327468675, "grad_norm": 0.04751366004347801, "kl": 0.9665056467056274, "learning_rate": 9.997608475126668e-05, "loss": 0.0005, "reward": 0.9968750476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 385 }, { "completion_length": 309.984375, "epoch": 0.1002402129455301, "grad_norm": 0.060673341155052185, "kl": 1.1267093122005463, "learning_rate": 9.997595839306118e-05, "loss": 0.0006, "reward": 0.8250000476837158, "reward_std": 0.17075317353010178, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 386 }, { "completion_length": 305.90625, "epoch": 0.10049990261637344, "grad_norm": 0.048185862600803375, "kl": 1.0274233967065811, "learning_rate": 9.997583170200285e-05, "loss": 0.0005, "reward": 0.8449219167232513, "reward_std": 0.09936564415693283, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.18867187574505806, "step": 387 }, { "completion_length": 317.015625, "epoch": 0.10075959228721677, "grad_norm": 0.0573749877512455, "kl": 0.9329746216535568, "learning_rate": 9.997570467809256e-05, "loss": 0.0005, "reward": 0.9031250476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 388 }, { "completion_length": 339.125, "epoch": 0.10101928195806012, "grad_norm": 0.05121227353811264, "kl": 0.9600767195224762, "learning_rate": 9.997557732133115e-05, "loss": 0.0005, "reward": 0.8093750476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 389 }, { "completion_length": 335.21875, "epoch": 0.10127897162890347, "grad_norm": 0.07630322873592377, "kl": 0.9713246077299118, "learning_rate": 9.997544963171947e-05, "loss": 0.0005, "reward": 0.9718750417232513, "reward_std": 0.06430422142148018, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.19062500074505806, "step": 390 }, { "completion_length": 288.546875, "epoch": 0.1015386612997468, "grad_norm": 0.044206876307725906, "kl": 1.025971531867981, "learning_rate": 9.997532160925837e-05, "loss": 0.0005, "reward": 0.828125037252903, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 391 }, { "completion_length": 320.953125, "epoch": 0.10179835097059015, "grad_norm": 0.05448157712817192, "kl": 1.0082298815250397, "learning_rate": 9.99751932539487e-05, "loss": 0.0005, "reward": 0.9078125357627869, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.18906250223517418, "step": 392 }, { "completion_length": 323.140625, "epoch": 0.10205804064143349, "grad_norm": 0.30842283368110657, "kl": 2.9790831953287125, "learning_rate": 9.997506456579132e-05, "loss": 0.0015, "reward": 0.8617188036441803, "reward_std": 0.13139688968658447, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.18984375149011612, "step": 393 }, { "completion_length": 313.28125, "epoch": 0.10231773031227683, "grad_norm": 0.06485384702682495, "kl": 1.1000336706638336, "learning_rate": 9.997493554478707e-05, "loss": 0.0006, "reward": 1.0593750476837158, "reward_std": 0.15625, "rewards/spct_argmax_reward_func": 0.859375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 394 }, { "completion_length": 337.265625, "epoch": 0.10257741998312017, "grad_norm": 0.04161767661571503, "kl": 0.8176599889993668, "learning_rate": 9.997480619093682e-05, "loss": 0.0004, "reward": 0.718750037252903, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.53125, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 395 }, { "completion_length": 314.203125, "epoch": 0.10283710965396352, "grad_norm": 0.045376889407634735, "kl": 0.8770899325609207, "learning_rate": 9.997467650424144e-05, "loss": 0.0004, "reward": 0.9343750476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 396 }, { "completion_length": 330.6875, "epoch": 0.10309679932480685, "grad_norm": 0.058706946671009064, "kl": 1.1744676381349564, "learning_rate": 9.997454648470179e-05, "loss": 0.0006, "reward": 0.7546875476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 397 }, { "completion_length": 322.8125, "epoch": 0.1033564889956502, "grad_norm": 0.08368602395057678, "kl": 1.4005443155765533, "learning_rate": 9.997441613231872e-05, "loss": 0.0007, "reward": 0.8855469226837158, "reward_std": 0.1649906411767006, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 398 }, { "completion_length": 370.359375, "epoch": 0.10361617866649354, "grad_norm": 0.022008171305060387, "kl": 0.8287539929151535, "learning_rate": 9.997428544709313e-05, "loss": 0.0004, "reward": 0.7781250402331352, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 399 }, { "completion_length": 369.40625, "epoch": 0.10387586833733688, "grad_norm": 0.07031254470348358, "kl": 0.8283755630254745, "learning_rate": 9.997415442902586e-05, "loss": 0.0004, "reward": 0.7937500476837158, "reward_std": 0.20683756470680237, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 400 }, { "completion_length": 344.453125, "epoch": 0.10413555800818022, "grad_norm": 0.23568494617938995, "kl": 1.7452974766492844, "learning_rate": 9.99740230781178e-05, "loss": 0.0009, "reward": 0.9343750476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 401 }, { "completion_length": 365.171875, "epoch": 0.10439524767902357, "grad_norm": 0.04931305721402168, "kl": 0.7819005399942398, "learning_rate": 9.997389139436982e-05, "loss": 0.0004, "reward": 1.0281250476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 402 }, { "completion_length": 380.4375, "epoch": 0.1046549373498669, "grad_norm": 0.07516228407621384, "kl": 1.978725552558899, "learning_rate": 9.997375937778277e-05, "loss": 0.001, "reward": 0.9656250476837158, "reward_std": 0.20200317353010178, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 403 }, { "completion_length": 372.890625, "epoch": 0.10491462702071025, "grad_norm": 204.4166717529297, "kl": 2062.6086312383413, "learning_rate": 9.997362702835758e-05, "loss": 1.0313, "reward": 0.8250000476837158, "reward_std": 0.17075317353010178, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 404 }, { "completion_length": 393.515625, "epoch": 0.1051743166915536, "grad_norm": 0.0772203579545021, "kl": 0.8303817510604858, "learning_rate": 9.99734943460951e-05, "loss": 0.0004, "reward": 0.8718750476837158, "reward_std": 0.20200317353010178, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 405 }, { "completion_length": 363.25, "epoch": 0.10543400636239694, "grad_norm": 0.10222291201353073, "kl": 0.8917846083641052, "learning_rate": 9.997336133099624e-05, "loss": 0.0004, "reward": 0.9109375476837158, "reward_std": 0.2358047217130661, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.19218749925494194, "step": 406 }, { "completion_length": 377.703125, "epoch": 0.10569369603324028, "grad_norm": 0.09219878911972046, "kl": 0.9360746741294861, "learning_rate": 9.997322798306185e-05, "loss": 0.0005, "reward": 0.9187500476837158, "reward_std": 0.2957531735301018, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 407 }, { "completion_length": 356.0, "epoch": 0.10595338570408362, "grad_norm": 0.07170337438583374, "kl": 0.7870480269193649, "learning_rate": 9.997309430229282e-05, "loss": 0.0004, "reward": 0.7773437947034836, "reward_std": 0.20167933404445648, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.1835937537252903, "step": 408 }, { "completion_length": 394.59375, "epoch": 0.10621307537492697, "grad_norm": 0.04139097034931183, "kl": 0.7746706008911133, "learning_rate": 9.997296028869008e-05, "loss": 0.0004, "reward": 0.9187500476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 409 }, { "completion_length": 356.859375, "epoch": 0.1064727650457703, "grad_norm": 0.09513399004936218, "kl": 0.9585903882980347, "learning_rate": 9.997282594225449e-05, "loss": 0.0005, "reward": 0.9753906726837158, "reward_std": 0.2680855765938759, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.19414062798023224, "step": 410 }, { "completion_length": 371.203125, "epoch": 0.10673245471661365, "grad_norm": 0.06146064028143883, "kl": 0.8115669190883636, "learning_rate": 9.997269126298695e-05, "loss": 0.0004, "reward": 0.9968750476837158, "reward_std": 0.20200317353010178, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 411 }, { "completion_length": 356.71875, "epoch": 0.106992144387457, "grad_norm": 0.06521661579608917, "kl": 0.7665913105010986, "learning_rate": 9.997255625088837e-05, "loss": 0.0004, "reward": 0.9968750476837158, "reward_std": 0.20200317353010178, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 412 }, { "completion_length": 361.125, "epoch": 0.10725183405830033, "grad_norm": 0.06713630259037018, "kl": 0.7168350666761398, "learning_rate": 9.997242090595962e-05, "loss": 0.0004, "reward": 1.0281250476837158, "reward_std": 0.1923343911767006, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 413 }, { "completion_length": 380.921875, "epoch": 0.10751152372914367, "grad_norm": 0.12109723687171936, "kl": 0.8341867923736572, "learning_rate": 9.997228522820162e-05, "loss": 0.0004, "reward": 0.7906250506639481, "reward_std": 0.3618821054697037, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.19687500223517418, "step": 414 }, { "completion_length": 359.578125, "epoch": 0.10777121339998702, "grad_norm": 0.047842495143413544, "kl": 0.7978852391242981, "learning_rate": 9.997214921761529e-05, "loss": 0.0004, "reward": 0.8406250476837158, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 415 }, { "completion_length": 358.34375, "epoch": 0.10803090307083035, "grad_norm": 0.09739437699317932, "kl": 0.9207877963781357, "learning_rate": 9.99720128742015e-05, "loss": 0.0005, "reward": 0.9187500476837158, "reward_std": 0.3582531735301018, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 416 }, { "completion_length": 383.140625, "epoch": 0.1082905927416737, "grad_norm": 0.050993047654628754, "kl": 0.78919717669487, "learning_rate": 9.99718761979612e-05, "loss": 0.0004, "reward": 0.9343750476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 417 }, { "completion_length": 411.546875, "epoch": 0.10855028241251705, "grad_norm": 0.05443549528717995, "kl": 0.7595970928668976, "learning_rate": 9.997173918889527e-05, "loss": 0.0004, "reward": 0.8875000476837158, "reward_std": 0.17075317353010178, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 418 }, { "completion_length": 388.796875, "epoch": 0.10880997208336038, "grad_norm": 0.07016085088253021, "kl": 0.9001446813344955, "learning_rate": 9.997160184700462e-05, "loss": 0.0005, "reward": 0.8347656726837158, "reward_std": 0.16024872288107872, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.19414062425494194, "step": 419 }, { "completion_length": 404.796875, "epoch": 0.10906966175420373, "grad_norm": 0.04491250216960907, "kl": 0.7565741837024689, "learning_rate": 9.997146417229017e-05, "loss": 0.0004, "reward": 1.1031250357627869, "reward_std": 0.13860391825437546, "rewards/spct_argmax_reward_func": 0.90625, "rewards/spct_format_reward_func": 0.19687500223517418, "step": 420 }, { "completion_length": 387.609375, "epoch": 0.10932935142504707, "grad_norm": 0.03923279047012329, "kl": 0.8786362260580063, "learning_rate": 9.997132616475286e-05, "loss": 0.0004, "reward": 1.087500050663948, "reward_std": 0.10000000149011612, "rewards/spct_argmax_reward_func": 0.890625, "rewards/spct_format_reward_func": 0.19687500223517418, "step": 421 }, { "completion_length": 381.6875, "epoch": 0.1095890410958904, "grad_norm": 0.05555080622434616, "kl": 0.8031009286642075, "learning_rate": 9.997118782439358e-05, "loss": 0.0004, "reward": 1.0718750357627869, "reward_std": 0.13125000149011612, "rewards/spct_argmax_reward_func": 0.875, "rewards/spct_format_reward_func": 0.19687500223517418, "step": 422 }, { "completion_length": 379.59375, "epoch": 0.10984873076673375, "grad_norm": 0.07823827117681503, "kl": 0.7930884212255478, "learning_rate": 9.997104915121327e-05, "loss": 0.0004, "reward": 0.9312500357627869, "reward_std": 0.1985843926668167, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.19687500223517418, "step": 423 }, { "completion_length": 379.109375, "epoch": 0.1101084204375771, "grad_norm": 0.08440396189689636, "kl": 0.7945212125778198, "learning_rate": 9.997091014521285e-05, "loss": 0.0004, "reward": 0.8250000476837158, "reward_std": 0.2596687823534012, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 424 }, { "completion_length": 335.703125, "epoch": 0.11036811010842043, "grad_norm": 0.06531116366386414, "kl": 0.7769049406051636, "learning_rate": 9.997077080639323e-05, "loss": 0.0004, "reward": 0.9656250476837158, "reward_std": 0.17558756470680237, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 425 }, { "completion_length": 360.671875, "epoch": 0.11062779977926378, "grad_norm": 0.055094603449106216, "kl": 0.7887778431177139, "learning_rate": 9.997063113475536e-05, "loss": 0.0004, "reward": 0.8250000402331352, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 426 }, { "completion_length": 371.96875, "epoch": 0.11088748945010712, "grad_norm": 0.06373169273138046, "kl": 0.8537097871303558, "learning_rate": 9.997049113030015e-05, "loss": 0.0004, "reward": 0.8562500476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 427 }, { "completion_length": 335.734375, "epoch": 0.11114717912095047, "grad_norm": 0.08931002765893936, "kl": 1.2894207239151, "learning_rate": 9.997035079302854e-05, "loss": 0.0006, "reward": 0.8250000476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 428 }, { "completion_length": 322.875, "epoch": 0.1114068687917938, "grad_norm": 0.05015404894948006, "kl": 0.7908303588628769, "learning_rate": 9.997021012294147e-05, "loss": 0.0004, "reward": 0.7625000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 429 }, { "completion_length": 347.234375, "epoch": 0.11166655846263715, "grad_norm": 0.0819850042462349, "kl": 0.741546705365181, "learning_rate": 9.997006912003989e-05, "loss": 0.0004, "reward": 0.9316406697034836, "reward_std": 0.30605632066726685, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.197265625, "step": 430 }, { "completion_length": 310.84375, "epoch": 0.1119262481334805, "grad_norm": 0.07982911914587021, "kl": 0.7731579393148422, "learning_rate": 9.996992778432471e-05, "loss": 0.0004, "reward": 0.7937500476837158, "reward_std": 0.2235843911767006, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 431 }, { "completion_length": 276.125, "epoch": 0.11218593780432383, "grad_norm": 0.06078411638736725, "kl": 0.8112277984619141, "learning_rate": 9.996978611579688e-05, "loss": 0.0004, "reward": 0.8437500447034836, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 432 }, { "completion_length": 282.078125, "epoch": 0.11244562747516718, "grad_norm": 0.05779354274272919, "kl": 0.830097422003746, "learning_rate": 9.996964411445737e-05, "loss": 0.0004, "reward": 0.9187500476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 433 }, { "completion_length": 305.40625, "epoch": 0.11270531714601052, "grad_norm": 0.05896634981036186, "kl": 0.8055794537067413, "learning_rate": 9.996950178030709e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 434 }, { "completion_length": 316.546875, "epoch": 0.11296500681685386, "grad_norm": 0.08724737912416458, "kl": 0.9627173095941544, "learning_rate": 9.996935911334699e-05, "loss": 0.0005, "reward": 0.8707031756639481, "reward_std": 0.2034052163362503, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.19882812723517418, "step": 435 }, { "completion_length": 295.75, "epoch": 0.1132246964876972, "grad_norm": 0.05390793830156326, "kl": 0.8028766214847565, "learning_rate": 9.996921611357806e-05, "loss": 0.0004, "reward": 0.9656250476837158, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 436 }, { "completion_length": 323.234375, "epoch": 0.11348438615854055, "grad_norm": 0.024012966081500053, "kl": 0.7121115326881409, "learning_rate": 9.99690727810012e-05, "loss": 0.0004, "reward": 0.8718750476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 437 }, { "completion_length": 311.390625, "epoch": 0.11374407582938388, "grad_norm": 0.05137142911553383, "kl": 0.7505025863647461, "learning_rate": 9.99689291156174e-05, "loss": 0.0004, "reward": 0.7781250476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 438 }, { "completion_length": 333.71875, "epoch": 0.11400376550022723, "grad_norm": 0.089178167283535, "kl": 1.6075081825256348, "learning_rate": 9.99687851174276e-05, "loss": 0.0008, "reward": 0.7468750402331352, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.546875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 439 }, { "completion_length": 317.8125, "epoch": 0.11426345517107057, "grad_norm": 0.04597821459174156, "kl": 0.8917286843061447, "learning_rate": 9.996864078643277e-05, "loss": 0.0004, "reward": 0.8234375417232513, "reward_std": 0.07276666164398193, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.18281250447034836, "step": 440 }, { "completion_length": 300.328125, "epoch": 0.11452314484191391, "grad_norm": 0.03427162766456604, "kl": 0.8184358030557632, "learning_rate": 9.996849612263386e-05, "loss": 0.0004, "reward": 0.8406250476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 441 }, { "completion_length": 351.109375, "epoch": 0.11478283451275725, "grad_norm": 0.04359529912471771, "kl": 0.8682273775339127, "learning_rate": 9.996835112603184e-05, "loss": 0.0004, "reward": 1.0906250476837158, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.890625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 442 }, { "completion_length": 348.1875, "epoch": 0.1150425241836006, "grad_norm": 0.04126712679862976, "kl": 0.6450070142745972, "learning_rate": 9.99682057966277e-05, "loss": 0.0003, "reward": 0.8562500402331352, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 443 }, { "completion_length": 323.15625, "epoch": 0.11530221385444393, "grad_norm": 0.04587388411164284, "kl": 0.7942193299531937, "learning_rate": 9.996806013442236e-05, "loss": 0.0004, "reward": 0.9343750476837158, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 444 }, { "completion_length": 336.171875, "epoch": 0.11556190352528728, "grad_norm": 0.05765999108552933, "kl": 0.7983435839414597, "learning_rate": 9.996791413941682e-05, "loss": 0.0004, "reward": 0.7625000476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 445 }, { "completion_length": 398.03125, "epoch": 0.11582159319613063, "grad_norm": 0.032699521631002426, "kl": 0.8854140043258667, "learning_rate": 9.996776781161203e-05, "loss": 0.0004, "reward": 0.9773437976837158, "reward_std": 0.04059493914246559, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.19609375298023224, "step": 446 }, { "completion_length": 366.8125, "epoch": 0.11608128286697397, "grad_norm": 0.04913553223013878, "kl": 0.8628505319356918, "learning_rate": 9.9967621151009e-05, "loss": 0.0004, "reward": 0.7902344167232513, "reward_std": 0.07200013939291239, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.18085937574505806, "step": 447 }, { "completion_length": 353.53125, "epoch": 0.1163409725378173, "grad_norm": 0.0655585378408432, "kl": 0.8355454355478287, "learning_rate": 9.996747415760868e-05, "loss": 0.0004, "reward": 0.9964844286441803, "reward_std": 0.1042000325396657, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.19960937649011612, "step": 448 }, { "completion_length": 355.75, "epoch": 0.11660066220866065, "grad_norm": 0.05202576890587807, "kl": 0.9529969245195389, "learning_rate": 9.996732683141207e-05, "loss": 0.0005, "reward": 0.9343750476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 449 }, { "completion_length": 381.546875, "epoch": 0.116860351879504, "grad_norm": 2.1095356941223145, "kl": 20.293582797050476, "learning_rate": 9.996717917242013e-05, "loss": 0.0101, "reward": 0.8562500476837158, "reward_std": 0.03608439117670059, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 450 }, { "completion_length": 402.453125, "epoch": 0.11712004155034733, "grad_norm": 0.046623192727565765, "kl": 0.9690369367599487, "learning_rate": 9.996703118063384e-05, "loss": 0.0005, "reward": 0.9480469226837158, "reward_std": 0.10249064117670059, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 451 }, { "completion_length": 355.390625, "epoch": 0.11737973122119068, "grad_norm": 0.07367048412561417, "kl": 1.0056444853544235, "learning_rate": 9.996688285605421e-05, "loss": 0.0005, "reward": 0.9316406697034836, "reward_std": 0.16171875596046448, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.197265625, "step": 452 }, { "completion_length": 360.234375, "epoch": 0.11763942089203402, "grad_norm": 0.06098408252000809, "kl": 0.9471621215343475, "learning_rate": 9.996673419868221e-05, "loss": 0.0005, "reward": 0.8250000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 453 }, { "completion_length": 343.921875, "epoch": 0.11789911056287736, "grad_norm": 0.04663759097456932, "kl": 1.0222516655921936, "learning_rate": 9.996658520851884e-05, "loss": 0.0005, "reward": 0.9167969226837158, "reward_std": 0.10249064117670059, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 454 }, { "completion_length": 312.359375, "epoch": 0.1181588002337207, "grad_norm": 0.04321512579917908, "kl": 1.001597821712494, "learning_rate": 9.996643588556508e-05, "loss": 0.0005, "reward": 1.1667969226837158, "reward_std": 0.06640625, "rewards/spct_argmax_reward_func": 0.96875, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 455 }, { "completion_length": 294.03125, "epoch": 0.11841848990456405, "grad_norm": 0.02839808166027069, "kl": 0.9951864331960678, "learning_rate": 9.996628622982193e-05, "loss": 0.0005, "reward": 1.0437500476837158, "reward_std": 0.03608439117670059, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 456 }, { "completion_length": 302.359375, "epoch": 0.11867817957540738, "grad_norm": 0.04708895459771156, "kl": 0.8580925911664963, "learning_rate": 9.996613624129038e-05, "loss": 0.0004, "reward": 0.8875000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 457 }, { "completion_length": 259.078125, "epoch": 0.11893786924625073, "grad_norm": 0.04158490523695946, "kl": 0.7781384736299515, "learning_rate": 9.996598591997146e-05, "loss": 0.0004, "reward": 1.0000000447034836, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 458 }, { "completion_length": 275.046875, "epoch": 0.11919755891709408, "grad_norm": 0.06003032252192497, "kl": 0.8363835960626602, "learning_rate": 9.996583526586614e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 459 }, { "completion_length": 289.109375, "epoch": 0.11945724858793741, "grad_norm": 0.04742904379963875, "kl": 0.9671317040920258, "learning_rate": 9.996568427897544e-05, "loss": 0.0005, "reward": 0.8562500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 460 }, { "completion_length": 276.71875, "epoch": 0.11971693825878076, "grad_norm": 0.031016351655125618, "kl": 0.9186227023601532, "learning_rate": 9.996553295930033e-05, "loss": 0.0005, "reward": 0.9031250476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 461 }, { "completion_length": 267.234375, "epoch": 0.1199766279296241, "grad_norm": 0.02378900721669197, "kl": 0.8932260721921921, "learning_rate": 9.996538130684188e-05, "loss": 0.0004, "reward": 0.7156250365078449, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.515625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 462 }, { "completion_length": 276.984375, "epoch": 0.12023631760046743, "grad_norm": 0.052746035158634186, "kl": 0.9572068154811859, "learning_rate": 9.996522932160104e-05, "loss": 0.0005, "reward": 0.8406250476837158, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 463 }, { "completion_length": 245.9375, "epoch": 0.12049600727131078, "grad_norm": 0.0032744738273322582, "kl": 1.016908049583435, "learning_rate": 9.996507700357886e-05, "loss": 0.0005, "reward": 0.9500000476837158, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 464 }, { "completion_length": 267.75, "epoch": 0.12075569694215413, "grad_norm": 0.0017838302301242948, "kl": 0.9511768966913223, "learning_rate": 9.996492435277634e-05, "loss": 0.0005, "reward": 0.8250000402331352, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 465 }, { "completion_length": 253.890625, "epoch": 0.12101538661299747, "grad_norm": 0.0610818974673748, "kl": 0.9409800469875336, "learning_rate": 9.996477136919449e-05, "loss": 0.0005, "reward": 0.9488281607627869, "reward_std": 0.10092814266681671, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.19882812723517418, "step": 466 }, { "completion_length": 258.4375, "epoch": 0.12127507628384081, "grad_norm": 0.02701980620622635, "kl": 0.9622101038694382, "learning_rate": 9.996461805283434e-05, "loss": 0.0005, "reward": 0.9656250476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 467 }, { "completion_length": 262.484375, "epoch": 0.12153476595468415, "grad_norm": 0.040662359446287155, "kl": 0.9711817502975464, "learning_rate": 9.996446440369693e-05, "loss": 0.0005, "reward": 0.9343750476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 468 }, { "completion_length": 253.125, "epoch": 0.1217944556255275, "grad_norm": 0.04176074266433716, "kl": 0.908716231584549, "learning_rate": 9.996431042178324e-05, "loss": 0.0005, "reward": 1.0750000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 469 }, { "completion_length": 250.28125, "epoch": 0.12205414529637083, "grad_norm": 0.03575362265110016, "kl": 1.0071223378181458, "learning_rate": 9.996415610709433e-05, "loss": 0.0005, "reward": 0.7937500476837158, "reward_std": 0.03608439117670059, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 470 }, { "completion_length": 251.140625, "epoch": 0.12231383496721418, "grad_norm": 0.05208922177553177, "kl": 0.8868242502212524, "learning_rate": 9.996400145963121e-05, "loss": 0.0004, "reward": 0.8875000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 471 }, { "completion_length": 277.8125, "epoch": 0.12257352463805753, "grad_norm": 0.06929590553045273, "kl": 1.0473266392946243, "learning_rate": 9.996384647939491e-05, "loss": 0.0005, "reward": 0.6062500402331352, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.40625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 472 }, { "completion_length": 262.953125, "epoch": 0.12283321430890086, "grad_norm": 0.04220258817076683, "kl": 1.653901606798172, "learning_rate": 9.996369116638646e-05, "loss": 0.0008, "reward": 0.7937500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 473 }, { "completion_length": 283.578125, "epoch": 0.1230929039797442, "grad_norm": 0.17416371405124664, "kl": 3.455570250749588, "learning_rate": 9.996353552060692e-05, "loss": 0.0017, "reward": 0.7312500476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.53125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 474 }, { "completion_length": 271.859375, "epoch": 0.12335259365058755, "grad_norm": 0.06130778044462204, "kl": 0.9180019199848175, "learning_rate": 9.99633795420573e-05, "loss": 0.0005, "reward": 0.8875000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 475 }, { "completion_length": 270.546875, "epoch": 0.12361228332143089, "grad_norm": 0.045260366052389145, "kl": 0.9339238405227661, "learning_rate": 9.996322323073865e-05, "loss": 0.0005, "reward": 0.9031250476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 476 }, { "completion_length": 279.453125, "epoch": 0.12387197299227423, "grad_norm": 0.03316212072968483, "kl": 1.2198266685009003, "learning_rate": 9.996306658665202e-05, "loss": 0.0006, "reward": 0.9187500476837158, "reward_std": 0.03608439117670059, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 477 }, { "completion_length": 287.40625, "epoch": 0.12413166266311758, "grad_norm": 0.026573054492473602, "kl": 0.9177430272102356, "learning_rate": 9.996290960979844e-05, "loss": 0.0005, "reward": 0.9656250476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 478 }, { "completion_length": 300.046875, "epoch": 0.12439135233396091, "grad_norm": 0.06156982108950615, "kl": 0.9146386682987213, "learning_rate": 9.996275230017894e-05, "loss": 0.0005, "reward": 1.0234375447034836, "reward_std": 0.17135104537010193, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.1953125, "step": 479 }, { "completion_length": 283.75, "epoch": 0.12465104200480426, "grad_norm": 0.03512316569685936, "kl": 1.0412984937429428, "learning_rate": 9.996259465779461e-05, "loss": 0.0005, "reward": 0.8406250402331352, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 480 }, { "completion_length": 280.90625, "epoch": 0.1249107316756476, "grad_norm": 0.07337358593940735, "kl": 0.9535775929689407, "learning_rate": 9.996243668264644e-05, "loss": 0.0005, "reward": 0.9968750476837158, "reward_std": 0.17558756470680237, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 481 }, { "completion_length": 313.71875, "epoch": 0.12517042134649095, "grad_norm": 0.04675306752324104, "kl": 1.0183120965957642, "learning_rate": 9.996227837473555e-05, "loss": 0.0005, "reward": 0.9968750476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 482 }, { "completion_length": 314.46875, "epoch": 0.12543011101733428, "grad_norm": 0.04574199765920639, "kl": 0.9231317937374115, "learning_rate": 9.996211973406296e-05, "loss": 0.0005, "reward": 0.8875000476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 483 }, { "completion_length": 295.6875, "epoch": 0.12568980068817762, "grad_norm": 0.07973606884479523, "kl": 0.9733120799064636, "learning_rate": 9.996196076062972e-05, "loss": 0.0005, "reward": 0.8304687887430191, "reward_std": 0.16748128086328506, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.18984375149011612, "step": 484 }, { "completion_length": 319.078125, "epoch": 0.12594949035902098, "grad_norm": 0.0689970925450325, "kl": 0.8483351171016693, "learning_rate": 9.996180145443687e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.23325317353010178, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 485 }, { "completion_length": 328.875, "epoch": 0.1262091800298643, "grad_norm": 0.06559611111879349, "kl": 0.9142664074897766, "learning_rate": 9.996164181548552e-05, "loss": 0.0005, "reward": 0.9812500476837158, "reward_std": 0.23325317353010178, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 486 }, { "completion_length": 336.734375, "epoch": 0.12646886970070764, "grad_norm": 0.052299849689006805, "kl": 0.8430787175893784, "learning_rate": 9.996148184377673e-05, "loss": 0.0004, "reward": 0.9343750476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 487 }, { "completion_length": 291.734375, "epoch": 0.126728559371551, "grad_norm": 0.08559589833021164, "kl": 0.8611534237861633, "learning_rate": 9.996132153931152e-05, "loss": 0.0004, "reward": 0.8281250447034836, "reward_std": 0.3173343911767006, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 488 }, { "completion_length": 313.984375, "epoch": 0.12698824904239434, "grad_norm": 0.06040756031870842, "kl": 0.8816375881433487, "learning_rate": 9.9961160902091e-05, "loss": 0.0004, "reward": 0.7636719197034836, "reward_std": 0.20590942353010178, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.1855468787252903, "step": 489 }, { "completion_length": 345.578125, "epoch": 0.12724793871323767, "grad_norm": 0.06687922775745392, "kl": 0.9300952255725861, "learning_rate": 9.996099993211623e-05, "loss": 0.0005, "reward": 0.8562500476837158, "reward_std": 0.2957531735301018, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 490 }, { "completion_length": 330.671875, "epoch": 0.12750762838408103, "grad_norm": 0.06032703444361687, "kl": 0.9202833473682404, "learning_rate": 9.996083862938826e-05, "loss": 0.0005, "reward": 0.8550781756639481, "reward_std": 0.1995125338435173, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.19882812723517418, "step": 491 }, { "completion_length": 324.859375, "epoch": 0.12776731805492436, "grad_norm": 0.05831574648618698, "kl": 0.9312995225191116, "learning_rate": 9.99606769939082e-05, "loss": 0.0005, "reward": 0.9656250476837158, "reward_std": 0.15625, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 492 }, { "completion_length": 323.15625, "epoch": 0.1280270077257677, "grad_norm": 0.08509742468595505, "kl": 0.9111131876707077, "learning_rate": 9.996051502567709e-05, "loss": 0.0005, "reward": 0.9656250476837158, "reward_std": 0.2909187823534012, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 493 }, { "completion_length": 317.5, "epoch": 0.12828669739661105, "grad_norm": 0.033624667674303055, "kl": 0.9213976711034775, "learning_rate": 9.996035272469603e-05, "loss": 0.0005, "reward": 0.8562500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 494 }, { "completion_length": 323.5, "epoch": 0.1285463870674544, "grad_norm": 0.043426044285297394, "kl": 0.97125244140625, "learning_rate": 9.99601900909661e-05, "loss": 0.0005, "reward": 0.9968750476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 495 }, { "completion_length": 327.0625, "epoch": 0.12880607673829775, "grad_norm": 0.06975823640823364, "kl": 0.9850981831550598, "learning_rate": 9.996002712448838e-05, "loss": 0.0005, "reward": 0.9500000476837158, "reward_std": 0.2235843911767006, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 496 }, { "completion_length": 310.21875, "epoch": 0.12906576640914108, "grad_norm": 0.05101317912340164, "kl": 1.0551992058753967, "learning_rate": 9.995986382526396e-05, "loss": 0.0005, "reward": 0.9968750476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 497 }, { "completion_length": 343.375, "epoch": 0.1293254560799844, "grad_norm": 0.06366025656461716, "kl": 1.0292676985263824, "learning_rate": 9.995970019329391e-05, "loss": 0.0005, "reward": 0.8550781756639481, "reward_std": 0.1634281426668167, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.19882812723517418, "step": 498 }, { "completion_length": 345.421875, "epoch": 0.12958514575082777, "grad_norm": 0.0629991814494133, "kl": 1.269994467496872, "learning_rate": 9.995953622857934e-05, "loss": 0.0006, "reward": 0.8562500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 499 }, { "completion_length": 327.78125, "epoch": 0.1298448354216711, "grad_norm": 0.05267056077718735, "kl": 1.0048420131206512, "learning_rate": 9.995937193112135e-05, "loss": 0.0005, "reward": 0.8875000476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 500 }, { "completion_length": 348.75, "epoch": 0.13010452509251444, "grad_norm": 0.057104673236608505, "kl": 1.623704731464386, "learning_rate": 9.9959207300921e-05, "loss": 0.0008, "reward": 0.9812500476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 501 }, { "completion_length": 350.203125, "epoch": 0.1303642147633578, "grad_norm": 0.06417544931173325, "kl": 1.0287377834320068, "learning_rate": 9.995904233797942e-05, "loss": 0.0005, "reward": 0.9968750476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 502 }, { "completion_length": 357.9375, "epoch": 0.13062390443420113, "grad_norm": 0.03433471545577049, "kl": 0.8110226988792419, "learning_rate": 9.995887704229767e-05, "loss": 0.0004, "reward": 0.7937500476837158, "reward_std": 0.03608439117670059, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 503 }, { "completion_length": 375.09375, "epoch": 0.13088359410504447, "grad_norm": 0.049379460513591766, "kl": 0.9045803844928741, "learning_rate": 9.99587114138769e-05, "loss": 0.0005, "reward": 0.8875000476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 504 }, { "completion_length": 381.390625, "epoch": 0.13114328377588783, "grad_norm": 0.0568462535738945, "kl": 0.8245822489261627, "learning_rate": 9.995854545271818e-05, "loss": 0.0004, "reward": 0.9343750476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 505 }, { "completion_length": 357.4375, "epoch": 0.13140297344673116, "grad_norm": 0.06000378355383873, "kl": 1.0002551972866058, "learning_rate": 9.995837915882262e-05, "loss": 0.0005, "reward": 0.6687500402331352, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.46875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 506 }, { "completion_length": 392.390625, "epoch": 0.1316626631175745, "grad_norm": 0.02195489965379238, "kl": 0.8532946556806564, "learning_rate": 9.995821253219133e-05, "loss": 0.0004, "reward": 0.8562500402331352, "reward_std": 0.03608439117670059, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 507 }, { "completion_length": 354.65625, "epoch": 0.13192235278841785, "grad_norm": 0.030919784680008888, "kl": 0.9484735578298569, "learning_rate": 9.995804557282544e-05, "loss": 0.0005, "reward": 1.0281250476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 508 }, { "completion_length": 427.3125, "epoch": 0.13218204245926118, "grad_norm": 0.02095084637403488, "kl": 0.8456670790910721, "learning_rate": 9.995787828072602e-05, "loss": 0.0004, "reward": 0.8093750476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 509 }, { "completion_length": 401.6875, "epoch": 0.13244173213010452, "grad_norm": 0.05019533634185791, "kl": 0.9446804374456406, "learning_rate": 9.995771065589423e-05, "loss": 0.0005, "reward": 0.7468750476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.546875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 510 }, { "completion_length": 414.484375, "epoch": 0.13270142180094788, "grad_norm": 0.05607663467526436, "kl": 0.8686279058456421, "learning_rate": 9.995754269833116e-05, "loss": 0.0004, "reward": 0.7070312947034836, "reward_std": 0.10498128086328506, "rewards/spct_argmax_reward_func": 0.515625, "rewards/spct_format_reward_func": 0.1914062537252903, "step": 511 }, { "completion_length": 389.65625, "epoch": 0.1329611114717912, "grad_norm": 0.035392437130212784, "kl": 0.7820742875337601, "learning_rate": 9.995737440803792e-05, "loss": 0.0004, "reward": 0.6875000447034836, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.5, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 512 }, { "completion_length": 423.609375, "epoch": 0.13322080114263454, "grad_norm": 0.020456209778785706, "kl": 0.8893903642892838, "learning_rate": 9.995720578501563e-05, "loss": 0.0004, "reward": 0.7468750402331352, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.546875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 513 }, { "completion_length": 387.21875, "epoch": 0.1334804908134779, "grad_norm": 0.02718016691505909, "kl": 0.8881625235080719, "learning_rate": 9.995703682926545e-05, "loss": 0.0004, "reward": 0.7468750476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.546875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 514 }, { "completion_length": 421.3125, "epoch": 0.13374018048432124, "grad_norm": 0.0012608692049980164, "kl": 0.8252017796039581, "learning_rate": 9.995686754078848e-05, "loss": 0.0004, "reward": 0.8875000476837158, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 515 }, { "completion_length": 439.125, "epoch": 0.13399987015516457, "grad_norm": 0.023204263299703598, "kl": 0.8931468278169632, "learning_rate": 9.995669791958586e-05, "loss": 0.0004, "reward": 0.9078125357627869, "reward_std": 0.03608439117670059, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.18906250223517418, "step": 516 }, { "completion_length": 452.296875, "epoch": 0.13425955982600793, "grad_norm": 0.04532630741596222, "kl": 0.8192959874868393, "learning_rate": 9.995652796565869e-05, "loss": 0.0004, "reward": 0.7312500402331352, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.53125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 517 }, { "completion_length": 467.375, "epoch": 0.13451924949685126, "grad_norm": 0.0015882073203101754, "kl": 0.8386379778385162, "learning_rate": 9.995635767900813e-05, "loss": 0.0004, "reward": 0.8875000476837158, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 518 }, { "completion_length": 429.875, "epoch": 0.1347789391676946, "grad_norm": 0.03563825041055679, "kl": 0.7818548828363419, "learning_rate": 9.99561870596353e-05, "loss": 0.0004, "reward": 0.8875000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 519 }, { "completion_length": 379.21875, "epoch": 0.13503862883853796, "grad_norm": 0.0015590265393257141, "kl": 0.79245226085186, "learning_rate": 9.995601610754133e-05, "loss": 0.0004, "reward": 1.0750000476837158, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 520 }, { "completion_length": 427.765625, "epoch": 0.1352983185093813, "grad_norm": 0.05321358144283295, "kl": 0.8258067071437836, "learning_rate": 9.99558448227274e-05, "loss": 0.0004, "reward": 0.9343750476837158, "reward_std": 0.15625, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 521 }, { "completion_length": 404.53125, "epoch": 0.13555800818022462, "grad_norm": 0.027197517454624176, "kl": 0.8636741191148758, "learning_rate": 9.99556732051946e-05, "loss": 0.0004, "reward": 0.9949219226837158, "reward_std": 0.03515625, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 522 }, { "completion_length": 429.28125, "epoch": 0.13581769785106798, "grad_norm": 0.03945168852806091, "kl": 0.7570160627365112, "learning_rate": 9.995550125494409e-05, "loss": 0.0004, "reward": 0.7625000402331352, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 523 }, { "completion_length": 383.328125, "epoch": 0.1360773875219113, "grad_norm": 0.03833471238613129, "kl": 0.8505687266588211, "learning_rate": 9.995532897197702e-05, "loss": 0.0004, "reward": 0.8593750447034836, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 524 }, { "completion_length": 390.1875, "epoch": 0.13633707719275465, "grad_norm": 0.03937728330492973, "kl": 0.8316414654254913, "learning_rate": 9.995515635629456e-05, "loss": 0.0004, "reward": 1.0261719226837158, "reward_std": 0.07124064117670059, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 525 }, { "completion_length": 446.796875, "epoch": 0.136596766863598, "grad_norm": 0.05264423415064812, "kl": 0.7381372153759003, "learning_rate": 9.995498340789781e-05, "loss": 0.0004, "reward": 0.8093750476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 526 }, { "completion_length": 394.75, "epoch": 0.13685645653444134, "grad_norm": 0.0704149603843689, "kl": 0.8459805548191071, "learning_rate": 9.995481012678796e-05, "loss": 0.0004, "reward": 0.7625000476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 527 }, { "completion_length": 390.109375, "epoch": 0.13711614620528467, "grad_norm": 0.04306887835264206, "kl": 0.7836309671401978, "learning_rate": 9.995463651296615e-05, "loss": 0.0004, "reward": 0.9343750476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 528 }, { "completion_length": 346.828125, "epoch": 0.13737583587612803, "grad_norm": 0.03298880159854889, "kl": 0.8455045223236084, "learning_rate": 9.995446256643353e-05, "loss": 0.0004, "reward": 0.9929687976837158, "reward_std": 0.07184493914246559, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.19609375298023224, "step": 529 }, { "completion_length": 384.90625, "epoch": 0.13763552554697137, "grad_norm": 0.04963196814060211, "kl": 1.0451235920190811, "learning_rate": 9.995428828719129e-05, "loss": 0.0005, "reward": 0.9812500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 530 }, { "completion_length": 366.9375, "epoch": 0.1378952152178147, "grad_norm": 0.058664482086896896, "kl": 0.8767789900302887, "learning_rate": 9.995411367524055e-05, "loss": 0.0004, "reward": 0.9187500476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 531 }, { "completion_length": 364.140625, "epoch": 0.13815490488865806, "grad_norm": 0.0008218743605539203, "kl": 0.6955507844686508, "learning_rate": 9.99539387305825e-05, "loss": 0.0003, "reward": 0.875000037252903, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 532 }, { "completion_length": 380.140625, "epoch": 0.1384145945595014, "grad_norm": 0.04881472513079643, "kl": 0.7839880585670471, "learning_rate": 9.995376345321827e-05, "loss": 0.0004, "reward": 0.8250000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 533 }, { "completion_length": 390.421875, "epoch": 0.13867428423034472, "grad_norm": 0.019884193316102028, "kl": 0.7807821929454803, "learning_rate": 9.995358784314907e-05, "loss": 0.0004, "reward": 1.1843750476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.984375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 534 }, { "completion_length": 378.1875, "epoch": 0.13893397390118808, "grad_norm": 0.05999763309955597, "kl": 0.8366309106349945, "learning_rate": 9.995341190037607e-05, "loss": 0.0004, "reward": 0.8187500536441803, "reward_std": 0.06430421944241971, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.19375000149011612, "step": 535 }, { "completion_length": 361.125, "epoch": 0.13919366357203142, "grad_norm": 0.03340574726462364, "kl": 0.7785570919513702, "learning_rate": 9.99532356249004e-05, "loss": 0.0004, "reward": 0.9656250476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 536 }, { "completion_length": 361.296875, "epoch": 0.13945335324287478, "grad_norm": 0.024948887526988983, "kl": 0.7881710082292557, "learning_rate": 9.995305901672328e-05, "loss": 0.0004, "reward": 1.0437500476837158, "reward_std": 0.03608439117670059, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 537 }, { "completion_length": 344.4375, "epoch": 0.1397130429137181, "grad_norm": 0.0639127567410469, "kl": 0.8646752238273621, "learning_rate": 9.995288207584586e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 538 }, { "completion_length": 360.328125, "epoch": 0.13997273258456144, "grad_norm": 0.06814175099134445, "kl": 0.8896672129631042, "learning_rate": 9.995270480226932e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 539 }, { "completion_length": 377.125, "epoch": 0.1402324222554048, "grad_norm": 0.06798054277896881, "kl": 0.8516007512807846, "learning_rate": 9.995252719599483e-05, "loss": 0.0004, "reward": 0.8855469226837158, "reward_std": 0.1649906411767006, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 540 }, { "completion_length": 364.015625, "epoch": 0.14049211192624814, "grad_norm": 0.047826047986745834, "kl": 0.8218154460191727, "learning_rate": 9.995234925702362e-05, "loss": 0.0004, "reward": 0.7761719226837158, "reward_std": 0.09765625, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 541 }, { "completion_length": 380.15625, "epoch": 0.14075180159709147, "grad_norm": 0.0030466720927506685, "kl": 0.8721163868904114, "learning_rate": 9.995217098535681e-05, "loss": 0.0004, "reward": 0.8875000476837158, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 542 }, { "completion_length": 358.6875, "epoch": 0.14101149126793483, "grad_norm": 0.06575354933738708, "kl": 0.9036547541618347, "learning_rate": 9.995199238099564e-05, "loss": 0.0005, "reward": 0.9644531607627869, "reward_std": 0.1321781426668167, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.19882812723517418, "step": 543 }, { "completion_length": 358.171875, "epoch": 0.14127118093877816, "grad_norm": 0.05327659472823143, "kl": 0.8906537592411041, "learning_rate": 9.995181344394127e-05, "loss": 0.0004, "reward": 0.8437500447034836, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 544 }, { "completion_length": 385.15625, "epoch": 0.1415308706096215, "grad_norm": 0.07219736278057098, "kl": 0.7429822832345963, "learning_rate": 9.99516341741949e-05, "loss": 0.0004, "reward": 1.0906250476837158, "reward_std": 0.15625, "rewards/spct_argmax_reward_func": 0.890625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 545 }, { "completion_length": 359.015625, "epoch": 0.14179056028046486, "grad_norm": 0.07932280749082565, "kl": 0.7369190156459808, "learning_rate": 9.995145457175772e-05, "loss": 0.0004, "reward": 0.8406250476837158, "reward_std": 0.1923343911767006, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 546 }, { "completion_length": 331.53125, "epoch": 0.1420502499513082, "grad_norm": 0.05412004515528679, "kl": 0.8041577488183975, "learning_rate": 9.995127463663095e-05, "loss": 0.0004, "reward": 1.0750000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 547 }, { "completion_length": 343.15625, "epoch": 0.14230993962215152, "grad_norm": 0.04911421239376068, "kl": 0.7703281342983246, "learning_rate": 9.995109436881576e-05, "loss": 0.0004, "reward": 0.9656250476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 548 }, { "completion_length": 334.21875, "epoch": 0.14256962929299488, "grad_norm": 0.04327813535928726, "kl": 0.7432957589626312, "learning_rate": 9.995091376831335e-05, "loss": 0.0004, "reward": 1.1218750476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.921875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 549 }, { "completion_length": 338.25, "epoch": 0.14282931896383821, "grad_norm": 0.09037069976329803, "kl": 0.8222482204437256, "learning_rate": 9.995073283512494e-05, "loss": 0.0004, "reward": 0.8562500476837158, "reward_std": 0.30542195588350296, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 550 }, { "completion_length": 333.703125, "epoch": 0.14308900863468155, "grad_norm": 0.09052589535713196, "kl": 0.8443027585744858, "learning_rate": 9.995055156925173e-05, "loss": 0.0004, "reward": 0.9656250476837158, "reward_std": 0.1923343911767006, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 551 }, { "completion_length": 299.453125, "epoch": 0.1433486983055249, "grad_norm": 0.0854087769985199, "kl": 0.9070709943771362, "learning_rate": 9.995036997069491e-05, "loss": 0.0005, "reward": 0.8617188036441803, "reward_std": 0.13139688968658447, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.18984375149011612, "step": 552 }, { "completion_length": 338.46875, "epoch": 0.14360838797636824, "grad_norm": 0.07393091917037964, "kl": 0.7396240383386612, "learning_rate": 9.995018803945573e-05, "loss": 0.0004, "reward": 0.7937500476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 553 }, { "completion_length": 344.40625, "epoch": 0.14386807764721157, "grad_norm": 0.04623749852180481, "kl": 0.7319655120372772, "learning_rate": 9.995000577553537e-05, "loss": 0.0004, "reward": 0.9968750476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 554 }, { "completion_length": 322.578125, "epoch": 0.14412776731805493, "grad_norm": 0.06557784974575043, "kl": 0.6888992637395859, "learning_rate": 9.994982317893504e-05, "loss": 0.0003, "reward": 1.0593750476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.859375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 555 }, { "completion_length": 333.484375, "epoch": 0.14438745698889827, "grad_norm": 0.06242746487259865, "kl": 0.7366724610328674, "learning_rate": 9.994964024965597e-05, "loss": 0.0004, "reward": 0.9187500476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 556 }, { "completion_length": 334.65625, "epoch": 0.1446471466597416, "grad_norm": 0.07174563407897949, "kl": 0.8517805486917496, "learning_rate": 9.994945698769938e-05, "loss": 0.0004, "reward": 0.9277344197034836, "reward_std": 0.17058453056961298, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.193359375, "step": 557 }, { "completion_length": 364.0, "epoch": 0.14490683633058496, "grad_norm": 0.036590367555618286, "kl": 0.7256409227848053, "learning_rate": 9.994927339306647e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 558 }, { "completion_length": 333.125, "epoch": 0.1451665260014283, "grad_norm": 0.05972026288509369, "kl": 0.7808238565921783, "learning_rate": 9.99490894657585e-05, "loss": 0.0004, "reward": 1.0718750357627869, "reward_std": 0.10221333056688309, "rewards/spct_argmax_reward_func": 0.875, "rewards/spct_format_reward_func": 0.19687500223517418, "step": 559 }, { "completion_length": 361.078125, "epoch": 0.14542621567227162, "grad_norm": 0.04750610888004303, "kl": 0.8216852694749832, "learning_rate": 9.994890520577667e-05, "loss": 0.0004, "reward": 0.9968750476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 560 }, { "completion_length": 376.78125, "epoch": 0.14568590534311499, "grad_norm": 0.033486682921648026, "kl": 0.7377017140388489, "learning_rate": 9.99487206131222e-05, "loss": 0.0004, "reward": 0.9656250476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 561 }, { "completion_length": 400.453125, "epoch": 0.14594559501395832, "grad_norm": 0.05402996763586998, "kl": 0.6992240846157074, "learning_rate": 9.994853568779634e-05, "loss": 0.0003, "reward": 0.9500000476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 562 }, { "completion_length": 420.796875, "epoch": 0.14620528468480165, "grad_norm": 0.053882841020822525, "kl": 0.7289632707834244, "learning_rate": 9.99483504298003e-05, "loss": 0.0004, "reward": 0.9500000476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 563 }, { "completion_length": 374.015625, "epoch": 0.146464974355645, "grad_norm": 0.059464920312166214, "kl": 0.6794900745153427, "learning_rate": 9.994816483913534e-05, "loss": 0.0003, "reward": 0.9343750476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 564 }, { "completion_length": 426.734375, "epoch": 0.14672466402648834, "grad_norm": 0.030909806489944458, "kl": 0.6237600296735764, "learning_rate": 9.994797891580268e-05, "loss": 0.0003, "reward": 1.0750000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 565 }, { "completion_length": 483.921875, "epoch": 0.14698435369733168, "grad_norm": 0.04378826916217804, "kl": 0.6076885163784027, "learning_rate": 9.994779265980354e-05, "loss": 0.0003, "reward": 0.8250000476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 566 }, { "completion_length": 430.15625, "epoch": 0.14724404336817504, "grad_norm": 0.047039102762937546, "kl": 0.7405146807432175, "learning_rate": 9.99476060711392e-05, "loss": 0.0004, "reward": 1.0125000476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 567 }, { "completion_length": 481.5625, "epoch": 0.14750373303901837, "grad_norm": 0.029251301661133766, "kl": 0.5958460122346878, "learning_rate": 9.994741914981087e-05, "loss": 0.0003, "reward": 0.9183594286441803, "reward_std": 0.06276902928948402, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.19960937649011612, "step": 568 }, { "completion_length": 436.796875, "epoch": 0.1477634227098617, "grad_norm": 0.051042042672634125, "kl": 0.7120942622423172, "learning_rate": 9.994723189581983e-05, "loss": 0.0004, "reward": 0.7671875357627869, "reward_std": 0.23808756470680237, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.18906250223517418, "step": 569 }, { "completion_length": 444.515625, "epoch": 0.14802311238070506, "grad_norm": 0.04876108840107918, "kl": 0.6625997424125671, "learning_rate": 9.994704430916728e-05, "loss": 0.0003, "reward": 0.9968750476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 570 }, { "completion_length": 469.25, "epoch": 0.1482828020515484, "grad_norm": 0.04812242463231087, "kl": 0.7221941947937012, "learning_rate": 9.99468563898545e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 571 }, { "completion_length": 491.53125, "epoch": 0.14854249172239173, "grad_norm": 0.0438767746090889, "kl": 0.548522099852562, "learning_rate": 9.994666813788274e-05, "loss": 0.0003, "reward": 0.9812500476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 572 }, { "completion_length": 472.921875, "epoch": 0.1488021813932351, "grad_norm": 0.03702402487397194, "kl": 0.659598708152771, "learning_rate": 9.994647955325326e-05, "loss": 0.0003, "reward": 0.9812500476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 573 }, { "completion_length": 435.234375, "epoch": 0.14906187106407842, "grad_norm": 0.031161926686763763, "kl": 0.7050413936376572, "learning_rate": 9.99462906359673e-05, "loss": 0.0004, "reward": 0.8250000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 574 }, { "completion_length": 454.828125, "epoch": 0.14932156073492178, "grad_norm": 0.05843861401081085, "kl": 0.7151681184768677, "learning_rate": 9.994610138602612e-05, "loss": 0.0004, "reward": 0.9562500417232513, "reward_std": 0.16904378682374954, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.19062499701976776, "step": 575 }, { "completion_length": 451.546875, "epoch": 0.14958125040576511, "grad_norm": 24.508785247802734, "kl": 61.45370349287987, "learning_rate": 9.994591180343098e-05, "loss": 0.0307, "reward": 0.7906250357627869, "reward_std": 0.21102427318692207, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.18125000223517418, "step": 576 }, { "completion_length": 451.421875, "epoch": 0.14984094007660845, "grad_norm": 0.023676205426454544, "kl": 0.6299633234739304, "learning_rate": 9.994572188818316e-05, "loss": 0.0003, "reward": 0.9187500476837158, "reward_std": 0.03608439117670059, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 577 }, { "completion_length": 409.640625, "epoch": 0.1501006297474518, "grad_norm": 0.043090857565402985, "kl": 0.7423471510410309, "learning_rate": 9.994553164028391e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 578 }, { "completion_length": 437.90625, "epoch": 0.15036031941829514, "grad_norm": 0.045085687190294266, "kl": 0.659099206328392, "learning_rate": 9.99453410597345e-05, "loss": 0.0003, "reward": 0.9343750476837158, "reward_std": 0.15625, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 579 }, { "completion_length": 392.65625, "epoch": 0.15062000908913847, "grad_norm": 0.001036419183947146, "kl": 0.7175569236278534, "learning_rate": 9.994515014653619e-05, "loss": 0.0004, "reward": 0.8796875476837158, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 580 }, { "completion_length": 437.640625, "epoch": 0.15087969875998183, "grad_norm": 0.03945207968354225, "kl": 0.6263563483953476, "learning_rate": 9.994495890069027e-05, "loss": 0.0003, "reward": 0.9656250476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 581 }, { "completion_length": 429.234375, "epoch": 0.15113938843082517, "grad_norm": 0.04428955167531967, "kl": 0.6402536481618881, "learning_rate": 9.9944767322198e-05, "loss": 0.0003, "reward": 0.8718750402331352, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 582 }, { "completion_length": 443.53125, "epoch": 0.1513990781016685, "grad_norm": 0.033709969371557236, "kl": 0.6466110199689865, "learning_rate": 9.994457541106065e-05, "loss": 0.0003, "reward": 0.8562500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 583 }, { "completion_length": 429.71875, "epoch": 0.15165876777251186, "grad_norm": 0.0383843369781971, "kl": 0.6913144737482071, "learning_rate": 9.994438316727953e-05, "loss": 0.0003, "reward": 0.8718750476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 584 }, { "completion_length": 451.75, "epoch": 0.1519184574433552, "grad_norm": 0.03242205083370209, "kl": 0.60165835916996, "learning_rate": 9.994419059085587e-05, "loss": 0.0003, "reward": 0.8406250476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 585 }, { "completion_length": 437.734375, "epoch": 0.15217814711419853, "grad_norm": 0.022669371217489243, "kl": 0.6212546080350876, "learning_rate": 9.994399768179098e-05, "loss": 0.0003, "reward": 1.0437500476837158, "reward_std": 0.03608439117670059, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 586 }, { "completion_length": 399.25, "epoch": 0.15243783678504189, "grad_norm": 0.029800934717059135, "kl": 0.644634023308754, "learning_rate": 9.994380444008617e-05, "loss": 0.0003, "reward": 0.9656250476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 587 }, { "completion_length": 361.875, "epoch": 0.15269752645588522, "grad_norm": 0.027565056458115578, "kl": 0.668412446975708, "learning_rate": 9.99436108657427e-05, "loss": 0.0003, "reward": 0.8750000447034836, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 588 }, { "completion_length": 371.1875, "epoch": 0.15295721612672855, "grad_norm": 0.06677054613828659, "kl": 0.8027912974357605, "learning_rate": 9.994341695876186e-05, "loss": 0.0004, "reward": 0.7781250476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 589 }, { "completion_length": 415.875, "epoch": 0.1532169057975719, "grad_norm": 0.059131041169166565, "kl": 0.7437072247266769, "learning_rate": 9.994322271914494e-05, "loss": 0.0004, "reward": 0.8156250342726707, "reward_std": 0.12680421979166567, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.19062500074505806, "step": 590 }, { "completion_length": 399.984375, "epoch": 0.15347659546841524, "grad_norm": 0.05190002918243408, "kl": 0.7396757006645203, "learning_rate": 9.994302814689324e-05, "loss": 0.0004, "reward": 0.9738281667232513, "reward_std": 0.12578125001164153, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.19257812574505806, "step": 591 }, { "completion_length": 395.546875, "epoch": 0.15373628513925858, "grad_norm": 0.047314297407865524, "kl": 0.6364656835794449, "learning_rate": 9.994283324200804e-05, "loss": 0.0003, "reward": 0.8562500476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 592 }, { "completion_length": 410.609375, "epoch": 0.15399597481010194, "grad_norm": 0.043155185878276825, "kl": 0.6852645426988602, "learning_rate": 9.994263800449067e-05, "loss": 0.0003, "reward": 0.8406250402331352, "reward_std": 0.15625, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 593 }, { "completion_length": 380.34375, "epoch": 0.15425566448094527, "grad_norm": 0.0437658429145813, "kl": 0.7479041069746017, "learning_rate": 9.99424424343424e-05, "loss": 0.0004, "reward": 0.9343750476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 594 }, { "completion_length": 399.125, "epoch": 0.1545153541517886, "grad_norm": 0.04888404160737991, "kl": 0.6927367299795151, "learning_rate": 9.994224653156454e-05, "loss": 0.0003, "reward": 0.9656250476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 595 }, { "completion_length": 350.859375, "epoch": 0.15477504382263196, "grad_norm": 0.01697734370827675, "kl": 0.7349214851856232, "learning_rate": 9.994205029615842e-05, "loss": 0.0004, "reward": 0.9968750476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 596 }, { "completion_length": 354.671875, "epoch": 0.1550347334934753, "grad_norm": 0.0475953035056591, "kl": 0.712427407503128, "learning_rate": 9.994185372812531e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 597 }, { "completion_length": 351.265625, "epoch": 0.15529442316431863, "grad_norm": 0.05779649689793587, "kl": 0.7266118377447128, "learning_rate": 9.994165682746655e-05, "loss": 0.0004, "reward": 1.0593750476837158, "reward_std": 0.15625, "rewards/spct_argmax_reward_func": 0.859375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 598 }, { "completion_length": 360.6875, "epoch": 0.155554112835162, "grad_norm": 0.041866280138492584, "kl": 0.7314080148935318, "learning_rate": 9.994145959418341e-05, "loss": 0.0004, "reward": 0.9343750476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 599 }, { "completion_length": 349.609375, "epoch": 0.15581380250600532, "grad_norm": 0.05718596279621124, "kl": 0.7463361173868179, "learning_rate": 9.994126202827725e-05, "loss": 0.0004, "reward": 1.0281250476837158, "reward_std": 0.15625, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 600 }, { "completion_length": 376.625, "epoch": 0.15607349217684865, "grad_norm": 0.013417850248515606, "kl": 0.6952438056468964, "learning_rate": 9.994106412974936e-05, "loss": 0.0003, "reward": 0.9968750476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 601 }, { "completion_length": 345.015625, "epoch": 0.15633318184769202, "grad_norm": 0.060515813529491425, "kl": 0.933880090713501, "learning_rate": 9.994086589860107e-05, "loss": 0.0005, "reward": 0.7695312947034836, "reward_std": 0.19389688968658447, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.1914062537252903, "step": 602 }, { "completion_length": 423.328125, "epoch": 0.15659287151853535, "grad_norm": 0.04787173494696617, "kl": 0.6842610538005829, "learning_rate": 9.99406673348337e-05, "loss": 0.0003, "reward": 1.0125000476837158, "reward_std": 0.17075317353010178, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 603 }, { "completion_length": 369.0, "epoch": 0.15685256118937868, "grad_norm": 0.0491122268140316, "kl": 0.7776623368263245, "learning_rate": 9.994046843844855e-05, "loss": 0.0004, "reward": 0.9656250476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 604 }, { "completion_length": 416.34375, "epoch": 0.15711225086022204, "grad_norm": 0.05215613543987274, "kl": 0.7216640114784241, "learning_rate": 9.994026920944697e-05, "loss": 0.0004, "reward": 1.0593750476837158, "reward_std": 0.1923343911767006, "rewards/spct_argmax_reward_func": 0.859375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 605 }, { "completion_length": 382.609375, "epoch": 0.15737194053106537, "grad_norm": 0.044661957770586014, "kl": 0.6854386478662491, "learning_rate": 9.994006964783026e-05, "loss": 0.0003, "reward": 1.1375000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.9375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 606 }, { "completion_length": 439.140625, "epoch": 0.1576316302019087, "grad_norm": 0.05998518317937851, "kl": 0.7000111192464828, "learning_rate": 9.99398697535998e-05, "loss": 0.0004, "reward": 0.7625000402331352, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 607 }, { "completion_length": 405.421875, "epoch": 0.15789131987275207, "grad_norm": 0.04706816375255585, "kl": 0.6800902783870697, "learning_rate": 9.993966952675686e-05, "loss": 0.0003, "reward": 0.9500000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 608 }, { "completion_length": 381.625, "epoch": 0.1581510095435954, "grad_norm": 0.10397231578826904, "kl": 1.6594675034284592, "learning_rate": 9.993946896730282e-05, "loss": 0.0008, "reward": 0.9578125476837158, "reward_std": 0.21102426946163177, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 609 }, { "completion_length": 407.15625, "epoch": 0.15841069921443873, "grad_norm": 0.10576917231082916, "kl": 2.9076904207468033, "learning_rate": 9.9939268075239e-05, "loss": 0.0015, "reward": 0.9187500476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 610 }, { "completion_length": 415.84375, "epoch": 0.1586703888852821, "grad_norm": 0.05874104052782059, "kl": 0.7372642308473587, "learning_rate": 9.993906685056673e-05, "loss": 0.0004, "reward": 0.9324219226837158, "reward_std": 0.16830649226903915, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 611 }, { "completion_length": 401.8125, "epoch": 0.15893007855612543, "grad_norm": 0.06319743394851685, "kl": 0.7761110216379166, "learning_rate": 9.993886529328736e-05, "loss": 0.0004, "reward": 0.7781250402331352, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 612 }, { "completion_length": 414.40625, "epoch": 0.15918976822696876, "grad_norm": 0.03632583096623421, "kl": 0.7721259742975235, "learning_rate": 9.993866340340222e-05, "loss": 0.0004, "reward": 0.9500000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 613 }, { "completion_length": 376.640625, "epoch": 0.15944945789781212, "grad_norm": 0.06434360146522522, "kl": 0.8246421962976456, "learning_rate": 9.993846118091268e-05, "loss": 0.0004, "reward": 0.9414062947034836, "reward_std": 0.17231567203998566, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.19140625, "step": 614 }, { "completion_length": 394.90625, "epoch": 0.15970914756865545, "grad_norm": 0.05279766768217087, "kl": 0.7169864773750305, "learning_rate": 9.993825862582005e-05, "loss": 0.0004, "reward": 0.9187500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 615 }, { "completion_length": 362.109375, "epoch": 0.1599688372394988, "grad_norm": 0.04303775355219841, "kl": 0.809149369597435, "learning_rate": 9.993805573812571e-05, "loss": 0.0004, "reward": 1.0437500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 616 }, { "completion_length": 392.546875, "epoch": 0.16022852691034215, "grad_norm": 0.06262785196304321, "kl": 0.7505749613046646, "learning_rate": 9.9937852517831e-05, "loss": 0.0004, "reward": 0.9167969226837158, "reward_std": 0.23715942353010178, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 617 }, { "completion_length": 414.0625, "epoch": 0.16048821658118548, "grad_norm": 0.05496448650956154, "kl": 0.785573199391365, "learning_rate": 9.993764896493728e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.17075317353010178, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 618 }, { "completion_length": 371.234375, "epoch": 0.16074790625202884, "grad_norm": 0.05746039003133774, "kl": 0.7864755094051361, "learning_rate": 9.993744507944589e-05, "loss": 0.0004, "reward": 0.9500000476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 619 }, { "completion_length": 413.21875, "epoch": 0.16100759592287217, "grad_norm": 0.05304551124572754, "kl": 0.6381707787513733, "learning_rate": 9.99372408613582e-05, "loss": 0.0003, "reward": 0.8593750447034836, "reward_std": 0.1923343911767006, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 620 }, { "completion_length": 418.84375, "epoch": 0.1612672855937155, "grad_norm": 0.04248740151524544, "kl": 0.8014712780714035, "learning_rate": 9.993703631067558e-05, "loss": 0.0004, "reward": 1.0125000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 621 }, { "completion_length": 394.125, "epoch": 0.16152697526455886, "grad_norm": 0.05814976617693901, "kl": 0.6667888313531876, "learning_rate": 9.993683142739937e-05, "loss": 0.0003, "reward": 0.9343750476837158, "reward_std": 0.1923343911767006, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 622 }, { "completion_length": 414.78125, "epoch": 0.1617866649354022, "grad_norm": 0.07615821063518524, "kl": 0.697838693857193, "learning_rate": 9.993662621153095e-05, "loss": 0.0003, "reward": 0.6289062947034836, "reward_std": 0.22514688223600388, "rewards/spct_argmax_reward_func": 0.4375, "rewards/spct_format_reward_func": 0.1914062537252903, "step": 623 }, { "completion_length": 413.0, "epoch": 0.16204635460624553, "grad_norm": 0.058427657932043076, "kl": 0.7958896160125732, "learning_rate": 9.993642066307168e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 624 }, { "completion_length": 377.328125, "epoch": 0.1623060442770889, "grad_norm": 0.0534265972673893, "kl": 0.6585383713245392, "learning_rate": 9.993621478202292e-05, "loss": 0.0003, "reward": 0.7812500447034836, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 625 }, { "completion_length": 410.375, "epoch": 0.16256573394793222, "grad_norm": 0.06434708088636398, "kl": 0.686640128493309, "learning_rate": 9.993600856838607e-05, "loss": 0.0003, "reward": 0.9343750476837158, "reward_std": 0.1923343911767006, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 626 }, { "completion_length": 399.671875, "epoch": 0.16282542361877556, "grad_norm": 0.04469718039035797, "kl": 0.7151189744472504, "learning_rate": 9.993580202216247e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 627 }, { "completion_length": 441.3125, "epoch": 0.16308511328961892, "grad_norm": 0.06711231917142868, "kl": 0.7714115083217621, "learning_rate": 9.993559514335353e-05, "loss": 0.0004, "reward": 0.8941406756639481, "reward_std": 0.176368810236454, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.19101562723517418, "step": 628 }, { "completion_length": 428.34375, "epoch": 0.16334480296046225, "grad_norm": 0.04302092269062996, "kl": 0.7213712632656097, "learning_rate": 9.993538793196059e-05, "loss": 0.0004, "reward": 0.9187500476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 629 }, { "completion_length": 433.703125, "epoch": 0.16360449263130558, "grad_norm": 0.06118395924568176, "kl": 0.7445573657751083, "learning_rate": 9.993518038798506e-05, "loss": 0.0004, "reward": 1.0281250476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 630 }, { "completion_length": 446.265625, "epoch": 0.16386418230214894, "grad_norm": 0.0671529769897461, "kl": 0.7385834604501724, "learning_rate": 9.99349725114283e-05, "loss": 0.0004, "reward": 0.8679687976837158, "reward_std": 0.24259810894727707, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.19609375298023224, "step": 631 }, { "completion_length": 434.03125, "epoch": 0.16412387197299227, "grad_norm": 0.05915847420692444, "kl": 0.7092148214578629, "learning_rate": 9.993476430229171e-05, "loss": 0.0004, "reward": 1.0433594286441803, "reward_std": 0.17153442651033401, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.19960937649011612, "step": 632 }, { "completion_length": 477.3125, "epoch": 0.1643835616438356, "grad_norm": 0.034308236092329025, "kl": 0.6898788809776306, "learning_rate": 9.993455576057669e-05, "loss": 0.0003, "reward": 0.9812500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 633 }, { "completion_length": 444.96875, "epoch": 0.16464325131467897, "grad_norm": 0.049405716359615326, "kl": 0.963413342833519, "learning_rate": 9.993434688628458e-05, "loss": 0.0005, "reward": 1.0417969226837158, "reward_std": 0.12890625, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 634 }, { "completion_length": 453.4375, "epoch": 0.1649029409855223, "grad_norm": 0.037749066948890686, "kl": 0.6969957053661346, "learning_rate": 9.993413767941683e-05, "loss": 0.0003, "reward": 0.8406250476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 635 }, { "completion_length": 482.6875, "epoch": 0.16516263065636563, "grad_norm": 0.0501139834523201, "kl": 0.6667153835296631, "learning_rate": 9.99339281399748e-05, "loss": 0.0003, "reward": 1.0437500476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 636 }, { "completion_length": 473.203125, "epoch": 0.165422320327209, "grad_norm": 33.083595275878906, "kl": 666.0720947682858, "learning_rate": 9.993371826795987e-05, "loss": 0.333, "reward": 0.8550781607627869, "reward_std": 0.1995125338435173, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.19882812723517418, "step": 637 }, { "completion_length": 493.0625, "epoch": 0.16568200999805233, "grad_norm": 0.04576968401670456, "kl": 0.636703722178936, "learning_rate": 9.99335080633735e-05, "loss": 0.0003, "reward": 0.8015625476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 638 }, { "completion_length": 481.265625, "epoch": 0.16594169966889566, "grad_norm": 0.044124897569417953, "kl": 0.7189852297306061, "learning_rate": 9.993329752621702e-05, "loss": 0.0004, "reward": 0.9656250476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 639 }, { "completion_length": 509.640625, "epoch": 0.16620138933973902, "grad_norm": 0.05379362404346466, "kl": 0.7043949216604233, "learning_rate": 9.993308665649186e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 640 }, { "completion_length": 479.3125, "epoch": 0.16646107901058235, "grad_norm": 0.07884204387664795, "kl": 0.7591642439365387, "learning_rate": 9.993287545419944e-05, "loss": 0.0004, "reward": 0.9558594077825546, "reward_std": 0.20304188132286072, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.19023437798023224, "step": 641 }, { "completion_length": 508.21875, "epoch": 0.16672076868142569, "grad_norm": 0.05506615713238716, "kl": 0.6881031394004822, "learning_rate": 9.993266391934115e-05, "loss": 0.0003, "reward": 0.9355469197034836, "reward_std": 0.20107503235340118, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.1855468787252903, "step": 642 }, { "completion_length": 477.546875, "epoch": 0.16698045835226905, "grad_norm": 0.03239639103412628, "kl": 0.6706093400716782, "learning_rate": 9.993245205191839e-05, "loss": 0.0003, "reward": 0.9187500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 643 }, { "completion_length": 433.03125, "epoch": 0.16724014802311238, "grad_norm": 0.05321906879544258, "kl": 0.9145384579896927, "learning_rate": 9.993223985193259e-05, "loss": 0.0005, "reward": 0.9656250476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 644 }, { "completion_length": 462.71875, "epoch": 0.1674998376939557, "grad_norm": 0.04903215914964676, "kl": 0.8203199356794357, "learning_rate": 9.993202731938516e-05, "loss": 0.0004, "reward": 0.9929687976837158, "reward_std": 0.10792933031916618, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.19609375298023224, "step": 645 }, { "completion_length": 447.34375, "epoch": 0.16775952736479907, "grad_norm": 0.05084645003080368, "kl": 0.7640218734741211, "learning_rate": 9.993181445427751e-05, "loss": 0.0004, "reward": 0.9953125715255737, "reward_std": 0.13232100754976273, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.19843750074505806, "step": 646 }, { "completion_length": 453.984375, "epoch": 0.1680192170356424, "grad_norm": 0.05692630633711815, "kl": 0.9810086041688919, "learning_rate": 9.993160125661106e-05, "loss": 0.0005, "reward": 1.0218750536441803, "reward_std": 0.06913860887289047, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.19375000149011612, "step": 647 }, { "completion_length": 491.71875, "epoch": 0.16827890670648574, "grad_norm": 0.05432989448308945, "kl": 0.7578971982002258, "learning_rate": 9.993138772638724e-05, "loss": 0.0004, "reward": 0.8875000476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 648 }, { "completion_length": 480.765625, "epoch": 0.1685385963773291, "grad_norm": 0.07028412073850632, "kl": 0.8587341606616974, "learning_rate": 9.993117386360745e-05, "loss": 0.0004, "reward": 0.9597656726837158, "reward_std": 0.24199381470680237, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.19414062798023224, "step": 649 }, { "completion_length": 456.625, "epoch": 0.16879828604817243, "grad_norm": 0.07407023012638092, "kl": 0.9093944579362869, "learning_rate": 9.993095966827313e-05, "loss": 0.0005, "reward": 0.9175781756639481, "reward_std": 0.2259281426668167, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.19882812723517418, "step": 650 }, { "completion_length": 428.25, "epoch": 0.16905797571901576, "grad_norm": 0.10051996260881424, "kl": 1.0270016342401505, "learning_rate": 9.99307451403857e-05, "loss": 0.0005, "reward": 0.8460938036441803, "reward_std": 0.17231567378621548, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.18984375149011612, "step": 651 }, { "completion_length": 439.65625, "epoch": 0.16931766538985912, "grad_norm": 0.0538654550909996, "kl": 0.8904881924390793, "learning_rate": 9.993053027994658e-05, "loss": 0.0004, "reward": 0.9500000476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 652 }, { "completion_length": 478.09375, "epoch": 0.16957735506070246, "grad_norm": 0.06323438882827759, "kl": 0.909827783703804, "learning_rate": 9.993031508695723e-05, "loss": 0.0005, "reward": 0.9500000476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 653 }, { "completion_length": 487.21875, "epoch": 0.16983704473154582, "grad_norm": 0.07777228951454163, "kl": 0.8589666336774826, "learning_rate": 9.993009956141905e-05, "loss": 0.0004, "reward": 0.7937500476837158, "reward_std": 0.2596687823534012, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 654 }, { "completion_length": 452.28125, "epoch": 0.17009673440238915, "grad_norm": 0.06958256661891937, "kl": 0.8922757506370544, "learning_rate": 9.992988370333349e-05, "loss": 0.0004, "reward": 0.8816406726837158, "reward_std": 0.20503151416778564, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19414062798023224, "step": 655 }, { "completion_length": 475.921875, "epoch": 0.17035642407323248, "grad_norm": 0.35555499792099, "kl": 3.527644917368889, "learning_rate": 9.992966751270201e-05, "loss": 0.0018, "reward": 0.8867188096046448, "reward_std": 0.1359066665172577, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.1992187537252903, "step": 656 }, { "completion_length": 483.984375, "epoch": 0.17061611374407584, "grad_norm": 0.05635622888803482, "kl": 0.7931375801563263, "learning_rate": 9.992945098952601e-05, "loss": 0.0004, "reward": 0.8851562887430191, "reward_std": 0.10327189136296511, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19765625521540642, "step": 657 }, { "completion_length": 500.171875, "epoch": 0.17087580341491918, "grad_norm": 0.04367845505475998, "kl": 0.7740752100944519, "learning_rate": 9.992923413380694e-05, "loss": 0.0004, "reward": 0.8562500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 658 }, { "completion_length": 513.5, "epoch": 0.1711354930857625, "grad_norm": 0.06065117195248604, "kl": 0.7909809947013855, "learning_rate": 9.992901694554628e-05, "loss": 0.0004, "reward": 0.7296875417232513, "reward_std": 0.19905882328748703, "rewards/spct_argmax_reward_func": 0.53125, "rewards/spct_format_reward_func": 0.19843750447034836, "step": 659 }, { "completion_length": 532.25, "epoch": 0.17139518275660587, "grad_norm": 0.06107771769165993, "kl": 0.8452366441488266, "learning_rate": 9.992879942474545e-05, "loss": 0.0004, "reward": 0.9792969226837158, "reward_std": 0.1649906486272812, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 660 }, { "completion_length": 559.734375, "epoch": 0.1716548724274492, "grad_norm": 0.06968287378549576, "kl": 0.969861775636673, "learning_rate": 9.992858157140588e-05, "loss": 0.0005, "reward": 0.8933594226837158, "reward_std": 0.14176402078010142, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.19023437425494194, "step": 661 }, { "completion_length": 620.375, "epoch": 0.17191456209829253, "grad_norm": 0.049453962594270706, "kl": 0.7733497619628906, "learning_rate": 9.992836338552906e-05, "loss": 0.0004, "reward": 0.9792969226837158, "reward_std": 0.16347210109233856, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 662 }, { "completion_length": 589.875, "epoch": 0.1721742517691359, "grad_norm": 0.07487046718597412, "kl": 0.9360682368278503, "learning_rate": 9.992814486711643e-05, "loss": 0.0005, "reward": 1.0578125417232513, "reward_std": 0.16780882328748703, "rewards/spct_argmax_reward_func": 0.859375, "rewards/spct_format_reward_func": 0.19843750447034836, "step": 663 }, { "completion_length": 617.15625, "epoch": 0.17243394143997923, "grad_norm": 0.05497758463025093, "kl": 0.770787388086319, "learning_rate": 9.992792601616943e-05, "loss": 0.0004, "reward": 0.8875000476837158, "reward_std": 0.23325317353010178, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 664 }, { "completion_length": 617.390625, "epoch": 0.17269363111082256, "grad_norm": 0.04593600332736969, "kl": 0.8565282225608826, "learning_rate": 9.992770683268951e-05, "loss": 0.0004, "reward": 0.9343750476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 665 }, { "completion_length": 637.609375, "epoch": 0.17295332078166592, "grad_norm": 0.06420119106769562, "kl": 0.8773506432771683, "learning_rate": 9.992748731667817e-05, "loss": 0.0004, "reward": 0.9601562917232513, "reward_std": 0.308223120868206, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.19453125447034836, "step": 666 }, { "completion_length": 677.90625, "epoch": 0.17321301045250925, "grad_norm": 0.05653582513332367, "kl": 0.9371504634618759, "learning_rate": 9.992726746813684e-05, "loss": 0.0005, "reward": 0.8230469226837158, "reward_std": 0.2635750249028206, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 667 }, { "completion_length": 647.078125, "epoch": 0.17347270012335259, "grad_norm": 0.15268082916736603, "kl": 2.4877490550279617, "learning_rate": 9.9927047287067e-05, "loss": 0.0012, "reward": 0.8480469137430191, "reward_std": 0.20092814415693283, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.17617188021540642, "step": 668 }, { "completion_length": 665.4375, "epoch": 0.17373238979419595, "grad_norm": 0.06072231009602547, "kl": 0.8549028486013412, "learning_rate": 9.99268267734701e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.2957531735301018, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 669 }, { "completion_length": 625.0, "epoch": 0.17399207946503928, "grad_norm": 0.1430898755788803, "kl": 1.2006125152111053, "learning_rate": 9.992660592734762e-05, "loss": 0.0006, "reward": 0.8285156786441803, "reward_std": 0.13061564118834212, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.18789063021540642, "step": 670 }, { "completion_length": 642.296875, "epoch": 0.1742517691358826, "grad_norm": 0.05480959266424179, "kl": 0.8396151512861252, "learning_rate": 9.992638474870104e-05, "loss": 0.0004, "reward": 0.7585937976837158, "reward_std": 0.21134811267256737, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.19609375298023224, "step": 671 }, { "completion_length": 606.8125, "epoch": 0.17451145880672597, "grad_norm": 0.06829159706830978, "kl": 1.3002463430166245, "learning_rate": 9.992616323753181e-05, "loss": 0.0007, "reward": 1.0125000476837158, "reward_std": 0.2596687823534012, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 672 }, { "completion_length": 641.765625, "epoch": 0.1747711484775693, "grad_norm": 0.048704568296670914, "kl": 0.8017003536224365, "learning_rate": 9.992594139384144e-05, "loss": 0.0004, "reward": 0.9968750476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 673 }, { "completion_length": 680.828125, "epoch": 0.17503083814841264, "grad_norm": 0.059265706688165665, "kl": 0.8163574486970901, "learning_rate": 9.992571921763136e-05, "loss": 0.0004, "reward": 0.9187500476837158, "reward_std": 0.30542195588350296, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 674 }, { "completion_length": 640.0, "epoch": 0.175290527819256, "grad_norm": 0.057180725038051605, "kl": 0.8275455236434937, "learning_rate": 9.992549670890308e-05, "loss": 0.0004, "reward": 0.8562500476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 675 }, { "completion_length": 630.109375, "epoch": 0.17555021749009933, "grad_norm": 0.0803978368639946, "kl": 0.8498388528823853, "learning_rate": 9.992527386765809e-05, "loss": 0.0004, "reward": 0.9031250476837158, "reward_std": 0.3270031735301018, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 676 }, { "completion_length": 653.109375, "epoch": 0.17580990716094266, "grad_norm": 0.05256145820021629, "kl": 0.813862070441246, "learning_rate": 9.992505069389785e-05, "loss": 0.0004, "reward": 0.9343750476837158, "reward_std": 0.15625, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 677 }, { "completion_length": 688.21875, "epoch": 0.17606959683178602, "grad_norm": 0.042021263390779495, "kl": 0.7546396553516388, "learning_rate": 9.992482718762386e-05, "loss": 0.0004, "reward": 0.9187500476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 678 }, { "completion_length": 685.203125, "epoch": 0.17632928650262936, "grad_norm": 0.05580262094736099, "kl": 0.7266627997159958, "learning_rate": 9.992460334883761e-05, "loss": 0.0004, "reward": 1.0585937947034836, "reward_std": 0.19389688968658447, "rewards/spct_argmax_reward_func": 0.859375, "rewards/spct_format_reward_func": 0.19921875, "step": 679 }, { "completion_length": 806.484375, "epoch": 0.1765889761734727, "grad_norm": 0.04503048583865166, "kl": 0.7982790917158127, "learning_rate": 9.992437917754059e-05, "loss": 0.0004, "reward": 0.9031250476837158, "reward_std": 0.15625, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 680 }, { "completion_length": 822.890625, "epoch": 0.17684866584431605, "grad_norm": 0.0592944473028183, "kl": 0.7333850711584091, "learning_rate": 9.992415467373428e-05, "loss": 0.0004, "reward": 0.9500000476837158, "reward_std": 0.33183755725622177, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 681 }, { "completion_length": 800.640625, "epoch": 0.17710835551515938, "grad_norm": 0.05552119016647339, "kl": 0.7090904861688614, "learning_rate": 9.992392983742018e-05, "loss": 0.0004, "reward": 1.0281250476837158, "reward_std": 0.1923343911767006, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 682 }, { "completion_length": 776.96875, "epoch": 0.17736804518600272, "grad_norm": 0.07141875475645065, "kl": 0.9116312265396118, "learning_rate": 9.992370466859981e-05, "loss": 0.0005, "reward": 0.8453125357627869, "reward_std": 0.27871256321668625, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.18906250223517418, "step": 683 }, { "completion_length": 723.125, "epoch": 0.17762773485684608, "grad_norm": 0.2218891829252243, "kl": 1.775693267583847, "learning_rate": 9.992347916727465e-05, "loss": 0.0009, "reward": 0.8046875447034836, "reward_std": 0.2260490000480786, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.1796875, "step": 684 }, { "completion_length": 803.5, "epoch": 0.1778874245276894, "grad_norm": 0.2513050138950348, "kl": 2.6752208471298218, "learning_rate": 9.992325333344619e-05, "loss": 0.0013, "reward": 1.0593750476837158, "reward_std": 0.15625, "rewards/spct_argmax_reward_func": 0.859375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 685 }, { "completion_length": 905.71875, "epoch": 0.17814711419853274, "grad_norm": 0.03677990660071373, "kl": 0.7147061079740524, "learning_rate": 9.992302716711597e-05, "loss": 0.0004, "reward": 0.9339844286441803, "reward_std": 0.1295827254652977, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.19960937649011612, "step": 686 }, { "completion_length": 750.375, "epoch": 0.1784068038693761, "grad_norm": 0.05919785052537918, "kl": 1.0226573944091797, "learning_rate": 9.992280066828545e-05, "loss": 0.0005, "reward": 1.069921925663948, "reward_std": 0.1686195805668831, "rewards/spct_argmax_reward_func": 0.875, "rewards/spct_format_reward_func": 0.19492187723517418, "step": 687 }, { "completion_length": 897.78125, "epoch": 0.17866649354021943, "grad_norm": 0.06909899413585663, "kl": 1.7266048938035965, "learning_rate": 9.99225738369562e-05, "loss": 0.0009, "reward": 0.7523437887430191, "reward_std": 0.19702189043164253, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.17421875149011612, "step": 688 }, { "completion_length": 846.125, "epoch": 0.17892618321106277, "grad_norm": 0.07739473879337311, "kl": 2.471794009208679, "learning_rate": 9.992234667312965e-05, "loss": 0.0012, "reward": 0.9554688036441803, "reward_std": 0.16748128249309957, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.18984375149011612, "step": 689 }, { "completion_length": 916.765625, "epoch": 0.17918587288190613, "grad_norm": 0.045763254165649414, "kl": 1.1177940666675568, "learning_rate": 9.992211917680739e-05, "loss": 0.0006, "reward": 0.8660156726837158, "reward_std": 0.1337406411767006, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.19414062798023224, "step": 690 }, { "completion_length": 879.40625, "epoch": 0.17944556255274946, "grad_norm": 0.05991334840655327, "kl": 0.8062431812286377, "learning_rate": 9.99218913479909e-05, "loss": 0.0004, "reward": 0.8699219226837158, "reward_std": 0.2684094235301018, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 691 }, { "completion_length": 807.875, "epoch": 0.1797052522235928, "grad_norm": 0.024685844779014587, "kl": 0.7056685984134674, "learning_rate": 9.992166318668169e-05, "loss": 0.0004, "reward": 0.9375000447034836, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 692 }, { "completion_length": 856.921875, "epoch": 0.17996494189443615, "grad_norm": 0.06975923478603363, "kl": 1.044943019747734, "learning_rate": 9.99214346928813e-05, "loss": 0.0005, "reward": 0.703125037252903, "reward_std": 0.1963854804635048, "rewards/spct_argmax_reward_func": 0.53125, "rewards/spct_format_reward_func": 0.171875, "step": 693 }, { "completion_length": 890.796875, "epoch": 0.1802246315652795, "grad_norm": 0.046485017985105515, "kl": 0.7637814283370972, "learning_rate": 9.992120586659121e-05, "loss": 0.0004, "reward": 0.8250000476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 694 }, { "completion_length": 757.421875, "epoch": 0.18048432123612285, "grad_norm": 0.14200548827648163, "kl": 1.1413792967796326, "learning_rate": 9.9920976707813e-05, "loss": 0.0006, "reward": 0.7957031577825546, "reward_std": 0.19844920933246613, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.17070312052965164, "step": 695 }, { "completion_length": 915.78125, "epoch": 0.18074401090696618, "grad_norm": 0.03355897590517998, "kl": 0.6967353820800781, "learning_rate": 9.992074721654818e-05, "loss": 0.0003, "reward": 1.0593750476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.859375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 696 }, { "completion_length": 877.84375, "epoch": 0.1810037005778095, "grad_norm": 0.04316726326942444, "kl": 0.7287358641624451, "learning_rate": 9.992051739279825e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 697 }, { "completion_length": 837.65625, "epoch": 0.18126339024865287, "grad_norm": 5296.3056640625, "kl": 19798.364503994584, "learning_rate": 9.992028723656477e-05, "loss": 9.8992, "reward": 0.8406250476837158, "reward_std": 0.2645031735301018, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 698 }, { "completion_length": 863.78125, "epoch": 0.1815230799194962, "grad_norm": 0.07209224253892899, "kl": 1.1841195672750473, "learning_rate": 9.992005674784925e-05, "loss": 0.0006, "reward": 0.8789062947034836, "reward_std": 0.07373128249309957, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.1914062537252903, "step": 699 }, { "completion_length": 834.34375, "epoch": 0.18178276959033954, "grad_norm": 0.08287845551967621, "kl": 2.2959336042404175, "learning_rate": 9.991982592665327e-05, "loss": 0.0011, "reward": 0.9285156726837158, "reward_std": 0.07335469499230385, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.19414062798023224, "step": 700 }, { "completion_length": 749.140625, "epoch": 0.1820424592611829, "grad_norm": 0.17193469405174255, "kl": 3.9108620434999466, "learning_rate": 9.991959477297832e-05, "loss": 0.002, "reward": 0.8671875447034836, "reward_std": 0.23505739122629166, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.1796875037252903, "step": 701 }, { "completion_length": 794.59375, "epoch": 0.18230214893202623, "grad_norm": 1.609176516532898, "kl": 35.45966202020645, "learning_rate": 9.991936328682595e-05, "loss": 0.0177, "reward": 0.7707031667232513, "reward_std": 0.2554687485098839, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.17695312574505806, "step": 702 }, { "completion_length": 817.0, "epoch": 0.18256183860286956, "grad_norm": 0.03973821550607681, "kl": 0.7263753563165665, "learning_rate": 9.991913146819771e-05, "loss": 0.0004, "reward": 0.9343750476837158, "reward_std": 0.15625, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 703 }, { "completion_length": 885.6875, "epoch": 0.18282152827371292, "grad_norm": 0.06560348719358444, "kl": 1.2972942888736725, "learning_rate": 9.991889931709513e-05, "loss": 0.0006, "reward": 0.9468750357627869, "reward_std": 0.19132732599973679, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.19687500223517418, "step": 704 }, { "completion_length": 836.171875, "epoch": 0.18308121794455626, "grad_norm": 0.05135815218091011, "kl": 0.770713210105896, "learning_rate": 9.991866683351979e-05, "loss": 0.0004, "reward": 0.8699219226837158, "reward_std": 0.16830649226903915, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 705 }, { "completion_length": 812.3125, "epoch": 0.1833409076153996, "grad_norm": 0.0775807797908783, "kl": 1.4266963303089142, "learning_rate": 9.991843401747321e-05, "loss": 0.0007, "reward": 1.0066406726837158, "reward_std": 0.20107503235340118, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.19414062798023224, "step": 706 }, { "completion_length": 833.25, "epoch": 0.18360059728624295, "grad_norm": 0.05419030040502548, "kl": 0.9168799370527267, "learning_rate": 9.991820086895694e-05, "loss": 0.0005, "reward": 0.9000000357627869, "reward_std": 0.2945477217435837, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.19687500223517418, "step": 707 }, { "completion_length": 855.046875, "epoch": 0.18386028695708628, "grad_norm": 0.060016389936208725, "kl": 1.5546134561300278, "learning_rate": 9.991796738797254e-05, "loss": 0.0008, "reward": 0.8972656726837158, "reward_std": 0.24498582631349564, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.19414062798023224, "step": 708 }, { "completion_length": 691.53125, "epoch": 0.18411997662792962, "grad_norm": 0.04383380711078644, "kl": 0.9033942222595215, "learning_rate": 9.991773357452156e-05, "loss": 0.0005, "reward": 0.9218750447034836, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 709 }, { "completion_length": 741.109375, "epoch": 0.18437966629877298, "grad_norm": 0.05438307672739029, "kl": 1.17763252556324, "learning_rate": 9.991749942860558e-05, "loss": 0.0006, "reward": 0.8308594226837158, "reward_std": 0.1962406411767006, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.19023437798023224, "step": 710 }, { "completion_length": 831.734375, "epoch": 0.1846393559696163, "grad_norm": 0.045508116483688354, "kl": 0.8454862982034683, "learning_rate": 9.991726495022612e-05, "loss": 0.0004, "reward": 0.8710937947034836, "reward_std": 0.19389689341187477, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.19921875, "step": 711 }, { "completion_length": 772.796875, "epoch": 0.18489904564045964, "grad_norm": 0.07850410789251328, "kl": 0.9876638799905777, "learning_rate": 9.991703013938477e-05, "loss": 0.0005, "reward": 0.8066406548023224, "reward_std": 0.22624599188566208, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.181640625, "step": 712 }, { "completion_length": 803.703125, "epoch": 0.185158735311303, "grad_norm": 0.042418740689754486, "kl": 0.9137747585773468, "learning_rate": 9.991679499608308e-05, "loss": 0.0005, "reward": 0.9812500476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 713 }, { "completion_length": 657.25, "epoch": 0.18541842498214633, "grad_norm": 0.1400376558303833, "kl": 3.9455988854169846, "learning_rate": 9.991655952032264e-05, "loss": 0.002, "reward": 0.6781250387430191, "reward_std": 0.2548343911767006, "rewards/spct_argmax_reward_func": 0.515625, "rewards/spct_format_reward_func": 0.16250000149011612, "step": 714 }, { "completion_length": 705.625, "epoch": 0.18567811465298967, "grad_norm": 0.034976013004779816, "kl": 0.9968140125274658, "learning_rate": 9.9916323712105e-05, "loss": 0.0005, "reward": 0.9019531607627869, "reward_std": 0.06967814266681671, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.19882812723517418, "step": 715 }, { "completion_length": 773.734375, "epoch": 0.18593780432383303, "grad_norm": 0.04963957890868187, "kl": 0.875949501991272, "learning_rate": 9.991608757143171e-05, "loss": 0.0004, "reward": 0.9031250476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 716 }, { "completion_length": 805.109375, "epoch": 0.18619749399467636, "grad_norm": 0.04086252301931381, "kl": 0.8261198252439499, "learning_rate": 9.991585109830439e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.17075317353010178, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 717 }, { "completion_length": 815.46875, "epoch": 0.1864571836655197, "grad_norm": 0.09087847918272018, "kl": 0.8886104673147202, "learning_rate": 9.991561429272458e-05, "loss": 0.0004, "reward": 0.8449219167232513, "reward_std": 0.22436564415693283, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.18867187574505806, "step": 718 }, { "completion_length": 808.96875, "epoch": 0.18671687333636305, "grad_norm": 0.04027705267071724, "kl": 0.8871251493692398, "learning_rate": 9.991537715469385e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.17075317353010178, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 719 }, { "completion_length": 825.28125, "epoch": 0.1869765630072064, "grad_norm": 0.027234995737671852, "kl": 0.8129685819149017, "learning_rate": 9.991513968421382e-05, "loss": 0.0004, "reward": 0.7781250402331352, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 720 }, { "completion_length": 800.8125, "epoch": 0.18723625267804972, "grad_norm": 0.03735742345452309, "kl": 0.8731866478919983, "learning_rate": 9.991490188128604e-05, "loss": 0.0004, "reward": 0.8640625402331352, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 721 }, { "completion_length": 708.296875, "epoch": 0.18749594234889308, "grad_norm": 0.09081047773361206, "kl": 1.8292205929756165, "learning_rate": 9.99146637459121e-05, "loss": 0.0009, "reward": 0.9433594197034836, "reward_std": 0.1337406411767006, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.1777343787252903, "step": 722 }, { "completion_length": 790.9375, "epoch": 0.1877556320197364, "grad_norm": 0.12499433010816574, "kl": 1.0271024852991104, "learning_rate": 9.991442527809358e-05, "loss": 0.0005, "reward": 0.5636719092726707, "reward_std": 0.2447001412510872, "rewards/spct_argmax_reward_func": 0.390625, "rewards/spct_format_reward_func": 0.17304687574505806, "step": 723 }, { "completion_length": 825.609375, "epoch": 0.18801532169057975, "grad_norm": 0.09271940588951111, "kl": 1.1204200834035873, "learning_rate": 9.991418647783208e-05, "loss": 0.0006, "reward": 0.9070312976837158, "reward_std": 0.19807089120149612, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.18828125298023224, "step": 724 }, { "completion_length": 839.796875, "epoch": 0.1882750113614231, "grad_norm": 0.049900081008672714, "kl": 0.8546217828989029, "learning_rate": 9.99139473451292e-05, "loss": 0.0004, "reward": 0.9031250476837158, "reward_std": 0.22841878235340118, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 725 }, { "completion_length": 805.4375, "epoch": 0.18853470103226644, "grad_norm": 0.05667894333600998, "kl": 1.3075708746910095, "learning_rate": 9.991370787998651e-05, "loss": 0.0007, "reward": 0.7617187947034836, "reward_std": 0.2329293228685856, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.1835937537252903, "step": 726 }, { "completion_length": 808.890625, "epoch": 0.18879439070310977, "grad_norm": 0.04118787497282028, "kl": 0.9372167885303497, "learning_rate": 9.99134680824056e-05, "loss": 0.0005, "reward": 1.0593750476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.859375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 727 }, { "completion_length": 855.84375, "epoch": 0.18905408037395313, "grad_norm": 0.040709562599658966, "kl": 1.1262221038341522, "learning_rate": 9.99132279523881e-05, "loss": 0.0006, "reward": 0.8875000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 728 }, { "completion_length": 862.96875, "epoch": 0.18931377004479646, "grad_norm": 0.058021996170282364, "kl": 1.586018592119217, "learning_rate": 9.991298748993558e-05, "loss": 0.0008, "reward": 0.9597656726837158, "reward_std": 0.23232503235340118, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.19414062798023224, "step": 729 }, { "completion_length": 841.203125, "epoch": 0.1895734597156398, "grad_norm": 0.07692593336105347, "kl": 2.435766026377678, "learning_rate": 9.991274669504967e-05, "loss": 0.0012, "reward": 0.9703125357627869, "reward_std": 0.23325317353010178, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.18906250223517418, "step": 730 }, { "completion_length": 820.90625, "epoch": 0.18983314938648316, "grad_norm": 0.03415672853589058, "kl": 0.9766780585050583, "learning_rate": 9.991250556773193e-05, "loss": 0.0005, "reward": 0.8953125476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 731 }, { "completion_length": 852.25, "epoch": 0.1900928390573265, "grad_norm": 0.050578486174345016, "kl": 0.8908003568649292, "learning_rate": 9.9912264107984e-05, "loss": 0.0004, "reward": 1.0750000476837158, "reward_std": 0.17075317353010178, "rewards/spct_argmax_reward_func": 0.875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 732 }, { "completion_length": 849.59375, "epoch": 0.19035252872816985, "grad_norm": 0.9303510189056396, "kl": 12.878043115139008, "learning_rate": 9.991202231580749e-05, "loss": 0.0064, "reward": 0.9160156697034836, "reward_std": 0.14948247536085546, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.1816406287252903, "step": 733 }, { "completion_length": 811.125, "epoch": 0.19061221839901318, "grad_norm": 0.045980874449014664, "kl": 1.0111996084451675, "learning_rate": 9.9911780191204e-05, "loss": 0.0005, "reward": 0.9023437947034836, "reward_std": 0.13917933031916618, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.1835937537252903, "step": 734 }, { "completion_length": 911.421875, "epoch": 0.19087190806985652, "grad_norm": 0.36557117104530334, "kl": 2.6947434544563293, "learning_rate": 9.991153773417513e-05, "loss": 0.0013, "reward": 1.0593750476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.859375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 735 }, { "completion_length": 894.71875, "epoch": 0.19113159774069988, "grad_norm": 0.04745196923613548, "kl": 1.5700717866420746, "learning_rate": 9.991129494472251e-05, "loss": 0.0008, "reward": 1.1218750476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.921875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 736 }, { "completion_length": 873.1875, "epoch": 0.1913912874115432, "grad_norm": 0.05182301625609398, "kl": 0.8157350718975067, "learning_rate": 9.991105182284774e-05, "loss": 0.0004, "reward": 1.0507812947034836, "reward_std": 0.13139688968658447, "rewards/spct_argmax_reward_func": 0.859375, "rewards/spct_format_reward_func": 0.19140625, "step": 737 }, { "completion_length": 995.890625, "epoch": 0.19165097708238654, "grad_norm": 0.031442925333976746, "kl": 0.8070175051689148, "learning_rate": 9.991080836855249e-05, "loss": 0.0004, "reward": 0.9187500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 738 }, { "completion_length": 960.4375, "epoch": 0.1919106667532299, "grad_norm": 0.023600853979587555, "kl": 0.8268120586872101, "learning_rate": 9.991056458183832e-05, "loss": 0.0004, "reward": 0.7312500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.53125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 739 }, { "completion_length": 969.375, "epoch": 0.19217035642407324, "grad_norm": 0.04238460585474968, "kl": 0.8281887173652649, "learning_rate": 9.991032046270689e-05, "loss": 0.0004, "reward": 0.9187500476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 740 }, { "completion_length": 893.671875, "epoch": 0.19243004609491657, "grad_norm": 0.06734117865562439, "kl": 1.1269726753234863, "learning_rate": 9.99100760111598e-05, "loss": 0.0006, "reward": 0.8355469107627869, "reward_std": 0.1372542567551136, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.17929687350988388, "step": 741 }, { "completion_length": 1009.8125, "epoch": 0.19268973576575993, "grad_norm": 0.0462472178041935, "kl": 1.134584590792656, "learning_rate": 9.990983122719871e-05, "loss": 0.0006, "reward": 0.7820312976837158, "reward_std": 0.16737835109233856, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.18828125298023224, "step": 742 }, { "completion_length": 1011.03125, "epoch": 0.19294942543660326, "grad_norm": 0.046104978770017624, "kl": 0.9751798361539841, "learning_rate": 9.990958611082522e-05, "loss": 0.0005, "reward": 0.8093750476837158, "reward_std": 0.21875, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 743 }, { "completion_length": 983.953125, "epoch": 0.1932091151074466, "grad_norm": 0.0727100819349289, "kl": 1.0762604475021362, "learning_rate": 9.990934066204098e-05, "loss": 0.0005, "reward": 0.8140625357627869, "reward_std": 0.15985843748785555, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.17343749850988388, "step": 744 }, { "completion_length": 1081.921875, "epoch": 0.19346880477828995, "grad_norm": 0.04232596978545189, "kl": 0.8127042949199677, "learning_rate": 9.990909488084763e-05, "loss": 0.0004, "reward": 0.8562500476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 745 }, { "completion_length": 1021.9375, "epoch": 0.1937284944491333, "grad_norm": 0.05350840464234352, "kl": 1.0250237435102463, "learning_rate": 9.99088487672468e-05, "loss": 0.0005, "reward": 0.8148438036441803, "reward_std": 0.19873128086328506, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.18984375149011612, "step": 746 }, { "completion_length": 1073.09375, "epoch": 0.19398818411997662, "grad_norm": 0.04374001920223236, "kl": 0.867370992898941, "learning_rate": 9.99086023212401e-05, "loss": 0.0004, "reward": 0.9941406697034836, "reward_std": 0.10405314119998366, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.1816406287252903, "step": 747 }, { "completion_length": 1145.65625, "epoch": 0.19424787379081998, "grad_norm": 0.05224812403321266, "kl": 0.783129945397377, "learning_rate": 9.990835554282922e-05, "loss": 0.0004, "reward": 0.8316406607627869, "reward_std": 0.2126818080432713, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.19101562723517418, "step": 748 }, { "completion_length": 1052.046875, "epoch": 0.1945075634616633, "grad_norm": 0.038338638842105865, "kl": 0.92141392827034, "learning_rate": 9.990810843201577e-05, "loss": 0.0005, "reward": 0.9824219197034836, "reward_std": 0.10108764842152596, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.1855468787252903, "step": 749 }, { "completion_length": 1031.953125, "epoch": 0.19476725313250665, "grad_norm": 0.06411051750183105, "kl": 0.8355348259210587, "learning_rate": 9.990786098880143e-05, "loss": 0.0004, "reward": 0.9085937887430191, "reward_std": 0.19873128086328506, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.18984375149011612, "step": 750 }, { "completion_length": 999.484375, "epoch": 0.19502694280335, "grad_norm": 0.0479779988527298, "kl": 0.911449745297432, "learning_rate": 9.99076132131878e-05, "loss": 0.0005, "reward": 1.0000000447034836, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 751 }, { "completion_length": 1156.890625, "epoch": 0.19528663247419334, "grad_norm": 0.04399232566356659, "kl": 0.7997239083051682, "learning_rate": 9.990736510517658e-05, "loss": 0.0004, "reward": 0.8132812976837158, "reward_std": 0.14699183031916618, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.18828125298023224, "step": 752 }, { "completion_length": 1089.53125, "epoch": 0.19554632214503667, "grad_norm": 0.04286782816052437, "kl": 0.7871408462524414, "learning_rate": 9.990711666476937e-05, "loss": 0.0004, "reward": 0.7035156488418579, "reward_std": 0.07350547052919865, "rewards/spct_argmax_reward_func": 0.515625, "rewards/spct_format_reward_func": 0.18789062649011612, "step": 753 }, { "completion_length": 1004.796875, "epoch": 0.19580601181588003, "grad_norm": 0.1909383237361908, "kl": 5.701258987188339, "learning_rate": 9.990686789196788e-05, "loss": 0.0029, "reward": 0.9734375476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 754 }, { "completion_length": 1022.546875, "epoch": 0.19606570148672336, "grad_norm": 0.05948866903781891, "kl": 0.9668377637863159, "learning_rate": 9.990661878677372e-05, "loss": 0.0005, "reward": 0.7710937932133675, "reward_std": 0.19873128086328506, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.17734375223517418, "step": 755 }, { "completion_length": 1012.25, "epoch": 0.1963253911575667, "grad_norm": 0.06937777996063232, "kl": 1.4611217826604843, "learning_rate": 9.990636934918858e-05, "loss": 0.0007, "reward": 0.9210937917232513, "reward_std": 0.11041354760527611, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.18671875074505806, "step": 756 }, { "completion_length": 976.375, "epoch": 0.19658508082841006, "grad_norm": 0.04350823909044266, "kl": 0.7508280575275421, "learning_rate": 9.990611957921411e-05, "loss": 0.0004, "reward": 0.8593750447034836, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 757 }, { "completion_length": 961.671875, "epoch": 0.1968447704992534, "grad_norm": 0.019926464185118675, "kl": 0.8467313051223755, "learning_rate": 9.990586947685197e-05, "loss": 0.0004, "reward": 0.9109375402331352, "reward_std": 0.03608439117670059, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 758 }, { "completion_length": 889.15625, "epoch": 0.19710446017009672, "grad_norm": 0.10068635642528534, "kl": 1.1153706312179565, "learning_rate": 9.990561904210383e-05, "loss": 0.0006, "reward": 0.9113281667232513, "reward_std": 0.11027307948097587, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.17695312947034836, "step": 759 }, { "completion_length": 898.53125, "epoch": 0.19736414984094008, "grad_norm": 0.04248156026005745, "kl": 0.8058287650346756, "learning_rate": 9.990536827497135e-05, "loss": 0.0004, "reward": 1.0125000476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 760 }, { "completion_length": 866.4375, "epoch": 0.19762383951178342, "grad_norm": 0.07694686204195023, "kl": 1.5718744397163391, "learning_rate": 9.990511717545623e-05, "loss": 0.0008, "reward": 0.7062500342726707, "reward_std": 0.06562499864958227, "rewards/spct_argmax_reward_func": 0.53125, "rewards/spct_format_reward_func": 0.17500000074505806, "step": 761 }, { "completion_length": 925.984375, "epoch": 0.19788352918262675, "grad_norm": 0.05968976020812988, "kl": 1.074109897017479, "learning_rate": 9.99048657435601e-05, "loss": 0.0005, "reward": 0.9859375357627869, "reward_std": 0.07340743904933333, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.18906250223517418, "step": 762 }, { "completion_length": 917.390625, "epoch": 0.1981432188534701, "grad_norm": 0.06873944401741028, "kl": 0.8896832168102264, "learning_rate": 9.990461397928466e-05, "loss": 0.0004, "reward": 0.7859375476837158, "reward_std": 0.2860843911767006, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 763 }, { "completion_length": 854.25, "epoch": 0.19840290852431344, "grad_norm": 0.052456602454185486, "kl": 0.8232639729976654, "learning_rate": 9.990436188263158e-05, "loss": 0.0004, "reward": 0.8406250402331352, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 764 }, { "completion_length": 834.3125, "epoch": 0.19866259819515678, "grad_norm": 0.5870294570922852, "kl": 9.051081210374832, "learning_rate": 9.990410945360255e-05, "loss": 0.0045, "reward": 0.8796875476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 765 }, { "completion_length": 789.796875, "epoch": 0.19892228786600014, "grad_norm": 0.09239281713962555, "kl": 1.0284759402275085, "learning_rate": 9.990385669219923e-05, "loss": 0.0005, "reward": 0.9691406786441803, "reward_std": 0.09936564043164253, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.18789063021540642, "step": 766 }, { "completion_length": 858.671875, "epoch": 0.19918197753684347, "grad_norm": 0.07058747112751007, "kl": 0.9904787242412567, "learning_rate": 9.990360359842333e-05, "loss": 0.0005, "reward": 0.8210937976837158, "reward_std": 0.17526372522115707, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.19609375298023224, "step": 767 }, { "completion_length": 887.03125, "epoch": 0.1994416672076868, "grad_norm": 0.03932597488164902, "kl": 0.9352779388427734, "learning_rate": 9.99033501722765e-05, "loss": 0.0005, "reward": 0.9500000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 768 }, { "completion_length": 871.765625, "epoch": 0.19970135687853016, "grad_norm": 0.057103246450424194, "kl": 0.7475481629371643, "learning_rate": 9.990309641376046e-05, "loss": 0.0004, "reward": 1.0437500476837158, "reward_std": 0.1875, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 769 }, { "completion_length": 800.671875, "epoch": 0.1999610465493735, "grad_norm": 0.073142409324646, "kl": 0.9091330617666245, "learning_rate": 9.990284232287689e-05, "loss": 0.0005, "reward": 1.0167969167232513, "reward_std": 0.13554486096836627, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.18867187574505806, "step": 770 }, { "completion_length": 886.21875, "epoch": 0.20022073622021683, "grad_norm": 0.06380792707204819, "kl": 0.8393561989068985, "learning_rate": 9.990258789962748e-05, "loss": 0.0004, "reward": 0.8562500476837158, "reward_std": 0.25, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 771 }, { "completion_length": 876.1875, "epoch": 0.2004804258910602, "grad_norm": 0.0539175420999527, "kl": 0.8777493685483932, "learning_rate": 9.990233314401393e-05, "loss": 0.0004, "reward": 0.8582031726837158, "reward_std": 0.1337406411767006, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.18632812798023224, "step": 772 }, { "completion_length": 888.953125, "epoch": 0.20074011556190352, "grad_norm": 0.045082367956638336, "kl": 0.8109681308269501, "learning_rate": 9.990207805603793e-05, "loss": 0.0004, "reward": 1.0125000476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 773 }, { "completion_length": 907.53125, "epoch": 0.20099980523274688, "grad_norm": 0.04788782075047493, "kl": 0.8182965070009232, "learning_rate": 9.990182263570118e-05, "loss": 0.0004, "reward": 0.8718750476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 774 }, { "completion_length": 935.515625, "epoch": 0.2012594949035902, "grad_norm": 0.06093936040997505, "kl": 0.798709824681282, "learning_rate": 9.990156688300539e-05, "loss": 0.0004, "reward": 0.9343750476837158, "reward_std": 0.23808756470680237, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 775 }, { "completion_length": 894.40625, "epoch": 0.20151918457443355, "grad_norm": 0.04844070225954056, "kl": 0.8293834030628204, "learning_rate": 9.990131079795224e-05, "loss": 0.0004, "reward": 0.8835937976837158, "reward_std": 0.13917933031916618, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19609375298023224, "step": 776 }, { "completion_length": 856.875, "epoch": 0.2017788742452769, "grad_norm": 0.08967852592468262, "kl": 1.5277348458766937, "learning_rate": 9.990105438054346e-05, "loss": 0.0008, "reward": 0.8714844286441803, "reward_std": 0.23872192203998566, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.18398437649011612, "step": 777 }, { "completion_length": 856.15625, "epoch": 0.20203856391612024, "grad_norm": 0.05634549632668495, "kl": 1.3107932358980179, "learning_rate": 9.990079763078075e-05, "loss": 0.0007, "reward": 0.9792969226837158, "reward_std": 0.13857503235340118, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 778 }, { "completion_length": 863.9375, "epoch": 0.20229825358696357, "grad_norm": 0.04137717932462692, "kl": 0.8238113820552826, "learning_rate": 9.990054054866582e-05, "loss": 0.0004, "reward": 1.0281250476837158, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 779 }, { "completion_length": 769.296875, "epoch": 0.20255794325780693, "grad_norm": 0.0686882957816124, "kl": 0.9729804396629333, "learning_rate": 9.990028313420037e-05, "loss": 0.0005, "reward": 0.6414062827825546, "reward_std": 0.16264689119998366, "rewards/spct_argmax_reward_func": 0.46875, "rewards/spct_format_reward_func": 0.17265625298023224, "step": 780 }, { "completion_length": 832.328125, "epoch": 0.20281763292865027, "grad_norm": 0.054898329079151154, "kl": 1.0312897711992264, "learning_rate": 9.990002538738614e-05, "loss": 0.0005, "reward": 0.9890625476837158, "reward_std": 0.15625, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 781 }, { "completion_length": 886.578125, "epoch": 0.2030773225994936, "grad_norm": 0.048047009855508804, "kl": 0.8349514901638031, "learning_rate": 9.989976730822484e-05, "loss": 0.0004, "reward": 0.9500000476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 782 }, { "completion_length": 899.5, "epoch": 0.20333701227033696, "grad_norm": 0.07683295011520386, "kl": 0.7949746251106262, "learning_rate": 9.989950889671817e-05, "loss": 0.0004, "reward": 0.8406250476837158, "reward_std": 0.2645031735301018, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 783 }, { "completion_length": 872.5, "epoch": 0.2035967019411803, "grad_norm": 0.06679295748472214, "kl": 0.9652034342288971, "learning_rate": 9.989925015286785e-05, "loss": 0.0005, "reward": 0.9128906726837158, "reward_std": 0.20107503235340118, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.19414062798023224, "step": 784 }, { "completion_length": 886.203125, "epoch": 0.20385639161202362, "grad_norm": 0.05255492404103279, "kl": 1.0233071446418762, "learning_rate": 9.989899107667564e-05, "loss": 0.0005, "reward": 0.8562500365078449, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 785 }, { "completion_length": 904.453125, "epoch": 0.20411608128286698, "grad_norm": 0.05846455693244934, "kl": 0.9379369467496872, "learning_rate": 9.989873166814323e-05, "loss": 0.0005, "reward": 0.9011719226837158, "reward_std": 0.16982503235340118, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 786 }, { "completion_length": 784.0, "epoch": 0.20437577095371032, "grad_norm": 0.08599824458360672, "kl": 2.8561187982559204, "learning_rate": 9.989847192727235e-05, "loss": 0.0014, "reward": 0.8980469107627869, "reward_std": 0.1695011891424656, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.17929687723517418, "step": 787 }, { "completion_length": 910.421875, "epoch": 0.20463546062455365, "grad_norm": 0.043268777430057526, "kl": 0.8528787642717361, "learning_rate": 9.989821185406477e-05, "loss": 0.0004, "reward": 0.7000000402331352, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.5, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 788 }, { "completion_length": 849.921875, "epoch": 0.204895150295397, "grad_norm": 0.05276117101311684, "kl": 1.0224969536066055, "learning_rate": 9.989795144852216e-05, "loss": 0.0005, "reward": 1.0281250476837158, "reward_std": 0.13950317353010178, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 789 }, { "completion_length": 788.40625, "epoch": 0.20515483996624034, "grad_norm": 0.06547238677740097, "kl": 1.0314314812421799, "learning_rate": 9.98976907106463e-05, "loss": 0.0005, "reward": 0.8796875476837158, "reward_std": 0.1875, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 790 }, { "completion_length": 881.421875, "epoch": 0.20541452963708368, "grad_norm": 0.0738636776804924, "kl": 1.114666759967804, "learning_rate": 9.989742964043892e-05, "loss": 0.0006, "reward": 1.0437500476837158, "reward_std": 0.17075317353010178, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 791 }, { "completion_length": 794.0, "epoch": 0.20567421930792704, "grad_norm": 0.13462834060192108, "kl": 1.1495415419340134, "learning_rate": 9.989716823790173e-05, "loss": 0.0006, "reward": 0.9691406786441803, "reward_std": 0.19795003533363342, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.18789063021540642, "step": 792 }, { "completion_length": 829.5, "epoch": 0.20593390897877037, "grad_norm": 0.08420553803443909, "kl": 1.7887962013483047, "learning_rate": 9.98969065030365e-05, "loss": 0.0009, "reward": 0.8691406697034836, "reward_std": 0.169139958685264, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.197265625, "step": 793 }, { "completion_length": 700.21875, "epoch": 0.2061935986496137, "grad_norm": 9.481217384338379, "kl": 162.5550119280815, "learning_rate": 9.989664443584497e-05, "loss": 0.0813, "reward": 0.7664062976837158, "reward_std": 0.06586672167759389, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.17265624925494194, "step": 794 }, { "completion_length": 801.734375, "epoch": 0.20645328832045706, "grad_norm": 0.05298952758312225, "kl": 1.169827625155449, "learning_rate": 9.989638203632888e-05, "loss": 0.0006, "reward": 1.0906250476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.890625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 795 }, { "completion_length": 805.6875, "epoch": 0.2067129779913004, "grad_norm": 0.0861596167087555, "kl": 1.26225546002388, "learning_rate": 9.989611930448997e-05, "loss": 0.0006, "reward": 0.7226562947034836, "reward_std": 0.16264688968658447, "rewards/spct_argmax_reward_func": 0.53125, "rewards/spct_format_reward_func": 0.19140625, "step": 796 }, { "completion_length": 722.03125, "epoch": 0.20697266766214373, "grad_norm": 0.1435774862766266, "kl": 5.150880038738251, "learning_rate": 9.989585624033001e-05, "loss": 0.0026, "reward": 0.7671875357627869, "reward_std": 0.17075317353010178, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.17343750223517418, "step": 797 }, { "completion_length": 830.109375, "epoch": 0.2072323573329871, "grad_norm": 0.04256216064095497, "kl": 1.3300198912620544, "learning_rate": 9.989559284385074e-05, "loss": 0.0007, "reward": 0.9734375476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 798 }, { "completion_length": 778.03125, "epoch": 0.20749204700383042, "grad_norm": 0.09976735711097717, "kl": 3.680315911769867, "learning_rate": 9.98953291150539e-05, "loss": 0.0018, "reward": 1.0312500447034836, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 799 }, { "completion_length": 855.3125, "epoch": 0.20775173667467375, "grad_norm": 0.045874472707509995, "kl": 1.0383338034152985, "learning_rate": 9.989506505394128e-05, "loss": 0.0005, "reward": 0.9265625476837158, "reward_std": 0.13950317353010178, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 800 }, { "completion_length": 897.359375, "epoch": 0.20801142634551711, "grad_norm": 0.06650635600090027, "kl": 1.038340076804161, "learning_rate": 9.989480066051462e-05, "loss": 0.0005, "reward": 0.9480469226837158, "reward_std": 0.20107503235340118, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 801 }, { "completion_length": 928.8125, "epoch": 0.20827111601636045, "grad_norm": 0.05575248599052429, "kl": 1.2300672829151154, "learning_rate": 9.989453593477567e-05, "loss": 0.0006, "reward": 0.9859375357627869, "reward_std": 0.17921650409698486, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.18906250223517418, "step": 802 }, { "completion_length": 976.28125, "epoch": 0.20853080568720378, "grad_norm": 0.06038487330079079, "kl": 1.2189309149980545, "learning_rate": 9.98942708767262e-05, "loss": 0.0006, "reward": 0.9414062947034836, "reward_std": 0.12656249850988388, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.19140625, "step": 803 }, { "completion_length": 954.1875, "epoch": 0.20879049535804714, "grad_norm": 0.06228208541870117, "kl": 0.8972190171480179, "learning_rate": 9.989400548636798e-05, "loss": 0.0004, "reward": 0.8253906667232513, "reward_std": 0.17162925214506686, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.18476562574505806, "step": 804 }, { "completion_length": 927.125, "epoch": 0.20905018502889047, "grad_norm": 0.03735435754060745, "kl": 1.034796804189682, "learning_rate": 9.989373976370278e-05, "loss": 0.0005, "reward": 0.9218750447034836, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 805 }, { "completion_length": 995.984375, "epoch": 0.2093098746997338, "grad_norm": 0.050149500370025635, "kl": 0.9941942989826202, "learning_rate": 9.989347370873238e-05, "loss": 0.0005, "reward": 0.8160156607627869, "reward_std": 0.2346552163362503, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.19101562723517418, "step": 806 }, { "completion_length": 977.625, "epoch": 0.20956956437057717, "grad_norm": 0.05343880504369736, "kl": 0.79033163189888, "learning_rate": 9.989320732145853e-05, "loss": 0.0004, "reward": 0.8339844197034836, "reward_std": 0.19140625, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.1777343787252903, "step": 807 }, { "completion_length": 1003.546875, "epoch": 0.2098292540414205, "grad_norm": 0.04516676813364029, "kl": 0.9110546559095383, "learning_rate": 9.989294060188301e-05, "loss": 0.0005, "reward": 1.0125000476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 808 }, { "completion_length": 999.1875, "epoch": 0.21008894371226383, "grad_norm": 0.037160493433475494, "kl": 0.8237217366695404, "learning_rate": 9.98926735500076e-05, "loss": 0.0004, "reward": 0.9578125476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 809 }, { "completion_length": 976.984375, "epoch": 0.2103486333831072, "grad_norm": 0.039312154054641724, "kl": 0.8834231346845627, "learning_rate": 9.989240616583408e-05, "loss": 0.0004, "reward": 0.6941406652331352, "reward_std": 0.09765625, "rewards/spct_argmax_reward_func": 0.515625, "rewards/spct_format_reward_func": 0.17851562798023224, "step": 810 }, { "completion_length": 1029.234375, "epoch": 0.21060832305395052, "grad_norm": 0.0595872737467289, "kl": 0.8505868166685104, "learning_rate": 9.989213844936422e-05, "loss": 0.0004, "reward": 0.7554687857627869, "reward_std": 0.23481567203998566, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.19296875223517418, "step": 811 }, { "completion_length": 995.640625, "epoch": 0.21086801272479389, "grad_norm": 0.027027839794754982, "kl": 0.91944320499897, "learning_rate": 9.989187040059982e-05, "loss": 0.0005, "reward": 1.0593750476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.859375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 812 }, { "completion_length": 1044.28125, "epoch": 0.21112770239563722, "grad_norm": 0.03044614940881729, "kl": 0.8314927369356155, "learning_rate": 9.989160201954267e-05, "loss": 0.0004, "reward": 1.0750000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 813 }, { "completion_length": 972.65625, "epoch": 0.21138739206648055, "grad_norm": 0.028829675167798996, "kl": 0.835040420293808, "learning_rate": 9.989133330619452e-05, "loss": 0.0004, "reward": 0.9187500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 814 }, { "completion_length": 972.65625, "epoch": 0.2116470817373239, "grad_norm": 0.051396310329437256, "kl": 0.8058014661073685, "learning_rate": 9.989106426055719e-05, "loss": 0.0004, "reward": 1.0750000476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 815 }, { "completion_length": 967.71875, "epoch": 0.21190677140816724, "grad_norm": 0.05631554126739502, "kl": 0.8215834945440292, "learning_rate": 9.989079488263246e-05, "loss": 0.0004, "reward": 1.0417969226837158, "reward_std": 0.2274906411767006, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 816 }, { "completion_length": 852.40625, "epoch": 0.21216646107901058, "grad_norm": 0.07239171117544174, "kl": 1.359726369380951, "learning_rate": 9.989052517242215e-05, "loss": 0.0007, "reward": 0.6507812812924385, "reward_std": 0.13254071871051565, "rewards/spct_argmax_reward_func": 0.484375, "rewards/spct_format_reward_func": 0.16640625149011612, "step": 817 }, { "completion_length": 910.046875, "epoch": 0.21242615074985394, "grad_norm": 0.027160031720995903, "kl": 0.8866966217756271, "learning_rate": 9.989025512992803e-05, "loss": 0.0004, "reward": 1.0203125476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 818 }, { "completion_length": 937.5, "epoch": 0.21268584042069727, "grad_norm": 0.03463888540863991, "kl": 0.8537526279687881, "learning_rate": 9.98899847551519e-05, "loss": 0.0004, "reward": 0.8406250476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.18437500298023224, "step": 819 }, { "completion_length": 840.171875, "epoch": 0.2129455300915406, "grad_norm": 0.05850917845964432, "kl": 0.8906130492687225, "learning_rate": 9.988971404809557e-05, "loss": 0.0004, "reward": 0.8359375447034836, "reward_std": 0.06562499864958227, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.1796875, "step": 820 }, { "completion_length": 888.75, "epoch": 0.21320521976238396, "grad_norm": 0.05215049907565117, "kl": 1.053482785820961, "learning_rate": 9.988944300876084e-05, "loss": 0.0005, "reward": 0.9500000476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 821 }, { "completion_length": 898.09375, "epoch": 0.2134649094332273, "grad_norm": 0.06741146743297577, "kl": 0.9500816315412521, "learning_rate": 9.98891716371495e-05, "loss": 0.0005, "reward": 0.8578125536441803, "reward_std": 0.17199182673357427, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.18593750149011612, "step": 822 }, { "completion_length": 902.625, "epoch": 0.21372459910407063, "grad_norm": 0.053370021283626556, "kl": 0.81067755818367, "learning_rate": 9.98888999332634e-05, "loss": 0.0004, "reward": 0.7671875357627869, "reward_std": 0.20200317353010178, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.18906250223517418, "step": 823 }, { "completion_length": 920.40625, "epoch": 0.213984288774914, "grad_norm": 0.0689259022474289, "kl": 0.8204897046089172, "learning_rate": 9.98886278971043e-05, "loss": 0.0004, "reward": 0.9460937976837158, "reward_std": 0.2938968911767006, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.19609375298023224, "step": 824 }, { "completion_length": 850.046875, "epoch": 0.21424397844575732, "grad_norm": 0.04462024196982384, "kl": 0.8871856182813644, "learning_rate": 9.988835552867404e-05, "loss": 0.0004, "reward": 1.0378906726837158, "reward_std": 0.10249064117670059, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.19414062798023224, "step": 825 }, { "completion_length": 809.984375, "epoch": 0.21450366811660065, "grad_norm": 0.05581647902727127, "kl": 0.96964992582798, "learning_rate": 9.988808282797442e-05, "loss": 0.0005, "reward": 0.9898437932133675, "reward_std": 0.10014688968658447, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.17734375223517418, "step": 826 }, { "completion_length": 850.421875, "epoch": 0.21476335778744401, "grad_norm": 0.050043582916259766, "kl": 0.8215989470481873, "learning_rate": 9.988780979500727e-05, "loss": 0.0004, "reward": 0.8191406726837158, "reward_std": 0.12890625, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.19414062798023224, "step": 827 }, { "completion_length": 830.5625, "epoch": 0.21502304745828735, "grad_norm": 0.04089108854532242, "kl": 0.8327604383230209, "learning_rate": 9.98875364297744e-05, "loss": 0.0004, "reward": 1.0281250476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 828 }, { "completion_length": 762.140625, "epoch": 0.21528273712913068, "grad_norm": 0.05130963772535324, "kl": 0.8309207260608673, "learning_rate": 9.988726273227763e-05, "loss": 0.0004, "reward": 0.9187500476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 829 }, { "completion_length": 718.28125, "epoch": 0.21554242679997404, "grad_norm": 0.05304638668894768, "kl": 1.014786809682846, "learning_rate": 9.98869887025188e-05, "loss": 0.0005, "reward": 0.9796875417232513, "reward_std": 0.03305422142148018, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.18281250074505806, "step": 830 }, { "completion_length": 720.375, "epoch": 0.21580211647081737, "grad_norm": 0.030225859954953194, "kl": 1.0718477666378021, "learning_rate": 9.98867143404997e-05, "loss": 0.0005, "reward": 0.9531250447034836, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 831 }, { "completion_length": 743.296875, "epoch": 0.2160618061416607, "grad_norm": 0.0889982134103775, "kl": 1.1280459016561508, "learning_rate": 9.988643964622219e-05, "loss": 0.0006, "reward": 0.9386719167232513, "reward_std": 0.16186564118834212, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.18867187574505806, "step": 832 }, { "completion_length": 755.484375, "epoch": 0.21632149581250407, "grad_norm": 0.04591536894440651, "kl": 0.8120881170034409, "learning_rate": 9.988616461968808e-05, "loss": 0.0004, "reward": 0.8855469226837158, "reward_std": 0.10009414702653885, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 833 }, { "completion_length": 732.421875, "epoch": 0.2165811854833474, "grad_norm": 0.07334186881780624, "kl": 0.9053855240345001, "learning_rate": 9.988588926089921e-05, "loss": 0.0005, "reward": 0.9175781756639481, "reward_std": 0.22592813521623611, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.19882812723517418, "step": 834 }, { "completion_length": 667.328125, "epoch": 0.21684087515419073, "grad_norm": 0.0713491439819336, "kl": 0.8569829910993576, "learning_rate": 9.988561356985742e-05, "loss": 0.0004, "reward": 0.7644531726837158, "reward_std": 0.1962406411767006, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.18632812798023224, "step": 835 }, { "completion_length": 676.640625, "epoch": 0.2171005648250341, "grad_norm": 0.06346412748098373, "kl": 1.3047029376029968, "learning_rate": 9.988533754656452e-05, "loss": 0.0007, "reward": 0.8796875476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 836 }, { "completion_length": 636.734375, "epoch": 0.21736025449587743, "grad_norm": 0.060422398149967194, "kl": 0.9906859248876572, "learning_rate": 9.988506119102239e-05, "loss": 0.0005, "reward": 0.9812500476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 837 }, { "completion_length": 616.640625, "epoch": 0.21761994416672076, "grad_norm": 0.10184997320175171, "kl": 1.2975164651870728, "learning_rate": 9.988478450323284e-05, "loss": 0.0006, "reward": 0.7507812976837158, "reward_std": 0.16198650002479553, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.18828125298023224, "step": 838 }, { "completion_length": 559.234375, "epoch": 0.21787963383756412, "grad_norm": 0.07570101320743561, "kl": 1.1353094577789307, "learning_rate": 9.988450748319773e-05, "loss": 0.0006, "reward": 0.8023438006639481, "reward_std": 0.17405826970934868, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.19296875223517418, "step": 839 }, { "completion_length": 596.53125, "epoch": 0.21813932350840745, "grad_norm": 0.06789427995681763, "kl": 1.1043591499328613, "learning_rate": 9.988423013091887e-05, "loss": 0.0006, "reward": 0.9500000476837158, "reward_std": 0.1875, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 840 }, { "completion_length": 584.25, "epoch": 0.21839901317925078, "grad_norm": 0.08796289563179016, "kl": 0.9591999500989914, "learning_rate": 9.988395244639816e-05, "loss": 0.0005, "reward": 0.9031250476837158, "reward_std": 0.25483438372612, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 841 }, { "completion_length": 564.28125, "epoch": 0.21865870285009414, "grad_norm": 0.06822584569454193, "kl": 1.0397659540176392, "learning_rate": 9.988367442963741e-05, "loss": 0.0005, "reward": 1.0125000476837158, "reward_std": 0.17075317353010178, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 842 }, { "completion_length": 556.9375, "epoch": 0.21891839252093748, "grad_norm": 0.08109904080629349, "kl": 0.9311821162700653, "learning_rate": 9.98833960806385e-05, "loss": 0.0005, "reward": 0.7625000476837158, "reward_std": 0.2596687823534012, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 843 }, { "completion_length": 475.6875, "epoch": 0.2191780821917808, "grad_norm": 0.0787825658917427, "kl": 1.3753817975521088, "learning_rate": 9.988311739940325e-05, "loss": 0.0007, "reward": 1.0593750476837158, "reward_std": 0.15625, "rewards/spct_argmax_reward_func": 0.859375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 844 }, { "completion_length": 471.0625, "epoch": 0.21943777186262417, "grad_norm": 0.09212817251682281, "kl": 1.200907975435257, "learning_rate": 9.988283838593353e-05, "loss": 0.0006, "reward": 0.8843750506639481, "reward_std": 0.21046650409698486, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19687500223517418, "step": 845 }, { "completion_length": 466.96875, "epoch": 0.2196974615334675, "grad_norm": 0.12594513595104218, "kl": 1.5003891289234161, "learning_rate": 9.988255904023123e-05, "loss": 0.0008, "reward": 0.9066406786441803, "reward_std": 0.19795003160834312, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.18789063021540642, "step": 846 }, { "completion_length": 436.515625, "epoch": 0.21995715120431084, "grad_norm": 0.07886477559804916, "kl": 1.4726166129112244, "learning_rate": 9.988227936229816e-05, "loss": 0.0007, "reward": 0.7941406518220901, "reward_std": 0.15914637595415115, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.18476562947034836, "step": 847 }, { "completion_length": 402.09375, "epoch": 0.2202168408751542, "grad_norm": 0.0778772383928299, "kl": 1.3760251700878143, "learning_rate": 9.98819993521362e-05, "loss": 0.0007, "reward": 0.8609375357627869, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.18906250223517418, "step": 848 }, { "completion_length": 454.328125, "epoch": 0.22047653054599753, "grad_norm": 0.0753503367304802, "kl": 1.0629336535930634, "learning_rate": 9.988171900974724e-05, "loss": 0.0005, "reward": 0.9500000476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 849 }, { "completion_length": 392.125, "epoch": 0.22073622021684086, "grad_norm": 0.08203816413879395, "kl": 1.119798555970192, "learning_rate": 9.988143833513313e-05, "loss": 0.0006, "reward": 0.9734375476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 850 }, { "completion_length": 419.484375, "epoch": 0.22099590988768422, "grad_norm": 0.055000972002744675, "kl": 1.6160969585180283, "learning_rate": 9.988115732829572e-05, "loss": 0.0008, "reward": 0.8562500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 851 }, { "completion_length": 440.078125, "epoch": 0.22125559955852755, "grad_norm": 0.22391332685947418, "kl": 2.393326073884964, "learning_rate": 9.988087598923692e-05, "loss": 0.0012, "reward": 0.9187500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 852 }, { "completion_length": 429.265625, "epoch": 0.22151528922937092, "grad_norm": 0.0609026774764061, "kl": 1.158134549856186, "learning_rate": 9.988059431795855e-05, "loss": 0.0006, "reward": 1.0125000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 853 }, { "completion_length": 420.546875, "epoch": 0.22177497890021425, "grad_norm": 0.05137345939874649, "kl": 1.1330443322658539, "learning_rate": 9.988031231446255e-05, "loss": 0.0006, "reward": 1.0125000476837158, "reward_std": 0.07216878235340118, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 854 }, { "completion_length": 413.171875, "epoch": 0.22203466857105758, "grad_norm": 0.1220623180270195, "kl": 1.254941314458847, "learning_rate": 9.988002997875075e-05, "loss": 0.0006, "reward": 0.8125000447034836, "reward_std": 0.26933756470680237, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 855 }, { "completion_length": 405.203125, "epoch": 0.22229435824190094, "grad_norm": 0.07701337337493896, "kl": 1.5330279171466827, "learning_rate": 9.987974731082505e-05, "loss": 0.0008, "reward": 0.9894531667232513, "reward_std": 0.09453124925494194, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.19257812574505806, "step": 856 }, { "completion_length": 420.109375, "epoch": 0.22255404791274427, "grad_norm": 0.0867229476571083, "kl": 1.3651592433452606, "learning_rate": 9.987946431068733e-05, "loss": 0.0007, "reward": 1.0125000476837158, "reward_std": 0.17075317353010178, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 857 }, { "completion_length": 393.09375, "epoch": 0.2228137375835876, "grad_norm": 0.07171875983476639, "kl": 0.9458460211753845, "learning_rate": 9.987918097833948e-05, "loss": 0.0005, "reward": 0.9968750476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 858 }, { "completion_length": 417.96875, "epoch": 0.22307342725443097, "grad_norm": 0.06187170743942261, "kl": 0.9453712701797485, "learning_rate": 9.98788973137834e-05, "loss": 0.0005, "reward": 0.9031250476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 859 }, { "completion_length": 404.765625, "epoch": 0.2233331169252743, "grad_norm": 0.07691448926925659, "kl": 1.0710168927907944, "learning_rate": 9.987861331702093e-05, "loss": 0.0005, "reward": 1.0125000476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 860 }, { "completion_length": 379.25, "epoch": 0.22359280659611763, "grad_norm": 0.06033973768353462, "kl": 0.8071573972702026, "learning_rate": 9.9878328988054e-05, "loss": 0.0004, "reward": 0.8906250447034836, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 861 }, { "completion_length": 394.984375, "epoch": 0.223852496266961, "grad_norm": 0.07350073009729385, "kl": 0.8434636741876602, "learning_rate": 9.98780443268845e-05, "loss": 0.0004, "reward": 0.8406250402331352, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 862 }, { "completion_length": 423.5625, "epoch": 0.22411218593780433, "grad_norm": 0.07320116460323334, "kl": 0.7830694913864136, "learning_rate": 9.987775933351433e-05, "loss": 0.0004, "reward": 0.9617187976837158, "reward_std": 0.16872268170118332, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.19609375298023224, "step": 863 }, { "completion_length": 386.15625, "epoch": 0.22437187560864766, "grad_norm": 0.05802304297685623, "kl": 0.8026551902294159, "learning_rate": 9.987747400794535e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 864 }, { "completion_length": 397.9375, "epoch": 0.22463156527949102, "grad_norm": 0.07090666890144348, "kl": 0.7920604944229126, "learning_rate": 9.987718835017953e-05, "loss": 0.0004, "reward": 0.9500000476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 865 }, { "completion_length": 381.875, "epoch": 0.22489125495033435, "grad_norm": 0.07112825661897659, "kl": 0.8210591971874237, "learning_rate": 9.987690236021872e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.20683756470680237, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 866 }, { "completion_length": 358.96875, "epoch": 0.22515094462117768, "grad_norm": 0.07431510090827942, "kl": 0.8372532874345779, "learning_rate": 9.987661603806483e-05, "loss": 0.0004, "reward": 1.1062500476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.90625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 867 }, { "completion_length": 371.640625, "epoch": 0.22541063429202104, "grad_norm": 0.08824656903743744, "kl": 0.8814414888620377, "learning_rate": 9.987632938371978e-05, "loss": 0.0004, "reward": 0.9343750476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 868 }, { "completion_length": 398.734375, "epoch": 0.22567032396286438, "grad_norm": 0.07051156461238861, "kl": 0.7518124878406525, "learning_rate": 9.987604239718547e-05, "loss": 0.0004, "reward": 0.9000000506639481, "reward_std": 0.1721687838435173, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.19687500223517418, "step": 869 }, { "completion_length": 439.84375, "epoch": 0.2259300136337077, "grad_norm": 0.05916638299822807, "kl": 0.6854144036769867, "learning_rate": 9.987575507846383e-05, "loss": 0.0003, "reward": 0.9968750476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 870 }, { "completion_length": 425.578125, "epoch": 0.22618970330455107, "grad_norm": 0.07724619656801224, "kl": 0.7319165170192719, "learning_rate": 9.987546742755675e-05, "loss": 0.0004, "reward": 0.9949219226837158, "reward_std": 0.1962406411767006, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 871 }, { "completion_length": 416.03125, "epoch": 0.2264493929753944, "grad_norm": 0.07833908498287201, "kl": 0.8197485953569412, "learning_rate": 9.987517944446615e-05, "loss": 0.0004, "reward": 0.9289062917232513, "reward_std": 0.16044798493385315, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.19453125074505806, "step": 872 }, { "completion_length": 444.78125, "epoch": 0.22670908264623774, "grad_norm": 0.04382457956671715, "kl": 0.7141479402780533, "learning_rate": 9.987489112919395e-05, "loss": 0.0004, "reward": 0.9968750476837158, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 873 }, { "completion_length": 423.640625, "epoch": 0.2269687723170811, "grad_norm": 0.05575844272971153, "kl": 0.8100074231624603, "learning_rate": 9.987460248174208e-05, "loss": 0.0004, "reward": 0.8808594197034836, "reward_std": 0.06399597972631454, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.1933593787252903, "step": 874 }, { "completion_length": 427.5, "epoch": 0.22722846198792443, "grad_norm": 0.047554317861795425, "kl": 0.7705429792404175, "learning_rate": 9.987431350211247e-05, "loss": 0.0004, "reward": 0.9500000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 875 }, { "completion_length": 457.03125, "epoch": 0.22748815165876776, "grad_norm": 0.047957099974155426, "kl": 0.7502092719078064, "learning_rate": 9.987402419030701e-05, "loss": 0.0004, "reward": 0.9343750476837158, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 876 }, { "completion_length": 405.4375, "epoch": 0.22774784132961112, "grad_norm": 0.08301492780447006, "kl": 0.813975378870964, "learning_rate": 9.987373454632766e-05, "loss": 0.0004, "reward": 0.7859375476837158, "reward_std": 0.2235843911767006, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 877 }, { "completion_length": 484.9375, "epoch": 0.22800753100045446, "grad_norm": 0.04040653258562088, "kl": 0.712346538901329, "learning_rate": 9.987344457017634e-05, "loss": 0.0004, "reward": 0.9500000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 878 }, { "completion_length": 420.03125, "epoch": 0.2282672206712978, "grad_norm": 0.06061885133385658, "kl": 0.7385005950927734, "learning_rate": 9.987315426185496e-05, "loss": 0.0004, "reward": 0.9656250476837158, "reward_std": 0.13950317353010178, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 879 }, { "completion_length": 463.28125, "epoch": 0.22852691034214115, "grad_norm": 0.06717319786548615, "kl": 0.682990089058876, "learning_rate": 9.987286362136547e-05, "loss": 0.0003, "reward": 0.8562500476837158, "reward_std": 0.1875, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 880 }, { "completion_length": 462.046875, "epoch": 0.22878660001298448, "grad_norm": 0.09012453258037567, "kl": 0.829987034201622, "learning_rate": 9.987257264870983e-05, "loss": 0.0004, "reward": 0.8402344137430191, "reward_std": 0.2367817759513855, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.18398437649011612, "step": 881 }, { "completion_length": 558.734375, "epoch": 0.22904628968382781, "grad_norm": 0.0572986975312233, "kl": 0.6862163245677948, "learning_rate": 9.987228134388994e-05, "loss": 0.0003, "reward": 0.8378906697034836, "reward_std": 0.19780314713716507, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.197265625, "step": 882 }, { "completion_length": 486.1875, "epoch": 0.22930597935467117, "grad_norm": 0.061884764581918716, "kl": 0.7181236445903778, "learning_rate": 9.987198970690777e-05, "loss": 0.0004, "reward": 0.9910156726837158, "reward_std": 0.1962406411767006, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.19414062798023224, "step": 883 }, { "completion_length": 524.640625, "epoch": 0.2295656690255145, "grad_norm": 0.04365864768624306, "kl": 0.746274009346962, "learning_rate": 9.987169773776522e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 884 }, { "completion_length": 519.84375, "epoch": 0.22982535869635784, "grad_norm": 0.06055955961346626, "kl": 0.7349700331687927, "learning_rate": 9.987140543646429e-05, "loss": 0.0004, "reward": 0.9636719226837158, "reward_std": 0.2323250249028206, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 885 }, { "completion_length": 559.90625, "epoch": 0.2300850483672012, "grad_norm": 0.05924634635448456, "kl": 0.625357985496521, "learning_rate": 9.987111280300688e-05, "loss": 0.0003, "reward": 0.8496094197034836, "reward_std": 0.2280418798327446, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.193359375, "step": 886 }, { "completion_length": 593.546875, "epoch": 0.23034473803804453, "grad_norm": 0.02601754665374756, "kl": 0.6078949570655823, "learning_rate": 9.987081983739497e-05, "loss": 0.0003, "reward": 0.9628906846046448, "reward_std": 0.07280314341187477, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.1972656287252903, "step": 887 }, { "completion_length": 601.734375, "epoch": 0.23060442770888787, "grad_norm": 0.04650099202990532, "kl": 0.6060713976621628, "learning_rate": 9.987052653963049e-05, "loss": 0.0003, "reward": 0.9343750476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 888 }, { "completion_length": 675.953125, "epoch": 0.23086411737973123, "grad_norm": 0.04992023482918739, "kl": 0.56981560587883, "learning_rate": 9.987023290971542e-05, "loss": 0.0003, "reward": 0.9757813066244125, "reward_std": 0.20810628309845924, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.19453125074505806, "step": 889 }, { "completion_length": 553.46875, "epoch": 0.23112380705057456, "grad_norm": 0.06599405407905579, "kl": 0.6908873915672302, "learning_rate": 9.986993894765167e-05, "loss": 0.0003, "reward": 0.8644531667232513, "reward_std": 0.22920002788305283, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.19257812574505806, "step": 890 }, { "completion_length": 539.859375, "epoch": 0.23138349672141792, "grad_norm": 0.05704854801297188, "kl": 0.6367550790309906, "learning_rate": 9.986964465344124e-05, "loss": 0.0003, "reward": 0.9968750476837158, "reward_std": 0.1923343911767006, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 891 }, { "completion_length": 579.8125, "epoch": 0.23164318639226125, "grad_norm": 0.07381540536880493, "kl": 0.5375368148088455, "learning_rate": 9.986935002708608e-05, "loss": 0.0003, "reward": 0.6585937887430191, "reward_std": 0.3052750639617443, "rewards/spct_argmax_reward_func": 0.484375, "rewards/spct_format_reward_func": 0.17421875521540642, "step": 892 }, { "completion_length": 634.5625, "epoch": 0.23190287606310458, "grad_norm": 0.05110936239361763, "kl": 0.5724217891693115, "learning_rate": 9.986905506858815e-05, "loss": 0.0003, "reward": 0.9316406846046448, "reward_std": 0.23388753458857536, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.197265625, "step": 893 }, { "completion_length": 585.734375, "epoch": 0.23216256573394795, "grad_norm": 0.06550898402929306, "kl": 0.6057186126708984, "learning_rate": 9.98687597779494e-05, "loss": 0.0003, "reward": 0.7937500476837158, "reward_std": 0.25, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 894 }, { "completion_length": 582.984375, "epoch": 0.23242225540479128, "grad_norm": 0.05248439311981201, "kl": 0.629508450627327, "learning_rate": 9.986846415517183e-05, "loss": 0.0003, "reward": 0.8718750476837158, "reward_std": 0.17558756470680237, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 895 }, { "completion_length": 600.390625, "epoch": 0.2326819450756346, "grad_norm": 0.05395275726914406, "kl": 0.6234866231679916, "learning_rate": 9.986816820025738e-05, "loss": 0.0003, "reward": 1.0593750476837158, "reward_std": 0.1923343911767006, "rewards/spct_argmax_reward_func": 0.859375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 896 }, { "completion_length": 624.5625, "epoch": 0.23294163474647797, "grad_norm": 0.06124822795391083, "kl": 0.5698778927326202, "learning_rate": 9.986787191320803e-05, "loss": 0.0003, "reward": 0.8875000476837158, "reward_std": 0.2235843911767006, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 897 }, { "completion_length": 583.15625, "epoch": 0.2332013244173213, "grad_norm": 0.07163985818624496, "kl": 0.6411216109991074, "learning_rate": 9.986757529402575e-05, "loss": 0.0003, "reward": 0.7156250402331352, "reward_std": 0.30058756470680237, "rewards/spct_argmax_reward_func": 0.515625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 898 }, { "completion_length": 603.421875, "epoch": 0.23346101408816464, "grad_norm": 0.03929058834910393, "kl": 0.6594471633434296, "learning_rate": 9.986727834271252e-05, "loss": 0.0003, "reward": 0.9031250476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 899 }, { "completion_length": 567.015625, "epoch": 0.233720703759008, "grad_norm": 0.06886397302150726, "kl": 0.6860673725605011, "learning_rate": 9.986698105927033e-05, "loss": 0.0003, "reward": 0.8046875447034836, "reward_std": 0.16715743544045836, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.1796875037252903, "step": 900 }, { "completion_length": 584.546875, "epoch": 0.23398039342985133, "grad_norm": 0.0446842759847641, "kl": 0.6568848341703415, "learning_rate": 9.986668344370112e-05, "loss": 0.0003, "reward": 0.9644531756639481, "reward_std": 0.15859375149011612, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.19882812723517418, "step": 901 }, { "completion_length": 606.015625, "epoch": 0.23424008310069466, "grad_norm": 0.05964631959795952, "kl": 0.6543751806020737, "learning_rate": 9.986638549600691e-05, "loss": 0.0003, "reward": 0.9031250476837158, "reward_std": 0.2284187749028206, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 902 }, { "completion_length": 579.671875, "epoch": 0.23449977277153802, "grad_norm": 0.04008655250072479, "kl": 0.656440943479538, "learning_rate": 9.986608721618968e-05, "loss": 0.0003, "reward": 1.1199219226837158, "reward_std": 0.10732503235340118, "rewards/spct_argmax_reward_func": 0.921875, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 903 }, { "completion_length": 597.46875, "epoch": 0.23475946244238136, "grad_norm": 0.0356593057513237, "kl": 0.6634125709533691, "learning_rate": 9.98657886042514e-05, "loss": 0.0003, "reward": 0.9968750476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 904 }, { "completion_length": 598.765625, "epoch": 0.2350191521132247, "grad_norm": 0.055319856852293015, "kl": 0.6072937548160553, "learning_rate": 9.986548966019409e-05, "loss": 0.0003, "reward": 0.8250000476837158, "reward_std": 0.2596687823534012, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 905 }, { "completion_length": 599.515625, "epoch": 0.23527884178406805, "grad_norm": 0.04011720046401024, "kl": 0.6394297778606415, "learning_rate": 9.986519038401969e-05, "loss": 0.0003, "reward": 0.8093750476837158, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.609375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 906 }, { "completion_length": 539.6875, "epoch": 0.23553853145491138, "grad_norm": 0.06181870400905609, "kl": 0.6680472940206528, "learning_rate": 9.986489077573024e-05, "loss": 0.0003, "reward": 0.9355469197034836, "reward_std": 0.16655313374940306, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.185546875, "step": 907 }, { "completion_length": 592.359375, "epoch": 0.23579822112575471, "grad_norm": 0.06378273665904999, "kl": 0.6444672793149948, "learning_rate": 9.98645908353277e-05, "loss": 0.0003, "reward": 0.9968750476837158, "reward_std": 0.2645031735301018, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 908 }, { "completion_length": 565.8125, "epoch": 0.23605791079659807, "grad_norm": 0.04463539645075798, "kl": 0.6712350100278854, "learning_rate": 9.986429056281412e-05, "loss": 0.0003, "reward": 0.9031250402331352, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 909 }, { "completion_length": 514.5, "epoch": 0.2363176004674414, "grad_norm": 0.05415711924433708, "kl": 0.7040075212717056, "learning_rate": 9.986398995819143e-05, "loss": 0.0004, "reward": 1.0593750476837158, "reward_std": 0.20200317353010178, "rewards/spct_argmax_reward_func": 0.859375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 910 }, { "completion_length": 538.8125, "epoch": 0.23657729013828474, "grad_norm": 0.06031584367156029, "kl": 0.7156934142112732, "learning_rate": 9.98636890214617e-05, "loss": 0.0004, "reward": 0.9019531607627869, "reward_std": 0.2571781426668167, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.19882812723517418, "step": 911 }, { "completion_length": 504.578125, "epoch": 0.2368369798091281, "grad_norm": 0.053955864161252975, "kl": 0.7205349802970886, "learning_rate": 9.98633877526269e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 912 }, { "completion_length": 466.671875, "epoch": 0.23709666947997143, "grad_norm": 0.04504476115107536, "kl": 0.7839747071266174, "learning_rate": 9.986308615168902e-05, "loss": 0.0004, "reward": 0.7859375476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 913 }, { "completion_length": 487.03125, "epoch": 0.23735635915081477, "grad_norm": 0.03364112228155136, "kl": 0.7745155841112137, "learning_rate": 9.986278421865009e-05, "loss": 0.0004, "reward": 0.9187500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 914 }, { "completion_length": 439.515625, "epoch": 0.23761604882165813, "grad_norm": 0.062265317887067795, "kl": 0.7984298914670944, "learning_rate": 9.986248195351212e-05, "loss": 0.0004, "reward": 1.0718750357627869, "reward_std": 0.1673343926668167, "rewards/spct_argmax_reward_func": 0.875, "rewards/spct_format_reward_func": 0.19687500223517418, "step": 915 }, { "completion_length": 415.875, "epoch": 0.23787573849250146, "grad_norm": 0.030595242977142334, "kl": 0.885298103094101, "learning_rate": 9.986217935627713e-05, "loss": 0.0004, "reward": 1.1218750476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.921875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 916 }, { "completion_length": 428.859375, "epoch": 0.2381354281633448, "grad_norm": 0.07794055342674255, "kl": 0.8455468714237213, "learning_rate": 9.986187642694711e-05, "loss": 0.0004, "reward": 0.9097656607627869, "reward_std": 0.19866476207971573, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.19101562723517418, "step": 917 }, { "completion_length": 422.546875, "epoch": 0.23839511783418815, "grad_norm": 0.056280165910720825, "kl": 0.8260038048028946, "learning_rate": 9.986157316552411e-05, "loss": 0.0004, "reward": 0.9968750476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 918 }, { "completion_length": 410.875, "epoch": 0.23865480750503149, "grad_norm": 0.06919897347688675, "kl": 0.8707011193037033, "learning_rate": 9.986126957201013e-05, "loss": 0.0004, "reward": 0.9031250476837158, "reward_std": 0.20200317353010178, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 919 }, { "completion_length": 369.65625, "epoch": 0.23891449717587482, "grad_norm": 0.06353513151407242, "kl": 1.0242639780044556, "learning_rate": 9.98609656464072e-05, "loss": 0.0005, "reward": 0.9656250476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 920 }, { "completion_length": 361.171875, "epoch": 0.23917418684671818, "grad_norm": 0.04419031739234924, "kl": 0.9921361207962036, "learning_rate": 9.986066138871734e-05, "loss": 0.0005, "reward": 1.0906250476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.890625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 921 }, { "completion_length": 370.609375, "epoch": 0.2394338765175615, "grad_norm": 0.08232284337282181, "kl": 0.8838884979486465, "learning_rate": 9.986035679894257e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.1875, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 922 }, { "completion_length": 353.28125, "epoch": 0.23969356618840484, "grad_norm": 0.06610875576734543, "kl": 0.9876205027103424, "learning_rate": 9.986005187708494e-05, "loss": 0.0005, "reward": 0.9187500476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 923 }, { "completion_length": 330.765625, "epoch": 0.2399532558592482, "grad_norm": 0.03672422468662262, "kl": 1.0350169390439987, "learning_rate": 9.985974662314645e-05, "loss": 0.0005, "reward": 0.9968750476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 924 }, { "completion_length": 352.15625, "epoch": 0.24021294553009154, "grad_norm": 0.0797300860285759, "kl": 1.1590862274169922, "learning_rate": 9.985944103712916e-05, "loss": 0.0006, "reward": 0.9636719226837158, "reward_std": 0.16015625, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 925 }, { "completion_length": 346.84375, "epoch": 0.24047263520093487, "grad_norm": 0.05154603347182274, "kl": 1.062563642859459, "learning_rate": 9.985913511903508e-05, "loss": 0.0005, "reward": 0.9734375476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 926 }, { "completion_length": 347.828125, "epoch": 0.24073232487177823, "grad_norm": 0.11056160926818848, "kl": 2.4112077057361603, "learning_rate": 9.985882886886629e-05, "loss": 0.0012, "reward": 0.7312500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.53125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 927 }, { "completion_length": 330.765625, "epoch": 0.24099201454262156, "grad_norm": 0.06411836296319962, "kl": 1.412125825881958, "learning_rate": 9.985852228662477e-05, "loss": 0.0007, "reward": 1.0125000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 928 }, { "completion_length": 324.3125, "epoch": 0.2412517042134649, "grad_norm": 0.06403446197509766, "kl": 1.2930708229541779, "learning_rate": 9.985821537231263e-05, "loss": 0.0006, "reward": 0.8406250402331352, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 929 }, { "completion_length": 331.171875, "epoch": 0.24151139388430826, "grad_norm": 0.0658496543765068, "kl": 1.111285150051117, "learning_rate": 9.985790812593185e-05, "loss": 0.0006, "reward": 0.8562500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 930 }, { "completion_length": 316.875, "epoch": 0.2417710835551516, "grad_norm": 0.05991911515593529, "kl": 1.059190034866333, "learning_rate": 9.985760054748452e-05, "loss": 0.0005, "reward": 0.9500000476837158, "reward_std": 0.07216878235340118, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 931 }, { "completion_length": 315.03125, "epoch": 0.24203077322599495, "grad_norm": 0.06319495290517807, "kl": 1.030646413564682, "learning_rate": 9.985729263697267e-05, "loss": 0.0005, "reward": 0.9187500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 932 }, { "completion_length": 312.3125, "epoch": 0.24229046289683828, "grad_norm": 0.10859642922878265, "kl": 1.507145255804062, "learning_rate": 9.985698439439834e-05, "loss": 0.0008, "reward": 0.8562500402331352, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 933 }, { "completion_length": 323.84375, "epoch": 0.24255015256768162, "grad_norm": 0.06264462321996689, "kl": 1.0019366890192032, "learning_rate": 9.985667581976361e-05, "loss": 0.0005, "reward": 0.8167969286441803, "reward_std": 0.06328125298023224, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.19179688021540642, "step": 934 }, { "completion_length": 318.953125, "epoch": 0.24280984223852498, "grad_norm": 0.062133025377988815, "kl": 1.6748202443122864, "learning_rate": 9.985636691307052e-05, "loss": 0.0008, "reward": 0.8562500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 935 }, { "completion_length": 341.953125, "epoch": 0.2430695319093683, "grad_norm": 0.035676512867212296, "kl": 0.9605665355920792, "learning_rate": 9.985605767432113e-05, "loss": 0.0005, "reward": 0.9187500402331352, "reward_std": 0.03608439117670059, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 936 }, { "completion_length": 322.6875, "epoch": 0.24332922158021164, "grad_norm": 0.04378361999988556, "kl": 1.0507690459489822, "learning_rate": 9.98557481035175e-05, "loss": 0.0005, "reward": 0.8406250476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 937 }, { "completion_length": 337.609375, "epoch": 0.243588911251055, "grad_norm": 0.03926045075058937, "kl": 0.9297673553228378, "learning_rate": 9.98554382006617e-05, "loss": 0.0005, "reward": 0.9968750476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 938 }, { "completion_length": 339.484375, "epoch": 0.24384860092189833, "grad_norm": 0.08697099983692169, "kl": 0.9226114749908447, "learning_rate": 9.985512796575577e-05, "loss": 0.0005, "reward": 0.9324219226837158, "reward_std": 0.16015625, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 939 }, { "completion_length": 330.65625, "epoch": 0.24410829059274167, "grad_norm": 0.077800452709198, "kl": 1.5882467329502106, "learning_rate": 9.98548173988018e-05, "loss": 0.0008, "reward": 1.0414063036441803, "reward_std": 0.1627212129533291, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.19765625149011612, "step": 940 }, { "completion_length": 323.09375, "epoch": 0.24436798026358503, "grad_norm": 0.08307567983865738, "kl": 1.0374696552753448, "learning_rate": 9.985450649980184e-05, "loss": 0.0005, "reward": 0.9812500476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 941 }, { "completion_length": 339.046875, "epoch": 0.24462766993442836, "grad_norm": 0.06999646872282028, "kl": 1.7487813532352448, "learning_rate": 9.985419526875797e-05, "loss": 0.0009, "reward": 1.0125000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 942 }, { "completion_length": 334.40625, "epoch": 0.2448873596052717, "grad_norm": 0.0703369677066803, "kl": 0.9533181041479111, "learning_rate": 9.985388370567229e-05, "loss": 0.0005, "reward": 1.0125000476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 943 }, { "completion_length": 352.0, "epoch": 0.24514704927611505, "grad_norm": 0.06985925883054733, "kl": 0.8818661570549011, "learning_rate": 9.985357181054682e-05, "loss": 0.0004, "reward": 1.0281250476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 944 }, { "completion_length": 370.53125, "epoch": 0.24540673894695839, "grad_norm": 0.053365934640169144, "kl": 0.776655837893486, "learning_rate": 9.985325958338367e-05, "loss": 0.0004, "reward": 0.8875000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 945 }, { "completion_length": 378.90625, "epoch": 0.24566642861780172, "grad_norm": 0.050779491662979126, "kl": 0.8660678714513779, "learning_rate": 9.985294702418492e-05, "loss": 0.0004, "reward": 0.8718750476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 946 }, { "completion_length": 363.65625, "epoch": 0.24592611828864508, "grad_norm": 0.061090629547834396, "kl": 0.7927825003862381, "learning_rate": 9.985263413295266e-05, "loss": 0.0004, "reward": 0.9031250476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 947 }, { "completion_length": 348.265625, "epoch": 0.2461858079594884, "grad_norm": 0.05980953201651573, "kl": 0.7748057693243027, "learning_rate": 9.985232090968892e-05, "loss": 0.0004, "reward": 1.1375000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.9375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 948 }, { "completion_length": 363.0, "epoch": 0.24644549763033174, "grad_norm": 0.05339962989091873, "kl": 0.8377022594213486, "learning_rate": 9.985200735439587e-05, "loss": 0.0004, "reward": 1.0906250476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.890625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 949 }, { "completion_length": 359.140625, "epoch": 0.2467051873011751, "grad_norm": 0.07867810875177383, "kl": 0.9293169230222702, "learning_rate": 9.985169346707553e-05, "loss": 0.0005, "reward": 0.9343750476837158, "reward_std": 0.20200317353010178, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 950 }, { "completion_length": 354.140625, "epoch": 0.24696487697201844, "grad_norm": 0.06304681301116943, "kl": 0.7391790002584457, "learning_rate": 9.985137924773003e-05, "loss": 0.0004, "reward": 1.0125000476837158, "reward_std": 0.14433756470680237, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 951 }, { "completion_length": 363.3125, "epoch": 0.24722456664286177, "grad_norm": 0.0744611844420433, "kl": 0.7458595484495163, "learning_rate": 9.985106469636143e-05, "loss": 0.0004, "reward": 0.8562500402331352, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 952 }, { "completion_length": 353.203125, "epoch": 0.24748425631370513, "grad_norm": 0.04995151609182358, "kl": 0.8119764477014542, "learning_rate": 9.985074981297187e-05, "loss": 0.0004, "reward": 0.9187500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 953 }, { "completion_length": 350.609375, "epoch": 0.24774394598454846, "grad_norm": 0.07842548191547394, "kl": 0.798823669552803, "learning_rate": 9.98504345975634e-05, "loss": 0.0004, "reward": 0.9187500476837158, "reward_std": 0.23325317353010178, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 954 }, { "completion_length": 364.75, "epoch": 0.2480036356553918, "grad_norm": 0.07057605683803558, "kl": 0.8757713139057159, "learning_rate": 9.985011905013814e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 955 }, { "completion_length": 342.75, "epoch": 0.24826332532623516, "grad_norm": 0.07441583275794983, "kl": 0.7834143936634064, "learning_rate": 9.98498031706982e-05, "loss": 0.0004, "reward": 0.8718750476837158, "reward_std": 0.15625, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 956 }, { "completion_length": 363.546875, "epoch": 0.2485230149970785, "grad_norm": 0.06848502159118652, "kl": 0.8027542382478714, "learning_rate": 9.984948695924567e-05, "loss": 0.0004, "reward": 0.9656250476837158, "reward_std": 0.20200317353010178, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 957 }, { "completion_length": 350.625, "epoch": 0.24878270466792182, "grad_norm": 0.04308827966451645, "kl": 0.828154519200325, "learning_rate": 9.984917041578268e-05, "loss": 0.0004, "reward": 1.0125000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 958 }, { "completion_length": 360.796875, "epoch": 0.24904239433876518, "grad_norm": 0.04200378805398941, "kl": 0.7134752422571182, "learning_rate": 9.98488535403113e-05, "loss": 0.0004, "reward": 1.0281250476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 959 }, { "completion_length": 346.453125, "epoch": 0.24930208400960852, "grad_norm": 0.05957817658782005, "kl": 0.8180912882089615, "learning_rate": 9.984853633283368e-05, "loss": 0.0004, "reward": 0.9890625476837158, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 960 }, { "completion_length": 363.484375, "epoch": 0.24956177368045185, "grad_norm": 0.0008933711796998978, "kl": 0.846610352396965, "learning_rate": 9.98482187933519e-05, "loss": 0.0004, "reward": 1.0125000402331352, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 961 }, { "completion_length": 360.34375, "epoch": 0.2498214633512952, "grad_norm": 0.4484599232673645, "kl": 6.18966019153595, "learning_rate": 9.984790092186808e-05, "loss": 0.0031, "reward": 0.7156250476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.515625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 962 }, { "completion_length": 357.1875, "epoch": 0.25008115302213857, "grad_norm": 0.06149658188223839, "kl": 0.8170712739229202, "learning_rate": 9.984758271838436e-05, "loss": 0.0004, "reward": 0.8406250476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 963 }, { "completion_length": 371.578125, "epoch": 0.2503408426929819, "grad_norm": 0.04065840691328049, "kl": 0.8211720436811447, "learning_rate": 9.984726418290284e-05, "loss": 0.0004, "reward": 0.8875000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 964 }, { "completion_length": 367.265625, "epoch": 0.25060053236382523, "grad_norm": 0.05395498126745224, "kl": 0.848272979259491, "learning_rate": 9.984694531542565e-05, "loss": 0.0004, "reward": 1.0281250476837158, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 965 }, { "completion_length": 344.046875, "epoch": 0.25086022203466857, "grad_norm": 0.04950331151485443, "kl": 0.9485513269901276, "learning_rate": 9.98466261159549e-05, "loss": 0.0005, "reward": 0.8875000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 966 }, { "completion_length": 331.78125, "epoch": 0.2511199117055119, "grad_norm": 0.06409355252981186, "kl": 1.0163619667291641, "learning_rate": 9.984630658449272e-05, "loss": 0.0005, "reward": 0.9343750476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 967 }, { "completion_length": 353.71875, "epoch": 0.25137960137635523, "grad_norm": 0.07068031281232834, "kl": 0.9211557656526566, "learning_rate": 9.984598672104126e-05, "loss": 0.0005, "reward": 0.7781250476837158, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 968 }, { "completion_length": 348.296875, "epoch": 0.2516392910471986, "grad_norm": 0.0007068797131069005, "kl": 0.8417810499668121, "learning_rate": 9.984566652560263e-05, "loss": 0.0004, "reward": 1.0125000476837158, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 969 }, { "completion_length": 386.125, "epoch": 0.25189898071804195, "grad_norm": 0.04886043816804886, "kl": 0.8136007338762283, "learning_rate": 9.984534599817896e-05, "loss": 0.0004, "reward": 1.1500000357627869, "reward_std": 0.10000000149011612, "rewards/spct_argmax_reward_func": 0.953125, "rewards/spct_format_reward_func": 0.19687500223517418, "step": 970 }, { "completion_length": 343.09375, "epoch": 0.2521586703888853, "grad_norm": 0.07751897722482681, "kl": 0.923665925860405, "learning_rate": 9.98450251387724e-05, "loss": 0.0005, "reward": 0.9500000476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 971 }, { "completion_length": 382.46875, "epoch": 0.2524183600597286, "grad_norm": 0.04958454146981239, "kl": 0.9552142918109894, "learning_rate": 9.984470394738507e-05, "loss": 0.0005, "reward": 0.8296875357627869, "reward_std": 0.10601894184947014, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.18906250223517418, "step": 972 }, { "completion_length": 371.203125, "epoch": 0.25267804973057195, "grad_norm": 0.05592610687017441, "kl": 0.9210823178291321, "learning_rate": 9.984438242401912e-05, "loss": 0.0005, "reward": 0.8875000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 973 }, { "completion_length": 355.375, "epoch": 0.2529377394014153, "grad_norm": 0.04008908569812775, "kl": 0.9099245816469193, "learning_rate": 9.98440605686767e-05, "loss": 0.0005, "reward": 0.9187500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 974 }, { "completion_length": 354.53125, "epoch": 0.2531974290722587, "grad_norm": 0.0009469031938351691, "kl": 0.9815316498279572, "learning_rate": 9.984373838135995e-05, "loss": 0.0005, "reward": 1.1375000476837158, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.9375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 975 }, { "completion_length": 362.796875, "epoch": 0.253457118743102, "grad_norm": 0.06186050921678543, "kl": 0.8455935716629028, "learning_rate": 9.984341586207097e-05, "loss": 0.0004, "reward": 0.9656250476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 976 }, { "completion_length": 371.796875, "epoch": 0.25371680841394534, "grad_norm": 0.07101483643054962, "kl": 0.8656206876039505, "learning_rate": 9.984309301081198e-05, "loss": 0.0004, "reward": 0.9031250476837158, "reward_std": 0.20200317353010178, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 977 }, { "completion_length": 363.03125, "epoch": 0.25397649808478867, "grad_norm": 0.04811088740825653, "kl": 0.9269344508647919, "learning_rate": 9.98427698275851e-05, "loss": 0.0005, "reward": 0.9968750476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 978 }, { "completion_length": 376.765625, "epoch": 0.254236187755632, "grad_norm": 0.0449732206761837, "kl": 0.8902070075273514, "learning_rate": 9.984244631239246e-05, "loss": 0.0004, "reward": 1.0437500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 979 }, { "completion_length": 391.09375, "epoch": 0.25449587742647534, "grad_norm": 0.056904491037130356, "kl": 0.9474226981401443, "learning_rate": 9.984212246523623e-05, "loss": 0.0005, "reward": 0.9726562798023224, "reward_std": 0.13490360602736473, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.19140625, "step": 980 }, { "completion_length": 386.25, "epoch": 0.2547555670973187, "grad_norm": 0.07238207757472992, "kl": 0.9531801789999008, "learning_rate": 9.984179828611859e-05, "loss": 0.0005, "reward": 0.8843750506639481, "reward_std": 0.1673343926668167, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19687500223517418, "step": 981 }, { "completion_length": 330.125, "epoch": 0.25501525676816206, "grad_norm": 0.03866976499557495, "kl": 1.0434903353452682, "learning_rate": 9.984147377504167e-05, "loss": 0.0005, "reward": 1.0937500447034836, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.90625, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 982 }, { "completion_length": 360.25, "epoch": 0.2552749464390054, "grad_norm": 0.04697009548544884, "kl": 0.8409553319215775, "learning_rate": 9.984114893200764e-05, "loss": 0.0004, "reward": 1.0750000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 983 }, { "completion_length": 379.484375, "epoch": 0.2555346361098487, "grad_norm": 0.04498666897416115, "kl": 0.9279154241085052, "learning_rate": 9.984082375701869e-05, "loss": 0.0005, "reward": 1.0046875476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 984 }, { "completion_length": 375.546875, "epoch": 0.25579432578069206, "grad_norm": 0.03322581946849823, "kl": 0.8671629130840302, "learning_rate": 9.984049825007693e-05, "loss": 0.0004, "reward": 0.9343750476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 985 }, { "completion_length": 369.359375, "epoch": 0.2560540154515354, "grad_norm": 0.05514265224337578, "kl": 0.9094283282756805, "learning_rate": 9.984017241118458e-05, "loss": 0.0005, "reward": 0.9968750476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 986 }, { "completion_length": 371.234375, "epoch": 0.2563137051223788, "grad_norm": 0.057087935507297516, "kl": 0.9180173724889755, "learning_rate": 9.983984624034377e-05, "loss": 0.0005, "reward": 0.8562500402331352, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 987 }, { "completion_length": 366.71875, "epoch": 0.2565733947932221, "grad_norm": 0.038174163550138474, "kl": 0.8666670620441437, "learning_rate": 9.98395197375567e-05, "loss": 0.0004, "reward": 1.0750000476837158, "reward_std": 0.07216878235340118, "rewards/spct_argmax_reward_func": 0.875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 988 }, { "completion_length": 366.46875, "epoch": 0.25683308446406544, "grad_norm": 0.08290740847587585, "kl": 0.8436574935913086, "learning_rate": 9.983919290282554e-05, "loss": 0.0004, "reward": 0.7937500476837158, "reward_std": 0.2235843911767006, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 989 }, { "completion_length": 382.9375, "epoch": 0.2570927741349088, "grad_norm": 0.09008123725652695, "kl": 0.8840433955192566, "learning_rate": 9.983886573615246e-05, "loss": 0.0004, "reward": 0.8406250476837158, "reward_std": 0.2645031660795212, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 990 }, { "completion_length": 375.453125, "epoch": 0.2573524638057521, "grad_norm": 0.028972899541258812, "kl": 0.8322771042585373, "learning_rate": 9.983853823753964e-05, "loss": 0.0004, "reward": 0.9968750476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 991 }, { "completion_length": 349.84375, "epoch": 0.2576121534765955, "grad_norm": 0.056736141443252563, "kl": 0.9448690116405487, "learning_rate": 9.983821040698926e-05, "loss": 0.0005, "reward": 1.0437500476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 992 }, { "completion_length": 356.453125, "epoch": 0.25787184314743883, "grad_norm": 0.07005947828292847, "kl": 0.8738595992326736, "learning_rate": 9.983788224450353e-05, "loss": 0.0004, "reward": 0.8562500476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 993 }, { "completion_length": 344.390625, "epoch": 0.25813153281828216, "grad_norm": 0.06259442865848541, "kl": 0.9200872331857681, "learning_rate": 9.983755375008459e-05, "loss": 0.0005, "reward": 0.9187500476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 994 }, { "completion_length": 369.015625, "epoch": 0.2583912224891255, "grad_norm": 0.0516505129635334, "kl": 0.9625106602907181, "learning_rate": 9.983722492373464e-05, "loss": 0.0005, "reward": 0.9500000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 995 }, { "completion_length": 359.84375, "epoch": 0.2586509121599688, "grad_norm": 0.0417354516685009, "kl": 0.9392140805721283, "learning_rate": 9.98368957654559e-05, "loss": 0.0005, "reward": 0.8875000476837158, "reward_std": 0.07216878235340118, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 996 }, { "completion_length": 346.90625, "epoch": 0.25891060183081216, "grad_norm": 0.061104435473680496, "kl": 1.0043726861476898, "learning_rate": 9.983656627525054e-05, "loss": 0.0005, "reward": 0.9460937976837158, "reward_std": 0.10309493914246559, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.19609375298023224, "step": 997 }, { "completion_length": 371.78125, "epoch": 0.25917029150165555, "grad_norm": 0.04057052358984947, "kl": 0.9119392335414886, "learning_rate": 9.983623645312075e-05, "loss": 0.0005, "reward": 0.9343750476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 998 }, { "completion_length": 361.328125, "epoch": 0.2594299811724989, "grad_norm": 0.0006512693362310529, "kl": 0.8665491193532944, "learning_rate": 9.983590629906873e-05, "loss": 0.0004, "reward": 1.0125000476837158, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 999 }, { "completion_length": 362.5, "epoch": 0.2596896708433422, "grad_norm": 0.042585428804159164, "kl": 0.9967220723628998, "learning_rate": 9.983557581309669e-05, "loss": 0.0005, "reward": 1.0386719107627869, "reward_std": 0.06591673195362091, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.19492187723517418, "step": 1000 }, { "epoch": 0.2596896708433422, "eval_completion_length": 359.395, "eval_kl": 0.9441093540191651, "eval_loss": 0.00047411248669959605, "eval_reward": 1.0050000458955766, "eval_reward_std": 0.03, "eval_rewards/spct_argmax_reward_func": 0.805, "eval_rewards/spct_format_reward_func": 0.20000000298023224, "eval_runtime": 153.1976, "eval_samples_per_second": 0.326, "eval_steps_per_second": 0.046, "step": 1000 }, { "completion_length": 383.3125, "epoch": 0.25994936051418555, "grad_norm": 0.03215406835079193, "kl": 0.8682380318641663, "learning_rate": 9.983524499520683e-05, "loss": 0.0004, "reward": 0.7437500432133675, "reward_std": 0.03750000149011612, "rewards/spct_argmax_reward_func": 0.546875, "rewards/spct_format_reward_func": 0.19687500223517418, "step": 1001 }, { "completion_length": 358.0, "epoch": 0.2602090501850289, "grad_norm": 0.07234790921211243, "kl": 1.0744989663362503, "learning_rate": 9.983491384540134e-05, "loss": 0.0005, "reward": 0.9500000476837158, "reward_std": 0.17075317353010178, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1002 }, { "completion_length": 346.65625, "epoch": 0.2604687398558722, "grad_norm": 0.03646721690893173, "kl": 0.9390900731086731, "learning_rate": 9.983458236368243e-05, "loss": 0.0005, "reward": 0.8699219226837158, "reward_std": 0.03515625, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 1003 }, { "completion_length": 352.5, "epoch": 0.2607284295267156, "grad_norm": 0.03949035704135895, "kl": 0.9086542427539825, "learning_rate": 9.983425055005231e-05, "loss": 0.0005, "reward": 0.9500000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1004 }, { "completion_length": 346.078125, "epoch": 0.26098811919755893, "grad_norm": 0.035830091685056686, "kl": 0.9331285357475281, "learning_rate": 9.98339184045132e-05, "loss": 0.0005, "reward": 0.9656250476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1005 }, { "completion_length": 335.984375, "epoch": 0.26124780886840226, "grad_norm": 0.03841542452573776, "kl": 0.9859025329351425, "learning_rate": 9.983358592706729e-05, "loss": 0.0005, "reward": 0.8718750476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1006 }, { "completion_length": 339.5, "epoch": 0.2615074985392456, "grad_norm": 0.03538947179913521, "kl": 1.0466093122959137, "learning_rate": 9.983325311771681e-05, "loss": 0.0005, "reward": 1.0593750476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.859375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1007 }, { "completion_length": 338.46875, "epoch": 0.26176718821008893, "grad_norm": 0.0008254832937382162, "kl": 0.8880230635404587, "learning_rate": 9.983291997646398e-05, "loss": 0.0004, "reward": 1.0125000476837158, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1008 }, { "completion_length": 333.609375, "epoch": 0.26202687788093226, "grad_norm": 0.040478698909282684, "kl": 1.1061403155326843, "learning_rate": 9.9832586503311e-05, "loss": 0.0006, "reward": 0.9031250476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1009 }, { "completion_length": 338.375, "epoch": 0.26228656755177565, "grad_norm": 0.057825714349746704, "kl": 0.9955259561538696, "learning_rate": 9.98322526982601e-05, "loss": 0.0005, "reward": 1.1375000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.9375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1010 }, { "completion_length": 361.015625, "epoch": 0.262546257222619, "grad_norm": 0.04677411541342735, "kl": 1.028190791606903, "learning_rate": 9.983191856131353e-05, "loss": 0.0005, "reward": 0.9156250357627869, "reward_std": 0.04233439126983285, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.19687500223517418, "step": 1011 }, { "completion_length": 320.59375, "epoch": 0.2628059468934623, "grad_norm": 0.071815624833107, "kl": 0.990875706076622, "learning_rate": 9.983158409247348e-05, "loss": 0.0005, "reward": 1.0066406726837158, "reward_std": 0.12890625, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.19414062798023224, "step": 1012 }, { "completion_length": 327.75, "epoch": 0.26306563656430565, "grad_norm": 0.05787309259176254, "kl": 1.0365400463342667, "learning_rate": 9.983124929174218e-05, "loss": 0.0005, "reward": 0.8859375417232513, "reward_std": 0.10170938819646835, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19843750074505806, "step": 1013 }, { "completion_length": 342.390625, "epoch": 0.263325326235149, "grad_norm": 0.04910970851778984, "kl": 0.9056025445461273, "learning_rate": 9.98309141591219e-05, "loss": 0.0005, "reward": 0.7937500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1014 }, { "completion_length": 345.609375, "epoch": 0.2635850159059923, "grad_norm": 0.04619310423731804, "kl": 0.9859039634466171, "learning_rate": 9.983057869461481e-05, "loss": 0.0005, "reward": 0.8875000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1015 }, { "completion_length": 350.421875, "epoch": 0.2638447055768357, "grad_norm": 0.04548370838165283, "kl": 0.9925935566425323, "learning_rate": 9.983024289822318e-05, "loss": 0.0005, "reward": 0.9187500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1016 }, { "completion_length": 342.125, "epoch": 0.26410439524767904, "grad_norm": 0.08304701745510101, "kl": 0.9206166416406631, "learning_rate": 9.982990676994924e-05, "loss": 0.0005, "reward": 0.9187500476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1017 }, { "completion_length": 343.015625, "epoch": 0.26436408491852237, "grad_norm": 0.057989805936813354, "kl": 0.8763123750686646, "learning_rate": 9.982957030979525e-05, "loss": 0.0004, "reward": 0.8718750402331352, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1018 }, { "completion_length": 337.8125, "epoch": 0.2646237745893657, "grad_norm": 0.05107349529862404, "kl": 0.9021686017513275, "learning_rate": 9.98292335177634e-05, "loss": 0.0005, "reward": 1.1062500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.90625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1019 }, { "completion_length": 331.25, "epoch": 0.26488346426020903, "grad_norm": 0.0354665145277977, "kl": 0.944190725684166, "learning_rate": 9.9828896393856e-05, "loss": 0.0005, "reward": 1.0437500476837158, "reward_std": 0.03608439117670059, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1020 }, { "completion_length": 342.890625, "epoch": 0.26514315393105237, "grad_norm": 0.04502172768115997, "kl": 1.0207918584346771, "learning_rate": 9.982855893807523e-05, "loss": 0.0005, "reward": 1.0125000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1021 }, { "completion_length": 350.140625, "epoch": 0.26540284360189575, "grad_norm": 0.032510824501514435, "kl": 0.9570571184158325, "learning_rate": 9.98282211504234e-05, "loss": 0.0005, "reward": 0.8871094286441803, "reward_std": 0.0007812529802322388, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19960937649011612, "step": 1022 }, { "completion_length": 342.515625, "epoch": 0.2656625332727391, "grad_norm": 0.04809756577014923, "kl": 0.9168087393045425, "learning_rate": 9.98278830309027e-05, "loss": 0.0005, "reward": 1.0125000476837158, "reward_std": 0.07216878235340118, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1023 }, { "completion_length": 333.03125, "epoch": 0.2659222229435824, "grad_norm": 0.03745042905211449, "kl": 0.8950222581624985, "learning_rate": 9.982754457951543e-05, "loss": 0.0004, "reward": 1.0593750476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.859375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1024 }, { "completion_length": 320.265625, "epoch": 0.26618191261442575, "grad_norm": 0.04695386439561844, "kl": 0.9772389084100723, "learning_rate": 9.982720579626381e-05, "loss": 0.0005, "reward": 0.9890625476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 1025 }, { "completion_length": 309.71875, "epoch": 0.2664416022852691, "grad_norm": 0.06558497250080109, "kl": 0.9421823918819427, "learning_rate": 9.982686668115011e-05, "loss": 0.0005, "reward": 1.0000000447034836, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 1026 }, { "completion_length": 315.265625, "epoch": 0.2667012919561124, "grad_norm": 0.055582888424396515, "kl": 0.9640912413597107, "learning_rate": 9.98265272341766e-05, "loss": 0.0005, "reward": 0.9187500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1027 }, { "completion_length": 307.671875, "epoch": 0.2669609816269558, "grad_norm": 0.04041602462530136, "kl": 0.9881391525268555, "learning_rate": 9.98261874553455e-05, "loss": 0.0005, "reward": 1.0593750476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.859375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1028 }, { "completion_length": 322.234375, "epoch": 0.26722067129779914, "grad_norm": 0.06015327200293541, "kl": 0.9552096426486969, "learning_rate": 9.982584734465913e-05, "loss": 0.0005, "reward": 1.0906250476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.890625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1029 }, { "completion_length": 322.40625, "epoch": 0.26748036096864247, "grad_norm": 0.05384114384651184, "kl": 0.9512109756469727, "learning_rate": 9.982550690211974e-05, "loss": 0.0005, "reward": 0.9500000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1030 }, { "completion_length": 308.5, "epoch": 0.2677400506394858, "grad_norm": 0.07273752242326736, "kl": 0.970933198928833, "learning_rate": 9.982516612772956e-05, "loss": 0.0005, "reward": 0.8250000476837158, "reward_std": 0.1610843911767006, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1031 }, { "completion_length": 299.84375, "epoch": 0.26799974031032914, "grad_norm": 0.04847858101129532, "kl": 0.9994475543498993, "learning_rate": 9.982482502149089e-05, "loss": 0.0005, "reward": 0.9343750476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1032 }, { "completion_length": 285.03125, "epoch": 0.2682594299811725, "grad_norm": 0.05147367715835571, "kl": 0.9856204241514206, "learning_rate": 9.9824483583406e-05, "loss": 0.0005, "reward": 0.9062500447034836, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 1033 }, { "completion_length": 290.390625, "epoch": 0.26851911965201586, "grad_norm": 0.058815594762563705, "kl": 0.965342566370964, "learning_rate": 9.982414181347715e-05, "loss": 0.0005, "reward": 0.9234375357627869, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.18906250223517418, "step": 1034 }, { "completion_length": 301.265625, "epoch": 0.2687788093228592, "grad_norm": 0.06825891137123108, "kl": 0.9053496569395065, "learning_rate": 9.982379971170664e-05, "loss": 0.0005, "reward": 1.0125000476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1035 }, { "completion_length": 301.421875, "epoch": 0.2690384989937025, "grad_norm": 0.058646347373723984, "kl": 0.9440910965204239, "learning_rate": 9.982345727809673e-05, "loss": 0.0005, "reward": 0.9187500402331352, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1036 }, { "completion_length": 274.65625, "epoch": 0.26929818866454586, "grad_norm": 0.05614722520112991, "kl": 0.8570746332406998, "learning_rate": 9.982311451264971e-05, "loss": 0.0004, "reward": 0.9218750447034836, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 1037 }, { "completion_length": 287.90625, "epoch": 0.2695578783353892, "grad_norm": 0.06101849675178528, "kl": 0.9609898775815964, "learning_rate": 9.982277141536787e-05, "loss": 0.0005, "reward": 0.9343750476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1038 }, { "completion_length": 296.90625, "epoch": 0.2698175680062326, "grad_norm": 0.0503813810646534, "kl": 0.9284567832946777, "learning_rate": 9.982242798625346e-05, "loss": 0.0005, "reward": 1.0437500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1039 }, { "completion_length": 298.96875, "epoch": 0.2700772576770759, "grad_norm": 0.04700874909758568, "kl": 0.9272035211324692, "learning_rate": 9.982208422530881e-05, "loss": 0.0005, "reward": 1.1062500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.90625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1040 }, { "completion_length": 312.015625, "epoch": 0.27033694734791924, "grad_norm": 0.06669755280017853, "kl": 0.9062181264162064, "learning_rate": 9.982174013253618e-05, "loss": 0.0005, "reward": 0.9343750476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1041 }, { "completion_length": 310.875, "epoch": 0.2705966370187626, "grad_norm": 0.06909660995006561, "kl": 0.8624827116727829, "learning_rate": 9.982139570793787e-05, "loss": 0.0004, "reward": 1.025000050663948, "reward_std": 0.16954772174358368, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.19687500223517418, "step": 1042 }, { "completion_length": 311.609375, "epoch": 0.2708563266896059, "grad_norm": 0.0832807645201683, "kl": 0.9085897356271744, "learning_rate": 9.982105095151619e-05, "loss": 0.0005, "reward": 0.8855469226837158, "reward_std": 0.12890625, "rewards/spct_argmax_reward_func": 0.6875, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 1043 }, { "completion_length": 303.453125, "epoch": 0.27111601636044924, "grad_norm": 0.06180766597390175, "kl": 0.9107806086540222, "learning_rate": 9.98207058632734e-05, "loss": 0.0005, "reward": 0.9500000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1044 }, { "completion_length": 291.5, "epoch": 0.27137570603129263, "grad_norm": 0.057888101786375046, "kl": 1.009627178311348, "learning_rate": 9.982036044321186e-05, "loss": 0.0005, "reward": 1.0125000476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1045 }, { "completion_length": 300.421875, "epoch": 0.27163539570213596, "grad_norm": 0.04317311570048332, "kl": 0.915572002530098, "learning_rate": 9.982001469133379e-05, "loss": 0.0005, "reward": 1.1375000476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.9375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1046 }, { "completion_length": 292.546875, "epoch": 0.2718950853729793, "grad_norm": 0.05339128524065018, "kl": 0.8928887099027634, "learning_rate": 9.981966860764156e-05, "loss": 0.0004, "reward": 0.8906250447034836, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 1047 }, { "completion_length": 304.25, "epoch": 0.2721547750438226, "grad_norm": 0.10519891232252121, "kl": 0.900107815861702, "learning_rate": 9.981932219213746e-05, "loss": 0.0005, "reward": 0.8304688036441803, "reward_std": 0.17715006321668625, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.18984375149011612, "step": 1048 }, { "completion_length": 321.390625, "epoch": 0.27241446471466596, "grad_norm": 0.07669302076101303, "kl": 0.8985383808612823, "learning_rate": 9.981897544482376e-05, "loss": 0.0004, "reward": 0.8562500476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.65625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1049 }, { "completion_length": 313.859375, "epoch": 0.2726741543855093, "grad_norm": 0.05076369270682335, "kl": 0.9246695041656494, "learning_rate": 9.981862836570281e-05, "loss": 0.0005, "reward": 1.0437500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1050 }, { "completion_length": 331.921875, "epoch": 0.2729338440563527, "grad_norm": 0.030783843249082565, "kl": 0.9270100444555283, "learning_rate": 9.98182809547769e-05, "loss": 0.0005, "reward": 0.8718750476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.671875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1051 }, { "completion_length": 319.4375, "epoch": 0.273193533727196, "grad_norm": 0.027315188199281693, "kl": 0.9309768378734589, "learning_rate": 9.981793321204835e-05, "loss": 0.0005, "reward": 0.9843750447034836, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 1052 }, { "completion_length": 353.15625, "epoch": 0.27345322339803935, "grad_norm": 0.049876533448696136, "kl": 0.8612676709890366, "learning_rate": 9.981758513751949e-05, "loss": 0.0004, "reward": 1.0281250476837158, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1053 }, { "completion_length": 360.703125, "epoch": 0.2737129130688827, "grad_norm": 0.07243779301643372, "kl": 0.9055207967758179, "learning_rate": 9.981723673119263e-05, "loss": 0.0005, "reward": 0.7781250402331352, "reward_std": 0.16591878235340118, "rewards/spct_argmax_reward_func": 0.578125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1054 }, { "completion_length": 361.5, "epoch": 0.273972602739726, "grad_norm": 0.0008080619154497981, "kl": 0.9425939619541168, "learning_rate": 9.981688799307007e-05, "loss": 0.0005, "reward": 0.9500000476837158, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1055 }, { "completion_length": 370.03125, "epoch": 0.27423229241056934, "grad_norm": 0.04880138486623764, "kl": 1.0019371509552002, "learning_rate": 9.981653892315417e-05, "loss": 0.0005, "reward": 1.1062500476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.90625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1056 }, { "completion_length": 388.59375, "epoch": 0.27449198208141273, "grad_norm": 0.05205095559358597, "kl": 1.0095855593681335, "learning_rate": 9.981618952144721e-05, "loss": 0.0005, "reward": 0.9656250476837158, "reward_std": 0.10341878235340118, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1057 }, { "completion_length": 386.65625, "epoch": 0.27475167175225607, "grad_norm": 0.0016155611956492066, "kl": 0.978692352771759, "learning_rate": 9.981583978795155e-05, "loss": 0.0005, "reward": 1.0046875476837158, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 1058 }, { "completion_length": 412.5, "epoch": 0.2750113614230994, "grad_norm": 0.037158314138650894, "kl": 0.8787950128316879, "learning_rate": 9.981548972266951e-05, "loss": 0.0004, "reward": 0.9109375476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 1059 }, { "completion_length": 366.5, "epoch": 0.27527105109394273, "grad_norm": 0.0569445863366127, "kl": 0.9509799480438232, "learning_rate": 9.981513932560344e-05, "loss": 0.0005, "reward": 0.8847656697034836, "reward_std": 0.09765625, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.1816406287252903, "step": 1060 }, { "completion_length": 439.234375, "epoch": 0.27553074076478606, "grad_norm": 0.04842041805386543, "kl": 0.8855141997337341, "learning_rate": 9.981478859675564e-05, "loss": 0.0004, "reward": 0.9187500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1061 }, { "completion_length": 427.25, "epoch": 0.2757904304356294, "grad_norm": 0.03609868139028549, "kl": 0.8694937527179718, "learning_rate": 9.981443753612845e-05, "loss": 0.0004, "reward": 1.0312500447034836, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.1875000037252903, "step": 1062 }, { "completion_length": 442.59375, "epoch": 0.2760501201064728, "grad_norm": 0.03862107917666435, "kl": 0.9588389694690704, "learning_rate": 9.981408614372424e-05, "loss": 0.0005, "reward": 0.9578125476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 1063 }, { "completion_length": 460.6875, "epoch": 0.2763098097773161, "grad_norm": 0.03409034013748169, "kl": 0.8413974493741989, "learning_rate": 9.981373441954532e-05, "loss": 0.0004, "reward": 0.8406250402331352, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1064 }, { "completion_length": 486.78125, "epoch": 0.27656949944815945, "grad_norm": 0.048095934092998505, "kl": 0.883465126156807, "learning_rate": 9.981338236359405e-05, "loss": 0.0004, "reward": 0.9656250476837158, "reward_std": 0.13950317353010178, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1065 }, { "completion_length": 491.328125, "epoch": 0.2768291891190028, "grad_norm": 0.05095721781253815, "kl": 0.7862337976694107, "learning_rate": 9.981302997587276e-05, "loss": 0.0004, "reward": 0.9656250476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1066 }, { "completion_length": 474.671875, "epoch": 0.2770888787898461, "grad_norm": 0.021662535145878792, "kl": 0.8219651579856873, "learning_rate": 9.981267725638381e-05, "loss": 0.0004, "reward": 1.0906250476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.890625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1067 }, { "completion_length": 496.875, "epoch": 0.27734856846068945, "grad_norm": 0.06895433366298676, "kl": 0.8573415279388428, "learning_rate": 9.981232420512955e-05, "loss": 0.0004, "reward": 0.9812500476837158, "reward_std": 0.19716878235340118, "rewards/spct_argmax_reward_func": 0.78125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1068 }, { "completion_length": 495.234375, "epoch": 0.27760825813153284, "grad_norm": 0.04625494405627251, "kl": 0.7920906990766525, "learning_rate": 9.981197082211231e-05, "loss": 0.0004, "reward": 1.0281250476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1069 }, { "completion_length": 508.421875, "epoch": 0.27786794780237617, "grad_norm": 0.03288198262453079, "kl": 0.8665333390235901, "learning_rate": 9.981161710733447e-05, "loss": 0.0004, "reward": 0.9343750476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1070 }, { "completion_length": 509.296875, "epoch": 0.2781276374732195, "grad_norm": 0.04856428876519203, "kl": 0.8065062463283539, "learning_rate": 9.981126306079837e-05, "loss": 0.0004, "reward": 1.0281250476837158, "reward_std": 0.09375, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1071 }, { "completion_length": 532.546875, "epoch": 0.27838732714406283, "grad_norm": 0.022686829790472984, "kl": 0.80021932721138, "learning_rate": 9.981090868250638e-05, "loss": 0.0004, "reward": 1.0906250476837158, "reward_std": 0.03125, "rewards/spct_argmax_reward_func": 0.890625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1072 }, { "completion_length": 495.890625, "epoch": 0.27864701681490617, "grad_norm": 0.05029721558094025, "kl": 0.8213796466588974, "learning_rate": 9.981055397246083e-05, "loss": 0.0004, "reward": 1.0203125476837158, "reward_std": 0.15625, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 1073 }, { "completion_length": 495.96875, "epoch": 0.27890670648574956, "grad_norm": 0.04361069202423096, "kl": 0.8738257735967636, "learning_rate": 9.981019893066415e-05, "loss": 0.0004, "reward": 1.0261719226837158, "reward_std": 0.07124064117670059, "rewards/spct_argmax_reward_func": 0.828125, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 1074 }, { "completion_length": 532.484375, "epoch": 0.2791663961565929, "grad_norm": 0.03190043941140175, "kl": 0.7941219955682755, "learning_rate": 9.980984355711863e-05, "loss": 0.0004, "reward": 1.0906250476837158, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.890625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1075 }, { "completion_length": 560.234375, "epoch": 0.2794260858274362, "grad_norm": 0.03197089582681656, "kl": 0.7503125220537186, "learning_rate": 9.980948785182666e-05, "loss": 0.0004, "reward": 0.9187500476837158, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.71875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1076 }, { "completion_length": 494.59375, "epoch": 0.27968577549827955, "grad_norm": 0.0006204999517649412, "kl": 0.83378104865551, "learning_rate": 9.980913181479063e-05, "loss": 0.0004, "reward": 1.0640625357627869, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.875, "rewards/spct_format_reward_func": 0.18906250223517418, "step": 1077 }, { "completion_length": 557.59375, "epoch": 0.2799454651691229, "grad_norm": 0.06316392123699188, "kl": 0.7963871210813522, "learning_rate": 9.980877544601291e-05, "loss": 0.0004, "reward": 0.8164062947034836, "reward_std": 0.16264688968658447, "rewards/spct_argmax_reward_func": 0.625, "rewards/spct_format_reward_func": 0.1914062537252903, "step": 1078 }, { "completion_length": 544.8125, "epoch": 0.2802051548399662, "grad_norm": 0.0455130934715271, "kl": 0.8736381083726883, "learning_rate": 9.980841874549583e-05, "loss": 0.0004, "reward": 0.7445312887430191, "reward_std": 0.06406249850988388, "rewards/spct_argmax_reward_func": 0.5625, "rewards/spct_format_reward_func": 0.18203125149011612, "step": 1079 }, { "completion_length": 617.28125, "epoch": 0.2804648445108096, "grad_norm": 0.0013491009594872594, "kl": 0.8207730054855347, "learning_rate": 9.980806171324182e-05, "loss": 0.0004, "reward": 1.0125000476837158, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1080 }, { "completion_length": 558.484375, "epoch": 0.28072453418165294, "grad_norm": 0.03542719781398773, "kl": 0.751207247376442, "learning_rate": 9.980770434925321e-05, "loss": 0.0004, "reward": 0.9917969107627869, "reward_std": 0.07072763890028, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.19492187723517418, "step": 1081 }, { "completion_length": 561.65625, "epoch": 0.2809842238524963, "grad_norm": 0.04315389320254326, "kl": 0.7264769077301025, "learning_rate": 9.980734665353245e-05, "loss": 0.0004, "reward": 1.0437500476837158, "reward_std": 0.13466878235340118, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1082 }, { "completion_length": 655.75, "epoch": 0.2812439135233396, "grad_norm": 0.03166816011071205, "kl": 0.7473782151937485, "learning_rate": 9.980698862608184e-05, "loss": 0.0004, "reward": 0.7937500402331352, "reward_std": 0.0625, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1083 }, { "completion_length": 601.65625, "epoch": 0.28150360319418294, "grad_norm": 0.029008308425545692, "kl": 0.749731570482254, "learning_rate": 9.98066302669038e-05, "loss": 0.0004, "reward": 0.9343750402331352, "reward_std": 0.06733439117670059, "rewards/spct_argmax_reward_func": 0.734375, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1084 }, { "completion_length": 610.0, "epoch": 0.28176329286502627, "grad_norm": 0.03611854463815689, "kl": 0.7826325446367264, "learning_rate": 9.980627157600073e-05, "loss": 0.0004, "reward": 1.0671875476837158, "reward_std": 0.09858439117670059, "rewards/spct_argmax_reward_func": 0.875, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 1085 }, { "completion_length": 623.5, "epoch": 0.28202298253586966, "grad_norm": 0.04331173375248909, "kl": 0.7588333338499069, "learning_rate": 9.980591255337501e-05, "loss": 0.0004, "reward": 0.9968750476837158, "reward_std": 0.15625, "rewards/spct_argmax_reward_func": 0.796875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1086 }, { "completion_length": 609.46875, "epoch": 0.282282672206713, "grad_norm": 0.052879199385643005, "kl": 0.7183580100536346, "learning_rate": 9.980555319902902e-05, "loss": 0.0004, "reward": 0.7050781697034836, "reward_std": 0.13530313968658447, "rewards/spct_argmax_reward_func": 0.515625, "rewards/spct_format_reward_func": 0.1894531287252903, "step": 1087 }, { "completion_length": 604.1875, "epoch": 0.2825423618775563, "grad_norm": 0.039401452988386154, "kl": 0.7460415661334991, "learning_rate": 9.980519351296517e-05, "loss": 0.0004, "reward": 0.9656250476837158, "reward_std": 0.13950317353010178, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1088 }, { "completion_length": 597.015625, "epoch": 0.28280205154839966, "grad_norm": 0.04843837767839432, "kl": 0.7598419487476349, "learning_rate": 9.980483349518583e-05, "loss": 0.0004, "reward": 0.9441406726837158, "reward_std": 0.12890625, "rewards/spct_argmax_reward_func": 0.75, "rewards/spct_format_reward_func": 0.19414062798023224, "step": 1089 }, { "completion_length": 568.203125, "epoch": 0.283061741219243, "grad_norm": 0.032686810940504074, "kl": 0.7499186992645264, "learning_rate": 9.980447314569345e-05, "loss": 0.0004, "reward": 0.9636719226837158, "reward_std": 0.07124064117670059, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 1090 }, { "completion_length": 612.15625, "epoch": 0.2833214308900863, "grad_norm": 0.06258390843868256, "kl": 0.7621691823005676, "learning_rate": 9.980411246449038e-05, "loss": 0.0004, "reward": 0.7843750417232513, "reward_std": 0.16288860887289047, "rewards/spct_argmax_reward_func": 0.59375, "rewards/spct_format_reward_func": 0.19062500074505806, "step": 1091 }, { "completion_length": 577.109375, "epoch": 0.2835811205609297, "grad_norm": 0.057678889483213425, "kl": 0.7665934562683105, "learning_rate": 9.980375145157901e-05, "loss": 0.0004, "reward": 0.9656250476837158, "reward_std": 0.20200317353010178, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1092 }, { "completion_length": 574.03125, "epoch": 0.28384081023177304, "grad_norm": 0.030086910352110863, "kl": 0.7960618883371353, "learning_rate": 9.980339010696181e-05, "loss": 0.0004, "reward": 0.9636719226837158, "reward_std": 0.07124064117670059, "rewards/spct_argmax_reward_func": 0.765625, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 1093 }, { "completion_length": 545.1875, "epoch": 0.2841004999026164, "grad_norm": 0.0005794529570266604, "kl": 0.7751719504594803, "learning_rate": 9.980302843064114e-05, "loss": 0.0004, "reward": 1.0125000476837158, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1094 }, { "completion_length": 503.21875, "epoch": 0.2843601895734597, "grad_norm": 0.04959278926253319, "kl": 0.7981711477041245, "learning_rate": 9.980266642261939e-05, "loss": 0.0004, "reward": 1.0359375476837158, "reward_std": 0.125, "rewards/spct_argmax_reward_func": 0.84375, "rewards/spct_format_reward_func": 0.19218750298023224, "step": 1095 }, { "completion_length": 508.859375, "epoch": 0.28461987924430304, "grad_norm": 0.000540952431038022, "kl": 0.7713431715965271, "learning_rate": 9.980230408289903e-05, "loss": 0.0004, "reward": 1.0750000476837158, "reward_std": 0.0, "rewards/spct_argmax_reward_func": 0.875, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1096 }, { "completion_length": 501.546875, "epoch": 0.2848795689151464, "grad_norm": 0.04248228669166565, "kl": 0.7053830325603485, "learning_rate": 9.980194141148242e-05, "loss": 0.0004, "reward": 0.9031250476837158, "reward_std": 0.1298343911767006, "rewards/spct_argmax_reward_func": 0.703125, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1097 }, { "completion_length": 455.421875, "epoch": 0.28513925858598976, "grad_norm": 0.053583450615406036, "kl": 0.7698601633310318, "learning_rate": 9.980157840837202e-05, "loss": 0.0004, "reward": 1.1273438036441803, "reward_std": 0.0015625001396983862, "rewards/spct_argmax_reward_func": 0.9375, "rewards/spct_format_reward_func": 0.18984375149011612, "step": 1098 }, { "completion_length": 505.40625, "epoch": 0.2853989482568331, "grad_norm": 0.05965655669569969, "kl": 0.7898445874452591, "learning_rate": 9.98012150735702e-05, "loss": 0.0004, "reward": 0.8406250476837158, "reward_std": 0.1923343911767006, "rewards/spct_argmax_reward_func": 0.640625, "rewards/spct_format_reward_func": 0.20000000298023224, "step": 1099 }, { "completion_length": 504.921875, "epoch": 0.28565863792767643, "grad_norm": 0.05196404457092285, "kl": 0.7863964885473251, "learning_rate": 9.980085140707943e-05, "loss": 0.0004, "reward": 1.0105469226837158, "reward_std": 0.12890625, "rewards/spct_argmax_reward_func": 0.8125, "rewards/spct_format_reward_func": 0.19804687798023224, "step": 1100 } ], "logging_steps": 1, "max_steps": 38500, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }