{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7111111111111111, "eval_steps": 500, "global_step": 3200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 309.04, "epoch": 0.011111111111111112, "grad_norm": NaN, "kl": 222.66988746643065, "learning_rate": 5.444444444444444e-07, "loss": 8.9068, "reward": -18.06266725540161, "reward_std": 6.391496688127518, "rewards/check_first_pass": -9.93666666984558, "rewards/check_solution": -7.600000243186951, "rewards/check_solution_words": -6.068000079095364, "rewards/check_word_guesses": 5.54200014591217, "step": 50 }, { "completion_length": 368.64, "epoch": 0.022222222222222223, "grad_norm": NaN, "kl": 557.3866543316841, "learning_rate": 1.1e-06, "loss": 22.2955, "reward": -17.431167125701904, "reward_std": 5.4497878611087796, "rewards/check_first_pass": -9.859833374023438, "rewards/check_solution": -7.2583335638046265, "rewards/check_solution_words": -5.878333521187305, "rewards/check_word_guesses": 5.565333509445191, "step": 100 }, { "completion_length": 346.92, "epoch": 0.03333333333333333, "grad_norm": NaN, "kl": 4737.8455329227445, "learning_rate": 1.6555555555555559e-06, "loss": 189.5138, "reward": -18.070500688552855, "reward_std": 7.8515861177444455, "rewards/check_first_pass": -9.786166725158692, "rewards/check_solution": -7.325000324249268, "rewards/check_solution_words": -7.050333592891693, "rewards/check_word_guesses": 6.091000156402588, "step": 150 }, { "completion_length": 322.2, "epoch": 0.044444444444444446, "grad_norm": NaN, "kl": 32057.38775477886, "learning_rate": 2.2111111111111113e-06, "loss": 1282.2956, "reward": -15.816333751678467, "reward_std": 6.191992573738098, "rewards/check_first_pass": -9.895000038146973, "rewards/check_solution": -7.100000200271606, "rewards/check_solution_words": -4.8800000631809235, "rewards/check_word_guesses": 6.058666839599609, "step": 200 }, { "completion_length": 349.9, "epoch": 0.05555555555555555, "grad_norm": NaN, "kl": 5074.338300862312, "learning_rate": 2.766666666666667e-06, "loss": 202.9736, "reward": -17.724167308807374, "reward_std": 6.207637655735016, "rewards/check_first_pass": -9.912833366394043, "rewards/check_solution": -7.358333556652069, "rewards/check_solution_words": -6.180666843354702, "rewards/check_word_guesses": 5.727666816711426, "step": 250 }, { "completion_length": 336.42, "epoch": 0.06666666666666667, "grad_norm": NaN, "kl": 315.6221669435501, "learning_rate": 3.322222222222222e-06, "loss": 12.6249, "reward": -16.775000438690185, "reward_std": 5.353409328460693, "rewards/check_first_pass": -9.81633337020874, "rewards/check_solution": -7.341666927337647, "rewards/check_solution_words": -5.623000101844471, "rewards/check_word_guesses": 6.006000165939331, "step": 300 }, { "completion_length": 307.04, "epoch": 0.07777777777777778, "grad_norm": NaN, "kl": 6570.5719665384295, "learning_rate": 3.877777777777778e-06, "loss": 262.8229, "reward": -17.077000389099123, "reward_std": 5.669408960938454, "rewards/check_first_pass": -9.886666717529296, "rewards/check_solution": -7.250000200271606, "rewards/check_solution_words": -5.695666807889938, "rewards/check_word_guesses": 5.755333452224732, "step": 350 }, { "completion_length": 313.08, "epoch": 0.08888888888888889, "grad_norm": NaN, "kl": 1532.4928638124466, "learning_rate": 4.433333333333334e-06, "loss": 61.2997, "reward": -17.507167091369627, "reward_std": 5.527194731235504, "rewards/check_first_pass": -9.908166694641114, "rewards/check_solution": -7.30833353638649, "rewards/check_solution_words": -6.251000165343284, "rewards/check_word_guesses": 5.9603334903717045, "step": 400 }, { "completion_length": 329.37666687011716, "epoch": 0.1, "grad_norm": NaN, "kl": 1601.70994805336, "learning_rate": 4.988888888888889e-06, "loss": 64.0684, "reward": -17.980167026519776, "reward_std": 6.458992264270782, "rewards/check_first_pass": -9.801500053405762, "rewards/check_solution": -7.2666668963432315, "rewards/check_solution_words": -6.554666934013366, "rewards/check_word_guesses": 5.64266683101654, "step": 450 }, { "completion_length": 307.52, "epoch": 0.1111111111111111, "grad_norm": NaN, "kl": 702.5347912788391, "learning_rate": 4.998194324998843e-06, "loss": 28.1014, "reward": -16.74250042915344, "reward_std": 6.445133271217347, "rewards/check_first_pass": -9.824500045776368, "rewards/check_solution": -7.308333573341369, "rewards/check_solution_words": -5.524333542585373, "rewards/check_word_guesses": 5.914666795730591, "step": 500 }, { "completion_length": 335.9, "epoch": 0.12222222222222222, "grad_norm": NaN, "kl": 19601.83191286087, "learning_rate": 4.992631880567301e-06, "loss": 784.0733, "reward": -17.86000030517578, "reward_std": 7.05341215133667, "rewards/check_first_pass": -9.785000047683717, "rewards/check_solution": -7.49166690826416, "rewards/check_solution_words": -6.301333554983139, "rewards/check_word_guesses": 5.71800015449524, "step": 550 }, { "completion_length": 298.2, "epoch": 0.13333333333333333, "grad_norm": NaN, "kl": 1115.117756202221, "learning_rate": 4.983320281008445e-06, "loss": 44.6047, "reward": -16.99700037956238, "reward_std": 5.631768324375153, "rewards/check_first_pass": -9.813000040054321, "rewards/check_solution": -7.041666898727417, "rewards/check_solution_words": -6.250666889995337, "rewards/check_word_guesses": 6.108333473205566, "step": 600 }, { "completion_length": 318.48, "epoch": 0.14444444444444443, "grad_norm": NaN, "kl": 3946.661036362648, "learning_rate": 4.970273531852536e-06, "loss": 157.8665, "reward": -17.999333934783934, "reward_std": 6.210418889522552, "rewards/check_first_pass": -9.89133337020874, "rewards/check_solution": -7.458333578109741, "rewards/check_solution_words": -6.459333531856537, "rewards/check_word_guesses": 5.809666805267334, "step": 650 }, { "completion_length": 351.9, "epoch": 0.15555555555555556, "grad_norm": NaN, "kl": 2870.44579018116, "learning_rate": 4.953511256649632e-06, "loss": 114.8178, "reward": -17.553834075927735, "reward_std": 5.835132333040238, "rewards/check_first_pass": -9.929833374023438, "rewards/check_solution": -7.383333520889282, "rewards/check_solution_words": -6.055666868388653, "rewards/check_word_guesses": 5.815000147819519, "step": 700 }, { "completion_length": 308.34, "epoch": 0.16666666666666666, "grad_norm": NaN, "kl": 164.13174985408784, "learning_rate": 4.933058667453916e-06, "loss": 6.5653, "reward": -16.56966731071472, "reward_std": 6.621588716208935, "rewards/check_first_pass": -9.908333358764649, "rewards/check_solution": -7.291666874885559, "rewards/check_solution_words": -5.485666743516922, "rewards/check_word_guesses": 6.116000127792359, "step": 750 }, { "completion_length": 342.34, "epoch": 0.17777777777777778, "grad_norm": NaN, "kl": 1447.0631847190857, "learning_rate": 4.9089465269023596e-06, "loss": 57.8825, "reward": -17.248333780765535, "reward_std": 6.114709348678589, "rewards/check_first_pass": -9.830000019073486, "rewards/check_solution": -7.2333335685729985, "rewards/check_solution_words": -6.300666825771332, "rewards/check_word_guesses": 6.115666842460632, "step": 800 }, { "completion_length": 354.18, "epoch": 0.18888888888888888, "grad_norm": NaN, "kl": 23526.59426044941, "learning_rate": 4.881211101944802e-06, "loss": 941.0638, "reward": -17.54183391571045, "reward_std": 6.4859533834457395, "rewards/check_first_pass": -9.808833379745483, "rewards/check_solution": -7.708333535194397, "rewards/check_solution_words": -5.9636668264865875, "rewards/check_word_guesses": 5.939000129699707, "step": 850 }, { "completion_length": 308.18, "epoch": 0.2, "grad_norm": NaN, "kl": 138.43031896591185, "learning_rate": 4.84989410929501e-06, "loss": 5.5372, "reward": -17.896833839416505, "reward_std": 5.668911509513855, "rewards/check_first_pass": -9.863166694641114, "rewards/check_solution": -7.233333587646484, "rewards/check_solution_words": -6.624666909873485, "rewards/check_word_guesses": 5.824333515167236, "step": 900 }, { "completion_length": 314.82, "epoch": 0.2111111111111111, "grad_norm": NaN, "kl": 1218.171366314888, "learning_rate": 4.815042652684779e-06, "loss": 48.7269, "reward": -16.533334035873413, "reward_std": 7.360376672744751, "rewards/check_first_pass": -9.612000093460082, "rewards/check_solution": -7.158333578109741, "rewards/check_solution_words": -5.995000202357769, "rewards/check_word_guesses": 6.232000198364258, "step": 950 }, { "completion_length": 339.66, "epoch": 0.2222222222222222, "grad_norm": NaN, "kl": 174.28998464107514, "learning_rate": 4.776709152015443e-06, "loss": 6.9716, "reward": -17.22483383178711, "reward_std": 6.013938563764095, "rewards/check_first_pass": -9.816166725158691, "rewards/check_solution": -7.008333616256714, "rewards/check_solution_words": -6.318000204563141, "rewards/check_word_guesses": 5.9176667785644534, "step": 1000 }, { "completion_length": 299.26, "epoch": 0.23333333333333334, "grad_norm": NaN, "kl": 240.47271874427796, "learning_rate": 4.734951264513368e-06, "loss": 9.6189, "reward": -16.81516722679138, "reward_std": 5.74999471783638, "rewards/check_first_pass": -9.819500045776367, "rewards/check_solution": -7.191666922569275, "rewards/check_solution_words": -5.3703335279226305, "rewards/check_word_guesses": 5.566333475112915, "step": 1050 }, { "completion_length": 290.3, "epoch": 0.24444444444444444, "grad_norm": NaN, "kl": 5930.2736493730545, "learning_rate": 4.689831798008002e-06, "loss": 237.2109, "reward": -16.739000663757324, "reward_std": 6.145890753269196, "rewards/check_first_pass": -9.928000049591065, "rewards/check_solution": -7.291666860580444, "rewards/check_solution_words": -5.206000205874443, "rewards/check_word_guesses": 5.686666803359985, "step": 1100 }, { "completion_length": 313.16, "epoch": 0.25555555555555554, "grad_norm": NaN, "kl": 790.292287569046, "learning_rate": 4.641418616462938e-06, "loss": 31.6117, "reward": -18.431500701904298, "reward_std": 5.68714599609375, "rewards/check_first_pass": -9.87116668701172, "rewards/check_solution": -7.8250002098083495, "rewards/check_solution_words": -6.450666869878769, "rewards/check_word_guesses": 5.7153334808349605, "step": 1150 }, { "completion_length": 288.62, "epoch": 0.26666666666666666, "grad_norm": NaN, "kl": 299.9416353178024, "learning_rate": 4.589784537902062e-06, "loss": 11.9977, "reward": -17.612167091369628, "reward_std": 5.032542688846588, "rewards/check_first_pass": -9.781500082015992, "rewards/check_solution": -7.425000247955322, "rewards/check_solution_words": -6.234666793346405, "rewards/check_word_guesses": 5.829000115394592, "step": 1200 }, { "completion_length": 322.98, "epoch": 0.2777777777777778, "grad_norm": NaN, "kl": 2793.8524017858504, "learning_rate": 4.53500722488433e-06, "loss": 111.7541, "reward": -17.664333724975585, "reward_std": 5.747455310821533, "rewards/check_first_pass": -9.923000030517578, "rewards/check_solution": -7.4000002384185795, "rewards/check_solution_words": -6.01900016926229, "rewards/check_word_guesses": 5.6776668119430544, "step": 1250 }, { "completion_length": 339.3, "epoch": 0.28888888888888886, "grad_norm": NaN, "kl": 3813.7429452037813, "learning_rate": 4.477169067691902e-06, "loss": 152.5497, "reward": -17.690833921432496, "reward_std": 5.353043854236603, "rewards/check_first_pass": -9.892833366394044, "rewards/check_solution": -7.600000247955323, "rewards/check_solution_words": -5.904333523511887, "rewards/check_word_guesses": 5.706333441734314, "step": 1300 }, { "completion_length": 310.96, "epoch": 0.3, "grad_norm": NaN, "kl": 304.47424302577974, "learning_rate": 4.416357060407332e-06, "loss": 12.179, "reward": -17.26916711807251, "reward_std": 5.263389755487442, "rewards/check_first_pass": -9.854500017166139, "rewards/check_solution": -7.250000243186951, "rewards/check_solution_words": -5.872333557605743, "rewards/check_word_guesses": 5.707666797637939, "step": 1350 }, { "completion_length": 315.5, "epoch": 0.3111111111111111, "grad_norm": NaN, "kl": 13639.832137713433, "learning_rate": 4.3526626700662e-06, "loss": 545.5934, "reward": -18.18800064086914, "reward_std": 6.097169952392578, "rewards/check_first_pass": -9.844333381652833, "rewards/check_solution": -7.566666946411133, "rewards/check_solution_words": -6.689333482980728, "rewards/check_word_guesses": 5.91233346939087, "step": 1400 }, { "completion_length": 351.12, "epoch": 0.32222222222222224, "grad_norm": NaN, "kl": 2326.0249900770186, "learning_rate": 4.286181699082008e-06, "loss": 93.041, "reward": -18.623000659942626, "reward_std": 6.812479295730591, "rewards/check_first_pass": -9.90633337020874, "rewards/check_solution": -7.350000295639038, "rewards/check_solution_words": -7.338333506584167, "rewards/check_word_guesses": 5.971666851043701, "step": 1450 }, { "completion_length": 366.32, "epoch": 0.3333333333333333, "grad_norm": NaN, "kl": 16549.004248199464, "learning_rate": 4.217014141150248e-06, "loss": 661.9602, "reward": -18.263500604629517, "reward_std": 6.059882239103318, "rewards/check_first_pass": -9.878166694641113, "rewards/check_solution": -7.3666668796539305, "rewards/check_solution_words": -6.7500001257658, "rewards/check_word_guesses": 5.731333417892456, "step": 1500 }, { "completion_length": 320.5, "epoch": 0.34444444444444444, "grad_norm": NaN, "kl": 8347.091685709953, "learning_rate": 4.145264030848381e-06, "loss": 333.8836, "reward": -17.722667150497436, "reward_std": 5.240287501811981, "rewards/check_first_pass": -9.976666679382324, "rewards/check_solution": -7.566666932106018, "rewards/check_solution_words": -5.952333456873894, "rewards/check_word_guesses": 5.773000164031982, "step": 1550 }, { "completion_length": 313.42, "epoch": 0.35555555555555557, "grad_norm": NaN, "kl": 864.0743899011612, "learning_rate": 4.071039287157953e-06, "loss": 34.563, "reward": -17.983000602722168, "reward_std": 5.850111997127533, "rewards/check_first_pass": -9.918333358764649, "rewards/check_solution": -7.208333625793457, "rewards/check_solution_words": -6.453333538174629, "rewards/check_word_guesses": 5.597000193595886, "step": 1600 }, { "completion_length": 329.72, "epoch": 0.36666666666666664, "grad_norm": NaN, "kl": 396.7952742242813, "learning_rate": 3.9944515511441995e-06, "loss": 15.8718, "reward": -16.43366714477539, "reward_std": 7.244253120422363, "rewards/check_first_pass": -9.88666669845581, "rewards/check_solution": -6.900000200271607, "rewards/check_solution_words": -5.581333435922861, "rewards/check_word_guesses": 5.93433349609375, "step": 1650 }, { "completion_length": 295.6, "epoch": 0.37777777777777777, "grad_norm": NaN, "kl": 2458.04032143116, "learning_rate": 3.915616018037271e-06, "loss": 98.3216, "reward": -16.582167387008667, "reward_std": 6.116619675159455, "rewards/check_first_pass": -9.845500040054322, "rewards/check_solution": -7.29166687965393, "rewards/check_solution_words": -5.515666830142339, "rewards/check_word_guesses": 6.0706668472290035, "step": 1700 }, { "completion_length": 305.44, "epoch": 0.3888888888888889, "grad_norm": NaN, "kl": 6297.921968564987, "learning_rate": 3.834651263967667e-06, "loss": 251.9169, "reward": -17.544833850860595, "reward_std": 6.234307850599289, "rewards/check_first_pass": -9.903166675567627, "rewards/check_solution": -7.258333530426025, "rewards/check_solution_words": -6.216000239551067, "rewards/check_word_guesses": 5.832666802406311, "step": 1750 }, { "completion_length": 293.7, "epoch": 0.4, "grad_norm": NaN, "kl": 4064.312862081528, "learning_rate": 3.7516790676164795e-06, "loss": 162.5725, "reward": -17.36033399581909, "reward_std": 5.197294096946717, "rewards/check_first_pass": -9.983333339691162, "rewards/check_solution": -7.516666932106018, "rewards/check_solution_words": -5.423000110387802, "rewards/check_word_guesses": 5.562666816711426, "step": 1800 }, { "completion_length": 320.76, "epoch": 0.4111111111111111, "grad_norm": NaN, "kl": 4698.212507400513, "learning_rate": 3.6668242270486736e-06, "loss": 187.9285, "reward": -17.611333808898927, "reward_std": 5.37955255150795, "rewards/check_first_pass": -9.976666679382324, "rewards/check_solution": -7.441666889190674, "rewards/check_solution_words": -6.205666851997376, "rewards/check_word_guesses": 6.012666845321656, "step": 1850 }, { "completion_length": 318.02, "epoch": 0.4222222222222222, "grad_norm": NaN, "kl": 467.2588349723816, "learning_rate": 3.5802143720049565e-06, "loss": 18.6904, "reward": -18.40666706085205, "reward_std": 5.797463660240173, "rewards/check_first_pass": -9.95166669845581, "rewards/check_solution": -7.400000267028808, "rewards/check_solution_words": -6.630000138878822, "rewards/check_word_guesses": 5.575000162124634, "step": 1900 }, { "completion_length": 317.28, "epoch": 0.43333333333333335, "grad_norm": NaN, "kl": 889510.5962282228, "learning_rate": 3.4919797719345172e-06, "loss": 35580.425, "reward": -17.448667163848878, "reward_std": 4.699667553901673, "rewards/check_first_pass": -9.881666679382324, "rewards/check_solution": -7.475000233650207, "rewards/check_solution_words": -5.838000079877674, "rewards/check_word_guesses": 5.746000151634217, "step": 1950 }, { "completion_length": 302.48, "epoch": 0.4444444444444444, "grad_norm": NaN, "kl": 5604.918187556267, "learning_rate": 3.402253140057402e-06, "loss": 224.1967, "reward": -16.404333744049072, "reward_std": 5.4958923101425174, "rewards/check_first_pass": -9.890000019073486, "rewards/check_solution": -7.033333592414856, "rewards/check_solution_words": -5.368000164031982, "rewards/check_word_guesses": 5.887000150680542, "step": 2000 }, { "completion_length": 321.74, "epoch": 0.45555555555555555, "grad_norm": NaN, "kl": 7411.243695282936, "learning_rate": 3.311169433751226e-06, "loss": 296.4498, "reward": -16.623500537872314, "reward_std": 6.51646169424057, "rewards/check_first_pass": -9.827833366394042, "rewards/check_solution": -6.908333498239517, "rewards/check_solution_words": -5.6363335295766595, "rewards/check_word_guesses": 5.749000191688538, "step": 2050 }, { "completion_length": 288.4, "epoch": 0.4666666666666667, "grad_norm": NaN, "kl": 6086.714874463081, "learning_rate": 3.2188656515624543e-06, "loss": 243.4686, "reward": -16.536833896636963, "reward_std": 6.763874979019165, "rewards/check_first_pass": -9.893166675567628, "rewards/check_solution": -7.266666870117188, "rewards/check_solution_words": -5.305333442389965, "rewards/check_word_guesses": 5.928333501815796, "step": 2100 }, { "completion_length": 285.4, "epoch": 0.4777777777777778, "grad_norm": NaN, "kl": 5326.300771965981, "learning_rate": 3.125480627147578e-06, "loss": 213.0521, "reward": -15.859833927154542, "reward_std": 5.706487397551537, "rewards/check_first_pass": -9.873166694641114, "rewards/check_solution": -7.225000219345093, "rewards/check_solution_words": -4.785666776001453, "rewards/check_word_guesses": 6.024000158309937, "step": 2150 }, { "completion_length": 364.84, "epoch": 0.4888888888888889, "grad_norm": NaN, "kl": 211.01880962848662, "learning_rate": 3.031154820454103e-06, "loss": 8.4408, "reward": -16.69666706085205, "reward_std": 5.748878970146179, "rewards/check_first_pass": -9.925000019073487, "rewards/check_solution": -7.075000238418579, "rewards/check_solution_words": -5.818666743040085, "rewards/check_word_guesses": 6.122000126838684, "step": 2200 }, { "completion_length": 344.1, "epoch": 0.5, "grad_norm": NaN, "kl": 628.3742008972168, "learning_rate": 2.9360301064554514e-06, "loss": 25.135, "reward": -17.17866720199585, "reward_std": 6.766914665699005, "rewards/check_first_pass": -9.844333362579345, "rewards/check_solution": -7.108333559036255, "rewards/check_solution_words": -5.930000205039978, "rewards/check_word_guesses": 5.704000172615051, "step": 2250 }, { "completion_length": 348.84, "epoch": 0.5111111111111111, "grad_norm": NaN, "kl": 1024.9925995969772, "learning_rate": 2.8402495617575194e-06, "loss": 40.9997, "reward": -16.76200065612793, "reward_std": 7.183129785060882, "rewards/check_first_pass": -9.864666690826416, "rewards/check_solution": -7.2250002205371855, "rewards/check_solution_words": -5.689333559274673, "rewards/check_word_guesses": 6.017000169754028, "step": 2300 }, { "completion_length": 289.6, "epoch": 0.5222222222222223, "grad_norm": NaN, "kl": 16142.521054096222, "learning_rate": 2.743957249397874e-06, "loss": 645.7007, "reward": -16.45316722869873, "reward_std": 4.948937799930572, "rewards/check_first_pass": -9.819833374023437, "rewards/check_solution": -7.300000195503235, "rewards/check_solution_words": -5.422000164464116, "rewards/check_word_guesses": 6.088666820526123, "step": 2350 }, { "completion_length": 317.74, "epoch": 0.5333333333333333, "grad_norm": NaN, "kl": 6416.924462666511, "learning_rate": 2.647298002161259e-06, "loss": 256.677, "reward": -17.084333810806275, "reward_std": 7.063381274342537, "rewards/check_first_pass": -9.841333360671998, "rewards/check_solution": -6.800000195503235, "rewards/check_solution_words": -6.629000161886215, "rewards/check_word_guesses": 6.186000170707703, "step": 2400 }, { "completion_length": 340.08, "epoch": 0.5444444444444444, "grad_norm": NaN, "kl": 495.02255284786224, "learning_rate": 2.5504172047373307e-06, "loss": 19.8009, "reward": -17.46050048828125, "reward_std": 5.4614636421203615, "rewards/check_first_pass": -9.889833354949952, "rewards/check_solution": -7.275000338554382, "rewards/check_solution_words": -5.955000147819519, "rewards/check_word_guesses": 5.6593334770202635, "step": 2450 }, { "completion_length": 285.88, "epoch": 0.5555555555555556, "grad_norm": NaN, "kl": 10988.790504102706, "learning_rate": 2.453460575048269e-06, "loss": 439.5517, "reward": -17.500333890914916, "reward_std": 5.9168941748142245, "rewards/check_first_pass": -9.958333339691162, "rewards/check_solution": -7.166666883230209, "rewards/check_solution_words": -5.92033349275589, "rewards/check_word_guesses": 5.545000162124634, "step": 2500 }, { "completion_length": 340.49666687011717, "epoch": 0.5666666666666667, "grad_norm": NaN, "kl": 171.32317941427232, "learning_rate": 2.356573945075186e-06, "loss": 6.8529, "reward": -16.252167136669158, "reward_std": 6.710142252445221, "rewards/check_first_pass": -9.75783338546753, "rewards/check_solution": -6.816666860580444, "rewards/check_solution_words": -6.226333482265472, "rewards/check_word_guesses": 6.548666839599609, "step": 2550 }, { "completion_length": 284.82, "epoch": 0.5777777777777777, "grad_norm": NaN, "kl": 8672.753908576966, "learning_rate": 2.259903041512972e-06, "loss": 346.9101, "reward": -16.83800039291382, "reward_std": 5.799649630486965, "rewards/check_first_pass": -9.978333339691162, "rewards/check_solution": -7.1666668510437015, "rewards/check_solution_words": -5.5156668204069135, "rewards/check_word_guesses": 5.822666735649109, "step": 2600 }, { "completion_length": 301.34, "epoch": 0.5888888888888889, "grad_norm": NaN, "kl": 686.374276099205, "learning_rate": 2.1635932665835018e-06, "loss": 27.455, "reward": -17.0438338804245, "reward_std": 5.962910556793213, "rewards/check_first_pass": -9.84416669845581, "rewards/check_solution": -7.325000264644623, "rewards/check_solution_words": -5.813000155165792, "rewards/check_word_guesses": 5.938333511352539, "step": 2650 }, { "completion_length": 292.34, "epoch": 0.6, "grad_norm": NaN, "kl": 1963.2864310121536, "learning_rate": 2.067789479336885e-06, "loss": 78.5315, "reward": -15.607000465393066, "reward_std": 5.905590240955353, "rewards/check_first_pass": -9.895000019073485, "rewards/check_solution": -6.775000203847885, "rewards/check_solution_words": -4.82066678121686, "rewards/check_word_guesses": 5.8836668109893795, "step": 2700 }, { "completion_length": 350.4, "epoch": 0.6111111111111112, "grad_norm": NaN, "kl": 688.4688113546372, "learning_rate": 1.9726357777696866e-06, "loss": 27.5388, "reward": -17.202000522613524, "reward_std": 5.623795807361603, "rewards/check_first_pass": -9.915000019073487, "rewards/check_solution": -7.375000247955322, "rewards/check_solution_words": -5.567666873335838, "rewards/check_word_guesses": 5.655666809082032, "step": 2750 }, { "completion_length": 308.42, "epoch": 0.6222222222222222, "grad_norm": NaN, "kl": 485312.439354682, "learning_rate": 1.8782752820878636e-06, "loss": 19412.4987, "reward": -15.53150053024292, "reward_std": 6.146713740825653, "rewards/check_first_pass": -9.897833366394043, "rewards/check_solution": -6.983333554267883, "rewards/check_solution_words": -4.4923334294557575, "rewards/check_word_guesses": 5.842000155448914, "step": 2800 }, { "completion_length": 326.0, "epoch": 0.6333333333333333, "grad_norm": NaN, "kl": 26110951.201275483, "learning_rate": 1.7848499194403675e-06, "loss": 1044437.84, "reward": -15.57800045967102, "reward_std": 7.1403307795524595, "rewards/check_first_pass": -9.891666717529297, "rewards/check_solution": -6.400000190734863, "rewards/check_solution_words": -5.083000233620405, "rewards/check_word_guesses": 5.796666831970215, "step": 2850 }, { "completion_length": 312.12, "epoch": 0.6444444444444445, "grad_norm": NaN, "kl": 3913.757111439705, "learning_rate": 1.6925002104472419e-06, "loss": 156.5503, "reward": -16.04266695022583, "reward_std": 7.127046866416931, "rewards/check_first_pass": -9.839000053405762, "rewards/check_solution": -7.0666668462753295, "rewards/check_solution_words": -4.946666868329048, "rewards/check_word_guesses": 5.809666833877563, "step": 2900 }, { "completion_length": 356.46, "epoch": 0.6555555555555556, "grad_norm": NaN, "kl": 445.1412619686127, "learning_rate": 1.6013650578432507e-06, "loss": 17.8057, "reward": -16.154833793640137, "reward_std": 7.507874011993408, "rewards/check_first_pass": -9.85783338546753, "rewards/check_solution": -6.758333542346954, "rewards/check_solution_words": -5.6680002117156985, "rewards/check_word_guesses": 6.129333510398864, "step": 2950 }, { "completion_length": 285.36, "epoch": 0.6666666666666666, "grad_norm": NaN, "kl": 9137.483488841057, "learning_rate": 1.5115815375549814e-06, "loss": 365.4994, "reward": -16.04333374977112, "reward_std": 6.235447915792466, "rewards/check_first_pass": -9.858000030517578, "rewards/check_solution": -6.933333530426025, "rewards/check_solution_words": -5.124000203659136, "rewards/check_word_guesses": 5.8720001077651975, "step": 3000 }, { "completion_length": 308.92, "epoch": 0.6777777777777778, "grad_norm": NaN, "kl": 1857.2192311406136, "learning_rate": 1.4232846925256205e-06, "loss": 74.2888, "reward": -16.59900043487549, "reward_std": 5.802737797498703, "rewards/check_first_pass": -9.956666679382325, "rewards/check_solution": -7.358333554267883, "rewards/check_solution_words": -5.168666838780045, "rewards/check_word_guesses": 5.884666800498962, "step": 3050 }, { "completion_length": 314.78, "epoch": 0.6888888888888889, "grad_norm": NaN, "kl": 2205.9235731983185, "learning_rate": 1.3366073295975462e-06, "loss": 88.2369, "reward": -15.489000520706178, "reward_std": 7.704707877635956, "rewards/check_first_pass": -9.765000047683715, "rewards/check_solution": -6.691666855812072, "rewards/check_solution_words": -4.798666906654835, "rewards/check_word_guesses": 5.766333532333374, "step": 3100 }, { "completion_length": 371.68, "epoch": 0.7, "grad_norm": NaN, "kl": 2451.063106870651, "learning_rate": 1.2516798197582186e-06, "loss": 98.0425, "reward": -16.43950053215027, "reward_std": 7.733632433414459, "rewards/check_first_pass": -9.95650001525879, "rewards/check_solution": -7.050000228881836, "rewards/check_solution_words": -5.417666826248169, "rewards/check_word_guesses": 5.984666795730591, "step": 3150 }, { "completion_length": 290.58, "epoch": 0.7111111111111111, "grad_norm": NaN, "kl": 1174.035637011528, "learning_rate": 1.1686299020498307e-06, "loss": 46.9614, "reward": -15.596167030334472, "reward_std": 6.710944714546204, "rewards/check_first_pass": -9.721166715621948, "rewards/check_solution": -6.516666855812073, "rewards/check_solution_words": -5.293000175356865, "rewards/check_word_guesses": 5.934666833877563, "step": 3200 } ], "logging_steps": 50, "max_steps": 4500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 12, "trial_name": null, "trial_params": null }