{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 100,
  "global_step": 283,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "completion_length": 385.89532871246337,
      "epoch": 0.0176678445229682,
      "grad_norm": 0.5257675647735596,
      "kl": 0.0012482523918151856,
      "learning_rate": 3.448275862068966e-06,
      "loss": 0.0001,
      "reward": 0.6444196742027998,
      "reward_std": 0.4479222524911165,
      "rewards/accuracy_reward": 0.1539062581025064,
      "rewards/format_reward": 0.49051341526210307,
      "step": 5
    },
    {
      "completion_length": 229.56596965789794,
      "epoch": 0.0353356890459364,
      "grad_norm": 0.5334790945053101,
      "kl": 0.3332817077636719,
      "learning_rate": 6.896551724137932e-06,
      "loss": 0.0133,
      "reward": 0.9689732536673545,
      "reward_std": 0.26624082447960973,
      "rewards/accuracy_reward": 0.08214286140864716,
      "rewards/format_reward": 0.8868303976953029,
      "step": 10
    },
    {
      "completion_length": 157.9798059463501,
      "epoch": 0.053003533568904596,
      "grad_norm": 0.287689745426178,
      "kl": 0.04854583740234375,
      "learning_rate": 1.0344827586206898e-05,
      "loss": 0.0019,
      "reward": 1.062388438731432,
      "reward_std": 0.20184952337294818,
      "rewards/accuracy_reward": 0.10022321890573949,
      "rewards/format_reward": 0.962165217846632,
      "step": 15
    },
    {
      "completion_length": 171.60927028656005,
      "epoch": 0.0706713780918728,
      "grad_norm": 0.2567259669303894,
      "kl": 0.078216552734375,
      "learning_rate": 1.3793103448275863e-05,
      "loss": 0.0031,
      "reward": 1.1186384424567222,
      "reward_std": 0.24078293070197104,
      "rewards/accuracy_reward": 0.15345982832368463,
      "rewards/format_reward": 0.9651786126196384,
      "step": 20
    },
    {
      "completion_length": 210.81998748779296,
      "epoch": 0.08833922261484099,
      "grad_norm": 0.5349090695381165,
      "kl": 0.112786865234375,
      "learning_rate": 1.7241379310344828e-05,
      "loss": 0.0045,
      "reward": 1.1583705946803093,
      "reward_std": 0.3293624483048916,
      "rewards/accuracy_reward": 0.21551340306177735,
      "rewards/format_reward": 0.9428571835160255,
      "step": 25
    },
    {
      "completion_length": 293.2053703308105,
      "epoch": 0.10600706713780919,
      "grad_norm": 0.19419077038764954,
      "kl": 0.09522705078125,
      "learning_rate": 1.999923511388017e-05,
      "loss": 0.0038,
      "reward": 1.2208705872297287,
      "reward_std": 0.36669483222067356,
      "rewards/accuracy_reward": 0.2871651930734515,
      "rewards/format_reward": 0.9337054029107094,
      "step": 30
    },
    {
      "completion_length": 323.5448799133301,
      "epoch": 0.12367491166077739,
      "grad_norm": 0.22766603529453278,
      "kl": 0.39053955078125,
      "learning_rate": 1.9972476383747748e-05,
      "loss": 0.0157,
      "reward": 1.2860491678118706,
      "reward_std": 0.30488016121089456,
      "rewards/accuracy_reward": 0.3202009078115225,
      "rewards/format_reward": 0.9658482551574707,
      "step": 35
    },
    {
      "completion_length": 285.7590543746948,
      "epoch": 0.1413427561837456,
      "grad_norm": 0.3550393283367157,
      "kl": 0.118524169921875,
      "learning_rate": 1.9907590277344582e-05,
      "loss": 0.0047,
      "reward": 1.2512277334928512,
      "reward_std": 0.34426755234599116,
      "rewards/accuracy_reward": 0.30758929941803215,
      "rewards/format_reward": 0.9436384335160255,
      "step": 40
    },
    {
      "completion_length": 269.9738945007324,
      "epoch": 0.15901060070671377,
      "grad_norm": 2.2805335521698,
      "kl": 1.9856201171875,
      "learning_rate": 1.9804824871166254e-05,
      "loss": 0.0796,
      "reward": 1.1523438021540642,
      "reward_std": 0.46314122155308723,
      "rewards/accuracy_reward": 0.316294657997787,
      "rewards/format_reward": 0.8360491469502449,
      "step": 45
    },
    {
      "completion_length": 249.81585788726807,
      "epoch": 0.17667844522968199,
      "grad_norm": 6.213081359863281,
      "kl": 3.11435546875,
      "learning_rate": 1.9664573064143604e-05,
      "loss": 0.1247,
      "reward": 0.8168527133762836,
      "reward_std": 0.42027441747486594,
      "rewards/accuracy_reward": 0.22991072395816445,
      "rewards/format_reward": 0.5869419902563096,
      "step": 50
    },
    {
      "completion_length": 837.4478942871094,
      "epoch": 0.19434628975265017,
      "grad_norm": 0.1708061844110489,
      "kl": 0.7300537109375,
      "learning_rate": 1.948737107548771e-05,
      "loss": 0.0292,
      "reward": 0.0875000040512532,
      "reward_std": 0.16735761975869537,
      "rewards/accuracy_reward": 0.03895089457510039,
      "rewards/format_reward": 0.048549109499435875,
      "step": 55
    },
    {
      "completion_length": 1024.0,
      "epoch": 0.21201413427561838,
      "grad_norm": 0.16706973314285278,
      "kl": 0.364111328125,
      "learning_rate": 1.9273896394584103e-05,
      "loss": 0.0146,
      "reward": 0.09107143309665844,
      "reward_std": 0.14619714424479752,
      "rewards/accuracy_reward": 0.09084821873111651,
      "rewards/format_reward": 0.00022321429569274187,
      "step": 60
    },
    {
      "completion_length": 1024.0,
      "epoch": 0.22968197879858657,
      "grad_norm": 0.10041823238134384,
      "kl": 0.2697998046875,
      "learning_rate": 1.9024965190774262e-05,
      "loss": 0.0108,
      "reward": 0.17421875786967575,
      "reward_std": 0.23079481534659863,
      "rewards/accuracy_reward": 0.17421875786967575,
      "rewards/format_reward": 0.0,
      "step": 65
    },
    {
      "completion_length": 1024.0,
      "epoch": 0.24734982332155478,
      "grad_norm": 0.3122642934322357,
      "kl": 0.28310546875,
      "learning_rate": 1.8741529192927528e-05,
      "loss": 0.0113,
      "reward": 0.19218751024454833,
      "reward_std": 0.23374766409397124,
      "rewards/accuracy_reward": 0.19218751024454833,
      "rewards/format_reward": 0.0,
      "step": 70
    },
    {
      "completion_length": 1024.0,
      "epoch": 0.26501766784452296,
      "grad_norm": 0.3012256622314453,
      "kl": 0.3281494140625,
      "learning_rate": 1.8424672050733577e-05,
      "loss": 0.0131,
      "reward": 0.20256697395816445,
      "reward_std": 0.23207057397812605,
      "rewards/accuracy_reward": 0.20122768832370638,
      "rewards/format_reward": 0.0013392857741564511,
      "step": 75
    },
    {
      "completion_length": 1024.0,
      "epoch": 0.2826855123674912,
      "grad_norm": 0.2279616743326187,
      "kl": 0.569970703125,
      "learning_rate": 1.8075605191627242e-05,
      "loss": 0.0228,
      "reward": 0.1684151873923838,
      "reward_std": 0.1943290094844997,
      "rewards/accuracy_reward": 0.16808036593720316,
      "rewards/format_reward": 0.0003348214435391128,
      "step": 80
    },
    {
      "completion_length": 1023.7799125671387,
      "epoch": 0.3003533568904594,
      "grad_norm": 0.1516251116991043,
      "kl": 0.51630859375,
      "learning_rate": 1.7695663189185703e-05,
      "loss": 0.0207,
      "reward": 0.21607143906876444,
      "reward_std": 0.20886625591665506,
      "rewards/accuracy_reward": 0.21607143906876444,
      "rewards/format_reward": 0.0,
      "step": 85
    },
    {
      "completion_length": 1023.1735549926758,
      "epoch": 0.31802120141342755,
      "grad_norm": 0.09773323684930801,
      "kl": 0.39691162109375,
      "learning_rate": 1.7286298660705877e-05,
      "loss": 0.0159,
      "reward": 0.2613839410245419,
      "reward_std": 0.2178319870494306,
      "rewards/accuracy_reward": 0.2613839410245419,
      "rewards/format_reward": 0.0,
      "step": 90
    },
    {
      "completion_length": 1022.9333808898925,
      "epoch": 0.33568904593639576,
      "grad_norm": 0.2660332918167114,
      "kl": 0.3833740234375,
      "learning_rate": 1.6849076713469914e-05,
      "loss": 0.0153,
      "reward": 0.2964285839349031,
      "reward_std": 0.23299281364306806,
      "rewards/accuracy_reward": 0.2964285839349031,
      "rewards/format_reward": 0.0,
      "step": 95
    },
    {
      "completion_length": 1022.3206634521484,
      "epoch": 0.35335689045936397,
      "grad_norm": 1.3615121841430664,
      "kl": 0.835400390625,
      "learning_rate": 1.6385668960932143e-05,
      "loss": 0.0334,
      "reward": 0.1241071482654661,
      "reward_std": 0.16077298847958446,
      "rewards/accuracy_reward": 0.1241071482654661,
      "rewards/format_reward": 0.0,
      "step": 100
    },
    {
      "epoch": 0.35335689045936397,
      "eval_completion_length": 1021.8020172119141,
      "eval_kl": 0.794921875,
      "eval_loss": 0.03294466808438301,
      "eval_reward": 0.1272321492433548,
      "eval_reward_std": 0.20546478778123856,
      "eval_rewards/accuracy_reward": 0.1272321492433548,
      "eval_rewards/format_reward": 0.0,
      "eval_runtime": 83.4283,
      "eval_samples_per_second": 1.187,
      "eval_steps_per_second": 0.012,
      "step": 100
    },
    {
      "completion_length": 1023.4450950622559,
      "epoch": 0.3710247349823322,
      "grad_norm": 1.5146944522857666,
      "kl": 1.085205078125,
      "learning_rate": 1.5897847131705194e-05,
      "loss": 0.0434,
      "reward": 0.12544643497094513,
      "reward_std": 0.1771216381341219,
      "rewards/accuracy_reward": 0.12544643497094513,
      "rewards/format_reward": 0.0,
      "step": 105
    },
    {
      "completion_length": 1023.4677505493164,
      "epoch": 0.38869257950530034,
      "grad_norm": 0.6001514196395874,
      "kl": 1.1712890625,
      "learning_rate": 1.5387476295779737e-05,
      "loss": 0.0469,
      "reward": 0.16138393647270277,
      "reward_std": 0.20901122770737857,
      "rewards/accuracy_reward": 0.15982143628643825,
      "rewards/format_reward": 0.0015625000698491931,
      "step": 110
    },
    {
      "completion_length": 1023.904689025879,
      "epoch": 0.40636042402826855,
      "grad_norm": 0.357604444026947,
      "kl": 0.9640380859375,
      "learning_rate": 1.4856507733875837e-05,
      "loss": 0.0386,
      "reward": 0.2774553706869483,
      "reward_std": 0.22977914968505503,
      "rewards/accuracy_reward": 0.27712054904550315,
      "rewards/format_reward": 0.0003348214435391128,
      "step": 115
    },
    {
      "completion_length": 1023.7766761779785,
      "epoch": 0.42402826855123676,
      "grad_norm": 2.9171788692474365,
      "kl": 1.54794921875,
      "learning_rate": 1.4306971477188223e-05,
      "loss": 0.0619,
      "reward": 0.29587054867297413,
      "reward_std": 0.22431441079825162,
      "rewards/accuracy_reward": 0.2954241203144193,
      "rewards/format_reward": 0.00044642859138548373,
      "step": 120
    },
    {
      "completion_length": 1023.5507850646973,
      "epoch": 0.4416961130742049,
      "grad_norm": 1.7767579555511475,
      "kl": 1.08740234375,
      "learning_rate": 1.3740968546047935e-05,
      "loss": 0.0435,
      "reward": 0.26272322572767737,
      "reward_std": 0.2170619947835803,
      "rewards/accuracy_reward": 0.2623884044587612,
      "rewards/format_reward": 0.0003348214435391128,
      "step": 125
    },
    {
      "completion_length": 1023.353914642334,
      "epoch": 0.45936395759717313,
      "grad_norm": 0.626575231552124,
      "kl": 1.624072265625,
      "learning_rate": 1.3160662917174045e-05,
      "loss": 0.065,
      "reward": 0.2738839427009225,
      "reward_std": 0.21952001163735985,
      "rewards/accuracy_reward": 0.27332590725272893,
      "rewards/format_reward": 0.0005580357392318547,
      "step": 130
    },
    {
      "completion_length": 1023.6706512451171,
      "epoch": 0.47703180212014135,
      "grad_norm": 2.2120277881622314,
      "kl": 1.8474609375,
      "learning_rate": 1.2568273250226681e-05,
      "loss": 0.0739,
      "reward": 0.27935269083827735,
      "reward_std": 0.2362092829309404,
      "rewards/accuracy_reward": 0.2791294766589999,
      "rewards/format_reward": 0.00022321429569274187,
      "step": 135
    },
    {
      "completion_length": 1023.9494422912597,
      "epoch": 0.49469964664310956,
      "grad_norm": 3.246752977371216,
      "kl": 2.728662109375,
      "learning_rate": 1.1966064405292887e-05,
      "loss": 0.1091,
      "reward": 0.2922991219907999,
      "reward_std": 0.23428730978630483,
      "rewards/accuracy_reward": 0.2919643005356193,
      "rewards/format_reward": 0.0003348214435391128,
      "step": 140
    },
    {
      "completion_length": 1023.9381698608398,
      "epoch": 0.5123674911660777,
      "grad_norm": 11.377167701721191,
      "kl": 2.4001953125,
      "learning_rate": 1.1356338783736256e-05,
      "loss": 0.096,
      "reward": 0.31104912236332893,
      "reward_std": 0.2248768277466297,
      "rewards/accuracy_reward": 0.31093751527369023,
      "rewards/format_reward": 0.00011160714784637093,
      "step": 145
    },
    {
      "completion_length": 1023.8702018737793,
      "epoch": 0.5300353356890459,
      "grad_norm": 4.7356181144714355,
      "kl": 0.99365234375,
      "learning_rate": 1.0741427525516463e-05,
      "loss": 0.0398,
      "reward": 0.2786830499768257,
      "reward_std": 0.22981408620253205,
      "rewards/accuracy_reward": 0.2775669790804386,
      "rewards/format_reward": 0.0011160714784637094,
      "step": 150
    },
    {
      "completion_length": 1023.8892868041992,
      "epoch": 0.5477031802120141,
      "grad_norm": 0.48026323318481445,
      "kl": 1.191552734375,
      "learning_rate": 1.012368159663363e-05,
      "loss": 0.0477,
      "reward": 0.2672991200350225,
      "reward_std": 0.2149678454734385,
      "rewards/accuracy_reward": 0.2670759057626128,
      "rewards/format_reward": 0.00022321429569274187,
      "step": 155
    },
    {
      "completion_length": 1023.9833709716797,
      "epoch": 0.5653710247349824,
      "grad_norm": 0.2914314866065979,
      "kl": 0.31871337890625,
      "learning_rate": 9.505462800772612e-06,
      "loss": 0.0127,
      "reward": 0.2546875115483999,
      "reward_std": 0.21248297598212956,
      "rewards/accuracy_reward": 0.2534598330967128,
      "rewards/format_reward": 0.0012276786263100802,
      "step": 160
    },
    {
      "completion_length": 1023.8994430541992,
      "epoch": 0.5830388692579506,
      "grad_norm": 0.403013676404953,
      "kl": 0.38603515625,
      "learning_rate": 8.889134749511956e-06,
      "loss": 0.0154,
      "reward": 0.23883929578587412,
      "reward_std": 0.21736905500292777,
      "rewards/accuracy_reward": 0.23627233160659672,
      "rewards/format_reward": 0.0025669644004665316,
      "step": 165
    },
    {
      "completion_length": 1023.8919662475586,
      "epoch": 0.6007067137809188,
      "grad_norm": 2.4790608882904053,
      "kl": 0.843505859375,
      "learning_rate": 8.277053825620836e-06,
      "loss": 0.0337,
      "reward": 0.22020090287551283,
      "reward_std": 0.225645115878433,
      "rewards/accuracy_reward": 0.21886161714792252,
      "rewards/format_reward": 0.0013392857741564511,
      "step": 170
    },
    {
      "completion_length": 1023.5271240234375,
      "epoch": 0.6183745583038869,
      "grad_norm": 1.517407774925232,
      "kl": 1.15859375,
      "learning_rate": 7.671560173993588e-06,
      "loss": 0.0463,
      "reward": 0.2196428684517741,
      "reward_std": 0.22536969408392907,
      "rewards/accuracy_reward": 0.21886161863803863,
      "rewards/format_reward": 0.0007812500349245966,
      "step": 175
    },
    {
      "completion_length": 1023.7799140930176,
      "epoch": 0.6360424028268551,
      "grad_norm": 3.477236747741699,
      "kl": 1.853125,
      "learning_rate": 7.07496875466589e-06,
      "loss": 0.0742,
      "reward": 0.18906251017469913,
      "reward_std": 0.21059290650300683,
      "rewards/accuracy_reward": 0.18861608181614428,
      "rewards/format_reward": 0.00044642859138548373,
      "step": 180
    },
    {
      "completion_length": 1023.8558059692383,
      "epoch": 0.6537102473498233,
      "grad_norm": 4.594732284545898,
      "kl": 1.274169921875,
      "learning_rate": 6.489560492119225e-06,
      "loss": 0.051,
      "reward": 0.22857143869623542,
      "reward_std": 0.23066243380308152,
      "rewards/accuracy_reward": 0.22745536724105478,
      "rewards/format_reward": 0.0011160714784637094,
      "step": 185
    },
    {
      "completion_length": 1023.9906257629394,
      "epoch": 0.6713780918727915,
      "grad_norm": 5.503405570983887,
      "kl": 1.36650390625,
      "learning_rate": 5.9175735547120975e-06,
      "loss": 0.0547,
      "reward": 0.2881696566008031,
      "reward_std": 0.24815111914649607,
      "rewards/accuracy_reward": 0.28437501201406123,
      "rewards/format_reward": 0.0037946430500596763,
      "step": 190
    },
    {
      "completion_length": 1023.9542419433594,
      "epoch": 0.6890459363957597,
      "grad_norm": 1.3981853723526,
      "kl": 2.093359375,
      "learning_rate": 5.361194797579108e-06,
      "loss": 0.0838,
      "reward": 0.29575894251465795,
      "reward_std": 0.2531373543664813,
      "rewards/accuracy_reward": 0.28939733523875477,
      "rewards/format_reward": 0.0063616074505262075,
      "step": 195
    },
    {
      "completion_length": 1023.9152908325195,
      "epoch": 0.7067137809187279,
      "grad_norm": 7.645321369171143,
      "kl": 1.69521484375,
      "learning_rate": 4.8225514017138205e-06,
      "loss": 0.0678,
      "reward": 0.26662947684526445,
      "reward_std": 0.2479257956147194,
      "rewards/accuracy_reward": 0.25881697619333865,
      "rewards/format_reward": 0.007812500360887497,
      "step": 200
    },
    {
      "epoch": 0.7067137809187279,
      "eval_completion_length": 1024.0,
      "eval_kl": 2.50390625,
      "eval_loss": 0.10209327191114426,
      "eval_reward": 0.310267873108387,
      "eval_reward_std": 0.22309495136141777,
      "eval_rewards/accuracy_reward": 0.305803582072258,
      "eval_rewards/format_reward": 0.004464285913854837,
      "eval_runtime": 91.3405,
      "eval_samples_per_second": 1.084,
      "eval_steps_per_second": 0.011,
      "step": 200
    },
    {
      "completion_length": 1024.0,
      "epoch": 0.7243816254416962,
      "grad_norm": 4.966090679168701,
      "kl": 2.449951171875,
      "learning_rate": 4.303702741201431e-06,
      "loss": 0.098,
      "reward": 0.24832590399309992,
      "reward_std": 0.24168844958767294,
      "rewards/accuracy_reward": 0.24118304727599024,
      "rewards/format_reward": 0.007142857485450804,
      "step": 205
    },
    {
      "completion_length": 1023.8795768737793,
      "epoch": 0.7420494699646644,
      "grad_norm": 7.903662204742432,
      "kl": 1.901416015625,
      "learning_rate": 3.8066325096949153e-06,
      "loss": 0.076,
      "reward": 0.25558037031441927,
      "reward_std": 0.2529288594610989,
      "rewards/accuracy_reward": 0.2431919751688838,
      "rewards/format_reward": 0.012388393504079432,
      "step": 210
    },
    {
      "completion_length": 1023.8687515258789,
      "epoch": 0.7597173144876325,
      "grad_norm": 2.2350668907165527,
      "kl": 1.812060546875,
      "learning_rate": 3.3332411362372063e-06,
      "loss": 0.0725,
      "reward": 0.258482154738158,
      "reward_std": 0.26162059921771286,
      "rewards/accuracy_reward": 0.24118304559960962,
      "rewards/format_reward": 0.01729910804424435,
      "step": 215
    },
    {
      "completion_length": 1023.8536842346191,
      "epoch": 0.7773851590106007,
      "grad_norm": 2.8165674209594727,
      "kl": 1.880078125,
      "learning_rate": 2.8853385194256677e-06,
      "loss": 0.0752,
      "reward": 0.24375001061707735,
      "reward_std": 0.25496505089104177,
      "rewards/accuracy_reward": 0.23002233263105154,
      "rewards/format_reward": 0.013727679383009672,
      "step": 220
    },
    {
      "completion_length": 1024.0,
      "epoch": 0.7950530035335689,
      "grad_norm": 3.0542807579040527,
      "kl": 1.645703125,
      "learning_rate": 2.464637107698046e-06,
      "loss": 0.0659,
      "reward": 0.26216519055888055,
      "reward_std": 0.27050148248672484,
      "rewards/accuracy_reward": 0.23950894009321927,
      "rewards/format_reward": 0.022656251245643945,
      "step": 225
    },
    {
      "completion_length": 1023.835604095459,
      "epoch": 0.8127208480565371,
      "grad_norm": 1.1989268064498901,
      "kl": 1.440283203125,
      "learning_rate": 2.072745352195794e-06,
      "loss": 0.0576,
      "reward": 0.2896205481141806,
      "reward_std": 0.28690752685070037,
      "rewards/accuracy_reward": 0.2671875134110451,
      "rewards/format_reward": 0.0224330369848758,
      "step": 230
    },
    {
      "completion_length": 1023.9050231933594,
      "epoch": 0.8303886925795053,
      "grad_norm": 0.9027553200721741,
      "kl": 1.21279296875,
      "learning_rate": 1.7111615572361628e-06,
      "loss": 0.0485,
      "reward": 0.2984375137835741,
      "reward_std": 0.29320847503840924,
      "rewards/accuracy_reward": 0.27187501136213543,
      "rewards/format_reward": 0.026562501350417732,
      "step": 235
    },
    {
      "completion_length": 1023.9849334716797,
      "epoch": 0.8480565371024735,
      "grad_norm": 1.0079853534698486,
      "kl": 0.9890380859375,
      "learning_rate": 1.381268151904298e-06,
      "loss": 0.0396,
      "reward": 0.3024553697556257,
      "reward_std": 0.3057010589167476,
      "rewards/accuracy_reward": 0.2669642990455031,
      "rewards/format_reward": 0.03549107307335362,
      "step": 240
    },
    {
      "completion_length": 1023.9882820129394,
      "epoch": 0.8657243816254417,
      "grad_norm": 0.6375559568405151,
      "kl": 0.80595703125,
      "learning_rate": 1.0843264046665558e-06,
      "loss": 0.0322,
      "reward": 0.28783483654260633,
      "reward_std": 0.2901507246308029,
      "rewards/accuracy_reward": 0.256250012293458,
      "rewards/format_reward": 0.0315848228870891,
      "step": 245
    },
    {
      "completion_length": 1023.8103805541992,
      "epoch": 0.8833922261484098,
      "grad_norm": 0.3038390576839447,
      "kl": 0.7016357421875,
      "learning_rate": 8.214716012124491e-07,
      "loss": 0.0281,
      "reward": 0.28537947684526443,
      "reward_std": 0.2940663579851389,
      "rewards/accuracy_reward": 0.2517857262864709,
      "rewards/format_reward": 0.033593751536682245,
      "step": 250
    },
    {
      "completion_length": 1023.9936386108399,
      "epoch": 0.901060070671378,
      "grad_norm": 0.7605869174003601,
      "kl": 0.6392333984375,
      "learning_rate": 5.937087039615619e-07,
      "loss": 0.0256,
      "reward": 0.3013393010944128,
      "reward_std": 0.2973380209878087,
      "rewards/accuracy_reward": 0.26551340594887735,
      "rewards/format_reward": 0.035825894423760475,
      "step": 255
    },
    {
      "completion_length": 1024.0,
      "epoch": 0.9187279151943463,
      "grad_norm": 0.4661541283130646,
      "kl": 0.6113037109375,
      "learning_rate": 4.019085098303077e-07,
      "loss": 0.0245,
      "reward": 0.30870537031441925,
      "reward_std": 0.301556083932519,
      "rewards/accuracy_reward": 0.27533483430743216,
      "rewards/format_reward": 0.03337053736904636,
      "step": 260
    },
    {
      "completion_length": 1023.9744422912597,
      "epoch": 0.9363957597173145,
      "grad_norm": 0.5394344925880432,
      "kl": 0.59677734375,
      "learning_rate": 2.4680432094837394e-07,
      "loss": 0.0239,
      "reward": 0.2986607299186289,
      "reward_std": 0.3167744716629386,
      "rewards/accuracy_reward": 0.2603794766589999,
      "rewards/format_reward": 0.03828125168802217,
      "step": 265
    },
    {
      "completion_length": 1023.7898460388184,
      "epoch": 0.9540636042402827,
      "grad_norm": 0.35241708159446716,
      "kl": 0.6021240234375,
      "learning_rate": 1.289891410535593e-07,
      "loss": 0.0241,
      "reward": 0.30212054755538703,
      "reward_std": 0.30877320375293493,
      "rewards/accuracy_reward": 0.2652901913970709,
      "rewards/format_reward": 0.036830358975566926,
      "step": 270
    },
    {
      "completion_length": 1023.873885345459,
      "epoch": 0.9717314487632509,
      "grad_norm": 0.3670821189880371,
      "kl": 0.5589111328125,
      "learning_rate": 4.8913408283934874e-08,
      "loss": 0.0224,
      "reward": 0.30368304885923864,
      "reward_std": 0.30286577958613636,
      "rewards/accuracy_reward": 0.2606026901863515,
      "rewards/format_reward": 0.04308035911526531,
      "step": 275
    },
    {
      "completion_length": 1023.9140632629394,
      "epoch": 0.9893992932862191,
      "grad_norm": 0.4401331841945648,
      "kl": 0.5879638671875,
      "learning_rate": 6.883273035447335e-09,
      "loss": 0.0235,
      "reward": 0.31294644335284827,
      "reward_std": 0.3160593772307038,
      "rewards/accuracy_reward": 0.2747767997905612,
      "rewards/format_reward": 0.038169644912704824,
      "step": 280
    },
    {
      "completion_length": 1023.8789800008138,
      "epoch": 1.0,
      "kl": 0.8118896484375,
      "reward": 0.30822173940638703,
      "reward_std": 0.3040016482894619,
      "rewards/accuracy_reward": 0.27101935756703216,
      "rewards/format_reward": 0.03720238265426209,
      "step": 283,
      "total_flos": 0.0,
      "train_loss": 0.040519028099076065,
      "train_runtime": 54178.7039,
      "train_samples_per_second": 1.337,
      "train_steps_per_second": 0.005
    }
  ],
  "logging_steps": 5,
  "max_steps": 283,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}