{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 630.2682495117188, "epoch": 0.0032, "grad_norm": 0.18100202083587646, "kl": 0.0, "learning_rate": 9.375e-08, "loss": 0.0591, "reward": 0.4453125149011612, "reward_std": 0.4270050972700119, "rewards/accuracy_reward": 0.4453125149011612, "rewards/format_reward": 0.0, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 619.2024908065796, "epoch": 0.016, "grad_norm": 0.18559527397155762, "kl": 3.332272171974182e-05, "learning_rate": 4.6875e-07, "loss": 0.0321, "reward": 0.4544270965270698, "reward_std": 0.3922502617351711, "rewards/accuracy_reward": 0.4544270965270698, "rewards/format_reward": 0.0, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 616.4765823364257, "epoch": 0.032, "grad_norm": 0.14138484001159668, "kl": 5.02467155456543e-05, "learning_rate": 9.375e-07, "loss": 0.0469, "reward": 0.45833334475755694, "reward_std": 0.38790060505270957, "rewards/accuracy_reward": 0.45833334475755694, "rewards/format_reward": 0.0, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 618.3765830993652, "epoch": 0.048, "grad_norm": 0.13938194513320923, "kl": 6.646513938903808e-05, "learning_rate": 1.40625e-06, "loss": 0.0664, "reward": 0.4510416783392429, "reward_std": 0.4062742032110691, "rewards/accuracy_reward": 0.4510416783392429, "rewards/format_reward": 0.0, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 630.8286605834961, "epoch": 0.064, "grad_norm": 0.17926669120788574, "kl": 0.000231248140335083, "learning_rate": 1.875e-06, "loss": 0.0335, "reward": 0.4291666807141155, "reward_std": 0.37871336173266174, "rewards/accuracy_reward": 0.4291666807141155, "rewards/format_reward": 0.0, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 595.8833557128906, "epoch": 0.08, "grad_norm": 0.15497559309005737, "kl": 0.0014862060546875, "learning_rate": 2.3437500000000002e-06, "loss": 0.0384, "reward": 0.4395833477377892, "reward_std": 0.35170271508395673, "rewards/accuracy_reward": 0.4395833477377892, "rewards/format_reward": 0.0, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 584.5953315734863, "epoch": 0.096, "grad_norm": 0.14022083580493927, "kl": 0.0031821727752685547, "learning_rate": 2.8125e-06, "loss": 0.0485, "reward": 0.5661458514630795, "reward_std": 0.35945560447871683, "rewards/accuracy_reward": 0.5661458514630795, "rewards/format_reward": 0.0, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 614.5599136352539, "epoch": 0.112, "grad_norm": 0.19263891875743866, "kl": 0.014362907409667969, "learning_rate": 2.9991503375003e-06, "loss": 0.0663, "reward": 0.5843750223517418, "reward_std": 0.3453484205529094, "rewards/accuracy_reward": 0.5843750223517418, "rewards/format_reward": 0.0, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 604.9989761352539, "epoch": 0.128, "grad_norm": 0.14800268411636353, "kl": 0.10425767898559571, "learning_rate": 2.993961440992859e-06, "loss": 0.0981, "reward": 0.6046875186264515, "reward_std": 0.3031945077702403, "rewards/accuracy_reward": 0.6046875186264515, "rewards/format_reward": 0.0, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 598.249494934082, "epoch": 0.144, "grad_norm": 0.15018028020858765, "kl": 0.0073070526123046875, "learning_rate": 2.984071989079555e-06, "loss": 0.0797, "reward": 0.6151041837409139, "reward_std": 0.30993619058281185, "rewards/accuracy_reward": 0.6151041837409139, "rewards/format_reward": 0.0, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 583.8656433105468, "epoch": 0.16, "grad_norm": 0.09353411942720413, "kl": 0.00832061767578125, "learning_rate": 2.9695130976348534e-06, "loss": 0.0532, "reward": 0.6708333496004343, "reward_std": 0.2739528791978955, "rewards/accuracy_reward": 0.6708333496004343, "rewards/format_reward": 0.0, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 585.1677238464356, "epoch": 0.176, "grad_norm": 0.11824514716863632, "kl": 0.006532478332519531, "learning_rate": 2.9503305743175096e-06, "loss": 0.0745, "reward": 0.6526041820645332, "reward_std": 0.2813040278851986, "rewards/accuracy_reward": 0.6526041820645332, "rewards/format_reward": 0.0, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 603.057827758789, "epoch": 0.192, "grad_norm": 0.12121513485908508, "kl": 0.006975364685058594, "learning_rate": 2.9265847744427307e-06, "loss": 0.0656, "reward": 0.6848958484828472, "reward_std": 0.28820009287446735, "rewards/accuracy_reward": 0.6848958484828472, "rewards/format_reward": 0.0, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 540.6729316711426, "epoch": 0.208, "grad_norm": 0.12078370153903961, "kl": 0.007327461242675781, "learning_rate": 2.8983504110820214e-06, "loss": 0.0526, "reward": 0.7104166865348815, "reward_std": 0.2454069536179304, "rewards/accuracy_reward": 0.7104166865348815, "rewards/format_reward": 0.0, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 556.3765823364258, "epoch": 0.224, "grad_norm": 0.13182389736175537, "kl": 0.013681411743164062, "learning_rate": 2.865716319988224e-06, "loss": 0.0439, "reward": 0.6989583551883698, "reward_std": 0.22192855682224036, "rewards/accuracy_reward": 0.6989583551883698, "rewards/format_reward": 0.0, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 595.7729370117188, "epoch": 0.24, "grad_norm": 0.1350754201412201, "kl": 0.007706451416015625, "learning_rate": 2.82878518008537e-06, "loss": 0.0371, "reward": 0.6541666880249977, "reward_std": 0.26559926718473437, "rewards/accuracy_reward": 0.6541666880249977, "rewards/format_reward": 0.0, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 578.9286628723145, "epoch": 0.256, "grad_norm": 0.07105021178722382, "kl": 0.01440887451171875, "learning_rate": 2.7876731904027993e-06, "loss": 0.0635, "reward": 0.6364583499729634, "reward_std": 0.2588253363966942, "rewards/accuracy_reward": 0.6364583499729634, "rewards/format_reward": 0.0, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 556.3427207946777, "epoch": 0.272, "grad_norm": 0.13926267623901367, "kl": 0.009515762329101562, "learning_rate": 2.7425097044700246e-06, "loss": 0.0885, "reward": 0.6906250216066837, "reward_std": 0.28641235511749985, "rewards/accuracy_reward": 0.6906250216066837, "rewards/format_reward": 0.0, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 603.8635650634766, "epoch": 0.288, "grad_norm": 0.5559701919555664, "kl": 0.01368865966796875, "learning_rate": 2.6934368233226715e-06, "loss": 0.0412, "reward": 0.6442708518356085, "reward_std": 0.2803659217432141, "rewards/accuracy_reward": 0.6442708518356085, "rewards/format_reward": 0.0, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 584.773974609375, "epoch": 0.304, "grad_norm": 0.20091742277145386, "kl": 0.015287017822265625, "learning_rate": 2.6406089484000465e-06, "loss": 0.0501, "reward": 0.6864583514630794, "reward_std": 0.2797013459727168, "rewards/accuracy_reward": 0.6864583514630794, "rewards/format_reward": 0.0, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 611.5390769958497, "epoch": 0.32, "grad_norm": 0.19931000471115112, "kl": 0.014678192138671876, "learning_rate": 2.584192295741087e-06, "loss": 0.0677, "reward": 0.6432291803881526, "reward_std": 0.27812601886689664, "rewards/accuracy_reward": 0.6432291803881526, "rewards/format_reward": 0.0, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 559.4989761352539, "epoch": 0.336, "grad_norm": 0.674874484539032, "kl": 0.026818084716796874, "learning_rate": 2.5243643730072105e-06, "loss": 0.1125, "reward": 0.6921875223517417, "reward_std": 0.3017659166827798, "rewards/accuracy_reward": 0.6921875223517417, "rewards/format_reward": 0.0, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 615.2713722229004, "epoch": 0.352, "grad_norm": 0.26349034905433655, "kl": 0.034295654296875, "learning_rate": 2.461313420977536e-06, "loss": 0.0773, "reward": 0.6260416809469461, "reward_std": 0.2920157488435507, "rewards/accuracy_reward": 0.6260416809469461, "rewards/format_reward": 0.0, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 612.5937660217285, "epoch": 0.368, "grad_norm": 0.31401100754737854, "kl": 0.0447296142578125, "learning_rate": 2.3952378212737554e-06, "loss": 0.1057, "reward": 0.6614583518356085, "reward_std": 0.3118196573108435, "rewards/accuracy_reward": 0.6614583518356085, "rewards/format_reward": 0.0, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 609.3489768981933, "epoch": 0.384, "grad_norm": 0.2624000310897827, "kl": 0.09675827026367187, "learning_rate": 2.3263454721781537e-06, "loss": 0.1104, "reward": 0.6385416872799397, "reward_std": 0.31758001670241354, "rewards/accuracy_reward": 0.6385416872799397, "rewards/format_reward": 0.0, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 603.4942878723144, "epoch": 0.4, "grad_norm": 0.3383127748966217, "kl": 0.08062591552734374, "learning_rate": 2.2548531345087003e-06, "loss": 0.0889, "reward": 0.5843750182539225, "reward_std": 0.2997864054515958, "rewards/accuracy_reward": 0.5843750182539225, "rewards/format_reward": 0.0, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 566.6187614440918, "epoch": 0.416, "grad_norm": 22.833431243896484, "kl": 0.4265045166015625, "learning_rate": 2.18098574960932e-06, "loss": 0.1108, "reward": 0.6468750156462193, "reward_std": 0.3227476103231311, "rewards/accuracy_reward": 0.6468750156462193, "rewards/format_reward": 0.0, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 607.7297058105469, "epoch": 0.432, "grad_norm": 4.733725547790527, "kl": 0.2651611328125, "learning_rate": 2.104975731601208e-06, "loss": 0.1039, "reward": 0.4890625137835741, "reward_std": 0.3437335778027773, "rewards/accuracy_reward": 0.4890625137835741, "rewards/format_reward": 0.0, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 618.4291839599609, "epoch": 0.448, "grad_norm": 6.314213752746582, "kl": 1.424951171875, "learning_rate": 2.027062236122014e-06, "loss": 0.1678, "reward": 0.5697916850447655, "reward_std": 0.3359814532101154, "rewards/accuracy_reward": 0.5697916850447655, "rewards/format_reward": 0.0, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 612.9937713623046, "epoch": 0.464, "grad_norm": 6.824621200561523, "kl": 1.3690185546875, "learning_rate": 1.9474904078537343e-06, "loss": 0.181, "reward": 0.5895833546295762, "reward_std": 0.30473556015640496, "rewards/accuracy_reward": 0.5895833546295762, "rewards/format_reward": 0.0, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 605.7078346252441, "epoch": 0.48, "grad_norm": 4.807999134063721, "kl": 1.19837646484375, "learning_rate": 1.866510609206841e-06, "loss": 0.1727, "reward": 0.6302083544433117, "reward_std": 0.294296738691628, "rewards/accuracy_reward": 0.6302083544433117, "rewards/format_reward": 0.0, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 611.7161628723145, "epoch": 0.496, "grad_norm": 2.232267141342163, "kl": 0.758135986328125, "learning_rate": 1.784377632587518e-06, "loss": 0.1231, "reward": 0.6208333497866988, "reward_std": 0.28419284280389545, "rewards/accuracy_reward": 0.6208333497866988, "rewards/format_reward": 0.0, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 636.8765823364258, "epoch": 0.512, "grad_norm": 1.6407095193862915, "kl": 1.393798828125, "learning_rate": 1.7013498987264833e-06, "loss": 0.1483, "reward": 0.5781250152736902, "reward_std": 0.30876432694494726, "rewards/accuracy_reward": 0.5781250152736902, "rewards/format_reward": 0.0, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 588.2614753723144, "epoch": 0.528, "grad_norm": 2.2173068523406982, "kl": 1.1007537841796875, "learning_rate": 1.6176886435917677e-06, "loss": 0.138, "reward": 0.6020833482500165, "reward_std": 0.2803305178880692, "rewards/accuracy_reward": 0.6020833482500165, "rewards/format_reward": 0.0, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 595.8026168823242, "epoch": 0.544, "grad_norm": 2.1072630882263184, "kl": 1.0657928466796875, "learning_rate": 1.5336570964437077e-06, "loss": 0.1183, "reward": 0.5796875182539225, "reward_std": 0.30551951825618745, "rewards/accuracy_reward": 0.5796875182539225, "rewards/format_reward": 0.0, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 581.3338653564454, "epoch": 0.56, "grad_norm": 4.4301981925964355, "kl": 1.3940032958984374, "learning_rate": 1.4495196516183096e-06, "loss": 0.1518, "reward": 0.5713541805744171, "reward_std": 0.30100441314280035, "rewards/accuracy_reward": 0.5713541805744171, "rewards/format_reward": 0.0, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 565.2599090576172, "epoch": 0.576, "grad_norm": 2.6226236820220947, "kl": 0.6679443359375, "learning_rate": 1.3655410366448499e-06, "loss": 0.1095, "reward": 0.6213541828095913, "reward_std": 0.3027579264715314, "rewards/accuracy_reward": 0.6213541828095913, "rewards/format_reward": 0.0, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 633.1932518005372, "epoch": 0.592, "grad_norm": 2.93147611618042, "kl": 1.785247802734375, "learning_rate": 1.2819854793151313e-06, "loss": 0.1128, "reward": 0.5682291792705655, "reward_std": 0.306190599501133, "rewards/accuracy_reward": 0.5682291792705655, "rewards/format_reward": 0.0, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 578.0770973205566, "epoch": 0.608, "grad_norm": 1.838672399520874, "kl": 0.891180419921875, "learning_rate": 1.199115876325091e-06, "loss": 0.115, "reward": 0.5697916863020509, "reward_std": 0.3095055213198066, "rewards/accuracy_reward": 0.5697916863020509, "rewards/format_reward": 0.0, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 574.092724609375, "epoch": 0.624, "grad_norm": 13.621001243591309, "kl": 1.43966064453125, "learning_rate": 1.1171929661045361e-06, "loss": 0.1622, "reward": 0.6223958551883697, "reward_std": 0.3022162653505802, "rewards/accuracy_reward": 0.6223958551883697, "rewards/format_reward": 0.0, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 594.4786628723144, "epoch": 0.64, "grad_norm": 3.5379152297973633, "kl": 1.4238235473632812, "learning_rate": 1.036474508437579e-06, "loss": 0.1298, "reward": 0.6213541850447655, "reward_std": 0.30077949464321135, "rewards/accuracy_reward": 0.6213541850447655, "rewards/format_reward": 0.0, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 571.3896018981934, "epoch": 0.656, "grad_norm": 2.5569472312927246, "kl": 1.15596923828125, "learning_rate": 9.57214473454992e-07, "loss": 0.1368, "reward": 0.603645849507302, "reward_std": 0.2896819781512022, "rewards/accuracy_reward": 0.603645849507302, "rewards/format_reward": 0.0, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 580.802619934082, "epoch": 0.672, "grad_norm": 2.466930866241455, "kl": 1.370220947265625, "learning_rate": 8.796622425502193e-07, "loss": 0.1354, "reward": 0.6088541835546494, "reward_std": 0.280212614685297, "rewards/accuracy_reward": 0.6088541835546494, "rewards/format_reward": 0.0, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 627.9989791870117, "epoch": 0.688, "grad_norm": 13.0488862991333, "kl": 1.364794921875, "learning_rate": 8.040618237332491e-07, "loss": 0.1359, "reward": 0.5328125124797225, "reward_std": 0.3135726748034358, "rewards/accuracy_reward": 0.5328125124797225, "rewards/format_reward": 0.0, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 606.6250205993653, "epoch": 0.704, "grad_norm": 1.2970831394195557, "kl": 1.391845703125, "learning_rate": 7.30651083891141e-07, "loss": 0.1338, "reward": 0.5791666809469461, "reward_std": 0.31143888011574744, "rewards/accuracy_reward": 0.5791666809469461, "rewards/format_reward": 0.0, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 615.9635604858398, "epoch": 0.72, "grad_norm": 6.471405982971191, "kl": 1.4354736328125, "learning_rate": 6.596610003707959e-07, "loss": 0.13, "reward": 0.6151041848585009, "reward_std": 0.3179119948297739, "rewards/accuracy_reward": 0.6151041848585009, "rewards/format_reward": 0.0, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 642.402099609375, "epoch": 0.736, "grad_norm": 1.8979251384735107, "kl": 2.40401611328125, "learning_rate": 5.913149342387704e-07, "loss": 0.1908, "reward": 0.5401041831821203, "reward_std": 0.3176923610270023, "rewards/accuracy_reward": 0.5401041831821203, "rewards/format_reward": 0.0, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 586.7692855834961, "epoch": 0.752, "grad_norm": 3.6844191551208496, "kl": 1.32728271484375, "learning_rate": 5.258279275047247e-07, "loss": 0.1608, "reward": 0.596875012665987, "reward_std": 0.3037980867549777, "rewards/accuracy_reward": 0.596875012665987, "rewards/format_reward": 0.0, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 566.0083518981934, "epoch": 0.768, "grad_norm": 1.5891520977020264, "kl": 1.258824920654297, "learning_rate": 4.63406026519703e-07, "loss": 0.1496, "reward": 0.6276041828095913, "reward_std": 0.2718733722344041, "rewards/accuracy_reward": 0.6276041828095913, "rewards/format_reward": 0.0, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 616.7140800476075, "epoch": 0.784, "grad_norm": 4.892991065979004, "kl": 1.6791976928710937, "learning_rate": 4.042456336780838e-07, "loss": 0.1725, "reward": 0.5750000186264514, "reward_std": 0.3162312388420105, "rewards/accuracy_reward": 0.5750000186264514, "rewards/format_reward": 0.0, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 600.1448112487793, "epoch": 0.8, "grad_norm": 11.495079040527344, "kl": 1.700238037109375, "learning_rate": 3.4853288946298335e-07, "loss": 0.1475, "reward": 0.5687500169035047, "reward_std": 0.28788632694631816, "rewards/accuracy_reward": 0.5687500169035047, "rewards/format_reward": 0.0, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 581.4828277587891, "epoch": 0.816, "grad_norm": 2.291238307952881, "kl": 1.24267578125, "learning_rate": 2.9644308677943315e-07, "loss": 0.1563, "reward": 0.607291679084301, "reward_std": 0.30686128605157137, "rewards/accuracy_reward": 0.607291679084301, "rewards/format_reward": 0.0, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 598.8916816711426, "epoch": 0.832, "grad_norm": 1137.7882080078125, "kl": 14.761802673339844, "learning_rate": 2.48140119418046e-07, "loss": 0.5569, "reward": 0.6078125163912773, "reward_std": 0.3036690015345812, "rewards/accuracy_reward": 0.6078125163912773, "rewards/format_reward": 0.0, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 618.4854362487793, "epoch": 0.848, "grad_norm": 1.0645900964736938, "kl": 1.7237060546875, "learning_rate": 2.0377596638451812e-07, "loss": 0.1439, "reward": 0.592187512665987, "reward_std": 0.3175499288365245, "rewards/accuracy_reward": 0.592187512665987, "rewards/format_reward": 0.0, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 598.6474105834961, "epoch": 0.864, "grad_norm": 3.2106242179870605, "kl": 1.365283203125, "learning_rate": 1.634902137174483e-07, "loss": 0.1479, "reward": 0.6187500197440385, "reward_std": 0.2985101878643036, "rewards/accuracy_reward": 0.6187500197440385, "rewards/format_reward": 0.0, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 604.251577758789, "epoch": 0.88, "grad_norm": 4.822841644287109, "kl": 1.6898468017578125, "learning_rate": 1.274096152990203e-07, "loss": 0.1747, "reward": 0.6166666835546494, "reward_std": 0.30709347426891326, "rewards/accuracy_reward": 0.6166666835546494, "rewards/format_reward": 0.0, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 577.0427276611329, "epoch": 0.896, "grad_norm": 3.7293879985809326, "kl": 1.522021484375, "learning_rate": 9.564769404039419e-08, "loss": 0.1367, "reward": 0.6046875163912773, "reward_std": 0.3109951946884394, "rewards/accuracy_reward": 0.6046875163912773, "rewards/format_reward": 0.0, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 596.1869979858399, "epoch": 0.912, "grad_norm": 1.625959873199463, "kl": 1.5546356201171876, "learning_rate": 6.830438469662892e-08, "loss": 0.153, "reward": 0.5796875137835741, "reward_std": 0.2965929379686713, "rewards/accuracy_reward": 0.5796875137835741, "rewards/format_reward": 0.0, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 590.7567916870117, "epoch": 0.928, "grad_norm": 1.6399497985839844, "kl": 1.5036117553710937, "learning_rate": 4.546571943496969e-08, "loss": 0.1512, "reward": 0.6286458489950746, "reward_std": 0.33254577573388816, "rewards/accuracy_reward": 0.6286458489950746, "rewards/format_reward": 0.0, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 603.7135581970215, "epoch": 0.944, "grad_norm": 3.5563366413116455, "kl": 1.557330322265625, "learning_rate": 2.72035571458224e-08, "loss": 0.147, "reward": 0.6046875178813934, "reward_std": 0.32659342624247073, "rewards/accuracy_reward": 0.6046875178813934, "rewards/format_reward": 0.0, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 575.8109519958496, "epoch": 0.96, "grad_norm": 2.608484983444214, "kl": 1.5316009521484375, "learning_rate": 1.357535734809795e-08, "loss": 0.1585, "reward": 0.623437518440187, "reward_std": 0.3020533608272672, "rewards/accuracy_reward": 0.623437518440187, "rewards/format_reward": 0.0, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 591.2427284240723, "epoch": 0.976, "grad_norm": 1.911941647529602, "kl": 1.328106689453125, "learning_rate": 4.623999400308054e-09, "loss": 0.1507, "reward": 0.6088541898876428, "reward_std": 0.30317605994641783, "rewards/accuracy_reward": 0.6088541898876428, "rewards/format_reward": 0.0, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 589.5010604858398, "epoch": 0.992, "grad_norm": 2.221292495727539, "kl": 1.4125, "learning_rate": 3.77647586240204e-10, "loss": 0.1615, "reward": 0.6026041828095913, "reward_std": 0.3082280207425356, "rewards/accuracy_reward": 0.6026041828095913, "rewards/format_reward": 0.0, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 618.7890777587891, "epoch": 0.9984, "kl": 1.5830078125, "reward": 0.617187513038516, "reward_std": 0.27739145373925567, "rewards/accuracy_reward": 0.617187513038516, "rewards/format_reward": 0.0, "step": 312, "total_flos": 0.0, "train_loss": 0.12025522345151657, "train_runtime": 24302.053, "train_samples_per_second": 0.309, "train_steps_per_second": 0.013 } ], "logging_steps": 5, "max_steps": 312, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }