{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.13661868748475237, "eval_steps": 10, "global_step": 560, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00024396194193705782, "grad_norm": 4.489601135253906, "learning_rate": 2.4999420463141455e-07, "loss": 3.306, "step": 1 }, { "epoch": 0.00024396194193705782, "eval_loss": 2.9874014854431152, "eval_runtime": 85.0857, "eval_samples_per_second": 3.009, "eval_steps_per_second": 0.752, "step": 1 }, { "epoch": 0.00048792388387411563, "grad_norm": 4.510775566101074, "learning_rate": 2.4998840671678217e-07, "loss": 2.7398, "step": 2 }, { "epoch": 0.0007318858258111735, "grad_norm": 5.426989555358887, "learning_rate": 2.499826062544247e-07, "loss": 3.3804, "step": 3 }, { "epoch": 0.0009758477677482313, "grad_norm": 3.3621065616607666, "learning_rate": 2.4997680324266246e-07, "loss": 2.8696, "step": 4 }, { "epoch": 0.0012198097096852891, "grad_norm": 4.033163070678711, "learning_rate": 2.499709976798144e-07, "loss": 3.4674, "step": 5 }, { "epoch": 0.001463771651622347, "grad_norm": 3.398115634918213, "learning_rate": 2.4996518956419777e-07, "loss": 3.3783, "step": 6 }, { "epoch": 0.0017077335935594047, "grad_norm": 2.1340525150299072, "learning_rate": 2.499593788941286e-07, "loss": 2.6618, "step": 7 }, { "epoch": 0.0019516955354964625, "grad_norm": 2.8017404079437256, "learning_rate": 2.499535656679212e-07, "loss": 3.1798, "step": 8 }, { "epoch": 0.0021956574774335204, "grad_norm": 2.3321993350982666, "learning_rate": 2.499477498838886e-07, "loss": 3.1456, "step": 9 }, { "epoch": 0.0024396194193705783, "grad_norm": 1.805264949798584, "learning_rate": 2.4994193154034227e-07, "loss": 2.7338, "step": 10 }, { "epoch": 0.0024396194193705783, "eval_loss": 2.8181073665618896, "eval_runtime": 82.6944, "eval_samples_per_second": 3.096, "eval_steps_per_second": 0.774, "step": 10 }, { "epoch": 0.002683581361307636, "grad_norm": 2.295260190963745, "learning_rate": 2.499361106355922e-07, "loss": 2.876, "step": 11 }, { "epoch": 0.002927543303244694, "grad_norm": 1.9248020648956299, "learning_rate": 2.499302871679468e-07, "loss": 3.0328, "step": 12 }, { "epoch": 0.0031715052451817514, "grad_norm": 1.5429915189743042, "learning_rate": 2.4992446113571303e-07, "loss": 2.7445, "step": 13 }, { "epoch": 0.0034154671871188093, "grad_norm": 1.659693717956543, "learning_rate": 2.4991863253719657e-07, "loss": 2.9829, "step": 14 }, { "epoch": 0.003659429129055867, "grad_norm": 1.6439993381500244, "learning_rate": 2.4991280137070126e-07, "loss": 2.7864, "step": 15 }, { "epoch": 0.003903391070992925, "grad_norm": 1.731806993484497, "learning_rate": 2.499069676345297e-07, "loss": 2.9397, "step": 16 }, { "epoch": 0.004147353012929983, "grad_norm": 1.397567629814148, "learning_rate": 2.499011313269829e-07, "loss": 2.7251, "step": 17 }, { "epoch": 0.004391314954867041, "grad_norm": 1.5513560771942139, "learning_rate": 2.498952924463603e-07, "loss": 2.9325, "step": 18 }, { "epoch": 0.004635276896804099, "grad_norm": 1.7622836828231812, "learning_rate": 2.498894509909601e-07, "loss": 2.7478, "step": 19 }, { "epoch": 0.0048792388387411565, "grad_norm": 1.4812703132629395, "learning_rate": 2.4988360695907864e-07, "loss": 2.7757, "step": 20 }, { "epoch": 0.0048792388387411565, "eval_loss": 2.6998705863952637, "eval_runtime": 82.4721, "eval_samples_per_second": 3.104, "eval_steps_per_second": 0.776, "step": 20 }, { "epoch": 0.005123200780678214, "grad_norm": 1.8131576776504517, "learning_rate": 2.49877760349011e-07, "loss": 2.5977, "step": 21 }, { "epoch": 0.005367162722615272, "grad_norm": 1.084616780281067, "learning_rate": 2.498719111590508e-07, "loss": 2.3549, "step": 22 }, { "epoch": 0.00561112466455233, "grad_norm": 1.909567952156067, "learning_rate": 2.498660593874899e-07, "loss": 3.0033, "step": 23 }, { "epoch": 0.005855086606489388, "grad_norm": 1.3913578987121582, "learning_rate": 2.4986020503261886e-07, "loss": 2.6655, "step": 24 }, { "epoch": 0.006099048548426446, "grad_norm": 1.499211311340332, "learning_rate": 2.498543480927266e-07, "loss": 2.7763, "step": 25 }, { "epoch": 0.006343010490363503, "grad_norm": 1.3160173892974854, "learning_rate": 2.4984848856610065e-07, "loss": 2.7146, "step": 26 }, { "epoch": 0.006586972432300561, "grad_norm": 1.4656383991241455, "learning_rate": 2.4984262645102706e-07, "loss": 2.8071, "step": 27 }, { "epoch": 0.006830934374237619, "grad_norm": 1.5149258375167847, "learning_rate": 2.4983676174579014e-07, "loss": 2.8613, "step": 28 }, { "epoch": 0.0070748963161746765, "grad_norm": 1.3552800416946411, "learning_rate": 2.498308944486729e-07, "loss": 2.6073, "step": 29 }, { "epoch": 0.007318858258111734, "grad_norm": 1.8789068460464478, "learning_rate": 2.4982502455795676e-07, "loss": 2.8036, "step": 30 }, { "epoch": 0.007318858258111734, "eval_loss": 2.5998635292053223, "eval_runtime": 82.4021, "eval_samples_per_second": 3.107, "eval_steps_per_second": 0.777, "step": 30 }, { "epoch": 0.007562820200048792, "grad_norm": 1.5838264226913452, "learning_rate": 2.498191520719216e-07, "loss": 2.7908, "step": 31 }, { "epoch": 0.00780678214198585, "grad_norm": 1.575810194015503, "learning_rate": 2.4981327698884575e-07, "loss": 2.5728, "step": 32 }, { "epoch": 0.008050744083922909, "grad_norm": 1.1017578840255737, "learning_rate": 2.498073993070061e-07, "loss": 2.519, "step": 33 }, { "epoch": 0.008294706025859966, "grad_norm": 1.5795230865478516, "learning_rate": 2.49801519024678e-07, "loss": 2.8713, "step": 34 }, { "epoch": 0.008538667967797023, "grad_norm": 1.3720916509628296, "learning_rate": 2.497956361401352e-07, "loss": 2.6911, "step": 35 }, { "epoch": 0.008782629909734082, "grad_norm": 1.3356428146362305, "learning_rate": 2.4978975065165004e-07, "loss": 2.5879, "step": 36 }, { "epoch": 0.009026591851671139, "grad_norm": 2.031726121902466, "learning_rate": 2.497838625574932e-07, "loss": 2.9549, "step": 37 }, { "epoch": 0.009270553793608197, "grad_norm": 1.4513427019119263, "learning_rate": 2.497779718559339e-07, "loss": 2.8033, "step": 38 }, { "epoch": 0.009514515735545254, "grad_norm": 1.4715417623519897, "learning_rate": 2.497720785452398e-07, "loss": 2.5233, "step": 39 }, { "epoch": 0.009758477677482313, "grad_norm": 1.3367327451705933, "learning_rate": 2.497661826236771e-07, "loss": 2.6558, "step": 40 }, { "epoch": 0.009758477677482313, "eval_loss": 2.5077946186065674, "eval_runtime": 82.473, "eval_samples_per_second": 3.104, "eval_steps_per_second": 0.776, "step": 40 }, { "epoch": 0.01000243961941937, "grad_norm": 1.8079203367233276, "learning_rate": 2.497602840895103e-07, "loss": 2.7062, "step": 41 }, { "epoch": 0.010246401561356429, "grad_norm": 1.297031283378601, "learning_rate": 2.4975438294100266e-07, "loss": 2.4938, "step": 42 }, { "epoch": 0.010490363503293486, "grad_norm": 2.0549025535583496, "learning_rate": 2.497484791764155e-07, "loss": 2.8457, "step": 43 }, { "epoch": 0.010734325445230545, "grad_norm": 2.118145227432251, "learning_rate": 2.4974257279400897e-07, "loss": 2.6677, "step": 44 }, { "epoch": 0.010978287387167602, "grad_norm": 1.6570909023284912, "learning_rate": 2.497366637920414e-07, "loss": 2.7371, "step": 45 }, { "epoch": 0.01122224932910466, "grad_norm": 2.100497007369995, "learning_rate": 2.497307521687697e-07, "loss": 2.7241, "step": 46 }, { "epoch": 0.011466211271041717, "grad_norm": 1.4280970096588135, "learning_rate": 2.497248379224492e-07, "loss": 2.53, "step": 47 }, { "epoch": 0.011710173212978776, "grad_norm": 1.6932001113891602, "learning_rate": 2.497189210513339e-07, "loss": 2.815, "step": 48 }, { "epoch": 0.011954135154915833, "grad_norm": 1.8314259052276611, "learning_rate": 2.497130015536758e-07, "loss": 2.7558, "step": 49 }, { "epoch": 0.012198097096852892, "grad_norm": 1.3970531225204468, "learning_rate": 2.497070794277257e-07, "loss": 2.5018, "step": 50 }, { "epoch": 0.012198097096852892, "eval_loss": 2.428684949874878, "eval_runtime": 82.4898, "eval_samples_per_second": 3.103, "eval_steps_per_second": 0.776, "step": 50 }, { "epoch": 0.012442059038789949, "grad_norm": 2.6512253284454346, "learning_rate": 2.497011546717327e-07, "loss": 2.6264, "step": 51 }, { "epoch": 0.012686020980727006, "grad_norm": 1.2588273286819458, "learning_rate": 2.496952272839445e-07, "loss": 2.3415, "step": 52 }, { "epoch": 0.012929982922664065, "grad_norm": 1.6063730716705322, "learning_rate": 2.4968929726260705e-07, "loss": 2.4857, "step": 53 }, { "epoch": 0.013173944864601122, "grad_norm": 1.344925045967102, "learning_rate": 2.4968336460596485e-07, "loss": 2.4742, "step": 54 }, { "epoch": 0.01341790680653818, "grad_norm": 1.141435146331787, "learning_rate": 2.4967742931226075e-07, "loss": 2.2668, "step": 55 }, { "epoch": 0.013661868748475237, "grad_norm": 1.4117755889892578, "learning_rate": 2.4967149137973625e-07, "loss": 2.4959, "step": 56 }, { "epoch": 0.013905830690412296, "grad_norm": 1.292641520500183, "learning_rate": 2.496655508066309e-07, "loss": 2.3535, "step": 57 }, { "epoch": 0.014149792632349353, "grad_norm": 1.3126784563064575, "learning_rate": 2.4965960759118313e-07, "loss": 2.3842, "step": 58 }, { "epoch": 0.014393754574286412, "grad_norm": 1.4474728107452393, "learning_rate": 2.4965366173162953e-07, "loss": 2.5879, "step": 59 }, { "epoch": 0.014637716516223469, "grad_norm": 1.4832170009613037, "learning_rate": 2.4964771322620516e-07, "loss": 2.5618, "step": 60 }, { "epoch": 0.014637716516223469, "eval_loss": 2.370858907699585, "eval_runtime": 82.5008, "eval_samples_per_second": 3.103, "eval_steps_per_second": 0.776, "step": 60 }, { "epoch": 0.014881678458160527, "grad_norm": 3.201457977294922, "learning_rate": 2.4964176207314356e-07, "loss": 2.4885, "step": 61 }, { "epoch": 0.015125640400097584, "grad_norm": 1.71616530418396, "learning_rate": 2.496358082706767e-07, "loss": 2.5725, "step": 62 }, { "epoch": 0.015369602342034643, "grad_norm": 1.8521833419799805, "learning_rate": 2.4962985181703483e-07, "loss": 2.4873, "step": 63 }, { "epoch": 0.0156135642839717, "grad_norm": 1.3216487169265747, "learning_rate": 2.496238927104469e-07, "loss": 2.4606, "step": 64 }, { "epoch": 0.015857526225908757, "grad_norm": 1.3042290210723877, "learning_rate": 2.4961793094913995e-07, "loss": 2.5374, "step": 65 }, { "epoch": 0.016101488167845818, "grad_norm": 1.4314343929290771, "learning_rate": 2.4961196653133975e-07, "loss": 2.5357, "step": 66 }, { "epoch": 0.016345450109782875, "grad_norm": 1.6834615468978882, "learning_rate": 2.4960599945527027e-07, "loss": 2.2217, "step": 67 }, { "epoch": 0.01658941205171993, "grad_norm": 1.1532320976257324, "learning_rate": 2.49600029719154e-07, "loss": 2.4338, "step": 68 }, { "epoch": 0.01683337399365699, "grad_norm": 1.009998083114624, "learning_rate": 2.495940573212118e-07, "loss": 2.18, "step": 69 }, { "epoch": 0.017077335935594046, "grad_norm": 1.4564158916473389, "learning_rate": 2.4958808225966306e-07, "loss": 2.3481, "step": 70 }, { "epoch": 0.017077335935594046, "eval_loss": 2.3252947330474854, "eval_runtime": 82.5205, "eval_samples_per_second": 3.102, "eval_steps_per_second": 0.776, "step": 70 }, { "epoch": 0.017321297877531106, "grad_norm": 1.5508018732070923, "learning_rate": 2.4958210453272533e-07, "loss": 2.5017, "step": 71 }, { "epoch": 0.017565259819468163, "grad_norm": 1.2118752002716064, "learning_rate": 2.4957612413861483e-07, "loss": 2.5198, "step": 72 }, { "epoch": 0.01780922176140522, "grad_norm": 1.1424647569656372, "learning_rate": 2.4957014107554603e-07, "loss": 2.4275, "step": 73 }, { "epoch": 0.018053183703342277, "grad_norm": 1.1591920852661133, "learning_rate": 2.4956415534173195e-07, "loss": 2.2197, "step": 74 }, { "epoch": 0.018297145645279338, "grad_norm": 1.1998584270477295, "learning_rate": 2.495581669353838e-07, "loss": 2.2739, "step": 75 }, { "epoch": 0.018541107587216395, "grad_norm": 1.0552688837051392, "learning_rate": 2.4955217585471147e-07, "loss": 2.4085, "step": 76 }, { "epoch": 0.01878506952915345, "grad_norm": 1.0630302429199219, "learning_rate": 2.495461820979229e-07, "loss": 2.3622, "step": 77 }, { "epoch": 0.01902903147109051, "grad_norm": 1.6193199157714844, "learning_rate": 2.4954018566322477e-07, "loss": 2.4233, "step": 78 }, { "epoch": 0.01927299341302757, "grad_norm": 1.0845212936401367, "learning_rate": 2.4953418654882195e-07, "loss": 2.2942, "step": 79 }, { "epoch": 0.019516955354964626, "grad_norm": 0.9362667202949524, "learning_rate": 2.495281847529178e-07, "loss": 2.3475, "step": 80 }, { "epoch": 0.019516955354964626, "eval_loss": 2.2899718284606934, "eval_runtime": 82.7023, "eval_samples_per_second": 3.095, "eval_steps_per_second": 0.774, "step": 80 }, { "epoch": 0.019760917296901683, "grad_norm": 1.1640156507492065, "learning_rate": 2.4952218027371403e-07, "loss": 2.4911, "step": 81 }, { "epoch": 0.02000487923883874, "grad_norm": 1.158489465713501, "learning_rate": 2.495161731094107e-07, "loss": 2.3697, "step": 82 }, { "epoch": 0.0202488411807758, "grad_norm": 1.056389570236206, "learning_rate": 2.4951016325820637e-07, "loss": 2.3726, "step": 83 }, { "epoch": 0.020492803122712858, "grad_norm": 1.1232126951217651, "learning_rate": 2.4950415071829794e-07, "loss": 2.3631, "step": 84 }, { "epoch": 0.020736765064649915, "grad_norm": 1.4430733919143677, "learning_rate": 2.4949813548788067e-07, "loss": 2.4389, "step": 85 }, { "epoch": 0.02098072700658697, "grad_norm": 1.4792566299438477, "learning_rate": 2.4949211756514816e-07, "loss": 2.5851, "step": 86 }, { "epoch": 0.02122468894852403, "grad_norm": 0.8782404661178589, "learning_rate": 2.494860969482926e-07, "loss": 2.3258, "step": 87 }, { "epoch": 0.02146865089046109, "grad_norm": 0.9481968879699707, "learning_rate": 2.4948007363550424e-07, "loss": 2.3977, "step": 88 }, { "epoch": 0.021712612832398146, "grad_norm": 1.0738717317581177, "learning_rate": 2.4947404762497197e-07, "loss": 2.0767, "step": 89 }, { "epoch": 0.021956574774335203, "grad_norm": 1.3180803060531616, "learning_rate": 2.49468018914883e-07, "loss": 2.4085, "step": 90 }, { "epoch": 0.021956574774335203, "eval_loss": 2.2610299587249756, "eval_runtime": 82.4227, "eval_samples_per_second": 3.106, "eval_steps_per_second": 0.776, "step": 90 }, { "epoch": 0.02220053671627226, "grad_norm": 0.9324449896812439, "learning_rate": 2.4946198750342283e-07, "loss": 2.3142, "step": 91 }, { "epoch": 0.02244449865820932, "grad_norm": 1.5807453393936157, "learning_rate": 2.4945595338877547e-07, "loss": 2.3756, "step": 92 }, { "epoch": 0.022688460600146378, "grad_norm": 1.279068112373352, "learning_rate": 2.494499165691231e-07, "loss": 2.2482, "step": 93 }, { "epoch": 0.022932422542083435, "grad_norm": 1.6906729936599731, "learning_rate": 2.4944387704264644e-07, "loss": 2.3038, "step": 94 }, { "epoch": 0.02317638448402049, "grad_norm": 1.2444514036178589, "learning_rate": 2.494378348075246e-07, "loss": 2.1946, "step": 95 }, { "epoch": 0.023420346425957552, "grad_norm": 0.9085439443588257, "learning_rate": 2.494317898619349e-07, "loss": 2.0829, "step": 96 }, { "epoch": 0.02366430836789461, "grad_norm": 1.0624847412109375, "learning_rate": 2.4942574220405314e-07, "loss": 2.2917, "step": 97 }, { "epoch": 0.023908270309831666, "grad_norm": 0.9223533868789673, "learning_rate": 2.4941969183205344e-07, "loss": 2.4056, "step": 98 }, { "epoch": 0.024152232251768723, "grad_norm": 4.390754699707031, "learning_rate": 2.494136387441083e-07, "loss": 2.4544, "step": 99 }, { "epoch": 0.024396194193705784, "grad_norm": 0.999297559261322, "learning_rate": 2.494075829383886e-07, "loss": 2.2129, "step": 100 }, { "epoch": 0.024396194193705784, "eval_loss": 2.2379603385925293, "eval_runtime": 82.4809, "eval_samples_per_second": 3.104, "eval_steps_per_second": 0.776, "step": 100 }, { "epoch": 0.02464015613564284, "grad_norm": 0.9902031421661377, "learning_rate": 2.494015244130635e-07, "loss": 2.0829, "step": 101 }, { "epoch": 0.024884118077579898, "grad_norm": 1.0847697257995605, "learning_rate": 2.493954631663007e-07, "loss": 2.2557, "step": 102 }, { "epoch": 0.025128080019516955, "grad_norm": 0.8790014982223511, "learning_rate": 2.493893991962659e-07, "loss": 2.2532, "step": 103 }, { "epoch": 0.02537204196145401, "grad_norm": 0.8715433478355408, "learning_rate": 2.493833325011235e-07, "loss": 2.348, "step": 104 }, { "epoch": 0.025616003903391072, "grad_norm": 0.9393193125724792, "learning_rate": 2.4937726307903606e-07, "loss": 2.3185, "step": 105 }, { "epoch": 0.02585996584532813, "grad_norm": 1.0732641220092773, "learning_rate": 2.493711909281646e-07, "loss": 2.3618, "step": 106 }, { "epoch": 0.026103927787265186, "grad_norm": 1.067499041557312, "learning_rate": 2.493651160466685e-07, "loss": 2.446, "step": 107 }, { "epoch": 0.026347889729202243, "grad_norm": 1.0830148458480835, "learning_rate": 2.493590384327053e-07, "loss": 2.4026, "step": 108 }, { "epoch": 0.026591851671139304, "grad_norm": 1.4239816665649414, "learning_rate": 2.49352958084431e-07, "loss": 2.2756, "step": 109 }, { "epoch": 0.02683581361307636, "grad_norm": 0.7910580635070801, "learning_rate": 2.49346875e-07, "loss": 2.1831, "step": 110 }, { "epoch": 0.02683581361307636, "eval_loss": 2.2186379432678223, "eval_runtime": 82.443, "eval_samples_per_second": 3.105, "eval_steps_per_second": 0.776, "step": 110 }, { "epoch": 0.027079775555013418, "grad_norm": 1.1051030158996582, "learning_rate": 2.49340789177565e-07, "loss": 2.2737, "step": 111 }, { "epoch": 0.027323737496950475, "grad_norm": 1.0497264862060547, "learning_rate": 2.4933470061527687e-07, "loss": 2.2902, "step": 112 }, { "epoch": 0.027567699438887535, "grad_norm": 0.87137770652771, "learning_rate": 2.493286093112851e-07, "loss": 2.1482, "step": 113 }, { "epoch": 0.027811661380824592, "grad_norm": 1.1766656637191772, "learning_rate": 2.493225152637374e-07, "loss": 2.353, "step": 114 }, { "epoch": 0.02805562332276165, "grad_norm": 0.9225324988365173, "learning_rate": 2.4931641847077963e-07, "loss": 2.1774, "step": 115 }, { "epoch": 0.028299585264698706, "grad_norm": 0.8458165526390076, "learning_rate": 2.493103189305562e-07, "loss": 2.112, "step": 116 }, { "epoch": 0.028543547206635766, "grad_norm": 0.975180983543396, "learning_rate": 2.493042166412099e-07, "loss": 2.331, "step": 117 }, { "epoch": 0.028787509148572824, "grad_norm": 0.879942774772644, "learning_rate": 2.492981116008816e-07, "loss": 2.2939, "step": 118 }, { "epoch": 0.02903147109050988, "grad_norm": 1.5170252323150635, "learning_rate": 2.492920038077106e-07, "loss": 2.435, "step": 119 }, { "epoch": 0.029275433032446938, "grad_norm": 1.0312517881393433, "learning_rate": 2.492858932598346e-07, "loss": 2.2375, "step": 120 }, { "epoch": 0.029275433032446938, "eval_loss": 2.201087474822998, "eval_runtime": 82.5159, "eval_samples_per_second": 3.102, "eval_steps_per_second": 0.776, "step": 120 }, { "epoch": 0.029519394974383995, "grad_norm": 0.999895453453064, "learning_rate": 2.4927977995538954e-07, "loss": 2.2366, "step": 121 }, { "epoch": 0.029763356916321055, "grad_norm": 0.75639408826828, "learning_rate": 2.4927366389250973e-07, "loss": 2.1457, "step": 122 }, { "epoch": 0.030007318858258112, "grad_norm": 1.004939079284668, "learning_rate": 2.4926754506932774e-07, "loss": 2.2746, "step": 123 }, { "epoch": 0.03025128080019517, "grad_norm": 0.9373717308044434, "learning_rate": 2.4926142348397453e-07, "loss": 2.2899, "step": 124 }, { "epoch": 0.030495242742132226, "grad_norm": 22.97978401184082, "learning_rate": 2.492552991345792e-07, "loss": 2.3833, "step": 125 }, { "epoch": 0.030739204684069286, "grad_norm": 1.2713708877563477, "learning_rate": 2.4924917201926936e-07, "loss": 2.3817, "step": 126 }, { "epoch": 0.030983166626006343, "grad_norm": 1.0373756885528564, "learning_rate": 2.492430421361708e-07, "loss": 2.0911, "step": 127 }, { "epoch": 0.0312271285679434, "grad_norm": 1.1061681509017944, "learning_rate": 2.4923690948340783e-07, "loss": 2.2405, "step": 128 }, { "epoch": 0.03147109050988046, "grad_norm": 0.8704845309257507, "learning_rate": 2.4923077405910264e-07, "loss": 2.3703, "step": 129 }, { "epoch": 0.031715052451817514, "grad_norm": 0.8377108573913574, "learning_rate": 2.4922463586137616e-07, "loss": 2.1756, "step": 130 }, { "epoch": 0.031715052451817514, "eval_loss": 2.1864898204803467, "eval_runtime": 82.5766, "eval_samples_per_second": 3.1, "eval_steps_per_second": 0.775, "step": 130 }, { "epoch": 0.03195901439375457, "grad_norm": 0.9043044447898865, "learning_rate": 2.4921849488834745e-07, "loss": 2.3177, "step": 131 }, { "epoch": 0.032202976335691635, "grad_norm": 0.9630913138389587, "learning_rate": 2.4921235113813376e-07, "loss": 2.3257, "step": 132 }, { "epoch": 0.03244693827762869, "grad_norm": 0.9598456025123596, "learning_rate": 2.492062046088508e-07, "loss": 2.1792, "step": 133 }, { "epoch": 0.03269090021956575, "grad_norm": 0.9462944865226746, "learning_rate": 2.4920005529861254e-07, "loss": 2.1268, "step": 134 }, { "epoch": 0.032934862161502806, "grad_norm": 0.9108706116676331, "learning_rate": 2.491939032055311e-07, "loss": 2.2696, "step": 135 }, { "epoch": 0.03317882410343986, "grad_norm": 1.834842324256897, "learning_rate": 2.491877483277171e-07, "loss": 2.3871, "step": 136 }, { "epoch": 0.03342278604537692, "grad_norm": 0.9213249087333679, "learning_rate": 2.4918159066327943e-07, "loss": 2.1749, "step": 137 }, { "epoch": 0.03366674798731398, "grad_norm": 0.8780300617218018, "learning_rate": 2.49175430210325e-07, "loss": 2.2413, "step": 138 }, { "epoch": 0.033910709929251034, "grad_norm": 118.5926284790039, "learning_rate": 2.491692669669594e-07, "loss": 2.2684, "step": 139 }, { "epoch": 0.03415467187118809, "grad_norm": 0.8691902160644531, "learning_rate": 2.4916310093128616e-07, "loss": 2.1863, "step": 140 }, { "epoch": 0.03415467187118809, "eval_loss": 2.174090623855591, "eval_runtime": 82.5703, "eval_samples_per_second": 3.1, "eval_steps_per_second": 0.775, "step": 140 }, { "epoch": 0.034398633813125155, "grad_norm": 0.8839126825332642, "learning_rate": 2.491569321014073e-07, "loss": 2.2502, "step": 141 }, { "epoch": 0.03464259575506221, "grad_norm": 1.0043957233428955, "learning_rate": 2.49150760475423e-07, "loss": 2.2624, "step": 142 }, { "epoch": 0.03488655769699927, "grad_norm": 0.7548794746398926, "learning_rate": 2.4914458605143187e-07, "loss": 2.2126, "step": 143 }, { "epoch": 0.035130519638936326, "grad_norm": 1.0142180919647217, "learning_rate": 2.491384088275306e-07, "loss": 2.1853, "step": 144 }, { "epoch": 0.03537448158087338, "grad_norm": 0.9584774971008301, "learning_rate": 2.491322288018143e-07, "loss": 2.0712, "step": 145 }, { "epoch": 0.03561844352281044, "grad_norm": 1.2019151449203491, "learning_rate": 2.4912604597237626e-07, "loss": 2.1924, "step": 146 }, { "epoch": 0.0358624054647475, "grad_norm": 0.8215965628623962, "learning_rate": 2.4911986033730807e-07, "loss": 2.1359, "step": 147 }, { "epoch": 0.036106367406684554, "grad_norm": 0.8709486722946167, "learning_rate": 2.491136718946997e-07, "loss": 2.4015, "step": 148 }, { "epoch": 0.03635032934862162, "grad_norm": 0.7629618644714355, "learning_rate": 2.4910748064263914e-07, "loss": 2.2006, "step": 149 }, { "epoch": 0.036594291290558675, "grad_norm": 1.1265887022018433, "learning_rate": 2.491012865792129e-07, "loss": 2.2315, "step": 150 }, { "epoch": 0.036594291290558675, "eval_loss": 2.162862539291382, "eval_runtime": 82.4917, "eval_samples_per_second": 3.103, "eval_steps_per_second": 0.776, "step": 150 }, { "epoch": 0.03683825323249573, "grad_norm": 0.8692772388458252, "learning_rate": 2.490950897025056e-07, "loss": 2.1676, "step": 151 }, { "epoch": 0.03708221517443279, "grad_norm": 0.865197479724884, "learning_rate": 2.4908889001060015e-07, "loss": 2.2783, "step": 152 }, { "epoch": 0.037326177116369846, "grad_norm": 0.922754168510437, "learning_rate": 2.490826875015777e-07, "loss": 2.3296, "step": 153 }, { "epoch": 0.0375701390583069, "grad_norm": 0.9516766667366028, "learning_rate": 2.490764821735178e-07, "loss": 2.3431, "step": 154 }, { "epoch": 0.03781410100024396, "grad_norm": 0.9996930360794067, "learning_rate": 2.4907027402449803e-07, "loss": 2.2365, "step": 155 }, { "epoch": 0.03805806294218102, "grad_norm": 0.7445939779281616, "learning_rate": 2.4906406305259434e-07, "loss": 2.1925, "step": 156 }, { "epoch": 0.038302024884118074, "grad_norm": 0.8426290154457092, "learning_rate": 2.4905784925588094e-07, "loss": 2.2349, "step": 157 }, { "epoch": 0.03854598682605514, "grad_norm": 1.0926883220672607, "learning_rate": 2.4905163263243023e-07, "loss": 2.2426, "step": 158 }, { "epoch": 0.038789948767992195, "grad_norm": 1.0484980344772339, "learning_rate": 2.4904541318031294e-07, "loss": 2.1593, "step": 159 }, { "epoch": 0.03903391070992925, "grad_norm": 0.8670967817306519, "learning_rate": 2.49039190897598e-07, "loss": 2.2036, "step": 160 }, { "epoch": 0.03903391070992925, "eval_loss": 2.152311086654663, "eval_runtime": 82.7226, "eval_samples_per_second": 3.095, "eval_steps_per_second": 0.774, "step": 160 }, { "epoch": 0.03927787265186631, "grad_norm": 0.8905733227729797, "learning_rate": 2.490329657823525e-07, "loss": 2.0864, "step": 161 }, { "epoch": 0.039521834593803366, "grad_norm": 0.8731813430786133, "learning_rate": 2.490267378326419e-07, "loss": 2.218, "step": 162 }, { "epoch": 0.03976579653574042, "grad_norm": 0.7908375263214111, "learning_rate": 2.490205070465299e-07, "loss": 2.1147, "step": 163 }, { "epoch": 0.04000975847767748, "grad_norm": 0.9735328555107117, "learning_rate": 2.4901427342207823e-07, "loss": 2.1494, "step": 164 }, { "epoch": 0.04025372041961454, "grad_norm": 0.7927246689796448, "learning_rate": 2.490080369573472e-07, "loss": 2.1533, "step": 165 }, { "epoch": 0.0404976823615516, "grad_norm": 0.8604568839073181, "learning_rate": 2.4900179765039496e-07, "loss": 2.1147, "step": 166 }, { "epoch": 0.04074164430348866, "grad_norm": 1.905282735824585, "learning_rate": 2.489955554992782e-07, "loss": 2.1228, "step": 167 }, { "epoch": 0.040985606245425715, "grad_norm": 0.8245707154273987, "learning_rate": 2.489893105020518e-07, "loss": 2.0648, "step": 168 }, { "epoch": 0.04122956818736277, "grad_norm": 0.9537479281425476, "learning_rate": 2.489830626567686e-07, "loss": 2.2827, "step": 169 }, { "epoch": 0.04147353012929983, "grad_norm": 0.8153314590454102, "learning_rate": 2.4897681196148e-07, "loss": 2.1892, "step": 170 }, { "epoch": 0.04147353012929983, "eval_loss": 2.14363694190979, "eval_runtime": 84.0031, "eval_samples_per_second": 3.048, "eval_steps_per_second": 0.762, "step": 170 }, { "epoch": 0.041717492071236886, "grad_norm": 0.9862874150276184, "learning_rate": 2.4897055841423537e-07, "loss": 2.1974, "step": 171 }, { "epoch": 0.04196145401317394, "grad_norm": 0.7293988466262817, "learning_rate": 2.489643020130825e-07, "loss": 2.1471, "step": 172 }, { "epoch": 0.042205415955111, "grad_norm": 0.981706976890564, "learning_rate": 2.4895804275606724e-07, "loss": 2.1939, "step": 173 }, { "epoch": 0.04244937789704806, "grad_norm": 0.7976970076560974, "learning_rate": 2.489517806412337e-07, "loss": 2.3129, "step": 174 }, { "epoch": 0.04269333983898512, "grad_norm": 1.0813937187194824, "learning_rate": 2.4894551566662435e-07, "loss": 2.4131, "step": 175 }, { "epoch": 0.04293730178092218, "grad_norm": 0.7610160708427429, "learning_rate": 2.4893924783027967e-07, "loss": 2.2202, "step": 176 }, { "epoch": 0.043181263722859235, "grad_norm": 0.8845246434211731, "learning_rate": 2.4893297713023835e-07, "loss": 2.0994, "step": 177 }, { "epoch": 0.04342522566479629, "grad_norm": 0.8146999478340149, "learning_rate": 2.4892670356453745e-07, "loss": 2.2112, "step": 178 }, { "epoch": 0.04366918760673335, "grad_norm": 53.56632995605469, "learning_rate": 2.4892042713121207e-07, "loss": 2.2625, "step": 179 }, { "epoch": 0.043913149548670406, "grad_norm": 0.8157463073730469, "learning_rate": 2.4891414782829566e-07, "loss": 2.206, "step": 180 }, { "epoch": 0.043913149548670406, "eval_loss": 2.1358835697174072, "eval_runtime": 84.0237, "eval_samples_per_second": 3.047, "eval_steps_per_second": 0.762, "step": 180 }, { "epoch": 0.04415711149060746, "grad_norm": 0.776892364025116, "learning_rate": 2.4890786565381976e-07, "loss": 2.1461, "step": 181 }, { "epoch": 0.04440107343254452, "grad_norm": 1.0113416910171509, "learning_rate": 2.489015806058142e-07, "loss": 2.2795, "step": 182 }, { "epoch": 0.044645035374481584, "grad_norm": 1.4400883913040161, "learning_rate": 2.4889529268230683e-07, "loss": 2.252, "step": 183 }, { "epoch": 0.04488899731641864, "grad_norm": 1.5715802907943726, "learning_rate": 2.4888900188132405e-07, "loss": 2.254, "step": 184 }, { "epoch": 0.0451329592583557, "grad_norm": 0.8854827284812927, "learning_rate": 2.4888270820089003e-07, "loss": 2.2534, "step": 185 }, { "epoch": 0.045376921200292755, "grad_norm": 0.8536761403083801, "learning_rate": 2.488764116390274e-07, "loss": 2.243, "step": 186 }, { "epoch": 0.04562088314222981, "grad_norm": 1.156900405883789, "learning_rate": 2.488701121937568e-07, "loss": 2.2924, "step": 187 }, { "epoch": 0.04586484508416687, "grad_norm": 0.9670676589012146, "learning_rate": 2.488638098630973e-07, "loss": 2.1767, "step": 188 }, { "epoch": 0.046108807026103926, "grad_norm": 0.6777328848838806, "learning_rate": 2.4885750464506606e-07, "loss": 2.2506, "step": 189 }, { "epoch": 0.04635276896804098, "grad_norm": 1.1340001821517944, "learning_rate": 2.488511965376782e-07, "loss": 2.2462, "step": 190 }, { "epoch": 0.04635276896804098, "eval_loss": 2.1293463706970215, "eval_runtime": 82.7369, "eval_samples_per_second": 3.094, "eval_steps_per_second": 0.774, "step": 190 }, { "epoch": 0.04659673090997804, "grad_norm": 0.9202519655227661, "learning_rate": 2.488448855389473e-07, "loss": 2.264, "step": 191 }, { "epoch": 0.046840692851915104, "grad_norm": 0.7520004510879517, "learning_rate": 2.48838571646885e-07, "loss": 2.184, "step": 192 }, { "epoch": 0.04708465479385216, "grad_norm": 0.9756651520729065, "learning_rate": 2.488322548595012e-07, "loss": 2.2218, "step": 193 }, { "epoch": 0.04732861673578922, "grad_norm": 1.1608117818832397, "learning_rate": 2.488259351748038e-07, "loss": 2.3409, "step": 194 }, { "epoch": 0.047572578677726275, "grad_norm": 0.9417536854743958, "learning_rate": 2.48819612590799e-07, "loss": 2.2483, "step": 195 }, { "epoch": 0.04781654061966333, "grad_norm": 0.8184861540794373, "learning_rate": 2.4881328710549126e-07, "loss": 2.1357, "step": 196 }, { "epoch": 0.04806050256160039, "grad_norm": 0.8574642539024353, "learning_rate": 2.48806958716883e-07, "loss": 2.2997, "step": 197 }, { "epoch": 0.048304464503537446, "grad_norm": 0.762095034122467, "learning_rate": 2.488006274229749e-07, "loss": 2.1704, "step": 198 }, { "epoch": 0.0485484264454745, "grad_norm": 0.876278281211853, "learning_rate": 2.4879429322176583e-07, "loss": 2.1739, "step": 199 }, { "epoch": 0.04879238838741157, "grad_norm": 0.7141769528388977, "learning_rate": 2.4878795611125284e-07, "loss": 2.1301, "step": 200 }, { "epoch": 0.04879238838741157, "eval_loss": 2.1227312088012695, "eval_runtime": 82.5491, "eval_samples_per_second": 3.101, "eval_steps_per_second": 0.775, "step": 200 }, { "epoch": 0.049036350329348624, "grad_norm": 0.793533444404602, "learning_rate": 2.487816160894311e-07, "loss": 2.105, "step": 201 }, { "epoch": 0.04928031227128568, "grad_norm": 1.4064488410949707, "learning_rate": 2.4877527315429387e-07, "loss": 2.2669, "step": 202 }, { "epoch": 0.04952427421322274, "grad_norm": 0.7336916923522949, "learning_rate": 2.4876892730383267e-07, "loss": 2.2367, "step": 203 }, { "epoch": 0.049768236155159795, "grad_norm": 5.0201287269592285, "learning_rate": 2.4876257853603717e-07, "loss": 2.3826, "step": 204 }, { "epoch": 0.05001219809709685, "grad_norm": 1.0578281879425049, "learning_rate": 2.4875622684889513e-07, "loss": 2.178, "step": 205 }, { "epoch": 0.05025616003903391, "grad_norm": 0.7075783610343933, "learning_rate": 2.4874987224039246e-07, "loss": 2.2244, "step": 206 }, { "epoch": 0.050500121980970966, "grad_norm": 1.0379695892333984, "learning_rate": 2.4874351470851334e-07, "loss": 2.2114, "step": 207 }, { "epoch": 0.05074408392290802, "grad_norm": 0.8745626211166382, "learning_rate": 2.4873715425123986e-07, "loss": 2.2984, "step": 208 }, { "epoch": 0.05098804586484509, "grad_norm": 0.7211839556694031, "learning_rate": 2.4873079086655244e-07, "loss": 2.2514, "step": 209 }, { "epoch": 0.051232007806782144, "grad_norm": 1.0270675420761108, "learning_rate": 2.487244245524296e-07, "loss": 2.0273, "step": 210 }, { "epoch": 0.051232007806782144, "eval_loss": 2.1163525581359863, "eval_runtime": 82.5285, "eval_samples_per_second": 3.102, "eval_steps_per_second": 0.775, "step": 210 }, { "epoch": 0.0514759697487192, "grad_norm": 0.7041739225387573, "learning_rate": 2.487180553068481e-07, "loss": 2.2021, "step": 211 }, { "epoch": 0.05171993169065626, "grad_norm": 0.7967678904533386, "learning_rate": 2.487116831277826e-07, "loss": 2.0156, "step": 212 }, { "epoch": 0.051963893632593315, "grad_norm": 0.9619562029838562, "learning_rate": 2.4870530801320607e-07, "loss": 2.0662, "step": 213 }, { "epoch": 0.05220785557453037, "grad_norm": 0.6969118714332581, "learning_rate": 2.486989299610895e-07, "loss": 2.1831, "step": 214 }, { "epoch": 0.05245181751646743, "grad_norm": 0.9280874729156494, "learning_rate": 2.4869254896940207e-07, "loss": 2.1238, "step": 215 }, { "epoch": 0.052695779458404486, "grad_norm": 0.8420175909996033, "learning_rate": 2.4868616503611124e-07, "loss": 2.0776, "step": 216 }, { "epoch": 0.05293974140034155, "grad_norm": 0.737402081489563, "learning_rate": 2.486797781591823e-07, "loss": 1.9902, "step": 217 }, { "epoch": 0.05318370334227861, "grad_norm": 1.4963791370391846, "learning_rate": 2.4867338833657884e-07, "loss": 2.1924, "step": 218 }, { "epoch": 0.053427665284215664, "grad_norm": 0.8707871437072754, "learning_rate": 2.4866699556626256e-07, "loss": 2.3329, "step": 219 }, { "epoch": 0.05367162722615272, "grad_norm": 2.2541556358337402, "learning_rate": 2.486605998461933e-07, "loss": 2.2628, "step": 220 }, { "epoch": 0.05367162722615272, "eval_loss": 2.110635995864868, "eval_runtime": 82.4261, "eval_samples_per_second": 3.106, "eval_steps_per_second": 0.776, "step": 220 }, { "epoch": 0.05391558916808978, "grad_norm": 0.7243703603744507, "learning_rate": 2.4865420117432884e-07, "loss": 2.2738, "step": 221 }, { "epoch": 0.054159551110026835, "grad_norm": 2.9058852195739746, "learning_rate": 2.4864779954862536e-07, "loss": 2.2954, "step": 222 }, { "epoch": 0.05440351305196389, "grad_norm": 0.6986073851585388, "learning_rate": 2.486413949670369e-07, "loss": 2.062, "step": 223 }, { "epoch": 0.05464747499390095, "grad_norm": 0.8654193878173828, "learning_rate": 2.486349874275158e-07, "loss": 1.8318, "step": 224 }, { "epoch": 0.054891436935838006, "grad_norm": 1.1562286615371704, "learning_rate": 2.486285769280123e-07, "loss": 2.3011, "step": 225 }, { "epoch": 0.05513539887777507, "grad_norm": 0.9906584024429321, "learning_rate": 2.48622163466475e-07, "loss": 2.0338, "step": 226 }, { "epoch": 0.05537936081971213, "grad_norm": 0.7650761008262634, "learning_rate": 2.486157470408504e-07, "loss": 2.2399, "step": 227 }, { "epoch": 0.055623322761649184, "grad_norm": 0.8273106217384338, "learning_rate": 2.4860932764908314e-07, "loss": 2.1346, "step": 228 }, { "epoch": 0.05586728470358624, "grad_norm": 0.8235612511634827, "learning_rate": 2.486029052891161e-07, "loss": 2.1117, "step": 229 }, { "epoch": 0.0561112466455233, "grad_norm": 0.7551500201225281, "learning_rate": 2.4859647995889003e-07, "loss": 2.0045, "step": 230 }, { "epoch": 0.0561112466455233, "eval_loss": 2.105764150619507, "eval_runtime": 82.5574, "eval_samples_per_second": 3.101, "eval_steps_per_second": 0.775, "step": 230 }, { "epoch": 0.056355208587460355, "grad_norm": 0.6984049081802368, "learning_rate": 2.4859005165634397e-07, "loss": 1.9406, "step": 231 }, { "epoch": 0.05659917052939741, "grad_norm": 0.7007377743721008, "learning_rate": 2.4858362037941493e-07, "loss": 2.1012, "step": 232 }, { "epoch": 0.05684313247133447, "grad_norm": 0.7448357343673706, "learning_rate": 2.485771861260381e-07, "loss": 2.0201, "step": 233 }, { "epoch": 0.05708709441327153, "grad_norm": 0.8038643002510071, "learning_rate": 2.485707488941467e-07, "loss": 2.113, "step": 234 }, { "epoch": 0.05733105635520859, "grad_norm": 0.683526873588562, "learning_rate": 2.48564308681672e-07, "loss": 2.2006, "step": 235 }, { "epoch": 0.05757501829714565, "grad_norm": 1.10641610622406, "learning_rate": 2.485578654865435e-07, "loss": 2.1394, "step": 236 }, { "epoch": 0.057818980239082704, "grad_norm": 0.9854748249053955, "learning_rate": 2.485514193066886e-07, "loss": 2.0461, "step": 237 }, { "epoch": 0.05806294218101976, "grad_norm": 0.9628323316574097, "learning_rate": 2.485449701400329e-07, "loss": 2.0795, "step": 238 }, { "epoch": 0.05830690412295682, "grad_norm": 0.7303637862205505, "learning_rate": 2.485385179845001e-07, "loss": 2.1231, "step": 239 }, { "epoch": 0.058550866064893875, "grad_norm": 1.024084448814392, "learning_rate": 2.4853206283801187e-07, "loss": 2.2952, "step": 240 }, { "epoch": 0.058550866064893875, "eval_loss": 2.1005725860595703, "eval_runtime": 82.6605, "eval_samples_per_second": 3.097, "eval_steps_per_second": 0.774, "step": 240 }, { "epoch": 0.05879482800683093, "grad_norm": 0.7839118242263794, "learning_rate": 2.4852560469848794e-07, "loss": 2.1039, "step": 241 }, { "epoch": 0.05903878994876799, "grad_norm": 0.8608528971672058, "learning_rate": 2.4851914356384624e-07, "loss": 2.0228, "step": 242 }, { "epoch": 0.05928275189070505, "grad_norm": 0.7604301571846008, "learning_rate": 2.485126794320027e-07, "loss": 2.085, "step": 243 }, { "epoch": 0.05952671383264211, "grad_norm": 0.6948022842407227, "learning_rate": 2.4850621230087125e-07, "loss": 2.1667, "step": 244 }, { "epoch": 0.05977067577457917, "grad_norm": 0.9847801327705383, "learning_rate": 2.4849974216836405e-07, "loss": 2.132, "step": 245 }, { "epoch": 0.060014637716516224, "grad_norm": 0.647404670715332, "learning_rate": 2.4849326903239115e-07, "loss": 2.0881, "step": 246 }, { "epoch": 0.06025859965845328, "grad_norm": 0.8329553604125977, "learning_rate": 2.4848679289086074e-07, "loss": 2.1774, "step": 247 }, { "epoch": 0.06050256160039034, "grad_norm": 0.6948926448822021, "learning_rate": 2.4848031374167913e-07, "loss": 2.1431, "step": 248 }, { "epoch": 0.060746523542327395, "grad_norm": 1.132636547088623, "learning_rate": 2.484738315827505e-07, "loss": 2.1713, "step": 249 }, { "epoch": 0.06099048548426445, "grad_norm": 0.7370600700378418, "learning_rate": 2.484673464119773e-07, "loss": 2.1122, "step": 250 }, { "epoch": 0.06099048548426445, "eval_loss": 2.0957703590393066, "eval_runtime": 82.5102, "eval_samples_per_second": 3.103, "eval_steps_per_second": 0.776, "step": 250 }, { "epoch": 0.061234447426201516, "grad_norm": 0.9531499147415161, "learning_rate": 2.484608582272598e-07, "loss": 2.1948, "step": 251 }, { "epoch": 0.06147840936813857, "grad_norm": 49.09546661376953, "learning_rate": 2.4845436702649656e-07, "loss": 2.3097, "step": 252 }, { "epoch": 0.06172237131007563, "grad_norm": 0.9064823389053345, "learning_rate": 2.48447872807584e-07, "loss": 2.1416, "step": 253 }, { "epoch": 0.06196633325201269, "grad_norm": 0.8501002788543701, "learning_rate": 2.484413755684167e-07, "loss": 2.1266, "step": 254 }, { "epoch": 0.062210295193949744, "grad_norm": 0.9969101548194885, "learning_rate": 2.484348753068872e-07, "loss": 2.1585, "step": 255 }, { "epoch": 0.0624542571358868, "grad_norm": 0.7964323163032532, "learning_rate": 2.484283720208861e-07, "loss": 2.0506, "step": 256 }, { "epoch": 0.06269821907782386, "grad_norm": 0.8279253840446472, "learning_rate": 2.4842186570830207e-07, "loss": 2.0881, "step": 257 }, { "epoch": 0.06294218101976091, "grad_norm": 0.751754879951477, "learning_rate": 2.484153563670218e-07, "loss": 1.93, "step": 258 }, { "epoch": 0.06318614296169797, "grad_norm": 0.7673382759094238, "learning_rate": 2.4840884399493006e-07, "loss": 2.0995, "step": 259 }, { "epoch": 0.06343010490363503, "grad_norm": 0.6529645919799805, "learning_rate": 2.4840232858990943e-07, "loss": 2.0436, "step": 260 }, { "epoch": 0.06343010490363503, "eval_loss": 2.0920004844665527, "eval_runtime": 82.4308, "eval_samples_per_second": 3.106, "eval_steps_per_second": 0.776, "step": 260 }, { "epoch": 0.06367406684557209, "grad_norm": 1.109803557395935, "learning_rate": 2.4839581014984084e-07, "loss": 2.1963, "step": 261 }, { "epoch": 0.06391802878750914, "grad_norm": 0.8001452088356018, "learning_rate": 2.48389288672603e-07, "loss": 2.2763, "step": 262 }, { "epoch": 0.0641619907294462, "grad_norm": 0.8246157169342041, "learning_rate": 2.483827641560728e-07, "loss": 2.0909, "step": 263 }, { "epoch": 0.06440595267138327, "grad_norm": 1.1484432220458984, "learning_rate": 2.48376236598125e-07, "loss": 2.0256, "step": 264 }, { "epoch": 0.06464991461332033, "grad_norm": 0.7669978737831116, "learning_rate": 2.4836970599663255e-07, "loss": 2.119, "step": 265 }, { "epoch": 0.06489387655525738, "grad_norm": 0.8040069341659546, "learning_rate": 2.4836317234946626e-07, "loss": 2.2469, "step": 266 }, { "epoch": 0.06513783849719444, "grad_norm": 1.2191132307052612, "learning_rate": 2.48356635654495e-07, "loss": 2.1695, "step": 267 }, { "epoch": 0.0653818004391315, "grad_norm": 0.7855357527732849, "learning_rate": 2.4835009590958575e-07, "loss": 2.165, "step": 268 }, { "epoch": 0.06562576238106856, "grad_norm": 0.692392110824585, "learning_rate": 2.483435531126034e-07, "loss": 2.1611, "step": 269 }, { "epoch": 0.06586972432300561, "grad_norm": 0.7394700646400452, "learning_rate": 2.483370072614108e-07, "loss": 2.0499, "step": 270 }, { "epoch": 0.06586972432300561, "eval_loss": 2.0882444381713867, "eval_runtime": 82.4623, "eval_samples_per_second": 3.104, "eval_steps_per_second": 0.776, "step": 270 }, { "epoch": 0.06611368626494267, "grad_norm": 0.8597064018249512, "learning_rate": 2.483304583538689e-07, "loss": 2.1721, "step": 271 }, { "epoch": 0.06635764820687973, "grad_norm": 0.6948525905609131, "learning_rate": 2.4832390638783666e-07, "loss": 2.1164, "step": 272 }, { "epoch": 0.06660161014881678, "grad_norm": 0.9271346926689148, "learning_rate": 2.4831735136117095e-07, "loss": 2.222, "step": 273 }, { "epoch": 0.06684557209075384, "grad_norm": 0.7124070525169373, "learning_rate": 2.4831079327172674e-07, "loss": 2.1953, "step": 274 }, { "epoch": 0.0670895340326909, "grad_norm": 0.7730213403701782, "learning_rate": 2.4830423211735686e-07, "loss": 2.1612, "step": 275 }, { "epoch": 0.06733349597462795, "grad_norm": 0.7319573760032654, "learning_rate": 2.482976678959123e-07, "loss": 2.1091, "step": 276 }, { "epoch": 0.06757745791656501, "grad_norm": 0.7052650451660156, "learning_rate": 2.4829110060524197e-07, "loss": 2.0802, "step": 277 }, { "epoch": 0.06782141985850207, "grad_norm": 0.6924635171890259, "learning_rate": 2.482845302431927e-07, "loss": 2.0242, "step": 278 }, { "epoch": 0.06806538180043913, "grad_norm": 0.8777127861976624, "learning_rate": 2.4827795680760933e-07, "loss": 2.1926, "step": 279 }, { "epoch": 0.06830934374237618, "grad_norm": 0.6327698230743408, "learning_rate": 2.482713802963348e-07, "loss": 2.0477, "step": 280 }, { "epoch": 0.06830934374237618, "eval_loss": 2.0836799144744873, "eval_runtime": 82.5559, "eval_samples_per_second": 3.101, "eval_steps_per_second": 0.775, "step": 280 }, { "epoch": 0.06855330568431325, "grad_norm": 0.8194009065628052, "learning_rate": 2.4826480070720985e-07, "loss": 2.1662, "step": 281 }, { "epoch": 0.06879726762625031, "grad_norm": 2.9831783771514893, "learning_rate": 2.482582180380734e-07, "loss": 2.0296, "step": 282 }, { "epoch": 0.06904122956818737, "grad_norm": 0.7184981107711792, "learning_rate": 2.482516322867622e-07, "loss": 2.0806, "step": 283 }, { "epoch": 0.06928519151012442, "grad_norm": 0.9403382539749146, "learning_rate": 2.48245043451111e-07, "loss": 2.267, "step": 284 }, { "epoch": 0.06952915345206148, "grad_norm": 0.7837559580802917, "learning_rate": 2.482384515289525e-07, "loss": 2.1853, "step": 285 }, { "epoch": 0.06977311539399854, "grad_norm": 0.6912328004837036, "learning_rate": 2.482318565181174e-07, "loss": 2.1157, "step": 286 }, { "epoch": 0.0700170773359356, "grad_norm": 0.7220721244812012, "learning_rate": 2.4822525841643453e-07, "loss": 2.1604, "step": 287 }, { "epoch": 0.07026103927787265, "grad_norm": 0.6910116672515869, "learning_rate": 2.482186572217303e-07, "loss": 2.3089, "step": 288 }, { "epoch": 0.07050500121980971, "grad_norm": 0.6678460836410522, "learning_rate": 2.482120529318294e-07, "loss": 2.144, "step": 289 }, { "epoch": 0.07074896316174677, "grad_norm": 0.8362970948219299, "learning_rate": 2.482054455445545e-07, "loss": 2.1724, "step": 290 }, { "epoch": 0.07074896316174677, "eval_loss": 2.079866409301758, "eval_runtime": 82.5785, "eval_samples_per_second": 3.1, "eval_steps_per_second": 0.775, "step": 290 }, { "epoch": 0.07099292510368382, "grad_norm": 0.6900479793548584, "learning_rate": 2.481988350577259e-07, "loss": 2.0982, "step": 291 }, { "epoch": 0.07123688704562088, "grad_norm": 0.737310528755188, "learning_rate": 2.481922214691622e-07, "loss": 2.0816, "step": 292 }, { "epoch": 0.07148084898755794, "grad_norm": 0.6768659949302673, "learning_rate": 2.481856047766798e-07, "loss": 2.1823, "step": 293 }, { "epoch": 0.071724810929495, "grad_norm": 0.6438968181610107, "learning_rate": 2.4817898497809304e-07, "loss": 2.0001, "step": 294 }, { "epoch": 0.07196877287143205, "grad_norm": 0.9217244982719421, "learning_rate": 2.4817236207121427e-07, "loss": 2.1251, "step": 295 }, { "epoch": 0.07221273481336911, "grad_norm": 0.984214186668396, "learning_rate": 2.4816573605385374e-07, "loss": 2.2588, "step": 296 }, { "epoch": 0.07245669675530617, "grad_norm": 0.6501964330673218, "learning_rate": 2.481591069238197e-07, "loss": 1.9671, "step": 297 }, { "epoch": 0.07270065869724324, "grad_norm": 0.7080546617507935, "learning_rate": 2.481524746789182e-07, "loss": 2.1113, "step": 298 }, { "epoch": 0.0729446206391803, "grad_norm": 0.759996235370636, "learning_rate": 2.4814583931695343e-07, "loss": 2.1671, "step": 299 }, { "epoch": 0.07318858258111735, "grad_norm": 0.7538090348243713, "learning_rate": 2.4813920083572734e-07, "loss": 2.0294, "step": 300 }, { "epoch": 0.07318858258111735, "eval_loss": 2.0766212940216064, "eval_runtime": 82.5747, "eval_samples_per_second": 3.1, "eval_steps_per_second": 0.775, "step": 300 }, { "epoch": 0.07343254452305441, "grad_norm": 1.0901976823806763, "learning_rate": 2.481325592330399e-07, "loss": 2.1481, "step": 301 }, { "epoch": 0.07367650646499146, "grad_norm": 0.5969391465187073, "learning_rate": 2.4812591450668896e-07, "loss": 2.0585, "step": 302 }, { "epoch": 0.07392046840692852, "grad_norm": 1.0443135499954224, "learning_rate": 2.4811926665447034e-07, "loss": 2.1669, "step": 303 }, { "epoch": 0.07416443034886558, "grad_norm": 0.7203410863876343, "learning_rate": 2.481126156741779e-07, "loss": 2.1098, "step": 304 }, { "epoch": 0.07440839229080264, "grad_norm": 0.8156256079673767, "learning_rate": 2.481059615636031e-07, "loss": 2.0854, "step": 305 }, { "epoch": 0.07465235423273969, "grad_norm": 0.9032734036445618, "learning_rate": 2.480993043205356e-07, "loss": 2.0709, "step": 306 }, { "epoch": 0.07489631617467675, "grad_norm": 0.7365292310714722, "learning_rate": 2.4809264394276297e-07, "loss": 2.0466, "step": 307 }, { "epoch": 0.0751402781166138, "grad_norm": 1.4419785737991333, "learning_rate": 2.4808598042807057e-07, "loss": 2.1844, "step": 308 }, { "epoch": 0.07538424005855086, "grad_norm": 0.8487168550491333, "learning_rate": 2.4807931377424167e-07, "loss": 2.0844, "step": 309 }, { "epoch": 0.07562820200048792, "grad_norm": 0.6521626114845276, "learning_rate": 2.4807264397905757e-07, "loss": 2.1657, "step": 310 }, { "epoch": 0.07562820200048792, "eval_loss": 2.0734453201293945, "eval_runtime": 82.6342, "eval_samples_per_second": 3.098, "eval_steps_per_second": 0.774, "step": 310 }, { "epoch": 0.07587216394242498, "grad_norm": 1.0338906049728394, "learning_rate": 2.480659710402974e-07, "loss": 2.0817, "step": 311 }, { "epoch": 0.07611612588436203, "grad_norm": 0.7622800469398499, "learning_rate": 2.480592949557383e-07, "loss": 2.2028, "step": 312 }, { "epoch": 0.07636008782629909, "grad_norm": 0.698598325252533, "learning_rate": 2.4805261572315513e-07, "loss": 2.1197, "step": 313 }, { "epoch": 0.07660404976823615, "grad_norm": 0.9301127195358276, "learning_rate": 2.480459333403207e-07, "loss": 2.0719, "step": 314 }, { "epoch": 0.07684801171017322, "grad_norm": 0.7141032814979553, "learning_rate": 2.480392478050059e-07, "loss": 1.9979, "step": 315 }, { "epoch": 0.07709197365211028, "grad_norm": 0.8011495471000671, "learning_rate": 2.4803255911497927e-07, "loss": 2.2752, "step": 316 }, { "epoch": 0.07733593559404733, "grad_norm": 0.6986822485923767, "learning_rate": 2.4802586726800744e-07, "loss": 2.1448, "step": 317 }, { "epoch": 0.07757989753598439, "grad_norm": 0.7055565118789673, "learning_rate": 2.4801917226185476e-07, "loss": 2.1929, "step": 318 }, { "epoch": 0.07782385947792145, "grad_norm": 0.6355963945388794, "learning_rate": 2.480124740942837e-07, "loss": 2.0868, "step": 319 }, { "epoch": 0.0780678214198585, "grad_norm": 0.6996073126792908, "learning_rate": 2.480057727630543e-07, "loss": 2.1996, "step": 320 }, { "epoch": 0.0780678214198585, "eval_loss": 2.070326805114746, "eval_runtime": 82.8377, "eval_samples_per_second": 3.09, "eval_steps_per_second": 0.773, "step": 320 }, { "epoch": 0.07831178336179556, "grad_norm": 0.9807422757148743, "learning_rate": 2.479990682659248e-07, "loss": 2.1283, "step": 321 }, { "epoch": 0.07855574530373262, "grad_norm": 0.660064697265625, "learning_rate": 2.4799236060065104e-07, "loss": 2.1273, "step": 322 }, { "epoch": 0.07879970724566968, "grad_norm": 0.6279289722442627, "learning_rate": 2.47985649764987e-07, "loss": 2.0789, "step": 323 }, { "epoch": 0.07904366918760673, "grad_norm": 0.8380366563796997, "learning_rate": 2.4797893575668437e-07, "loss": 2.0606, "step": 324 }, { "epoch": 0.07928763112954379, "grad_norm": 0.6436744928359985, "learning_rate": 2.4797221857349267e-07, "loss": 2.1684, "step": 325 }, { "epoch": 0.07953159307148085, "grad_norm": 0.9657268524169922, "learning_rate": 2.4796549821315954e-07, "loss": 2.1487, "step": 326 }, { "epoch": 0.0797755550134179, "grad_norm": 0.7531183362007141, "learning_rate": 2.479587746734302e-07, "loss": 2.2133, "step": 327 }, { "epoch": 0.08001951695535496, "grad_norm": 0.6298527121543884, "learning_rate": 2.4795204795204794e-07, "loss": 2.1123, "step": 328 }, { "epoch": 0.08026347889729202, "grad_norm": 1.2394922971725464, "learning_rate": 2.479453180467538e-07, "loss": 2.1458, "step": 329 }, { "epoch": 0.08050744083922907, "grad_norm": 0.7295296788215637, "learning_rate": 2.479385849552867e-07, "loss": 2.127, "step": 330 }, { "epoch": 0.08050744083922907, "eval_loss": 2.0674209594726562, "eval_runtime": 82.4919, "eval_samples_per_second": 3.103, "eval_steps_per_second": 0.776, "step": 330 }, { "epoch": 0.08075140278116613, "grad_norm": 0.7010200619697571, "learning_rate": 2.479318486753834e-07, "loss": 1.9559, "step": 331 }, { "epoch": 0.0809953647231032, "grad_norm": 0.9205324053764343, "learning_rate": 2.479251092047787e-07, "loss": 2.3111, "step": 332 }, { "epoch": 0.08123932666504026, "grad_norm": 0.7850201725959778, "learning_rate": 2.4791836654120494e-07, "loss": 2.0908, "step": 333 }, { "epoch": 0.08148328860697732, "grad_norm": 0.8352589011192322, "learning_rate": 2.4791162068239256e-07, "loss": 2.0713, "step": 334 }, { "epoch": 0.08172725054891437, "grad_norm": 1.1444090604782104, "learning_rate": 2.4790487162606977e-07, "loss": 2.2373, "step": 335 }, { "epoch": 0.08197121249085143, "grad_norm": 0.6652919054031372, "learning_rate": 2.478981193699626e-07, "loss": 2.0393, "step": 336 }, { "epoch": 0.08221517443278849, "grad_norm": 1.2522311210632324, "learning_rate": 2.478913639117949e-07, "loss": 2.1158, "step": 337 }, { "epoch": 0.08245913637472554, "grad_norm": 0.6700438261032104, "learning_rate": 2.478846052492885e-07, "loss": 2.1475, "step": 338 }, { "epoch": 0.0827030983166626, "grad_norm": 0.594170868396759, "learning_rate": 2.478778433801629e-07, "loss": 2.2158, "step": 339 }, { "epoch": 0.08294706025859966, "grad_norm": 0.7548016905784607, "learning_rate": 2.478710783021355e-07, "loss": 2.1094, "step": 340 }, { "epoch": 0.08294706025859966, "eval_loss": 2.0647270679473877, "eval_runtime": 82.4405, "eval_samples_per_second": 3.105, "eval_steps_per_second": 0.776, "step": 340 }, { "epoch": 0.08319102220053672, "grad_norm": 0.6635969877243042, "learning_rate": 2.4786431001292156e-07, "loss": 2.0065, "step": 341 }, { "epoch": 0.08343498414247377, "grad_norm": 0.5917369723320007, "learning_rate": 2.478575385102342e-07, "loss": 2.1334, "step": 342 }, { "epoch": 0.08367894608441083, "grad_norm": 0.7114012241363525, "learning_rate": 2.4785076379178427e-07, "loss": 2.2898, "step": 343 }, { "epoch": 0.08392290802634789, "grad_norm": 0.6210088729858398, "learning_rate": 2.478439858552805e-07, "loss": 2.1155, "step": 344 }, { "epoch": 0.08416686996828494, "grad_norm": 3.0671420097351074, "learning_rate": 2.4783720469842943e-07, "loss": 2.2391, "step": 345 }, { "epoch": 0.084410831910222, "grad_norm": 0.6841104030609131, "learning_rate": 2.4783042031893544e-07, "loss": 1.9859, "step": 346 }, { "epoch": 0.08465479385215906, "grad_norm": 0.8301260471343994, "learning_rate": 2.478236327145007e-07, "loss": 2.1063, "step": 347 }, { "epoch": 0.08489875579409611, "grad_norm": 0.688525378704071, "learning_rate": 2.4781684188282526e-07, "loss": 2.0468, "step": 348 }, { "epoch": 0.08514271773603319, "grad_norm": 0.7203090190887451, "learning_rate": 2.4781004782160693e-07, "loss": 2.1424, "step": 349 }, { "epoch": 0.08538667967797024, "grad_norm": 0.6987703442573547, "learning_rate": 2.478032505285412e-07, "loss": 2.0616, "step": 350 }, { "epoch": 0.08538667967797024, "eval_loss": 2.0618653297424316, "eval_runtime": 82.4978, "eval_samples_per_second": 3.103, "eval_steps_per_second": 0.776, "step": 350 }, { "epoch": 0.0856306416199073, "grad_norm": 0.8080345392227173, "learning_rate": 2.4779645000132166e-07, "loss": 2.1916, "step": 351 }, { "epoch": 0.08587460356184436, "grad_norm": 0.6830054521560669, "learning_rate": 2.477896462376395e-07, "loss": 2.0308, "step": 352 }, { "epoch": 0.08611856550378141, "grad_norm": 0.9455615282058716, "learning_rate": 2.4778283923518366e-07, "loss": 2.2529, "step": 353 }, { "epoch": 0.08636252744571847, "grad_norm": 1.0438265800476074, "learning_rate": 2.477760289916411e-07, "loss": 2.1359, "step": 354 }, { "epoch": 0.08660648938765553, "grad_norm": 0.7835482358932495, "learning_rate": 2.477692155046964e-07, "loss": 2.2987, "step": 355 }, { "epoch": 0.08685045132959258, "grad_norm": 1.003544569015503, "learning_rate": 2.47762398772032e-07, "loss": 2.2476, "step": 356 }, { "epoch": 0.08709441327152964, "grad_norm": 0.7639005780220032, "learning_rate": 2.4775557879132803e-07, "loss": 2.0089, "step": 357 }, { "epoch": 0.0873383752134667, "grad_norm": 0.6183759570121765, "learning_rate": 2.4774875556026265e-07, "loss": 2.0263, "step": 358 }, { "epoch": 0.08758233715540376, "grad_norm": 0.9802903532981873, "learning_rate": 2.477419290765115e-07, "loss": 2.0125, "step": 359 }, { "epoch": 0.08782629909734081, "grad_norm": 0.7445787787437439, "learning_rate": 2.4773509933774833e-07, "loss": 1.9287, "step": 360 }, { "epoch": 0.08782629909734081, "eval_loss": 2.060185194015503, "eval_runtime": 82.5332, "eval_samples_per_second": 3.102, "eval_steps_per_second": 0.775, "step": 360 }, { "epoch": 0.08807026103927787, "grad_norm": 0.6245632767677307, "learning_rate": 2.4772826634164435e-07, "loss": 1.9036, "step": 361 }, { "epoch": 0.08831422298121493, "grad_norm": 0.822571873664856, "learning_rate": 2.4772143008586876e-07, "loss": 2.1521, "step": 362 }, { "epoch": 0.08855818492315198, "grad_norm": 0.7154176235198975, "learning_rate": 2.4771459056808844e-07, "loss": 2.1507, "step": 363 }, { "epoch": 0.08880214686508904, "grad_norm": 0.7716903686523438, "learning_rate": 2.477077477859681e-07, "loss": 2.0469, "step": 364 }, { "epoch": 0.0890461088070261, "grad_norm": 0.8829085230827332, "learning_rate": 2.4770090173717014e-07, "loss": 2.0799, "step": 365 }, { "epoch": 0.08929007074896317, "grad_norm": 0.6315395832061768, "learning_rate": 2.4769405241935484e-07, "loss": 2.0283, "step": 366 }, { "epoch": 0.08953403269090023, "grad_norm": 0.6415708661079407, "learning_rate": 2.476871998301802e-07, "loss": 2.0164, "step": 367 }, { "epoch": 0.08977799463283728, "grad_norm": 0.6306458115577698, "learning_rate": 2.476803439673019e-07, "loss": 2.1085, "step": 368 }, { "epoch": 0.09002195657477434, "grad_norm": 0.8052152991294861, "learning_rate": 2.476734848283735e-07, "loss": 2.1536, "step": 369 }, { "epoch": 0.0902659185167114, "grad_norm": 0.7443030476570129, "learning_rate": 2.476666224110462e-07, "loss": 2.2221, "step": 370 }, { "epoch": 0.0902659185167114, "eval_loss": 2.057771682739258, "eval_runtime": 82.4526, "eval_samples_per_second": 3.105, "eval_steps_per_second": 0.776, "step": 370 }, { "epoch": 0.09050988045864845, "grad_norm": 0.717291533946991, "learning_rate": 2.476597567129691e-07, "loss": 1.9786, "step": 371 }, { "epoch": 0.09075384240058551, "grad_norm": 0.7804278135299683, "learning_rate": 2.4765288773178894e-07, "loss": 2.1336, "step": 372 }, { "epoch": 0.09099780434252257, "grad_norm": 0.5851336121559143, "learning_rate": 2.476460154651503e-07, "loss": 2.0796, "step": 373 }, { "epoch": 0.09124176628445962, "grad_norm": 0.7063292860984802, "learning_rate": 2.4763913991069527e-07, "loss": 2.1067, "step": 374 }, { "epoch": 0.09148572822639668, "grad_norm": 0.6482515335083008, "learning_rate": 2.4763226106606407e-07, "loss": 2.0642, "step": 375 }, { "epoch": 0.09172969016833374, "grad_norm": 1.2077556848526, "learning_rate": 2.476253789288943e-07, "loss": 1.9154, "step": 376 }, { "epoch": 0.0919736521102708, "grad_norm": 0.8584334254264832, "learning_rate": 2.4761849349682154e-07, "loss": 1.9837, "step": 377 }, { "epoch": 0.09221761405220785, "grad_norm": 0.5971949696540833, "learning_rate": 2.4761160476747895e-07, "loss": 1.9219, "step": 378 }, { "epoch": 0.09246157599414491, "grad_norm": 1.1772984266281128, "learning_rate": 2.4760471273849755e-07, "loss": 2.1976, "step": 379 }, { "epoch": 0.09270553793608197, "grad_norm": 1.0061829090118408, "learning_rate": 2.47597817407506e-07, "loss": 2.1878, "step": 380 }, { "epoch": 0.09270553793608197, "eval_loss": 2.0550973415374756, "eval_runtime": 82.389, "eval_samples_per_second": 3.107, "eval_steps_per_second": 0.777, "step": 380 }, { "epoch": 0.09294949987801902, "grad_norm": 89.18840789794922, "learning_rate": 2.475909187721307e-07, "loss": 2.1648, "step": 381 }, { "epoch": 0.09319346181995608, "grad_norm": 1.159109115600586, "learning_rate": 2.4758401682999573e-07, "loss": 2.1707, "step": 382 }, { "epoch": 0.09343742376189315, "grad_norm": 0.7915341258049011, "learning_rate": 2.475771115787231e-07, "loss": 2.11, "step": 383 }, { "epoch": 0.09368138570383021, "grad_norm": 0.691881537437439, "learning_rate": 2.475702030159322e-07, "loss": 1.9882, "step": 384 }, { "epoch": 0.09392534764576727, "grad_norm": 1.018052577972412, "learning_rate": 2.475632911392405e-07, "loss": 2.0549, "step": 385 }, { "epoch": 0.09416930958770432, "grad_norm": 0.9836434721946716, "learning_rate": 2.475563759462629e-07, "loss": 2.177, "step": 386 }, { "epoch": 0.09441327152964138, "grad_norm": 0.7274807691574097, "learning_rate": 2.475494574346122e-07, "loss": 2.1897, "step": 387 }, { "epoch": 0.09465723347157844, "grad_norm": 0.853111982345581, "learning_rate": 2.475425356018988e-07, "loss": 2.0191, "step": 388 }, { "epoch": 0.0949011954135155, "grad_norm": 0.7096747159957886, "learning_rate": 2.475356104457307e-07, "loss": 2.0043, "step": 389 }, { "epoch": 0.09514515735545255, "grad_norm": 0.59073406457901, "learning_rate": 2.4752868196371393e-07, "loss": 2.1815, "step": 390 }, { "epoch": 0.09514515735545255, "eval_loss": 2.0534627437591553, "eval_runtime": 82.449, "eval_samples_per_second": 3.105, "eval_steps_per_second": 0.776, "step": 390 }, { "epoch": 0.09538911929738961, "grad_norm": 1.0871241092681885, "learning_rate": 2.47521750153452e-07, "loss": 2.3593, "step": 391 }, { "epoch": 0.09563308123932666, "grad_norm": 0.7196955680847168, "learning_rate": 2.4751481501254606e-07, "loss": 2.1606, "step": 392 }, { "epoch": 0.09587704318126372, "grad_norm": 0.6455821394920349, "learning_rate": 2.4750787653859505e-07, "loss": 2.1609, "step": 393 }, { "epoch": 0.09612100512320078, "grad_norm": 0.8399761915206909, "learning_rate": 2.475009347291956e-07, "loss": 2.2308, "step": 394 }, { "epoch": 0.09636496706513784, "grad_norm": 0.7365739941596985, "learning_rate": 2.47493989581942e-07, "loss": 2.1575, "step": 395 }, { "epoch": 0.09660892900707489, "grad_norm": 0.7569345235824585, "learning_rate": 2.4748704109442635e-07, "loss": 2.1495, "step": 396 }, { "epoch": 0.09685289094901195, "grad_norm": 0.6441726088523865, "learning_rate": 2.4748008926423817e-07, "loss": 2.0264, "step": 397 }, { "epoch": 0.097096852890949, "grad_norm": 0.6600158214569092, "learning_rate": 2.474731340889649e-07, "loss": 2.1404, "step": 398 }, { "epoch": 0.09734081483288606, "grad_norm": 0.6377738118171692, "learning_rate": 2.4746617556619163e-07, "loss": 2.0164, "step": 399 }, { "epoch": 0.09758477677482313, "grad_norm": 0.7105040550231934, "learning_rate": 2.4745921369350094e-07, "loss": 2.1193, "step": 400 }, { "epoch": 0.09758477677482313, "eval_loss": 2.051236391067505, "eval_runtime": 82.3788, "eval_samples_per_second": 3.108, "eval_steps_per_second": 0.777, "step": 400 }, { "epoch": 0.09782873871676019, "grad_norm": 0.9596676230430603, "learning_rate": 2.474522484684733e-07, "loss": 2.1332, "step": 401 }, { "epoch": 0.09807270065869725, "grad_norm": 0.7559407353401184, "learning_rate": 2.4744527988868673e-07, "loss": 2.0184, "step": 402 }, { "epoch": 0.0983166626006343, "grad_norm": 0.9651991128921509, "learning_rate": 2.4743830795171695e-07, "loss": 2.0775, "step": 403 }, { "epoch": 0.09856062454257136, "grad_norm": 0.6360098719596863, "learning_rate": 2.474313326551373e-07, "loss": 1.9806, "step": 404 }, { "epoch": 0.09880458648450842, "grad_norm": 0.8414661288261414, "learning_rate": 2.474243539965189e-07, "loss": 2.0598, "step": 405 }, { "epoch": 0.09904854842644548, "grad_norm": 0.593665361404419, "learning_rate": 2.4741737197343045e-07, "loss": 2.1291, "step": 406 }, { "epoch": 0.09929251036838253, "grad_norm": 0.6151170134544373, "learning_rate": 2.4741038658343824e-07, "loss": 2.0288, "step": 407 }, { "epoch": 0.09953647231031959, "grad_norm": 0.8486893773078918, "learning_rate": 2.474033978241063e-07, "loss": 2.1914, "step": 408 }, { "epoch": 0.09978043425225665, "grad_norm": 0.5917589664459229, "learning_rate": 2.4739640569299634e-07, "loss": 1.997, "step": 409 }, { "epoch": 0.1000243961941937, "grad_norm": 0.6412562727928162, "learning_rate": 2.4738941018766753e-07, "loss": 2.1638, "step": 410 }, { "epoch": 0.1000243961941937, "eval_loss": 2.0491929054260254, "eval_runtime": 82.4035, "eval_samples_per_second": 3.107, "eval_steps_per_second": 0.777, "step": 410 }, { "epoch": 0.10026835813613076, "grad_norm": 0.8086695075035095, "learning_rate": 2.47382411305677e-07, "loss": 2.0884, "step": 411 }, { "epoch": 0.10051232007806782, "grad_norm": 0.7479122281074524, "learning_rate": 2.4737540904457914e-07, "loss": 2.1612, "step": 412 }, { "epoch": 0.10075628202000488, "grad_norm": 0.6791818141937256, "learning_rate": 2.4736840340192635e-07, "loss": 2.0849, "step": 413 }, { "epoch": 0.10100024396194193, "grad_norm": 0.7554202675819397, "learning_rate": 2.4736139437526835e-07, "loss": 2.0364, "step": 414 }, { "epoch": 0.10124420590387899, "grad_norm": 1.182301640510559, "learning_rate": 2.4735438196215273e-07, "loss": 2.0642, "step": 415 }, { "epoch": 0.10148816784581605, "grad_norm": 0.7850328087806702, "learning_rate": 2.4734736616012457e-07, "loss": 2.0743, "step": 416 }, { "epoch": 0.10173212978775312, "grad_norm": 0.8438737988471985, "learning_rate": 2.4734034696672667e-07, "loss": 2.1803, "step": 417 }, { "epoch": 0.10197609172969017, "grad_norm": 5.938900947570801, "learning_rate": 2.473333243794993e-07, "loss": 2.1007, "step": 418 }, { "epoch": 0.10222005367162723, "grad_norm": 0.7063493728637695, "learning_rate": 2.4732629839598054e-07, "loss": 2.0102, "step": 419 }, { "epoch": 0.10246401561356429, "grad_norm": 0.7309712171554565, "learning_rate": 2.4731926901370596e-07, "loss": 2.0201, "step": 420 }, { "epoch": 0.10246401561356429, "eval_loss": 2.047513484954834, "eval_runtime": 82.2828, "eval_samples_per_second": 3.111, "eval_steps_per_second": 0.778, "step": 420 }, { "epoch": 0.10270797755550135, "grad_norm": 0.6622780561447144, "learning_rate": 2.473122362302088e-07, "loss": 2.1224, "step": 421 }, { "epoch": 0.1029519394974384, "grad_norm": 0.5755351185798645, "learning_rate": 2.4730520004301997e-07, "loss": 2.0315, "step": 422 }, { "epoch": 0.10319590143937546, "grad_norm": 0.6571397185325623, "learning_rate": 2.472981604496678e-07, "loss": 2.1613, "step": 423 }, { "epoch": 0.10343986338131252, "grad_norm": 0.6562328338623047, "learning_rate": 2.472911174476784e-07, "loss": 2.1833, "step": 424 }, { "epoch": 0.10368382532324957, "grad_norm": 0.6494048237800598, "learning_rate": 2.4728407103457554e-07, "loss": 2.0679, "step": 425 }, { "epoch": 0.10392778726518663, "grad_norm": 1.0681666135787964, "learning_rate": 2.472770212078803e-07, "loss": 2.1321, "step": 426 }, { "epoch": 0.10417174920712369, "grad_norm": 0.7042210698127747, "learning_rate": 2.4726996796511157e-07, "loss": 2.1927, "step": 427 }, { "epoch": 0.10441571114906074, "grad_norm": 0.6480265259742737, "learning_rate": 2.4726291130378586e-07, "loss": 2.3393, "step": 428 }, { "epoch": 0.1046596730909978, "grad_norm": 0.6662938594818115, "learning_rate": 2.472558512214172e-07, "loss": 2.0135, "step": 429 }, { "epoch": 0.10490363503293486, "grad_norm": 0.6291259527206421, "learning_rate": 2.4724878771551725e-07, "loss": 2.1968, "step": 430 }, { "epoch": 0.10490363503293486, "eval_loss": 2.0453972816467285, "eval_runtime": 82.4758, "eval_samples_per_second": 3.104, "eval_steps_per_second": 0.776, "step": 430 }, { "epoch": 0.10514759697487192, "grad_norm": 3.447300910949707, "learning_rate": 2.4724172078359513e-07, "loss": 2.0465, "step": 431 }, { "epoch": 0.10539155891680897, "grad_norm": 1.3743339776992798, "learning_rate": 2.4723465042315776e-07, "loss": 2.0544, "step": 432 }, { "epoch": 0.10563552085874603, "grad_norm": 0.8059394359588623, "learning_rate": 2.4722757663170946e-07, "loss": 2.1522, "step": 433 }, { "epoch": 0.1058794828006831, "grad_norm": 0.6414276361465454, "learning_rate": 2.4722049940675223e-07, "loss": 2.1575, "step": 434 }, { "epoch": 0.10612344474262016, "grad_norm": 1.0368949174880981, "learning_rate": 2.472134187457856e-07, "loss": 1.9911, "step": 435 }, { "epoch": 0.10636740668455721, "grad_norm": 0.6836427450180054, "learning_rate": 2.4720633464630656e-07, "loss": 2.074, "step": 436 }, { "epoch": 0.10661136862649427, "grad_norm": 0.6823791861534119, "learning_rate": 2.4719924710581e-07, "loss": 2.0613, "step": 437 }, { "epoch": 0.10685533056843133, "grad_norm": 0.6797323822975159, "learning_rate": 2.4719215612178795e-07, "loss": 2.0521, "step": 438 }, { "epoch": 0.10709929251036839, "grad_norm": 0.6761137247085571, "learning_rate": 2.471850616917303e-07, "loss": 2.0434, "step": 439 }, { "epoch": 0.10734325445230544, "grad_norm": 0.6031590104103088, "learning_rate": 2.4717796381312446e-07, "loss": 2.1448, "step": 440 }, { "epoch": 0.10734325445230544, "eval_loss": 2.0439488887786865, "eval_runtime": 82.3918, "eval_samples_per_second": 3.107, "eval_steps_per_second": 0.777, "step": 440 }, { "epoch": 0.1075872163942425, "grad_norm": 0.7096999287605286, "learning_rate": 2.471708624834553e-07, "loss": 2.236, "step": 441 }, { "epoch": 0.10783117833617956, "grad_norm": 0.7802643775939941, "learning_rate": 2.471637577002053e-07, "loss": 2.0246, "step": 442 }, { "epoch": 0.10807514027811661, "grad_norm": 0.8068674802780151, "learning_rate": 2.471566494608545e-07, "loss": 2.1481, "step": 443 }, { "epoch": 0.10831910222005367, "grad_norm": 0.792898416519165, "learning_rate": 2.4714953776288044e-07, "loss": 1.9699, "step": 444 }, { "epoch": 0.10856306416199073, "grad_norm": 0.6124379634857178, "learning_rate": 2.471424226037583e-07, "loss": 2.1199, "step": 445 }, { "epoch": 0.10880702610392778, "grad_norm": 1.238782525062561, "learning_rate": 2.471353039809606e-07, "loss": 2.1193, "step": 446 }, { "epoch": 0.10905098804586484, "grad_norm": 0.5847099423408508, "learning_rate": 2.471281818919577e-07, "loss": 2.2197, "step": 447 }, { "epoch": 0.1092949499878019, "grad_norm": 0.7409020066261292, "learning_rate": 2.4712105633421726e-07, "loss": 2.1878, "step": 448 }, { "epoch": 0.10953891192973896, "grad_norm": 0.8623873591423035, "learning_rate": 2.471139273052045e-07, "loss": 2.102, "step": 449 }, { "epoch": 0.10978287387167601, "grad_norm": 0.9536486864089966, "learning_rate": 2.471067948023822e-07, "loss": 2.0028, "step": 450 }, { "epoch": 0.10978287387167601, "eval_loss": 2.042297601699829, "eval_runtime": 82.3539, "eval_samples_per_second": 3.109, "eval_steps_per_second": 0.777, "step": 450 }, { "epoch": 0.11002683581361308, "grad_norm": 2.532158136367798, "learning_rate": 2.4709965882321085e-07, "loss": 2.1556, "step": 451 }, { "epoch": 0.11027079775555014, "grad_norm": 0.7216641306877136, "learning_rate": 2.470925193651481e-07, "loss": 2.2463, "step": 452 }, { "epoch": 0.1105147596974872, "grad_norm": 0.7760343551635742, "learning_rate": 2.470853764256495e-07, "loss": 2.102, "step": 453 }, { "epoch": 0.11075872163942425, "grad_norm": 0.8911886811256409, "learning_rate": 2.4707823000216777e-07, "loss": 2.3057, "step": 454 }, { "epoch": 0.11100268358136131, "grad_norm": 0.7775437235832214, "learning_rate": 2.470710800921534e-07, "loss": 2.0745, "step": 455 }, { "epoch": 0.11124664552329837, "grad_norm": 0.6954789757728577, "learning_rate": 2.470639266930543e-07, "loss": 1.9749, "step": 456 }, { "epoch": 0.11149060746523543, "grad_norm": 0.5103456974029541, "learning_rate": 2.4705676980231577e-07, "loss": 2.053, "step": 457 }, { "epoch": 0.11173456940717248, "grad_norm": 0.9708994626998901, "learning_rate": 2.4704960941738093e-07, "loss": 2.0905, "step": 458 }, { "epoch": 0.11197853134910954, "grad_norm": 0.7622889876365662, "learning_rate": 2.4704244553569005e-07, "loss": 2.2957, "step": 459 }, { "epoch": 0.1122224932910466, "grad_norm": 0.7484825849533081, "learning_rate": 2.470352781546811e-07, "loss": 2.0953, "step": 460 }, { "epoch": 0.1122224932910466, "eval_loss": 2.0406336784362793, "eval_runtime": 82.4811, "eval_samples_per_second": 3.104, "eval_steps_per_second": 0.776, "step": 460 }, { "epoch": 0.11246645523298365, "grad_norm": 0.9012355208396912, "learning_rate": 2.4702810727178955e-07, "loss": 2.092, "step": 461 }, { "epoch": 0.11271041717492071, "grad_norm": 0.9759025573730469, "learning_rate": 2.470209328844483e-07, "loss": 2.2126, "step": 462 }, { "epoch": 0.11295437911685777, "grad_norm": 0.6701992154121399, "learning_rate": 2.470137549900877e-07, "loss": 2.0805, "step": 463 }, { "epoch": 0.11319834105879482, "grad_norm": 0.9370368123054504, "learning_rate": 2.4700657358613573e-07, "loss": 2.1702, "step": 464 }, { "epoch": 0.11344230300073188, "grad_norm": 0.7537663578987122, "learning_rate": 2.4699938867001765e-07, "loss": 2.1348, "step": 465 }, { "epoch": 0.11368626494266894, "grad_norm": 28.197704315185547, "learning_rate": 2.469922002391564e-07, "loss": 2.1985, "step": 466 }, { "epoch": 0.113930226884606, "grad_norm": 0.7190595269203186, "learning_rate": 2.4698500829097235e-07, "loss": 2.065, "step": 467 }, { "epoch": 0.11417418882654307, "grad_norm": 0.6859692931175232, "learning_rate": 2.469778128228832e-07, "loss": 2.2628, "step": 468 }, { "epoch": 0.11441815076848012, "grad_norm": 1.0769257545471191, "learning_rate": 2.4697061383230436e-07, "loss": 2.099, "step": 469 }, { "epoch": 0.11466211271041718, "grad_norm": 0.7370059490203857, "learning_rate": 2.469634113166485e-07, "loss": 2.0264, "step": 470 }, { "epoch": 0.11466211271041718, "eval_loss": 2.03948712348938, "eval_runtime": 82.5195, "eval_samples_per_second": 3.102, "eval_steps_per_second": 0.776, "step": 470 }, { "epoch": 0.11490607465235424, "grad_norm": 1.2912418842315674, "learning_rate": 2.4695620527332587e-07, "loss": 1.9958, "step": 471 }, { "epoch": 0.1151500365942913, "grad_norm": 0.7174116373062134, "learning_rate": 2.4694899569974417e-07, "loss": 2.1067, "step": 472 }, { "epoch": 0.11539399853622835, "grad_norm": 0.8682342171669006, "learning_rate": 2.4694178259330843e-07, "loss": 2.0431, "step": 473 }, { "epoch": 0.11563796047816541, "grad_norm": 0.7805954217910767, "learning_rate": 2.4693456595142144e-07, "loss": 1.936, "step": 474 }, { "epoch": 0.11588192242010247, "grad_norm": 0.6474030017852783, "learning_rate": 2.46927345771483e-07, "loss": 1.9181, "step": 475 }, { "epoch": 0.11612588436203952, "grad_norm": 1.033121943473816, "learning_rate": 2.4692012205089086e-07, "loss": 2.1393, "step": 476 }, { "epoch": 0.11636984630397658, "grad_norm": 0.6728079915046692, "learning_rate": 2.469128947870398e-07, "loss": 2.0636, "step": 477 }, { "epoch": 0.11661380824591364, "grad_norm": 0.7054829001426697, "learning_rate": 2.4690566397732225e-07, "loss": 2.0255, "step": 478 }, { "epoch": 0.11685777018785069, "grad_norm": 0.6209118962287903, "learning_rate": 2.4689842961912813e-07, "loss": 2.2017, "step": 479 }, { "epoch": 0.11710173212978775, "grad_norm": 0.8064582943916321, "learning_rate": 2.4689119170984457e-07, "loss": 2.0825, "step": 480 }, { "epoch": 0.11710173212978775, "eval_loss": 2.0377635955810547, "eval_runtime": 82.47, "eval_samples_per_second": 3.104, "eval_steps_per_second": 0.776, "step": 480 }, { "epoch": 0.11734569407172481, "grad_norm": 0.6298145651817322, "learning_rate": 2.4688395024685635e-07, "loss": 1.9996, "step": 481 }, { "epoch": 0.11758965601366186, "grad_norm": 0.6311830282211304, "learning_rate": 2.4687670522754556e-07, "loss": 2.025, "step": 482 }, { "epoch": 0.11783361795559892, "grad_norm": 0.7297334671020508, "learning_rate": 2.468694566492918e-07, "loss": 2.2315, "step": 483 }, { "epoch": 0.11807757989753598, "grad_norm": 0.8511336445808411, "learning_rate": 2.468622045094721e-07, "loss": 2.0218, "step": 484 }, { "epoch": 0.11832154183947305, "grad_norm": 1.0287209749221802, "learning_rate": 2.4685494880546075e-07, "loss": 2.0803, "step": 485 }, { "epoch": 0.1185655037814101, "grad_norm": 1.0278816223144531, "learning_rate": 2.468476895346296e-07, "loss": 2.0773, "step": 486 }, { "epoch": 0.11880946572334716, "grad_norm": 0.6326528787612915, "learning_rate": 2.468404266943481e-07, "loss": 1.9748, "step": 487 }, { "epoch": 0.11905342766528422, "grad_norm": 0.7467640042304993, "learning_rate": 2.4683316028198264e-07, "loss": 2.1406, "step": 488 }, { "epoch": 0.11929738960722128, "grad_norm": 2.9902219772338867, "learning_rate": 2.4682589029489734e-07, "loss": 2.2492, "step": 489 }, { "epoch": 0.11954135154915833, "grad_norm": 0.7884283661842346, "learning_rate": 2.468186167304538e-07, "loss": 2.125, "step": 490 }, { "epoch": 0.11954135154915833, "eval_loss": 2.0366787910461426, "eval_runtime": 82.4749, "eval_samples_per_second": 3.104, "eval_steps_per_second": 0.776, "step": 490 }, { "epoch": 0.11978531349109539, "grad_norm": 0.8623523116111755, "learning_rate": 2.4681133958601076e-07, "loss": 2.0296, "step": 491 }, { "epoch": 0.12002927543303245, "grad_norm": 1.98282790184021, "learning_rate": 2.4680405885892456e-07, "loss": 1.9251, "step": 492 }, { "epoch": 0.1202732373749695, "grad_norm": 0.6307066082954407, "learning_rate": 2.4679677454654887e-07, "loss": 2.0975, "step": 493 }, { "epoch": 0.12051719931690656, "grad_norm": 0.6835567951202393, "learning_rate": 2.4678948664623467e-07, "loss": 2.0954, "step": 494 }, { "epoch": 0.12076116125884362, "grad_norm": 0.6431268453598022, "learning_rate": 2.467821951553305e-07, "loss": 1.9843, "step": 495 }, { "epoch": 0.12100512320078068, "grad_norm": 0.6810166835784912, "learning_rate": 2.467749000711822e-07, "loss": 1.9317, "step": 496 }, { "epoch": 0.12124908514271773, "grad_norm": 0.6672168970108032, "learning_rate": 2.467676013911329e-07, "loss": 1.8837, "step": 497 }, { "epoch": 0.12149304708465479, "grad_norm": 0.6653776168823242, "learning_rate": 2.467602991125233e-07, "loss": 2.1792, "step": 498 }, { "epoch": 0.12173700902659185, "grad_norm": 0.7671879529953003, "learning_rate": 2.467529932326913e-07, "loss": 2.0863, "step": 499 }, { "epoch": 0.1219809709685289, "grad_norm": 0.7360325455665588, "learning_rate": 2.467456837489723e-07, "loss": 2.2625, "step": 500 }, { "epoch": 0.1219809709685289, "eval_loss": 2.0352420806884766, "eval_runtime": 82.4773, "eval_samples_per_second": 3.104, "eval_steps_per_second": 0.776, "step": 500 }, { "epoch": 0.12222493291046596, "grad_norm": 1.1433802843093872, "learning_rate": 2.46738370658699e-07, "loss": 2.0469, "step": 501 }, { "epoch": 0.12246889485240303, "grad_norm": 0.774438738822937, "learning_rate": 2.467310539592016e-07, "loss": 2.159, "step": 502 }, { "epoch": 0.12271285679434009, "grad_norm": 0.5934178233146667, "learning_rate": 2.467237336478074e-07, "loss": 2.1611, "step": 503 }, { "epoch": 0.12295681873627715, "grad_norm": 0.76198810338974, "learning_rate": 2.4671640972184124e-07, "loss": 2.065, "step": 504 }, { "epoch": 0.1232007806782142, "grad_norm": 2.332010507583618, "learning_rate": 2.4670908217862535e-07, "loss": 2.2185, "step": 505 }, { "epoch": 0.12344474262015126, "grad_norm": 0.590925395488739, "learning_rate": 2.4670175101547916e-07, "loss": 1.9858, "step": 506 }, { "epoch": 0.12368870456208832, "grad_norm": 0.6301215887069702, "learning_rate": 2.466944162297197e-07, "loss": 2.0693, "step": 507 }, { "epoch": 0.12393266650402537, "grad_norm": 0.6086316704750061, "learning_rate": 2.466870778186611e-07, "loss": 2.1286, "step": 508 }, { "epoch": 0.12417662844596243, "grad_norm": 0.5990573167800903, "learning_rate": 2.466797357796149e-07, "loss": 1.9842, "step": 509 }, { "epoch": 0.12442059038789949, "grad_norm": 0.7765055894851685, "learning_rate": 2.466723901098901e-07, "loss": 2.0005, "step": 510 }, { "epoch": 0.12442059038789949, "eval_loss": 2.033975124359131, "eval_runtime": 82.4194, "eval_samples_per_second": 3.106, "eval_steps_per_second": 0.777, "step": 510 }, { "epoch": 0.12466455232983654, "grad_norm": 0.7282994389533997, "learning_rate": 2.466650408067929e-07, "loss": 2.0578, "step": 511 }, { "epoch": 0.1249085142717736, "grad_norm": 0.6947245597839355, "learning_rate": 2.4665768786762685e-07, "loss": 2.1235, "step": 512 }, { "epoch": 0.12515247621371067, "grad_norm": 1.0526394844055176, "learning_rate": 2.4665033128969293e-07, "loss": 2.0394, "step": 513 }, { "epoch": 0.12539643815564772, "grad_norm": 0.7054687142372131, "learning_rate": 2.466429710702893e-07, "loss": 2.1627, "step": 514 }, { "epoch": 0.1256404000975848, "grad_norm": 1.6336970329284668, "learning_rate": 2.466356072067116e-07, "loss": 1.9868, "step": 515 }, { "epoch": 0.12588436203952183, "grad_norm": 0.5906395316123962, "learning_rate": 2.4662823969625266e-07, "loss": 2.0956, "step": 516 }, { "epoch": 0.1261283239814589, "grad_norm": 0.6716820001602173, "learning_rate": 2.466208685362027e-07, "loss": 1.7824, "step": 517 }, { "epoch": 0.12637228592339594, "grad_norm": 0.6512242555618286, "learning_rate": 2.4661349372384934e-07, "loss": 1.9613, "step": 518 }, { "epoch": 0.12661624786533301, "grad_norm": 0.6093871593475342, "learning_rate": 2.466061152564773e-07, "loss": 2.133, "step": 519 }, { "epoch": 0.12686020980727006, "grad_norm": 0.6348795890808105, "learning_rate": 2.4659873313136873e-07, "loss": 1.9917, "step": 520 }, { "epoch": 0.12686020980727006, "eval_loss": 2.032553195953369, "eval_runtime": 82.4844, "eval_samples_per_second": 3.104, "eval_steps_per_second": 0.776, "step": 520 }, { "epoch": 0.12710417174920713, "grad_norm": 0.7311941385269165, "learning_rate": 2.465913473458031e-07, "loss": 1.989, "step": 521 }, { "epoch": 0.12734813369114417, "grad_norm": 0.6399905681610107, "learning_rate": 2.465839578970572e-07, "loss": 2.0483, "step": 522 }, { "epoch": 0.12759209563308124, "grad_norm": 0.6352974772453308, "learning_rate": 2.46576564782405e-07, "loss": 2.1475, "step": 523 }, { "epoch": 0.12783605757501829, "grad_norm": 1.216728687286377, "learning_rate": 2.4656916799911783e-07, "loss": 2.1861, "step": 524 }, { "epoch": 0.12808001951695536, "grad_norm": 16.101139068603516, "learning_rate": 2.4656176754446437e-07, "loss": 2.1574, "step": 525 }, { "epoch": 0.1283239814588924, "grad_norm": 0.6247526407241821, "learning_rate": 2.4655436341571053e-07, "loss": 2.1521, "step": 526 }, { "epoch": 0.12856794340082947, "grad_norm": 0.6813322305679321, "learning_rate": 2.4654695561011943e-07, "loss": 2.1452, "step": 527 }, { "epoch": 0.12881190534276654, "grad_norm": 1.005342721939087, "learning_rate": 2.4653954412495173e-07, "loss": 2.0553, "step": 528 }, { "epoch": 0.12905586728470358, "grad_norm": 0.6594173312187195, "learning_rate": 2.46532128957465e-07, "loss": 2.0046, "step": 529 }, { "epoch": 0.12929982922664066, "grad_norm": 0.6536688208580017, "learning_rate": 2.465247101049144e-07, "loss": 2.0084, "step": 530 }, { "epoch": 0.12929982922664066, "eval_loss": 2.0311105251312256, "eval_runtime": 82.4257, "eval_samples_per_second": 3.106, "eval_steps_per_second": 0.776, "step": 530 }, { "epoch": 0.1295437911685777, "grad_norm": 0.6012054085731506, "learning_rate": 2.465172875645522e-07, "loss": 2.2109, "step": 531 }, { "epoch": 0.12978775311051477, "grad_norm": 0.7324656248092651, "learning_rate": 2.4650986133362793e-07, "loss": 2.0499, "step": 532 }, { "epoch": 0.1300317150524518, "grad_norm": 0.6712090969085693, "learning_rate": 2.465024314093885e-07, "loss": 1.9613, "step": 533 }, { "epoch": 0.13027567699438888, "grad_norm": 0.8287779688835144, "learning_rate": 2.4649499778907805e-07, "loss": 2.0791, "step": 534 }, { "epoch": 0.13051963893632593, "grad_norm": 0.7180325388908386, "learning_rate": 2.4648756046993777e-07, "loss": 2.1055, "step": 535 }, { "epoch": 0.130763600878263, "grad_norm": 0.8076268434524536, "learning_rate": 2.4648011944920643e-07, "loss": 2.0083, "step": 536 }, { "epoch": 0.13100756282020004, "grad_norm": 0.6361818909645081, "learning_rate": 2.464726747241198e-07, "loss": 2.069, "step": 537 }, { "epoch": 0.1312515247621371, "grad_norm": 0.6086621284484863, "learning_rate": 2.46465226291911e-07, "loss": 1.9604, "step": 538 }, { "epoch": 0.13149548670407415, "grad_norm": 0.5492677092552185, "learning_rate": 2.4645777414981045e-07, "loss": 2.0372, "step": 539 }, { "epoch": 0.13173944864601123, "grad_norm": 0.7199727892875671, "learning_rate": 2.4645031829504564e-07, "loss": 2.0154, "step": 540 }, { "epoch": 0.13173944864601123, "eval_loss": 2.030088186264038, "eval_runtime": 82.3631, "eval_samples_per_second": 3.108, "eval_steps_per_second": 0.777, "step": 540 }, { "epoch": 0.13198341058794827, "grad_norm": 1.0681871175765991, "learning_rate": 2.464428587248415e-07, "loss": 2.1332, "step": 541 }, { "epoch": 0.13222737252988534, "grad_norm": 2.4582200050354004, "learning_rate": 2.4643539543642e-07, "loss": 2.0242, "step": 542 }, { "epoch": 0.13247133447182238, "grad_norm": 0.556507408618927, "learning_rate": 2.4642792842700055e-07, "loss": 2.0096, "step": 543 }, { "epoch": 0.13271529641375945, "grad_norm": 0.6344817280769348, "learning_rate": 2.464204576937995e-07, "loss": 1.9323, "step": 544 }, { "epoch": 0.13295925835569652, "grad_norm": 0.6195700764656067, "learning_rate": 2.4641298323403077e-07, "loss": 2.1069, "step": 545 }, { "epoch": 0.13320322029763357, "grad_norm": 0.7518613338470459, "learning_rate": 2.464055050449052e-07, "loss": 2.0032, "step": 546 }, { "epoch": 0.13344718223957064, "grad_norm": 0.710758626461029, "learning_rate": 2.46398023123631e-07, "loss": 2.1871, "step": 547 }, { "epoch": 0.13369114418150768, "grad_norm": 0.6719003915786743, "learning_rate": 2.463905374674136e-07, "loss": 1.9962, "step": 548 }, { "epoch": 0.13393510612344475, "grad_norm": 0.9001761078834534, "learning_rate": 2.4638304807345555e-07, "loss": 2.0175, "step": 549 }, { "epoch": 0.1341790680653818, "grad_norm": 0.8284860849380493, "learning_rate": 2.463755549389567e-07, "loss": 2.0108, "step": 550 }, { "epoch": 0.1341790680653818, "eval_loss": 2.0290093421936035, "eval_runtime": 82.3581, "eval_samples_per_second": 3.108, "eval_steps_per_second": 0.777, "step": 550 }, { "epoch": 0.13442303000731887, "grad_norm": 0.5253940224647522, "learning_rate": 2.46368058061114e-07, "loss": 2.055, "step": 551 }, { "epoch": 0.1346669919492559, "grad_norm": 0.7425602078437805, "learning_rate": 2.4636055743712173e-07, "loss": 2.1348, "step": 552 }, { "epoch": 0.13491095389119298, "grad_norm": 0.5845130681991577, "learning_rate": 2.4635305306417126e-07, "loss": 2.0106, "step": 553 }, { "epoch": 0.13515491583313002, "grad_norm": 0.6662041544914246, "learning_rate": 2.463455449394512e-07, "loss": 2.1305, "step": 554 }, { "epoch": 0.1353988777750671, "grad_norm": 0.6484659910202026, "learning_rate": 2.4633803306014726e-07, "loss": 2.1217, "step": 555 }, { "epoch": 0.13564283971700414, "grad_norm": 0.7985727787017822, "learning_rate": 2.4633051742344244e-07, "loss": 2.1338, "step": 556 }, { "epoch": 0.1358868016589412, "grad_norm": 0.6523579359054565, "learning_rate": 2.463229980265169e-07, "loss": 2.0912, "step": 557 }, { "epoch": 0.13613076360087825, "grad_norm": 0.7691523432731628, "learning_rate": 2.4631547486654805e-07, "loss": 1.9642, "step": 558 }, { "epoch": 0.13637472554281532, "grad_norm": 0.7172589898109436, "learning_rate": 2.4630794794071024e-07, "loss": 1.974, "step": 559 }, { "epoch": 0.13661868748475237, "grad_norm": 0.9193669557571411, "learning_rate": 2.4630041724617526e-07, "loss": 2.0944, "step": 560 }, { "epoch": 0.13661868748475237, "eval_loss": 2.027602195739746, "eval_runtime": 82.4834, "eval_samples_per_second": 3.104, "eval_steps_per_second": 0.776, "step": 560 } ], "logging_steps": 1, "max_steps": 4099, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.767317518548992e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }