{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 11.0, "eval_steps": 3118, "global_step": 171457, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.032079042761364, "grad_norm": 1.7145991325378418, "learning_rate": 0.0007978742969568658, "loss": 0.4787, "step": 500 }, { "epoch": 0.064158085522728, "grad_norm": 1.2714065313339233, "learning_rate": 0.0007957357627082398, "loss": 0.3972, "step": 1000 }, { "epoch": 0.096237128284092, "grad_norm": 1.1843117475509644, "learning_rate": 0.0007935972284596139, "loss": 0.3736, "step": 1500 }, { "epoch": 0.128316171045456, "grad_norm": 1.1524124145507812, "learning_rate": 0.0007914629712794851, "loss": 0.3626, "step": 2000 }, { "epoch": 0.16039521380682, "grad_norm": 1.0530339479446411, "learning_rate": 0.0007893244370308591, "loss": 0.355, "step": 2500 }, { "epoch": 0.192474256568184, "grad_norm": 1.0044286251068115, "learning_rate": 0.0007871859027822331, "loss": 0.3527, "step": 3000 }, { "epoch": 0.2000449106598659, "eval_loss": 0.3370795249938965, "eval_runtime": 6.2984, "eval_samples_per_second": 79.386, "eval_steps_per_second": 10.003, "step": 3118 }, { "epoch": 0.224553299329548, "grad_norm": 1.4645978212356567, "learning_rate": 0.0007850473685336071, "loss": 0.3394, "step": 3500 }, { "epoch": 0.256632342090912, "grad_norm": 1.2185771465301514, "learning_rate": 0.0007829088342849811, "loss": 0.3398, "step": 4000 }, { "epoch": 0.288711384852276, "grad_norm": 1.3087233304977417, "learning_rate": 0.0007807703000363551, "loss": 0.3399, "step": 4500 }, { "epoch": 0.32079042761364, "grad_norm": 1.411337971687317, "learning_rate": 0.0007786360428562264, "loss": 0.3355, "step": 5000 }, { "epoch": 0.352869470375004, "grad_norm": 1.1822813749313354, "learning_rate": 0.0007764975086076004, "loss": 0.3329, "step": 5500 }, { "epoch": 0.384948513136368, "grad_norm": 0.9898410439491272, "learning_rate": 0.0007743632514274716, "loss": 0.3311, "step": 6000 }, { "epoch": 0.4000898213197318, "eval_loss": 0.31450170278549194, "eval_runtime": 9.8807, "eval_samples_per_second": 50.604, "eval_steps_per_second": 6.376, "step": 6236 }, { "epoch": 0.417027555897732, "grad_norm": 0.9827210307121277, "learning_rate": 0.0007722247171788456, "loss": 0.3251, "step": 6500 }, { "epoch": 0.449106598659096, "grad_norm": 1.2011586427688599, "learning_rate": 0.0007700861829302197, "loss": 0.3271, "step": 7000 }, { "epoch": 0.48118564142046, "grad_norm": 1.3871015310287476, "learning_rate": 0.0007679476486815936, "loss": 0.3259, "step": 7500 }, { "epoch": 0.513264684181824, "grad_norm": 1.1245856285095215, "learning_rate": 0.0007658091144329678, "loss": 0.3192, "step": 8000 }, { "epoch": 0.545343726943188, "grad_norm": 1.0883513689041138, "learning_rate": 0.0007636705801843417, "loss": 0.3209, "step": 8500 }, { "epoch": 0.577422769704552, "grad_norm": 1.0555607080459595, "learning_rate": 0.0007615320459357157, "loss": 0.3178, "step": 9000 }, { "epoch": 0.6001347319795978, "eval_loss": 0.3085324764251709, "eval_runtime": 7.9945, "eval_samples_per_second": 62.543, "eval_steps_per_second": 7.88, "step": 9354 }, { "epoch": 0.6095018124659161, "grad_norm": 1.0429250001907349, "learning_rate": 0.0007593935116870898, "loss": 0.3169, "step": 9500 }, { "epoch": 0.64158085522728, "grad_norm": 1.2145720720291138, "learning_rate": 0.0007572549774384637, "loss": 0.3152, "step": 10000 }, { "epoch": 0.673659897988644, "grad_norm": 1.3464765548706055, "learning_rate": 0.0007551207202583349, "loss": 0.3125, "step": 10500 }, { "epoch": 0.705738940750008, "grad_norm": 1.1744924783706665, "learning_rate": 0.000752982186009709, "loss": 0.3086, "step": 11000 }, { "epoch": 0.7378179835113721, "grad_norm": 1.234157919883728, "learning_rate": 0.000750843651761083, "loss": 0.3133, "step": 11500 }, { "epoch": 0.769897026272736, "grad_norm": 0.9248010516166687, "learning_rate": 0.0007487051175124569, "loss": 0.3122, "step": 12000 }, { "epoch": 0.8001796426394636, "eval_loss": 0.3022182583808899, "eval_runtime": 8.0646, "eval_samples_per_second": 61.999, "eval_steps_per_second": 7.812, "step": 12472 }, { "epoch": 0.8019760690341, "grad_norm": 0.862856388092041, "learning_rate": 0.000746566583263831, "loss": 0.308, "step": 12500 }, { "epoch": 0.834055111795464, "grad_norm": 1.0224480628967285, "learning_rate": 0.000744428049015205, "loss": 0.31, "step": 13000 }, { "epoch": 0.8661341545568281, "grad_norm": 1.008195161819458, "learning_rate": 0.0007422937918350763, "loss": 0.3079, "step": 13500 }, { "epoch": 0.898213197318192, "grad_norm": 1.29293954372406, "learning_rate": 0.0007401552575864503, "loss": 0.3047, "step": 14000 }, { "epoch": 0.930292240079556, "grad_norm": 0.9913870692253113, "learning_rate": 0.0007380167233378243, "loss": 0.3074, "step": 14500 }, { "epoch": 0.96237128284092, "grad_norm": 1.1802239418029785, "learning_rate": 0.0007358781890891983, "loss": 0.3055, "step": 15000 }, { "epoch": 0.9944503256022841, "grad_norm": 1.0421093702316284, "learning_rate": 0.0007337396548405722, "loss": 0.3106, "step": 15500 }, { "epoch": 1.0001924742565682, "eval_loss": 0.30451422929763794, "eval_runtime": 6.8665, "eval_samples_per_second": 72.817, "eval_steps_per_second": 9.175, "step": 15590 }, { "epoch": 1.0264972893208866, "grad_norm": 1.1343709230422974, "learning_rate": 0.0007316011205919463, "loss": 0.3028, "step": 16000 }, { "epoch": 1.0585763320822508, "grad_norm": 0.992735743522644, "learning_rate": 0.0007294625863433203, "loss": 0.3026, "step": 16500 }, { "epoch": 1.0906553748436147, "grad_norm": 0.9756571650505066, "learning_rate": 0.0007273240520946943, "loss": 0.3004, "step": 17000 }, { "epoch": 1.1227344176049787, "grad_norm": 1.005118489265442, "learning_rate": 0.0007251897949145656, "loss": 0.2953, "step": 17500 }, { "epoch": 1.1548134603663427, "grad_norm": 1.2004996538162231, "learning_rate": 0.0007230512606659396, "loss": 0.3039, "step": 18000 }, { "epoch": 1.1868925031277067, "grad_norm": 1.2973647117614746, "learning_rate": 0.0007209127264173135, "loss": 0.2997, "step": 18500 }, { "epoch": 1.200237384916434, "eval_loss": 0.2830166518688202, "eval_runtime": 6.4164, "eval_samples_per_second": 77.926, "eval_steps_per_second": 9.819, "step": 18708 }, { "epoch": 1.2189715458890706, "grad_norm": 1.2329021692276, "learning_rate": 0.0007187741921686877, "loss": 0.2972, "step": 19000 }, { "epoch": 1.2510505886504346, "grad_norm": 0.8207685947418213, "learning_rate": 0.0007166399349885588, "loss": 0.2979, "step": 19500 }, { "epoch": 1.2831296314117986, "grad_norm": 1.1549195051193237, "learning_rate": 0.000714501400739933, "loss": 0.2982, "step": 20000 }, { "epoch": 1.3152086741731628, "grad_norm": 1.0019187927246094, "learning_rate": 0.0007123628664913069, "loss": 0.2975, "step": 20500 }, { "epoch": 1.3472877169345265, "grad_norm": 1.023240327835083, "learning_rate": 0.0007102243322426809, "loss": 0.2957, "step": 21000 }, { "epoch": 1.3793667596958907, "grad_norm": 1.0395528078079224, "learning_rate": 0.0007080900750625522, "loss": 0.2957, "step": 21500 }, { "epoch": 1.4002822955763001, "eval_loss": 0.28411924839019775, "eval_runtime": 8.0825, "eval_samples_per_second": 61.862, "eval_steps_per_second": 7.795, "step": 21826 }, { "epoch": 1.4114458024572547, "grad_norm": 1.2371494770050049, "learning_rate": 0.0007059515408139262, "loss": 0.2948, "step": 22000 }, { "epoch": 1.4435248452186187, "grad_norm": 1.1509361267089844, "learning_rate": 0.0007038130065653001, "loss": 0.2939, "step": 22500 }, { "epoch": 1.4756038879799827, "grad_norm": 0.9148961305618286, "learning_rate": 0.0007016744723166742, "loss": 0.2945, "step": 23000 }, { "epoch": 1.5076829307413466, "grad_norm": 1.301382064819336, "learning_rate": 0.0006995359380680482, "loss": 0.2935, "step": 23500 }, { "epoch": 1.5397619735027108, "grad_norm": 1.1036344766616821, "learning_rate": 0.0006974059579564167, "loss": 0.2946, "step": 24000 }, { "epoch": 1.5718410162640746, "grad_norm": 1.6663960218429565, "learning_rate": 0.0006952674237077907, "loss": 0.2922, "step": 24500 }, { "epoch": 1.600327206236166, "eval_loss": 0.2768917977809906, "eval_runtime": 6.447, "eval_samples_per_second": 77.556, "eval_steps_per_second": 9.772, "step": 24944 }, { "epoch": 1.6039200590254388, "grad_norm": 1.0343741178512573, "learning_rate": 0.0006931288894591647, "loss": 0.2928, "step": 25000 }, { "epoch": 1.6359991017868025, "grad_norm": 1.3965007066726685, "learning_rate": 0.0006909903552105388, "loss": 0.2923, "step": 25500 }, { "epoch": 1.6680781445481667, "grad_norm": 1.1331425905227661, "learning_rate": 0.0006888518209619127, "loss": 0.2936, "step": 26000 }, { "epoch": 1.7001571873095307, "grad_norm": 1.0256651639938354, "learning_rate": 0.0006867132867132868, "loss": 0.2901, "step": 26500 }, { "epoch": 1.7322362300708947, "grad_norm": 1.1987167596817017, "learning_rate": 0.0006845747524646608, "loss": 0.2932, "step": 27000 }, { "epoch": 1.7643152728322586, "grad_norm": 0.9224876165390015, "learning_rate": 0.0006824362182160348, "loss": 0.2957, "step": 27500 }, { "epoch": 1.7963943155936226, "grad_norm": 1.1595081090927124, "learning_rate": 0.0006802976839674088, "loss": 0.2895, "step": 28000 }, { "epoch": 1.8003721168960318, "eval_loss": 0.2766253650188446, "eval_runtime": 6.995, "eval_samples_per_second": 71.48, "eval_steps_per_second": 9.006, "step": 28062 }, { "epoch": 1.8284733583549868, "grad_norm": 1.0937904119491577, "learning_rate": 0.0006781634267872801, "loss": 0.2908, "step": 28500 }, { "epoch": 1.8605524011163506, "grad_norm": 1.6129448413848877, "learning_rate": 0.0006760291696071513, "loss": 0.2865, "step": 29000 }, { "epoch": 1.8926314438777148, "grad_norm": 1.5731992721557617, "learning_rate": 0.0006738906353585254, "loss": 0.2865, "step": 29500 }, { "epoch": 1.9247104866390787, "grad_norm": 1.1734341382980347, "learning_rate": 0.0006717521011098993, "loss": 0.2858, "step": 30000 }, { "epoch": 1.9567895294004427, "grad_norm": 1.277829647064209, "learning_rate": 0.0006696135668612733, "loss": 0.2878, "step": 30500 }, { "epoch": 1.9888685721618067, "grad_norm": 1.2518751621246338, "learning_rate": 0.0006674793096811446, "loss": 0.2847, "step": 31000 }, { "epoch": 2.0003849485131364, "eval_loss": 0.2660098075866699, "eval_runtime": 8.1469, "eval_samples_per_second": 61.373, "eval_steps_per_second": 7.733, "step": 31180 }, { "epoch": 2.0209155358804094, "grad_norm": 1.0406543016433716, "learning_rate": 0.0006653407754325186, "loss": 0.2821, "step": 31500 }, { "epoch": 2.052994578641773, "grad_norm": 1.1324297189712524, "learning_rate": 0.0006632022411838926, "loss": 0.2848, "step": 32000 }, { "epoch": 2.0850736214031373, "grad_norm": 1.0970991849899292, "learning_rate": 0.0006610637069352665, "loss": 0.2815, "step": 32500 }, { "epoch": 2.1171526641645015, "grad_norm": 1.0462357997894287, "learning_rate": 0.0006589251726866407, "loss": 0.2825, "step": 33000 }, { "epoch": 2.1492317069258653, "grad_norm": 1.122344732284546, "learning_rate": 0.0006567866384380146, "loss": 0.2816, "step": 33500 }, { "epoch": 2.1813107496872295, "grad_norm": 1.108782410621643, "learning_rate": 0.0006546481041893886, "loss": 0.2826, "step": 34000 }, { "epoch": 2.2004298591730023, "eval_loss": 0.2733038365840912, "eval_runtime": 7.489, "eval_samples_per_second": 66.765, "eval_steps_per_second": 8.412, "step": 34298 }, { "epoch": 2.2133897924485932, "grad_norm": 1.092699646949768, "learning_rate": 0.0006525095699407627, "loss": 0.2836, "step": 34500 }, { "epoch": 2.2454688352099574, "grad_norm": 1.0987275838851929, "learning_rate": 0.0006503753127606339, "loss": 0.2779, "step": 35000 }, { "epoch": 2.277547877971321, "grad_norm": 1.1260478496551514, "learning_rate": 0.0006482367785120078, "loss": 0.2788, "step": 35500 }, { "epoch": 2.3096269207326854, "grad_norm": 1.2035844326019287, "learning_rate": 0.0006460982442633819, "loss": 0.284, "step": 36000 }, { "epoch": 2.341705963494049, "grad_norm": 1.0569308996200562, "learning_rate": 0.0006439597100147559, "loss": 0.2812, "step": 36500 }, { "epoch": 2.3737850062554133, "grad_norm": 0.8856455683708191, "learning_rate": 0.0006418254528346272, "loss": 0.2767, "step": 37000 }, { "epoch": 2.400474769832868, "eval_loss": 0.2658798098564148, "eval_runtime": 8.0192, "eval_samples_per_second": 62.35, "eval_steps_per_second": 7.856, "step": 37416 }, { "epoch": 2.4058640490167775, "grad_norm": 0.9563117027282715, "learning_rate": 0.0006396911956544984, "loss": 0.2787, "step": 37500 }, { "epoch": 2.4379430917781413, "grad_norm": 0.9825711250305176, "learning_rate": 0.0006375526614058725, "loss": 0.2775, "step": 38000 }, { "epoch": 2.4700221345395055, "grad_norm": 1.1553630828857422, "learning_rate": 0.0006354141271572465, "loss": 0.2779, "step": 38500 }, { "epoch": 2.5021011773008692, "grad_norm": 1.1160862445831299, "learning_rate": 0.0006332755929086204, "loss": 0.2819, "step": 39000 }, { "epoch": 2.5341802200622334, "grad_norm": 1.2663159370422363, "learning_rate": 0.0006311370586599945, "loss": 0.2797, "step": 39500 }, { "epoch": 2.566259262823597, "grad_norm": 1.028053641319275, "learning_rate": 0.0006289985244113685, "loss": 0.2776, "step": 40000 }, { "epoch": 2.5983383055849614, "grad_norm": 1.0482547283172607, "learning_rate": 0.0006268642672312397, "loss": 0.2737, "step": 40500 }, { "epoch": 2.600519680492734, "eval_loss": 0.26375094056129456, "eval_runtime": 7.2264, "eval_samples_per_second": 69.19, "eval_steps_per_second": 8.718, "step": 40534 }, { "epoch": 2.6304173483463256, "grad_norm": 1.0435316562652588, "learning_rate": 0.0006247257329826138, "loss": 0.2783, "step": 41000 }, { "epoch": 2.6624963911076893, "grad_norm": 0.866886556148529, "learning_rate": 0.0006225871987339878, "loss": 0.2738, "step": 41500 }, { "epoch": 2.694575433869053, "grad_norm": 1.069162368774414, "learning_rate": 0.0006204486644853617, "loss": 0.2783, "step": 42000 }, { "epoch": 2.7266544766304173, "grad_norm": 1.450871229171753, "learning_rate": 0.0006183101302367357, "loss": 0.275, "step": 42500 }, { "epoch": 2.7587335193917815, "grad_norm": 1.0573506355285645, "learning_rate": 0.000616175873056607, "loss": 0.2762, "step": 43000 }, { "epoch": 2.790812562153145, "grad_norm": 1.1295759677886963, "learning_rate": 0.000614037338807981, "loss": 0.2734, "step": 43500 }, { "epoch": 2.8005645911526003, "eval_loss": 0.26422494649887085, "eval_runtime": 8.0132, "eval_samples_per_second": 62.397, "eval_steps_per_second": 7.862, "step": 43652 }, { "epoch": 2.8228916049145094, "grad_norm": 1.1608061790466309, "learning_rate": 0.000611898804559355, "loss": 0.272, "step": 44000 }, { "epoch": 2.8549706476758736, "grad_norm": 1.2949804067611694, "learning_rate": 0.000609760270310729, "loss": 0.272, "step": 44500 }, { "epoch": 2.8870496904372374, "grad_norm": 0.9073989391326904, "learning_rate": 0.0006076217360621031, "loss": 0.2744, "step": 45000 }, { "epoch": 2.919128733198601, "grad_norm": 0.8703099489212036, "learning_rate": 0.0006054874788819743, "loss": 0.2762, "step": 45500 }, { "epoch": 2.9512077759599653, "grad_norm": 1.0385627746582031, "learning_rate": 0.0006033489446333484, "loss": 0.2732, "step": 46000 }, { "epoch": 2.9832868187213295, "grad_norm": 1.1464110612869263, "learning_rate": 0.0006012104103847223, "loss": 0.2701, "step": 46500 }, { "epoch": 3.0005774227697044, "eval_loss": 0.2622199058532715, "eval_runtime": 6.3897, "eval_samples_per_second": 78.251, "eval_steps_per_second": 9.86, "step": 46770 }, { "epoch": 3.015333782439932, "grad_norm": 1.2512695789337158, "learning_rate": 0.0005990718761360964, "loss": 0.2697, "step": 47000 }, { "epoch": 3.047412825201296, "grad_norm": 0.7882702946662903, "learning_rate": 0.0005969376189559676, "loss": 0.2685, "step": 47500 }, { "epoch": 3.07949186796266, "grad_norm": 1.282387614250183, "learning_rate": 0.0005947990847073416, "loss": 0.2674, "step": 48000 }, { "epoch": 3.111570910724024, "grad_norm": 1.4220795631408691, "learning_rate": 0.0005926605504587157, "loss": 0.2676, "step": 48500 }, { "epoch": 3.143649953485388, "grad_norm": 1.0576750040054321, "learning_rate": 0.0005905220162100896, "loss": 0.269, "step": 49000 }, { "epoch": 3.175728996246752, "grad_norm": 1.0533617734909058, "learning_rate": 0.0005883834819614636, "loss": 0.2629, "step": 49500 }, { "epoch": 3.2006223334295703, "eval_loss": 0.2548312246799469, "eval_runtime": 6.5962, "eval_samples_per_second": 75.802, "eval_steps_per_second": 9.551, "step": 49888 }, { "epoch": 3.207808039008116, "grad_norm": 1.1989775896072388, "learning_rate": 0.0005862449477128377, "loss": 0.2666, "step": 50000 }, { "epoch": 3.23988708176948, "grad_norm": 1.0356203317642212, "learning_rate": 0.0005841064134642117, "loss": 0.2622, "step": 50500 }, { "epoch": 3.2719661245308442, "grad_norm": 1.0133503675460815, "learning_rate": 0.0005819721562840828, "loss": 0.2671, "step": 51000 }, { "epoch": 3.304045167292208, "grad_norm": 1.1607108116149902, "learning_rate": 0.000579833622035457, "loss": 0.2633, "step": 51500 }, { "epoch": 3.336124210053572, "grad_norm": 1.0809621810913086, "learning_rate": 0.0005776950877868309, "loss": 0.267, "step": 52000 }, { "epoch": 3.368203252814936, "grad_norm": 1.1140522956848145, "learning_rate": 0.0005755565535382049, "loss": 0.2655, "step": 52500 }, { "epoch": 3.4002822955763, "grad_norm": 1.4667320251464844, "learning_rate": 0.0005734222963580762, "loss": 0.2652, "step": 53000 }, { "epoch": 3.4006672440894365, "eval_loss": 0.25107353925704956, "eval_runtime": 8.1009, "eval_samples_per_second": 61.721, "eval_steps_per_second": 7.777, "step": 53006 }, { "epoch": 3.432361338337664, "grad_norm": 1.0906621217727661, "learning_rate": 0.0005712837621094502, "loss": 0.2633, "step": 53500 }, { "epoch": 3.464440381099028, "grad_norm": 1.7066080570220947, "learning_rate": 0.0005691452278608242, "loss": 0.2624, "step": 54000 }, { "epoch": 3.496519423860392, "grad_norm": 1.1421736478805542, "learning_rate": 0.0005670066936121983, "loss": 0.267, "step": 54500 }, { "epoch": 3.528598466621756, "grad_norm": 0.9186555743217468, "learning_rate": 0.0005648681593635722, "loss": 0.2646, "step": 55000 }, { "epoch": 3.5606775093831198, "grad_norm": 0.8647829294204712, "learning_rate": 0.0005627339021834436, "loss": 0.2614, "step": 55500 }, { "epoch": 3.592756552144484, "grad_norm": 0.9938694834709167, "learning_rate": 0.0005605953679348175, "loss": 0.263, "step": 56000 }, { "epoch": 3.6007121547493024, "eval_loss": 0.2563716173171997, "eval_runtime": 6.467, "eval_samples_per_second": 77.315, "eval_steps_per_second": 9.742, "step": 56124 }, { "epoch": 3.624835594905848, "grad_norm": 0.8723744750022888, "learning_rate": 0.0005584568336861915, "loss": 0.2612, "step": 56500 }, { "epoch": 3.656914637667212, "grad_norm": 1.0959194898605347, "learning_rate": 0.0005563182994375655, "loss": 0.2611, "step": 57000 }, { "epoch": 3.688993680428576, "grad_norm": 1.2405571937561035, "learning_rate": 0.0005541797651889395, "loss": 0.2635, "step": 57500 }, { "epoch": 3.72107272318994, "grad_norm": 1.137342929840088, "learning_rate": 0.0005520412309403136, "loss": 0.2595, "step": 58000 }, { "epoch": 3.753151765951304, "grad_norm": 0.8755650520324707, "learning_rate": 0.0005499026966916875, "loss": 0.2619, "step": 58500 }, { "epoch": 3.785230808712668, "grad_norm": 1.2968310117721558, "learning_rate": 0.0005477641624430616, "loss": 0.2612, "step": 59000 }, { "epoch": 3.8007570654091682, "eval_loss": 0.2506495714187622, "eval_runtime": 8.1115, "eval_samples_per_second": 61.641, "eval_steps_per_second": 7.767, "step": 59242 }, { "epoch": 3.817309851474032, "grad_norm": 0.9842492938041687, "learning_rate": 0.0005456299052629328, "loss": 0.2624, "step": 59500 }, { "epoch": 3.849388894235396, "grad_norm": 0.8897719979286194, "learning_rate": 0.0005434913710143068, "loss": 0.2573, "step": 60000 }, { "epoch": 3.88146793699676, "grad_norm": 0.9902140498161316, "learning_rate": 0.0005413528367656809, "loss": 0.2624, "step": 60500 }, { "epoch": 3.913546979758124, "grad_norm": 0.9460390210151672, "learning_rate": 0.0005392185795855521, "loss": 0.2582, "step": 61000 }, { "epoch": 3.945626022519488, "grad_norm": 1.1403892040252686, "learning_rate": 0.0005370800453369261, "loss": 0.2627, "step": 61500 }, { "epoch": 3.977705065280852, "grad_norm": 0.987301766872406, "learning_rate": 0.0005349415110883001, "loss": 0.2616, "step": 62000 }, { "epoch": 4.000769897026273, "eval_loss": 0.2439550757408142, "eval_runtime": 7.3235, "eval_samples_per_second": 68.274, "eval_steps_per_second": 8.602, "step": 62360 }, { "epoch": 4.009752028999455, "grad_norm": 0.9634618163108826, "learning_rate": 0.0005328029768396741, "loss": 0.2522, "step": 62500 }, { "epoch": 4.041831071760819, "grad_norm": 1.0599830150604248, "learning_rate": 0.0005306687196595454, "loss": 0.2548, "step": 63000 }, { "epoch": 4.0739101145221825, "grad_norm": 1.1028203964233398, "learning_rate": 0.0005285301854109194, "loss": 0.2529, "step": 63500 }, { "epoch": 4.105989157283546, "grad_norm": 1.1508251428604126, "learning_rate": 0.0005263916511622933, "loss": 0.2539, "step": 64000 }, { "epoch": 4.138068200044911, "grad_norm": 1.0701543092727661, "learning_rate": 0.0005242531169136675, "loss": 0.2502, "step": 64500 }, { "epoch": 4.170147242806275, "grad_norm": 1.0984095335006714, "learning_rate": 0.0005221145826650414, "loss": 0.2574, "step": 65000 }, { "epoch": 4.200814807686139, "eval_loss": 0.24038133025169373, "eval_runtime": 6.2912, "eval_samples_per_second": 79.476, "eval_steps_per_second": 10.014, "step": 65478 }, { "epoch": 4.202226285567638, "grad_norm": 0.9908430576324463, "learning_rate": 0.0005199803254849126, "loss": 0.2564, "step": 65500 }, { "epoch": 4.234305328329003, "grad_norm": 1.0797010660171509, "learning_rate": 0.0005178417912362867, "loss": 0.2568, "step": 66000 }, { "epoch": 4.266384371090367, "grad_norm": 1.3110967874526978, "learning_rate": 0.0005157032569876607, "loss": 0.2546, "step": 66500 }, { "epoch": 4.298463413851731, "grad_norm": 1.22694993019104, "learning_rate": 0.0005135647227390347, "loss": 0.253, "step": 67000 }, { "epoch": 4.330542456613094, "grad_norm": 1.146044373512268, "learning_rate": 0.000511430465558906, "loss": 0.2529, "step": 67500 }, { "epoch": 4.362621499374459, "grad_norm": 1.0950199365615845, "learning_rate": 0.0005092919313102799, "loss": 0.2506, "step": 68000 }, { "epoch": 4.394700542135823, "grad_norm": 1.1739941835403442, "learning_rate": 0.0005071533970616539, "loss": 0.2501, "step": 68500 }, { "epoch": 4.4008597183460045, "eval_loss": 0.24005259573459625, "eval_runtime": 7.6188, "eval_samples_per_second": 65.627, "eval_steps_per_second": 8.269, "step": 68596 }, { "epoch": 4.4267795848971865, "grad_norm": 1.3007426261901855, "learning_rate": 0.000505014862813028, "loss": 0.2507, "step": 69000 }, { "epoch": 4.45885862765855, "grad_norm": 1.0202151536941528, "learning_rate": 0.0005028763285644019, "loss": 0.2535, "step": 69500 }, { "epoch": 4.490937670419915, "grad_norm": 1.5328317880630493, "learning_rate": 0.000500737794315776, "loss": 0.2522, "step": 70000 }, { "epoch": 4.523016713181279, "grad_norm": 1.0239914655685425, "learning_rate": 0.0004986035371356472, "loss": 0.2496, "step": 70500 }, { "epoch": 4.555095755942642, "grad_norm": 0.9700740575790405, "learning_rate": 0.0004964650028870213, "loss": 0.2512, "step": 71000 }, { "epoch": 4.587174798704007, "grad_norm": 1.2554644346237183, "learning_rate": 0.0004943264686383952, "loss": 0.2506, "step": 71500 }, { "epoch": 4.60090462900587, "eval_loss": 0.24138595163822174, "eval_runtime": 8.1086, "eval_samples_per_second": 61.663, "eval_steps_per_second": 7.77, "step": 71714 }, { "epoch": 4.619253841465371, "grad_norm": 1.5016344785690308, "learning_rate": 0.0004921879343897693, "loss": 0.2471, "step": 72000 }, { "epoch": 4.6513328842267345, "grad_norm": 1.251976490020752, "learning_rate": 0.0004900494001411433, "loss": 0.2531, "step": 72500 }, { "epoch": 4.683411926988098, "grad_norm": 1.0139933824539185, "learning_rate": 0.0004879151429610145, "loss": 0.2488, "step": 73000 }, { "epoch": 4.715490969749463, "grad_norm": 0.9058252573013306, "learning_rate": 0.0004857766087123886, "loss": 0.2478, "step": 73500 }, { "epoch": 4.747570012510827, "grad_norm": 0.8362458944320679, "learning_rate": 0.00048363807446376257, "loss": 0.2495, "step": 74000 }, { "epoch": 4.77964905527219, "grad_norm": 0.996514081954956, "learning_rate": 0.00048149954021513655, "loss": 0.251, "step": 74500 }, { "epoch": 4.800949539665736, "eval_loss": 0.23780158162117004, "eval_runtime": 6.3001, "eval_samples_per_second": 79.364, "eval_steps_per_second": 10.0, "step": 74832 }, { "epoch": 4.811728098033555, "grad_norm": 1.115113377571106, "learning_rate": 0.00047936528303500785, "loss": 0.2477, "step": 75000 }, { "epoch": 4.843807140794919, "grad_norm": 1.2068469524383545, "learning_rate": 0.00047722674878638183, "loss": 0.2462, "step": 75500 }, { "epoch": 4.875886183556283, "grad_norm": 1.0453870296478271, "learning_rate": 0.0004750882145377558, "loss": 0.2456, "step": 76000 }, { "epoch": 4.907965226317646, "grad_norm": 1.0999488830566406, "learning_rate": 0.0004729496802891299, "loss": 0.2492, "step": 76500 }, { "epoch": 4.940044269079011, "grad_norm": 0.8874688148498535, "learning_rate": 0.00047081114604050387, "loss": 0.2467, "step": 77000 }, { "epoch": 4.972123311840375, "grad_norm": 1.060391902923584, "learning_rate": 0.0004686726117918779, "loss": 0.2456, "step": 77500 }, { "epoch": 5.000962371282841, "eval_loss": 0.2328067272901535, "eval_runtime": 6.3234, "eval_samples_per_second": 79.071, "eval_steps_per_second": 9.963, "step": 77950 }, { "epoch": 5.004170275558978, "grad_norm": 1.7267987728118896, "learning_rate": 0.00046653835461174916, "loss": 0.2407, "step": 78000 }, { "epoch": 5.036249318320341, "grad_norm": 0.986152172088623, "learning_rate": 0.00046439982036312313, "loss": 0.2428, "step": 78500 }, { "epoch": 5.068328361081705, "grad_norm": 1.4283899068832397, "learning_rate": 0.00046226128611449716, "loss": 0.2451, "step": 79000 }, { "epoch": 5.100407403843069, "grad_norm": 1.2393862009048462, "learning_rate": 0.00046012275186587114, "loss": 0.2414, "step": 79500 }, { "epoch": 5.1324864466044335, "grad_norm": 0.9746513962745667, "learning_rate": 0.00045798849468574245, "loss": 0.2421, "step": 80000 }, { "epoch": 5.164565489365797, "grad_norm": 1.897164225578308, "learning_rate": 0.0004558499604371164, "loss": 0.2408, "step": 80500 }, { "epoch": 5.196644532127161, "grad_norm": 1.1321191787719727, "learning_rate": 0.0004537114261884904, "loss": 0.2418, "step": 81000 }, { "epoch": 5.201007281942707, "eval_loss": 0.237562894821167, "eval_runtime": 6.3075, "eval_samples_per_second": 79.271, "eval_steps_per_second": 9.988, "step": 81068 }, { "epoch": 5.228723574888526, "grad_norm": 1.1551389694213867, "learning_rate": 0.0004515728919398645, "loss": 0.2417, "step": 81500 }, { "epoch": 5.260802617649889, "grad_norm": 1.0562560558319092, "learning_rate": 0.00044943435769123847, "loss": 0.241, "step": 82000 }, { "epoch": 5.292881660411253, "grad_norm": 1.0487314462661743, "learning_rate": 0.00044730010051110967, "loss": 0.2413, "step": 82500 }, { "epoch": 5.324960703172617, "grad_norm": 1.2570559978485107, "learning_rate": 0.00044516156626248375, "loss": 0.2405, "step": 83000 }, { "epoch": 5.357039745933982, "grad_norm": 1.0819013118743896, "learning_rate": 0.00044302303201385773, "loss": 0.2414, "step": 83500 }, { "epoch": 5.389118788695345, "grad_norm": 0.9494850039482117, "learning_rate": 0.00044088449776523176, "loss": 0.2395, "step": 84000 }, { "epoch": 5.401052192602573, "eval_loss": 0.2342357486486435, "eval_runtime": 6.4142, "eval_samples_per_second": 77.952, "eval_steps_per_second": 9.822, "step": 84186 }, { "epoch": 5.421197831456709, "grad_norm": 1.060643196105957, "learning_rate": 0.00043874596351660574, "loss": 0.2388, "step": 84500 }, { "epoch": 5.453276874218074, "grad_norm": 1.2068923711776733, "learning_rate": 0.0004366074292679797, "loss": 0.2387, "step": 85000 }, { "epoch": 5.4853559169794375, "grad_norm": 1.0651592016220093, "learning_rate": 0.000434473172087851, "loss": 0.2406, "step": 85500 }, { "epoch": 5.517434959740801, "grad_norm": 0.8992927670478821, "learning_rate": 0.000432334637839225, "loss": 0.2365, "step": 86000 }, { "epoch": 5.549514002502165, "grad_norm": 1.0418347120285034, "learning_rate": 0.000430196103590599, "loss": 0.2381, "step": 86500 }, { "epoch": 5.58159304526353, "grad_norm": 1.3282594680786133, "learning_rate": 0.00042805756934197306, "loss": 0.2364, "step": 87000 }, { "epoch": 5.601097103262439, "eval_loss": 0.22406485676765442, "eval_runtime": 8.0745, "eval_samples_per_second": 61.923, "eval_steps_per_second": 7.802, "step": 87304 }, { "epoch": 5.613672088024893, "grad_norm": 1.2747199535369873, "learning_rate": 0.00042592331216184426, "loss": 0.2379, "step": 87500 }, { "epoch": 5.645751130786257, "grad_norm": 0.9160233736038208, "learning_rate": 0.00042378477791321835, "loss": 0.235, "step": 88000 }, { "epoch": 5.677830173547621, "grad_norm": 1.2351807355880737, "learning_rate": 0.0004216462436645923, "loss": 0.2371, "step": 88500 }, { "epoch": 5.7099092163089855, "grad_norm": 0.9699601531028748, "learning_rate": 0.0004195077094159663, "loss": 0.2359, "step": 89000 }, { "epoch": 5.741988259070349, "grad_norm": 0.8815991282463074, "learning_rate": 0.00041736917516734033, "loss": 0.2362, "step": 89500 }, { "epoch": 5.774067301831713, "grad_norm": 0.9497590661048889, "learning_rate": 0.0004152306409187143, "loss": 0.2334, "step": 90000 }, { "epoch": 5.801142013922305, "eval_loss": 0.22982336580753326, "eval_runtime": 6.3324, "eval_samples_per_second": 78.959, "eval_steps_per_second": 9.949, "step": 90422 }, { "epoch": 5.806146344593078, "grad_norm": 0.9176979660987854, "learning_rate": 0.0004130921066700883, "loss": 0.2342, "step": 90500 }, { "epoch": 5.838225387354441, "grad_norm": 0.910997211933136, "learning_rate": 0.0004109578494899596, "loss": 0.2326, "step": 91000 }, { "epoch": 5.870304430115805, "grad_norm": 1.2072116136550903, "learning_rate": 0.00040881931524133357, "loss": 0.2351, "step": 91500 }, { "epoch": 5.902383472877169, "grad_norm": 1.1762892007827759, "learning_rate": 0.00040668078099270766, "loss": 0.2323, "step": 92000 }, { "epoch": 5.9344625156385336, "grad_norm": 1.4378530979156494, "learning_rate": 0.00040454224674408163, "loss": 0.2347, "step": 92500 }, { "epoch": 5.966541558399897, "grad_norm": 1.3818738460540771, "learning_rate": 0.0004024037124954556, "loss": 0.234, "step": 93000 }, { "epoch": 5.998620601161261, "grad_norm": 1.3341606855392456, "learning_rate": 0.0004002651782468297, "loss": 0.2309, "step": 93500 }, { "epoch": 6.001154845539409, "eval_loss": 0.22399960458278656, "eval_runtime": 6.5776, "eval_samples_per_second": 76.016, "eval_steps_per_second": 9.578, "step": 93540 }, { "epoch": 6.030667564879864, "grad_norm": 1.144217848777771, "learning_rate": 0.00039812664399820367, "loss": 0.2278, "step": 94000 }, { "epoch": 6.062746607641228, "grad_norm": 1.1111032962799072, "learning_rate": 0.00039598810974957765, "loss": 0.2308, "step": 94500 }, { "epoch": 6.094825650402592, "grad_norm": 1.0390712022781372, "learning_rate": 0.00039385385256944896, "loss": 0.2262, "step": 95000 }, { "epoch": 6.126904693163956, "grad_norm": 0.9997388124465942, "learning_rate": 0.00039171531832082293, "loss": 0.2296, "step": 95500 }, { "epoch": 6.15898373592532, "grad_norm": 1.1013994216918945, "learning_rate": 0.0003895767840721969, "loss": 0.2311, "step": 96000 }, { "epoch": 6.191062778686684, "grad_norm": 1.474907636642456, "learning_rate": 0.00038743824982357094, "loss": 0.2291, "step": 96500 }, { "epoch": 6.201199756199275, "eval_loss": 0.2199297547340393, "eval_runtime": 6.3926, "eval_samples_per_second": 78.215, "eval_steps_per_second": 9.855, "step": 96658 }, { "epoch": 6.223141821448048, "grad_norm": 1.4461069107055664, "learning_rate": 0.0003852997155749449, "loss": 0.2281, "step": 97000 }, { "epoch": 6.255220864209412, "grad_norm": 1.023522138595581, "learning_rate": 0.00038316118132631895, "loss": 0.2247, "step": 97500 }, { "epoch": 6.287299906970776, "grad_norm": 1.3563698530197144, "learning_rate": 0.000381022647077693, "loss": 0.2258, "step": 98000 }, { "epoch": 6.31937894973214, "grad_norm": 0.9750008583068848, "learning_rate": 0.000378884112829067, "loss": 0.2248, "step": 98500 }, { "epoch": 6.351457992493504, "grad_norm": 1.0582396984100342, "learning_rate": 0.00037674985564893827, "loss": 0.2265, "step": 99000 }, { "epoch": 6.383537035254868, "grad_norm": 1.0544843673706055, "learning_rate": 0.0003746155984688095, "loss": 0.2283, "step": 99500 }, { "epoch": 6.4012446668591405, "eval_loss": 0.21453991532325745, "eval_runtime": 8.069, "eval_samples_per_second": 61.965, "eval_steps_per_second": 7.808, "step": 99776 }, { "epoch": 6.415616078016232, "grad_norm": 1.091102957725525, "learning_rate": 0.0003724770642201835, "loss": 0.2235, "step": 100000 }, { "epoch": 6.447695120777596, "grad_norm": 1.0541837215423584, "learning_rate": 0.00037033852997155753, "loss": 0.2311, "step": 100500 }, { "epoch": 6.47977416353896, "grad_norm": 0.8617345690727234, "learning_rate": 0.00036819999572293156, "loss": 0.2265, "step": 101000 }, { "epoch": 6.511853206300324, "grad_norm": 1.0250686407089233, "learning_rate": 0.00036606146147430554, "loss": 0.2252, "step": 101500 }, { "epoch": 6.5439322490616885, "grad_norm": 0.8736539483070374, "learning_rate": 0.0003639272042941768, "loss": 0.2255, "step": 102000 }, { "epoch": 6.576011291823052, "grad_norm": 0.8904435634613037, "learning_rate": 0.00036178867004555077, "loss": 0.2208, "step": 102500 }, { "epoch": 6.601289577519006, "eval_loss": 0.21710003912448883, "eval_runtime": 8.0277, "eval_samples_per_second": 62.284, "eval_steps_per_second": 7.848, "step": 102894 }, { "epoch": 6.608090334584416, "grad_norm": 1.0822809934616089, "learning_rate": 0.0003596501357969248, "loss": 0.2236, "step": 103000 }, { "epoch": 6.64016937734578, "grad_norm": 1.1805315017700195, "learning_rate": 0.0003575116015482988, "loss": 0.2223, "step": 103500 }, { "epoch": 6.672248420107144, "grad_norm": 1.241454005241394, "learning_rate": 0.0003553730672996728, "loss": 0.2248, "step": 104000 }, { "epoch": 6.704327462868508, "grad_norm": 0.9509809613227844, "learning_rate": 0.00035323881011954406, "loss": 0.2242, "step": 104500 }, { "epoch": 6.736406505629872, "grad_norm": 0.9897216558456421, "learning_rate": 0.0003511002758709181, "loss": 0.225, "step": 105000 }, { "epoch": 6.768485548391236, "grad_norm": 0.9336084127426147, "learning_rate": 0.0003489617416222921, "loss": 0.2244, "step": 105500 }, { "epoch": 6.8005645911526, "grad_norm": 0.9609190821647644, "learning_rate": 0.0003468274844421634, "loss": 0.2236, "step": 106000 }, { "epoch": 6.801334488178873, "eval_loss": 0.21265725791454315, "eval_runtime": 8.153, "eval_samples_per_second": 61.327, "eval_steps_per_second": 7.727, "step": 106012 }, { "epoch": 6.832643633913964, "grad_norm": 0.9923797249794006, "learning_rate": 0.00034468895019353735, "loss": 0.2242, "step": 106500 }, { "epoch": 6.864722676675328, "grad_norm": 0.7628911137580872, "learning_rate": 0.0003425504159449114, "loss": 0.2243, "step": 107000 }, { "epoch": 6.8968017194366915, "grad_norm": 0.849974513053894, "learning_rate": 0.0003404118816962854, "loss": 0.2256, "step": 107500 }, { "epoch": 6.928880762198056, "grad_norm": 1.531506896018982, "learning_rate": 0.0003382733474476594, "loss": 0.2224, "step": 108000 }, { "epoch": 6.96095980495942, "grad_norm": 0.7624678611755371, "learning_rate": 0.00033613481319903337, "loss": 0.2195, "step": 108500 }, { "epoch": 6.993038847720784, "grad_norm": 1.0769827365875244, "learning_rate": 0.0003339962789504074, "loss": 0.2208, "step": 109000 }, { "epoch": 7.001347319795977, "eval_loss": 0.21117287874221802, "eval_runtime": 6.3976, "eval_samples_per_second": 78.155, "eval_steps_per_second": 9.847, "step": 109130 }, { "epoch": 7.025085811439387, "grad_norm": 1.1820608377456665, "learning_rate": 0.00033185774470178143, "loss": 0.2185, "step": 109500 }, { "epoch": 7.05716485420075, "grad_norm": 1.0467668771743774, "learning_rate": 0.0003297192104531554, "loss": 0.2152, "step": 110000 }, { "epoch": 7.089243896962115, "grad_norm": 0.915417492389679, "learning_rate": 0.00032758495327302666, "loss": 0.2186, "step": 110500 }, { "epoch": 7.121322939723479, "grad_norm": 1.3223015069961548, "learning_rate": 0.0003254464190244007, "loss": 0.2176, "step": 111000 }, { "epoch": 7.1534019824848425, "grad_norm": 1.0352325439453125, "learning_rate": 0.0003233078847757747, "loss": 0.2122, "step": 111500 }, { "epoch": 7.185481025246206, "grad_norm": 0.9833923578262329, "learning_rate": 0.0003211693505271487, "loss": 0.2172, "step": 112000 }, { "epoch": 7.201392230455843, "eval_loss": 0.20997634530067444, "eval_runtime": 6.2972, "eval_samples_per_second": 79.4, "eval_steps_per_second": 10.004, "step": 112248 }, { "epoch": 7.217560068007571, "grad_norm": 0.9086174964904785, "learning_rate": 0.00031903081627852273, "loss": 0.2161, "step": 112500 }, { "epoch": 7.249639110768935, "grad_norm": 0.8965845704078674, "learning_rate": 0.0003168922820298967, "loss": 0.2134, "step": 113000 }, { "epoch": 7.281718153530298, "grad_norm": 1.3317846059799194, "learning_rate": 0.0003147537477812707, "loss": 0.2167, "step": 113500 }, { "epoch": 7.313797196291663, "grad_norm": 1.0240646600723267, "learning_rate": 0.000312619490601142, "loss": 0.2143, "step": 114000 }, { "epoch": 7.345876239053027, "grad_norm": 0.8263606429100037, "learning_rate": 0.000310480956352516, "loss": 0.213, "step": 114500 }, { "epoch": 7.3779552818143905, "grad_norm": 0.957937479019165, "learning_rate": 0.00030834242210389, "loss": 0.212, "step": 115000 }, { "epoch": 7.401437141115709, "eval_loss": 0.20440179109573364, "eval_runtime": 6.4324, "eval_samples_per_second": 77.732, "eval_steps_per_second": 9.794, "step": 115366 }, { "epoch": 7.410034324575754, "grad_norm": 0.9134410619735718, "learning_rate": 0.00030620388785526404, "loss": 0.2133, "step": 115500 }, { "epoch": 7.442113367337119, "grad_norm": 0.9886873364448547, "learning_rate": 0.000304065353606638, "loss": 0.2094, "step": 116000 }, { "epoch": 7.474192410098483, "grad_norm": 1.055210828781128, "learning_rate": 0.00030192681935801204, "loss": 0.2154, "step": 116500 }, { "epoch": 7.506271452859846, "grad_norm": 0.9217848181724548, "learning_rate": 0.0002997882851093861, "loss": 0.2126, "step": 117000 }, { "epoch": 7.538350495621211, "grad_norm": 1.216321349143982, "learning_rate": 0.00029764975086076005, "loss": 0.2133, "step": 117500 }, { "epoch": 7.570429538382575, "grad_norm": 1.0500450134277344, "learning_rate": 0.0002955154936806313, "loss": 0.2111, "step": 118000 }, { "epoch": 7.601482051775575, "eval_loss": 0.2064415067434311, "eval_runtime": 6.4269, "eval_samples_per_second": 77.798, "eval_steps_per_second": 9.803, "step": 118484 }, { "epoch": 7.602508581143939, "grad_norm": 1.1116931438446045, "learning_rate": 0.00029338123650050256, "loss": 0.2097, "step": 118500 }, { "epoch": 7.634587623905302, "grad_norm": 0.8931957483291626, "learning_rate": 0.0002912427022518766, "loss": 0.2103, "step": 119000 }, { "epoch": 7.666666666666667, "grad_norm": 0.8822094202041626, "learning_rate": 0.0002891041680032506, "loss": 0.211, "step": 119500 }, { "epoch": 7.698745709428031, "grad_norm": 1.1010546684265137, "learning_rate": 0.0002869656337546246, "loss": 0.2117, "step": 120000 }, { "epoch": 7.7308247521893945, "grad_norm": 0.8556534647941589, "learning_rate": 0.0002848270995059986, "loss": 0.2152, "step": 120500 }, { "epoch": 7.762903794950759, "grad_norm": 1.0679911375045776, "learning_rate": 0.00028269284232586983, "loss": 0.2075, "step": 121000 }, { "epoch": 7.794982837712123, "grad_norm": 1.2181644439697266, "learning_rate": 0.00028055430807724386, "loss": 0.2115, "step": 121500 }, { "epoch": 7.801526962435441, "eval_loss": 0.20034563541412354, "eval_runtime": 8.0626, "eval_samples_per_second": 62.015, "eval_steps_per_second": 7.814, "step": 121602 }, { "epoch": 7.827061880473487, "grad_norm": 1.366958498954773, "learning_rate": 0.0002784157738286179, "loss": 0.2105, "step": 122000 }, { "epoch": 7.85914092323485, "grad_norm": 0.9196767807006836, "learning_rate": 0.00027627723957999187, "loss": 0.2051, "step": 122500 }, { "epoch": 7.891219965996215, "grad_norm": 0.9501635432243347, "learning_rate": 0.0002741387053313659, "loss": 0.2089, "step": 123000 }, { "epoch": 7.923299008757579, "grad_norm": 1.1504069566726685, "learning_rate": 0.00027200444815123716, "loss": 0.2087, "step": 123500 }, { "epoch": 7.9553780515189425, "grad_norm": 1.0922268629074097, "learning_rate": 0.0002698659139026112, "loss": 0.2104, "step": 124000 }, { "epoch": 7.987457094280307, "grad_norm": 1.3557689189910889, "learning_rate": 0.00026772737965398516, "loss": 0.2102, "step": 124500 }, { "epoch": 8.001539794052546, "eval_loss": 0.20050643384456635, "eval_runtime": 7.6334, "eval_samples_per_second": 65.501, "eval_steps_per_second": 8.253, "step": 124720 }, { "epoch": 8.01950405799891, "grad_norm": 1.0815114974975586, "learning_rate": 0.0002655931224738564, "loss": 0.1992, "step": 125000 }, { "epoch": 8.051583100760274, "grad_norm": 0.9353795647621155, "learning_rate": 0.00026345458822523045, "loss": 0.204, "step": 125500 }, { "epoch": 8.083662143521638, "grad_norm": 1.0060148239135742, "learning_rate": 0.0002613160539766045, "loss": 0.2009, "step": 126000 }, { "epoch": 8.115741186283001, "grad_norm": 0.8582931160926819, "learning_rate": 0.00025917751972797846, "loss": 0.2026, "step": 126500 }, { "epoch": 8.147820229044365, "grad_norm": 1.093616008758545, "learning_rate": 0.00025703898547935243, "loss": 0.2051, "step": 127000 }, { "epoch": 8.179899271805729, "grad_norm": 0.9444632530212402, "learning_rate": 0.00025490045123072646, "loss": 0.2028, "step": 127500 }, { "epoch": 8.201584704712412, "eval_loss": 0.19587960839271545, "eval_runtime": 6.3939, "eval_samples_per_second": 78.199, "eval_steps_per_second": 9.853, "step": 127838 }, { "epoch": 8.211978314567093, "grad_norm": 1.0667012929916382, "learning_rate": 0.0002527619169821005, "loss": 0.2047, "step": 128000 }, { "epoch": 8.244057357328458, "grad_norm": 0.8280072212219238, "learning_rate": 0.0002506233827334745, "loss": 0.2025, "step": 128500 }, { "epoch": 8.276136400089822, "grad_norm": 1.2170900106430054, "learning_rate": 0.0002484848484848485, "loss": 0.2048, "step": 129000 }, { "epoch": 8.308215442851186, "grad_norm": 1.0770434141159058, "learning_rate": 0.00024635059130471976, "loss": 0.2059, "step": 129500 }, { "epoch": 8.34029448561255, "grad_norm": 1.0188692808151245, "learning_rate": 0.0002442120570560938, "loss": 0.1993, "step": 130000 }, { "epoch": 8.372373528373913, "grad_norm": 0.9579035639762878, "learning_rate": 0.00024207352280746777, "loss": 0.2011, "step": 130500 }, { "epoch": 8.401629615372277, "eval_loss": 0.1946863979101181, "eval_runtime": 7.4225, "eval_samples_per_second": 67.362, "eval_steps_per_second": 8.488, "step": 130956 }, { "epoch": 8.404452571135277, "grad_norm": 1.2595388889312744, "learning_rate": 0.00023993498855884177, "loss": 0.2005, "step": 131000 }, { "epoch": 8.43653161389664, "grad_norm": 0.7708470225334167, "learning_rate": 0.0002377964543102158, "loss": 0.1979, "step": 131500 }, { "epoch": 8.468610656658006, "grad_norm": 0.985543966293335, "learning_rate": 0.00023565792006158978, "loss": 0.2005, "step": 132000 }, { "epoch": 8.50068969941937, "grad_norm": 1.2451292276382446, "learning_rate": 0.0002335193858129638, "loss": 0.1993, "step": 132500 }, { "epoch": 8.532768742180734, "grad_norm": 0.9347227811813354, "learning_rate": 0.00023138085156433781, "loss": 0.2038, "step": 133000 }, { "epoch": 8.564847784942097, "grad_norm": 1.0957634449005127, "learning_rate": 0.00022924231731571184, "loss": 0.2012, "step": 133500 }, { "epoch": 8.596926827703461, "grad_norm": 1.079504132270813, "learning_rate": 0.0002271080601355831, "loss": 0.1967, "step": 134000 }, { "epoch": 8.601674526032143, "eval_loss": 0.19414910674095154, "eval_runtime": 8.1556, "eval_samples_per_second": 61.308, "eval_steps_per_second": 7.725, "step": 134074 }, { "epoch": 8.629005870464825, "grad_norm": 1.141271710395813, "learning_rate": 0.00022497380295545435, "loss": 0.1989, "step": 134500 }, { "epoch": 8.661084913226189, "grad_norm": 0.9533087611198425, "learning_rate": 0.00022283526870682833, "loss": 0.1981, "step": 135000 }, { "epoch": 8.693163955987554, "grad_norm": 0.803256094455719, "learning_rate": 0.00022069673445820236, "loss": 0.1964, "step": 135500 }, { "epoch": 8.725242998748918, "grad_norm": 1.0055620670318604, "learning_rate": 0.0002185582002095764, "loss": 0.1995, "step": 136000 }, { "epoch": 8.757322041510282, "grad_norm": 1.3381000757217407, "learning_rate": 0.00021641966596095037, "loss": 0.1911, "step": 136500 }, { "epoch": 8.789401084271645, "grad_norm": 1.2408932447433472, "learning_rate": 0.00021428113171232437, "loss": 0.1954, "step": 137000 }, { "epoch": 8.801719436692009, "eval_loss": 0.1906564086675644, "eval_runtime": 6.4646, "eval_samples_per_second": 77.345, "eval_steps_per_second": 9.745, "step": 137192 }, { "epoch": 8.82148012703301, "grad_norm": 1.1862565279006958, "learning_rate": 0.00021214687453219563, "loss": 0.195, "step": 137500 }, { "epoch": 8.853559169794373, "grad_norm": 1.1294955015182495, "learning_rate": 0.00021000834028356966, "loss": 0.1954, "step": 138000 }, { "epoch": 8.885638212555737, "grad_norm": 1.6489890813827515, "learning_rate": 0.0002078698060349437, "loss": 0.1977, "step": 138500 }, { "epoch": 8.9177172553171, "grad_norm": 0.9208526611328125, "learning_rate": 0.00020573127178631767, "loss": 0.1989, "step": 139000 }, { "epoch": 8.949796298078466, "grad_norm": 0.9415667653083801, "learning_rate": 0.00020359273753769167, "loss": 0.1999, "step": 139500 }, { "epoch": 8.98187534083983, "grad_norm": 1.1843030452728271, "learning_rate": 0.00020145848035756295, "loss": 0.1951, "step": 140000 }, { "epoch": 9.001732268309114, "eval_loss": 0.18711215257644653, "eval_runtime": 8.0101, "eval_samples_per_second": 62.422, "eval_steps_per_second": 7.865, "step": 140310 }, { "epoch": 9.013922304558433, "grad_norm": 1.0561912059783936, "learning_rate": 0.00019931994610893693, "loss": 0.1937, "step": 140500 }, { "epoch": 9.046001347319796, "grad_norm": 1.0678902864456177, "learning_rate": 0.00019718141186031096, "loss": 0.1894, "step": 141000 }, { "epoch": 9.07808039008116, "grad_norm": 1.2322587966918945, "learning_rate": 0.00019504287761168496, "loss": 0.1882, "step": 141500 }, { "epoch": 9.110159432842524, "grad_norm": 1.0550870895385742, "learning_rate": 0.00019290434336305897, "loss": 0.189, "step": 142000 }, { "epoch": 9.142238475603888, "grad_norm": 1.0499299764633179, "learning_rate": 0.00019076580911443297, "loss": 0.1861, "step": 142500 }, { "epoch": 9.174317518365251, "grad_norm": 0.9951316118240356, "learning_rate": 0.00018862727486580698, "loss": 0.1927, "step": 143000 }, { "epoch": 9.20177717896898, "eval_loss": 0.18626904487609863, "eval_runtime": 8.0616, "eval_samples_per_second": 62.023, "eval_steps_per_second": 7.815, "step": 143428 }, { "epoch": 9.206396561126615, "grad_norm": 1.09773588180542, "learning_rate": 0.00018649301768567823, "loss": 0.193, "step": 143500 }, { "epoch": 9.23847560388798, "grad_norm": 1.0234872102737427, "learning_rate": 0.00018435448343705226, "loss": 0.1904, "step": 144000 }, { "epoch": 9.270554646649344, "grad_norm": 0.9982895851135254, "learning_rate": 0.00018221594918842627, "loss": 0.1879, "step": 144500 }, { "epoch": 9.302633689410708, "grad_norm": 1.2350847721099854, "learning_rate": 0.00018007741493980027, "loss": 0.1889, "step": 145000 }, { "epoch": 9.334712732172072, "grad_norm": 0.8660911321640015, "learning_rate": 0.00017793888069117427, "loss": 0.1897, "step": 145500 }, { "epoch": 9.366791774933436, "grad_norm": 1.0273090600967407, "learning_rate": 0.00017580462351104553, "loss": 0.1869, "step": 146000 }, { "epoch": 9.3988708176948, "grad_norm": 1.0155673027038574, "learning_rate": 0.00017366608926241953, "loss": 0.1847, "step": 146500 }, { "epoch": 9.401822089628846, "eval_loss": 0.18227200210094452, "eval_runtime": 7.0487, "eval_samples_per_second": 70.935, "eval_steps_per_second": 8.938, "step": 146546 }, { "epoch": 9.430949860456163, "grad_norm": 0.9077481627464294, "learning_rate": 0.00017153183208229079, "loss": 0.1861, "step": 147000 }, { "epoch": 9.463028903217529, "grad_norm": 0.9706925749778748, "learning_rate": 0.00016939329783366482, "loss": 0.1876, "step": 147500 }, { "epoch": 9.495107945978893, "grad_norm": 0.8658551573753357, "learning_rate": 0.00016725476358503882, "loss": 0.1835, "step": 148000 }, { "epoch": 9.527186988740256, "grad_norm": 1.520696759223938, "learning_rate": 0.00016511622933641283, "loss": 0.1881, "step": 148500 }, { "epoch": 9.55926603150162, "grad_norm": 1.1027703285217285, "learning_rate": 0.00016297769508778683, "loss": 0.1871, "step": 149000 }, { "epoch": 9.591345074262984, "grad_norm": 0.9667887091636658, "learning_rate": 0.00016083916083916083, "loss": 0.1879, "step": 149500 }, { "epoch": 9.601867000288712, "eval_loss": 0.18134662508964539, "eval_runtime": 8.0678, "eval_samples_per_second": 61.974, "eval_steps_per_second": 7.809, "step": 149664 }, { "epoch": 9.623424117024348, "grad_norm": 0.8166322708129883, "learning_rate": 0.00015870062659053486, "loss": 0.1864, "step": 150000 }, { "epoch": 9.655503159785711, "grad_norm": 0.9341521263122559, "learning_rate": 0.00015656636941040612, "loss": 0.1887, "step": 150500 }, { "epoch": 9.687582202547077, "grad_norm": 1.0415754318237305, "learning_rate": 0.00015442783516178012, "loss": 0.1867, "step": 151000 }, { "epoch": 9.71966124530844, "grad_norm": 0.8907636404037476, "learning_rate": 0.00015228930091315413, "loss": 0.183, "step": 151500 }, { "epoch": 9.751740288069804, "grad_norm": 0.8122469186782837, "learning_rate": 0.00015015076666452813, "loss": 0.1853, "step": 152000 }, { "epoch": 9.783819330831168, "grad_norm": 2.5243399143218994, "learning_rate": 0.00014801223241590214, "loss": 0.1844, "step": 152500 }, { "epoch": 9.801911910948577, "eval_loss": 0.1786118447780609, "eval_runtime": 8.097, "eval_samples_per_second": 61.751, "eval_steps_per_second": 7.781, "step": 152782 }, { "epoch": 9.815898373592532, "grad_norm": 1.0579113960266113, "learning_rate": 0.00014587369816727617, "loss": 0.1802, "step": 153000 }, { "epoch": 9.847977416353896, "grad_norm": 0.9247124791145325, "learning_rate": 0.00014373516391865017, "loss": 0.1837, "step": 153500 }, { "epoch": 9.88005645911526, "grad_norm": 1.0255844593048096, "learning_rate": 0.00014159662967002417, "loss": 0.1798, "step": 154000 }, { "epoch": 9.912135501876623, "grad_norm": 0.9637705087661743, "learning_rate": 0.00013946237248989546, "loss": 0.1812, "step": 154500 }, { "epoch": 9.944214544637989, "grad_norm": 0.9629892706871033, "learning_rate": 0.00013732383824126943, "loss": 0.1811, "step": 155000 }, { "epoch": 9.976293587399352, "grad_norm": 1.3292585611343384, "learning_rate": 0.00013518530399264344, "loss": 0.1881, "step": 155500 }, { "epoch": 10.001924742565683, "eval_loss": 0.17706386744976044, "eval_runtime": 7.2557, "eval_samples_per_second": 68.911, "eval_steps_per_second": 8.683, "step": 155900 }, { "epoch": 10.008340551117955, "grad_norm": 0.9670829176902771, "learning_rate": 0.00013304676974401747, "loss": 0.1802, "step": 156000 }, { "epoch": 10.040419593879319, "grad_norm": 0.9530666470527649, "learning_rate": 0.00013090823549539147, "loss": 0.1739, "step": 156500 }, { "epoch": 10.072498636640683, "grad_norm": 0.8907625675201416, "learning_rate": 0.00012876970124676548, "loss": 0.1776, "step": 157000 }, { "epoch": 10.104577679402047, "grad_norm": 1.6122727394104004, "learning_rate": 0.00012663116699813948, "loss": 0.1742, "step": 157500 }, { "epoch": 10.13665672216341, "grad_norm": 1.0635969638824463, "learning_rate": 0.00012449690981801073, "loss": 0.1751, "step": 158000 }, { "epoch": 10.168735764924774, "grad_norm": 0.9459207057952881, "learning_rate": 0.00012235837556938474, "loss": 0.1759, "step": 158500 }, { "epoch": 10.200814807686138, "grad_norm": 0.9655419588088989, "learning_rate": 0.00012021984132075877, "loss": 0.1738, "step": 159000 }, { "epoch": 10.201969653225548, "eval_loss": 0.17458562552928925, "eval_runtime": 7.2001, "eval_samples_per_second": 69.443, "eval_steps_per_second": 8.75, "step": 159018 }, { "epoch": 10.232893850447503, "grad_norm": 1.003448247909546, "learning_rate": 0.00011808130707213276, "loss": 0.1734, "step": 159500 }, { "epoch": 10.264972893208867, "grad_norm": 1.0024570226669312, "learning_rate": 0.00011594277282350679, "loss": 0.1759, "step": 160000 }, { "epoch": 10.29705193597023, "grad_norm": 1.1451829671859741, "learning_rate": 0.00011380851564337805, "loss": 0.1775, "step": 160500 }, { "epoch": 10.329130978731595, "grad_norm": 1.0935543775558472, "learning_rate": 0.00011166998139475205, "loss": 0.1747, "step": 161000 }, { "epoch": 10.361210021492958, "grad_norm": 1.3235210180282593, "learning_rate": 0.00010953144714612604, "loss": 0.1758, "step": 161500 }, { "epoch": 10.393289064254322, "grad_norm": 1.1496840715408325, "learning_rate": 0.00010739291289750007, "loss": 0.1753, "step": 162000 }, { "epoch": 10.402014563885414, "eval_loss": 0.17369601130485535, "eval_runtime": 6.4411, "eval_samples_per_second": 77.626, "eval_steps_per_second": 9.781, "step": 162136 }, { "epoch": 10.425368107015686, "grad_norm": 1.040515422821045, "learning_rate": 0.00010525437864887406, "loss": 0.174, "step": 162500 }, { "epoch": 10.457447149777051, "grad_norm": 1.2366424798965454, "learning_rate": 0.00010311584440024808, "loss": 0.174, "step": 163000 }, { "epoch": 10.489526192538415, "grad_norm": 0.9126259684562683, "learning_rate": 0.00010097731015162208, "loss": 0.1754, "step": 163500 }, { "epoch": 10.521605235299779, "grad_norm": 1.0338482856750488, "learning_rate": 9.88387759029961e-05, "loss": 0.1745, "step": 164000 }, { "epoch": 10.553684278061143, "grad_norm": 0.889621376991272, "learning_rate": 9.670451872286735e-05, "loss": 0.1754, "step": 164500 }, { "epoch": 10.585763320822506, "grad_norm": 1.0261045694351196, "learning_rate": 9.457026154273861e-05, "loss": 0.1718, "step": 165000 }, { "epoch": 10.60205947454528, "eval_loss": 0.17038601636886597, "eval_runtime": 7.9736, "eval_samples_per_second": 62.707, "eval_steps_per_second": 7.901, "step": 165254 }, { "epoch": 10.61784236358387, "grad_norm": 0.9808095693588257, "learning_rate": 9.243172729411263e-05, "loss": 0.1728, "step": 165500 }, { "epoch": 10.649921406345234, "grad_norm": 1.1389505863189697, "learning_rate": 9.029319304548663e-05, "loss": 0.1694, "step": 166000 }, { "epoch": 10.6820004491066, "grad_norm": 1.1859121322631836, "learning_rate": 8.815465879686063e-05, "loss": 0.1715, "step": 166500 }, { "epoch": 10.714079491867963, "grad_norm": 1.1248379945755005, "learning_rate": 8.601612454823464e-05, "loss": 0.1729, "step": 167000 }, { "epoch": 10.746158534629327, "grad_norm": 1.1436078548431396, "learning_rate": 8.387759029960866e-05, "loss": 0.1725, "step": 167500 }, { "epoch": 10.77823757739069, "grad_norm": 1.2486170530319214, "learning_rate": 8.173905605098266e-05, "loss": 0.1714, "step": 168000 }, { "epoch": 10.802104385205146, "eval_loss": 0.1677413433790207, "eval_runtime": 6.9506, "eval_samples_per_second": 71.936, "eval_steps_per_second": 9.064, "step": 168372 }, { "epoch": 10.810316620152054, "grad_norm": 1.4387327432632446, "learning_rate": 7.960052180235666e-05, "loss": 0.1718, "step": 168500 }, { "epoch": 10.842395662913418, "grad_norm": 1.0564631223678589, "learning_rate": 7.746626462222792e-05, "loss": 0.17, "step": 169000 }, { "epoch": 10.874474705674782, "grad_norm": 1.898914098739624, "learning_rate": 7.532773037360194e-05, "loss": 0.1708, "step": 169500 }, { "epoch": 10.906553748436147, "grad_norm": 1.5556339025497437, "learning_rate": 7.318919612497594e-05, "loss": 0.1705, "step": 170000 }, { "epoch": 10.938632791197511, "grad_norm": 1.0785949230194092, "learning_rate": 7.105493894484721e-05, "loss": 0.1677, "step": 170500 }, { "epoch": 10.970711833958875, "grad_norm": 1.062652349472046, "learning_rate": 6.891640469622121e-05, "loss": 0.1699, "step": 171000 } ], "logging_steps": 500, "max_steps": 187044, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.831310324689273e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }