lemexp-task1-v3-template_full_nodefs-Llama-3.2-1B-8lr-12epochs-no-eos
/
checkpoint-77935
/trainer_state.json
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.0, | |
| "eval_steps": 3118, | |
| "global_step": 77935, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.032079042761364, | |
| "grad_norm": 1.7145991325378418, | |
| "learning_rate": 0.0007978742969568658, | |
| "loss": 0.4787, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.064158085522728, | |
| "grad_norm": 1.2714065313339233, | |
| "learning_rate": 0.0007957357627082398, | |
| "loss": 0.3972, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.096237128284092, | |
| "grad_norm": 1.1843117475509644, | |
| "learning_rate": 0.0007935972284596139, | |
| "loss": 0.3736, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.128316171045456, | |
| "grad_norm": 1.1524124145507812, | |
| "learning_rate": 0.0007914629712794851, | |
| "loss": 0.3626, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.16039521380682, | |
| "grad_norm": 1.0530339479446411, | |
| "learning_rate": 0.0007893244370308591, | |
| "loss": 0.355, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.192474256568184, | |
| "grad_norm": 1.0044286251068115, | |
| "learning_rate": 0.0007871859027822331, | |
| "loss": 0.3527, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.2000449106598659, | |
| "eval_loss": 0.3370795249938965, | |
| "eval_runtime": 6.2984, | |
| "eval_samples_per_second": 79.386, | |
| "eval_steps_per_second": 10.003, | |
| "step": 3118 | |
| }, | |
| { | |
| "epoch": 0.224553299329548, | |
| "grad_norm": 1.4645978212356567, | |
| "learning_rate": 0.0007850473685336071, | |
| "loss": 0.3394, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.256632342090912, | |
| "grad_norm": 1.2185771465301514, | |
| "learning_rate": 0.0007829088342849811, | |
| "loss": 0.3398, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.288711384852276, | |
| "grad_norm": 1.3087233304977417, | |
| "learning_rate": 0.0007807703000363551, | |
| "loss": 0.3399, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.32079042761364, | |
| "grad_norm": 1.411337971687317, | |
| "learning_rate": 0.0007786360428562264, | |
| "loss": 0.3355, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.352869470375004, | |
| "grad_norm": 1.1822813749313354, | |
| "learning_rate": 0.0007764975086076004, | |
| "loss": 0.3329, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.384948513136368, | |
| "grad_norm": 0.9898410439491272, | |
| "learning_rate": 0.0007743632514274716, | |
| "loss": 0.3311, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.4000898213197318, | |
| "eval_loss": 0.31450170278549194, | |
| "eval_runtime": 9.8807, | |
| "eval_samples_per_second": 50.604, | |
| "eval_steps_per_second": 6.376, | |
| "step": 6236 | |
| }, | |
| { | |
| "epoch": 0.417027555897732, | |
| "grad_norm": 0.9827210307121277, | |
| "learning_rate": 0.0007722247171788456, | |
| "loss": 0.3251, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.449106598659096, | |
| "grad_norm": 1.2011586427688599, | |
| "learning_rate": 0.0007700861829302197, | |
| "loss": 0.3271, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.48118564142046, | |
| "grad_norm": 1.3871015310287476, | |
| "learning_rate": 0.0007679476486815936, | |
| "loss": 0.3259, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.513264684181824, | |
| "grad_norm": 1.1245856285095215, | |
| "learning_rate": 0.0007658091144329678, | |
| "loss": 0.3192, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.545343726943188, | |
| "grad_norm": 1.0883513689041138, | |
| "learning_rate": 0.0007636705801843417, | |
| "loss": 0.3209, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.577422769704552, | |
| "grad_norm": 1.0555607080459595, | |
| "learning_rate": 0.0007615320459357157, | |
| "loss": 0.3178, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.6001347319795978, | |
| "eval_loss": 0.3085324764251709, | |
| "eval_runtime": 7.9945, | |
| "eval_samples_per_second": 62.543, | |
| "eval_steps_per_second": 7.88, | |
| "step": 9354 | |
| }, | |
| { | |
| "epoch": 0.6095018124659161, | |
| "grad_norm": 1.0429250001907349, | |
| "learning_rate": 0.0007593935116870898, | |
| "loss": 0.3169, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.64158085522728, | |
| "grad_norm": 1.2145720720291138, | |
| "learning_rate": 0.0007572549774384637, | |
| "loss": 0.3152, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.673659897988644, | |
| "grad_norm": 1.3464765548706055, | |
| "learning_rate": 0.0007551207202583349, | |
| "loss": 0.3125, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.705738940750008, | |
| "grad_norm": 1.1744924783706665, | |
| "learning_rate": 0.000752982186009709, | |
| "loss": 0.3086, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.7378179835113721, | |
| "grad_norm": 1.234157919883728, | |
| "learning_rate": 0.000750843651761083, | |
| "loss": 0.3133, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.769897026272736, | |
| "grad_norm": 0.9248010516166687, | |
| "learning_rate": 0.0007487051175124569, | |
| "loss": 0.3122, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.8001796426394636, | |
| "eval_loss": 0.3022182583808899, | |
| "eval_runtime": 8.0646, | |
| "eval_samples_per_second": 61.999, | |
| "eval_steps_per_second": 7.812, | |
| "step": 12472 | |
| }, | |
| { | |
| "epoch": 0.8019760690341, | |
| "grad_norm": 0.862856388092041, | |
| "learning_rate": 0.000746566583263831, | |
| "loss": 0.308, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.834055111795464, | |
| "grad_norm": 1.0224480628967285, | |
| "learning_rate": 0.000744428049015205, | |
| "loss": 0.31, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.8661341545568281, | |
| "grad_norm": 1.008195161819458, | |
| "learning_rate": 0.0007422937918350763, | |
| "loss": 0.3079, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.898213197318192, | |
| "grad_norm": 1.29293954372406, | |
| "learning_rate": 0.0007401552575864503, | |
| "loss": 0.3047, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.930292240079556, | |
| "grad_norm": 0.9913870692253113, | |
| "learning_rate": 0.0007380167233378243, | |
| "loss": 0.3074, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.96237128284092, | |
| "grad_norm": 1.1802239418029785, | |
| "learning_rate": 0.0007358781890891983, | |
| "loss": 0.3055, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.9944503256022841, | |
| "grad_norm": 1.0421093702316284, | |
| "learning_rate": 0.0007337396548405722, | |
| "loss": 0.3106, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.0001924742565682, | |
| "eval_loss": 0.30451422929763794, | |
| "eval_runtime": 6.8665, | |
| "eval_samples_per_second": 72.817, | |
| "eval_steps_per_second": 9.175, | |
| "step": 15590 | |
| }, | |
| { | |
| "epoch": 1.0264972893208866, | |
| "grad_norm": 1.1343709230422974, | |
| "learning_rate": 0.0007316011205919463, | |
| "loss": 0.3028, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.0585763320822508, | |
| "grad_norm": 0.992735743522644, | |
| "learning_rate": 0.0007294625863433203, | |
| "loss": 0.3026, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.0906553748436147, | |
| "grad_norm": 0.9756571650505066, | |
| "learning_rate": 0.0007273240520946943, | |
| "loss": 0.3004, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.1227344176049787, | |
| "grad_norm": 1.005118489265442, | |
| "learning_rate": 0.0007251897949145656, | |
| "loss": 0.2953, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.1548134603663427, | |
| "grad_norm": 1.2004996538162231, | |
| "learning_rate": 0.0007230512606659396, | |
| "loss": 0.3039, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.1868925031277067, | |
| "grad_norm": 1.2973647117614746, | |
| "learning_rate": 0.0007209127264173135, | |
| "loss": 0.2997, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.200237384916434, | |
| "eval_loss": 0.2830166518688202, | |
| "eval_runtime": 6.4164, | |
| "eval_samples_per_second": 77.926, | |
| "eval_steps_per_second": 9.819, | |
| "step": 18708 | |
| }, | |
| { | |
| "epoch": 1.2189715458890706, | |
| "grad_norm": 1.2329021692276, | |
| "learning_rate": 0.0007187741921686877, | |
| "loss": 0.2972, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.2510505886504346, | |
| "grad_norm": 0.8207685947418213, | |
| "learning_rate": 0.0007166399349885588, | |
| "loss": 0.2979, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 1.2831296314117986, | |
| "grad_norm": 1.1549195051193237, | |
| "learning_rate": 0.000714501400739933, | |
| "loss": 0.2982, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.3152086741731628, | |
| "grad_norm": 1.0019187927246094, | |
| "learning_rate": 0.0007123628664913069, | |
| "loss": 0.2975, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.3472877169345265, | |
| "grad_norm": 1.023240327835083, | |
| "learning_rate": 0.0007102243322426809, | |
| "loss": 0.2957, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.3793667596958907, | |
| "grad_norm": 1.0395528078079224, | |
| "learning_rate": 0.0007080900750625522, | |
| "loss": 0.2957, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 1.4002822955763001, | |
| "eval_loss": 0.28411924839019775, | |
| "eval_runtime": 8.0825, | |
| "eval_samples_per_second": 61.862, | |
| "eval_steps_per_second": 7.795, | |
| "step": 21826 | |
| }, | |
| { | |
| "epoch": 1.4114458024572547, | |
| "grad_norm": 1.2371494770050049, | |
| "learning_rate": 0.0007059515408139262, | |
| "loss": 0.2948, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.4435248452186187, | |
| "grad_norm": 1.1509361267089844, | |
| "learning_rate": 0.0007038130065653001, | |
| "loss": 0.2939, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.4756038879799827, | |
| "grad_norm": 0.9148961305618286, | |
| "learning_rate": 0.0007016744723166742, | |
| "loss": 0.2945, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.5076829307413466, | |
| "grad_norm": 1.301382064819336, | |
| "learning_rate": 0.0006995359380680482, | |
| "loss": 0.2935, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.5397619735027108, | |
| "grad_norm": 1.1036344766616821, | |
| "learning_rate": 0.0006974059579564167, | |
| "loss": 0.2946, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.5718410162640746, | |
| "grad_norm": 1.6663960218429565, | |
| "learning_rate": 0.0006952674237077907, | |
| "loss": 0.2922, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.600327206236166, | |
| "eval_loss": 0.2768917977809906, | |
| "eval_runtime": 6.447, | |
| "eval_samples_per_second": 77.556, | |
| "eval_steps_per_second": 9.772, | |
| "step": 24944 | |
| }, | |
| { | |
| "epoch": 1.6039200590254388, | |
| "grad_norm": 1.0343741178512573, | |
| "learning_rate": 0.0006931288894591647, | |
| "loss": 0.2928, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.6359991017868025, | |
| "grad_norm": 1.3965007066726685, | |
| "learning_rate": 0.0006909903552105388, | |
| "loss": 0.2923, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.6680781445481667, | |
| "grad_norm": 1.1331425905227661, | |
| "learning_rate": 0.0006888518209619127, | |
| "loss": 0.2936, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.7001571873095307, | |
| "grad_norm": 1.0256651639938354, | |
| "learning_rate": 0.0006867132867132868, | |
| "loss": 0.2901, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 1.7322362300708947, | |
| "grad_norm": 1.1987167596817017, | |
| "learning_rate": 0.0006845747524646608, | |
| "loss": 0.2932, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.7643152728322586, | |
| "grad_norm": 0.9224876165390015, | |
| "learning_rate": 0.0006824362182160348, | |
| "loss": 0.2957, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 1.7963943155936226, | |
| "grad_norm": 1.1595081090927124, | |
| "learning_rate": 0.0006802976839674088, | |
| "loss": 0.2895, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.8003721168960318, | |
| "eval_loss": 0.2766253650188446, | |
| "eval_runtime": 6.995, | |
| "eval_samples_per_second": 71.48, | |
| "eval_steps_per_second": 9.006, | |
| "step": 28062 | |
| }, | |
| { | |
| "epoch": 1.8284733583549868, | |
| "grad_norm": 1.0937904119491577, | |
| "learning_rate": 0.0006781634267872801, | |
| "loss": 0.2908, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 1.8605524011163506, | |
| "grad_norm": 1.6129448413848877, | |
| "learning_rate": 0.0006760291696071513, | |
| "loss": 0.2865, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.8926314438777148, | |
| "grad_norm": 1.5731992721557617, | |
| "learning_rate": 0.0006738906353585254, | |
| "loss": 0.2865, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 1.9247104866390787, | |
| "grad_norm": 1.1734341382980347, | |
| "learning_rate": 0.0006717521011098993, | |
| "loss": 0.2858, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.9567895294004427, | |
| "grad_norm": 1.277829647064209, | |
| "learning_rate": 0.0006696135668612733, | |
| "loss": 0.2878, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 1.9888685721618067, | |
| "grad_norm": 1.2518751621246338, | |
| "learning_rate": 0.0006674793096811446, | |
| "loss": 0.2847, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 2.0003849485131364, | |
| "eval_loss": 0.2660098075866699, | |
| "eval_runtime": 8.1469, | |
| "eval_samples_per_second": 61.373, | |
| "eval_steps_per_second": 7.733, | |
| "step": 31180 | |
| }, | |
| { | |
| "epoch": 2.0209155358804094, | |
| "grad_norm": 1.0406543016433716, | |
| "learning_rate": 0.0006653407754325186, | |
| "loss": 0.2821, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 2.052994578641773, | |
| "grad_norm": 1.1324297189712524, | |
| "learning_rate": 0.0006632022411838926, | |
| "loss": 0.2848, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 2.0850736214031373, | |
| "grad_norm": 1.0970991849899292, | |
| "learning_rate": 0.0006610637069352665, | |
| "loss": 0.2815, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 2.1171526641645015, | |
| "grad_norm": 1.0462357997894287, | |
| "learning_rate": 0.0006589251726866407, | |
| "loss": 0.2825, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 2.1492317069258653, | |
| "grad_norm": 1.122344732284546, | |
| "learning_rate": 0.0006567866384380146, | |
| "loss": 0.2816, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 2.1813107496872295, | |
| "grad_norm": 1.108782410621643, | |
| "learning_rate": 0.0006546481041893886, | |
| "loss": 0.2826, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 2.2004298591730023, | |
| "eval_loss": 0.2733038365840912, | |
| "eval_runtime": 7.489, | |
| "eval_samples_per_second": 66.765, | |
| "eval_steps_per_second": 8.412, | |
| "step": 34298 | |
| }, | |
| { | |
| "epoch": 2.2133897924485932, | |
| "grad_norm": 1.092699646949768, | |
| "learning_rate": 0.0006525095699407627, | |
| "loss": 0.2836, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 2.2454688352099574, | |
| "grad_norm": 1.0987275838851929, | |
| "learning_rate": 0.0006503753127606339, | |
| "loss": 0.2779, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 2.277547877971321, | |
| "grad_norm": 1.1260478496551514, | |
| "learning_rate": 0.0006482367785120078, | |
| "loss": 0.2788, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 2.3096269207326854, | |
| "grad_norm": 1.2035844326019287, | |
| "learning_rate": 0.0006460982442633819, | |
| "loss": 0.284, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 2.341705963494049, | |
| "grad_norm": 1.0569308996200562, | |
| "learning_rate": 0.0006439597100147559, | |
| "loss": 0.2812, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 2.3737850062554133, | |
| "grad_norm": 0.8856455683708191, | |
| "learning_rate": 0.0006418254528346272, | |
| "loss": 0.2767, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 2.400474769832868, | |
| "eval_loss": 0.2658798098564148, | |
| "eval_runtime": 8.0192, | |
| "eval_samples_per_second": 62.35, | |
| "eval_steps_per_second": 7.856, | |
| "step": 37416 | |
| }, | |
| { | |
| "epoch": 2.4058640490167775, | |
| "grad_norm": 0.9563117027282715, | |
| "learning_rate": 0.0006396911956544984, | |
| "loss": 0.2787, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 2.4379430917781413, | |
| "grad_norm": 0.9825711250305176, | |
| "learning_rate": 0.0006375526614058725, | |
| "loss": 0.2775, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 2.4700221345395055, | |
| "grad_norm": 1.1553630828857422, | |
| "learning_rate": 0.0006354141271572465, | |
| "loss": 0.2779, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 2.5021011773008692, | |
| "grad_norm": 1.1160862445831299, | |
| "learning_rate": 0.0006332755929086204, | |
| "loss": 0.2819, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 2.5341802200622334, | |
| "grad_norm": 1.2663159370422363, | |
| "learning_rate": 0.0006311370586599945, | |
| "loss": 0.2797, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 2.566259262823597, | |
| "grad_norm": 1.028053641319275, | |
| "learning_rate": 0.0006289985244113685, | |
| "loss": 0.2776, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 2.5983383055849614, | |
| "grad_norm": 1.0482547283172607, | |
| "learning_rate": 0.0006268642672312397, | |
| "loss": 0.2737, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 2.600519680492734, | |
| "eval_loss": 0.26375094056129456, | |
| "eval_runtime": 7.2264, | |
| "eval_samples_per_second": 69.19, | |
| "eval_steps_per_second": 8.718, | |
| "step": 40534 | |
| }, | |
| { | |
| "epoch": 2.6304173483463256, | |
| "grad_norm": 1.0435316562652588, | |
| "learning_rate": 0.0006247257329826138, | |
| "loss": 0.2783, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 2.6624963911076893, | |
| "grad_norm": 0.866886556148529, | |
| "learning_rate": 0.0006225871987339878, | |
| "loss": 0.2738, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 2.694575433869053, | |
| "grad_norm": 1.069162368774414, | |
| "learning_rate": 0.0006204486644853617, | |
| "loss": 0.2783, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 2.7266544766304173, | |
| "grad_norm": 1.450871229171753, | |
| "learning_rate": 0.0006183101302367357, | |
| "loss": 0.275, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 2.7587335193917815, | |
| "grad_norm": 1.0573506355285645, | |
| "learning_rate": 0.000616175873056607, | |
| "loss": 0.2762, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 2.790812562153145, | |
| "grad_norm": 1.1295759677886963, | |
| "learning_rate": 0.000614037338807981, | |
| "loss": 0.2734, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 2.8005645911526003, | |
| "eval_loss": 0.26422494649887085, | |
| "eval_runtime": 8.0132, | |
| "eval_samples_per_second": 62.397, | |
| "eval_steps_per_second": 7.862, | |
| "step": 43652 | |
| }, | |
| { | |
| "epoch": 2.8228916049145094, | |
| "grad_norm": 1.1608061790466309, | |
| "learning_rate": 0.000611898804559355, | |
| "loss": 0.272, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 2.8549706476758736, | |
| "grad_norm": 1.2949804067611694, | |
| "learning_rate": 0.000609760270310729, | |
| "loss": 0.272, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 2.8870496904372374, | |
| "grad_norm": 0.9073989391326904, | |
| "learning_rate": 0.0006076217360621031, | |
| "loss": 0.2744, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 2.919128733198601, | |
| "grad_norm": 0.8703099489212036, | |
| "learning_rate": 0.0006054874788819743, | |
| "loss": 0.2762, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 2.9512077759599653, | |
| "grad_norm": 1.0385627746582031, | |
| "learning_rate": 0.0006033489446333484, | |
| "loss": 0.2732, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 2.9832868187213295, | |
| "grad_norm": 1.1464110612869263, | |
| "learning_rate": 0.0006012104103847223, | |
| "loss": 0.2701, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 3.0005774227697044, | |
| "eval_loss": 0.2622199058532715, | |
| "eval_runtime": 6.3897, | |
| "eval_samples_per_second": 78.251, | |
| "eval_steps_per_second": 9.86, | |
| "step": 46770 | |
| }, | |
| { | |
| "epoch": 3.015333782439932, | |
| "grad_norm": 1.2512695789337158, | |
| "learning_rate": 0.0005990718761360964, | |
| "loss": 0.2697, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 3.047412825201296, | |
| "grad_norm": 0.7882702946662903, | |
| "learning_rate": 0.0005969376189559676, | |
| "loss": 0.2685, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 3.07949186796266, | |
| "grad_norm": 1.282387614250183, | |
| "learning_rate": 0.0005947990847073416, | |
| "loss": 0.2674, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 3.111570910724024, | |
| "grad_norm": 1.4220795631408691, | |
| "learning_rate": 0.0005926605504587157, | |
| "loss": 0.2676, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 3.143649953485388, | |
| "grad_norm": 1.0576750040054321, | |
| "learning_rate": 0.0005905220162100896, | |
| "loss": 0.269, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 3.175728996246752, | |
| "grad_norm": 1.0533617734909058, | |
| "learning_rate": 0.0005883834819614636, | |
| "loss": 0.2629, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 3.2006223334295703, | |
| "eval_loss": 0.2548312246799469, | |
| "eval_runtime": 6.5962, | |
| "eval_samples_per_second": 75.802, | |
| "eval_steps_per_second": 9.551, | |
| "step": 49888 | |
| }, | |
| { | |
| "epoch": 3.207808039008116, | |
| "grad_norm": 1.1989775896072388, | |
| "learning_rate": 0.0005862449477128377, | |
| "loss": 0.2666, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 3.23988708176948, | |
| "grad_norm": 1.0356203317642212, | |
| "learning_rate": 0.0005841064134642117, | |
| "loss": 0.2622, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 3.2719661245308442, | |
| "grad_norm": 1.0133503675460815, | |
| "learning_rate": 0.0005819721562840828, | |
| "loss": 0.2671, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 3.304045167292208, | |
| "grad_norm": 1.1607108116149902, | |
| "learning_rate": 0.000579833622035457, | |
| "loss": 0.2633, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 3.336124210053572, | |
| "grad_norm": 1.0809621810913086, | |
| "learning_rate": 0.0005776950877868309, | |
| "loss": 0.267, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 3.368203252814936, | |
| "grad_norm": 1.1140522956848145, | |
| "learning_rate": 0.0005755565535382049, | |
| "loss": 0.2655, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 3.4002822955763, | |
| "grad_norm": 1.4667320251464844, | |
| "learning_rate": 0.0005734222963580762, | |
| "loss": 0.2652, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 3.4006672440894365, | |
| "eval_loss": 0.25107353925704956, | |
| "eval_runtime": 8.1009, | |
| "eval_samples_per_second": 61.721, | |
| "eval_steps_per_second": 7.777, | |
| "step": 53006 | |
| }, | |
| { | |
| "epoch": 3.432361338337664, | |
| "grad_norm": 1.0906621217727661, | |
| "learning_rate": 0.0005712837621094502, | |
| "loss": 0.2633, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 3.464440381099028, | |
| "grad_norm": 1.7066080570220947, | |
| "learning_rate": 0.0005691452278608242, | |
| "loss": 0.2624, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 3.496519423860392, | |
| "grad_norm": 1.1421736478805542, | |
| "learning_rate": 0.0005670066936121983, | |
| "loss": 0.267, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 3.528598466621756, | |
| "grad_norm": 0.9186555743217468, | |
| "learning_rate": 0.0005648681593635722, | |
| "loss": 0.2646, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 3.5606775093831198, | |
| "grad_norm": 0.8647829294204712, | |
| "learning_rate": 0.0005627339021834436, | |
| "loss": 0.2614, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 3.592756552144484, | |
| "grad_norm": 0.9938694834709167, | |
| "learning_rate": 0.0005605953679348175, | |
| "loss": 0.263, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 3.6007121547493024, | |
| "eval_loss": 0.2563716173171997, | |
| "eval_runtime": 6.467, | |
| "eval_samples_per_second": 77.315, | |
| "eval_steps_per_second": 9.742, | |
| "step": 56124 | |
| }, | |
| { | |
| "epoch": 3.624835594905848, | |
| "grad_norm": 0.8723744750022888, | |
| "learning_rate": 0.0005584568336861915, | |
| "loss": 0.2612, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 3.656914637667212, | |
| "grad_norm": 1.0959194898605347, | |
| "learning_rate": 0.0005563182994375655, | |
| "loss": 0.2611, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 3.688993680428576, | |
| "grad_norm": 1.2405571937561035, | |
| "learning_rate": 0.0005541797651889395, | |
| "loss": 0.2635, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 3.72107272318994, | |
| "grad_norm": 1.137342929840088, | |
| "learning_rate": 0.0005520412309403136, | |
| "loss": 0.2595, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 3.753151765951304, | |
| "grad_norm": 0.8755650520324707, | |
| "learning_rate": 0.0005499026966916875, | |
| "loss": 0.2619, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 3.785230808712668, | |
| "grad_norm": 1.2968310117721558, | |
| "learning_rate": 0.0005477641624430616, | |
| "loss": 0.2612, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 3.8007570654091682, | |
| "eval_loss": 0.2506495714187622, | |
| "eval_runtime": 8.1115, | |
| "eval_samples_per_second": 61.641, | |
| "eval_steps_per_second": 7.767, | |
| "step": 59242 | |
| }, | |
| { | |
| "epoch": 3.817309851474032, | |
| "grad_norm": 0.9842492938041687, | |
| "learning_rate": 0.0005456299052629328, | |
| "loss": 0.2624, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 3.849388894235396, | |
| "grad_norm": 0.8897719979286194, | |
| "learning_rate": 0.0005434913710143068, | |
| "loss": 0.2573, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 3.88146793699676, | |
| "grad_norm": 0.9902140498161316, | |
| "learning_rate": 0.0005413528367656809, | |
| "loss": 0.2624, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 3.913546979758124, | |
| "grad_norm": 0.9460390210151672, | |
| "learning_rate": 0.0005392185795855521, | |
| "loss": 0.2582, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 3.945626022519488, | |
| "grad_norm": 1.1403892040252686, | |
| "learning_rate": 0.0005370800453369261, | |
| "loss": 0.2627, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 3.977705065280852, | |
| "grad_norm": 0.987301766872406, | |
| "learning_rate": 0.0005349415110883001, | |
| "loss": 0.2616, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 4.000769897026273, | |
| "eval_loss": 0.2439550757408142, | |
| "eval_runtime": 7.3235, | |
| "eval_samples_per_second": 68.274, | |
| "eval_steps_per_second": 8.602, | |
| "step": 62360 | |
| }, | |
| { | |
| "epoch": 4.009752028999455, | |
| "grad_norm": 0.9634618163108826, | |
| "learning_rate": 0.0005328029768396741, | |
| "loss": 0.2522, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 4.041831071760819, | |
| "grad_norm": 1.0599830150604248, | |
| "learning_rate": 0.0005306687196595454, | |
| "loss": 0.2548, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 4.0739101145221825, | |
| "grad_norm": 1.1028203964233398, | |
| "learning_rate": 0.0005285301854109194, | |
| "loss": 0.2529, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 4.105989157283546, | |
| "grad_norm": 1.1508251428604126, | |
| "learning_rate": 0.0005263916511622933, | |
| "loss": 0.2539, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 4.138068200044911, | |
| "grad_norm": 1.0701543092727661, | |
| "learning_rate": 0.0005242531169136675, | |
| "loss": 0.2502, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 4.170147242806275, | |
| "grad_norm": 1.0984095335006714, | |
| "learning_rate": 0.0005221145826650414, | |
| "loss": 0.2574, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 4.200814807686139, | |
| "eval_loss": 0.24038133025169373, | |
| "eval_runtime": 6.2912, | |
| "eval_samples_per_second": 79.476, | |
| "eval_steps_per_second": 10.014, | |
| "step": 65478 | |
| }, | |
| { | |
| "epoch": 4.202226285567638, | |
| "grad_norm": 0.9908430576324463, | |
| "learning_rate": 0.0005199803254849126, | |
| "loss": 0.2564, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 4.234305328329003, | |
| "grad_norm": 1.0797010660171509, | |
| "learning_rate": 0.0005178417912362867, | |
| "loss": 0.2568, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 4.266384371090367, | |
| "grad_norm": 1.3110967874526978, | |
| "learning_rate": 0.0005157032569876607, | |
| "loss": 0.2546, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 4.298463413851731, | |
| "grad_norm": 1.22694993019104, | |
| "learning_rate": 0.0005135647227390347, | |
| "loss": 0.253, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 4.330542456613094, | |
| "grad_norm": 1.146044373512268, | |
| "learning_rate": 0.000511430465558906, | |
| "loss": 0.2529, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 4.362621499374459, | |
| "grad_norm": 1.0950199365615845, | |
| "learning_rate": 0.0005092919313102799, | |
| "loss": 0.2506, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 4.394700542135823, | |
| "grad_norm": 1.1739941835403442, | |
| "learning_rate": 0.0005071533970616539, | |
| "loss": 0.2501, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 4.4008597183460045, | |
| "eval_loss": 0.24005259573459625, | |
| "eval_runtime": 7.6188, | |
| "eval_samples_per_second": 65.627, | |
| "eval_steps_per_second": 8.269, | |
| "step": 68596 | |
| }, | |
| { | |
| "epoch": 4.4267795848971865, | |
| "grad_norm": 1.3007426261901855, | |
| "learning_rate": 0.000505014862813028, | |
| "loss": 0.2507, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 4.45885862765855, | |
| "grad_norm": 1.0202151536941528, | |
| "learning_rate": 0.0005028763285644019, | |
| "loss": 0.2535, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 4.490937670419915, | |
| "grad_norm": 1.5328317880630493, | |
| "learning_rate": 0.000500737794315776, | |
| "loss": 0.2522, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 4.523016713181279, | |
| "grad_norm": 1.0239914655685425, | |
| "learning_rate": 0.0004986035371356472, | |
| "loss": 0.2496, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 4.555095755942642, | |
| "grad_norm": 0.9700740575790405, | |
| "learning_rate": 0.0004964650028870213, | |
| "loss": 0.2512, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 4.587174798704007, | |
| "grad_norm": 1.2554644346237183, | |
| "learning_rate": 0.0004943264686383952, | |
| "loss": 0.2506, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 4.60090462900587, | |
| "eval_loss": 0.24138595163822174, | |
| "eval_runtime": 8.1086, | |
| "eval_samples_per_second": 61.663, | |
| "eval_steps_per_second": 7.77, | |
| "step": 71714 | |
| }, | |
| { | |
| "epoch": 4.619253841465371, | |
| "grad_norm": 1.5016344785690308, | |
| "learning_rate": 0.0004921879343897693, | |
| "loss": 0.2471, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 4.6513328842267345, | |
| "grad_norm": 1.251976490020752, | |
| "learning_rate": 0.0004900494001411433, | |
| "loss": 0.2531, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 4.683411926988098, | |
| "grad_norm": 1.0139933824539185, | |
| "learning_rate": 0.0004879151429610145, | |
| "loss": 0.2488, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 4.715490969749463, | |
| "grad_norm": 0.9058252573013306, | |
| "learning_rate": 0.0004857766087123886, | |
| "loss": 0.2478, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 4.747570012510827, | |
| "grad_norm": 0.8362458944320679, | |
| "learning_rate": 0.00048363807446376257, | |
| "loss": 0.2495, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 4.77964905527219, | |
| "grad_norm": 0.996514081954956, | |
| "learning_rate": 0.00048149954021513655, | |
| "loss": 0.251, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 4.800949539665736, | |
| "eval_loss": 0.23780158162117004, | |
| "eval_runtime": 6.3001, | |
| "eval_samples_per_second": 79.364, | |
| "eval_steps_per_second": 10.0, | |
| "step": 74832 | |
| }, | |
| { | |
| "epoch": 4.811728098033555, | |
| "grad_norm": 1.115113377571106, | |
| "learning_rate": 0.00047936528303500785, | |
| "loss": 0.2477, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 4.843807140794919, | |
| "grad_norm": 1.2068469524383545, | |
| "learning_rate": 0.00047722674878638183, | |
| "loss": 0.2462, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 4.875886183556283, | |
| "grad_norm": 1.0453870296478271, | |
| "learning_rate": 0.0004750882145377558, | |
| "loss": 0.2456, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 4.907965226317646, | |
| "grad_norm": 1.0999488830566406, | |
| "learning_rate": 0.0004729496802891299, | |
| "loss": 0.2492, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 4.940044269079011, | |
| "grad_norm": 0.8874688148498535, | |
| "learning_rate": 0.00047081114604050387, | |
| "loss": 0.2467, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 4.972123311840375, | |
| "grad_norm": 1.060391902923584, | |
| "learning_rate": 0.0004686726117918779, | |
| "loss": 0.2456, | |
| "step": 77500 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 187044, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 12, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2869610745448366e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |