{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 15353, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003256692503093858, "grad_norm": 3.83524227142334, "learning_rate": 4.1666666666666665e-05, "loss": 9.305, "step": 50 }, { "epoch": 0.006513385006187716, "grad_norm": 3.6058454513549805, "learning_rate": 8.333333333333333e-05, "loss": 2.3661, "step": 100 }, { "epoch": 0.009770077509281574, "grad_norm": 2.746100902557373, "learning_rate": 0.000125, "loss": 2.1607, "step": 150 }, { "epoch": 0.013026770012375432, "grad_norm": 1.6440224647521973, "learning_rate": 0.00016666666666666666, "loss": 1.9886, "step": 200 }, { "epoch": 0.01628346251546929, "grad_norm": 2.5189008712768555, "learning_rate": 0.00020833333333333335, "loss": 1.3986, "step": 250 }, { "epoch": 0.019540155018563148, "grad_norm": 1.8205941915512085, "learning_rate": 0.00025, "loss": 1.9198, "step": 300 }, { "epoch": 0.022796847521657004, "grad_norm": 1.2250250577926636, "learning_rate": 0.0002491696007440377, "loss": 1.5733, "step": 350 }, { "epoch": 0.026053540024750865, "grad_norm": 1.1779519319534302, "learning_rate": 0.0002483392014880755, "loss": 1.5086, "step": 400 }, { "epoch": 0.02931023252784472, "grad_norm": 1.1469779014587402, "learning_rate": 0.0002475088022321132, "loss": 1.2551, "step": 450 }, { "epoch": 0.03256692503093858, "grad_norm": 1.115845799446106, "learning_rate": 0.00024667840297615094, "loss": 1.4877, "step": 500 }, { "epoch": 0.03582361753403244, "grad_norm": 1.4156146049499512, "learning_rate": 0.00024584800372018866, "loss": 1.6161, "step": 550 }, { "epoch": 0.039080310037126295, "grad_norm": 0.7899070978164673, "learning_rate": 0.00024501760446422644, "loss": 1.643, "step": 600 }, { "epoch": 0.04233700254022015, "grad_norm": 1.0918740034103394, "learning_rate": 0.00024418720520826416, "loss": 1.4248, "step": 650 }, { "epoch": 0.04559369504331401, "grad_norm": 1.0206936597824097, "learning_rate": 0.00024335680595230185, "loss": 1.2039, "step": 700 }, { "epoch": 0.048850387546407865, "grad_norm": 1.018481969833374, "learning_rate": 0.0002425264066963396, "loss": 1.2592, "step": 750 }, { "epoch": 0.05210708004950173, "grad_norm": 1.174776554107666, "learning_rate": 0.00024169600744037735, "loss": 1.421, "step": 800 }, { "epoch": 0.055363772552595586, "grad_norm": 1.697268009185791, "learning_rate": 0.0002408656081844151, "loss": 1.5579, "step": 850 }, { "epoch": 0.05862046505568944, "grad_norm": 1.5305922031402588, "learning_rate": 0.0002400352089284528, "loss": 1.4551, "step": 900 }, { "epoch": 0.0618771575587833, "grad_norm": 1.0665664672851562, "learning_rate": 0.00023920480967249053, "loss": 1.4691, "step": 950 }, { "epoch": 0.06513385006187716, "grad_norm": 1.4318559169769287, "learning_rate": 0.00023837441041652828, "loss": 1.4589, "step": 1000 }, { "epoch": 0.06839054256497101, "grad_norm": 0.9295191764831543, "learning_rate": 0.000237544011160566, "loss": 1.6214, "step": 1050 }, { "epoch": 0.07164723506806488, "grad_norm": 1.217563271522522, "learning_rate": 0.00023671361190460375, "loss": 1.4662, "step": 1100 }, { "epoch": 0.07490392757115873, "grad_norm": 1.0227612257003784, "learning_rate": 0.00023588321264864147, "loss": 1.4038, "step": 1150 }, { "epoch": 0.07816062007425259, "grad_norm": 1.14272940158844, "learning_rate": 0.0002350528133926792, "loss": 1.4528, "step": 1200 }, { "epoch": 0.08141731257734644, "grad_norm": 0.9993245005607605, "learning_rate": 0.00023422241413671694, "loss": 1.4288, "step": 1250 }, { "epoch": 0.0846740050804403, "grad_norm": 0.4217207133769989, "learning_rate": 0.0002333920148807547, "loss": 1.1432, "step": 1300 }, { "epoch": 0.08793069758353417, "grad_norm": 1.0615357160568237, "learning_rate": 0.0002325616156247924, "loss": 1.6334, "step": 1350 }, { "epoch": 0.09118739008662802, "grad_norm": 0.9344422817230225, "learning_rate": 0.00023173121636883013, "loss": 1.2844, "step": 1400 }, { "epoch": 0.09444408258972188, "grad_norm": 0.8281479477882385, "learning_rate": 0.00023090081711286788, "loss": 1.2908, "step": 1450 }, { "epoch": 0.09770077509281573, "grad_norm": 0.8958244919776917, "learning_rate": 0.0002300704178569056, "loss": 1.473, "step": 1500 }, { "epoch": 0.1009574675959096, "grad_norm": 0.42699888348579407, "learning_rate": 0.00022924001860094334, "loss": 1.4183, "step": 1550 }, { "epoch": 0.10421416009900346, "grad_norm": 1.2085658311843872, "learning_rate": 0.0002284096193449811, "loss": 1.4231, "step": 1600 }, { "epoch": 0.10747085260209731, "grad_norm": 0.9281423687934875, "learning_rate": 0.00022757922008901878, "loss": 1.4042, "step": 1650 }, { "epoch": 0.11072754510519117, "grad_norm": 1.2586039304733276, "learning_rate": 0.00022674882083305653, "loss": 1.2469, "step": 1700 }, { "epoch": 0.11398423760828502, "grad_norm": 0.855798065662384, "learning_rate": 0.00022591842157709428, "loss": 1.3069, "step": 1750 }, { "epoch": 0.11724093011137889, "grad_norm": 0.8409788012504578, "learning_rate": 0.000225088022321132, "loss": 1.3592, "step": 1800 }, { "epoch": 0.12049762261447274, "grad_norm": 0.8641282320022583, "learning_rate": 0.00022425762306516972, "loss": 1.2472, "step": 1850 }, { "epoch": 0.1237543151175666, "grad_norm": 0.4140438139438629, "learning_rate": 0.00022342722380920747, "loss": 1.2858, "step": 1900 }, { "epoch": 0.12701100762066045, "grad_norm": 0.5507912635803223, "learning_rate": 0.00022259682455324522, "loss": 1.3481, "step": 1950 }, { "epoch": 0.13026770012375433, "grad_norm": 0.8350768089294434, "learning_rate": 0.00022176642529728294, "loss": 1.0213, "step": 2000 }, { "epoch": 0.13352439262684818, "grad_norm": 0.809004545211792, "learning_rate": 0.00022093602604132068, "loss": 1.3442, "step": 2050 }, { "epoch": 0.13678108512994203, "grad_norm": 0.8750516176223755, "learning_rate": 0.0002201056267853584, "loss": 1.2175, "step": 2100 }, { "epoch": 0.14003777763303588, "grad_norm": 0.8275018930435181, "learning_rate": 0.00021927522752939613, "loss": 0.9712, "step": 2150 }, { "epoch": 0.14329447013612975, "grad_norm": 1.3380861282348633, "learning_rate": 0.00021844482827343387, "loss": 1.3666, "step": 2200 }, { "epoch": 0.1465511626392236, "grad_norm": 0.9239203333854675, "learning_rate": 0.00021761442901747162, "loss": 1.3622, "step": 2250 }, { "epoch": 0.14980785514231745, "grad_norm": 1.127252459526062, "learning_rate": 0.00021678402976150934, "loss": 1.3597, "step": 2300 }, { "epoch": 0.15306454764541133, "grad_norm": 0.8589280247688293, "learning_rate": 0.00021595363050554706, "loss": 1.1863, "step": 2350 }, { "epoch": 0.15632124014850518, "grad_norm": 1.353403091430664, "learning_rate": 0.0002151232312495848, "loss": 1.3816, "step": 2400 }, { "epoch": 0.15957793265159903, "grad_norm": 0.9564713835716248, "learning_rate": 0.00021429283199362253, "loss": 1.3415, "step": 2450 }, { "epoch": 0.16283462515469288, "grad_norm": 0.8895342946052551, "learning_rate": 0.00021346243273766028, "loss": 1.3033, "step": 2500 }, { "epoch": 0.16609131765778676, "grad_norm": 0.4100666046142578, "learning_rate": 0.00021263203348169803, "loss": 1.272, "step": 2550 }, { "epoch": 0.1693480101608806, "grad_norm": 0.8230715990066528, "learning_rate": 0.00021180163422573572, "loss": 1.2489, "step": 2600 }, { "epoch": 0.17260470266397446, "grad_norm": 0.4173627495765686, "learning_rate": 0.00021097123496977347, "loss": 1.0422, "step": 2650 }, { "epoch": 0.17586139516706834, "grad_norm": 0.9721052646636963, "learning_rate": 0.00021014083571381121, "loss": 1.4685, "step": 2700 }, { "epoch": 0.17911808767016218, "grad_norm": 0.8877786993980408, "learning_rate": 0.00020931043645784893, "loss": 1.3648, "step": 2750 }, { "epoch": 0.18237478017325603, "grad_norm": 0.9426137804985046, "learning_rate": 0.00020848003720188668, "loss": 1.3702, "step": 2800 }, { "epoch": 0.1856314726763499, "grad_norm": 0.8882205486297607, "learning_rate": 0.0002076496379459244, "loss": 1.121, "step": 2850 }, { "epoch": 0.18888816517944376, "grad_norm": 0.43097740411758423, "learning_rate": 0.00020681923868996212, "loss": 1.3049, "step": 2900 }, { "epoch": 0.1921448576825376, "grad_norm": 0.8727696537971497, "learning_rate": 0.00020598883943399987, "loss": 0.9122, "step": 2950 }, { "epoch": 0.19540155018563146, "grad_norm": 0.8810783624649048, "learning_rate": 0.00020515844017803762, "loss": 1.0816, "step": 3000 }, { "epoch": 0.19865824268872534, "grad_norm": 0.8670744299888611, "learning_rate": 0.00020432804092207534, "loss": 1.3254, "step": 3050 }, { "epoch": 0.2019149351918192, "grad_norm": 0.4958556294441223, "learning_rate": 0.00020349764166611306, "loss": 1.2428, "step": 3100 }, { "epoch": 0.20517162769491304, "grad_norm": 0.3851606845855713, "learning_rate": 0.0002026672424101508, "loss": 0.9055, "step": 3150 }, { "epoch": 0.20842832019800692, "grad_norm": 0.38893207907676697, "learning_rate": 0.00020183684315418855, "loss": 1.3058, "step": 3200 }, { "epoch": 0.21168501270110077, "grad_norm": 0.3959135413169861, "learning_rate": 0.00020100644389822627, "loss": 1.0508, "step": 3250 }, { "epoch": 0.21494170520419462, "grad_norm": 0.3882121443748474, "learning_rate": 0.000200176044642264, "loss": 1.2141, "step": 3300 }, { "epoch": 0.21819839770728847, "grad_norm": 0.5256990790367126, "learning_rate": 0.00019934564538630174, "loss": 1.3038, "step": 3350 }, { "epoch": 0.22145509021038234, "grad_norm": 0.8662853240966797, "learning_rate": 0.00019851524613033946, "loss": 0.9852, "step": 3400 }, { "epoch": 0.2247117827134762, "grad_norm": 0.38438016176223755, "learning_rate": 0.0001976848468743772, "loss": 1.2163, "step": 3450 }, { "epoch": 0.22796847521657004, "grad_norm": 0.6737759113311768, "learning_rate": 0.00019685444761841496, "loss": 1.0586, "step": 3500 }, { "epoch": 0.23122516771966392, "grad_norm": 0.933364987373352, "learning_rate": 0.00019602404836245265, "loss": 0.9754, "step": 3550 }, { "epoch": 0.23448186022275777, "grad_norm": 1.460632562637329, "learning_rate": 0.0001951936491064904, "loss": 1.0282, "step": 3600 }, { "epoch": 0.23773855272585162, "grad_norm": 0.8858475089073181, "learning_rate": 0.00019436324985052815, "loss": 1.308, "step": 3650 }, { "epoch": 0.24099524522894547, "grad_norm": 0.8820046186447144, "learning_rate": 0.00019353285059456587, "loss": 1.4124, "step": 3700 }, { "epoch": 0.24425193773203935, "grad_norm": 0.40998589992523193, "learning_rate": 0.00019270245133860362, "loss": 1.2425, "step": 3750 }, { "epoch": 0.2475086302351332, "grad_norm": 0.9782607555389404, "learning_rate": 0.00019187205208264134, "loss": 0.9811, "step": 3800 }, { "epoch": 0.25076532273822705, "grad_norm": 0.8702423572540283, "learning_rate": 0.00019104165282667906, "loss": 1.1369, "step": 3850 }, { "epoch": 0.2540220152413209, "grad_norm": 0.9311513304710388, "learning_rate": 0.0001902112535707168, "loss": 1.0685, "step": 3900 }, { "epoch": 0.25727870774441475, "grad_norm": 1.455741047859192, "learning_rate": 0.00018938085431475455, "loss": 1.2978, "step": 3950 }, { "epoch": 0.26053540024750865, "grad_norm": 0.39552852511405945, "learning_rate": 0.00018855045505879225, "loss": 1.1416, "step": 4000 }, { "epoch": 0.2637920927506025, "grad_norm": 0.8851374387741089, "learning_rate": 0.00018772005580283, "loss": 1.226, "step": 4050 }, { "epoch": 0.26704878525369635, "grad_norm": 0.3884711265563965, "learning_rate": 0.00018688965654686774, "loss": 1.5563, "step": 4100 }, { "epoch": 0.2703054777567902, "grad_norm": 0.8756555318832397, "learning_rate": 0.0001860592572909055, "loss": 1.1869, "step": 4150 }, { "epoch": 0.27356217025988405, "grad_norm": 0.828703761100769, "learning_rate": 0.0001852288580349432, "loss": 1.1819, "step": 4200 }, { "epoch": 0.2768188627629779, "grad_norm": 0.347127228975296, "learning_rate": 0.00018439845877898093, "loss": 1.0252, "step": 4250 }, { "epoch": 0.28007555526607175, "grad_norm": 0.8565844297409058, "learning_rate": 0.00018356805952301868, "loss": 1.2743, "step": 4300 }, { "epoch": 0.28333224776916566, "grad_norm": 0.9678516387939453, "learning_rate": 0.0001827376602670564, "loss": 1.3789, "step": 4350 }, { "epoch": 0.2865889402722595, "grad_norm": 0.8504292368888855, "learning_rate": 0.00018190726101109415, "loss": 1.2299, "step": 4400 }, { "epoch": 0.28984563277535336, "grad_norm": 0.42703336477279663, "learning_rate": 0.0001810768617551319, "loss": 1.047, "step": 4450 }, { "epoch": 0.2931023252784472, "grad_norm": 0.8038926720619202, "learning_rate": 0.00018024646249916959, "loss": 1.3346, "step": 4500 }, { "epoch": 0.29635901778154106, "grad_norm": 0.866212785243988, "learning_rate": 0.00017941606324320733, "loss": 1.1536, "step": 4550 }, { "epoch": 0.2996157102846349, "grad_norm": 0.8326146602630615, "learning_rate": 0.00017858566398724508, "loss": 0.9321, "step": 4600 }, { "epoch": 0.30287240278772876, "grad_norm": 0.36909860372543335, "learning_rate": 0.0001777552647312828, "loss": 0.8376, "step": 4650 }, { "epoch": 0.30612909529082266, "grad_norm": 0.8465257287025452, "learning_rate": 0.00017692486547532055, "loss": 1.0493, "step": 4700 }, { "epoch": 0.3093857877939165, "grad_norm": 0.3820868134498596, "learning_rate": 0.00017609446621935827, "loss": 0.9343, "step": 4750 }, { "epoch": 0.31264248029701036, "grad_norm": 0.8800477981567383, "learning_rate": 0.000175264066963396, "loss": 1.2047, "step": 4800 }, { "epoch": 0.3158991728001042, "grad_norm": 0.3992657959461212, "learning_rate": 0.00017443366770743374, "loss": 1.0367, "step": 4850 }, { "epoch": 0.31915586530319806, "grad_norm": 0.8573354482650757, "learning_rate": 0.00017360326845147149, "loss": 1.1821, "step": 4900 }, { "epoch": 0.3224125578062919, "grad_norm": 1.021317481994629, "learning_rate": 0.00017277286919550918, "loss": 1.2267, "step": 4950 }, { "epoch": 0.32566925030938576, "grad_norm": 0.8740228414535522, "learning_rate": 0.00017194246993954693, "loss": 0.9201, "step": 5000 }, { "epoch": 0.32892594281247967, "grad_norm": 0.9602545499801636, "learning_rate": 0.00017111207068358467, "loss": 0.9957, "step": 5050 }, { "epoch": 0.3321826353155735, "grad_norm": 1.4221771955490112, "learning_rate": 0.00017028167142762242, "loss": 1.3471, "step": 5100 }, { "epoch": 0.33543932781866737, "grad_norm": 0.8135348558425903, "learning_rate": 0.00016945127217166014, "loss": 1.0403, "step": 5150 }, { "epoch": 0.3386960203217612, "grad_norm": 0.8601133227348328, "learning_rate": 0.00016862087291569786, "loss": 1.2678, "step": 5200 }, { "epoch": 0.34195271282485507, "grad_norm": 0.8583667874336243, "learning_rate": 0.0001677904736597356, "loss": 1.0848, "step": 5250 }, { "epoch": 0.3452094053279489, "grad_norm": 1.00385320186615, "learning_rate": 0.00016696007440377333, "loss": 1.1022, "step": 5300 }, { "epoch": 0.3484660978310428, "grad_norm": 0.8718904852867126, "learning_rate": 0.00016612967514781108, "loss": 1.0018, "step": 5350 }, { "epoch": 0.35172279033413667, "grad_norm": 0.8922727108001709, "learning_rate": 0.00016529927589184883, "loss": 1.1681, "step": 5400 }, { "epoch": 0.3549794828372305, "grad_norm": 1.4163098335266113, "learning_rate": 0.00016446887663588652, "loss": 1.3161, "step": 5450 }, { "epoch": 0.35823617534032437, "grad_norm": 0.9237558841705322, "learning_rate": 0.00016363847737992427, "loss": 1.261, "step": 5500 }, { "epoch": 0.3614928678434182, "grad_norm": 0.982936680316925, "learning_rate": 0.00016280807812396202, "loss": 0.8599, "step": 5550 }, { "epoch": 0.36474956034651207, "grad_norm": 0.8180039525032043, "learning_rate": 0.00016197767886799974, "loss": 1.0713, "step": 5600 }, { "epoch": 0.3680062528496059, "grad_norm": 0.9373849034309387, "learning_rate": 0.00016114727961203748, "loss": 0.9283, "step": 5650 }, { "epoch": 0.3712629453526998, "grad_norm": 0.3987419605255127, "learning_rate": 0.0001603168803560752, "loss": 1.1252, "step": 5700 }, { "epoch": 0.3745196378557937, "grad_norm": 0.8707359433174133, "learning_rate": 0.00015948648110011292, "loss": 0.962, "step": 5750 }, { "epoch": 0.3777763303588875, "grad_norm": 0.46297964453697205, "learning_rate": 0.00015865608184415067, "loss": 1.1484, "step": 5800 }, { "epoch": 0.3810330228619814, "grad_norm": 0.8748087882995605, "learning_rate": 0.00015782568258818842, "loss": 0.9178, "step": 5850 }, { "epoch": 0.3842897153650752, "grad_norm": 0.405792772769928, "learning_rate": 0.0001569952833322261, "loss": 1.0671, "step": 5900 }, { "epoch": 0.3875464078681691, "grad_norm": 0.3936058282852173, "learning_rate": 0.00015616488407626386, "loss": 1.0665, "step": 5950 }, { "epoch": 0.3908031003712629, "grad_norm": 1.9616318941116333, "learning_rate": 0.0001553344848203016, "loss": 1.1888, "step": 6000 }, { "epoch": 0.39405979287435683, "grad_norm": 0.8864133954048157, "learning_rate": 0.00015450408556433933, "loss": 0.9978, "step": 6050 }, { "epoch": 0.3973164853774507, "grad_norm": 1.070251226425171, "learning_rate": 0.00015367368630837708, "loss": 0.9154, "step": 6100 }, { "epoch": 0.40057317788054453, "grad_norm": 0.9131673574447632, "learning_rate": 0.0001528432870524148, "loss": 1.5743, "step": 6150 }, { "epoch": 0.4038298703836384, "grad_norm": 0.72733473777771, "learning_rate": 0.00015201288779645254, "loss": 0.9825, "step": 6200 }, { "epoch": 0.40708656288673223, "grad_norm": 0.8344869613647461, "learning_rate": 0.00015118248854049027, "loss": 1.3235, "step": 6250 }, { "epoch": 0.4103432553898261, "grad_norm": 1.4349044561386108, "learning_rate": 0.000150352089284528, "loss": 0.9427, "step": 6300 }, { "epoch": 0.41359994789291993, "grad_norm": 1.4858365058898926, "learning_rate": 0.00014952169002856576, "loss": 1.1993, "step": 6350 }, { "epoch": 0.41685664039601383, "grad_norm": 0.8102648854255676, "learning_rate": 0.00014869129077260345, "loss": 1.0731, "step": 6400 }, { "epoch": 0.4201133328991077, "grad_norm": 0.8815705180168152, "learning_rate": 0.0001478608915166412, "loss": 1.1213, "step": 6450 }, { "epoch": 0.42337002540220153, "grad_norm": 0.415800005197525, "learning_rate": 0.00014703049226067895, "loss": 1.1725, "step": 6500 }, { "epoch": 0.4266267179052954, "grad_norm": 0.8518320322036743, "learning_rate": 0.00014620009300471667, "loss": 1.1265, "step": 6550 }, { "epoch": 0.42988341040838923, "grad_norm": 1.0205966234207153, "learning_rate": 0.00014536969374875442, "loss": 1.1185, "step": 6600 }, { "epoch": 0.4331401029114831, "grad_norm": 0.8417080640792847, "learning_rate": 0.00014453929449279214, "loss": 0.8096, "step": 6650 }, { "epoch": 0.43639679541457693, "grad_norm": 0.9479381442070007, "learning_rate": 0.00014370889523682986, "loss": 1.2273, "step": 6700 }, { "epoch": 0.43965348791767084, "grad_norm": 0.9140861630439758, "learning_rate": 0.0001428784959808676, "loss": 0.9376, "step": 6750 }, { "epoch": 0.4429101804207647, "grad_norm": 0.38954612612724304, "learning_rate": 0.00014204809672490535, "loss": 0.9415, "step": 6800 }, { "epoch": 0.44616687292385854, "grad_norm": 0.8957207202911377, "learning_rate": 0.00014121769746894305, "loss": 1.0328, "step": 6850 }, { "epoch": 0.4494235654269524, "grad_norm": 0.9086111187934875, "learning_rate": 0.0001403872982129808, "loss": 1.1913, "step": 6900 }, { "epoch": 0.45268025793004624, "grad_norm": 0.8815196752548218, "learning_rate": 0.00013955689895701854, "loss": 1.0778, "step": 6950 }, { "epoch": 0.4559369504331401, "grad_norm": 0.8498084545135498, "learning_rate": 0.00013872649970105626, "loss": 1.0653, "step": 7000 }, { "epoch": 0.45919364293623394, "grad_norm": 0.39098435640335083, "learning_rate": 0.000137896100445094, "loss": 1.2029, "step": 7050 }, { "epoch": 0.46245033543932784, "grad_norm": 1.4128884077072144, "learning_rate": 0.00013706570118913173, "loss": 1.0752, "step": 7100 }, { "epoch": 0.4657070279424217, "grad_norm": 0.3744083344936371, "learning_rate": 0.00013623530193316945, "loss": 1.2738, "step": 7150 }, { "epoch": 0.46896372044551554, "grad_norm": 0.8997066617012024, "learning_rate": 0.0001354049026772072, "loss": 1.2614, "step": 7200 }, { "epoch": 0.4722204129486094, "grad_norm": 0.9351083040237427, "learning_rate": 0.00013457450342124495, "loss": 1.0052, "step": 7250 }, { "epoch": 0.47547710545170324, "grad_norm": 0.8742404580116272, "learning_rate": 0.0001337441041652827, "loss": 1.0708, "step": 7300 }, { "epoch": 0.4787337979547971, "grad_norm": 0.9399948120117188, "learning_rate": 0.0001329137049093204, "loss": 0.8287, "step": 7350 }, { "epoch": 0.48199049045789094, "grad_norm": 0.8766932487487793, "learning_rate": 0.00013208330565335814, "loss": 0.7347, "step": 7400 }, { "epoch": 0.48524718296098485, "grad_norm": 0.9277735948562622, "learning_rate": 0.00013125290639739588, "loss": 0.8516, "step": 7450 }, { "epoch": 0.4885038754640787, "grad_norm": 0.34736573696136475, "learning_rate": 0.0001304225071414336, "loss": 1.1256, "step": 7500 }, { "epoch": 0.49176056796717255, "grad_norm": 0.9531852602958679, "learning_rate": 0.00012959210788547135, "loss": 0.9971, "step": 7550 }, { "epoch": 0.4950172604702664, "grad_norm": 0.4136572480201721, "learning_rate": 0.00012876170862950907, "loss": 1.0296, "step": 7600 }, { "epoch": 0.49827395297336025, "grad_norm": 0.8161017298698425, "learning_rate": 0.0001279313093735468, "loss": 1.0189, "step": 7650 }, { "epoch": 0.5015306454764541, "grad_norm": 0.22858373820781708, "learning_rate": 0.00012710091011758454, "loss": 0.9872, "step": 7700 }, { "epoch": 0.504787337979548, "grad_norm": 1.4660929441452026, "learning_rate": 0.0001262705108616223, "loss": 1.1404, "step": 7750 }, { "epoch": 0.5080440304826418, "grad_norm": 0.8792383074760437, "learning_rate": 0.00012544011160566, "loss": 1.335, "step": 7800 }, { "epoch": 0.5113007229857357, "grad_norm": 0.3488224744796753, "learning_rate": 0.00012460971234969776, "loss": 1.2426, "step": 7850 }, { "epoch": 0.5145574154888295, "grad_norm": 1.4280941486358643, "learning_rate": 0.00012377931309373548, "loss": 0.9777, "step": 7900 }, { "epoch": 0.5178141079919234, "grad_norm": 0.8988615274429321, "learning_rate": 0.0001229489138377732, "loss": 1.099, "step": 7950 }, { "epoch": 0.5210708004950173, "grad_norm": 0.395367830991745, "learning_rate": 0.00012211851458181094, "loss": 0.7406, "step": 8000 }, { "epoch": 0.5243274929981111, "grad_norm": 1.5085158348083496, "learning_rate": 0.00012128811532584868, "loss": 1.0318, "step": 8050 }, { "epoch": 0.527584185501205, "grad_norm": 0.9578683376312256, "learning_rate": 0.0001204577160698864, "loss": 0.8106, "step": 8100 }, { "epoch": 0.5308408780042988, "grad_norm": 0.41102516651153564, "learning_rate": 0.00011962731681392413, "loss": 1.0833, "step": 8150 }, { "epoch": 0.5340975705073927, "grad_norm": 0.9245774149894714, "learning_rate": 0.00011879691755796188, "loss": 1.0606, "step": 8200 }, { "epoch": 0.5373542630104865, "grad_norm": 1.5689363479614258, "learning_rate": 0.0001179665183019996, "loss": 0.854, "step": 8250 }, { "epoch": 0.5406109555135804, "grad_norm": 0.8547614812850952, "learning_rate": 0.00011713611904603734, "loss": 1.1463, "step": 8300 }, { "epoch": 0.5438676480166743, "grad_norm": 0.8573695421218872, "learning_rate": 0.00011630571979007507, "loss": 1.3493, "step": 8350 }, { "epoch": 0.5471243405197681, "grad_norm": 1.4601247310638428, "learning_rate": 0.0001154753205341128, "loss": 1.1454, "step": 8400 }, { "epoch": 0.550381033022862, "grad_norm": 1.4803024530410767, "learning_rate": 0.00011464492127815054, "loss": 1.0541, "step": 8450 }, { "epoch": 0.5536377255259558, "grad_norm": 0.9090033769607544, "learning_rate": 0.00011381452202218827, "loss": 0.8359, "step": 8500 }, { "epoch": 0.5568944180290497, "grad_norm": 0.8718037605285645, "learning_rate": 0.000112984122766226, "loss": 1.324, "step": 8550 }, { "epoch": 0.5601511105321435, "grad_norm": 0.41582703590393066, "learning_rate": 0.00011215372351026374, "loss": 0.8871, "step": 8600 }, { "epoch": 0.5634078030352374, "grad_norm": 1.0510010719299316, "learning_rate": 0.00011132332425430147, "loss": 1.003, "step": 8650 }, { "epoch": 0.5666644955383313, "grad_norm": 0.8834037780761719, "learning_rate": 0.0001104929249983392, "loss": 1.1137, "step": 8700 }, { "epoch": 0.5699211880414251, "grad_norm": 0.884861171245575, "learning_rate": 0.00010966252574237694, "loss": 1.0844, "step": 8750 }, { "epoch": 0.573177880544519, "grad_norm": 0.8952613472938538, "learning_rate": 0.00010883212648641468, "loss": 1.1878, "step": 8800 }, { "epoch": 0.5764345730476128, "grad_norm": 0.9889765977859497, "learning_rate": 0.0001080017272304524, "loss": 1.2236, "step": 8850 }, { "epoch": 0.5796912655507067, "grad_norm": 0.8893088698387146, "learning_rate": 0.00010717132797449014, "loss": 1.3734, "step": 8900 }, { "epoch": 0.5829479580538005, "grad_norm": 0.814263105392456, "learning_rate": 0.00010634092871852786, "loss": 1.0553, "step": 8950 }, { "epoch": 0.5862046505568944, "grad_norm": 0.3948083519935608, "learning_rate": 0.0001055105294625656, "loss": 1.3409, "step": 9000 }, { "epoch": 0.5894613430599883, "grad_norm": 0.39876481890678406, "learning_rate": 0.00010468013020660335, "loss": 0.9506, "step": 9050 }, { "epoch": 0.5927180355630821, "grad_norm": 0.9205226898193359, "learning_rate": 0.00010384973095064107, "loss": 1.2607, "step": 9100 }, { "epoch": 0.595974728066176, "grad_norm": 0.9102646708488464, "learning_rate": 0.00010301933169467881, "loss": 1.0062, "step": 9150 }, { "epoch": 0.5992314205692698, "grad_norm": 0.3873719274997711, "learning_rate": 0.00010218893243871654, "loss": 1.0456, "step": 9200 }, { "epoch": 0.6024881130723637, "grad_norm": 0.8773135542869568, "learning_rate": 0.00010135853318275427, "loss": 0.9987, "step": 9250 }, { "epoch": 0.6057448055754575, "grad_norm": 0.2187870293855667, "learning_rate": 0.000100528133926792, "loss": 0.9182, "step": 9300 }, { "epoch": 0.6090014980785514, "grad_norm": 0.990605354309082, "learning_rate": 9.969773467082974e-05, "loss": 1.0887, "step": 9350 }, { "epoch": 0.6122581905816453, "grad_norm": 1.0098599195480347, "learning_rate": 9.886733541486747e-05, "loss": 0.9684, "step": 9400 }, { "epoch": 0.6155148830847391, "grad_norm": 0.9786407947540283, "learning_rate": 9.80369361589052e-05, "loss": 1.0016, "step": 9450 }, { "epoch": 0.618771575587833, "grad_norm": 0.3889008164405823, "learning_rate": 9.720653690294294e-05, "loss": 1.0738, "step": 9500 }, { "epoch": 0.6220282680909268, "grad_norm": 1.005382776260376, "learning_rate": 9.637613764698066e-05, "loss": 1.0616, "step": 9550 }, { "epoch": 0.6252849605940207, "grad_norm": 0.7357724905014038, "learning_rate": 9.554573839101841e-05, "loss": 1.1784, "step": 9600 }, { "epoch": 0.6285416530971145, "grad_norm": 0.8405910134315491, "learning_rate": 9.471533913505613e-05, "loss": 1.1642, "step": 9650 }, { "epoch": 0.6317983456002084, "grad_norm": 0.9371947646141052, "learning_rate": 9.388493987909388e-05, "loss": 0.9979, "step": 9700 }, { "epoch": 0.6350550381033023, "grad_norm": 0.9320511817932129, "learning_rate": 9.305454062313161e-05, "loss": 1.2844, "step": 9750 }, { "epoch": 0.6383117306063961, "grad_norm": 0.3955633044242859, "learning_rate": 9.222414136716933e-05, "loss": 1.0699, "step": 9800 }, { "epoch": 0.64156842310949, "grad_norm": 0.3828829526901245, "learning_rate": 9.139374211120708e-05, "loss": 1.1274, "step": 9850 }, { "epoch": 0.6448251156125838, "grad_norm": 1.0724050998687744, "learning_rate": 9.05633428552448e-05, "loss": 1.0878, "step": 9900 }, { "epoch": 0.6480818081156777, "grad_norm": 1.4844028949737549, "learning_rate": 8.973294359928253e-05, "loss": 0.86, "step": 9950 }, { "epoch": 0.6513385006187715, "grad_norm": 0.9288794994354248, "learning_rate": 8.890254434332028e-05, "loss": 1.1705, "step": 10000 }, { "epoch": 0.6545951931218654, "grad_norm": 0.8749659657478333, "learning_rate": 8.8072145087358e-05, "loss": 0.8074, "step": 10050 }, { "epoch": 0.6578518856249593, "grad_norm": 1.0020465850830078, "learning_rate": 8.724174583139573e-05, "loss": 0.8711, "step": 10100 }, { "epoch": 0.6611085781280531, "grad_norm": 1.0365781784057617, "learning_rate": 8.641134657543347e-05, "loss": 1.1082, "step": 10150 }, { "epoch": 0.664365270631147, "grad_norm": 1.534722924232483, "learning_rate": 8.55809473194712e-05, "loss": 0.9426, "step": 10200 }, { "epoch": 0.6676219631342408, "grad_norm": 0.3390073776245117, "learning_rate": 8.475054806350894e-05, "loss": 0.9053, "step": 10250 }, { "epoch": 0.6708786556373347, "grad_norm": 0.38182663917541504, "learning_rate": 8.392014880754667e-05, "loss": 0.901, "step": 10300 }, { "epoch": 0.6741353481404285, "grad_norm": 0.8459188938140869, "learning_rate": 8.30897495515844e-05, "loss": 0.9118, "step": 10350 }, { "epoch": 0.6773920406435224, "grad_norm": 0.867975115776062, "learning_rate": 8.225935029562214e-05, "loss": 0.7572, "step": 10400 }, { "epoch": 0.6806487331466163, "grad_norm": 0.8717624545097351, "learning_rate": 8.142895103965987e-05, "loss": 0.9569, "step": 10450 }, { "epoch": 0.6839054256497101, "grad_norm": 0.8374590277671814, "learning_rate": 8.05985517836976e-05, "loss": 1.1444, "step": 10500 }, { "epoch": 0.687162118152804, "grad_norm": 1.5119789838790894, "learning_rate": 7.976815252773534e-05, "loss": 1.2996, "step": 10550 }, { "epoch": 0.6904188106558978, "grad_norm": 0.8487447500228882, "learning_rate": 7.893775327177306e-05, "loss": 0.9225, "step": 10600 }, { "epoch": 0.6936755031589917, "grad_norm": 0.9299342632293701, "learning_rate": 7.81073540158108e-05, "loss": 0.8821, "step": 10650 }, { "epoch": 0.6969321956620856, "grad_norm": 0.9675909280776978, "learning_rate": 7.727695475984854e-05, "loss": 0.9443, "step": 10700 }, { "epoch": 0.7001888881651794, "grad_norm": 0.8528934717178345, "learning_rate": 7.644655550388626e-05, "loss": 1.1312, "step": 10750 }, { "epoch": 0.7034455806682733, "grad_norm": 0.9038677215576172, "learning_rate": 7.561615624792401e-05, "loss": 1.3058, "step": 10800 }, { "epoch": 0.7067022731713671, "grad_norm": 0.8895506858825684, "learning_rate": 7.478575699196173e-05, "loss": 0.9965, "step": 10850 }, { "epoch": 0.709958965674461, "grad_norm": 0.3630858063697815, "learning_rate": 7.395535773599947e-05, "loss": 0.852, "step": 10900 }, { "epoch": 0.7132156581775548, "grad_norm": 0.9154430627822876, "learning_rate": 7.312495848003721e-05, "loss": 1.0899, "step": 10950 }, { "epoch": 0.7164723506806487, "grad_norm": 0.38302820920944214, "learning_rate": 7.229455922407493e-05, "loss": 0.8218, "step": 11000 }, { "epoch": 0.7197290431837426, "grad_norm": 0.9081019759178162, "learning_rate": 7.146415996811267e-05, "loss": 0.9718, "step": 11050 }, { "epoch": 0.7229857356868364, "grad_norm": 0.3805684745311737, "learning_rate": 7.06337607121504e-05, "loss": 0.7987, "step": 11100 }, { "epoch": 0.7262424281899303, "grad_norm": 0.9899412989616394, "learning_rate": 6.980336145618814e-05, "loss": 0.7598, "step": 11150 }, { "epoch": 0.7294991206930241, "grad_norm": 0.8916687369346619, "learning_rate": 6.897296220022586e-05, "loss": 1.031, "step": 11200 }, { "epoch": 0.732755813196118, "grad_norm": 0.8672785758972168, "learning_rate": 6.81425629442636e-05, "loss": 1.025, "step": 11250 }, { "epoch": 0.7360125056992118, "grad_norm": 0.8024415969848633, "learning_rate": 6.731216368830134e-05, "loss": 1.0707, "step": 11300 }, { "epoch": 0.7392691982023057, "grad_norm": 1.065266728401184, "learning_rate": 6.648176443233907e-05, "loss": 1.1815, "step": 11350 }, { "epoch": 0.7425258907053996, "grad_norm": 0.34304752945899963, "learning_rate": 6.565136517637681e-05, "loss": 1.14, "step": 11400 }, { "epoch": 0.7457825832084934, "grad_norm": 0.8879311680793762, "learning_rate": 6.482096592041453e-05, "loss": 0.962, "step": 11450 }, { "epoch": 0.7490392757115873, "grad_norm": 1.5473666191101074, "learning_rate": 6.399056666445228e-05, "loss": 1.0096, "step": 11500 }, { "epoch": 0.7522959682146811, "grad_norm": 0.40275564789772034, "learning_rate": 6.316016740849001e-05, "loss": 0.9501, "step": 11550 }, { "epoch": 0.755552660717775, "grad_norm": 1.0201236009597778, "learning_rate": 6.232976815252773e-05, "loss": 0.9252, "step": 11600 }, { "epoch": 0.7588093532208688, "grad_norm": 0.8717750906944275, "learning_rate": 6.149936889656546e-05, "loss": 1.0638, "step": 11650 }, { "epoch": 0.7620660457239627, "grad_norm": 0.916990339756012, "learning_rate": 6.06689696406032e-05, "loss": 1.1011, "step": 11700 }, { "epoch": 0.7653227382270567, "grad_norm": 0.3874845802783966, "learning_rate": 5.983857038464094e-05, "loss": 1.1084, "step": 11750 }, { "epoch": 0.7685794307301504, "grad_norm": 0.9004361629486084, "learning_rate": 5.900817112867867e-05, "loss": 0.8258, "step": 11800 }, { "epoch": 0.7718361232332444, "grad_norm": 1.5847445726394653, "learning_rate": 5.81777718727164e-05, "loss": 0.9595, "step": 11850 }, { "epoch": 0.7750928157363381, "grad_norm": 1.475354552268982, "learning_rate": 5.7347372616754134e-05, "loss": 1.0662, "step": 11900 }, { "epoch": 0.778349508239432, "grad_norm": 0.8929204344749451, "learning_rate": 5.651697336079187e-05, "loss": 1.0709, "step": 11950 }, { "epoch": 0.7816062007425258, "grad_norm": 0.3107670545578003, "learning_rate": 5.56865741048296e-05, "loss": 1.0773, "step": 12000 }, { "epoch": 0.7848628932456198, "grad_norm": 0.8564940094947815, "learning_rate": 5.485617484886734e-05, "loss": 0.9426, "step": 12050 }, { "epoch": 0.7881195857487137, "grad_norm": 0.4336640536785126, "learning_rate": 5.402577559290507e-05, "loss": 0.9225, "step": 12100 }, { "epoch": 0.7913762782518075, "grad_norm": 1.5745172500610352, "learning_rate": 5.3195376336942805e-05, "loss": 1.0294, "step": 12150 }, { "epoch": 0.7946329707549014, "grad_norm": 0.3745453655719757, "learning_rate": 5.236497708098054e-05, "loss": 0.9682, "step": 12200 }, { "epoch": 0.7978896632579952, "grad_norm": 0.926125168800354, "learning_rate": 5.1534577825018266e-05, "loss": 1.0388, "step": 12250 }, { "epoch": 0.8011463557610891, "grad_norm": 0.38713493943214417, "learning_rate": 5.0704178569056e-05, "loss": 1.1054, "step": 12300 }, { "epoch": 0.8044030482641829, "grad_norm": 0.37707558274269104, "learning_rate": 4.987377931309374e-05, "loss": 1.2333, "step": 12350 }, { "epoch": 0.8076597407672768, "grad_norm": 0.9945393204689026, "learning_rate": 4.904338005713147e-05, "loss": 0.9781, "step": 12400 }, { "epoch": 0.8109164332703707, "grad_norm": 0.3638940453529358, "learning_rate": 4.82129808011692e-05, "loss": 1.0137, "step": 12450 }, { "epoch": 0.8141731257734645, "grad_norm": 0.42148882150650024, "learning_rate": 4.7382581545206937e-05, "loss": 1.0074, "step": 12500 }, { "epoch": 0.8174298182765584, "grad_norm": 0.9623544812202454, "learning_rate": 4.655218228924467e-05, "loss": 1.0196, "step": 12550 }, { "epoch": 0.8206865107796522, "grad_norm": 1.0282769203186035, "learning_rate": 4.57217830332824e-05, "loss": 1.0768, "step": 12600 }, { "epoch": 0.8239432032827461, "grad_norm": 0.8840997815132141, "learning_rate": 4.489138377732014e-05, "loss": 0.8632, "step": 12650 }, { "epoch": 0.8271998957858399, "grad_norm": 1.0058900117874146, "learning_rate": 4.406098452135787e-05, "loss": 1.1787, "step": 12700 }, { "epoch": 0.8304565882889338, "grad_norm": 0.9101676344871521, "learning_rate": 4.323058526539561e-05, "loss": 0.8222, "step": 12750 }, { "epoch": 0.8337132807920277, "grad_norm": 2.6229491233825684, "learning_rate": 4.2400186009433334e-05, "loss": 1.1567, "step": 12800 }, { "epoch": 0.8369699732951215, "grad_norm": 0.35638782382011414, "learning_rate": 4.156978675347107e-05, "loss": 1.0064, "step": 12850 }, { "epoch": 0.8402266657982154, "grad_norm": 1.0537761449813843, "learning_rate": 4.07393874975088e-05, "loss": 0.8113, "step": 12900 }, { "epoch": 0.8434833583013092, "grad_norm": 0.9127081632614136, "learning_rate": 3.990898824154653e-05, "loss": 0.9494, "step": 12950 }, { "epoch": 0.8467400508044031, "grad_norm": 0.8901873230934143, "learning_rate": 3.907858898558427e-05, "loss": 0.8887, "step": 13000 }, { "epoch": 0.8499967433074969, "grad_norm": 1.5736379623413086, "learning_rate": 3.8248189729622005e-05, "loss": 0.8705, "step": 13050 }, { "epoch": 0.8532534358105908, "grad_norm": 0.8398550748825073, "learning_rate": 3.741779047365974e-05, "loss": 0.9059, "step": 13100 }, { "epoch": 0.8565101283136847, "grad_norm": 0.4221436381340027, "learning_rate": 3.6587391217697466e-05, "loss": 0.9535, "step": 13150 }, { "epoch": 0.8597668208167785, "grad_norm": 0.9178587794303894, "learning_rate": 3.57569919617352e-05, "loss": 0.9296, "step": 13200 }, { "epoch": 0.8630235133198724, "grad_norm": 0.9222294688224792, "learning_rate": 3.4926592705772934e-05, "loss": 0.7618, "step": 13250 }, { "epoch": 0.8662802058229662, "grad_norm": 0.9021453857421875, "learning_rate": 3.4096193449810675e-05, "loss": 0.9893, "step": 13300 }, { "epoch": 0.8695368983260601, "grad_norm": 1.5189154148101807, "learning_rate": 3.32657941938484e-05, "loss": 1.1592, "step": 13350 }, { "epoch": 0.8727935908291539, "grad_norm": 0.38935962319374084, "learning_rate": 3.2435394937886136e-05, "loss": 1.206, "step": 13400 }, { "epoch": 0.8760502833322478, "grad_norm": 0.8991114497184753, "learning_rate": 3.160499568192387e-05, "loss": 0.9909, "step": 13450 }, { "epoch": 0.8793069758353417, "grad_norm": 1.0564295053482056, "learning_rate": 3.0774596425961604e-05, "loss": 1.0172, "step": 13500 }, { "epoch": 0.8825636683384355, "grad_norm": 0.8721094131469727, "learning_rate": 2.994419716999934e-05, "loss": 1.1926, "step": 13550 }, { "epoch": 0.8858203608415294, "grad_norm": 1.5451397895812988, "learning_rate": 2.911379791403707e-05, "loss": 0.7894, "step": 13600 }, { "epoch": 0.8890770533446232, "grad_norm": 0.902381956577301, "learning_rate": 2.82833986580748e-05, "loss": 0.9943, "step": 13650 }, { "epoch": 0.8923337458477171, "grad_norm": 0.4030613899230957, "learning_rate": 2.7452999402112537e-05, "loss": 1.0051, "step": 13700 }, { "epoch": 0.8955904383508109, "grad_norm": 1.0364080667495728, "learning_rate": 2.6622600146150268e-05, "loss": 1.147, "step": 13750 }, { "epoch": 0.8988471308539048, "grad_norm": 1.012832760810852, "learning_rate": 2.5792200890188002e-05, "loss": 1.02, "step": 13800 }, { "epoch": 0.9021038233569987, "grad_norm": 0.872316837310791, "learning_rate": 2.4961801634225736e-05, "loss": 1.1203, "step": 13850 }, { "epoch": 0.9053605158600925, "grad_norm": 0.8862074613571167, "learning_rate": 2.413140237826347e-05, "loss": 0.7, "step": 13900 }, { "epoch": 0.9086172083631864, "grad_norm": 1.6284419298171997, "learning_rate": 2.33010031223012e-05, "loss": 0.9277, "step": 13950 }, { "epoch": 0.9118739008662802, "grad_norm": 1.1000895500183105, "learning_rate": 2.247060386633894e-05, "loss": 0.9749, "step": 14000 }, { "epoch": 0.9151305933693741, "grad_norm": 0.9143940210342407, "learning_rate": 2.164020461037667e-05, "loss": 0.8839, "step": 14050 }, { "epoch": 0.9183872858724679, "grad_norm": 0.9441725611686707, "learning_rate": 2.0809805354414403e-05, "loss": 0.9014, "step": 14100 }, { "epoch": 0.9216439783755618, "grad_norm": 0.8585488200187683, "learning_rate": 1.9979406098452137e-05, "loss": 0.9963, "step": 14150 }, { "epoch": 0.9249006708786557, "grad_norm": 0.9217290878295898, "learning_rate": 1.9149006842489868e-05, "loss": 0.977, "step": 14200 }, { "epoch": 0.9281573633817495, "grad_norm": 1.580613136291504, "learning_rate": 1.8318607586527602e-05, "loss": 0.9786, "step": 14250 }, { "epoch": 0.9314140558848434, "grad_norm": 0.4184972941875458, "learning_rate": 1.7488208330565336e-05, "loss": 1.1275, "step": 14300 }, { "epoch": 0.9346707483879372, "grad_norm": 1.5584691762924194, "learning_rate": 1.665780907460307e-05, "loss": 1.0094, "step": 14350 }, { "epoch": 0.9379274408910311, "grad_norm": 0.9363598227500916, "learning_rate": 1.58274098186408e-05, "loss": 1.0117, "step": 14400 }, { "epoch": 0.9411841333941249, "grad_norm": 0.8482846617698669, "learning_rate": 1.4997010562678537e-05, "loss": 0.94, "step": 14450 }, { "epoch": 0.9444408258972188, "grad_norm": 0.947212815284729, "learning_rate": 1.4166611306716269e-05, "loss": 1.0283, "step": 14500 }, { "epoch": 0.9476975184003127, "grad_norm": 1.6431479454040527, "learning_rate": 1.3336212050754001e-05, "loss": 1.0093, "step": 14550 }, { "epoch": 0.9509542109034065, "grad_norm": 0.9370154142379761, "learning_rate": 1.2505812794791735e-05, "loss": 0.7167, "step": 14600 }, { "epoch": 0.9542109034065004, "grad_norm": 1.5658189058303833, "learning_rate": 1.167541353882947e-05, "loss": 0.826, "step": 14650 }, { "epoch": 0.9574675959095942, "grad_norm": 1.0563361644744873, "learning_rate": 1.0845014282867202e-05, "loss": 0.9998, "step": 14700 }, { "epoch": 0.9607242884126881, "grad_norm": 1.5338733196258545, "learning_rate": 1.0014615026904936e-05, "loss": 1.3073, "step": 14750 }, { "epoch": 0.9639809809157819, "grad_norm": 0.43706658482551575, "learning_rate": 9.18421577094267e-06, "loss": 0.7607, "step": 14800 }, { "epoch": 0.9672376734188758, "grad_norm": 0.8777522444725037, "learning_rate": 8.353816514980402e-06, "loss": 0.8597, "step": 14850 }, { "epoch": 0.9704943659219697, "grad_norm": 0.9181737899780273, "learning_rate": 7.5234172590181365e-06, "loss": 1.0577, "step": 14900 }, { "epoch": 0.9737510584250635, "grad_norm": 0.9147255420684814, "learning_rate": 6.693018003055869e-06, "loss": 0.8303, "step": 14950 }, { "epoch": 0.9770077509281574, "grad_norm": 0.32864147424697876, "learning_rate": 5.862618747093602e-06, "loss": 0.9726, "step": 15000 }, { "epoch": 0.9802644434312512, "grad_norm": 0.8894370794296265, "learning_rate": 5.032219491131336e-06, "loss": 1.0291, "step": 15050 }, { "epoch": 0.9835211359343451, "grad_norm": 0.9387512803077698, "learning_rate": 4.2018202351690694e-06, "loss": 0.941, "step": 15100 }, { "epoch": 0.9867778284374389, "grad_norm": 1.0713492631912231, "learning_rate": 3.3714209792068027e-06, "loss": 0.9106, "step": 15150 }, { "epoch": 0.9900345209405328, "grad_norm": 0.4232991635799408, "learning_rate": 2.541021723244536e-06, "loss": 0.6604, "step": 15200 }, { "epoch": 0.9932912134436267, "grad_norm": 1.5282920598983765, "learning_rate": 1.7106224672822693e-06, "loss": 1.1954, "step": 15250 }, { "epoch": 0.9965479059467205, "grad_norm": 0.9117501378059387, "learning_rate": 8.802232113200027e-07, "loss": 1.0523, "step": 15300 }, { "epoch": 0.9998045984498144, "grad_norm": 0.8948280215263367, "learning_rate": 4.9823955357736e-08, "loss": 0.6739, "step": 15350 } ], "logging_steps": 50, "max_steps": 15353, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }