{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 10665,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.015002344116268168,
      "grad_norm": 13.25,
      "learning_rate": 2.905342080599813e-07,
      "loss": 0.8909,
      "step": 32
    },
    {
      "epoch": 0.030004688232536336,
      "grad_norm": 14.25,
      "learning_rate": 5.904404873477039e-07,
      "loss": 0.8198,
      "step": 64
    },
    {
      "epoch": 0.0450070323488045,
      "grad_norm": 13.0625,
      "learning_rate": 8.903467666354265e-07,
      "loss": 0.7483,
      "step": 96
    },
    {
      "epoch": 0.06000937646507267,
      "grad_norm": 14.25,
      "learning_rate": 1.1902530459231491e-06,
      "loss": 0.8148,
      "step": 128
    },
    {
      "epoch": 0.07501172058134084,
      "grad_norm": 7.375,
      "learning_rate": 1.4901593252108717e-06,
      "loss": 0.7295,
      "step": 160
    },
    {
      "epoch": 0.090014064697609,
      "grad_norm": 14.75,
      "learning_rate": 1.7900656044985943e-06,
      "loss": 0.692,
      "step": 192
    },
    {
      "epoch": 0.10501640881387717,
      "grad_norm": 7.9375,
      "learning_rate": 2.089971883786317e-06,
      "loss": 0.656,
      "step": 224
    },
    {
      "epoch": 0.12001875293014534,
      "grad_norm": 11.5,
      "learning_rate": 2.3898781630740394e-06,
      "loss": 0.5897,
      "step": 256
    },
    {
      "epoch": 0.1350210970464135,
      "grad_norm": 12.6875,
      "learning_rate": 2.689784442361762e-06,
      "loss": 0.5782,
      "step": 288
    },
    {
      "epoch": 0.15002344116268168,
      "grad_norm": 7.5,
      "learning_rate": 2.9896907216494846e-06,
      "loss": 0.5428,
      "step": 320
    },
    {
      "epoch": 0.16502578527894984,
      "grad_norm": 10.0625,
      "learning_rate": 3.2895970009372076e-06,
      "loss": 0.5027,
      "step": 352
    },
    {
      "epoch": 0.180028129395218,
      "grad_norm": 9.625,
      "learning_rate": 3.58950328022493e-06,
      "loss": 0.4795,
      "step": 384
    },
    {
      "epoch": 0.19503047351148617,
      "grad_norm": 9.375,
      "learning_rate": 3.889409559512652e-06,
      "loss": 0.4694,
      "step": 416
    },
    {
      "epoch": 0.21003281762775433,
      "grad_norm": 11.3125,
      "learning_rate": 4.189315838800375e-06,
      "loss": 0.4382,
      "step": 448
    },
    {
      "epoch": 0.2250351617440225,
      "grad_norm": 11.5625,
      "learning_rate": 4.489222118088098e-06,
      "loss": 0.4647,
      "step": 480
    },
    {
      "epoch": 0.24003750586029068,
      "grad_norm": 8.375,
      "learning_rate": 4.789128397375821e-06,
      "loss": 0.4764,
      "step": 512
    },
    {
      "epoch": 0.2550398499765588,
      "grad_norm": 13.25,
      "learning_rate": 5.0890346766635435e-06,
      "loss": 0.4333,
      "step": 544
    },
    {
      "epoch": 0.270042194092827,
      "grad_norm": 9.1875,
      "learning_rate": 5.388940955951266e-06,
      "loss": 0.4481,
      "step": 576
    },
    {
      "epoch": 0.28504453820909514,
      "grad_norm": 13.1875,
      "learning_rate": 5.688847235238988e-06,
      "loss": 0.4314,
      "step": 608
    },
    {
      "epoch": 0.30004688232536336,
      "grad_norm": 12.625,
      "learning_rate": 5.98875351452671e-06,
      "loss": 0.3941,
      "step": 640
    },
    {
      "epoch": 0.3150492264416315,
      "grad_norm": 6.59375,
      "learning_rate": 6.288659793814433e-06,
      "loss": 0.3834,
      "step": 672
    },
    {
      "epoch": 0.3300515705578997,
      "grad_norm": 8.625,
      "learning_rate": 6.588566073102156e-06,
      "loss": 0.3548,
      "step": 704
    },
    {
      "epoch": 0.34505391467416785,
      "grad_norm": 8.6875,
      "learning_rate": 6.888472352389879e-06,
      "loss": 0.3715,
      "step": 736
    },
    {
      "epoch": 0.360056258790436,
      "grad_norm": 11.8125,
      "learning_rate": 7.1883786316776015e-06,
      "loss": 0.3073,
      "step": 768
    },
    {
      "epoch": 0.3750586029067042,
      "grad_norm": 16.25,
      "learning_rate": 7.488284910965324e-06,
      "loss": 0.3536,
      "step": 800
    },
    {
      "epoch": 0.39006094702297234,
      "grad_norm": 9.125,
      "learning_rate": 7.788191190253046e-06,
      "loss": 0.3526,
      "step": 832
    },
    {
      "epoch": 0.4050632911392405,
      "grad_norm": 13.375,
      "learning_rate": 8.08809746954077e-06,
      "loss": 0.3374,
      "step": 864
    },
    {
      "epoch": 0.42006563525550866,
      "grad_norm": 8.9375,
      "learning_rate": 8.388003748828491e-06,
      "loss": 0.3048,
      "step": 896
    },
    {
      "epoch": 0.4350679793717768,
      "grad_norm": 10.625,
      "learning_rate": 8.687910028116214e-06,
      "loss": 0.3356,
      "step": 928
    },
    {
      "epoch": 0.450070323488045,
      "grad_norm": 19.75,
      "learning_rate": 8.987816307403938e-06,
      "loss": 0.3006,
      "step": 960
    },
    {
      "epoch": 0.46507266760431315,
      "grad_norm": 11.625,
      "learning_rate": 9.28772258669166e-06,
      "loss": 0.2912,
      "step": 992
    },
    {
      "epoch": 0.48007501172058137,
      "grad_norm": 10.625,
      "learning_rate": 9.587628865979383e-06,
      "loss": 0.3286,
      "step": 1024
    },
    {
      "epoch": 0.49507735583684953,
      "grad_norm": 8.4375,
      "learning_rate": 9.887535145267105e-06,
      "loss": 0.2886,
      "step": 1056
    },
    {
      "epoch": 0.5100796999531176,
      "grad_norm": 8.9375,
      "learning_rate": 9.999892863685326e-06,
      "loss": 0.2993,
      "step": 1088
    },
    {
      "epoch": 0.5250820440693859,
      "grad_norm": 9.375,
      "learning_rate": 9.999275773410506e-06,
      "loss": 0.2741,
      "step": 1120
    },
    {
      "epoch": 0.540084388185654,
      "grad_norm": 7.65625,
      "learning_rate": 9.998110227713216e-06,
      "loss": 0.3421,
      "step": 1152
    },
    {
      "epoch": 0.5550867323019222,
      "grad_norm": 23.375,
      "learning_rate": 9.996396354461945e-06,
      "loss": 0.3402,
      "step": 1184
    },
    {
      "epoch": 0.5700890764181903,
      "grad_norm": 9.5625,
      "learning_rate": 9.994134341680546e-06,
      "loss": 0.3022,
      "step": 1216
    },
    {
      "epoch": 0.5850914205344585,
      "grad_norm": 14.6875,
      "learning_rate": 9.991324437527599e-06,
      "loss": 0.3044,
      "step": 1248
    },
    {
      "epoch": 0.6000937646507267,
      "grad_norm": 11.1875,
      "learning_rate": 9.987966950269184e-06,
      "loss": 0.3214,
      "step": 1280
    },
    {
      "epoch": 0.6150961087669948,
      "grad_norm": 12.0,
      "learning_rate": 9.984062248245078e-06,
      "loss": 0.3197,
      "step": 1312
    },
    {
      "epoch": 0.630098452883263,
      "grad_norm": 11.5,
      "learning_rate": 9.979610759828324e-06,
      "loss": 0.2518,
      "step": 1344
    },
    {
      "epoch": 0.6451007969995312,
      "grad_norm": 10.8125,
      "learning_rate": 9.974612973378252e-06,
      "loss": 0.3286,
      "step": 1376
    },
    {
      "epoch": 0.6601031411157994,
      "grad_norm": 11.8125,
      "learning_rate": 9.969069437186899e-06,
      "loss": 0.3097,
      "step": 1408
    },
    {
      "epoch": 0.6751054852320675,
      "grad_norm": 9.875,
      "learning_rate": 9.962980759418844e-06,
      "loss": 0.2941,
      "step": 1440
    },
    {
      "epoch": 0.6901078293483357,
      "grad_norm": 10.3125,
      "learning_rate": 9.956347608044512e-06,
      "loss": 0.308,
      "step": 1472
    },
    {
      "epoch": 0.7051101734646038,
      "grad_norm": 12.3125,
      "learning_rate": 9.949170710766875e-06,
      "loss": 0.2987,
      "step": 1504
    },
    {
      "epoch": 0.720112517580872,
      "grad_norm": 9.75,
      "learning_rate": 9.94145085494162e-06,
      "loss": 0.2916,
      "step": 1536
    },
    {
      "epoch": 0.7351148616971401,
      "grad_norm": 6.75,
      "learning_rate": 9.933188887490784e-06,
      "loss": 0.2931,
      "step": 1568
    },
    {
      "epoch": 0.7501172058134083,
      "grad_norm": 12.125,
      "learning_rate": 9.924385714809818e-06,
      "loss": 0.3164,
      "step": 1600
    },
    {
      "epoch": 0.7651195499296765,
      "grad_norm": 7.65625,
      "learning_rate": 9.91504230266817e-06,
      "loss": 0.2986,
      "step": 1632
    },
    {
      "epoch": 0.7801218940459447,
      "grad_norm": 13.3125,
      "learning_rate": 9.905159676103322e-06,
      "loss": 0.2648,
      "step": 1664
    },
    {
      "epoch": 0.7951242381622129,
      "grad_norm": 8.5,
      "learning_rate": 9.89473891930834e-06,
      "loss": 0.291,
      "step": 1696
    },
    {
      "epoch": 0.810126582278481,
      "grad_norm": 9.125,
      "learning_rate": 9.88378117551293e-06,
      "loss": 0.2966,
      "step": 1728
    },
    {
      "epoch": 0.8251289263947492,
      "grad_norm": 12.1875,
      "learning_rate": 9.872287646858015e-06,
      "loss": 0.2927,
      "step": 1760
    },
    {
      "epoch": 0.8401312705110173,
      "grad_norm": 12.875,
      "learning_rate": 9.860259594263858e-06,
      "loss": 0.2829,
      "step": 1792
    },
    {
      "epoch": 0.8551336146272855,
      "grad_norm": 11.3125,
      "learning_rate": 9.847698337291725e-06,
      "loss": 0.2519,
      "step": 1824
    },
    {
      "epoch": 0.8701359587435537,
      "grad_norm": 10.375,
      "learning_rate": 9.834605253999119e-06,
      "loss": 0.2922,
      "step": 1856
    },
    {
      "epoch": 0.8851383028598219,
      "grad_norm": 13.125,
      "learning_rate": 9.820981780788604e-06,
      "loss": 0.2954,
      "step": 1888
    },
    {
      "epoch": 0.90014064697609,
      "grad_norm": 8.875,
      "learning_rate": 9.806829412250215e-06,
      "loss": 0.3013,
      "step": 1920
    },
    {
      "epoch": 0.9151429910923582,
      "grad_norm": 7.96875,
      "learning_rate": 9.792149700997492e-06,
      "loss": 0.284,
      "step": 1952
    },
    {
      "epoch": 0.9301453352086263,
      "grad_norm": 10.3125,
      "learning_rate": 9.776944257497157e-06,
      "loss": 0.3089,
      "step": 1984
    },
    {
      "epoch": 0.9451476793248945,
      "grad_norm": 13.4375,
      "learning_rate": 9.761214749892411e-06,
      "loss": 0.3033,
      "step": 2016
    },
    {
      "epoch": 0.9601500234411627,
      "grad_norm": 8.25,
      "learning_rate": 9.74496290381996e-06,
      "loss": 0.3033,
      "step": 2048
    },
    {
      "epoch": 0.9751523675574308,
      "grad_norm": 7.4375,
      "learning_rate": 9.728190502220673e-06,
      "loss": 0.3294,
      "step": 2080
    },
    {
      "epoch": 0.9901547116736991,
      "grad_norm": 10.625,
      "learning_rate": 9.710899385143993e-06,
      "loss": 0.306,
      "step": 2112
    },
    {
      "epoch": 1.0051570557899672,
      "grad_norm": 6.1875,
      "learning_rate": 9.693091449546068e-06,
      "loss": 0.2592,
      "step": 2144
    },
    {
      "epoch": 1.0201593999062353,
      "grad_norm": 13.5625,
      "learning_rate": 9.674768649081647e-06,
      "loss": 0.2325,
      "step": 2176
    },
    {
      "epoch": 1.0351617440225036,
      "grad_norm": 14.1875,
      "learning_rate": 9.655932993889742e-06,
      "loss": 0.2529,
      "step": 2208
    },
    {
      "epoch": 1.0501640881387717,
      "grad_norm": 10.0,
      "learning_rate": 9.636586550373105e-06,
      "loss": 0.2018,
      "step": 2240
    },
    {
      "epoch": 1.0651664322550398,
      "grad_norm": 14.6875,
      "learning_rate": 9.616731440971536e-06,
      "loss": 0.2427,
      "step": 2272
    },
    {
      "epoch": 1.080168776371308,
      "grad_norm": 14.0625,
      "learning_rate": 9.596369843929022e-06,
      "loss": 0.2289,
      "step": 2304
    },
    {
      "epoch": 1.0951711204875763,
      "grad_norm": 8.1875,
      "learning_rate": 9.575503993054787e-06,
      "loss": 0.2156,
      "step": 2336
    },
    {
      "epoch": 1.1101734646038444,
      "grad_norm": 8.5,
      "learning_rate": 9.554136177478206e-06,
      "loss": 0.2186,
      "step": 2368
    },
    {
      "epoch": 1.1251758087201125,
      "grad_norm": 11.3125,
      "learning_rate": 9.532268741397692e-06,
      "loss": 0.2451,
      "step": 2400
    },
    {
      "epoch": 1.1401781528363806,
      "grad_norm": 8.25,
      "learning_rate": 9.50990408382351e-06,
      "loss": 0.2421,
      "step": 2432
    },
    {
      "epoch": 1.155180496952649,
      "grad_norm": 7.59375,
      "learning_rate": 9.487044658314585e-06,
      "loss": 0.2165,
      "step": 2464
    },
    {
      "epoch": 1.170182841068917,
      "grad_norm": 7.75,
      "learning_rate": 9.463692972709349e-06,
      "loss": 0.2326,
      "step": 2496
    },
    {
      "epoch": 1.1851851851851851,
      "grad_norm": 10.875,
      "learning_rate": 9.439851588850586e-06,
      "loss": 0.2585,
      "step": 2528
    },
    {
      "epoch": 1.2001875293014534,
      "grad_norm": 5.90625,
      "learning_rate": 9.4155231223044e-06,
      "loss": 0.2165,
      "step": 2560
    },
    {
      "epoch": 1.2151898734177216,
      "grad_norm": 8.9375,
      "learning_rate": 9.390710242073265e-06,
      "loss": 0.268,
      "step": 2592
    },
    {
      "epoch": 1.2301922175339897,
      "grad_norm": 5.8125,
      "learning_rate": 9.365415670303214e-06,
      "loss": 0.2386,
      "step": 2624
    },
    {
      "epoch": 1.2451945616502578,
      "grad_norm": 9.1875,
      "learning_rate": 9.339642181985196e-06,
      "loss": 0.259,
      "step": 2656
    },
    {
      "epoch": 1.260196905766526,
      "grad_norm": 10.25,
      "learning_rate": 9.313392604650655e-06,
      "loss": 0.2222,
      "step": 2688
    },
    {
      "epoch": 1.2751992498827942,
      "grad_norm": 8.75,
      "learning_rate": 9.286669818061316e-06,
      "loss": 0.2383,
      "step": 2720
    },
    {
      "epoch": 1.2902015939990623,
      "grad_norm": 7.75,
      "learning_rate": 9.259476753893258e-06,
      "loss": 0.2221,
      "step": 2752
    },
    {
      "epoch": 1.3052039381153304,
      "grad_norm": 7.6875,
      "learning_rate": 9.231816395415294e-06,
      "loss": 0.2397,
      "step": 2784
    },
    {
      "epoch": 1.3202062822315987,
      "grad_norm": 9.875,
      "learning_rate": 9.20369177716168e-06,
      "loss": 0.2348,
      "step": 2816
    },
    {
      "epoch": 1.3352086263478669,
      "grad_norm": 6.34375,
      "learning_rate": 9.17510598459921e-06,
      "loss": 0.2499,
      "step": 2848
    },
    {
      "epoch": 1.350210970464135,
      "grad_norm": 4.84375,
      "learning_rate": 9.146062153788716e-06,
      "loss": 0.227,
      "step": 2880
    },
    {
      "epoch": 1.3652133145804033,
      "grad_norm": 9.6875,
      "learning_rate": 9.116563471041018e-06,
      "loss": 0.2308,
      "step": 2912
    },
    {
      "epoch": 1.3802156586966714,
      "grad_norm": 13.8125,
      "learning_rate": 9.086613172567368e-06,
      "loss": 0.2016,
      "step": 2944
    },
    {
      "epoch": 1.3952180028129395,
      "grad_norm": 11.9375,
      "learning_rate": 9.056214544124414e-06,
      "loss": 0.2356,
      "step": 2976
    },
    {
      "epoch": 1.4102203469292076,
      "grad_norm": 9.8125,
      "learning_rate": 9.025370920653723e-06,
      "loss": 0.2306,
      "step": 3008
    },
    {
      "epoch": 1.4252226910454757,
      "grad_norm": 10.75,
      "learning_rate": 8.994085685915934e-06,
      "loss": 0.2276,
      "step": 3040
    },
    {
      "epoch": 1.440225035161744,
      "grad_norm": 11.4375,
      "learning_rate": 8.962362272119504e-06,
      "loss": 0.2352,
      "step": 3072
    },
    {
      "epoch": 1.4552273792780122,
      "grad_norm": 9.5625,
      "learning_rate": 8.930204159544208e-06,
      "loss": 0.2316,
      "step": 3104
    },
    {
      "epoch": 1.4702297233942803,
      "grad_norm": 11.0625,
      "learning_rate": 8.89761487615929e-06,
      "loss": 0.2264,
      "step": 3136
    },
    {
      "epoch": 1.4852320675105486,
      "grad_norm": 6.375,
      "learning_rate": 8.864597997236454e-06,
      "loss": 0.2414,
      "step": 3168
    },
    {
      "epoch": 1.5002344116268167,
      "grad_norm": 14.5625,
      "learning_rate": 8.831157144957612e-06,
      "loss": 0.2165,
      "step": 3200
    },
    {
      "epoch": 1.5152367557430848,
      "grad_norm": 7.0625,
      "learning_rate": 8.797295988017506e-06,
      "loss": 0.2418,
      "step": 3232
    },
    {
      "epoch": 1.5302390998593531,
      "grad_norm": 10.5625,
      "learning_rate": 8.763018241221241e-06,
      "loss": 0.2129,
      "step": 3264
    },
    {
      "epoch": 1.5452414439756212,
      "grad_norm": 9.9375,
      "learning_rate": 8.728327665076726e-06,
      "loss": 0.253,
      "step": 3296
    },
    {
      "epoch": 1.5602437880918893,
      "grad_norm": 13.875,
      "learning_rate": 8.693228065382131e-06,
      "loss": 0.2156,
      "step": 3328
    },
    {
      "epoch": 1.5752461322081577,
      "grad_norm": 15.75,
      "learning_rate": 8.657723292808365e-06,
      "loss": 0.2261,
      "step": 3360
    },
    {
      "epoch": 1.5902484763244256,
      "grad_norm": 11.3125,
      "learning_rate": 8.621817242476626e-06,
      "loss": 0.2187,
      "step": 3392
    },
    {
      "epoch": 1.605250820440694,
      "grad_norm": 15.1875,
      "learning_rate": 8.58551385353108e-06,
      "loss": 0.2559,
      "step": 3424
    },
    {
      "epoch": 1.620253164556962,
      "grad_norm": 7.34375,
      "learning_rate": 8.548817108706714e-06,
      "loss": 0.2257,
      "step": 3456
    },
    {
      "epoch": 1.63525550867323,
      "grad_norm": 18.875,
      "learning_rate": 8.511731033892397e-06,
      "loss": 0.247,
      "step": 3488
    },
    {
      "epoch": 1.6502578527894984,
      "grad_norm": 6.90625,
      "learning_rate": 8.474259697689211e-06,
      "loss": 0.2775,
      "step": 3520
    },
    {
      "epoch": 1.6652601969057665,
      "grad_norm": 7.4375,
      "learning_rate": 8.436407210964101e-06,
      "loss": 0.2468,
      "step": 3552
    },
    {
      "epoch": 1.6802625410220347,
      "grad_norm": 6.8125,
      "learning_rate": 8.398177726398887e-06,
      "loss": 0.2642,
      "step": 3584
    },
    {
      "epoch": 1.695264885138303,
      "grad_norm": 12.125,
      "learning_rate": 8.359575438034671e-06,
      "loss": 0.2571,
      "step": 3616
    },
    {
      "epoch": 1.7102672292545709,
      "grad_norm": 9.625,
      "learning_rate": 8.320604580811744e-06,
      "loss": 0.2121,
      "step": 3648
    },
    {
      "epoch": 1.7252695733708392,
      "grad_norm": 10.3125,
      "learning_rate": 8.281269430104965e-06,
      "loss": 0.2512,
      "step": 3680
    },
    {
      "epoch": 1.7402719174871075,
      "grad_norm": 13.0625,
      "learning_rate": 8.241574301254733e-06,
      "loss": 0.2273,
      "step": 3712
    },
    {
      "epoch": 1.7552742616033754,
      "grad_norm": 12.0625,
      "learning_rate": 8.201523549093552e-06,
      "loss": 0.2298,
      "step": 3744
    },
    {
      "epoch": 1.7702766057196437,
      "grad_norm": 9.6875,
      "learning_rate": 8.161121567468298e-06,
      "loss": 0.2484,
      "step": 3776
    },
    {
      "epoch": 1.7852789498359118,
      "grad_norm": 8.8125,
      "learning_rate": 8.120372788758152e-06,
      "loss": 0.2269,
      "step": 3808
    },
    {
      "epoch": 1.80028129395218,
      "grad_norm": 8.6875,
      "learning_rate": 8.079281683388368e-06,
      "loss": 0.2263,
      "step": 3840
    },
    {
      "epoch": 1.8152836380684483,
      "grad_norm": 11.4375,
      "learning_rate": 8.037852759339814e-06,
      "loss": 0.2294,
      "step": 3872
    },
    {
      "epoch": 1.8302859821847164,
      "grad_norm": 8.5625,
      "learning_rate": 7.99609056165443e-06,
      "loss": 0.2391,
      "step": 3904
    },
    {
      "epoch": 1.8452883263009845,
      "grad_norm": 12.3125,
      "learning_rate": 7.953999671936591e-06,
      "loss": 0.241,
      "step": 3936
    },
    {
      "epoch": 1.8602906704172528,
      "grad_norm": 9.1875,
      "learning_rate": 7.911584707850487e-06,
      "loss": 0.1985,
      "step": 3968
    },
    {
      "epoch": 1.8752930145335207,
      "grad_norm": 6.8125,
      "learning_rate": 7.868850322613525e-06,
      "loss": 0.2431,
      "step": 4000
    },
    {
      "epoch": 1.890295358649789,
      "grad_norm": 11.125,
      "learning_rate": 7.825801204485837e-06,
      "loss": 0.2325,
      "step": 4032
    },
    {
      "epoch": 1.9052977027660571,
      "grad_norm": 9.4375,
      "learning_rate": 7.782442076255952e-06,
      "loss": 0.2256,
      "step": 4064
    },
    {
      "epoch": 1.9203000468823253,
      "grad_norm": 11.0,
      "learning_rate": 7.738777694722666e-06,
      "loss": 0.2618,
      "step": 4096
    },
    {
      "epoch": 1.9353023909985936,
      "grad_norm": 8.9375,
      "learning_rate": 7.694812850173197e-06,
      "loss": 0.224,
      "step": 4128
    },
    {
      "epoch": 1.9503047351148617,
      "grad_norm": 10.8125,
      "learning_rate": 7.650552365857648e-06,
      "loss": 0.2272,
      "step": 4160
    },
    {
      "epoch": 1.9653070792311298,
      "grad_norm": 12.875,
      "learning_rate": 7.606001097459865e-06,
      "loss": 0.2467,
      "step": 4192
    },
    {
      "epoch": 1.9803094233473981,
      "grad_norm": 8.125,
      "learning_rate": 7.561163932564739e-06,
      "loss": 0.2399,
      "step": 4224
    },
    {
      "epoch": 1.9953117674636662,
      "grad_norm": 4.78125,
      "learning_rate": 7.516045790122e-06,
      "loss": 0.2398,
      "step": 4256
    },
    {
      "epoch": 2.0103141115799343,
      "grad_norm": 8.5625,
      "learning_rate": 7.470651619906574e-06,
      "loss": 0.1666,
      "step": 4288
    },
    {
      "epoch": 2.0253164556962027,
      "grad_norm": 13.125,
      "learning_rate": 7.424986401975561e-06,
      "loss": 0.226,
      "step": 4320
    },
    {
      "epoch": 2.0403187998124706,
      "grad_norm": 6.09375,
      "learning_rate": 7.379055146121884e-06,
      "loss": 0.1728,
      "step": 4352
    },
    {
      "epoch": 2.055321143928739,
      "grad_norm": 12.875,
      "learning_rate": 7.332862891324681e-06,
      "loss": 0.2048,
      "step": 4384
    },
    {
      "epoch": 2.070323488045007,
      "grad_norm": 9.75,
      "learning_rate": 7.286414705196499e-06,
      "loss": 0.1943,
      "step": 4416
    },
    {
      "epoch": 2.085325832161275,
      "grad_norm": 14.0625,
      "learning_rate": 7.2397156834273295e-06,
      "loss": 0.2017,
      "step": 4448
    },
    {
      "epoch": 2.1003281762775434,
      "grad_norm": 7.625,
      "learning_rate": 7.192770949225591e-06,
      "loss": 0.1626,
      "step": 4480
    },
    {
      "epoch": 2.1153305203938118,
      "grad_norm": 6.78125,
      "learning_rate": 7.1455856527560666e-06,
      "loss": 0.1603,
      "step": 4512
    },
    {
      "epoch": 2.1303328645100796,
      "grad_norm": 14.3125,
      "learning_rate": 7.0981649705748955e-06,
      "loss": 0.1555,
      "step": 4544
    },
    {
      "epoch": 2.145335208626348,
      "grad_norm": 11.1875,
      "learning_rate": 7.050514105061679e-06,
      "loss": 0.1704,
      "step": 4576
    },
    {
      "epoch": 2.160337552742616,
      "grad_norm": 6.25,
      "learning_rate": 7.002638283848726e-06,
      "loss": 0.1642,
      "step": 4608
    },
    {
      "epoch": 2.175339896858884,
      "grad_norm": 6.0,
      "learning_rate": 6.95454275924756e-06,
      "loss": 0.173,
      "step": 4640
    },
    {
      "epoch": 2.1903422409751525,
      "grad_norm": 7.84375,
      "learning_rate": 6.906232807672699e-06,
      "loss": 0.1726,
      "step": 4672
    },
    {
      "epoch": 2.2053445850914204,
      "grad_norm": 8.0,
      "learning_rate": 6.857713729062794e-06,
      "loss": 0.1741,
      "step": 4704
    },
    {
      "epoch": 2.2203469292076887,
      "grad_norm": 12.9375,
      "learning_rate": 6.80899084629919e-06,
      "loss": 0.2037,
      "step": 4736
    },
    {
      "epoch": 2.235349273323957,
      "grad_norm": 12.5625,
      "learning_rate": 6.760069504621971e-06,
      "loss": 0.2404,
      "step": 4768
    },
    {
      "epoch": 2.250351617440225,
      "grad_norm": 6.09375,
      "learning_rate": 6.710955071043547e-06,
      "loss": 0.1778,
      "step": 4800
    },
    {
      "epoch": 2.2653539615564933,
      "grad_norm": 6.4375,
      "learning_rate": 6.661652933759856e-06,
      "loss": 0.1708,
      "step": 4832
    },
    {
      "epoch": 2.280356305672761,
      "grad_norm": 10.9375,
      "learning_rate": 6.612168501559242e-06,
      "loss": 0.1854,
      "step": 4864
    },
    {
      "epoch": 2.2953586497890295,
      "grad_norm": 6.0625,
      "learning_rate": 6.5625072032290735e-06,
      "loss": 0.1601,
      "step": 4896
    },
    {
      "epoch": 2.310360993905298,
      "grad_norm": 14.125,
      "learning_rate": 6.512674486960166e-06,
      "loss": 0.1539,
      "step": 4928
    },
    {
      "epoch": 2.3253633380215657,
      "grad_norm": 8.8125,
      "learning_rate": 6.462675819749082e-06,
      "loss": 0.1474,
      "step": 4960
    },
    {
      "epoch": 2.340365682137834,
      "grad_norm": 12.1875,
      "learning_rate": 6.412516686798354e-06,
      "loss": 0.166,
      "step": 4992
    },
    {
      "epoch": 2.3553680262541024,
      "grad_norm": 12.1875,
      "learning_rate": 6.362202590914728e-06,
      "loss": 0.1863,
      "step": 5024
    },
    {
      "epoch": 2.3703703703703702,
      "grad_norm": 13.75,
      "learning_rate": 6.311739051905468e-06,
      "loss": 0.1523,
      "step": 5056
    },
    {
      "epoch": 2.3853727144866386,
      "grad_norm": 11.4375,
      "learning_rate": 6.261131605972785e-06,
      "loss": 0.1795,
      "step": 5088
    },
    {
      "epoch": 2.400375058602907,
      "grad_norm": 7.78125,
      "learning_rate": 6.2103858051064915e-06,
      "loss": 0.187,
      "step": 5120
    },
    {
      "epoch": 2.415377402719175,
      "grad_norm": 8.625,
      "learning_rate": 6.159507216474891e-06,
      "loss": 0.2099,
      "step": 5152
    },
    {
      "epoch": 2.430379746835443,
      "grad_norm": 10.0,
      "learning_rate": 6.108501421814039e-06,
      "loss": 0.2008,
      "step": 5184
    },
    {
      "epoch": 2.4453820909517114,
      "grad_norm": 11.4375,
      "learning_rate": 6.057374016815376e-06,
      "loss": 0.2017,
      "step": 5216
    },
    {
      "epoch": 2.4603844350679793,
      "grad_norm": 11.875,
      "learning_rate": 6.0061306105118474e-06,
      "loss": 0.1826,
      "step": 5248
    },
    {
      "epoch": 2.4753867791842477,
      "grad_norm": 11.9375,
      "learning_rate": 5.954776824662547e-06,
      "loss": 0.1757,
      "step": 5280
    },
    {
      "epoch": 2.4903891233005155,
      "grad_norm": 8.3125,
      "learning_rate": 5.90331829313598e-06,
      "loss": 0.1693,
      "step": 5312
    },
    {
      "epoch": 2.505391467416784,
      "grad_norm": 4.84375,
      "learning_rate": 5.851760661291977e-06,
      "loss": 0.1838,
      "step": 5344
    },
    {
      "epoch": 2.520393811533052,
      "grad_norm": 8.8125,
      "learning_rate": 5.80010958536237e-06,
      "loss": 0.1887,
      "step": 5376
    },
    {
      "epoch": 2.53539615564932,
      "grad_norm": 7.84375,
      "learning_rate": 5.748370731830456e-06,
      "loss": 0.1932,
      "step": 5408
    },
    {
      "epoch": 2.5503984997655884,
      "grad_norm": 16.375,
      "learning_rate": 5.696549776809346e-06,
      "loss": 0.1739,
      "step": 5440
    },
    {
      "epoch": 2.5654008438818563,
      "grad_norm": 8.5,
      "learning_rate": 5.6446524054192605e-06,
      "loss": 0.1857,
      "step": 5472
    },
    {
      "epoch": 2.5804031879981246,
      "grad_norm": 9.4375,
      "learning_rate": 5.592684311163827e-06,
      "loss": 0.1872,
      "step": 5504
    },
    {
      "epoch": 2.595405532114393,
      "grad_norm": 7.75,
      "learning_rate": 5.540651195305464e-06,
      "loss": 0.2011,
      "step": 5536
    },
    {
      "epoch": 2.610407876230661,
      "grad_norm": 7.46875,
      "learning_rate": 5.488558766239916e-06,
      "loss": 0.1989,
      "step": 5568
    },
    {
      "epoch": 2.625410220346929,
      "grad_norm": 17.25,
      "learning_rate": 5.436412738869995e-06,
      "loss": 0.1745,
      "step": 5600
    },
    {
      "epoch": 2.6404125644631975,
      "grad_norm": 12.1875,
      "learning_rate": 5.384218833978626e-06,
      "loss": 0.1712,
      "step": 5632
    },
    {
      "epoch": 2.6554149085794654,
      "grad_norm": 10.75,
      "learning_rate": 5.331982777601228e-06,
      "loss": 0.1865,
      "step": 5664
    },
    {
      "epoch": 2.6704172526957337,
      "grad_norm": 8.3125,
      "learning_rate": 5.279710300397537e-06,
      "loss": 0.1839,
      "step": 5696
    },
    {
      "epoch": 2.685419596812002,
      "grad_norm": 6.46875,
      "learning_rate": 5.227407137022902e-06,
      "loss": 0.2113,
      "step": 5728
    },
    {
      "epoch": 2.70042194092827,
      "grad_norm": 12.25,
      "learning_rate": 5.175079025499163e-06,
      "loss": 0.1619,
      "step": 5760
    },
    {
      "epoch": 2.7154242850445383,
      "grad_norm": 9.375,
      "learning_rate": 5.1227317065851445e-06,
      "loss": 0.1825,
      "step": 5792
    },
    {
      "epoch": 2.7304266291608066,
      "grad_norm": 9.5625,
      "learning_rate": 5.070370923146855e-06,
      "loss": 0.1654,
      "step": 5824
    },
    {
      "epoch": 2.7454289732770745,
      "grad_norm": 12.8125,
      "learning_rate": 5.0180024195274555e-06,
      "loss": 0.1499,
      "step": 5856
    },
    {
      "epoch": 2.760431317393343,
      "grad_norm": 10.1875,
      "learning_rate": 4.965631940917068e-06,
      "loss": 0.1633,
      "step": 5888
    },
    {
      "epoch": 2.775433661509611,
      "grad_norm": 18.25,
      "learning_rate": 4.91326523272248e-06,
      "loss": 0.1708,
      "step": 5920
    },
    {
      "epoch": 2.790436005625879,
      "grad_norm": 6.40625,
      "learning_rate": 4.860908039936839e-06,
      "loss": 0.202,
      "step": 5952
    },
    {
      "epoch": 2.8054383497421473,
      "grad_norm": 8.3125,
      "learning_rate": 4.80856610650939e-06,
      "loss": 0.18,
      "step": 5984
    },
    {
      "epoch": 2.8204406938584152,
      "grad_norm": 11.375,
      "learning_rate": 4.756245174715315e-06,
      "loss": 0.1835,
      "step": 6016
    },
    {
      "epoch": 2.8354430379746836,
      "grad_norm": 10.875,
      "learning_rate": 4.703950984525774e-06,
      "loss": 0.2188,
      "step": 6048
    },
    {
      "epoch": 2.8504453820909514,
      "grad_norm": 10.3125,
      "learning_rate": 4.6516892729781815e-06,
      "loss": 0.1718,
      "step": 6080
    },
    {
      "epoch": 2.8654477262072198,
      "grad_norm": 9.625,
      "learning_rate": 4.599465773546822e-06,
      "loss": 0.1803,
      "step": 6112
    },
    {
      "epoch": 2.880450070323488,
      "grad_norm": 12.5,
      "learning_rate": 4.547286215513846e-06,
      "loss": 0.1736,
      "step": 6144
    },
    {
      "epoch": 2.895452414439756,
      "grad_norm": 9.25,
      "learning_rate": 4.495156323340724e-06,
      "loss": 0.2059,
      "step": 6176
    },
    {
      "epoch": 2.9104547585560243,
      "grad_norm": 7.71875,
      "learning_rate": 4.443081816040233e-06,
      "loss": 0.2204,
      "step": 6208
    },
    {
      "epoch": 2.9254571026722926,
      "grad_norm": 10.25,
      "learning_rate": 4.391068406549049e-06,
      "loss": 0.1991,
      "step": 6240
    },
    {
      "epoch": 2.9404594467885605,
      "grad_norm": 13.375,
      "learning_rate": 4.339121801100982e-06,
      "loss": 0.2167,
      "step": 6272
    },
    {
      "epoch": 2.955461790904829,
      "grad_norm": 4.3125,
      "learning_rate": 4.287247698600987e-06,
      "loss": 0.1526,
      "step": 6304
    },
    {
      "epoch": 2.970464135021097,
      "grad_norm": 7.15625,
      "learning_rate": 4.235451789999928e-06,
      "loss": 0.1693,
      "step": 6336
    },
    {
      "epoch": 2.985466479137365,
      "grad_norm": 11.5625,
      "learning_rate": 4.1837397576702576e-06,
      "loss": 0.2256,
      "step": 6368
    },
    {
      "epoch": 3.0004688232536334,
      "grad_norm": 12.5625,
      "learning_rate": 4.132117274782616e-06,
      "loss": 0.2014,
      "step": 6400
    },
    {
      "epoch": 3.0154711673699017,
      "grad_norm": 10.9375,
      "learning_rate": 4.0805900046834405e-06,
      "loss": 0.1782,
      "step": 6432
    },
    {
      "epoch": 3.0304735114861696,
      "grad_norm": 6.59375,
      "learning_rate": 4.0291636002736725e-06,
      "loss": 0.1622,
      "step": 6464
    },
    {
      "epoch": 3.045475855602438,
      "grad_norm": 16.125,
      "learning_rate": 3.977843703388572e-06,
      "loss": 0.1664,
      "step": 6496
    },
    {
      "epoch": 3.0604781997187063,
      "grad_norm": 15.3125,
      "learning_rate": 3.926635944178788e-06,
      "loss": 0.1734,
      "step": 6528
    },
    {
      "epoch": 3.075480543834974,
      "grad_norm": 11.9375,
      "learning_rate": 3.875545940492681e-06,
      "loss": 0.1617,
      "step": 6560
    },
    {
      "epoch": 3.0904828879512425,
      "grad_norm": 13.75,
      "learning_rate": 3.824579297260006e-06,
      "loss": 0.1748,
      "step": 6592
    },
    {
      "epoch": 3.1054852320675104,
      "grad_norm": 13.5625,
      "learning_rate": 3.773741605877026e-06,
      "loss": 0.1841,
      "step": 6624
    },
    {
      "epoch": 3.1204875761837787,
      "grad_norm": 13.5,
      "learning_rate": 3.7230384435930785e-06,
      "loss": 0.1718,
      "step": 6656
    },
    {
      "epoch": 3.135489920300047,
      "grad_norm": 11.625,
      "learning_rate": 3.6724753728987206e-06,
      "loss": 0.1408,
      "step": 6688
    },
    {
      "epoch": 3.150492264416315,
      "grad_norm": 12.25,
      "learning_rate": 3.6220579409154888e-06,
      "loss": 0.1576,
      "step": 6720
    },
    {
      "epoch": 3.1654946085325832,
      "grad_norm": 7.5,
      "learning_rate": 3.571791678787332e-06,
      "loss": 0.1769,
      "step": 6752
    },
    {
      "epoch": 3.1804969526488516,
      "grad_norm": 9.625,
      "learning_rate": 3.521682101073818e-06,
      "loss": 0.1473,
      "step": 6784
    },
    {
      "epoch": 3.1954992967651195,
      "grad_norm": 12.625,
      "learning_rate": 3.471734705145138e-06,
      "loss": 0.162,
      "step": 6816
    },
    {
      "epoch": 3.210501640881388,
      "grad_norm": 14.3125,
      "learning_rate": 3.421954970579008e-06,
      "loss": 0.1884,
      "step": 6848
    },
    {
      "epoch": 3.2255039849976557,
      "grad_norm": 18.25,
      "learning_rate": 3.3723483585595256e-06,
      "loss": 0.1563,
      "step": 6880
    },
    {
      "epoch": 3.240506329113924,
      "grad_norm": 9.875,
      "learning_rate": 3.3229203112780382e-06,
      "loss": 0.1876,
      "step": 6912
    },
    {
      "epoch": 3.2555086732301923,
      "grad_norm": 9.75,
      "learning_rate": 3.2736762513360963e-06,
      "loss": 0.1674,
      "step": 6944
    },
    {
      "epoch": 3.27051101734646,
      "grad_norm": 13.25,
      "learning_rate": 3.224621581150553e-06,
      "loss": 0.1422,
      "step": 6976
    },
    {
      "epoch": 3.2855133614627285,
      "grad_norm": 8.375,
      "learning_rate": 3.175761682360885e-06,
      "loss": 0.1676,
      "step": 7008
    },
    {
      "epoch": 3.300515705578997,
      "grad_norm": 12.25,
      "learning_rate": 3.1271019152387917e-06,
      "loss": 0.1543,
      "step": 7040
    },
    {
      "epoch": 3.3155180496952648,
      "grad_norm": 13.1875,
      "learning_rate": 3.0786476181001263e-06,
      "loss": 0.1648,
      "step": 7072
    },
    {
      "epoch": 3.330520393811533,
      "grad_norm": 8.375,
      "learning_rate": 3.030404106719259e-06,
      "loss": 0.1869,
      "step": 7104
    },
    {
      "epoch": 3.3455227379278014,
      "grad_norm": 12.5,
      "learning_rate": 2.982376673745887e-06,
      "loss": 0.1838,
      "step": 7136
    },
    {
      "epoch": 3.3605250820440693,
      "grad_norm": 7.40625,
      "learning_rate": 2.934570588124399e-06,
      "loss": 0.14,
      "step": 7168
    },
    {
      "epoch": 3.3755274261603376,
      "grad_norm": 9.625,
      "learning_rate": 2.8869910945158407e-06,
      "loss": 0.1635,
      "step": 7200
    },
    {
      "epoch": 3.390529770276606,
      "grad_norm": 6.5,
      "learning_rate": 2.839643412722525e-06,
      "loss": 0.1738,
      "step": 7232
    },
    {
      "epoch": 3.405532114392874,
      "grad_norm": 7.28125,
      "learning_rate": 2.7925327371153998e-06,
      "loss": 0.1335,
      "step": 7264
    },
    {
      "epoch": 3.420534458509142,
      "grad_norm": 7.6875,
      "learning_rate": 2.7456642360641772e-06,
      "loss": 0.178,
      "step": 7296
    },
    {
      "epoch": 3.43553680262541,
      "grad_norm": 9.5,
      "learning_rate": 2.6990430513703316e-06,
      "loss": 0.1827,
      "step": 7328
    },
    {
      "epoch": 3.4505391467416784,
      "grad_norm": 8.625,
      "learning_rate": 2.6526742977030084e-06,
      "loss": 0.1587,
      "step": 7360
    },
    {
      "epoch": 3.4655414908579467,
      "grad_norm": 8.75,
      "learning_rate": 2.6065630620379062e-06,
      "loss": 0.1642,
      "step": 7392
    },
    {
      "epoch": 3.4805438349742146,
      "grad_norm": 10.4375,
      "learning_rate": 2.5607144030992093e-06,
      "loss": 0.1472,
      "step": 7424
    },
    {
      "epoch": 3.495546179090483,
      "grad_norm": 9.5625,
      "learning_rate": 2.515133350804598e-06,
      "loss": 0.1556,
      "step": 7456
    },
    {
      "epoch": 3.510548523206751,
      "grad_norm": 6.6875,
      "learning_rate": 2.4698249057134377e-06,
      "loss": 0.1422,
      "step": 7488
    },
    {
      "epoch": 3.525550867323019,
      "grad_norm": 10.125,
      "learning_rate": 2.4247940384781834e-06,
      "loss": 0.1675,
      "step": 7520
    },
    {
      "epoch": 3.5405532114392875,
      "grad_norm": 12.1875,
      "learning_rate": 2.38004568929906e-06,
      "loss": 0.157,
      "step": 7552
    },
    {
      "epoch": 3.5555555555555554,
      "grad_norm": 12.375,
      "learning_rate": 2.335584767382098e-06,
      "loss": 0.1715,
      "step": 7584
    },
    {
      "epoch": 3.5705578996718237,
      "grad_norm": 7.3125,
      "learning_rate": 2.291416150400547e-06,
      "loss": 0.1809,
      "step": 7616
    },
    {
      "epoch": 3.585560243788092,
      "grad_norm": 10.875,
      "learning_rate": 2.247544683959767e-06,
      "loss": 0.1555,
      "step": 7648
    },
    {
      "epoch": 3.60056258790436,
      "grad_norm": 7.03125,
      "learning_rate": 2.203975181065632e-06,
      "loss": 0.1868,
      "step": 7680
    },
    {
      "epoch": 3.6155649320206282,
      "grad_norm": 9.75,
      "learning_rate": 2.160712421596506e-06,
      "loss": 0.1816,
      "step": 7712
    },
    {
      "epoch": 3.6305672761368966,
      "grad_norm": 14.4375,
      "learning_rate": 2.1177611517788655e-06,
      "loss": 0.1442,
      "step": 7744
    },
    {
      "epoch": 3.6455696202531644,
      "grad_norm": 8.8125,
      "learning_rate": 2.0751260836665947e-06,
      "loss": 0.1596,
      "step": 7776
    },
    {
      "epoch": 3.6605719643694328,
      "grad_norm": 7.5,
      "learning_rate": 2.0328118946240473e-06,
      "loss": 0.1852,
      "step": 7808
    },
    {
      "epoch": 3.675574308485701,
      "grad_norm": 4.40625,
      "learning_rate": 1.9908232268129037e-06,
      "loss": 0.1865,
      "step": 7840
    },
    {
      "epoch": 3.690576652601969,
      "grad_norm": 5.84375,
      "learning_rate": 1.9491646866828927e-06,
      "loss": 0.1619,
      "step": 7872
    },
    {
      "epoch": 3.7055789967182373,
      "grad_norm": 20.5,
      "learning_rate": 1.9078408444664417e-06,
      "loss": 0.1725,
      "step": 7904
    },
    {
      "epoch": 3.7205813408345056,
      "grad_norm": 10.75,
      "learning_rate": 1.8668562336772734e-06,
      "loss": 0.2191,
      "step": 7936
    },
    {
      "epoch": 3.7355836849507735,
      "grad_norm": 25.375,
      "learning_rate": 1.826215350613062e-06,
      "loss": 0.1488,
      "step": 7968
    },
    {
      "epoch": 3.750586029067042,
      "grad_norm": 9.75,
      "learning_rate": 1.7859226538621487e-06,
      "loss": 0.1691,
      "step": 8000
    },
    {
      "epoch": 3.7655883731833097,
      "grad_norm": 13.0,
      "learning_rate": 1.745982563814414e-06,
      "loss": 0.1527,
      "step": 8032
    },
    {
      "epoch": 3.780590717299578,
      "grad_norm": 8.375,
      "learning_rate": 1.7063994621763176e-06,
      "loss": 0.1541,
      "step": 8064
    },
    {
      "epoch": 3.795593061415846,
      "grad_norm": 5.6875,
      "learning_rate": 1.6671776914902027e-06,
      "loss": 0.1823,
      "step": 8096
    },
    {
      "epoch": 3.8105954055321143,
      "grad_norm": 15.5,
      "learning_rate": 1.6283215546578862e-06,
      "loss": 0.1835,
      "step": 8128
    },
    {
      "epoch": 3.8255977496483826,
      "grad_norm": 13.5,
      "learning_rate": 1.5898353144686036e-06,
      "loss": 0.157,
      "step": 8160
    },
    {
      "epoch": 3.8406000937646505,
      "grad_norm": 13.5625,
      "learning_rate": 1.5517231931313454e-06,
      "loss": 0.1473,
      "step": 8192
    },
    {
      "epoch": 3.855602437880919,
      "grad_norm": 8.5,
      "learning_rate": 1.513989371811656e-06,
      "loss": 0.119,
      "step": 8224
    },
    {
      "epoch": 3.870604781997187,
      "grad_norm": 8.3125,
      "learning_rate": 1.4766379901729272e-06,
      "loss": 0.1714,
      "step": 8256
    },
    {
      "epoch": 3.885607126113455,
      "grad_norm": 6.5625,
      "learning_rate": 1.4396731459222546e-06,
      "loss": 0.1421,
      "step": 8288
    },
    {
      "epoch": 3.9006094702297234,
      "grad_norm": 11.8125,
      "learning_rate": 1.4030988943608826e-06,
      "loss": 0.1796,
      "step": 8320
    },
    {
      "epoch": 3.9156118143459917,
      "grad_norm": 6.3125,
      "learning_rate": 1.3669192479393145e-06,
      "loss": 0.1611,
      "step": 8352
    },
    {
      "epoch": 3.9306141584622596,
      "grad_norm": 12.1875,
      "learning_rate": 1.3311381758171165e-06,
      "loss": 0.1537,
      "step": 8384
    },
    {
      "epoch": 3.945616502578528,
      "grad_norm": 7.40625,
      "learning_rate": 1.2957596034274732e-06,
      "loss": 0.1732,
      "step": 8416
    },
    {
      "epoch": 3.9606188466947962,
      "grad_norm": 9.9375,
      "learning_rate": 1.2607874120465457e-06,
      "loss": 0.1605,
      "step": 8448
    },
    {
      "epoch": 3.975621190811064,
      "grad_norm": 7.25,
      "learning_rate": 1.2262254383676597e-06,
      "loss": 0.1486,
      "step": 8480
    },
    {
      "epoch": 3.9906235349273325,
      "grad_norm": 12.75,
      "learning_rate": 1.192077474080398e-06,
      "loss": 0.1754,
      "step": 8512
    },
    {
      "epoch": 4.005625879043601,
      "grad_norm": 12.5,
      "learning_rate": 1.1583472654546257e-06,
      "loss": 0.1472,
      "step": 8544
    },
    {
      "epoch": 4.020628223159869,
      "grad_norm": 11.4375,
      "learning_rate": 1.1250385129295005e-06,
      "loss": 0.1482,
      "step": 8576
    },
    {
      "epoch": 4.035630567276137,
      "grad_norm": 11.875,
      "learning_rate": 1.0921548707075026e-06,
      "loss": 0.1918,
      "step": 8608
    },
    {
      "epoch": 4.050632911392405,
      "grad_norm": 6.15625,
      "learning_rate": 1.059699946353549e-06,
      "loss": 0.1499,
      "step": 8640
    },
    {
      "epoch": 4.065635255508673,
      "grad_norm": 13.0,
      "learning_rate": 1.0276773003992157e-06,
      "loss": 0.166,
      "step": 8672
    },
    {
      "epoch": 4.080637599624941,
      "grad_norm": 11.6875,
      "learning_rate": 9.96090445952121e-07,
      "loss": 0.1712,
      "step": 8704
    },
    {
      "epoch": 4.09563994374121,
      "grad_norm": 10.375,
      "learning_rate": 9.649428483105204e-07,
      "loss": 0.1802,
      "step": 8736
    },
    {
      "epoch": 4.110642287857478,
      "grad_norm": 7.40625,
      "learning_rate": 9.34237924583129e-07,
      "loss": 0.138,
      "step": 8768
    },
    {
      "epoch": 4.125644631973746,
      "grad_norm": 11.25,
      "learning_rate": 9.039790433142481e-07,
      "loss": 0.1896,
      "step": 8800
    },
    {
      "epoch": 4.140646976090014,
      "grad_norm": 10.5,
      "learning_rate": 8.741695241142095e-07,
      "loss": 0.1624,
      "step": 8832
    },
    {
      "epoch": 4.155649320206282,
      "grad_norm": 8.5625,
      "learning_rate": 8.448126372951904e-07,
      "loss": 0.1418,
      "step": 8864
    },
    {
      "epoch": 4.17065166432255,
      "grad_norm": 17.0,
      "learning_rate": 8.159116035124431e-07,
      "loss": 0.1635,
      "step": 8896
    },
    {
      "epoch": 4.185654008438819,
      "grad_norm": 8.25,
      "learning_rate": 7.874695934109583e-07,
      "loss": 0.1583,
      "step": 8928
    },
    {
      "epoch": 4.200656352555087,
      "grad_norm": 9.5,
      "learning_rate": 7.594897272776275e-07,
      "loss": 0.1837,
      "step": 8960
    },
    {
      "epoch": 4.215658696671355,
      "grad_norm": 14.1875,
      "learning_rate": 7.319750746989262e-07,
      "loss": 0.1752,
      "step": 8992
    },
    {
      "epoch": 4.2306610407876235,
      "grad_norm": 11.5,
      "learning_rate": 7.049286542241573e-07,
      "loss": 0.1455,
      "step": 9024
    },
    {
      "epoch": 4.245663384903891,
      "grad_norm": 6.59375,
      "learning_rate": 6.783534330342984e-07,
      "loss": 0.15,
      "step": 9056
    },
    {
      "epoch": 4.260665729020159,
      "grad_norm": 7.0,
      "learning_rate": 6.522523266164759e-07,
      "loss": 0.1644,
      "step": 9088
    },
    {
      "epoch": 4.275668073136427,
      "grad_norm": 6.4375,
      "learning_rate": 6.266281984441214e-07,
      "loss": 0.1311,
      "step": 9120
    },
    {
      "epoch": 4.290670417252696,
      "grad_norm": 8.625,
      "learning_rate": 6.014838596628225e-07,
      "loss": 0.1386,
      "step": 9152
    },
    {
      "epoch": 4.305672761368964,
      "grad_norm": 8.125,
      "learning_rate": 5.768220687819271e-07,
      "loss": 0.167,
      "step": 9184
    },
    {
      "epoch": 4.320675105485232,
      "grad_norm": 8.4375,
      "learning_rate": 5.526455313719126e-07,
      "loss": 0.1587,
      "step": 9216
    },
    {
      "epoch": 4.3356774496015005,
      "grad_norm": 12.125,
      "learning_rate": 5.289568997675643e-07,
      "loss": 0.1834,
      "step": 9248
    },
    {
      "epoch": 4.350679793717768,
      "grad_norm": 11.6875,
      "learning_rate": 5.057587727769981e-07,
      "loss": 0.1602,
      "step": 9280
    },
    {
      "epoch": 4.365682137834036,
      "grad_norm": 7.1875,
      "learning_rate": 4.830536953965531e-07,
      "loss": 0.1472,
      "step": 9312
    },
    {
      "epoch": 4.380684481950305,
      "grad_norm": 11.9375,
      "learning_rate": 4.6084415853158537e-07,
      "loss": 0.1334,
      "step": 9344
    },
    {
      "epoch": 4.395686826066573,
      "grad_norm": 25.0,
      "learning_rate": 4.391325987232037e-07,
      "loss": 0.2039,
      "step": 9376
    },
    {
      "epoch": 4.410689170182841,
      "grad_norm": 19.125,
      "learning_rate": 4.17921397880956e-07,
      "loss": 0.1607,
      "step": 9408
    },
    {
      "epoch": 4.42569151429911,
      "grad_norm": 4.875,
      "learning_rate": 3.9721288302152493e-07,
      "loss": 0.187,
      "step": 9440
    },
    {
      "epoch": 4.4406938584153774,
      "grad_norm": 7.71875,
      "learning_rate": 3.770093260134322e-07,
      "loss": 0.1658,
      "step": 9472
    },
    {
      "epoch": 4.455696202531645,
      "grad_norm": 8.875,
      "learning_rate": 3.573129433278011e-07,
      "loss": 0.1951,
      "step": 9504
    },
    {
      "epoch": 4.470698546647914,
      "grad_norm": 12.5,
      "learning_rate": 3.381258957951983e-07,
      "loss": 0.162,
      "step": 9536
    },
    {
      "epoch": 4.485700890764182,
      "grad_norm": 20.875,
      "learning_rate": 3.194502883685663e-07,
      "loss": 0.1915,
      "step": 9568
    },
    {
      "epoch": 4.50070323488045,
      "grad_norm": 11.8125,
      "learning_rate": 3.0128816989230315e-07,
      "loss": 0.1639,
      "step": 9600
    },
    {
      "epoch": 4.515705578996718,
      "grad_norm": 3.625,
      "learning_rate": 2.836415328774872e-07,
      "loss": 0.1495,
      "step": 9632
    },
    {
      "epoch": 4.5307079231129865,
      "grad_norm": 8.3125,
      "learning_rate": 2.665123132832842e-07,
      "loss": 0.1508,
      "step": 9664
    },
    {
      "epoch": 4.545710267229254,
      "grad_norm": 10.375,
      "learning_rate": 2.499023903045622e-07,
      "loss": 0.1933,
      "step": 9696
    },
    {
      "epoch": 4.560712611345522,
      "grad_norm": 9.125,
      "learning_rate": 2.3381358616572593e-07,
      "loss": 0.1807,
      "step": 9728
    },
    {
      "epoch": 4.575714955461791,
      "grad_norm": 5.25,
      "learning_rate": 2.1824766592080937e-07,
      "loss": 0.145,
      "step": 9760
    },
    {
      "epoch": 4.590717299578059,
      "grad_norm": 13.75,
      "learning_rate": 2.0320633725983641e-07,
      "loss": 0.1707,
      "step": 9792
    },
    {
      "epoch": 4.605719643694327,
      "grad_norm": 8.1875,
      "learning_rate": 1.8869125032147384e-07,
      "loss": 0.1769,
      "step": 9824
    },
    {
      "epoch": 4.620721987810596,
      "grad_norm": 15.625,
      "learning_rate": 1.747039975120035e-07,
      "loss": 0.1786,
      "step": 9856
    },
    {
      "epoch": 4.6357243319268635,
      "grad_norm": 13.3125,
      "learning_rate": 1.6124611333062036e-07,
      "loss": 0.1512,
      "step": 9888
    },
    {
      "epoch": 4.650726676043131,
      "grad_norm": 8.6875,
      "learning_rate": 1.4831907420108705e-07,
      "loss": 0.133,
      "step": 9920
    },
    {
      "epoch": 4.6657290201594,
      "grad_norm": 8.75,
      "learning_rate": 1.3592429830976362e-07,
      "loss": 0.1517,
      "step": 9952
    },
    {
      "epoch": 4.680731364275668,
      "grad_norm": 10.5,
      "learning_rate": 1.2406314545001795e-07,
      "loss": 0.1268,
      "step": 9984
    },
    {
      "epoch": 4.695733708391936,
      "grad_norm": 10.375,
      "learning_rate": 1.1273691687305299e-07,
      "loss": 0.1799,
      "step": 10016
    },
    {
      "epoch": 4.710736052508205,
      "grad_norm": 13.375,
      "learning_rate": 1.0194685514514302e-07,
      "loss": 0.1589,
      "step": 10048
    },
    {
      "epoch": 4.725738396624473,
      "grad_norm": 7.1875,
      "learning_rate": 9.16941440113206e-08,
      "loss": 0.1619,
      "step": 10080
    },
    {
      "epoch": 4.7407407407407405,
      "grad_norm": 6.78125,
      "learning_rate": 8.197990826551094e-08,
      "loss": 0.1367,
      "step": 10112
    },
    {
      "epoch": 4.755743084857009,
      "grad_norm": 10.75,
      "learning_rate": 7.280521362713122e-08,
      "loss": 0.1789,
      "step": 10144
    },
    {
      "epoch": 4.770745428973277,
      "grad_norm": 13.875,
      "learning_rate": 6.417106662417849e-08,
      "loss": 0.1452,
      "step": 10176
    },
    {
      "epoch": 4.785747773089545,
      "grad_norm": 10.5625,
      "learning_rate": 5.607841448280194e-08,
      "loss": 0.1715,
      "step": 10208
    },
    {
      "epoch": 4.800750117205814,
      "grad_norm": 6.1875,
      "learning_rate": 4.852814502338765e-08,
      "loss": 0.1644,
      "step": 10240
    },
    {
      "epoch": 4.815752461322082,
      "grad_norm": 9.0625,
      "learning_rate": 4.1521086563159344e-08,
      "loss": 0.1696,
      "step": 10272
    },
    {
      "epoch": 4.83075480543835,
      "grad_norm": 6.53125,
      "learning_rate": 3.5058007825303774e-08,
      "loss": 0.1565,
      "step": 10304
    },
    {
      "epoch": 4.845757149554618,
      "grad_norm": 6.46875,
      "learning_rate": 2.9139617854639368e-08,
      "loss": 0.1656,
      "step": 10336
    },
    {
      "epoch": 4.860759493670886,
      "grad_norm": 13.0625,
      "learning_rate": 2.3766565939826734e-08,
      "loss": 0.1673,
      "step": 10368
    },
    {
      "epoch": 4.875761837787154,
      "grad_norm": 3.703125,
      "learning_rate": 1.8939441542138448e-08,
      "loss": 0.1369,
      "step": 10400
    },
    {
      "epoch": 4.890764181903423,
      "grad_norm": 8.4375,
      "learning_rate": 1.4658774230789653e-08,
      "loss": 0.157,
      "step": 10432
    },
    {
      "epoch": 4.905766526019691,
      "grad_norm": 8.8125,
      "learning_rate": 1.0925033624842874e-08,
      "loss": 0.1443,
      "step": 10464
    },
    {
      "epoch": 4.920768870135959,
      "grad_norm": 14.6875,
      "learning_rate": 7.73862934168479e-09,
      "loss": 0.1617,
      "step": 10496
    },
    {
      "epoch": 4.9357712142522265,
      "grad_norm": 14.5,
      "learning_rate": 5.099910952091059e-09,
      "loss": 0.1769,
      "step": 10528
    },
    {
      "epoch": 4.950773558368495,
      "grad_norm": 9.8125,
      "learning_rate": 3.0091679418742248e-09,
      "loss": 0.1684,
      "step": 10560
    },
    {
      "epoch": 4.965775902484763,
      "grad_norm": 10.625,
      "learning_rate": 1.4666296801252312e-09,
      "loss": 0.1523,
      "step": 10592
    },
    {
      "epoch": 4.980778246601031,
      "grad_norm": 15.375,
      "learning_rate": 4.724653940513246e-10,
      "loss": 0.1202,
      "step": 10624
    },
    {
      "epoch": 4.9957805907173,
      "grad_norm": 11.3125,
      "learning_rate": 2.6784150408132315e-11,
      "loss": 0.196,
      "step": 10656
    },
    {
      "epoch": 5.0,
      "step": 10665,
      "total_flos": 1.232747073490944e+17,
      "train_loss": 0.22699634635386978,
      "train_runtime": 3211.3705,
      "train_samples_per_second": 3.321,
      "train_steps_per_second": 3.321
    }
  ],
  "logging_steps": 32,
  "max_steps": 10665,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.232747073490944e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}