{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 106, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009478672985781991, "grad_norm": 1.1718926429748535, "learning_rate": 0.0, "loss": 2.6629, "step": 1 }, { "epoch": 0.018957345971563982, "grad_norm": 1.1212024688720703, "learning_rate": 6.666666666666667e-07, "loss": 2.5806, "step": 2 }, { "epoch": 0.02843601895734597, "grad_norm": 1.225563406944275, "learning_rate": 1.3333333333333334e-06, "loss": 2.6841, "step": 3 }, { "epoch": 0.037914691943127965, "grad_norm": 1.2255617380142212, "learning_rate": 2.0000000000000003e-06, "loss": 2.6513, "step": 4 }, { "epoch": 0.04739336492890995, "grad_norm": 1.000268578529358, "learning_rate": 2.666666666666667e-06, "loss": 2.6722, "step": 5 }, { "epoch": 0.05687203791469194, "grad_norm": 0.9953453540802002, "learning_rate": 3.3333333333333333e-06, "loss": 2.4666, "step": 6 }, { "epoch": 0.06635071090047394, "grad_norm": 1.158920168876648, "learning_rate": 4.000000000000001e-06, "loss": 2.6296, "step": 7 }, { "epoch": 0.07582938388625593, "grad_norm": 1.0453510284423828, "learning_rate": 4.666666666666667e-06, "loss": 2.599, "step": 8 }, { "epoch": 0.08530805687203792, "grad_norm": 0.9960002899169922, "learning_rate": 5.333333333333334e-06, "loss": 2.6475, "step": 9 }, { "epoch": 0.0947867298578199, "grad_norm": 1.2907602787017822, "learning_rate": 6e-06, "loss": 2.5586, "step": 10 }, { "epoch": 0.10426540284360189, "grad_norm": 1.0515801906585693, "learning_rate": 6.666666666666667e-06, "loss": 2.5806, "step": 11 }, { "epoch": 0.11374407582938388, "grad_norm": 1.070308804512024, "learning_rate": 7.333333333333333e-06, "loss": 2.7149, "step": 12 }, { "epoch": 0.12322274881516587, "grad_norm": 0.9910991191864014, "learning_rate": 8.000000000000001e-06, "loss": 2.6353, "step": 13 }, { "epoch": 0.13270142180094788, "grad_norm": 0.8306079506874084, "learning_rate": 8.666666666666668e-06, "loss": 2.5439, "step": 14 }, { "epoch": 0.14218009478672985, "grad_norm": 0.945061981678009, "learning_rate": 9.333333333333334e-06, "loss": 2.5597, "step": 15 }, { "epoch": 0.15165876777251186, "grad_norm": 0.8972373604774475, "learning_rate": 1e-05, "loss": 2.4496, "step": 16 }, { "epoch": 0.16113744075829384, "grad_norm": 0.9250657558441162, "learning_rate": 1.0666666666666667e-05, "loss": 2.5905, "step": 17 }, { "epoch": 0.17061611374407584, "grad_norm": 0.8590799570083618, "learning_rate": 1.1333333333333334e-05, "loss": 2.4993, "step": 18 }, { "epoch": 0.18009478672985782, "grad_norm": 0.8903495669364929, "learning_rate": 1.2e-05, "loss": 2.4999, "step": 19 }, { "epoch": 0.1895734597156398, "grad_norm": 0.9213452935218811, "learning_rate": 1.2666666666666667e-05, "loss": 2.4677, "step": 20 }, { "epoch": 0.1990521327014218, "grad_norm": 0.8921183943748474, "learning_rate": 1.3333333333333333e-05, "loss": 2.4343, "step": 21 }, { "epoch": 0.20853080568720378, "grad_norm": 0.767433226108551, "learning_rate": 1.4e-05, "loss": 2.5294, "step": 22 }, { "epoch": 0.21800947867298578, "grad_norm": 0.8856372237205505, "learning_rate": 1.4666666666666666e-05, "loss": 2.4663, "step": 23 }, { "epoch": 0.22748815165876776, "grad_norm": 0.8512458801269531, "learning_rate": 1.5333333333333334e-05, "loss": 2.5763, "step": 24 }, { "epoch": 0.23696682464454977, "grad_norm": 0.8298342823982239, "learning_rate": 1.6000000000000003e-05, "loss": 2.4612, "step": 25 }, { "epoch": 0.24644549763033174, "grad_norm": 1.0803931951522827, "learning_rate": 1.6666666666666667e-05, "loss": 2.515, "step": 26 }, { "epoch": 0.2559241706161137, "grad_norm": 0.8421900868415833, "learning_rate": 1.7333333333333336e-05, "loss": 2.5489, "step": 27 }, { "epoch": 0.26540284360189575, "grad_norm": 0.9771528244018555, "learning_rate": 1.8e-05, "loss": 2.4214, "step": 28 }, { "epoch": 0.27488151658767773, "grad_norm": 1.0094590187072754, "learning_rate": 1.866666666666667e-05, "loss": 2.3959, "step": 29 }, { "epoch": 0.2843601895734597, "grad_norm": 0.9626856446266174, "learning_rate": 1.9333333333333333e-05, "loss": 2.4792, "step": 30 }, { "epoch": 0.2938388625592417, "grad_norm": 0.843460738658905, "learning_rate": 2e-05, "loss": 2.5486, "step": 31 }, { "epoch": 0.3033175355450237, "grad_norm": 0.8346056342124939, "learning_rate": 1.99994050500015e-05, "loss": 2.3746, "step": 32 }, { "epoch": 0.3127962085308057, "grad_norm": 0.9779443144798279, "learning_rate": 1.999762027079909e-05, "loss": 2.4543, "step": 33 }, { "epoch": 0.3222748815165877, "grad_norm": 1.2702308893203735, "learning_rate": 1.9994645874763657e-05, "loss": 2.4247, "step": 34 }, { "epoch": 0.33175355450236965, "grad_norm": 0.9161669611930847, "learning_rate": 1.999048221581858e-05, "loss": 2.5287, "step": 35 }, { "epoch": 0.3412322274881517, "grad_norm": 0.8298777937889099, "learning_rate": 1.9985129789397633e-05, "loss": 2.4522, "step": 36 }, { "epoch": 0.35071090047393366, "grad_norm": 0.8846487402915955, "learning_rate": 1.9978589232386036e-05, "loss": 2.4225, "step": 37 }, { "epoch": 0.36018957345971564, "grad_norm": 0.7903144955635071, "learning_rate": 1.9970861323044667e-05, "loss": 2.421, "step": 38 }, { "epoch": 0.3696682464454976, "grad_norm": 0.8443423509597778, "learning_rate": 1.9961946980917457e-05, "loss": 2.4323, "step": 39 }, { "epoch": 0.3791469194312796, "grad_norm": 0.9062912464141846, "learning_rate": 1.995184726672197e-05, "loss": 2.4179, "step": 40 }, { "epoch": 0.3886255924170616, "grad_norm": 0.8565083146095276, "learning_rate": 1.9940563382223196e-05, "loss": 2.3007, "step": 41 }, { "epoch": 0.3981042654028436, "grad_norm": 0.806698739528656, "learning_rate": 1.9928096670090552e-05, "loss": 2.3509, "step": 42 }, { "epoch": 0.4075829383886256, "grad_norm": 0.8576929569244385, "learning_rate": 1.9914448613738107e-05, "loss": 2.4125, "step": 43 }, { "epoch": 0.41706161137440756, "grad_norm": 1.650278925895691, "learning_rate": 1.989962083714808e-05, "loss": 2.3508, "step": 44 }, { "epoch": 0.4265402843601896, "grad_norm": 0.8928993344306946, "learning_rate": 1.988361510467761e-05, "loss": 2.3976, "step": 45 }, { "epoch": 0.43601895734597157, "grad_norm": 0.8806268572807312, "learning_rate": 1.9866433320848793e-05, "loss": 2.3067, "step": 46 }, { "epoch": 0.44549763033175355, "grad_norm": 0.8654487729072571, "learning_rate": 1.9848077530122083e-05, "loss": 2.4552, "step": 47 }, { "epoch": 0.4549763033175355, "grad_norm": 1.3034582138061523, "learning_rate": 1.9828549916653013e-05, "loss": 2.3354, "step": 48 }, { "epoch": 0.46445497630331756, "grad_norm": 0.9486224055290222, "learning_rate": 1.9807852804032306e-05, "loss": 2.4246, "step": 49 }, { "epoch": 0.47393364928909953, "grad_norm": 0.9173024296760559, "learning_rate": 1.9785988655009386e-05, "loss": 2.4309, "step": 50 }, { "epoch": 0.4834123222748815, "grad_norm": 0.9614086747169495, "learning_rate": 1.9762960071199334e-05, "loss": 2.1943, "step": 51 }, { "epoch": 0.4928909952606635, "grad_norm": 0.9894729852676392, "learning_rate": 1.9738769792773338e-05, "loss": 2.3974, "step": 52 }, { "epoch": 0.5023696682464455, "grad_norm": 0.8646323084831238, "learning_rate": 1.9713420698132614e-05, "loss": 2.3535, "step": 53 }, { "epoch": 0.5118483412322274, "grad_norm": 0.9549570083618164, "learning_rate": 1.9686915803565934e-05, "loss": 2.3486, "step": 54 }, { "epoch": 0.5213270142180095, "grad_norm": 0.8866903781890869, "learning_rate": 1.9659258262890683e-05, "loss": 2.3381, "step": 55 }, { "epoch": 0.5308056872037915, "grad_norm": 0.8965452909469604, "learning_rate": 1.963045136707763e-05, "loss": 2.3125, "step": 56 }, { "epoch": 0.5402843601895735, "grad_norm": 1.01371169090271, "learning_rate": 1.960049854385929e-05, "loss": 2.4584, "step": 57 }, { "epoch": 0.5497630331753555, "grad_norm": 0.9794333577156067, "learning_rate": 1.956940335732209e-05, "loss": 2.4287, "step": 58 }, { "epoch": 0.5592417061611374, "grad_norm": 1.146530270576477, "learning_rate": 1.953716950748227e-05, "loss": 2.4154, "step": 59 }, { "epoch": 0.5687203791469194, "grad_norm": 0.9316839575767517, "learning_rate": 1.9503800829845613e-05, "loss": 2.4184, "step": 60 }, { "epoch": 0.5781990521327014, "grad_norm": 0.8945372700691223, "learning_rate": 1.946930129495106e-05, "loss": 2.3579, "step": 61 }, { "epoch": 0.5876777251184834, "grad_norm": 0.9607009291648865, "learning_rate": 1.9433675007898255e-05, "loss": 2.4485, "step": 62 }, { "epoch": 0.5971563981042654, "grad_norm": 0.986282467842102, "learning_rate": 1.9396926207859085e-05, "loss": 2.4574, "step": 63 }, { "epoch": 0.6066350710900474, "grad_norm": 0.9698584079742432, "learning_rate": 1.935905926757326e-05, "loss": 2.3278, "step": 64 }, { "epoch": 0.6161137440758294, "grad_norm": 0.9542692303657532, "learning_rate": 1.932007869282799e-05, "loss": 2.3371, "step": 65 }, { "epoch": 0.6255924170616114, "grad_norm": 0.9010556936264038, "learning_rate": 1.9279989121921846e-05, "loss": 2.34, "step": 66 }, { "epoch": 0.6350710900473934, "grad_norm": 0.9130122661590576, "learning_rate": 1.9238795325112867e-05, "loss": 2.3215, "step": 67 }, { "epoch": 0.6445497630331753, "grad_norm": 1.236346960067749, "learning_rate": 1.9196502204050925e-05, "loss": 2.3667, "step": 68 }, { "epoch": 0.6540284360189573, "grad_norm": 1.457154393196106, "learning_rate": 1.9153114791194475e-05, "loss": 2.3762, "step": 69 }, { "epoch": 0.6635071090047393, "grad_norm": 0.8446010947227478, "learning_rate": 1.910863824921176e-05, "loss": 2.2987, "step": 70 }, { "epoch": 0.6729857819905213, "grad_norm": 0.9021546840667725, "learning_rate": 1.9063077870366504e-05, "loss": 2.3261, "step": 71 }, { "epoch": 0.6824644549763034, "grad_norm": 1.0001662969589233, "learning_rate": 1.901643907588816e-05, "loss": 2.3167, "step": 72 }, { "epoch": 0.6919431279620853, "grad_norm": 0.9423337578773499, "learning_rate": 1.8968727415326885e-05, "loss": 2.4218, "step": 73 }, { "epoch": 0.7014218009478673, "grad_norm": 0.9422402381896973, "learning_rate": 1.8919948565893144e-05, "loss": 2.3388, "step": 74 }, { "epoch": 0.7109004739336493, "grad_norm": 0.9633351564407349, "learning_rate": 1.887010833178222e-05, "loss": 2.3728, "step": 75 }, { "epoch": 0.7203791469194313, "grad_norm": 0.9586493968963623, "learning_rate": 1.881921264348355e-05, "loss": 2.3689, "step": 76 }, { "epoch": 0.7298578199052133, "grad_norm": 0.8738155961036682, "learning_rate": 1.876726755707508e-05, "loss": 2.2707, "step": 77 }, { "epoch": 0.7393364928909952, "grad_norm": 0.9077432751655579, "learning_rate": 1.8714279253502616e-05, "loss": 2.385, "step": 78 }, { "epoch": 0.7488151658767772, "grad_norm": 0.8949682116508484, "learning_rate": 1.866025403784439e-05, "loss": 2.2765, "step": 79 }, { "epoch": 0.7582938388625592, "grad_norm": 0.8834524750709534, "learning_rate": 1.860519833856079e-05, "loss": 2.274, "step": 80 }, { "epoch": 0.7677725118483413, "grad_norm": 1.1563708782196045, "learning_rate": 1.854911870672947e-05, "loss": 2.3958, "step": 81 }, { "epoch": 0.7772511848341233, "grad_norm": 0.914176881313324, "learning_rate": 1.849202181526579e-05, "loss": 2.2595, "step": 82 }, { "epoch": 0.7867298578199052, "grad_norm": 1.2525924444198608, "learning_rate": 1.843391445812886e-05, "loss": 2.3114, "step": 83 }, { "epoch": 0.7962085308056872, "grad_norm": 0.9277448654174805, "learning_rate": 1.837480354951308e-05, "loss": 2.2949, "step": 84 }, { "epoch": 0.8056872037914692, "grad_norm": 0.9295048117637634, "learning_rate": 1.8314696123025456e-05, "loss": 2.2778, "step": 85 }, { "epoch": 0.8151658767772512, "grad_norm": 0.9221338033676147, "learning_rate": 1.8253599330848638e-05, "loss": 2.35, "step": 86 }, { "epoch": 0.8246445497630331, "grad_norm": 0.910460889339447, "learning_rate": 1.819152044288992e-05, "loss": 2.2655, "step": 87 }, { "epoch": 0.8341232227488151, "grad_norm": 1.0350533723831177, "learning_rate": 1.8128466845916156e-05, "loss": 2.4712, "step": 88 }, { "epoch": 0.8436018957345972, "grad_norm": 1.0058749914169312, "learning_rate": 1.806444604267483e-05, "loss": 2.4232, "step": 89 }, { "epoch": 0.8530805687203792, "grad_norm": 0.8652654886245728, "learning_rate": 1.7999465651001297e-05, "loss": 2.3399, "step": 90 }, { "epoch": 0.8625592417061612, "grad_norm": 0.9553002119064331, "learning_rate": 1.7933533402912354e-05, "loss": 2.2772, "step": 91 }, { "epoch": 0.8720379146919431, "grad_norm": 0.9334009885787964, "learning_rate": 1.786665714368617e-05, "loss": 2.3421, "step": 92 }, { "epoch": 0.8815165876777251, "grad_norm": 0.9072061777114868, "learning_rate": 1.7798844830928818e-05, "loss": 2.2691, "step": 93 }, { "epoch": 0.8909952606635071, "grad_norm": 0.9546661376953125, "learning_rate": 1.773010453362737e-05, "loss": 2.3345, "step": 94 }, { "epoch": 0.9004739336492891, "grad_norm": 1.0151286125183105, "learning_rate": 1.766044443118978e-05, "loss": 2.2977, "step": 95 }, { "epoch": 0.909952606635071, "grad_norm": 0.8759846687316895, "learning_rate": 1.758987281247162e-05, "loss": 2.3405, "step": 96 }, { "epoch": 0.919431279620853, "grad_norm": 1.0017191171646118, "learning_rate": 1.7518398074789776e-05, "loss": 2.3445, "step": 97 }, { "epoch": 0.9289099526066351, "grad_norm": 0.9899027943611145, "learning_rate": 1.7446028722923266e-05, "loss": 2.3084, "step": 98 }, { "epoch": 0.9383886255924171, "grad_norm": 0.9640536904335022, "learning_rate": 1.737277336810124e-05, "loss": 2.2341, "step": 99 }, { "epoch": 0.9478672985781991, "grad_norm": 0.917986273765564, "learning_rate": 1.7298640726978357e-05, "loss": 2.3085, "step": 100 }, { "epoch": 0.957345971563981, "grad_norm": 0.9574633240699768, "learning_rate": 1.7223639620597556e-05, "loss": 2.3822, "step": 101 }, { "epoch": 0.966824644549763, "grad_norm": 0.9915265440940857, "learning_rate": 1.7147778973340466e-05, "loss": 2.3087, "step": 102 }, { "epoch": 0.976303317535545, "grad_norm": 0.9175536632537842, "learning_rate": 1.7071067811865477e-05, "loss": 2.3098, "step": 103 }, { "epoch": 0.985781990521327, "grad_norm": 0.9363272190093994, "learning_rate": 1.699351526403367e-05, "loss": 2.3394, "step": 104 }, { "epoch": 0.995260663507109, "grad_norm": 1.0321186780929565, "learning_rate": 1.6915130557822698e-05, "loss": 2.1852, "step": 105 }, { "epoch": 1.0, "grad_norm": 1.5194014310836792, "learning_rate": 1.6835923020228714e-05, "loss": 2.3883, "step": 106 } ], "logging_steps": 1, "max_steps": 318, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1900878220998656e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }