{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9975369458128078, "eval_steps": 500, "global_step": 270, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003694581280788177, "grad_norm": 1.3164679266754151, "learning_rate": 3.7037037037037036e-07, "loss": 0.5867, "step": 1 }, { "epoch": 0.01847290640394089, "grad_norm": 0.8941813303905811, "learning_rate": 1.8518518518518519e-06, "loss": 0.5684, "step": 5 }, { "epoch": 0.03694581280788178, "grad_norm": 0.4848149582760561, "learning_rate": 3.7037037037037037e-06, "loss": 0.5272, "step": 10 }, { "epoch": 0.05541871921182266, "grad_norm": 0.23489374203812283, "learning_rate": 5.555555555555557e-06, "loss": 0.3213, "step": 15 }, { "epoch": 0.07389162561576355, "grad_norm": 0.17590513475537736, "learning_rate": 7.4074074074074075e-06, "loss": 0.1719, "step": 20 }, { "epoch": 0.09236453201970443, "grad_norm": 0.09924337039651394, "learning_rate": 9.25925925925926e-06, "loss": 0.1392, "step": 25 }, { "epoch": 0.11083743842364532, "grad_norm": 0.06829427058420333, "learning_rate": 9.996239762521152e-06, "loss": 0.1168, "step": 30 }, { "epoch": 0.12931034482758622, "grad_norm": 0.06324690561682543, "learning_rate": 9.973281012033009e-06, "loss": 0.1009, "step": 35 }, { "epoch": 0.1477832512315271, "grad_norm": 0.049072943668117305, "learning_rate": 9.929548316723983e-06, "loss": 0.0886, "step": 40 }, { "epoch": 0.16625615763546797, "grad_norm": 0.04472692512923622, "learning_rate": 9.86522435289912e-06, "loss": 0.0772, "step": 45 }, { "epoch": 0.18472906403940886, "grad_norm": 0.04320806158867543, "learning_rate": 9.7805778088694e-06, "loss": 0.0677, "step": 50 }, { "epoch": 0.20320197044334976, "grad_norm": 0.04347088345515666, "learning_rate": 9.67596226261095e-06, "loss": 0.0652, "step": 55 }, { "epoch": 0.22167487684729065, "grad_norm": 0.04636829853932609, "learning_rate": 9.551814704830734e-06, "loss": 0.0618, "step": 60 }, { "epoch": 0.24014778325123154, "grad_norm": 0.04911651390500423, "learning_rate": 9.40865371360804e-06, "loss": 0.0564, "step": 65 }, { "epoch": 0.25862068965517243, "grad_norm": 0.046124586761473525, "learning_rate": 9.247077288236488e-06, "loss": 0.0496, "step": 70 }, { "epoch": 0.2770935960591133, "grad_norm": 0.04282690739591252, "learning_rate": 9.067760351314838e-06, "loss": 0.0443, "step": 75 }, { "epoch": 0.2955665024630542, "grad_norm": 0.0501495787415551, "learning_rate": 8.871451929520662e-06, "loss": 0.0472, "step": 80 }, { "epoch": 0.31403940886699505, "grad_norm": 0.048856614296215864, "learning_rate": 8.658972024843063e-06, "loss": 0.045, "step": 85 }, { "epoch": 0.33251231527093594, "grad_norm": 0.052135344528722635, "learning_rate": 8.43120818934367e-06, "loss": 0.0407, "step": 90 }, { "epoch": 0.35098522167487683, "grad_norm": 0.04496874450828456, "learning_rate": 8.18911181775353e-06, "loss": 0.0443, "step": 95 }, { "epoch": 0.3694581280788177, "grad_norm": 0.046961464584805046, "learning_rate": 7.93369417339209e-06, "loss": 0.043, "step": 100 }, { "epoch": 0.3879310344827586, "grad_norm": 0.03774079876533218, "learning_rate": 7.666022164008458e-06, "loss": 0.039, "step": 105 }, { "epoch": 0.4064039408866995, "grad_norm": 0.039388091387549375, "learning_rate": 7.387213885189746e-06, "loss": 0.043, "step": 110 }, { "epoch": 0.4248768472906404, "grad_norm": 0.04010524552891231, "learning_rate": 7.098433949952146e-06, "loss": 0.0418, "step": 115 }, { "epoch": 0.4433497536945813, "grad_norm": 0.03364245597783716, "learning_rate": 6.800888624023552e-06, "loss": 0.0396, "step": 120 }, { "epoch": 0.4618226600985222, "grad_norm": 0.03214650791918716, "learning_rate": 6.495820787138209e-06, "loss": 0.0343, "step": 125 }, { "epoch": 0.4802955665024631, "grad_norm": 0.04248682387196562, "learning_rate": 6.184504741390596e-06, "loss": 0.0385, "step": 130 }, { "epoch": 0.4987684729064039, "grad_norm": 0.041745690695414894, "learning_rate": 5.8682408883346535e-06, "loss": 0.0354, "step": 135 }, { "epoch": 0.5172413793103449, "grad_norm": 0.043998119847084, "learning_rate": 5.548350297062659e-06, "loss": 0.0363, "step": 140 }, { "epoch": 0.5357142857142857, "grad_norm": 0.03523911995673237, "learning_rate": 5.2261691859535325e-06, "loss": 0.0311, "step": 145 }, { "epoch": 0.5541871921182266, "grad_norm": 0.051683301339415226, "learning_rate": 4.903043341140879e-06, "loss": 0.0322, "step": 150 }, { "epoch": 0.5726600985221675, "grad_norm": 0.029645536386539162, "learning_rate": 4.580322495015466e-06, "loss": 0.0297, "step": 155 }, { "epoch": 0.5911330049261084, "grad_norm": 0.038478089898929216, "learning_rate": 4.259354688243758e-06, "loss": 0.0373, "step": 160 }, { "epoch": 0.6096059113300493, "grad_norm": 0.04016397060959619, "learning_rate": 3.941480638852948e-06, "loss": 0.0293, "step": 165 }, { "epoch": 0.6280788177339901, "grad_norm": 0.03295300026030036, "learning_rate": 3.6280281419034934e-06, "loss": 0.0306, "step": 170 }, { "epoch": 0.646551724137931, "grad_norm": 0.04371251122688708, "learning_rate": 3.3203065231422904e-06, "loss": 0.0292, "step": 175 }, { "epoch": 0.6650246305418719, "grad_norm": 0.02878432244430226, "learning_rate": 3.019601169804216e-06, "loss": 0.0348, "step": 180 }, { "epoch": 0.6834975369458128, "grad_norm": 0.03582599925982462, "learning_rate": 2.7271681614074973e-06, "loss": 0.0292, "step": 185 }, { "epoch": 0.7019704433497537, "grad_norm": 0.04498392884678493, "learning_rate": 2.4442290229706344e-06, "loss": 0.0355, "step": 190 }, { "epoch": 0.7204433497536946, "grad_norm": 0.03888587884987569, "learning_rate": 2.171965622567308e-06, "loss": 0.0311, "step": 195 }, { "epoch": 0.7389162561576355, "grad_norm": 0.04684421314626146, "learning_rate": 1.9115152345327154e-06, "loss": 0.0391, "step": 200 }, { "epoch": 0.7573891625615764, "grad_norm": 0.031274018167510506, "learning_rate": 1.6639657889429017e-06, "loss": 0.0275, "step": 205 }, { "epoch": 0.7758620689655172, "grad_norm": 0.03793766745215515, "learning_rate": 1.4303513272105057e-06, "loss": 0.0312, "step": 210 }, { "epoch": 0.7943349753694581, "grad_norm": 0.03084526133895099, "learning_rate": 1.2116476827794104e-06, "loss": 0.0334, "step": 215 }, { "epoch": 0.812807881773399, "grad_norm": 0.035909978135080484, "learning_rate": 1.008768404960535e-06, "loss": 0.0329, "step": 220 }, { "epoch": 0.8312807881773399, "grad_norm": 0.03337500963867465, "learning_rate": 8.225609429353187e-07, "loss": 0.0299, "step": 225 }, { "epoch": 0.8497536945812808, "grad_norm": 0.04194808091582502, "learning_rate": 6.53803105866761e-07, "loss": 0.0312, "step": 230 }, { "epoch": 0.8682266009852216, "grad_norm": 0.030185202275612954, "learning_rate": 5.031998139045352e-07, "loss": 0.0291, "step": 235 }, { "epoch": 0.8866995073891626, "grad_norm": 0.03678202209697851, "learning_rate": 3.7138015365554834e-07, "loss": 0.0329, "step": 240 }, { "epoch": 0.9051724137931034, "grad_norm": 0.026908956915127933, "learning_rate": 2.5889475041961767e-07, "loss": 0.0318, "step": 245 }, { "epoch": 0.9236453201970444, "grad_norm": 0.04036220391547687, "learning_rate": 1.6621346816668993e-07, "loss": 0.0304, "step": 250 }, { "epoch": 0.9421182266009852, "grad_norm": 0.037246305929994866, "learning_rate": 9.372344686307655e-08, "loss": 0.036, "step": 255 }, { "epoch": 0.9605911330049262, "grad_norm": 0.031641745281581694, "learning_rate": 4.172748534499449e-08, "loss": 0.0296, "step": 260 }, { "epoch": 0.979064039408867, "grad_norm": 0.03135655037281048, "learning_rate": 1.044277649433989e-08, "loss": 0.0294, "step": 265 }, { "epoch": 0.9975369458128078, "grad_norm": 0.03097346411772941, "learning_rate": 0.0, "loss": 0.033, "step": 270 }, { "epoch": 0.9975369458128078, "step": 270, "total_flos": 8.643970128528015e+17, "train_loss": 0.07066047384783074, "train_runtime": 3995.5402, "train_samples_per_second": 1.625, "train_steps_per_second": 0.068 } ], "logging_steps": 5, "max_steps": 270, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.643970128528015e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }