{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.9942446043165467,
  "eval_steps": 500,
  "global_step": 1388,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "learning_rate": 3.999994877043978e-05,
      "loss": 1.5749,
      "step": 1
    },
    {
      "epoch": 0.06,
      "learning_rate": 3.997951166621575e-05,
      "loss": 1.2447,
      "step": 20
    },
    {
      "epoch": 0.12,
      "learning_rate": 3.9918088642045126e-05,
      "loss": 1.1636,
      "step": 40
    },
    {
      "epoch": 0.17,
      "learning_rate": 3.981585677303025e-05,
      "loss": 1.1379,
      "step": 60
    },
    {
      "epoch": 0.23,
      "learning_rate": 3.967302551523671e-05,
      "loss": 1.1114,
      "step": 80
    },
    {
      "epoch": 0.29,
      "learning_rate": 3.948988750611294e-05,
      "loss": 1.1086,
      "step": 100
    },
    {
      "epoch": 0.35,
      "learning_rate": 3.9266817964924905e-05,
      "loss": 1.0915,
      "step": 120
    },
    {
      "epoch": 0.4,
      "learning_rate": 3.900427392399429e-05,
      "loss": 1.075,
      "step": 140
    },
    {
      "epoch": 0.46,
      "learning_rate": 3.870279329231546e-05,
      "loss": 1.0875,
      "step": 160
    },
    {
      "epoch": 0.52,
      "learning_rate": 3.836299375346956e-05,
      "loss": 1.0696,
      "step": 180
    },
    {
      "epoch": 0.58,
      "learning_rate": 3.798557150009373e-05,
      "loss": 1.0614,
      "step": 200
    },
    {
      "epoch": 0.63,
      "learning_rate": 3.757129980749847e-05,
      "loss": 1.0638,
      "step": 220
    },
    {
      "epoch": 0.69,
      "learning_rate": 3.712102744935529e-05,
      "loss": 1.0545,
      "step": 240
    },
    {
      "epoch": 0.75,
      "learning_rate": 3.6635676958700946e-05,
      "loss": 1.0508,
      "step": 260
    },
    {
      "epoch": 0.81,
      "learning_rate": 3.611624273782092e-05,
      "loss": 1.0566,
      "step": 280
    },
    {
      "epoch": 0.86,
      "learning_rate": 3.556378902088484e-05,
      "loss": 1.0577,
      "step": 300
    },
    {
      "epoch": 0.92,
      "learning_rate": 3.4979447693508e-05,
      "loss": 1.0428,
      "step": 320
    },
    {
      "epoch": 0.98,
      "learning_rate": 3.436441597370635e-05,
      "loss": 1.0456,
      "step": 340
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.035969614982605,
      "eval_runtime": 20.059,
      "eval_samples_per_second": 13.909,
      "eval_steps_per_second": 3.49,
      "step": 347
    },
    {
      "epoch": 1.04,
      "learning_rate": 3.371995395899618e-05,
      "loss": 1.0082,
      "step": 360
    },
    {
      "epoch": 1.09,
      "learning_rate": 3.304738204466437e-05,
      "loss": 0.9889,
      "step": 380
    },
    {
      "epoch": 1.15,
      "learning_rate": 3.234807821849838e-05,
      "loss": 0.9786,
      "step": 400
    },
    {
      "epoch": 1.21,
      "learning_rate": 3.162347523751894e-05,
      "loss": 0.9881,
      "step": 420
    },
    {
      "epoch": 1.27,
      "learning_rate": 3.0875057692499566e-05,
      "loss": 0.9747,
      "step": 440
    },
    {
      "epoch": 1.32,
      "learning_rate": 3.0104358966287503e-05,
      "loss": 0.9842,
      "step": 460
    },
    {
      "epoch": 1.38,
      "learning_rate": 2.9312958092157724e-05,
      "loss": 0.9846,
      "step": 480
    },
    {
      "epoch": 1.44,
      "learning_rate": 2.850247651863686e-05,
      "loss": 0.9801,
      "step": 500
    },
    {
      "epoch": 1.5,
      "learning_rate": 2.767457478742533e-05,
      "loss": 0.9834,
      "step": 520
    },
    {
      "epoch": 1.55,
      "learning_rate": 2.6830949131224118e-05,
      "loss": 0.9831,
      "step": 540
    },
    {
      "epoch": 1.61,
      "learning_rate": 2.5973327998436527e-05,
      "loss": 0.9787,
      "step": 560
    },
    {
      "epoch": 1.67,
      "learning_rate": 2.5103468511865456e-05,
      "loss": 0.981,
      "step": 580
    },
    {
      "epoch": 1.73,
      "learning_rate": 2.4223152868661535e-05,
      "loss": 0.9845,
      "step": 600
    },
    {
      "epoch": 1.78,
      "learning_rate": 2.3334184688898107e-05,
      "loss": 0.9754,
      "step": 620
    },
    {
      "epoch": 1.84,
      "learning_rate": 2.2438385320254234e-05,
      "loss": 0.9779,
      "step": 640
    },
    {
      "epoch": 1.9,
      "learning_rate": 2.1537590106376758e-05,
      "loss": 0.9737,
      "step": 660
    },
    {
      "epoch": 1.96,
      "learning_rate": 2.0633644626567007e-05,
      "loss": 0.9714,
      "step": 680
    },
    {
      "epoch": 2.0,
      "eval_loss": 1.0180176496505737,
      "eval_runtime": 20.0699,
      "eval_samples_per_second": 13.901,
      "eval_steps_per_second": 3.488,
      "step": 695
    },
    {
      "epoch": 2.01,
      "learning_rate": 1.9728400914496288e-05,
      "loss": 0.9669,
      "step": 700
    },
    {
      "epoch": 2.07,
      "learning_rate": 1.882371366369749e-05,
      "loss": 0.9478,
      "step": 720
    },
    {
      "epoch": 2.13,
      "learning_rate": 1.79214364276071e-05,
      "loss": 0.9458,
      "step": 740
    },
    {
      "epoch": 2.19,
      "learning_rate": 1.702341782194301e-05,
      "loss": 0.9307,
      "step": 760
    },
    {
      "epoch": 2.24,
      "learning_rate": 1.6131497737198942e-05,
      "loss": 0.9435,
      "step": 780
    },
    {
      "epoch": 2.3,
      "learning_rate": 1.5247503569015413e-05,
      "loss": 0.945,
      "step": 800
    },
    {
      "epoch": 2.36,
      "learning_rate": 1.437324647415053e-05,
      "loss": 0.9416,
      "step": 820
    },
    {
      "epoch": 2.42,
      "learning_rate": 1.3510517659721583e-05,
      "loss": 0.9476,
      "step": 840
    },
    {
      "epoch": 2.47,
      "learning_rate": 1.2661084713320093e-05,
      "loss": 0.946,
      "step": 860
    },
    {
      "epoch": 2.53,
      "learning_rate": 1.182668798151939e-05,
      "loss": 0.9414,
      "step": 880
    },
    {
      "epoch": 2.59,
      "learning_rate": 1.1009037004194424e-05,
      "loss": 0.9439,
      "step": 900
    },
    {
      "epoch": 2.65,
      "learning_rate": 1.020980701195946e-05,
      "loss": 0.9486,
      "step": 920
    },
    {
      "epoch": 2.71,
      "learning_rate": 9.430635493899609e-06,
      "loss": 0.949,
      "step": 940
    },
    {
      "epoch": 2.76,
      "learning_rate": 8.673118842628595e-06,
      "loss": 0.9376,
      "step": 960
    },
    {
      "epoch": 2.82,
      "learning_rate": 7.938809083546264e-06,
      "loss": 0.9432,
      "step": 980
    },
    {
      "epoch": 2.88,
      "learning_rate": 7.229210694997113e-06,
      "loss": 0.9457,
      "step": 1000
    },
    {
      "epoch": 2.94,
      "learning_rate": 6.545777525844883e-06,
      "loss": 0.9357,
      "step": 1020
    },
    {
      "epoch": 2.99,
      "learning_rate": 5.889909816778458e-06,
      "loss": 0.9335,
      "step": 1040
    },
    {
      "epoch": 3.0,
      "eval_loss": 1.0176299810409546,
      "eval_runtime": 20.0069,
      "eval_samples_per_second": 13.945,
      "eval_steps_per_second": 3.499,
      "step": 1042
    },
    {
      "epoch": 3.05,
      "learning_rate": 5.262951331452011e-06,
      "loss": 0.937,
      "step": 1060
    },
    {
      "epoch": 3.11,
      "learning_rate": 4.6661866033371506e-06,
      "loss": 0.9351,
      "step": 1080
    },
    {
      "epoch": 3.17,
      "learning_rate": 4.100838303927914e-06,
      "loss": 0.9415,
      "step": 1100
    },
    {
      "epoch": 3.22,
      "learning_rate": 3.5680647376905666e-06,
      "loss": 0.9293,
      "step": 1120
    },
    {
      "epoch": 3.28,
      "learning_rate": 3.0689574688907607e-06,
      "loss": 0.9304,
      "step": 1140
    },
    {
      "epoch": 3.34,
      "learning_rate": 2.604539085160218e-06,
      "loss": 0.9254,
      "step": 1160
    },
    {
      "epoch": 3.4,
      "learning_rate": 2.1757611023850876e-06,
      "loss": 0.9293,
      "step": 1180
    },
    {
      "epoch": 3.45,
      "learning_rate": 1.7835020152084116e-06,
      "loss": 0.9391,
      "step": 1200
    },
    {
      "epoch": 3.51,
      "learning_rate": 1.4285654971409902e-06,
      "loss": 0.9363,
      "step": 1220
    },
    {
      "epoch": 3.57,
      "learning_rate": 1.1116787539682571e-06,
      "loss": 0.9506,
      "step": 1240
    },
    {
      "epoch": 3.63,
      "learning_rate": 8.334910338268054e-07,
      "loss": 0.9226,
      "step": 1260
    },
    {
      "epoch": 3.68,
      "learning_rate": 5.945722970031332e-07,
      "loss": 0.9305,
      "step": 1280
    },
    {
      "epoch": 3.74,
      "learning_rate": 3.9541204817997283e-07,
      "loss": 0.9306,
      "step": 1300
    },
    {
      "epoch": 3.8,
      "learning_rate": 2.3641833352276768e-07,
      "loss": 0.9344,
      "step": 1320
    },
    {
      "epoch": 3.86,
      "learning_rate": 1.1791690466107286e-07,
      "loss": 0.93,
      "step": 1340
    },
    {
      "epoch": 3.91,
      "learning_rate": 4.0150551277724494e-08,
      "loss": 0.9344,
      "step": 1360
    },
    {
      "epoch": 3.97,
      "learning_rate": 3.2786036732557203e-09,
      "loss": 0.9348,
      "step": 1380
    },
    {
      "epoch": 3.99,
      "eval_loss": 1.0186352729797363,
      "eval_runtime": 19.987,
      "eval_samples_per_second": 13.959,
      "eval_steps_per_second": 3.502,
      "step": 1388
    },
    {
      "epoch": 3.99,
      "step": 1388,
      "total_flos": 1.7642090681398723e+18,
      "train_loss": 0.9866050820185747,
      "train_runtime": 37340.4665,
      "train_samples_per_second": 3.722,
      "train_steps_per_second": 0.037
    }
  ],
  "logging_steps": 20,
  "max_steps": 1388,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 80,
  "total_flos": 1.7642090681398723e+18,
  "train_batch_size": 10,
  "trial_name": null,
  "trial_params": null
}