{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9328575759695888,
  "eval_steps": 500,
  "global_step": 20000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 0.3762490749359131,
      "learning_rate": 0.00046641791044776124,
      "loss": 8.4353,
      "step": 500
    },
    {
      "epoch": 0.02,
      "eval_loss": 6.534626483917236,
      "eval_runtime": 213.0459,
      "eval_samples_per_second": 264.877,
      "eval_steps_per_second": 8.28,
      "step": 500
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.518302321434021,
      "learning_rate": 0.0009328358208955225,
      "loss": 5.5553,
      "step": 1000
    },
    {
      "epoch": 0.05,
      "eval_loss": 4.976442813873291,
      "eval_runtime": 215.6331,
      "eval_samples_per_second": 261.699,
      "eval_steps_per_second": 8.181,
      "step": 1000
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.8457584381103516,
      "learning_rate": 0.0009789877154220063,
      "loss": 4.5996,
      "step": 1500
    },
    {
      "epoch": 0.07,
      "eval_loss": 4.408080101013184,
      "eval_runtime": 215.4944,
      "eval_samples_per_second": 261.868,
      "eval_steps_per_second": 8.186,
      "step": 1500
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.3413056135177612,
      "learning_rate": 0.0009544406539991162,
      "loss": 4.1961,
      "step": 2000
    },
    {
      "epoch": 0.09,
      "eval_loss": 4.121068954467773,
      "eval_runtime": 216.0979,
      "eval_samples_per_second": 261.136,
      "eval_steps_per_second": 8.163,
      "step": 2000
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.2387056350708008,
      "learning_rate": 0.0009298935925762261,
      "loss": 3.9584,
      "step": 2500
    },
    {
      "epoch": 0.12,
      "eval_loss": 3.929608106613159,
      "eval_runtime": 216.438,
      "eval_samples_per_second": 260.726,
      "eval_steps_per_second": 8.15,
      "step": 2500
    },
    {
      "epoch": 0.14,
      "grad_norm": 1.3727014064788818,
      "learning_rate": 0.0009053465311533362,
      "loss": 3.7833,
      "step": 3000
    },
    {
      "epoch": 0.14,
      "eval_loss": 3.787693738937378,
      "eval_runtime": 218.2815,
      "eval_samples_per_second": 258.524,
      "eval_steps_per_second": 8.081,
      "step": 3000
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.5133860111236572,
      "learning_rate": 0.0008807994697304462,
      "loss": 3.6616,
      "step": 3500
    },
    {
      "epoch": 0.16,
      "eval_loss": 3.683837652206421,
      "eval_runtime": 217.4429,
      "eval_samples_per_second": 259.521,
      "eval_steps_per_second": 8.112,
      "step": 3500
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.3134872913360596,
      "learning_rate": 0.0008562524083075562,
      "loss": 3.5679,
      "step": 4000
    },
    {
      "epoch": 0.19,
      "eval_loss": 3.6060750484466553,
      "eval_runtime": 216.1087,
      "eval_samples_per_second": 261.123,
      "eval_steps_per_second": 8.163,
      "step": 4000
    },
    {
      "epoch": 0.21,
      "grad_norm": 1.3464657068252563,
      "learning_rate": 0.0008317053468846664,
      "loss": 3.4964,
      "step": 4500
    },
    {
      "epoch": 0.21,
      "eval_loss": 3.54757022857666,
      "eval_runtime": 216.7125,
      "eval_samples_per_second": 260.396,
      "eval_steps_per_second": 8.14,
      "step": 4500
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.3200907707214355,
      "learning_rate": 0.0008071582854617764,
      "loss": 3.4488,
      "step": 5000
    },
    {
      "epoch": 0.23,
      "eval_loss": 3.4986045360565186,
      "eval_runtime": 217.9581,
      "eval_samples_per_second": 258.908,
      "eval_steps_per_second": 8.093,
      "step": 5000
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.514103651046753,
      "learning_rate": 0.0007826112240388863,
      "loss": 3.3973,
      "step": 5500
    },
    {
      "epoch": 0.26,
      "eval_loss": 3.4664642810821533,
      "eval_runtime": 218.2073,
      "eval_samples_per_second": 258.612,
      "eval_steps_per_second": 8.084,
      "step": 5500
    },
    {
      "epoch": 0.28,
      "grad_norm": 1.4389874935150146,
      "learning_rate": 0.0007580641626159963,
      "loss": 3.3587,
      "step": 6000
    },
    {
      "epoch": 0.28,
      "eval_loss": 3.4231605529785156,
      "eval_runtime": 218.4087,
      "eval_samples_per_second": 258.373,
      "eval_steps_per_second": 8.077,
      "step": 6000
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.4937726259231567,
      "learning_rate": 0.0007335171011931065,
      "loss": 3.3231,
      "step": 6500
    },
    {
      "epoch": 0.3,
      "eval_loss": 3.390798330307007,
      "eval_runtime": 216.9327,
      "eval_samples_per_second": 260.131,
      "eval_steps_per_second": 8.132,
      "step": 6500
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.5942374467849731,
      "learning_rate": 0.0007089700397702165,
      "loss": 3.2971,
      "step": 7000
    },
    {
      "epoch": 0.33,
      "eval_loss": 3.361341714859009,
      "eval_runtime": 217.461,
      "eval_samples_per_second": 259.499,
      "eval_steps_per_second": 8.112,
      "step": 7000
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.6310980319976807,
      "learning_rate": 0.0006844229783473265,
      "loss": 3.2679,
      "step": 7500
    },
    {
      "epoch": 0.35,
      "eval_loss": 3.33766770362854,
      "eval_runtime": 217.3952,
      "eval_samples_per_second": 259.578,
      "eval_steps_per_second": 8.114,
      "step": 7500
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.6001648902893066,
      "learning_rate": 0.0006598759169244364,
      "loss": 3.2436,
      "step": 8000
    },
    {
      "epoch": 0.37,
      "eval_loss": 3.3167307376861572,
      "eval_runtime": 217.5617,
      "eval_samples_per_second": 259.379,
      "eval_steps_per_second": 8.108,
      "step": 8000
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.590154767036438,
      "learning_rate": 0.0006353288555015467,
      "loss": 3.2322,
      "step": 8500
    },
    {
      "epoch": 0.4,
      "eval_loss": 3.2952842712402344,
      "eval_runtime": 217.0219,
      "eval_samples_per_second": 260.024,
      "eval_steps_per_second": 8.128,
      "step": 8500
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.7063907384872437,
      "learning_rate": 0.0006107817940786566,
      "loss": 3.208,
      "step": 9000
    },
    {
      "epoch": 0.42,
      "eval_loss": 3.2771995067596436,
      "eval_runtime": 218.9112,
      "eval_samples_per_second": 257.78,
      "eval_steps_per_second": 8.058,
      "step": 9000
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.544815182685852,
      "learning_rate": 0.0005862347326557666,
      "loss": 3.1923,
      "step": 9500
    },
    {
      "epoch": 0.44,
      "eval_loss": 3.263777017593384,
      "eval_runtime": 218.0114,
      "eval_samples_per_second": 258.844,
      "eval_steps_per_second": 8.091,
      "step": 9500
    },
    {
      "epoch": 0.47,
      "grad_norm": 1.615902304649353,
      "learning_rate": 0.0005616876712328766,
      "loss": 3.1683,
      "step": 10000
    },
    {
      "epoch": 0.47,
      "eval_loss": 3.2488255500793457,
      "eval_runtime": 217.5444,
      "eval_samples_per_second": 259.4,
      "eval_steps_per_second": 8.109,
      "step": 10000
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.533378005027771,
      "learning_rate": 0.0005371897039328324,
      "loss": 3.158,
      "step": 10500
    },
    {
      "epoch": 0.49,
      "eval_loss": 3.2329940795898438,
      "eval_runtime": 214.5885,
      "eval_samples_per_second": 262.973,
      "eval_steps_per_second": 8.22,
      "step": 10500
    },
    {
      "epoch": 0.51,
      "grad_norm": 1.9465535879135132,
      "learning_rate": 0.0005126426425099425,
      "loss": 3.1439,
      "step": 11000
    },
    {
      "epoch": 0.51,
      "eval_loss": 3.2245969772338867,
      "eval_runtime": 216.8559,
      "eval_samples_per_second": 260.224,
      "eval_steps_per_second": 8.134,
      "step": 11000
    },
    {
      "epoch": 0.54,
      "grad_norm": 1.7405271530151367,
      "learning_rate": 0.0004880955810870526,
      "loss": 3.1374,
      "step": 11500
    },
    {
      "epoch": 0.54,
      "eval_loss": 3.208115339279175,
      "eval_runtime": 217.1863,
      "eval_samples_per_second": 259.828,
      "eval_steps_per_second": 8.122,
      "step": 11500
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.6891497373580933,
      "learning_rate": 0.0004635485196641626,
      "loss": 3.1213,
      "step": 12000
    },
    {
      "epoch": 0.56,
      "eval_loss": 3.200620651245117,
      "eval_runtime": 218.4169,
      "eval_samples_per_second": 258.364,
      "eval_steps_per_second": 8.076,
      "step": 12000
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.73305082321167,
      "learning_rate": 0.00043909964648696415,
      "loss": 3.1144,
      "step": 12500
    },
    {
      "epoch": 0.58,
      "eval_loss": 3.188239812850952,
      "eval_runtime": 215.556,
      "eval_samples_per_second": 261.793,
      "eval_steps_per_second": 8.183,
      "step": 12500
    },
    {
      "epoch": 0.61,
      "grad_norm": 1.7891405820846558,
      "learning_rate": 0.0004145525850640742,
      "loss": 3.0963,
      "step": 13000
    },
    {
      "epoch": 0.61,
      "eval_loss": 3.174497365951538,
      "eval_runtime": 214.2396,
      "eval_samples_per_second": 263.401,
      "eval_steps_per_second": 8.234,
      "step": 13000
    },
    {
      "epoch": 0.63,
      "grad_norm": 1.6677337884902954,
      "learning_rate": 0.00039000552364118425,
      "loss": 3.097,
      "step": 13500
    },
    {
      "epoch": 0.63,
      "eval_loss": 3.165832042694092,
      "eval_runtime": 214.1769,
      "eval_samples_per_second": 263.479,
      "eval_steps_per_second": 8.236,
      "step": 13500
    },
    {
      "epoch": 0.65,
      "grad_norm": 1.7241294384002686,
      "learning_rate": 0.00036545846221829433,
      "loss": 3.076,
      "step": 14000
    },
    {
      "epoch": 0.65,
      "eval_loss": 3.1584606170654297,
      "eval_runtime": 264.545,
      "eval_samples_per_second": 213.313,
      "eval_steps_per_second": 6.668,
      "step": 14000
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.835271954536438,
      "learning_rate": 0.00034091140079540435,
      "loss": 3.0666,
      "step": 14500
    },
    {
      "epoch": 0.68,
      "eval_loss": 3.151036262512207,
      "eval_runtime": 218.4608,
      "eval_samples_per_second": 258.312,
      "eval_steps_per_second": 8.075,
      "step": 14500
    },
    {
      "epoch": 0.7,
      "grad_norm": 1.7558091878890991,
      "learning_rate": 0.00031636433937251427,
      "loss": 3.0709,
      "step": 15000
    },
    {
      "epoch": 0.7,
      "eval_loss": 3.139883279800415,
      "eval_runtime": 11111.1382,
      "eval_samples_per_second": 5.079,
      "eval_steps_per_second": 0.159,
      "step": 15000
    },
    {
      "epoch": 0.72,
      "grad_norm": 1.6270012855529785,
      "learning_rate": 0.00029186637207247017,
      "loss": 3.0609,
      "step": 15500
    },
    {
      "epoch": 0.72,
      "eval_loss": 3.1347908973693848,
      "eval_runtime": 258.6315,
      "eval_samples_per_second": 218.191,
      "eval_steps_per_second": 6.821,
      "step": 15500
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.971229910850525,
      "learning_rate": 0.0002673193106495802,
      "loss": 3.0527,
      "step": 16000
    },
    {
      "epoch": 0.75,
      "eval_loss": 3.127486228942871,
      "eval_runtime": 243.3689,
      "eval_samples_per_second": 231.874,
      "eval_steps_per_second": 7.248,
      "step": 16000
    },
    {
      "epoch": 0.77,
      "grad_norm": 1.7010846138000488,
      "learning_rate": 0.00024277224922669027,
      "loss": 3.0449,
      "step": 16500
    },
    {
      "epoch": 0.77,
      "eval_loss": 3.122706890106201,
      "eval_runtime": 259.8928,
      "eval_samples_per_second": 217.132,
      "eval_steps_per_second": 6.787,
      "step": 16500
    },
    {
      "epoch": 0.79,
      "grad_norm": 2.0105388164520264,
      "learning_rate": 0.00021822518780380032,
      "loss": 3.0407,
      "step": 17000
    },
    {
      "epoch": 0.79,
      "eval_loss": 3.1165504455566406,
      "eval_runtime": 264.3004,
      "eval_samples_per_second": 213.511,
      "eval_steps_per_second": 6.674,
      "step": 17000
    },
    {
      "epoch": 0.82,
      "grad_norm": 1.7834789752960205,
      "learning_rate": 0.00019372722050375605,
      "loss": 3.0376,
      "step": 17500
    },
    {
      "epoch": 0.82,
      "eval_loss": 3.10659122467041,
      "eval_runtime": 265.7435,
      "eval_samples_per_second": 212.351,
      "eval_steps_per_second": 6.638,
      "step": 17500
    },
    {
      "epoch": 0.84,
      "grad_norm": 1.8829165697097778,
      "learning_rate": 0.0001691801590808661,
      "loss": 3.0251,
      "step": 18000
    },
    {
      "epoch": 0.84,
      "eval_loss": 3.1006741523742676,
      "eval_runtime": 267.8646,
      "eval_samples_per_second": 210.67,
      "eval_steps_per_second": 6.585,
      "step": 18000
    },
    {
      "epoch": 0.86,
      "grad_norm": 1.7196826934814453,
      "learning_rate": 0.00014463309765797616,
      "loss": 3.0226,
      "step": 18500
    },
    {
      "epoch": 0.86,
      "eval_loss": 3.0954883098602295,
      "eval_runtime": 263.6733,
      "eval_samples_per_second": 214.019,
      "eval_steps_per_second": 6.69,
      "step": 18500
    },
    {
      "epoch": 0.89,
      "grad_norm": 1.7544931173324585,
      "learning_rate": 0.00012008603623508621,
      "loss": 3.0203,
      "step": 19000
    },
    {
      "epoch": 0.89,
      "eval_loss": 3.0885541439056396,
      "eval_runtime": 225.0692,
      "eval_samples_per_second": 250.727,
      "eval_steps_per_second": 7.838,
      "step": 19000
    },
    {
      "epoch": 0.91,
      "grad_norm": 1.659536361694336,
      "learning_rate": 9.558806893504194e-05,
      "loss": 3.0151,
      "step": 19500
    },
    {
      "epoch": 0.91,
      "eval_loss": 3.083906888961792,
      "eval_runtime": 257.4684,
      "eval_samples_per_second": 219.176,
      "eval_steps_per_second": 6.851,
      "step": 19500
    },
    {
      "epoch": 0.93,
      "grad_norm": 1.6146643161773682,
      "learning_rate": 7.1041007512152e-05,
      "loss": 3.0018,
      "step": 20000
    },
    {
      "epoch": 0.93,
      "eval_loss": 3.0793216228485107,
      "eval_runtime": 231.3706,
      "eval_samples_per_second": 243.899,
      "eval_steps_per_second": 7.624,
      "step": 20000
    }
  ],
  "logging_steps": 500,
  "max_steps": 21439,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 10000,
  "total_flos": 196340613120000.0,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}