{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 3118, "global_step": 15587, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.032079042761364, "grad_norm": 1.7145991325378418, "learning_rate": 0.0007978742969568658, "loss": 0.4787, "step": 500 }, { "epoch": 0.064158085522728, "grad_norm": 1.2714065313339233, "learning_rate": 0.0007957357627082398, "loss": 0.3972, "step": 1000 }, { "epoch": 0.096237128284092, "grad_norm": 1.1843117475509644, "learning_rate": 0.0007935972284596139, "loss": 0.3736, "step": 1500 }, { "epoch": 0.128316171045456, "grad_norm": 1.1524124145507812, "learning_rate": 0.0007914629712794851, "loss": 0.3626, "step": 2000 }, { "epoch": 0.16039521380682, "grad_norm": 1.0530339479446411, "learning_rate": 0.0007893244370308591, "loss": 0.355, "step": 2500 }, { "epoch": 0.192474256568184, "grad_norm": 1.0044286251068115, "learning_rate": 0.0007871859027822331, "loss": 0.3527, "step": 3000 }, { "epoch": 0.2000449106598659, "eval_loss": 0.3370795249938965, "eval_runtime": 6.2984, "eval_samples_per_second": 79.386, "eval_steps_per_second": 10.003, "step": 3118 }, { "epoch": 0.224553299329548, "grad_norm": 1.4645978212356567, "learning_rate": 0.0007850473685336071, "loss": 0.3394, "step": 3500 }, { "epoch": 0.256632342090912, "grad_norm": 1.2185771465301514, "learning_rate": 0.0007829088342849811, "loss": 0.3398, "step": 4000 }, { "epoch": 0.288711384852276, "grad_norm": 1.3087233304977417, "learning_rate": 0.0007807703000363551, "loss": 0.3399, "step": 4500 }, { "epoch": 0.32079042761364, "grad_norm": 1.411337971687317, "learning_rate": 0.0007786360428562264, "loss": 0.3355, "step": 5000 }, { "epoch": 0.352869470375004, "grad_norm": 1.1822813749313354, "learning_rate": 0.0007764975086076004, "loss": 0.3329, "step": 5500 }, { "epoch": 0.384948513136368, "grad_norm": 0.9898410439491272, "learning_rate": 0.0007743632514274716, "loss": 0.3311, "step": 6000 }, { "epoch": 0.4000898213197318, "eval_loss": 0.31450170278549194, "eval_runtime": 9.8807, "eval_samples_per_second": 50.604, "eval_steps_per_second": 6.376, "step": 6236 }, { "epoch": 0.417027555897732, "grad_norm": 0.9827210307121277, "learning_rate": 0.0007722247171788456, "loss": 0.3251, "step": 6500 }, { "epoch": 0.449106598659096, "grad_norm": 1.2011586427688599, "learning_rate": 0.0007700861829302197, "loss": 0.3271, "step": 7000 }, { "epoch": 0.48118564142046, "grad_norm": 1.3871015310287476, "learning_rate": 0.0007679476486815936, "loss": 0.3259, "step": 7500 }, { "epoch": 0.513264684181824, "grad_norm": 1.1245856285095215, "learning_rate": 0.0007658091144329678, "loss": 0.3192, "step": 8000 }, { "epoch": 0.545343726943188, "grad_norm": 1.0883513689041138, "learning_rate": 0.0007636705801843417, "loss": 0.3209, "step": 8500 }, { "epoch": 0.577422769704552, "grad_norm": 1.0555607080459595, "learning_rate": 0.0007615320459357157, "loss": 0.3178, "step": 9000 }, { "epoch": 0.6001347319795978, "eval_loss": 0.3085324764251709, "eval_runtime": 7.9945, "eval_samples_per_second": 62.543, "eval_steps_per_second": 7.88, "step": 9354 }, { "epoch": 0.6095018124659161, "grad_norm": 1.0429250001907349, "learning_rate": 0.0007593935116870898, "loss": 0.3169, "step": 9500 }, { "epoch": 0.64158085522728, "grad_norm": 1.2145720720291138, "learning_rate": 0.0007572549774384637, "loss": 0.3152, "step": 10000 }, { "epoch": 0.673659897988644, "grad_norm": 1.3464765548706055, "learning_rate": 0.0007551207202583349, "loss": 0.3125, "step": 10500 }, { "epoch": 0.705738940750008, "grad_norm": 1.1744924783706665, "learning_rate": 0.000752982186009709, "loss": 0.3086, "step": 11000 }, { "epoch": 0.7378179835113721, "grad_norm": 1.234157919883728, "learning_rate": 0.000750843651761083, "loss": 0.3133, "step": 11500 }, { "epoch": 0.769897026272736, "grad_norm": 0.9248010516166687, "learning_rate": 0.0007487051175124569, "loss": 0.3122, "step": 12000 }, { "epoch": 0.8001796426394636, "eval_loss": 0.3022182583808899, "eval_runtime": 8.0646, "eval_samples_per_second": 61.999, "eval_steps_per_second": 7.812, "step": 12472 }, { "epoch": 0.8019760690341, "grad_norm": 0.862856388092041, "learning_rate": 0.000746566583263831, "loss": 0.308, "step": 12500 }, { "epoch": 0.834055111795464, "grad_norm": 1.0224480628967285, "learning_rate": 0.000744428049015205, "loss": 0.31, "step": 13000 }, { "epoch": 0.8661341545568281, "grad_norm": 1.008195161819458, "learning_rate": 0.0007422937918350763, "loss": 0.3079, "step": 13500 }, { "epoch": 0.898213197318192, "grad_norm": 1.29293954372406, "learning_rate": 0.0007401552575864503, "loss": 0.3047, "step": 14000 }, { "epoch": 0.930292240079556, "grad_norm": 0.9913870692253113, "learning_rate": 0.0007380167233378243, "loss": 0.3074, "step": 14500 }, { "epoch": 0.96237128284092, "grad_norm": 1.1802239418029785, "learning_rate": 0.0007358781890891983, "loss": 0.3055, "step": 15000 }, { "epoch": 0.9944503256022841, "grad_norm": 1.0421093702316284, "learning_rate": 0.0007337396548405722, "loss": 0.3106, "step": 15500 } ], "logging_steps": 500, "max_steps": 187044, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.573917452542935e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }