zgrgr's picture
Upload model files with Nebius access
c0fa390 verified
{
"best_metric": 1.077925443649292,
"best_model_checkpoint": "./outputs/instruct-lora-8b-aplly_chat_template-capital/checkpoint-560",
"epoch": 1.0169348010160881,
"eval_steps": 20,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001693480101608806,
"eval_loss": 1.4109762907028198,
"eval_runtime": 44.8838,
"eval_samples_per_second": 23.394,
"eval_steps_per_second": 5.86,
"step": 1
},
{
"epoch": 0.03386960203217612,
"grad_norm": 0.7916573882102966,
"learning_rate": 3.3898305084745763e-06,
"loss": 1.4877,
"step": 20
},
{
"epoch": 0.03386960203217612,
"eval_loss": 1.4077372550964355,
"eval_runtime": 46.2342,
"eval_samples_per_second": 22.71,
"eval_steps_per_second": 5.688,
"step": 20
},
{
"epoch": 0.06773920406435224,
"grad_norm": 0.7490960359573364,
"learning_rate": 6.779661016949153e-06,
"loss": 1.4093,
"step": 40
},
{
"epoch": 0.06773920406435224,
"eval_loss": 1.3683991432189941,
"eval_runtime": 46.9004,
"eval_samples_per_second": 22.388,
"eval_steps_per_second": 5.608,
"step": 40
},
{
"epoch": 0.10160880609652836,
"grad_norm": 0.7929967641830444,
"learning_rate": 1.016949152542373e-05,
"loss": 1.3575,
"step": 60
},
{
"epoch": 0.10160880609652836,
"eval_loss": 1.2657679319381714,
"eval_runtime": 48.1658,
"eval_samples_per_second": 21.8,
"eval_steps_per_second": 5.46,
"step": 60
},
{
"epoch": 0.1354784081287045,
"grad_norm": 0.8403828740119934,
"learning_rate": 1.3559322033898305e-05,
"loss": 1.2616,
"step": 80
},
{
"epoch": 0.1354784081287045,
"eval_loss": 1.2246084213256836,
"eval_runtime": 44.8525,
"eval_samples_per_second": 23.41,
"eval_steps_per_second": 5.864,
"step": 80
},
{
"epoch": 0.1693480101608806,
"grad_norm": 1.071118950843811,
"learning_rate": 1.694915254237288e-05,
"loss": 1.2205,
"step": 100
},
{
"epoch": 0.1693480101608806,
"eval_loss": 1.2001967430114746,
"eval_runtime": 44.851,
"eval_samples_per_second": 23.411,
"eval_steps_per_second": 5.864,
"step": 100
},
{
"epoch": 0.20321761219305673,
"grad_norm": 1.0767000913619995,
"learning_rate": 2.033898305084746e-05,
"loss": 1.2187,
"step": 120
},
{
"epoch": 0.20321761219305673,
"eval_loss": 1.183129906654358,
"eval_runtime": 45.0629,
"eval_samples_per_second": 23.301,
"eval_steps_per_second": 5.836,
"step": 120
},
{
"epoch": 0.23708721422523285,
"grad_norm": 1.061275839805603,
"learning_rate": 2.3728813559322036e-05,
"loss": 1.1836,
"step": 140
},
{
"epoch": 0.23708721422523285,
"eval_loss": 1.1728578805923462,
"eval_runtime": 44.9354,
"eval_samples_per_second": 23.367,
"eval_steps_per_second": 5.853,
"step": 140
},
{
"epoch": 0.270956816257409,
"grad_norm": 1.1500962972640991,
"learning_rate": 2.711864406779661e-05,
"loss": 1.1149,
"step": 160
},
{
"epoch": 0.270956816257409,
"eval_loss": 1.1615699529647827,
"eval_runtime": 44.8412,
"eval_samples_per_second": 23.416,
"eval_steps_per_second": 5.865,
"step": 160
},
{
"epoch": 0.3048264182895851,
"grad_norm": 1.274034023284912,
"learning_rate": 2.9999737474980266e-05,
"loss": 1.1049,
"step": 180
},
{
"epoch": 0.3048264182895851,
"eval_loss": 1.151460886001587,
"eval_runtime": 44.8418,
"eval_samples_per_second": 23.416,
"eval_steps_per_second": 5.865,
"step": 180
},
{
"epoch": 0.3386960203217612,
"grad_norm": 1.2562557458877563,
"learning_rate": 2.998457196315866e-05,
"loss": 1.1254,
"step": 200
},
{
"epoch": 0.3386960203217612,
"eval_loss": 1.1418647766113281,
"eval_runtime": 44.8753,
"eval_samples_per_second": 23.398,
"eval_steps_per_second": 5.861,
"step": 200
},
{
"epoch": 0.37256562235393736,
"grad_norm": 1.3587268590927124,
"learning_rate": 2.9946097849501546e-05,
"loss": 1.1043,
"step": 220
},
{
"epoch": 0.37256562235393736,
"eval_loss": 1.1385948657989502,
"eval_runtime": 49.1418,
"eval_samples_per_second": 21.367,
"eval_steps_per_second": 5.352,
"step": 220
},
{
"epoch": 0.40643522438611346,
"grad_norm": 1.3932058811187744,
"learning_rate": 2.988437498074987e-05,
"loss": 1.1076,
"step": 240
},
{
"epoch": 0.40643522438611346,
"eval_loss": 1.1299232244491577,
"eval_runtime": 47.3037,
"eval_samples_per_second": 22.197,
"eval_steps_per_second": 5.56,
"step": 240
},
{
"epoch": 0.4403048264182896,
"grad_norm": 1.4880738258361816,
"learning_rate": 2.9799499367238472e-05,
"loss": 1.1307,
"step": 260
},
{
"epoch": 0.4403048264182896,
"eval_loss": 1.1247954368591309,
"eval_runtime": 44.8376,
"eval_samples_per_second": 23.418,
"eval_steps_per_second": 5.866,
"step": 260
},
{
"epoch": 0.4741744284504657,
"grad_norm": 1.3387629985809326,
"learning_rate": 2.969160303355143e-05,
"loss": 1.0895,
"step": 280
},
{
"epoch": 0.4741744284504657,
"eval_loss": 1.1187076568603516,
"eval_runtime": 44.9374,
"eval_samples_per_second": 23.366,
"eval_steps_per_second": 5.853,
"step": 280
},
{
"epoch": 0.5080440304826418,
"grad_norm": 1.467492699623108,
"learning_rate": 2.95608538131569e-05,
"loss": 1.0912,
"step": 300
},
{
"epoch": 0.5080440304826418,
"eval_loss": 1.1132631301879883,
"eval_runtime": 47.2897,
"eval_samples_per_second": 22.204,
"eval_steps_per_second": 5.561,
"step": 300
},
{
"epoch": 0.541913632514818,
"grad_norm": 1.5851348638534546,
"learning_rate": 2.940745508734104e-05,
"loss": 1.0932,
"step": 320
},
{
"epoch": 0.541913632514818,
"eval_loss": 1.1109942197799683,
"eval_runtime": 45.5061,
"eval_samples_per_second": 23.074,
"eval_steps_per_second": 5.779,
"step": 320
},
{
"epoch": 0.5757832345469941,
"grad_norm": 1.569037914276123,
"learning_rate": 2.9231645468847078e-05,
"loss": 1.113,
"step": 340
},
{
"epoch": 0.5757832345469941,
"eval_loss": 1.1062214374542236,
"eval_runtime": 44.8719,
"eval_samples_per_second": 23.4,
"eval_steps_per_second": 5.861,
"step": 340
},
{
"epoch": 0.6096528365791702,
"grad_norm": 1.525785207748413,
"learning_rate": 2.903369843071157e-05,
"loss": 1.0623,
"step": 360
},
{
"epoch": 0.6096528365791702,
"eval_loss": 1.1012595891952515,
"eval_runtime": 48.3892,
"eval_samples_per_second": 21.699,
"eval_steps_per_second": 5.435,
"step": 360
},
{
"epoch": 0.6435224386113463,
"grad_norm": 1.4866666793823242,
"learning_rate": 2.881392188087528e-05,
"loss": 1.0934,
"step": 380
},
{
"epoch": 0.6435224386113463,
"eval_loss": 1.100132942199707,
"eval_runtime": 44.8502,
"eval_samples_per_second": 23.411,
"eval_steps_per_second": 5.864,
"step": 380
},
{
"epoch": 0.6773920406435224,
"grad_norm": 1.5744224786758423,
"learning_rate": 2.8572657683230322e-05,
"loss": 1.0663,
"step": 400
},
{
"epoch": 0.6773920406435224,
"eval_loss": 1.0957869291305542,
"eval_runtime": 44.8474,
"eval_samples_per_second": 23.413,
"eval_steps_per_second": 5.864,
"step": 400
},
{
"epoch": 0.7112616426756986,
"grad_norm": 1.4972140789031982,
"learning_rate": 2.8310281125848574e-05,
"loss": 1.081,
"step": 420
},
{
"epoch": 0.7112616426756986,
"eval_loss": 1.0911184549331665,
"eval_runtime": 44.9252,
"eval_samples_per_second": 23.372,
"eval_steps_per_second": 5.854,
"step": 420
},
{
"epoch": 0.7451312447078747,
"grad_norm": 1.7231398820877075,
"learning_rate": 2.80272003372186e-05,
"loss": 1.0572,
"step": 440
},
{
"epoch": 0.7451312447078747,
"eval_loss": 1.089966058731079,
"eval_runtime": 44.8357,
"eval_samples_per_second": 23.419,
"eval_steps_per_second": 5.866,
"step": 440
},
{
"epoch": 0.7790008467400508,
"grad_norm": 1.5352925062179565,
"learning_rate": 2.7723855651399027e-05,
"loss": 1.0459,
"step": 460
},
{
"epoch": 0.7790008467400508,
"eval_loss": 1.084614872932434,
"eval_runtime": 44.8258,
"eval_samples_per_second": 23.424,
"eval_steps_per_second": 5.867,
"step": 460
},
{
"epoch": 0.8128704487722269,
"grad_norm": 1.5007786750793457,
"learning_rate": 2.7400718923076004e-05,
"loss": 1.0621,
"step": 480
},
{
"epoch": 0.8128704487722269,
"eval_loss": 1.0817887783050537,
"eval_runtime": 44.8244,
"eval_samples_per_second": 23.425,
"eval_steps_per_second": 5.867,
"step": 480
},
{
"epoch": 0.8467400508044031,
"grad_norm": 1.6158475875854492,
"learning_rate": 2.7058292793590064e-05,
"loss": 1.0584,
"step": 500
},
{
"epoch": 0.8467400508044031,
"eval_loss": 1.0819875001907349,
"eval_runtime": 47.2009,
"eval_samples_per_second": 22.245,
"eval_steps_per_second": 5.572,
"step": 500
},
{
"epoch": 0.8806096528365792,
"grad_norm": 1.740872859954834,
"learning_rate": 2.6697109909074174e-05,
"loss": 1.0393,
"step": 520
},
{
"epoch": 0.8806096528365792,
"eval_loss": 1.0836067199707031,
"eval_runtime": 44.9083,
"eval_samples_per_second": 23.381,
"eval_steps_per_second": 5.856,
"step": 520
},
{
"epoch": 0.9144792548687553,
"grad_norm": 1.5816829204559326,
"learning_rate": 2.6317732091919095e-05,
"loss": 1.0342,
"step": 540
},
{
"epoch": 0.9144792548687553,
"eval_loss": 1.0809314250946045,
"eval_runtime": 45.5743,
"eval_samples_per_second": 23.039,
"eval_steps_per_second": 5.771,
"step": 540
},
{
"epoch": 0.9483488569009314,
"grad_norm": 1.5456470251083374,
"learning_rate": 2.5920749466854923e-05,
"loss": 1.0132,
"step": 560
},
{
"epoch": 0.9483488569009314,
"eval_loss": 1.077925443649292,
"eval_runtime": 47.3001,
"eval_samples_per_second": 22.199,
"eval_steps_per_second": 5.56,
"step": 560
},
{
"epoch": 0.9822184589331076,
"grad_norm": 1.646410584449768,
"learning_rate": 2.550677954300811e-05,
"loss": 1.0256,
"step": 580
},
{
"epoch": 0.9822184589331076,
"eval_loss": 1.077932596206665,
"eval_runtime": 48.3066,
"eval_samples_per_second": 21.736,
"eval_steps_per_second": 5.444,
"step": 580
},
{
"epoch": 1.0169348010160881,
"grad_norm": 1.9027409553527832,
"learning_rate": 2.5076466253361893e-05,
"loss": 1.0651,
"step": 600
},
{
"epoch": 1.0169348010160881,
"eval_loss": 1.0789446830749512,
"eval_runtime": 44.8937,
"eval_samples_per_second": 23.389,
"eval_steps_per_second": 5.858,
"step": 600
}
],
"logging_steps": 20,
"max_steps": 1770,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 2
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.462097548054364e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}