{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.979683972911964,
  "eval_steps": 50,
  "global_step": 330,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.045146726862302484,
      "grad_norm": 18.43708432360805,
      "learning_rate": 5e-07,
      "loss": 1.7371,
      "step": 5
    },
    {
      "epoch": 0.09029345372460497,
      "grad_norm": 12.96518208503692,
      "learning_rate": 1e-06,
      "loss": 1.6227,
      "step": 10
    },
    {
      "epoch": 0.13544018058690746,
      "grad_norm": 7.284644208168829,
      "learning_rate": 9.993977281025862e-07,
      "loss": 1.2966,
      "step": 15
    },
    {
      "epoch": 0.18058690744920994,
      "grad_norm": 4.047553649149097,
      "learning_rate": 9.975923633360984e-07,
      "loss": 1.1552,
      "step": 20
    },
    {
      "epoch": 0.22573363431151242,
      "grad_norm": 3.7692359566921234,
      "learning_rate": 9.945882549823904e-07,
      "loss": 1.0619,
      "step": 25
    },
    {
      "epoch": 0.2708803611738149,
      "grad_norm": 3.6288204791918646,
      "learning_rate": 9.90392640201615e-07,
      "loss": 1.0028,
      "step": 30
    },
    {
      "epoch": 0.3160270880361174,
      "grad_norm": 3.475318929765548,
      "learning_rate": 9.85015626597272e-07,
      "loss": 1.0079,
      "step": 35
    },
    {
      "epoch": 0.3611738148984199,
      "grad_norm": 3.5252734455616874,
      "learning_rate": 9.784701678661044e-07,
      "loss": 0.971,
      "step": 40
    },
    {
      "epoch": 0.40632054176072235,
      "grad_norm": 3.515836010245366,
      "learning_rate": 9.707720325915103e-07,
      "loss": 0.9516,
      "step": 45
    },
    {
      "epoch": 0.45146726862302483,
      "grad_norm": 3.6385086224106713,
      "learning_rate": 9.619397662556433e-07,
      "loss": 0.9534,
      "step": 50
    },
    {
      "epoch": 0.45146726862302483,
      "eval_loss": 0.9350618124008179,
      "eval_runtime": 55.811,
      "eval_samples_per_second": 56.441,
      "eval_steps_per_second": 0.896,
      "step": 50
    },
    {
      "epoch": 0.4966139954853273,
      "grad_norm": 3.433110767032438,
      "learning_rate": 9.519946465617217e-07,
      "loss": 0.9279,
      "step": 55
    },
    {
      "epoch": 0.5417607223476298,
      "grad_norm": 3.370215115653193,
      "learning_rate": 9.409606321741774e-07,
      "loss": 0.9373,
      "step": 60
    },
    {
      "epoch": 0.5869074492099323,
      "grad_norm": 3.4752063755478906,
      "learning_rate": 9.28864305000136e-07,
      "loss": 0.9134,
      "step": 65
    },
    {
      "epoch": 0.6320541760722348,
      "grad_norm": 3.444843183525781,
      "learning_rate": 9.157348061512726e-07,
      "loss": 0.8902,
      "step": 70
    },
    {
      "epoch": 0.6772009029345373,
      "grad_norm": 3.3753005171135877,
      "learning_rate": 9.016037657403223e-07,
      "loss": 0.8907,
      "step": 75
    },
    {
      "epoch": 0.7223476297968398,
      "grad_norm": 3.3797166413654636,
      "learning_rate": 8.865052266813685e-07,
      "loss": 0.8798,
      "step": 80
    },
    {
      "epoch": 0.7674943566591422,
      "grad_norm": 3.399852464899469,
      "learning_rate": 8.704755626774795e-07,
      "loss": 0.8873,
      "step": 85
    },
    {
      "epoch": 0.8126410835214447,
      "grad_norm": 3.3736345000901644,
      "learning_rate": 8.535533905932737e-07,
      "loss": 0.8951,
      "step": 90
    },
    {
      "epoch": 0.8577878103837472,
      "grad_norm": 3.3317984949333104,
      "learning_rate": 8.357794774235092e-07,
      "loss": 0.8813,
      "step": 95
    },
    {
      "epoch": 0.9029345372460497,
      "grad_norm": 3.7790547931864826,
      "learning_rate": 8.171966420818227e-07,
      "loss": 0.8729,
      "step": 100
    },
    {
      "epoch": 0.9029345372460497,
      "eval_loss": 0.8780717253684998,
      "eval_runtime": 55.3629,
      "eval_samples_per_second": 56.897,
      "eval_steps_per_second": 0.903,
      "step": 100
    },
    {
      "epoch": 0.9480812641083521,
      "grad_norm": 3.843510457952551,
      "learning_rate": 7.978496522462167e-07,
      "loss": 0.8738,
      "step": 105
    },
    {
      "epoch": 0.9932279909706546,
      "grad_norm": 3.8556187579575734,
      "learning_rate": 7.777851165098011e-07,
      "loss": 0.8693,
      "step": 110
    },
    {
      "epoch": 1.0383747178329572,
      "grad_norm": 3.5905132811137124,
      "learning_rate": 7.570513720966107e-07,
      "loss": 0.8148,
      "step": 115
    },
    {
      "epoch": 1.0835214446952597,
      "grad_norm": 3.621093760889929,
      "learning_rate": 7.356983684129989e-07,
      "loss": 0.8324,
      "step": 120
    },
    {
      "epoch": 1.1286681715575622,
      "grad_norm": 3.483178959852461,
      "learning_rate": 7.13777546715141e-07,
      "loss": 0.7914,
      "step": 125
    },
    {
      "epoch": 1.1738148984198646,
      "grad_norm": 3.3812503372602465,
      "learning_rate": 6.913417161825449e-07,
      "loss": 0.7891,
      "step": 130
    },
    {
      "epoch": 1.2189616252821671,
      "grad_norm": 3.42376548092314,
      "learning_rate": 6.684449266961099e-07,
      "loss": 0.7905,
      "step": 135
    },
    {
      "epoch": 1.2641083521444696,
      "grad_norm": 3.670874737486496,
      "learning_rate": 6.451423386272311e-07,
      "loss": 0.7919,
      "step": 140
    },
    {
      "epoch": 1.309255079006772,
      "grad_norm": 3.466865973590349,
      "learning_rate": 6.21490089951632e-07,
      "loss": 0.8129,
      "step": 145
    },
    {
      "epoch": 1.3544018058690745,
      "grad_norm": 3.8317546120945494,
      "learning_rate": 5.975451610080642e-07,
      "loss": 0.8045,
      "step": 150
    },
    {
      "epoch": 1.3544018058690745,
      "eval_loss": 0.859570324420929,
      "eval_runtime": 55.3516,
      "eval_samples_per_second": 56.909,
      "eval_steps_per_second": 0.903,
      "step": 150
    },
    {
      "epoch": 1.399548532731377,
      "grad_norm": 3.6970944236498684,
      "learning_rate": 5.733652372276809e-07,
      "loss": 0.8012,
      "step": 155
    },
    {
      "epoch": 1.4446952595936795,
      "grad_norm": 3.320805141993761,
      "learning_rate": 5.490085701647804e-07,
      "loss": 0.7977,
      "step": 160
    },
    {
      "epoch": 1.489841986455982,
      "grad_norm": 3.4859328024402685,
      "learning_rate": 5.245338371637091e-07,
      "loss": 0.7953,
      "step": 165
    },
    {
      "epoch": 1.5349887133182845,
      "grad_norm": 3.64297775104021,
      "learning_rate": 5e-07,
      "loss": 0.7917,
      "step": 170
    },
    {
      "epoch": 1.580135440180587,
      "grad_norm": 3.470131936511277,
      "learning_rate": 4.75466162836291e-07,
      "loss": 0.7905,
      "step": 175
    },
    {
      "epoch": 1.6252821670428894,
      "grad_norm": 3.559410134845727,
      "learning_rate": 4.5099142983521963e-07,
      "loss": 0.7948,
      "step": 180
    },
    {
      "epoch": 1.670428893905192,
      "grad_norm": 3.4417325502766594,
      "learning_rate": 4.2663476277231915e-07,
      "loss": 0.7983,
      "step": 185
    },
    {
      "epoch": 1.7155756207674944,
      "grad_norm": 3.669007752905804,
      "learning_rate": 4.0245483899193586e-07,
      "loss": 0.8006,
      "step": 190
    },
    {
      "epoch": 1.7607223476297968,
      "grad_norm": 3.4191546038129483,
      "learning_rate": 3.785099100483681e-07,
      "loss": 0.7879,
      "step": 195
    },
    {
      "epoch": 1.8058690744920993,
      "grad_norm": 3.697454786550471,
      "learning_rate": 3.548576613727689e-07,
      "loss": 0.7875,
      "step": 200
    },
    {
      "epoch": 1.8058690744920993,
      "eval_loss": 0.847703754901886,
      "eval_runtime": 55.4234,
      "eval_samples_per_second": 56.835,
      "eval_steps_per_second": 0.902,
      "step": 200
    },
    {
      "epoch": 1.8510158013544018,
      "grad_norm": 3.691082664320291,
      "learning_rate": 3.3155507330388996e-07,
      "loss": 0.7951,
      "step": 205
    },
    {
      "epoch": 1.8961625282167043,
      "grad_norm": 3.5717461634355288,
      "learning_rate": 3.086582838174551e-07,
      "loss": 0.7895,
      "step": 210
    },
    {
      "epoch": 1.9413092550790068,
      "grad_norm": 3.579734970143079,
      "learning_rate": 2.8622245328485907e-07,
      "loss": 0.7825,
      "step": 215
    },
    {
      "epoch": 1.9864559819413092,
      "grad_norm": 3.444703074876304,
      "learning_rate": 2.6430163158700113e-07,
      "loss": 0.7836,
      "step": 220
    },
    {
      "epoch": 2.0316027088036117,
      "grad_norm": 3.6858962070265617,
      "learning_rate": 2.4294862790338916e-07,
      "loss": 0.7546,
      "step": 225
    },
    {
      "epoch": 2.0767494356659144,
      "grad_norm": 3.5888237456746976,
      "learning_rate": 2.2221488349019902e-07,
      "loss": 0.7482,
      "step": 230
    },
    {
      "epoch": 2.1218961625282167,
      "grad_norm": 3.681045633726427,
      "learning_rate": 2.021503477537833e-07,
      "loss": 0.7402,
      "step": 235
    },
    {
      "epoch": 2.1670428893905194,
      "grad_norm": 3.5167220269101747,
      "learning_rate": 1.828033579181773e-07,
      "loss": 0.7451,
      "step": 240
    },
    {
      "epoch": 2.2121896162528216,
      "grad_norm": 3.883082217563066,
      "learning_rate": 1.6422052257649077e-07,
      "loss": 0.7465,
      "step": 245
    },
    {
      "epoch": 2.2573363431151243,
      "grad_norm": 3.923342604410762,
      "learning_rate": 1.4644660940672627e-07,
      "loss": 0.7391,
      "step": 250
    },
    {
      "epoch": 2.2573363431151243,
      "eval_loss": 0.8480744957923889,
      "eval_runtime": 55.3896,
      "eval_samples_per_second": 56.87,
      "eval_steps_per_second": 0.903,
      "step": 250
    },
    {
      "epoch": 2.3024830699774266,
      "grad_norm": 3.4937657027708307,
      "learning_rate": 1.2952443732252054e-07,
      "loss": 0.7314,
      "step": 255
    },
    {
      "epoch": 2.3476297968397293,
      "grad_norm": 3.7212940895257525,
      "learning_rate": 1.134947733186315e-07,
      "loss": 0.7484,
      "step": 260
    },
    {
      "epoch": 2.3927765237020315,
      "grad_norm": 3.4800342638669424,
      "learning_rate": 9.839623425967758e-08,
      "loss": 0.7429,
      "step": 265
    },
    {
      "epoch": 2.4379232505643342,
      "grad_norm": 3.6208493287123598,
      "learning_rate": 8.426519384872732e-08,
      "loss": 0.7335,
      "step": 270
    },
    {
      "epoch": 2.4830699774266365,
      "grad_norm": 3.6587749488735075,
      "learning_rate": 7.1135694999864e-08,
      "loss": 0.757,
      "step": 275
    },
    {
      "epoch": 2.528216704288939,
      "grad_norm": 3.651673581185134,
      "learning_rate": 5.9039367825822526e-08,
      "loss": 0.7237,
      "step": 280
    },
    {
      "epoch": 2.5733634311512414,
      "grad_norm": 3.6732031228054653,
      "learning_rate": 4.800535343827833e-08,
      "loss": 0.7379,
      "step": 285
    },
    {
      "epoch": 2.618510158013544,
      "grad_norm": 3.7552824088763455,
      "learning_rate": 3.806023374435663e-08,
      "loss": 0.7406,
      "step": 290
    },
    {
      "epoch": 2.6636568848758464,
      "grad_norm": 3.9856453979787916,
      "learning_rate": 2.922796740848965e-08,
      "loss": 0.7231,
      "step": 295
    },
    {
      "epoch": 2.708803611738149,
      "grad_norm": 3.61930072668026,
      "learning_rate": 2.1529832133895588e-08,
      "loss": 0.7339,
      "step": 300
    },
    {
      "epoch": 2.708803611738149,
      "eval_loss": 0.8461548089981079,
      "eval_runtime": 55.4056,
      "eval_samples_per_second": 56.853,
      "eval_steps_per_second": 0.902,
      "step": 300
    },
    {
      "epoch": 2.7539503386004514,
      "grad_norm": 3.739949022395361,
      "learning_rate": 1.4984373402728012e-08,
      "loss": 0.7323,
      "step": 305
    },
    {
      "epoch": 2.799097065462754,
      "grad_norm": 3.5216583130079537,
      "learning_rate": 9.607359798384784e-09,
      "loss": 0.7267,
      "step": 310
    },
    {
      "epoch": 2.8442437923250563,
      "grad_norm": 3.78870498477702,
      "learning_rate": 5.411745017609493e-09,
      "loss": 0.7242,
      "step": 315
    },
    {
      "epoch": 2.889390519187359,
      "grad_norm": 3.7009903709257244,
      "learning_rate": 2.407636663901591e-09,
      "loss": 0.7201,
      "step": 320
    },
    {
      "epoch": 2.9345372460496613,
      "grad_norm": 3.677700055727154,
      "learning_rate": 6.022718974137975e-10,
      "loss": 0.7308,
      "step": 325
    },
    {
      "epoch": 2.979683972911964,
      "grad_norm": 3.5525511423180545,
      "learning_rate": 0.0,
      "loss": 0.7362,
      "step": 330
    },
    {
      "epoch": 2.979683972911964,
      "step": 330,
      "total_flos": 1945595623243776.0,
      "train_loss": 0.8518027869137851,
      "train_runtime": 4686.6831,
      "train_samples_per_second": 18.147,
      "train_steps_per_second": 0.07
    }
  ],
  "logging_steps": 5,
  "max_steps": 330,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 10000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1945595623243776.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}