Training in progress, step 95000, checkpoint
- last-checkpoint/optimizer.pt +1 -1
- last-checkpoint/pytorch_model.bin +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +703 -3
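The checkpoint binaries are stored with Git LFS, so each diff below touches only a three-line pointer file (spec version, sha256 oid, byte size) rather than the binary itself; between checkpoints only the oid line changes. A minimal sketch of reading such a pointer, assuming just the three-line "key value" format visible in these diffs (parse_lfs_pointer is an illustrative helper, not a library call):

# Illustrative helper: parse a Git LFS pointer file of the form shown
# in the diffs below (version / oid / size, one "key value" per line).
from pathlib import Path

def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

# e.g. parse_lfs_pointer("last-checkpoint/optimizer.pt") would return
# {"version": "https://git-lfs.github.com/spec/v1",
#  "oid": "sha256:248b2882...", "size": "715030586"}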
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:248b28822a3145fc2f6193ec7ba37b31470701e449bd9c308191df07f57c85c3
 size 715030586

last-checkpoint/pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:1414cb3a52afd00748c40a10edd86f3990dcbdfb6536ec482813a861d6bd393d
 size 1032262338

last-checkpoint/rng_state_0.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:83461f3cb631035bb80570fe8ff5f003da6889c3b4c5b07ae3097b40b998cc74
 size 14960

last-checkpoint/rng_state_1.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:9b1a3a3a92a32908a931fd058a89bd5a31451a6c67bff5977120ddb2fbd625f2
 size 14960

last-checkpoint/rng_state_2.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:117ca0bb908b4df12aafc342e2a9002967a120b9e6b2bc1d22f28fe482f613e7
 size 14960

last-checkpoint/rng_state_3.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:8ac12b6daed9ec4e4bf445463e78941442596bd18188c92ccb0b9804c0d0a5af
 size 14960

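The four rng_state_*.pth files (one per rank) suggest a four-process run, e.g. four GPUs; saving each process's RNG snapshot lets a resumed run reproduce the same shuffling and dropout. A hedged sketch for inspecting one snapshot, assuming only that it is a torch-serialized dict (the exact keys vary by transformers version, so none are assumed here):

# Sketch: inspect one per-rank RNG snapshot from this checkpoint.
# weights_only=False is needed because the snapshot holds arbitrary
# Python/NumPy state objects, not just tensors.
import torch

rng_state = torch.load("last-checkpoint/rng_state_0.pth", weights_only=False)
for name, value in rng_state.items():
    print(f"{name}: {type(value).__name__}")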
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:da8924fd9ebf1bfc34c9d222c0eeb1de5a903b56bd5f2b099e5c970eea697fbe
 size 1064

last-checkpoint/trainer_state.json
CHANGED
@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.
+  "epoch": 0.14072489615983977,
   "eval_steps": 500,
-  "global_step":
+  "global_step": 95000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -65808,6 +65808,706 @@
       "learning_rate": 0.0004769107257863772,
       "loss": 13.9491,
       "step": 94000
+    },
+    {
+      "epoch": 0.1392583946103846,
+      "grad_norm": 5.90625,
+      "learning_rate": 0.00047690825631887347,
+      "loss": 13.9904,
+      "step": 94010
+    },
+    {
+      "epoch": 0.1392732077573488,
+      "grad_norm": 6.4375,
+      "learning_rate": 0.00047690578685136967,
+      "loss": 13.8586,
+      "step": 94020
+    },
+    {
+      "epoch": 0.139288020904313,
+      "grad_norm": 7.09375,
+      "learning_rate": 0.00047690331738386586,
+      "loss": 13.8061,
+      "step": 94030
+    },
+    {
+      "epoch": 0.1393028340512772,
+      "grad_norm": 5.90625,
+      "learning_rate": 0.0004769008479163621,
+      "loss": 13.9197,
+      "step": 94040
+    },
+    {
+      "epoch": 0.13931764719824138,
+      "grad_norm": 7.78125,
+      "learning_rate": 0.0004768983784488583,
+      "loss": 13.8176,
+      "step": 94050
+    },
+    {
+      "epoch": 0.13933246034520558,
+      "grad_norm": 32.25,
+      "learning_rate": 0.00047689590898135456,
+      "loss": 13.8751,
+      "step": 94060
+    },
+    {
+      "epoch": 0.13934727349216977,
+      "grad_norm": 6.4375,
+      "learning_rate": 0.00047689343951385076,
+      "loss": 13.8901,
+      "step": 94070
+    },
+    {
+      "epoch": 0.13936208663913396,
+      "grad_norm": 5.90625,
+      "learning_rate": 0.00047689097004634696,
+      "loss": 13.914,
+      "step": 94080
+    },
+    {
+      "epoch": 0.13937689978609816,
+      "grad_norm": 5.875,
+      "learning_rate": 0.0004768885005788432,
+      "loss": 13.7749,
+      "step": 94090
+    },
+    {
+      "epoch": 0.13939171293306235,
+      "grad_norm": 7.875,
+      "learning_rate": 0.00047688603111133946,
+      "loss": 13.8745,
+      "step": 94100
+    },
+    {
+      "epoch": 0.13940652608002654,
+      "grad_norm": 5.84375,
+      "learning_rate": 0.0004768835616438356,
+      "loss": 13.8541,
+      "step": 94110
+    },
+    {
+      "epoch": 0.13942133922699074,
+      "grad_norm": 6.84375,
+      "learning_rate": 0.00047688109217633185,
+      "loss": 13.9397,
+      "step": 94120
+    },
+    {
+      "epoch": 0.13943615237395493,
+      "grad_norm": 6.6875,
+      "learning_rate": 0.00047687862270882805,
+      "loss": 13.9031,
+      "step": 94130
+    },
+    {
+      "epoch": 0.13945096552091912,
+      "grad_norm": 7.0625,
+      "learning_rate": 0.0004768761532413243,
+      "loss": 13.9231,
+      "step": 94140
+    },
+    {
+      "epoch": 0.13946577866788332,
+      "grad_norm": 6.75,
+      "learning_rate": 0.0004768736837738205,
+      "loss": 13.9564,
+      "step": 94150
+    },
+    {
+      "epoch": 0.1394805918148475,
+      "grad_norm": 5.5625,
+      "learning_rate": 0.0004768712143063167,
+      "loss": 13.7778,
+      "step": 94160
+    },
+    {
+      "epoch": 0.1394954049618117,
+      "grad_norm": 6.65625,
+      "learning_rate": 0.00047686874483881294,
+      "loss": 13.8651,
+      "step": 94170
+    },
+    {
+      "epoch": 0.1395102181087759,
+      "grad_norm": 8.375,
+      "learning_rate": 0.00047686627537130914,
+      "loss": 13.8363,
+      "step": 94180
+    },
+    {
+      "epoch": 0.1395250312557401,
+      "grad_norm": 6.8125,
+      "learning_rate": 0.00047686380590380534,
+      "loss": 13.8701,
+      "step": 94190
+    },
+    {
+      "epoch": 0.13953984440270428,
+      "grad_norm": 5.59375,
+      "learning_rate": 0.0004768613364363016,
+      "loss": 13.8929,
+      "step": 94200
+    },
+    {
+      "epoch": 0.13955465754966848,
+      "grad_norm": 5.75,
+      "learning_rate": 0.0004768588669687978,
+      "loss": 13.8003,
+      "step": 94210
+    },
+    {
+      "epoch": 0.13956947069663267,
+      "grad_norm": 6.65625,
+      "learning_rate": 0.000476856397501294,
+      "loss": 13.8948,
+      "step": 94220
+    },
+    {
+      "epoch": 0.13958428384359686,
+      "grad_norm": 6.8125,
+      "learning_rate": 0.00047685392803379023,
+      "loss": 13.9076,
+      "step": 94230
+    },
+    {
+      "epoch": 0.13959909699056106,
+      "grad_norm": 6.21875,
+      "learning_rate": 0.00047685145856628643,
+      "loss": 13.9548,
+      "step": 94240
+    },
+    {
+      "epoch": 0.13961391013752525,
+      "grad_norm": 7.125,
+      "learning_rate": 0.0004768489890987827,
+      "loss": 13.9511,
+      "step": 94250
+    },
+    {
+      "epoch": 0.13962872328448944,
+      "grad_norm": 5.78125,
+      "learning_rate": 0.0004768465196312789,
+      "loss": 13.8748,
+      "step": 94260
+    },
+    {
+      "epoch": 0.13964353643145364,
+      "grad_norm": 7.28125,
+      "learning_rate": 0.0004768440501637751,
+      "loss": 13.9524,
+      "step": 94270
+    },
+    {
+      "epoch": 0.13965834957841783,
+      "grad_norm": 6.78125,
+      "learning_rate": 0.0004768415806962713,
+      "loss": 13.8717,
+      "step": 94280
+    },
+    {
+      "epoch": 0.13967316272538202,
+      "grad_norm": 6.15625,
+      "learning_rate": 0.0004768391112287676,
+      "loss": 13.9239,
+      "step": 94290
+    },
+    {
+      "epoch": 0.13968797587234622,
+      "grad_norm": 6.71875,
+      "learning_rate": 0.0004768366417612637,
+      "loss": 13.9367,
+      "step": 94300
+    },
+    {
+      "epoch": 0.1397027890193104,
+      "grad_norm": 7.28125,
+      "learning_rate": 0.00047683417229375997,
+      "loss": 13.896,
+      "step": 94310
+    },
+    {
+      "epoch": 0.1397176021662746,
+      "grad_norm": 6.28125,
+      "learning_rate": 0.00047683170282625617,
+      "loss": 13.9525,
+      "step": 94320
+    },
+    {
+      "epoch": 0.1397324153132388,
+      "grad_norm": 8.25,
+      "learning_rate": 0.00047682923335875236,
+      "loss": 13.8564,
+      "step": 94330
+    },
+    {
+      "epoch": 0.139747228460203,
+      "grad_norm": 7.53125,
+      "learning_rate": 0.0004768267638912486,
+      "loss": 13.7938,
+      "step": 94340
+    },
+    {
+      "epoch": 0.13976204160716718,
+      "grad_norm": 42.75,
+      "learning_rate": 0.0004768242944237448,
+      "loss": 13.941,
+      "step": 94350
+    },
+    {
+      "epoch": 0.13977685475413137,
+      "grad_norm": 5.625,
+      "learning_rate": 0.00047682182495624106,
+      "loss": 13.8934,
+      "step": 94360
+    },
+    {
+      "epoch": 0.13979166790109557,
+      "grad_norm": 5.8125,
+      "learning_rate": 0.00047681935548873726,
+      "loss": 13.8162,
+      "step": 94370
+    },
+    {
+      "epoch": 0.13980648104805976,
+      "grad_norm": 5.65625,
+      "learning_rate": 0.00047681688602123346,
+      "loss": 13.769,
+      "step": 94380
+    },
+    {
+      "epoch": 0.13982129419502395,
+      "grad_norm": 6.03125,
+      "learning_rate": 0.0004768144165537297,
+      "loss": 13.8411,
+      "step": 94390
+    },
+    {
+      "epoch": 0.13983610734198818,
+      "grad_norm": 6.71875,
+      "learning_rate": 0.00047681194708622596,
+      "loss": 13.7353,
+      "step": 94400
+    },
+    {
+      "epoch": 0.13985092048895237,
+      "grad_norm": 7.0,
+      "learning_rate": 0.0004768094776187221,
+      "loss": 13.9244,
+      "step": 94410
+    },
+    {
+      "epoch": 0.13986573363591656,
+      "grad_norm": 6.8125,
+      "learning_rate": 0.00047680700815121835,
+      "loss": 13.9016,
+      "step": 94420
+    },
+    {
+      "epoch": 0.13988054678288075,
+      "grad_norm": 5.53125,
+      "learning_rate": 0.00047680453868371455,
+      "loss": 13.9302,
+      "step": 94430
+    },
+    {
+      "epoch": 0.13989535992984495,
+      "grad_norm": 6.21875,
+      "learning_rate": 0.0004768020692162108,
+      "loss": 13.8326,
+      "step": 94440
+    },
+    {
+      "epoch": 0.13991017307680914,
+      "grad_norm": 7.59375,
+      "learning_rate": 0.000476799599748707,
+      "loss": 13.9934,
+      "step": 94450
+    },
+    {
+      "epoch": 0.13992498622377333,
+      "grad_norm": 6.3125,
+      "learning_rate": 0.0004767971302812032,
+      "loss": 13.9006,
+      "step": 94460
+    },
+    {
+      "epoch": 0.13993979937073753,
+      "grad_norm": 6.0625,
+      "learning_rate": 0.00047679466081369944,
+      "loss": 13.8914,
+      "step": 94470
+    },
+    {
+      "epoch": 0.13995461251770172,
+      "grad_norm": 5.65625,
+      "learning_rate": 0.0004767921913461957,
+      "loss": 13.9409,
+      "step": 94480
+    },
+    {
+      "epoch": 0.13996942566466591,
+      "grad_norm": 7.53125,
+      "learning_rate": 0.00047678972187869184,
+      "loss": 13.7752,
+      "step": 94490
+    },
+    {
+      "epoch": 0.1399842388116301,
+      "grad_norm": 5.90625,
+      "learning_rate": 0.0004767872524111881,
+      "loss": 13.8204,
+      "step": 94500
+    },
+    {
+      "epoch": 0.1399990519585943,
+      "grad_norm": 6.3125,
+      "learning_rate": 0.0004767847829436843,
+      "loss": 13.9536,
+      "step": 94510
+    },
+    {
+      "epoch": 0.1400138651055585,
+      "grad_norm": 6.25,
+      "learning_rate": 0.0004767823134761805,
+      "loss": 13.8974,
+      "step": 94520
+    },
+    {
+      "epoch": 0.1400286782525227,
+      "grad_norm": 6.53125,
+      "learning_rate": 0.00047677984400867673,
+      "loss": 13.8577,
+      "step": 94530
+    },
+    {
+      "epoch": 0.14004349139948688,
+      "grad_norm": 7.59375,
+      "learning_rate": 0.00047677737454117293,
+      "loss": 13.8667,
+      "step": 94540
+    },
+    {
+      "epoch": 0.14005830454645107,
+      "grad_norm": 9.1875,
+      "learning_rate": 0.0004767749050736692,
+      "loss": 13.8806,
+      "step": 94550
+    },
+    {
+      "epoch": 0.14007311769341527,
+      "grad_norm": 5.84375,
+      "learning_rate": 0.0004767724356061654,
+      "loss": 13.9042,
+      "step": 94560
+    },
+    {
+      "epoch": 0.14008793084037946,
+      "grad_norm": 6.9375,
+      "learning_rate": 0.0004767699661386616,
+      "loss": 13.8953,
+      "step": 94570
+    },
+    {
+      "epoch": 0.14010274398734365,
+      "grad_norm": 6.90625,
+      "learning_rate": 0.0004767674966711578,
+      "loss": 13.8876,
+      "step": 94580
+    },
+    {
+      "epoch": 0.14011755713430785,
+      "grad_norm": 13.4375,
+      "learning_rate": 0.0004767650272036541,
+      "loss": 13.8303,
+      "step": 94590
+    },
+    {
+      "epoch": 0.14013237028127204,
+      "grad_norm": 6.09375,
+      "learning_rate": 0.0004767625577361502,
+      "loss": 13.8466,
+      "step": 94600
+    },
+    {
+      "epoch": 0.14014718342823623,
+      "grad_norm": 6.125,
+      "learning_rate": 0.00047676008826864647,
+      "loss": 13.885,
+      "step": 94610
+    },
+    {
+      "epoch": 0.14016199657520043,
+      "grad_norm": 10.25,
+      "learning_rate": 0.00047675761880114267,
+      "loss": 13.8564,
+      "step": 94620
+    },
+    {
+      "epoch": 0.14017680972216462,
+      "grad_norm": 5.9375,
+      "learning_rate": 0.0004767551493336389,
+      "loss": 13.9354,
+      "step": 94630
+    },
+    {
+      "epoch": 0.1401916228691288,
+      "grad_norm": 7.03125,
+      "learning_rate": 0.0004767526798661351,
+      "loss": 13.8914,
+      "step": 94640
+    },
+    {
+      "epoch": 0.140206436016093,
+      "grad_norm": 6.40625,
+      "learning_rate": 0.0004767502103986313,
+      "loss": 13.859,
+      "step": 94650
+    },
+    {
+      "epoch": 0.1402212491630572,
+      "grad_norm": 6.21875,
+      "learning_rate": 0.00047674774093112756,
+      "loss": 13.8946,
+      "step": 94660
+    },
+    {
+      "epoch": 0.1402360623100214,
+      "grad_norm": 12.0,
+      "learning_rate": 0.00047674527146362376,
+      "loss": 13.8798,
+      "step": 94670
+    },
+    {
+      "epoch": 0.14025087545698559,
+      "grad_norm": 6.90625,
+      "learning_rate": 0.00047674280199611996,
+      "loss": 13.9638,
+      "step": 94680
+    },
+    {
+      "epoch": 0.14026568860394978,
+      "grad_norm": 6.125,
+      "learning_rate": 0.0004767403325286162,
+      "loss": 13.99,
+      "step": 94690
+    },
+    {
+      "epoch": 0.14028050175091397,
+      "grad_norm": 6.96875,
+      "learning_rate": 0.00047673786306111246,
+      "loss": 13.9276,
+      "step": 94700
+    },
+    {
+      "epoch": 0.14029531489787817,
+      "grad_norm": 6.0,
+      "learning_rate": 0.0004767353935936086,
+      "loss": 13.9056,
+      "step": 94710
+    },
+    {
+      "epoch": 0.14031012804484236,
+      "grad_norm": 7.78125,
+      "learning_rate": 0.00047673292412610485,
+      "loss": 13.8641,
+      "step": 94720
+    },
+    {
+      "epoch": 0.14032494119180655,
+      "grad_norm": 6.46875,
+      "learning_rate": 0.00047673045465860105,
+      "loss": 13.8805,
+      "step": 94730
+    },
+    {
+      "epoch": 0.14033975433877074,
+      "grad_norm": 6.3125,
+      "learning_rate": 0.0004767279851910973,
+      "loss": 13.938,
+      "step": 94740
+    },
+    {
+      "epoch": 0.14035456748573494,
+      "grad_norm": 6.75,
+      "learning_rate": 0.0004767255157235935,
+      "loss": 13.8394,
+      "step": 94750
+    },
+    {
+      "epoch": 0.14036938063269913,
+      "grad_norm": 6.9375,
+      "learning_rate": 0.0004767230462560897,
+      "loss": 13.907,
+      "step": 94760
+    },
+    {
+      "epoch": 0.14038419377966332,
+      "grad_norm": 13.25,
+      "learning_rate": 0.00047672057678858594,
+      "loss": 13.8627,
+      "step": 94770
+    },
+    {
+      "epoch": 0.14039900692662752,
+      "grad_norm": 7.6875,
+      "learning_rate": 0.0004767181073210822,
+      "loss": 13.9326,
+      "step": 94780
+    },
+    {
+      "epoch": 0.1404138200735917,
+      "grad_norm": 6.125,
+      "learning_rate": 0.00047671563785357834,
+      "loss": 13.9368,
+      "step": 94790
+    },
+    {
+      "epoch": 0.1404286332205559,
+      "grad_norm": 9.125,
+      "learning_rate": 0.0004767131683860746,
+      "loss": 13.8717,
+      "step": 94800
+    },
+    {
+      "epoch": 0.1404434463675201,
+      "grad_norm": 7.5625,
+      "learning_rate": 0.0004767106989185708,
+      "loss": 13.9224,
+      "step": 94810
+    },
+    {
+      "epoch": 0.1404582595144843,
+      "grad_norm": 6.0,
+      "learning_rate": 0.00047670822945106704,
+      "loss": 13.8873,
+      "step": 94820
+    },
+    {
+      "epoch": 0.14047307266144848,
+      "grad_norm": 8.0625,
+      "learning_rate": 0.00047670575998356323,
+      "loss": 13.9651,
+      "step": 94830
+    },
+    {
+      "epoch": 0.14048788580841268,
+      "grad_norm": 5.625,
+      "learning_rate": 0.00047670329051605943,
+      "loss": 13.8329,
+      "step": 94840
+    },
+    {
+      "epoch": 0.14050269895537687,
+      "grad_norm": 7.59375,
+      "learning_rate": 0.0004767008210485557,
+      "loss": 13.8635,
+      "step": 94850
+    },
+    {
+      "epoch": 0.14051751210234106,
+      "grad_norm": 5.875,
+      "learning_rate": 0.0004766983515810519,
+      "loss": 13.8424,
+      "step": 94860
+    },
+    {
+      "epoch": 0.14053232524930526,
+      "grad_norm": 6.59375,
+      "learning_rate": 0.0004766958821135481,
+      "loss": 13.8477,
+      "step": 94870
+    },
+    {
+      "epoch": 0.14054713839626945,
+      "grad_norm": 6.09375,
+      "learning_rate": 0.0004766934126460443,
+      "loss": 13.9522,
+      "step": 94880
+    },
+    {
+      "epoch": 0.14056195154323364,
+      "grad_norm": 8.4375,
+      "learning_rate": 0.0004766909431785406,
+      "loss": 13.8646,
+      "step": 94890
+    },
+    {
+      "epoch": 0.14057676469019784,
+      "grad_norm": 5.96875,
+      "learning_rate": 0.0004766884737110367,
+      "loss": 13.8809,
+      "step": 94900
+    },
+    {
+      "epoch": 0.14059157783716203,
+      "grad_norm": 6.0,
+      "learning_rate": 0.00047668600424353297,
+      "loss": 13.8384,
+      "step": 94910
+    },
+    {
+      "epoch": 0.14060639098412622,
+      "grad_norm": 6.5,
+      "learning_rate": 0.00047668353477602917,
+      "loss": 13.8713,
+      "step": 94920
+    },
+    {
+      "epoch": 0.14062120413109042,
+      "grad_norm": 6.75,
+      "learning_rate": 0.0004766810653085254,
+      "loss": 13.8549,
+      "step": 94930
+    },
+    {
+      "epoch": 0.1406360172780546,
+      "grad_norm": 6.53125,
+      "learning_rate": 0.0004766785958410216,
+      "loss": 13.855,
+      "step": 94940
+    },
+    {
+      "epoch": 0.1406508304250188,
+      "grad_norm": 6.15625,
+      "learning_rate": 0.0004766761263735178,
+      "loss": 13.7981,
+      "step": 94950
+    },
+    {
+      "epoch": 0.140665643571983,
+      "grad_norm": 6.21875,
+      "learning_rate": 0.00047667365690601406,
+      "loss": 13.7989,
+      "step": 94960
+    },
+    {
+      "epoch": 0.1406804567189472,
+      "grad_norm": 6.0625,
+      "learning_rate": 0.0004766711874385103,
+      "loss": 13.8386,
+      "step": 94970
+    },
+    {
+      "epoch": 0.14069526986591138,
+      "grad_norm": 5.75,
+      "learning_rate": 0.00047666871797100646,
+      "loss": 13.8376,
+      "step": 94980
+    },
+    {
+      "epoch": 0.14071008301287558,
+      "grad_norm": 7.28125,
+      "learning_rate": 0.0004766662485035027,
+      "loss": 13.8766,
+      "step": 94990
+    },
+    {
+      "epoch": 0.14072489615983977,
+      "grad_norm": 6.53125,
+      "learning_rate": 0.00047666377903599896,
+      "loss": 13.9248,
+      "step": 95000
     }
   ],
   "logging_steps": 10,
@@ -65827,7 +66527,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 2.
+  "total_flos": 2.0530642742225456e+20,
   "train_batch_size": 48,
   "trial_name": null,
   "trial_params": null
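The updated trainer_state.json pins this checkpoint at global step 95000 and epoch 0.14072489615983977, so one epoch corresponds to roughly 95000 / 0.1407 ≈ 675,000 optimizer steps at the recorded per-device batch size of 48. A minimal sketch of recovering that figure from the file (the path assumes the last-checkpoint layout shown in this commit):

# Sketch: derive approximate steps-per-epoch from the (epoch, global_step)
# pair recorded in the updated trainer_state.json.
import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

steps_per_epoch = state["global_step"] / state["epoch"]  # 95000 / 0.14072489615983977
print(f"approx. optimizer steps per epoch: {steps_per_epoch:,.0f}")  # ~675,076

Assuming the directory follows the standard transformers Trainer checkpoint layout (which the optimizer.pt / scheduler.pt / rng_state_*.pth file set suggests), the run can be continued with trainer.train(resume_from_checkpoint="last-checkpoint"), which restores the optimizer, scheduler, and per-rank RNG states saved alongside this file.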