{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.7231476163124642,
  "eval_steps": 100,
  "global_step": 3000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005743825387708214,
      "grad_norm": 1.664604902267456,
      "learning_rate": 1.9948305571510626e-05,
      "loss": 0.6766,
      "step": 10
    },
    {
      "epoch": 0.011487650775416428,
      "grad_norm": 1.8649318218231201,
      "learning_rate": 1.9890867317633546e-05,
      "loss": 0.6443,
      "step": 20
    },
    {
      "epoch": 0.01723147616312464,
      "grad_norm": 2.0676939487457275,
      "learning_rate": 1.9833429063756463e-05,
      "loss": 0.6516,
      "step": 30
    },
    {
      "epoch": 0.022975301550832855,
      "grad_norm": 2.077414035797119,
      "learning_rate": 1.9775990809879383e-05,
      "loss": 0.5468,
      "step": 40
    },
    {
      "epoch": 0.02871912693854107,
      "grad_norm": 2.4310338497161865,
      "learning_rate": 1.97185525560023e-05,
      "loss": 0.6289,
      "step": 50
    },
    {
      "epoch": 0.03446295232624928,
      "grad_norm": 2.605480670928955,
      "learning_rate": 1.9661114302125216e-05,
      "loss": 0.583,
      "step": 60
    },
    {
      "epoch": 0.040206777713957496,
      "grad_norm": 2.7097268104553223,
      "learning_rate": 1.9603676048248136e-05,
      "loss": 0.6029,
      "step": 70
    },
    {
      "epoch": 0.04595060310166571,
      "grad_norm": 4.125054359436035,
      "learning_rate": 1.9546237794371053e-05,
      "loss": 0.5545,
      "step": 80
    },
    {
      "epoch": 0.051694428489373924,
      "grad_norm": 4.592983245849609,
      "learning_rate": 1.948879954049397e-05,
      "loss": 0.5027,
      "step": 90
    },
    {
      "epoch": 0.05743825387708214,
      "grad_norm": 4.799990653991699,
      "learning_rate": 1.9431361286616886e-05,
      "loss": 0.4913,
      "step": 100
    },
    {
      "epoch": 0.06318207926479034,
      "grad_norm": 5.146159648895264,
      "learning_rate": 1.9373923032739806e-05,
      "loss": 0.5419,
      "step": 110
    },
    {
      "epoch": 0.06892590465249857,
      "grad_norm": 6.291711807250977,
      "learning_rate": 1.9316484778862726e-05,
      "loss": 0.478,
      "step": 120
    },
    {
      "epoch": 0.07466973004020677,
      "grad_norm": 4.076691627502441,
      "learning_rate": 1.9259046524985643e-05,
      "loss": 0.489,
      "step": 130
    },
    {
      "epoch": 0.08041355542791499,
      "grad_norm": 5.057150363922119,
      "learning_rate": 1.920160827110856e-05,
      "loss": 0.5435,
      "step": 140
    },
    {
      "epoch": 0.0861573808156232,
      "grad_norm": 3.5335211753845215,
      "learning_rate": 1.914417001723148e-05,
      "loss": 0.4445,
      "step": 150
    },
    {
      "epoch": 0.09190120620333142,
      "grad_norm": 6.209961414337158,
      "learning_rate": 1.9086731763354396e-05,
      "loss": 0.4516,
      "step": 160
    },
    {
      "epoch": 0.09764503159103963,
      "grad_norm": 7.1031341552734375,
      "learning_rate": 1.9029293509477313e-05,
      "loss": 0.576,
      "step": 170
    },
    {
      "epoch": 0.10338885697874785,
      "grad_norm": 4.959975719451904,
      "learning_rate": 1.8971855255600233e-05,
      "loss": 0.4841,
      "step": 180
    },
    {
      "epoch": 0.10913268236645605,
      "grad_norm": 4.904316425323486,
      "learning_rate": 1.891441700172315e-05,
      "loss": 0.5007,
      "step": 190
    },
    {
      "epoch": 0.11487650775416428,
      "grad_norm": 8.833961486816406,
      "learning_rate": 1.8856978747846066e-05,
      "loss": 0.5416,
      "step": 200
    },
    {
      "epoch": 0.12062033314187248,
      "grad_norm": 6.040881156921387,
      "learning_rate": 1.8799540493968982e-05,
      "loss": 0.5162,
      "step": 210
    },
    {
      "epoch": 0.1263641585295807,
      "grad_norm": 7.167252063751221,
      "learning_rate": 1.8742102240091902e-05,
      "loss": 0.4965,
      "step": 220
    },
    {
      "epoch": 0.13210798391728892,
      "grad_norm": 3.9503183364868164,
      "learning_rate": 1.8684663986214822e-05,
      "loss": 0.5129,
      "step": 230
    },
    {
      "epoch": 0.13785180930499713,
      "grad_norm": 4.403937339782715,
      "learning_rate": 1.862722573233774e-05,
      "loss": 0.5192,
      "step": 240
    },
    {
      "epoch": 0.14359563469270534,
      "grad_norm": 6.813930034637451,
      "learning_rate": 1.8569787478460656e-05,
      "loss": 0.4927,
      "step": 250
    },
    {
      "epoch": 0.14933946008041354,
      "grad_norm": 4.355352878570557,
      "learning_rate": 1.8512349224583576e-05,
      "loss": 0.4857,
      "step": 260
    },
    {
      "epoch": 0.15508328546812178,
      "grad_norm": 4.012150287628174,
      "learning_rate": 1.8454910970706492e-05,
      "loss": 0.3763,
      "step": 270
    },
    {
      "epoch": 0.16082711085582999,
      "grad_norm": 7.179994106292725,
      "learning_rate": 1.839747271682941e-05,
      "loss": 0.4625,
      "step": 280
    },
    {
      "epoch": 0.1665709362435382,
      "grad_norm": 3.701215982437134,
      "learning_rate": 1.834003446295233e-05,
      "loss": 0.4954,
      "step": 290
    },
    {
      "epoch": 0.1723147616312464,
      "grad_norm": 5.150369167327881,
      "learning_rate": 1.8282596209075246e-05,
      "loss": 0.4875,
      "step": 300
    },
    {
      "epoch": 0.17805858701895463,
      "grad_norm": 4.432060241699219,
      "learning_rate": 1.8225157955198162e-05,
      "loss": 0.5032,
      "step": 310
    },
    {
      "epoch": 0.18380241240666284,
      "grad_norm": 4.560088634490967,
      "learning_rate": 1.816771970132108e-05,
      "loss": 0.4443,
      "step": 320
    },
    {
      "epoch": 0.18954623779437105,
      "grad_norm": 6.000060558319092,
      "learning_rate": 1.8110281447444e-05,
      "loss": 0.4805,
      "step": 330
    },
    {
      "epoch": 0.19529006318207925,
      "grad_norm": 6.462589740753174,
      "learning_rate": 1.805284319356692e-05,
      "loss": 0.4009,
      "step": 340
    },
    {
      "epoch": 0.2010338885697875,
      "grad_norm": 4.6327104568481445,
      "learning_rate": 1.7995404939689835e-05,
      "loss": 0.5126,
      "step": 350
    },
    {
      "epoch": 0.2067777139574957,
      "grad_norm": 5.640307426452637,
      "learning_rate": 1.7937966685812752e-05,
      "loss": 0.4308,
      "step": 360
    },
    {
      "epoch": 0.2125215393452039,
      "grad_norm": 6.553674697875977,
      "learning_rate": 1.7880528431935672e-05,
      "loss": 0.4686,
      "step": 370
    },
    {
      "epoch": 0.2182653647329121,
      "grad_norm": 6.482476234436035,
      "learning_rate": 1.782309017805859e-05,
      "loss": 0.4214,
      "step": 380
    },
    {
      "epoch": 0.22400919012062034,
      "grad_norm": 6.897400379180908,
      "learning_rate": 1.7765651924181505e-05,
      "loss": 0.4827,
      "step": 390
    },
    {
      "epoch": 0.22975301550832855,
      "grad_norm": 3.935074806213379,
      "learning_rate": 1.7708213670304425e-05,
      "loss": 0.4212,
      "step": 400
    },
    {
      "epoch": 0.23549684089603676,
      "grad_norm": 5.766780853271484,
      "learning_rate": 1.7650775416427342e-05,
      "loss": 0.4421,
      "step": 410
    },
    {
      "epoch": 0.24124066628374496,
      "grad_norm": 8.834909439086914,
      "learning_rate": 1.759333716255026e-05,
      "loss": 0.4676,
      "step": 420
    },
    {
      "epoch": 0.2469844916714532,
      "grad_norm": 4.853402614593506,
      "learning_rate": 1.7535898908673175e-05,
      "loss": 0.4888,
      "step": 430
    },
    {
      "epoch": 0.2527283170591614,
      "grad_norm": 5.489238739013672,
      "learning_rate": 1.7478460654796095e-05,
      "loss": 0.3763,
      "step": 440
    },
    {
      "epoch": 0.2584721424468696,
      "grad_norm": 3.805001974105835,
      "learning_rate": 1.7421022400919015e-05,
      "loss": 0.5266,
      "step": 450
    },
    {
      "epoch": 0.26421596783457785,
      "grad_norm": 5.9350080490112305,
      "learning_rate": 1.7363584147041932e-05,
      "loss": 0.4053,
      "step": 460
    },
    {
      "epoch": 0.269959793222286,
      "grad_norm": 8.31826400756836,
      "learning_rate": 1.730614589316485e-05,
      "loss": 0.4775,
      "step": 470
    },
    {
      "epoch": 0.27570361860999426,
      "grad_norm": 5.709866046905518,
      "learning_rate": 1.724870763928777e-05,
      "loss": 0.3681,
      "step": 480
    },
    {
      "epoch": 0.2814474439977025,
      "grad_norm": 2.6657683849334717,
      "learning_rate": 1.7191269385410685e-05,
      "loss": 0.3913,
      "step": 490
    },
    {
      "epoch": 0.2871912693854107,
      "grad_norm": 8.942373275756836,
      "learning_rate": 1.71338311315336e-05,
      "loss": 0.4454,
      "step": 500
    },
    {
      "epoch": 0.2929350947731189,
      "grad_norm": 7.566854476928711,
      "learning_rate": 1.707639287765652e-05,
      "loss": 0.4704,
      "step": 510
    },
    {
      "epoch": 0.2986789201608271,
      "grad_norm": 5.819560527801514,
      "learning_rate": 1.7018954623779438e-05,
      "loss": 0.4245,
      "step": 520
    },
    {
      "epoch": 0.3044227455485353,
      "grad_norm": 7.269554138183594,
      "learning_rate": 1.6961516369902355e-05,
      "loss": 0.523,
      "step": 530
    },
    {
      "epoch": 0.31016657093624356,
      "grad_norm": 4.569669246673584,
      "learning_rate": 1.690407811602527e-05,
      "loss": 0.5053,
      "step": 540
    },
    {
      "epoch": 0.31591039632395174,
      "grad_norm": 6.8373332023620605,
      "learning_rate": 1.684663986214819e-05,
      "loss": 0.4592,
      "step": 550
    },
    {
      "epoch": 0.32165422171165997,
      "grad_norm": 7.736797332763672,
      "learning_rate": 1.678920160827111e-05,
      "loss": 0.4756,
      "step": 560
    },
    {
      "epoch": 0.3273980470993682,
      "grad_norm": 5.857370853424072,
      "learning_rate": 1.6731763354394028e-05,
      "loss": 0.4906,
      "step": 570
    },
    {
      "epoch": 0.3331418724870764,
      "grad_norm": 4.2734785079956055,
      "learning_rate": 1.6674325100516945e-05,
      "loss": 0.4623,
      "step": 580
    },
    {
      "epoch": 0.3388856978747846,
      "grad_norm": 7.859753131866455,
      "learning_rate": 1.6616886846639865e-05,
      "loss": 0.4568,
      "step": 590
    },
    {
      "epoch": 0.3446295232624928,
      "grad_norm": 4.936352252960205,
      "learning_rate": 1.655944859276278e-05,
      "loss": 0.3774,
      "step": 600
    },
    {
      "epoch": 0.35037334865020103,
      "grad_norm": 7.164617538452148,
      "learning_rate": 1.6502010338885698e-05,
      "loss": 0.5373,
      "step": 610
    },
    {
      "epoch": 0.35611717403790927,
      "grad_norm": 7.279328346252441,
      "learning_rate": 1.6444572085008618e-05,
      "loss": 0.4183,
      "step": 620
    },
    {
      "epoch": 0.36186099942561745,
      "grad_norm": 8.805562973022461,
      "learning_rate": 1.6387133831131535e-05,
      "loss": 0.4232,
      "step": 630
    },
    {
      "epoch": 0.3676048248133257,
      "grad_norm": 6.055676460266113,
      "learning_rate": 1.632969557725445e-05,
      "loss": 0.4316,
      "step": 640
    },
    {
      "epoch": 0.3733486502010339,
      "grad_norm": 5.835373401641846,
      "learning_rate": 1.627225732337737e-05,
      "loss": 0.4573,
      "step": 650
    },
    {
      "epoch": 0.3790924755887421,
      "grad_norm": 2.7914931774139404,
      "learning_rate": 1.6214819069500288e-05,
      "loss": 0.3691,
      "step": 660
    },
    {
      "epoch": 0.38483630097645033,
      "grad_norm": 5.780324459075928,
      "learning_rate": 1.6157380815623208e-05,
      "loss": 0.3761,
      "step": 670
    },
    {
      "epoch": 0.3905801263641585,
      "grad_norm": 5.7374348640441895,
      "learning_rate": 1.6099942561746125e-05,
      "loss": 0.4276,
      "step": 680
    },
    {
      "epoch": 0.39632395175186674,
      "grad_norm": 4.771572589874268,
      "learning_rate": 1.604250430786904e-05,
      "loss": 0.4092,
      "step": 690
    },
    {
      "epoch": 0.402067777139575,
      "grad_norm": 7.6168928146362305,
      "learning_rate": 1.598506605399196e-05,
      "loss": 0.4651,
      "step": 700
    },
    {
      "epoch": 0.40781160252728316,
      "grad_norm": 6.272294998168945,
      "learning_rate": 1.5927627800114878e-05,
      "loss": 0.4107,
      "step": 710
    },
    {
      "epoch": 0.4135554279149914,
      "grad_norm": 8.060046195983887,
      "learning_rate": 1.5870189546237794e-05,
      "loss": 0.4117,
      "step": 720
    },
    {
      "epoch": 0.41929925330269957,
      "grad_norm": 7.290645122528076,
      "learning_rate": 1.5812751292360714e-05,
      "loss": 0.5005,
      "step": 730
    },
    {
      "epoch": 0.4250430786904078,
      "grad_norm": 8.02639102935791,
      "learning_rate": 1.575531303848363e-05,
      "loss": 0.4561,
      "step": 740
    },
    {
      "epoch": 0.43078690407811604,
      "grad_norm": 8.455704689025879,
      "learning_rate": 1.5697874784606548e-05,
      "loss": 0.4592,
      "step": 750
    },
    {
      "epoch": 0.4365307294658242,
      "grad_norm": 2.9264323711395264,
      "learning_rate": 1.5640436530729468e-05,
      "loss": 0.4532,
      "step": 760
    },
    {
      "epoch": 0.44227455485353245,
      "grad_norm": 5.246438980102539,
      "learning_rate": 1.5582998276852384e-05,
      "loss": 0.4931,
      "step": 770
    },
    {
      "epoch": 0.4480183802412407,
      "grad_norm": 3.869628667831421,
      "learning_rate": 1.5525560022975304e-05,
      "loss": 0.3476,
      "step": 780
    },
    {
      "epoch": 0.45376220562894887,
      "grad_norm": 4.742978572845459,
      "learning_rate": 1.546812176909822e-05,
      "loss": 0.4274,
      "step": 790
    },
    {
      "epoch": 0.4595060310166571,
      "grad_norm": 6.9326300621032715,
      "learning_rate": 1.5410683515221138e-05,
      "loss": 0.5299,
      "step": 800
    },
    {
      "epoch": 0.4652498564043653,
      "grad_norm": 6.16196346282959,
      "learning_rate": 1.5353245261344058e-05,
      "loss": 0.4805,
      "step": 810
    },
    {
      "epoch": 0.4709936817920735,
      "grad_norm": 3.637753486633301,
      "learning_rate": 1.5295807007466974e-05,
      "loss": 0.4393,
      "step": 820
    },
    {
      "epoch": 0.47673750717978175,
      "grad_norm": 3.867966413497925,
      "learning_rate": 1.523836875358989e-05,
      "loss": 0.4358,
      "step": 830
    },
    {
      "epoch": 0.48248133256748993,
      "grad_norm": 4.753304481506348,
      "learning_rate": 1.5180930499712809e-05,
      "loss": 0.4737,
      "step": 840
    },
    {
      "epoch": 0.48822515795519816,
      "grad_norm": 7.7434515953063965,
      "learning_rate": 1.5123492245835727e-05,
      "loss": 0.4466,
      "step": 850
    },
    {
      "epoch": 0.4939689833429064,
      "grad_norm": 6.001241683959961,
      "learning_rate": 1.5066053991958644e-05,
      "loss": 0.4016,
      "step": 860
    },
    {
      "epoch": 0.4997128087306146,
      "grad_norm": 7.710599422454834,
      "learning_rate": 1.5008615738081564e-05,
      "loss": 0.4758,
      "step": 870
    },
    {
      "epoch": 0.5054566341183228,
      "grad_norm": 5.139091491699219,
      "learning_rate": 1.4951177484204482e-05,
      "loss": 0.4019,
      "step": 880
    },
    {
      "epoch": 0.511200459506031,
      "grad_norm": 9.428766250610352,
      "learning_rate": 1.4893739230327399e-05,
      "loss": 0.3742,
      "step": 890
    },
    {
      "epoch": 0.5169442848937392,
      "grad_norm": 9.071430206298828,
      "learning_rate": 1.4836300976450317e-05,
      "loss": 0.4183,
      "step": 900
    },
    {
      "epoch": 0.5226881102814475,
      "grad_norm": 10.34457015991211,
      "learning_rate": 1.4778862722573236e-05,
      "loss": 0.437,
      "step": 910
    },
    {
      "epoch": 0.5284319356691557,
      "grad_norm": 7.01099967956543,
      "learning_rate": 1.4721424468696152e-05,
      "loss": 0.4421,
      "step": 920
    },
    {
      "epoch": 0.5341757610568638,
      "grad_norm": 3.454484701156616,
      "learning_rate": 1.466398621481907e-05,
      "loss": 0.4283,
      "step": 930
    },
    {
      "epoch": 0.539919586444572,
      "grad_norm": 5.86787223815918,
      "learning_rate": 1.4606547960941987e-05,
      "loss": 0.4094,
      "step": 940
    },
    {
      "epoch": 0.5456634118322803,
      "grad_norm": 8.41640567779541,
      "learning_rate": 1.4549109707064906e-05,
      "loss": 0.414,
      "step": 950
    },
    {
      "epoch": 0.5514072372199885,
      "grad_norm": 6.936724662780762,
      "learning_rate": 1.4491671453187824e-05,
      "loss": 0.3936,
      "step": 960
    },
    {
      "epoch": 0.5571510626076968,
      "grad_norm": 7.984805583953857,
      "learning_rate": 1.4434233199310744e-05,
      "loss": 0.4071,
      "step": 970
    },
    {
      "epoch": 0.562894887995405,
      "grad_norm": 4.381742477416992,
      "learning_rate": 1.437679494543366e-05,
      "loss": 0.3393,
      "step": 980
    },
    {
      "epoch": 0.5686387133831131,
      "grad_norm": 7.115252494812012,
      "learning_rate": 1.4319356691556579e-05,
      "loss": 0.5616,
      "step": 990
    },
    {
      "epoch": 0.5743825387708213,
      "grad_norm": 6.60570764541626,
      "learning_rate": 1.4261918437679495e-05,
      "loss": 0.4633,
      "step": 1000
    },
    {
      "epoch": 0.5801263641585296,
      "grad_norm": 9.552061080932617,
      "learning_rate": 1.4204480183802414e-05,
      "loss": 0.4973,
      "step": 1010
    },
    {
      "epoch": 0.5858701895462378,
      "grad_norm": 3.6467580795288086,
      "learning_rate": 1.4147041929925332e-05,
      "loss": 0.3958,
      "step": 1020
    },
    {
      "epoch": 0.591614014933946,
      "grad_norm": 2.663799524307251,
      "learning_rate": 1.4089603676048249e-05,
      "loss": 0.3402,
      "step": 1030
    },
    {
      "epoch": 0.5973578403216542,
      "grad_norm": 7.468900203704834,
      "learning_rate": 1.4032165422171167e-05,
      "loss": 0.3341,
      "step": 1040
    },
    {
      "epoch": 0.6031016657093624,
      "grad_norm": 10.396268844604492,
      "learning_rate": 1.3974727168294084e-05,
      "loss": 0.5122,
      "step": 1050
    },
    {
      "epoch": 0.6088454910970706,
      "grad_norm": 7.79450798034668,
      "learning_rate": 1.3917288914417002e-05,
      "loss": 0.5581,
      "step": 1060
    },
    {
      "epoch": 0.6145893164847789,
      "grad_norm": 8.077397346496582,
      "learning_rate": 1.385985066053992e-05,
      "loss": 0.4247,
      "step": 1070
    },
    {
      "epoch": 0.6203331418724871,
      "grad_norm": 8.327542304992676,
      "learning_rate": 1.380241240666284e-05,
      "loss": 0.4122,
      "step": 1080
    },
    {
      "epoch": 0.6260769672601952,
      "grad_norm": 7.940774917602539,
      "learning_rate": 1.3744974152785757e-05,
      "loss": 0.5199,
      "step": 1090
    },
    {
      "epoch": 0.6318207926479035,
      "grad_norm": 5.148271560668945,
      "learning_rate": 1.3687535898908675e-05,
      "loss": 0.4534,
      "step": 1100
    },
    {
      "epoch": 0.6375646180356117,
      "grad_norm": 7.042996883392334,
      "learning_rate": 1.3630097645031592e-05,
      "loss": 0.4737,
      "step": 1110
    },
    {
      "epoch": 0.6433084434233199,
      "grad_norm": 5.131284236907959,
      "learning_rate": 1.357265939115451e-05,
      "loss": 0.3637,
      "step": 1120
    },
    {
      "epoch": 0.6490522688110282,
      "grad_norm": 10.73865795135498,
      "learning_rate": 1.3515221137277428e-05,
      "loss": 0.4152,
      "step": 1130
    },
    {
      "epoch": 0.6547960941987364,
      "grad_norm": 6.061016082763672,
      "learning_rate": 1.3457782883400345e-05,
      "loss": 0.343,
      "step": 1140
    },
    {
      "epoch": 0.6605399195864445,
      "grad_norm": 11.200201988220215,
      "learning_rate": 1.3400344629523263e-05,
      "loss": 0.4781,
      "step": 1150
    },
    {
      "epoch": 0.6662837449741528,
      "grad_norm": 6.987539768218994,
      "learning_rate": 1.334290637564618e-05,
      "loss": 0.4046,
      "step": 1160
    },
    {
      "epoch": 0.672027570361861,
      "grad_norm": 7.3787713050842285,
      "learning_rate": 1.3285468121769098e-05,
      "loss": 0.4136,
      "step": 1170
    },
    {
      "epoch": 0.6777713957495692,
      "grad_norm": 8.829427719116211,
      "learning_rate": 1.3228029867892018e-05,
      "loss": 0.3807,
      "step": 1180
    },
    {
      "epoch": 0.6835152211372775,
      "grad_norm": 9.648842811584473,
      "learning_rate": 1.3170591614014937e-05,
      "loss": 0.3273,
      "step": 1190
    },
    {
      "epoch": 0.6892590465249856,
      "grad_norm": 7.307587146759033,
      "learning_rate": 1.3113153360137853e-05,
      "loss": 0.3351,
      "step": 1200
    },
    {
      "epoch": 0.6950028719126938,
      "grad_norm": 6.445584297180176,
      "learning_rate": 1.3055715106260772e-05,
      "loss": 0.4776,
      "step": 1210
    },
    {
      "epoch": 0.7007466973004021,
      "grad_norm": 7.078349590301514,
      "learning_rate": 1.2998276852383688e-05,
      "loss": 0.4438,
      "step": 1220
    },
    {
      "epoch": 0.7064905226881103,
      "grad_norm": 9.63571834564209,
      "learning_rate": 1.2940838598506606e-05,
      "loss": 0.4027,
      "step": 1230
    },
    {
      "epoch": 0.7122343480758185,
      "grad_norm": 3.2861080169677734,
      "learning_rate": 1.2883400344629525e-05,
      "loss": 0.3334,
      "step": 1240
    },
    {
      "epoch": 0.7179781734635267,
      "grad_norm": 7.433917999267578,
      "learning_rate": 1.2825962090752441e-05,
      "loss": 0.4418,
      "step": 1250
    },
    {
      "epoch": 0.7237219988512349,
      "grad_norm": 7.511765003204346,
      "learning_rate": 1.276852383687536e-05,
      "loss": 0.3693,
      "step": 1260
    },
    {
      "epoch": 0.7294658242389431,
      "grad_norm": 9.38160228729248,
      "learning_rate": 1.2711085582998276e-05,
      "loss": 0.3989,
      "step": 1270
    },
    {
      "epoch": 0.7352096496266514,
      "grad_norm": 2.012756586074829,
      "learning_rate": 1.2653647329121195e-05,
      "loss": 0.3867,
      "step": 1280
    },
    {
      "epoch": 0.7409534750143596,
      "grad_norm": 6.777096748352051,
      "learning_rate": 1.2596209075244115e-05,
      "loss": 0.5528,
      "step": 1290
    },
    {
      "epoch": 0.7466973004020678,
      "grad_norm": 6.928145885467529,
      "learning_rate": 1.2538770821367033e-05,
      "loss": 0.3403,
      "step": 1300
    },
    {
      "epoch": 0.752441125789776,
      "grad_norm": 7.99967622756958,
      "learning_rate": 1.248133256748995e-05,
      "loss": 0.5006,
      "step": 1310
    },
    {
      "epoch": 0.7581849511774842,
      "grad_norm": 4.8364033699035645,
      "learning_rate": 1.2423894313612868e-05,
      "loss": 0.4266,
      "step": 1320
    },
    {
      "epoch": 0.7639287765651924,
      "grad_norm": 4.017831802368164,
      "learning_rate": 1.2366456059735785e-05,
      "loss": 0.3444,
      "step": 1330
    },
    {
      "epoch": 0.7696726019529007,
      "grad_norm": 5.27449893951416,
      "learning_rate": 1.2309017805858703e-05,
      "loss": 0.3962,
      "step": 1340
    },
    {
      "epoch": 0.7754164273406089,
      "grad_norm": 7.989853858947754,
      "learning_rate": 1.2251579551981621e-05,
      "loss": 0.4172,
      "step": 1350
    },
    {
      "epoch": 0.781160252728317,
      "grad_norm": 5.878440856933594,
      "learning_rate": 1.2194141298104538e-05,
      "loss": 0.4723,
      "step": 1360
    },
    {
      "epoch": 0.7869040781160253,
      "grad_norm": 9.140411376953125,
      "learning_rate": 1.2136703044227456e-05,
      "loss": 0.4012,
      "step": 1370
    },
    {
      "epoch": 0.7926479035037335,
      "grad_norm": 3.9119012355804443,
      "learning_rate": 1.2079264790350373e-05,
      "loss": 0.3545,
      "step": 1380
    },
    {
      "epoch": 0.7983917288914417,
      "grad_norm": 6.248933792114258,
      "learning_rate": 1.2021826536473291e-05,
      "loss": 0.4787,
      "step": 1390
    },
    {
      "epoch": 0.80413555427915,
      "grad_norm": 8.478959083557129,
      "learning_rate": 1.1964388282596211e-05,
      "loss": 0.4077,
      "step": 1400
    },
    {
      "epoch": 0.8098793796668581,
      "grad_norm": 6.234384059906006,
      "learning_rate": 1.190695002871913e-05,
      "loss": 0.4044,
      "step": 1410
    },
    {
      "epoch": 0.8156232050545663,
      "grad_norm": 5.093031883239746,
      "learning_rate": 1.1849511774842046e-05,
      "loss": 0.3298,
      "step": 1420
    },
    {
      "epoch": 0.8213670304422745,
      "grad_norm": 5.755350112915039,
      "learning_rate": 1.1792073520964964e-05,
      "loss": 0.4099,
      "step": 1430
    },
    {
      "epoch": 0.8271108558299828,
      "grad_norm": 9.269704818725586,
      "learning_rate": 1.1734635267087881e-05,
      "loss": 0.366,
      "step": 1440
    },
    {
      "epoch": 0.832854681217691,
      "grad_norm": 4.977533340454102,
      "learning_rate": 1.16771970132108e-05,
      "loss": 0.3156,
      "step": 1450
    },
    {
      "epoch": 0.8385985066053991,
      "grad_norm": 6.767063140869141,
      "learning_rate": 1.1619758759333718e-05,
      "loss": 0.3966,
      "step": 1460
    },
    {
      "epoch": 0.8443423319931074,
      "grad_norm": 6.855627536773682,
      "learning_rate": 1.1562320505456634e-05,
      "loss": 0.3488,
      "step": 1470
    },
    {
      "epoch": 0.8500861573808156,
      "grad_norm": 3.408679723739624,
      "learning_rate": 1.1504882251579552e-05,
      "loss": 0.4016,
      "step": 1480
    },
    {
      "epoch": 0.8558299827685238,
      "grad_norm": 8.43376636505127,
      "learning_rate": 1.1447443997702469e-05,
      "loss": 0.3951,
      "step": 1490
    },
    {
      "epoch": 0.8615738081562321,
      "grad_norm": 7.106573104858398,
      "learning_rate": 1.1390005743825389e-05,
      "loss": 0.3056,
      "step": 1500
    },
    {
      "epoch": 0.8673176335439403,
      "grad_norm": 3.373734474182129,
      "learning_rate": 1.1332567489948307e-05,
      "loss": 0.4548,
      "step": 1510
    },
    {
      "epoch": 0.8730614589316484,
      "grad_norm": 4.657841205596924,
      "learning_rate": 1.1275129236071226e-05,
      "loss": 0.36,
      "step": 1520
    },
    {
      "epoch": 0.8788052843193567,
      "grad_norm": 8.218329429626465,
      "learning_rate": 1.1217690982194142e-05,
      "loss": 0.4297,
      "step": 1530
    },
    {
      "epoch": 0.8845491097070649,
      "grad_norm": 7.709052562713623,
      "learning_rate": 1.116025272831706e-05,
      "loss": 0.3766,
      "step": 1540
    },
    {
      "epoch": 0.8902929350947731,
      "grad_norm": 6.875143527984619,
      "learning_rate": 1.1102814474439977e-05,
      "loss": 0.4106,
      "step": 1550
    },
    {
      "epoch": 0.8960367604824814,
      "grad_norm": 5.460892200469971,
      "learning_rate": 1.1045376220562896e-05,
      "loss": 0.3016,
      "step": 1560
    },
    {
      "epoch": 0.9017805858701895,
      "grad_norm": 3.4830429553985596,
      "learning_rate": 1.0987937966685814e-05,
      "loss": 0.371,
      "step": 1570
    },
    {
      "epoch": 0.9075244112578977,
      "grad_norm": 8.233579635620117,
      "learning_rate": 1.093049971280873e-05,
      "loss": 0.4591,
      "step": 1580
    },
    {
      "epoch": 0.913268236645606,
      "grad_norm": 7.001081466674805,
      "learning_rate": 1.0873061458931649e-05,
      "loss": 0.3856,
      "step": 1590
    },
    {
      "epoch": 0.9190120620333142,
      "grad_norm": 7.473963260650635,
      "learning_rate": 1.0815623205054565e-05,
      "loss": 0.4937,
      "step": 1600
    },
    {
      "epoch": 0.9247558874210224,
      "grad_norm": 4.046863079071045,
      "learning_rate": 1.0758184951177485e-05,
      "loss": 0.4141,
      "step": 1610
    },
    {
      "epoch": 0.9304997128087306,
      "grad_norm": 5.425885200500488,
      "learning_rate": 1.0700746697300404e-05,
      "loss": 0.3545,
      "step": 1620
    },
    {
      "epoch": 0.9362435381964388,
      "grad_norm": 5.255967140197754,
      "learning_rate": 1.0643308443423322e-05,
      "loss": 0.3881,
      "step": 1630
    },
    {
      "epoch": 0.941987363584147,
      "grad_norm": 7.365703105926514,
      "learning_rate": 1.0585870189546239e-05,
      "loss": 0.3648,
      "step": 1640
    },
    {
      "epoch": 0.9477311889718553,
      "grad_norm": 5.149658679962158,
      "learning_rate": 1.0528431935669157e-05,
      "loss": 0.2952,
      "step": 1650
    },
    {
      "epoch": 0.9534750143595635,
      "grad_norm": 5.5968194007873535,
      "learning_rate": 1.0470993681792074e-05,
      "loss": 0.3604,
      "step": 1660
    },
    {
      "epoch": 0.9592188397472717,
      "grad_norm": 6.176368713378906,
      "learning_rate": 1.0413555427914992e-05,
      "loss": 0.3584,
      "step": 1670
    },
    {
      "epoch": 0.9649626651349799,
      "grad_norm": 5.876338958740234,
      "learning_rate": 1.035611717403791e-05,
      "loss": 0.4076,
      "step": 1680
    },
    {
      "epoch": 0.9707064905226881,
      "grad_norm": 11.697908401489258,
      "learning_rate": 1.0298678920160827e-05,
      "loss": 0.3648,
      "step": 1690
    },
    {
      "epoch": 0.9764503159103963,
      "grad_norm": 7.04163122177124,
      "learning_rate": 1.0241240666283745e-05,
      "loss": 0.349,
      "step": 1700
    },
    {
      "epoch": 0.9821941412981046,
      "grad_norm": 6.7707133293151855,
      "learning_rate": 1.0183802412406662e-05,
      "loss": 0.3888,
      "step": 1710
    },
    {
      "epoch": 0.9879379666858128,
      "grad_norm": 3.8108270168304443,
      "learning_rate": 1.0126364158529582e-05,
      "loss": 0.3753,
      "step": 1720
    },
    {
      "epoch": 0.9936817920735209,
      "grad_norm": 11.013320922851562,
      "learning_rate": 1.00689259046525e-05,
      "loss": 0.3977,
      "step": 1730
    },
    {
      "epoch": 0.9994256174612292,
      "grad_norm": 4.042791843414307,
      "learning_rate": 1.0011487650775419e-05,
      "loss": 0.3724,
      "step": 1740
    },
    {
      "epoch": 1.0051694428489375,
      "grad_norm": 6.258309841156006,
      "learning_rate": 9.954049396898335e-06,
      "loss": 0.3591,
      "step": 1750
    },
    {
      "epoch": 1.0109132682366455,
      "grad_norm": 7.884782314300537,
      "learning_rate": 9.896611143021253e-06,
      "loss": 0.4114,
      "step": 1760
    },
    {
      "epoch": 1.0166570936243537,
      "grad_norm": 4.9663567543029785,
      "learning_rate": 9.83917288914417e-06,
      "loss": 0.4462,
      "step": 1770
    },
    {
      "epoch": 1.022400919012062,
      "grad_norm": 7.046320915222168,
      "learning_rate": 9.781734635267088e-06,
      "loss": 0.3386,
      "step": 1780
    },
    {
      "epoch": 1.0281447443997702,
      "grad_norm": 6.846945762634277,
      "learning_rate": 9.724296381390007e-06,
      "loss": 0.3181,
      "step": 1790
    },
    {
      "epoch": 1.0338885697874785,
      "grad_norm": 5.925526142120361,
      "learning_rate": 9.666858127512925e-06,
      "loss": 0.3357,
      "step": 1800
    },
    {
      "epoch": 1.0396323951751867,
      "grad_norm": 14.302725791931152,
      "learning_rate": 9.609419873635842e-06,
      "loss": 0.2859,
      "step": 1810
    },
    {
      "epoch": 1.045376220562895,
      "grad_norm": 8.27291488647461,
      "learning_rate": 9.55198161975876e-06,
      "loss": 0.3688,
      "step": 1820
    },
    {
      "epoch": 1.0511200459506032,
      "grad_norm": 5.266950607299805,
      "learning_rate": 9.494543365881678e-06,
      "loss": 0.3106,
      "step": 1830
    },
    {
      "epoch": 1.0568638713383114,
      "grad_norm": 11.347005844116211,
      "learning_rate": 9.437105112004595e-06,
      "loss": 0.3389,
      "step": 1840
    },
    {
      "epoch": 1.0626076967260196,
      "grad_norm": 4.7072906494140625,
      "learning_rate": 9.379666858127515e-06,
      "loss": 0.3519,
      "step": 1850
    },
    {
      "epoch": 1.0683515221137276,
      "grad_norm": 4.05309534072876,
      "learning_rate": 9.322228604250432e-06,
      "loss": 0.3006,
      "step": 1860
    },
    {
      "epoch": 1.0740953475014359,
      "grad_norm": 5.578520774841309,
      "learning_rate": 9.26479035037335e-06,
      "loss": 0.3645,
      "step": 1870
    },
    {
      "epoch": 1.079839172889144,
      "grad_norm": 7.405791282653809,
      "learning_rate": 9.207352096496266e-06,
      "loss": 0.3104,
      "step": 1880
    },
    {
      "epoch": 1.0855829982768523,
      "grad_norm": 9.269173622131348,
      "learning_rate": 9.149913842619185e-06,
      "loss": 0.3576,
      "step": 1890
    },
    {
      "epoch": 1.0913268236645606,
      "grad_norm": 5.276297569274902,
      "learning_rate": 9.092475588742103e-06,
      "loss": 0.3354,
      "step": 1900
    },
    {
      "epoch": 1.0970706490522688,
      "grad_norm": 8.320406913757324,
      "learning_rate": 9.035037334865021e-06,
      "loss": 0.3232,
      "step": 1910
    },
    {
      "epoch": 1.102814474439977,
      "grad_norm": 6.023215293884277,
      "learning_rate": 8.977599080987938e-06,
      "loss": 0.3427,
      "step": 1920
    },
    {
      "epoch": 1.1085582998276853,
      "grad_norm": 8.178590774536133,
      "learning_rate": 8.920160827110856e-06,
      "loss": 0.3103,
      "step": 1930
    },
    {
      "epoch": 1.1143021252153935,
      "grad_norm": 6.056619644165039,
      "learning_rate": 8.862722573233775e-06,
      "loss": 0.3848,
      "step": 1940
    },
    {
      "epoch": 1.1200459506031017,
      "grad_norm": 6.109485626220703,
      "learning_rate": 8.805284319356693e-06,
      "loss": 0.3102,
      "step": 1950
    },
    {
      "epoch": 1.12578977599081,
      "grad_norm": 6.949984550476074,
      "learning_rate": 8.747846065479611e-06,
      "loss": 0.3013,
      "step": 1960
    },
    {
      "epoch": 1.1315336013785182,
      "grad_norm": 4.320880889892578,
      "learning_rate": 8.690407811602528e-06,
      "loss": 0.3169,
      "step": 1970
    },
    {
      "epoch": 1.1372774267662262,
      "grad_norm": 9.62964916229248,
      "learning_rate": 8.632969557725446e-06,
      "loss": 0.3577,
      "step": 1980
    },
    {
      "epoch": 1.1430212521539345,
      "grad_norm": 7.1865105628967285,
      "learning_rate": 8.575531303848363e-06,
      "loss": 0.4245,
      "step": 1990
    },
    {
      "epoch": 1.1487650775416427,
      "grad_norm": 11.42944622039795,
      "learning_rate": 8.518093049971281e-06,
      "loss": 0.3421,
      "step": 2000
    },
    {
      "epoch": 1.154508902929351,
      "grad_norm": 10.365814208984375,
      "learning_rate": 8.4606547960942e-06,
      "loss": 0.2971,
      "step": 2010
    },
    {
      "epoch": 1.1602527283170592,
      "grad_norm": 4.546888828277588,
      "learning_rate": 8.403216542217118e-06,
      "loss": 0.3762,
      "step": 2020
    },
    {
      "epoch": 1.1659965537047674,
      "grad_norm": 9.672823905944824,
      "learning_rate": 8.345778288340034e-06,
      "loss": 0.3411,
      "step": 2030
    },
    {
      "epoch": 1.1717403790924756,
      "grad_norm": 4.738915920257568,
      "learning_rate": 8.288340034462953e-06,
      "loss": 0.3453,
      "step": 2040
    },
    {
      "epoch": 1.1774842044801839,
      "grad_norm": 10.187810897827148,
      "learning_rate": 8.230901780585871e-06,
      "loss": 0.3351,
      "step": 2050
    },
    {
      "epoch": 1.183228029867892,
      "grad_norm": 6.290671348571777,
      "learning_rate": 8.17346352670879e-06,
      "loss": 0.3286,
      "step": 2060
    },
    {
      "epoch": 1.1889718552556001,
      "grad_norm": 9.14261531829834,
      "learning_rate": 8.116025272831708e-06,
      "loss": 0.2878,
      "step": 2070
    },
    {
      "epoch": 1.1947156806433084,
      "grad_norm": 7.814758777618408,
      "learning_rate": 8.058587018954624e-06,
      "loss": 0.3469,
      "step": 2080
    },
    {
      "epoch": 1.2004595060310166,
      "grad_norm": 10.085731506347656,
      "learning_rate": 8.001148765077543e-06,
      "loss": 0.3025,
      "step": 2090
    },
    {
      "epoch": 1.2062033314187248,
      "grad_norm": 10.734376907348633,
      "learning_rate": 7.94371051120046e-06,
      "loss": 0.2442,
      "step": 2100
    },
    {
      "epoch": 1.211947156806433,
      "grad_norm": 13.61286735534668,
      "learning_rate": 7.88627225732338e-06,
      "loss": 0.3063,
      "step": 2110
    },
    {
      "epoch": 1.2176909821941413,
      "grad_norm": 8.572850227355957,
      "learning_rate": 7.828834003446296e-06,
      "loss": 0.3863,
      "step": 2120
    },
    {
      "epoch": 1.2234348075818495,
      "grad_norm": 6.247170448303223,
      "learning_rate": 7.771395749569214e-06,
      "loss": 0.3214,
      "step": 2130
    },
    {
      "epoch": 1.2291786329695578,
      "grad_norm": 7.438636779785156,
      "learning_rate": 7.71395749569213e-06,
      "loss": 0.381,
      "step": 2140
    },
    {
      "epoch": 1.234922458357266,
      "grad_norm": 6.11846399307251,
      "learning_rate": 7.656519241815049e-06,
      "loss": 0.3556,
      "step": 2150
    },
    {
      "epoch": 1.2406662837449742,
      "grad_norm": 10.697092056274414,
      "learning_rate": 7.5990809879379666e-06,
      "loss": 0.3767,
      "step": 2160
    },
    {
      "epoch": 1.2464101091326825,
      "grad_norm": 5.3118205070495605,
      "learning_rate": 7.541642734060886e-06,
      "loss": 0.3112,
      "step": 2170
    },
    {
      "epoch": 1.2521539345203907,
      "grad_norm": 5.907925128936768,
      "learning_rate": 7.484204480183803e-06,
      "loss": 0.2833,
      "step": 2180
    },
    {
      "epoch": 1.2578977599080987,
      "grad_norm": 7.271302223205566,
      "learning_rate": 7.426766226306721e-06,
      "loss": 0.1935,
      "step": 2190
    },
    {
      "epoch": 1.263641585295807,
      "grad_norm": 12.389423370361328,
      "learning_rate": 7.369327972429638e-06,
      "loss": 0.3946,
      "step": 2200
    },
    {
      "epoch": 1.2693854106835152,
      "grad_norm": 9.09422492980957,
      "learning_rate": 7.3118897185525564e-06,
      "loss": 0.3191,
      "step": 2210
    },
    {
      "epoch": 1.2751292360712234,
      "grad_norm": 8.75156307220459,
      "learning_rate": 7.254451464675475e-06,
      "loss": 0.4077,
      "step": 2220
    },
    {
      "epoch": 1.2808730614589316,
      "grad_norm": 7.306863784790039,
      "learning_rate": 7.197013210798392e-06,
      "loss": 0.3295,
      "step": 2230
    },
    {
      "epoch": 1.2866168868466399,
      "grad_norm": 9.715473175048828,
      "learning_rate": 7.1395749569213105e-06,
      "loss": 0.4163,
      "step": 2240
    },
    {
      "epoch": 1.2923607122343481,
      "grad_norm": 6.315252780914307,
      "learning_rate": 7.082136703044228e-06,
      "loss": 0.3817,
      "step": 2250
    },
    {
      "epoch": 1.2981045376220564,
      "grad_norm": 8.821859359741211,
      "learning_rate": 7.0246984491671455e-06,
      "loss": 0.3265,
      "step": 2260
    },
    {
      "epoch": 1.3038483630097644,
      "grad_norm": 6.838233947753906,
      "learning_rate": 6.967260195290065e-06,
      "loss": 0.2966,
      "step": 2270
    },
    {
      "epoch": 1.3095921883974726,
      "grad_norm": 9.925073623657227,
      "learning_rate": 6.909821941412982e-06,
      "loss": 0.4484,
      "step": 2280
    },
    {
      "epoch": 1.3153360137851808,
      "grad_norm": 5.026411056518555,
      "learning_rate": 6.8523836875358996e-06,
      "loss": 0.4647,
      "step": 2290
    },
    {
      "epoch": 1.321079839172889,
      "grad_norm": 4.3956732749938965,
      "learning_rate": 6.794945433658817e-06,
      "loss": 0.2968,
      "step": 2300
    },
    {
      "epoch": 1.3268236645605973,
      "grad_norm": 6.904971599578857,
      "learning_rate": 6.7375071797817345e-06,
      "loss": 0.2888,
      "step": 2310
    },
    {
      "epoch": 1.3325674899483055,
      "grad_norm": 3.1684281826019287,
      "learning_rate": 6.680068925904653e-06,
      "loss": 0.2975,
      "step": 2320
    },
    {
      "epoch": 1.3383113153360138,
      "grad_norm": 7.3333420753479,
      "learning_rate": 6.622630672027571e-06,
      "loss": 0.3911,
      "step": 2330
    },
    {
      "epoch": 1.344055140723722,
      "grad_norm": 7.822445392608643,
      "learning_rate": 6.565192418150489e-06,
      "loss": 0.3199,
      "step": 2340
    },
    {
      "epoch": 1.3497989661114302,
      "grad_norm": 9.02872371673584,
      "learning_rate": 6.507754164273407e-06,
      "loss": 0.4698,
      "step": 2350
    },
    {
      "epoch": 1.3555427914991385,
      "grad_norm": 3.9332520961761475,
      "learning_rate": 6.450315910396324e-06,
      "loss": 0.3651,
      "step": 2360
    },
    {
      "epoch": 1.3612866168868467,
      "grad_norm": 7.590347766876221,
      "learning_rate": 6.392877656519242e-06,
      "loss": 0.3121,
      "step": 2370
    },
    {
      "epoch": 1.367030442274555,
      "grad_norm": 8.964584350585938,
      "learning_rate": 6.335439402642161e-06,
      "loss": 0.271,
      "step": 2380
    },
    {
      "epoch": 1.3727742676622632,
      "grad_norm": 8.058918952941895,
      "learning_rate": 6.2780011487650785e-06,
      "loss": 0.3906,
      "step": 2390
    },
    {
      "epoch": 1.3785180930499714,
      "grad_norm": 6.742099761962891,
      "learning_rate": 6.220562894887996e-06,
      "loss": 0.3072,
      "step": 2400
    },
    {
      "epoch": 1.3842619184376794,
      "grad_norm": 5.961569309234619,
      "learning_rate": 6.1631246410109134e-06,
      "loss": 0.3673,
      "step": 2410
    },
    {
      "epoch": 1.3900057438253877,
      "grad_norm": 9.705893516540527,
      "learning_rate": 6.105686387133831e-06,
      "loss": 0.3544,
      "step": 2420
    },
    {
      "epoch": 1.395749569213096,
      "grad_norm": 4.435375690460205,
      "learning_rate": 6.04824813325675e-06,
      "loss": 0.2372,
      "step": 2430
    },
    {
      "epoch": 1.4014933946008041,
      "grad_norm": 5.375720977783203,
      "learning_rate": 5.9908098793796675e-06,
      "loss": 0.2264,
      "step": 2440
    },
    {
      "epoch": 1.4072372199885124,
      "grad_norm": 5.602358818054199,
      "learning_rate": 5.933371625502585e-06,
      "loss": 0.3449,
      "step": 2450
    },
    {
      "epoch": 1.4129810453762206,
      "grad_norm": 10.811373710632324,
      "learning_rate": 5.875933371625503e-06,
      "loss": 0.3663,
      "step": 2460
    },
    {
      "epoch": 1.4187248707639288,
      "grad_norm": 10.196518898010254,
      "learning_rate": 5.818495117748421e-06,
      "loss": 0.3546,
      "step": 2470
    },
    {
      "epoch": 1.424468696151637,
      "grad_norm": 10.06306266784668,
      "learning_rate": 5.761056863871339e-06,
      "loss": 0.3282,
      "step": 2480
    },
    {
      "epoch": 1.430212521539345,
      "grad_norm": 4.978325843811035,
      "learning_rate": 5.703618609994257e-06,
      "loss": 0.2961,
      "step": 2490
    },
    {
      "epoch": 1.4359563469270533,
      "grad_norm": 10.731146812438965,
      "learning_rate": 5.646180356117175e-06,
      "loss": 0.3349,
      "step": 2500
    },
    {
      "epoch": 1.4417001723147616,
      "grad_norm": 8.913891792297363,
      "learning_rate": 5.588742102240092e-06,
      "loss": 0.3131,
      "step": 2510
    },
    {
      "epoch": 1.4474439977024698,
      "grad_norm": 5.1745195388793945,
      "learning_rate": 5.53130384836301e-06,
      "loss": 0.3842,
      "step": 2520
    },
    {
      "epoch": 1.453187823090178,
      "grad_norm": 8.361491203308105,
      "learning_rate": 5.473865594485927e-06,
      "loss": 0.3598,
      "step": 2530
    },
    {
      "epoch": 1.4589316484778863,
      "grad_norm": 6.487078666687012,
      "learning_rate": 5.4164273406088464e-06,
      "loss": 0.3244,
      "step": 2540
    },
    {
      "epoch": 1.4646754738655945,
      "grad_norm": 4.129726409912109,
      "learning_rate": 5.358989086731764e-06,
      "loss": 0.3152,
      "step": 2550
    },
    {
      "epoch": 1.4704192992533027,
      "grad_norm": 9.363592147827148,
      "learning_rate": 5.301550832854681e-06,
      "loss": 0.3321,
      "step": 2560
    },
    {
      "epoch": 1.476163124641011,
      "grad_norm": 6.334773063659668,
      "learning_rate": 5.2441125789776e-06,
      "loss": 0.3312,
      "step": 2570
    },
    {
      "epoch": 1.4819069500287192,
      "grad_norm": 7.404930114746094,
      "learning_rate": 5.186674325100517e-06,
      "loss": 0.3739,
      "step": 2580
    },
    {
      "epoch": 1.4876507754164274,
      "grad_norm": 7.487016201019287,
      "learning_rate": 5.1292360712234355e-06,
      "loss": 0.2865,
      "step": 2590
    },
    {
      "epoch": 1.4933946008041357,
      "grad_norm": 13.322307586669922,
      "learning_rate": 5.071797817346353e-06,
      "loss": 0.3444,
      "step": 2600
    },
    {
      "epoch": 1.499138426191844,
      "grad_norm": 9.053878784179688,
      "learning_rate": 5.014359563469271e-06,
      "loss": 0.3799,
      "step": 2610
    },
    {
      "epoch": 1.5048822515795521,
      "grad_norm": 4.018943786621094,
      "learning_rate": 4.956921309592189e-06,
      "loss": 0.2567,
      "step": 2620
    },
    {
      "epoch": 1.5106260769672601,
      "grad_norm": 2.2457354068756104,
      "learning_rate": 4.899483055715107e-06,
      "loss": 0.2978,
      "step": 2630
    },
    {
      "epoch": 1.5163699023549684,
      "grad_norm": 4.894889831542969,
      "learning_rate": 4.8420448018380245e-06,
      "loss": 0.2861,
      "step": 2640
    },
    {
      "epoch": 1.5221137277426766,
      "grad_norm": 6.843629360198975,
      "learning_rate": 4.784606547960942e-06,
      "loss": 0.3254,
      "step": 2650
    },
    {
      "epoch": 1.5278575531303848,
      "grad_norm": 7.173573970794678,
      "learning_rate": 4.72716829408386e-06,
      "loss": 0.3481,
      "step": 2660
    },
    {
      "epoch": 1.533601378518093,
      "grad_norm": 12.328543663024902,
      "learning_rate": 4.669730040206778e-06,
      "loss": 0.3124,
      "step": 2670
    },
    {
      "epoch": 1.5393452039058013,
      "grad_norm": 9.337592124938965,
      "learning_rate": 4.612291786329696e-06,
      "loss": 0.3005,
      "step": 2680
    },
    {
      "epoch": 1.5450890292935093,
      "grad_norm": 5.477969646453857,
      "learning_rate": 4.5548535324526135e-06,
      "loss": 0.3457,
      "step": 2690
    },
    {
      "epoch": 1.5508328546812176,
      "grad_norm": 5.083920955657959,
      "learning_rate": 4.497415278575532e-06,
      "loss": 0.2858,
      "step": 2700
    },
    {
      "epoch": 1.5565766800689258,
      "grad_norm": 6.250855445861816,
      "learning_rate": 4.439977024698449e-06,
      "loss": 0.2673,
      "step": 2710
    },
    {
      "epoch": 1.562320505456634,
      "grad_norm": 6.169952392578125,
      "learning_rate": 4.382538770821368e-06,
      "loss": 0.2971,
      "step": 2720
    },
    {
      "epoch": 1.5680643308443423,
      "grad_norm": 8.261754989624023,
      "learning_rate": 4.325100516944285e-06,
      "loss": 0.3009,
      "step": 2730
    },
    {
      "epoch": 1.5738081562320505,
      "grad_norm": 8.477384567260742,
      "learning_rate": 4.267662263067203e-06,
      "loss": 0.3017,
      "step": 2740
    },
    {
      "epoch": 1.5795519816197587,
      "grad_norm": 8.52374267578125,
      "learning_rate": 4.210224009190121e-06,
      "loss": 0.4511,
      "step": 2750
    },
    {
      "epoch": 1.585295807007467,
      "grad_norm": 8.646858215332031,
      "learning_rate": 4.152785755313039e-06,
      "loss": 0.2699,
      "step": 2760
    },
    {
      "epoch": 1.5910396323951752,
      "grad_norm": 7.500174522399902,
      "learning_rate": 4.095347501435957e-06,
      "loss": 0.2743,
      "step": 2770
    },
    {
      "epoch": 1.5967834577828834,
      "grad_norm": 5.454465389251709,
      "learning_rate": 4.037909247558874e-06,
      "loss": 0.3023,
      "step": 2780
    },
    {
      "epoch": 1.6025272831705917,
      "grad_norm": 2.6998019218444824,
      "learning_rate": 3.9804709936817925e-06,
      "loss": 0.3941,
      "step": 2790
    },
    {
      "epoch": 1.6082711085583,
      "grad_norm": 4.594570159912109,
      "learning_rate": 3.92303273980471e-06,
      "loss": 0.252,
      "step": 2800
    },
    {
      "epoch": 1.6140149339460081,
      "grad_norm": 5.87538480758667,
      "learning_rate": 3.865594485927628e-06,
      "loss": 0.3093,
      "step": 2810
    },
    {
      "epoch": 1.6197587593337164,
      "grad_norm": 5.358250617980957,
      "learning_rate": 3.808156232050546e-06,
      "loss": 0.3674,
      "step": 2820
    },
    {
      "epoch": 1.6255025847214246,
      "grad_norm": 4.871222972869873,
      "learning_rate": 3.7507179781734636e-06,
      "loss": 0.2407,
      "step": 2830
    },
    {
      "epoch": 1.6312464101091326,
      "grad_norm": 6.6836838722229,
      "learning_rate": 3.693279724296382e-06,
      "loss": 0.2978,
      "step": 2840
    },
    {
      "epoch": 1.6369902354968409,
      "grad_norm": 7.67780065536499,
      "learning_rate": 3.6358414704192994e-06,
      "loss": 0.3655,
      "step": 2850
    },
    {
      "epoch": 1.642734060884549,
      "grad_norm": 6.890137672424316,
      "learning_rate": 3.5784032165422173e-06,
      "loss": 0.3618,
      "step": 2860
    },
    {
      "epoch": 1.6484778862722573,
      "grad_norm": 7.769665241241455,
      "learning_rate": 3.5209649626651356e-06,
      "loss": 0.2703,
      "step": 2870
    },
    {
      "epoch": 1.6542217116599656,
      "grad_norm": 4.888299465179443,
      "learning_rate": 3.463526708788053e-06,
      "loss": 0.3257,
      "step": 2880
    },
    {
      "epoch": 1.6599655370476738,
      "grad_norm": 4.726266384124756,
      "learning_rate": 3.406088454910971e-06,
      "loss": 0.3573,
      "step": 2890
    },
    {
      "epoch": 1.6657093624353818,
      "grad_norm": 6.836297035217285,
      "learning_rate": 3.348650201033889e-06,
      "loss": 0.3325,
      "step": 2900
    },
    {
      "epoch": 1.67145318782309,
      "grad_norm": 6.571508884429932,
      "learning_rate": 3.2912119471568067e-06,
      "loss": 0.292,
      "step": 2910
    },
    {
      "epoch": 1.6771970132107983,
      "grad_norm": 9.843769073486328,
      "learning_rate": 3.2337736932797246e-06,
      "loss": 0.4027,
      "step": 2920
    },
    {
      "epoch": 1.6829408385985065,
      "grad_norm": 6.089911937713623,
      "learning_rate": 3.1763354394026425e-06,
      "loss": 0.3627,
      "step": 2930
    },
    {
      "epoch": 1.6886846639862148,
      "grad_norm": 6.846927165985107,
      "learning_rate": 3.11889718552556e-06,
      "loss": 0.3541,
      "step": 2940
    },
    {
      "epoch": 1.694428489373923,
      "grad_norm": 10.23181438446045,
      "learning_rate": 3.0614589316484783e-06,
      "loss": 0.3418,
      "step": 2950
    },
    {
      "epoch": 1.7001723147616312,
      "grad_norm": 5.403523921966553,
      "learning_rate": 3.0040206777713958e-06,
      "loss": 0.3498,
      "step": 2960
    },
    {
      "epoch": 1.7059161401493395,
      "grad_norm": 8.252917289733887,
      "learning_rate": 2.9465824238943137e-06,
      "loss": 0.3401,
      "step": 2970
    },
    {
      "epoch": 1.7116599655370477,
      "grad_norm": 7.06523323059082,
      "learning_rate": 2.889144170017232e-06,
      "loss": 0.2331,
      "step": 2980
    },
    {
      "epoch": 1.717403790924756,
      "grad_norm": 7.739984035491943,
      "learning_rate": 2.8317059161401494e-06,
      "loss": 0.3678,
      "step": 2990
    },
    {
      "epoch": 1.7231476163124642,
      "grad_norm": 4.021157741546631,
      "learning_rate": 2.7742676622630677e-06,
      "loss": 0.3537,
      "step": 3000
    }
  ],
  "logging_steps": 10,
  "max_steps": 3482,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6356845526704128.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}