Text Classification
Transformers
Safetensors
English
distilbert
Safety
Content Moderation
Hate Speech Detection
Toxicity Detection
PurrBERT-v1.1 / checkpoint-3000 /trainer_state.json
FlameF0X's picture
Upload folder using huggingface_hub
a572ebe verified
raw
history blame
52.7 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.7231476163124642,
"eval_steps": 100,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005743825387708214,
"grad_norm": 1.664604902267456,
"learning_rate": 1.9948305571510626e-05,
"loss": 0.6766,
"step": 10
},
{
"epoch": 0.011487650775416428,
"grad_norm": 1.8649318218231201,
"learning_rate": 1.9890867317633546e-05,
"loss": 0.6443,
"step": 20
},
{
"epoch": 0.01723147616312464,
"grad_norm": 2.0676939487457275,
"learning_rate": 1.9833429063756463e-05,
"loss": 0.6516,
"step": 30
},
{
"epoch": 0.022975301550832855,
"grad_norm": 2.077414035797119,
"learning_rate": 1.9775990809879383e-05,
"loss": 0.5468,
"step": 40
},
{
"epoch": 0.02871912693854107,
"grad_norm": 2.4310338497161865,
"learning_rate": 1.97185525560023e-05,
"loss": 0.6289,
"step": 50
},
{
"epoch": 0.03446295232624928,
"grad_norm": 2.605480670928955,
"learning_rate": 1.9661114302125216e-05,
"loss": 0.583,
"step": 60
},
{
"epoch": 0.040206777713957496,
"grad_norm": 2.7097268104553223,
"learning_rate": 1.9603676048248136e-05,
"loss": 0.6029,
"step": 70
},
{
"epoch": 0.04595060310166571,
"grad_norm": 4.125054359436035,
"learning_rate": 1.9546237794371053e-05,
"loss": 0.5545,
"step": 80
},
{
"epoch": 0.051694428489373924,
"grad_norm": 4.592983245849609,
"learning_rate": 1.948879954049397e-05,
"loss": 0.5027,
"step": 90
},
{
"epoch": 0.05743825387708214,
"grad_norm": 4.799990653991699,
"learning_rate": 1.9431361286616886e-05,
"loss": 0.4913,
"step": 100
},
{
"epoch": 0.06318207926479034,
"grad_norm": 5.146159648895264,
"learning_rate": 1.9373923032739806e-05,
"loss": 0.5419,
"step": 110
},
{
"epoch": 0.06892590465249857,
"grad_norm": 6.291711807250977,
"learning_rate": 1.9316484778862726e-05,
"loss": 0.478,
"step": 120
},
{
"epoch": 0.07466973004020677,
"grad_norm": 4.076691627502441,
"learning_rate": 1.9259046524985643e-05,
"loss": 0.489,
"step": 130
},
{
"epoch": 0.08041355542791499,
"grad_norm": 5.057150363922119,
"learning_rate": 1.920160827110856e-05,
"loss": 0.5435,
"step": 140
},
{
"epoch": 0.0861573808156232,
"grad_norm": 3.5335211753845215,
"learning_rate": 1.914417001723148e-05,
"loss": 0.4445,
"step": 150
},
{
"epoch": 0.09190120620333142,
"grad_norm": 6.209961414337158,
"learning_rate": 1.9086731763354396e-05,
"loss": 0.4516,
"step": 160
},
{
"epoch": 0.09764503159103963,
"grad_norm": 7.1031341552734375,
"learning_rate": 1.9029293509477313e-05,
"loss": 0.576,
"step": 170
},
{
"epoch": 0.10338885697874785,
"grad_norm": 4.959975719451904,
"learning_rate": 1.8971855255600233e-05,
"loss": 0.4841,
"step": 180
},
{
"epoch": 0.10913268236645605,
"grad_norm": 4.904316425323486,
"learning_rate": 1.891441700172315e-05,
"loss": 0.5007,
"step": 190
},
{
"epoch": 0.11487650775416428,
"grad_norm": 8.833961486816406,
"learning_rate": 1.8856978747846066e-05,
"loss": 0.5416,
"step": 200
},
{
"epoch": 0.12062033314187248,
"grad_norm": 6.040881156921387,
"learning_rate": 1.8799540493968982e-05,
"loss": 0.5162,
"step": 210
},
{
"epoch": 0.1263641585295807,
"grad_norm": 7.167252063751221,
"learning_rate": 1.8742102240091902e-05,
"loss": 0.4965,
"step": 220
},
{
"epoch": 0.13210798391728892,
"grad_norm": 3.9503183364868164,
"learning_rate": 1.8684663986214822e-05,
"loss": 0.5129,
"step": 230
},
{
"epoch": 0.13785180930499713,
"grad_norm": 4.403937339782715,
"learning_rate": 1.862722573233774e-05,
"loss": 0.5192,
"step": 240
},
{
"epoch": 0.14359563469270534,
"grad_norm": 6.813930034637451,
"learning_rate": 1.8569787478460656e-05,
"loss": 0.4927,
"step": 250
},
{
"epoch": 0.14933946008041354,
"grad_norm": 4.355352878570557,
"learning_rate": 1.8512349224583576e-05,
"loss": 0.4857,
"step": 260
},
{
"epoch": 0.15508328546812178,
"grad_norm": 4.012150287628174,
"learning_rate": 1.8454910970706492e-05,
"loss": 0.3763,
"step": 270
},
{
"epoch": 0.16082711085582999,
"grad_norm": 7.179994106292725,
"learning_rate": 1.839747271682941e-05,
"loss": 0.4625,
"step": 280
},
{
"epoch": 0.1665709362435382,
"grad_norm": 3.701215982437134,
"learning_rate": 1.834003446295233e-05,
"loss": 0.4954,
"step": 290
},
{
"epoch": 0.1723147616312464,
"grad_norm": 5.150369167327881,
"learning_rate": 1.8282596209075246e-05,
"loss": 0.4875,
"step": 300
},
{
"epoch": 0.17805858701895463,
"grad_norm": 4.432060241699219,
"learning_rate": 1.8225157955198162e-05,
"loss": 0.5032,
"step": 310
},
{
"epoch": 0.18380241240666284,
"grad_norm": 4.560088634490967,
"learning_rate": 1.816771970132108e-05,
"loss": 0.4443,
"step": 320
},
{
"epoch": 0.18954623779437105,
"grad_norm": 6.000060558319092,
"learning_rate": 1.8110281447444e-05,
"loss": 0.4805,
"step": 330
},
{
"epoch": 0.19529006318207925,
"grad_norm": 6.462589740753174,
"learning_rate": 1.805284319356692e-05,
"loss": 0.4009,
"step": 340
},
{
"epoch": 0.2010338885697875,
"grad_norm": 4.6327104568481445,
"learning_rate": 1.7995404939689835e-05,
"loss": 0.5126,
"step": 350
},
{
"epoch": 0.2067777139574957,
"grad_norm": 5.640307426452637,
"learning_rate": 1.7937966685812752e-05,
"loss": 0.4308,
"step": 360
},
{
"epoch": 0.2125215393452039,
"grad_norm": 6.553674697875977,
"learning_rate": 1.7880528431935672e-05,
"loss": 0.4686,
"step": 370
},
{
"epoch": 0.2182653647329121,
"grad_norm": 6.482476234436035,
"learning_rate": 1.782309017805859e-05,
"loss": 0.4214,
"step": 380
},
{
"epoch": 0.22400919012062034,
"grad_norm": 6.897400379180908,
"learning_rate": 1.7765651924181505e-05,
"loss": 0.4827,
"step": 390
},
{
"epoch": 0.22975301550832855,
"grad_norm": 3.935074806213379,
"learning_rate": 1.7708213670304425e-05,
"loss": 0.4212,
"step": 400
},
{
"epoch": 0.23549684089603676,
"grad_norm": 5.766780853271484,
"learning_rate": 1.7650775416427342e-05,
"loss": 0.4421,
"step": 410
},
{
"epoch": 0.24124066628374496,
"grad_norm": 8.834909439086914,
"learning_rate": 1.759333716255026e-05,
"loss": 0.4676,
"step": 420
},
{
"epoch": 0.2469844916714532,
"grad_norm": 4.853402614593506,
"learning_rate": 1.7535898908673175e-05,
"loss": 0.4888,
"step": 430
},
{
"epoch": 0.2527283170591614,
"grad_norm": 5.489238739013672,
"learning_rate": 1.7478460654796095e-05,
"loss": 0.3763,
"step": 440
},
{
"epoch": 0.2584721424468696,
"grad_norm": 3.805001974105835,
"learning_rate": 1.7421022400919015e-05,
"loss": 0.5266,
"step": 450
},
{
"epoch": 0.26421596783457785,
"grad_norm": 5.9350080490112305,
"learning_rate": 1.7363584147041932e-05,
"loss": 0.4053,
"step": 460
},
{
"epoch": 0.269959793222286,
"grad_norm": 8.31826400756836,
"learning_rate": 1.730614589316485e-05,
"loss": 0.4775,
"step": 470
},
{
"epoch": 0.27570361860999426,
"grad_norm": 5.709866046905518,
"learning_rate": 1.724870763928777e-05,
"loss": 0.3681,
"step": 480
},
{
"epoch": 0.2814474439977025,
"grad_norm": 2.6657683849334717,
"learning_rate": 1.7191269385410685e-05,
"loss": 0.3913,
"step": 490
},
{
"epoch": 0.2871912693854107,
"grad_norm": 8.942373275756836,
"learning_rate": 1.71338311315336e-05,
"loss": 0.4454,
"step": 500
},
{
"epoch": 0.2929350947731189,
"grad_norm": 7.566854476928711,
"learning_rate": 1.707639287765652e-05,
"loss": 0.4704,
"step": 510
},
{
"epoch": 0.2986789201608271,
"grad_norm": 5.819560527801514,
"learning_rate": 1.7018954623779438e-05,
"loss": 0.4245,
"step": 520
},
{
"epoch": 0.3044227455485353,
"grad_norm": 7.269554138183594,
"learning_rate": 1.6961516369902355e-05,
"loss": 0.523,
"step": 530
},
{
"epoch": 0.31016657093624356,
"grad_norm": 4.569669246673584,
"learning_rate": 1.690407811602527e-05,
"loss": 0.5053,
"step": 540
},
{
"epoch": 0.31591039632395174,
"grad_norm": 6.8373332023620605,
"learning_rate": 1.684663986214819e-05,
"loss": 0.4592,
"step": 550
},
{
"epoch": 0.32165422171165997,
"grad_norm": 7.736797332763672,
"learning_rate": 1.678920160827111e-05,
"loss": 0.4756,
"step": 560
},
{
"epoch": 0.3273980470993682,
"grad_norm": 5.857370853424072,
"learning_rate": 1.6731763354394028e-05,
"loss": 0.4906,
"step": 570
},
{
"epoch": 0.3331418724870764,
"grad_norm": 4.2734785079956055,
"learning_rate": 1.6674325100516945e-05,
"loss": 0.4623,
"step": 580
},
{
"epoch": 0.3388856978747846,
"grad_norm": 7.859753131866455,
"learning_rate": 1.6616886846639865e-05,
"loss": 0.4568,
"step": 590
},
{
"epoch": 0.3446295232624928,
"grad_norm": 4.936352252960205,
"learning_rate": 1.655944859276278e-05,
"loss": 0.3774,
"step": 600
},
{
"epoch": 0.35037334865020103,
"grad_norm": 7.164617538452148,
"learning_rate": 1.6502010338885698e-05,
"loss": 0.5373,
"step": 610
},
{
"epoch": 0.35611717403790927,
"grad_norm": 7.279328346252441,
"learning_rate": 1.6444572085008618e-05,
"loss": 0.4183,
"step": 620
},
{
"epoch": 0.36186099942561745,
"grad_norm": 8.805562973022461,
"learning_rate": 1.6387133831131535e-05,
"loss": 0.4232,
"step": 630
},
{
"epoch": 0.3676048248133257,
"grad_norm": 6.055676460266113,
"learning_rate": 1.632969557725445e-05,
"loss": 0.4316,
"step": 640
},
{
"epoch": 0.3733486502010339,
"grad_norm": 5.835373401641846,
"learning_rate": 1.627225732337737e-05,
"loss": 0.4573,
"step": 650
},
{
"epoch": 0.3790924755887421,
"grad_norm": 2.7914931774139404,
"learning_rate": 1.6214819069500288e-05,
"loss": 0.3691,
"step": 660
},
{
"epoch": 0.38483630097645033,
"grad_norm": 5.780324459075928,
"learning_rate": 1.6157380815623208e-05,
"loss": 0.3761,
"step": 670
},
{
"epoch": 0.3905801263641585,
"grad_norm": 5.7374348640441895,
"learning_rate": 1.6099942561746125e-05,
"loss": 0.4276,
"step": 680
},
{
"epoch": 0.39632395175186674,
"grad_norm": 4.771572589874268,
"learning_rate": 1.604250430786904e-05,
"loss": 0.4092,
"step": 690
},
{
"epoch": 0.402067777139575,
"grad_norm": 7.6168928146362305,
"learning_rate": 1.598506605399196e-05,
"loss": 0.4651,
"step": 700
},
{
"epoch": 0.40781160252728316,
"grad_norm": 6.272294998168945,
"learning_rate": 1.5927627800114878e-05,
"loss": 0.4107,
"step": 710
},
{
"epoch": 0.4135554279149914,
"grad_norm": 8.060046195983887,
"learning_rate": 1.5870189546237794e-05,
"loss": 0.4117,
"step": 720
},
{
"epoch": 0.41929925330269957,
"grad_norm": 7.290645122528076,
"learning_rate": 1.5812751292360714e-05,
"loss": 0.5005,
"step": 730
},
{
"epoch": 0.4250430786904078,
"grad_norm": 8.02639102935791,
"learning_rate": 1.575531303848363e-05,
"loss": 0.4561,
"step": 740
},
{
"epoch": 0.43078690407811604,
"grad_norm": 8.455704689025879,
"learning_rate": 1.5697874784606548e-05,
"loss": 0.4592,
"step": 750
},
{
"epoch": 0.4365307294658242,
"grad_norm": 2.9264323711395264,
"learning_rate": 1.5640436530729468e-05,
"loss": 0.4532,
"step": 760
},
{
"epoch": 0.44227455485353245,
"grad_norm": 5.246438980102539,
"learning_rate": 1.5582998276852384e-05,
"loss": 0.4931,
"step": 770
},
{
"epoch": 0.4480183802412407,
"grad_norm": 3.869628667831421,
"learning_rate": 1.5525560022975304e-05,
"loss": 0.3476,
"step": 780
},
{
"epoch": 0.45376220562894887,
"grad_norm": 4.742978572845459,
"learning_rate": 1.546812176909822e-05,
"loss": 0.4274,
"step": 790
},
{
"epoch": 0.4595060310166571,
"grad_norm": 6.9326300621032715,
"learning_rate": 1.5410683515221138e-05,
"loss": 0.5299,
"step": 800
},
{
"epoch": 0.4652498564043653,
"grad_norm": 6.16196346282959,
"learning_rate": 1.5353245261344058e-05,
"loss": 0.4805,
"step": 810
},
{
"epoch": 0.4709936817920735,
"grad_norm": 3.637753486633301,
"learning_rate": 1.5295807007466974e-05,
"loss": 0.4393,
"step": 820
},
{
"epoch": 0.47673750717978175,
"grad_norm": 3.867966413497925,
"learning_rate": 1.523836875358989e-05,
"loss": 0.4358,
"step": 830
},
{
"epoch": 0.48248133256748993,
"grad_norm": 4.753304481506348,
"learning_rate": 1.5180930499712809e-05,
"loss": 0.4737,
"step": 840
},
{
"epoch": 0.48822515795519816,
"grad_norm": 7.7434515953063965,
"learning_rate": 1.5123492245835727e-05,
"loss": 0.4466,
"step": 850
},
{
"epoch": 0.4939689833429064,
"grad_norm": 6.001241683959961,
"learning_rate": 1.5066053991958644e-05,
"loss": 0.4016,
"step": 860
},
{
"epoch": 0.4997128087306146,
"grad_norm": 7.710599422454834,
"learning_rate": 1.5008615738081564e-05,
"loss": 0.4758,
"step": 870
},
{
"epoch": 0.5054566341183228,
"grad_norm": 5.139091491699219,
"learning_rate": 1.4951177484204482e-05,
"loss": 0.4019,
"step": 880
},
{
"epoch": 0.511200459506031,
"grad_norm": 9.428766250610352,
"learning_rate": 1.4893739230327399e-05,
"loss": 0.3742,
"step": 890
},
{
"epoch": 0.5169442848937392,
"grad_norm": 9.071430206298828,
"learning_rate": 1.4836300976450317e-05,
"loss": 0.4183,
"step": 900
},
{
"epoch": 0.5226881102814475,
"grad_norm": 10.34457015991211,
"learning_rate": 1.4778862722573236e-05,
"loss": 0.437,
"step": 910
},
{
"epoch": 0.5284319356691557,
"grad_norm": 7.01099967956543,
"learning_rate": 1.4721424468696152e-05,
"loss": 0.4421,
"step": 920
},
{
"epoch": 0.5341757610568638,
"grad_norm": 3.454484701156616,
"learning_rate": 1.466398621481907e-05,
"loss": 0.4283,
"step": 930
},
{
"epoch": 0.539919586444572,
"grad_norm": 5.86787223815918,
"learning_rate": 1.4606547960941987e-05,
"loss": 0.4094,
"step": 940
},
{
"epoch": 0.5456634118322803,
"grad_norm": 8.41640567779541,
"learning_rate": 1.4549109707064906e-05,
"loss": 0.414,
"step": 950
},
{
"epoch": 0.5514072372199885,
"grad_norm": 6.936724662780762,
"learning_rate": 1.4491671453187824e-05,
"loss": 0.3936,
"step": 960
},
{
"epoch": 0.5571510626076968,
"grad_norm": 7.984805583953857,
"learning_rate": 1.4434233199310744e-05,
"loss": 0.4071,
"step": 970
},
{
"epoch": 0.562894887995405,
"grad_norm": 4.381742477416992,
"learning_rate": 1.437679494543366e-05,
"loss": 0.3393,
"step": 980
},
{
"epoch": 0.5686387133831131,
"grad_norm": 7.115252494812012,
"learning_rate": 1.4319356691556579e-05,
"loss": 0.5616,
"step": 990
},
{
"epoch": 0.5743825387708213,
"grad_norm": 6.60570764541626,
"learning_rate": 1.4261918437679495e-05,
"loss": 0.4633,
"step": 1000
},
{
"epoch": 0.5801263641585296,
"grad_norm": 9.552061080932617,
"learning_rate": 1.4204480183802414e-05,
"loss": 0.4973,
"step": 1010
},
{
"epoch": 0.5858701895462378,
"grad_norm": 3.6467580795288086,
"learning_rate": 1.4147041929925332e-05,
"loss": 0.3958,
"step": 1020
},
{
"epoch": 0.591614014933946,
"grad_norm": 2.663799524307251,
"learning_rate": 1.4089603676048249e-05,
"loss": 0.3402,
"step": 1030
},
{
"epoch": 0.5973578403216542,
"grad_norm": 7.468900203704834,
"learning_rate": 1.4032165422171167e-05,
"loss": 0.3341,
"step": 1040
},
{
"epoch": 0.6031016657093624,
"grad_norm": 10.396268844604492,
"learning_rate": 1.3974727168294084e-05,
"loss": 0.5122,
"step": 1050
},
{
"epoch": 0.6088454910970706,
"grad_norm": 7.79450798034668,
"learning_rate": 1.3917288914417002e-05,
"loss": 0.5581,
"step": 1060
},
{
"epoch": 0.6145893164847789,
"grad_norm": 8.077397346496582,
"learning_rate": 1.385985066053992e-05,
"loss": 0.4247,
"step": 1070
},
{
"epoch": 0.6203331418724871,
"grad_norm": 8.327542304992676,
"learning_rate": 1.380241240666284e-05,
"loss": 0.4122,
"step": 1080
},
{
"epoch": 0.6260769672601952,
"grad_norm": 7.940774917602539,
"learning_rate": 1.3744974152785757e-05,
"loss": 0.5199,
"step": 1090
},
{
"epoch": 0.6318207926479035,
"grad_norm": 5.148271560668945,
"learning_rate": 1.3687535898908675e-05,
"loss": 0.4534,
"step": 1100
},
{
"epoch": 0.6375646180356117,
"grad_norm": 7.042996883392334,
"learning_rate": 1.3630097645031592e-05,
"loss": 0.4737,
"step": 1110
},
{
"epoch": 0.6433084434233199,
"grad_norm": 5.131284236907959,
"learning_rate": 1.357265939115451e-05,
"loss": 0.3637,
"step": 1120
},
{
"epoch": 0.6490522688110282,
"grad_norm": 10.73865795135498,
"learning_rate": 1.3515221137277428e-05,
"loss": 0.4152,
"step": 1130
},
{
"epoch": 0.6547960941987364,
"grad_norm": 6.061016082763672,
"learning_rate": 1.3457782883400345e-05,
"loss": 0.343,
"step": 1140
},
{
"epoch": 0.6605399195864445,
"grad_norm": 11.200201988220215,
"learning_rate": 1.3400344629523263e-05,
"loss": 0.4781,
"step": 1150
},
{
"epoch": 0.6662837449741528,
"grad_norm": 6.987539768218994,
"learning_rate": 1.334290637564618e-05,
"loss": 0.4046,
"step": 1160
},
{
"epoch": 0.672027570361861,
"grad_norm": 7.3787713050842285,
"learning_rate": 1.3285468121769098e-05,
"loss": 0.4136,
"step": 1170
},
{
"epoch": 0.6777713957495692,
"grad_norm": 8.829427719116211,
"learning_rate": 1.3228029867892018e-05,
"loss": 0.3807,
"step": 1180
},
{
"epoch": 0.6835152211372775,
"grad_norm": 9.648842811584473,
"learning_rate": 1.3170591614014937e-05,
"loss": 0.3273,
"step": 1190
},
{
"epoch": 0.6892590465249856,
"grad_norm": 7.307587146759033,
"learning_rate": 1.3113153360137853e-05,
"loss": 0.3351,
"step": 1200
},
{
"epoch": 0.6950028719126938,
"grad_norm": 6.445584297180176,
"learning_rate": 1.3055715106260772e-05,
"loss": 0.4776,
"step": 1210
},
{
"epoch": 0.7007466973004021,
"grad_norm": 7.078349590301514,
"learning_rate": 1.2998276852383688e-05,
"loss": 0.4438,
"step": 1220
},
{
"epoch": 0.7064905226881103,
"grad_norm": 9.63571834564209,
"learning_rate": 1.2940838598506606e-05,
"loss": 0.4027,
"step": 1230
},
{
"epoch": 0.7122343480758185,
"grad_norm": 3.2861080169677734,
"learning_rate": 1.2883400344629525e-05,
"loss": 0.3334,
"step": 1240
},
{
"epoch": 0.7179781734635267,
"grad_norm": 7.433917999267578,
"learning_rate": 1.2825962090752441e-05,
"loss": 0.4418,
"step": 1250
},
{
"epoch": 0.7237219988512349,
"grad_norm": 7.511765003204346,
"learning_rate": 1.276852383687536e-05,
"loss": 0.3693,
"step": 1260
},
{
"epoch": 0.7294658242389431,
"grad_norm": 9.38160228729248,
"learning_rate": 1.2711085582998276e-05,
"loss": 0.3989,
"step": 1270
},
{
"epoch": 0.7352096496266514,
"grad_norm": 2.012756586074829,
"learning_rate": 1.2653647329121195e-05,
"loss": 0.3867,
"step": 1280
},
{
"epoch": 0.7409534750143596,
"grad_norm": 6.777096748352051,
"learning_rate": 1.2596209075244115e-05,
"loss": 0.5528,
"step": 1290
},
{
"epoch": 0.7466973004020678,
"grad_norm": 6.928145885467529,
"learning_rate": 1.2538770821367033e-05,
"loss": 0.3403,
"step": 1300
},
{
"epoch": 0.752441125789776,
"grad_norm": 7.99967622756958,
"learning_rate": 1.248133256748995e-05,
"loss": 0.5006,
"step": 1310
},
{
"epoch": 0.7581849511774842,
"grad_norm": 4.8364033699035645,
"learning_rate": 1.2423894313612868e-05,
"loss": 0.4266,
"step": 1320
},
{
"epoch": 0.7639287765651924,
"grad_norm": 4.017831802368164,
"learning_rate": 1.2366456059735785e-05,
"loss": 0.3444,
"step": 1330
},
{
"epoch": 0.7696726019529007,
"grad_norm": 5.27449893951416,
"learning_rate": 1.2309017805858703e-05,
"loss": 0.3962,
"step": 1340
},
{
"epoch": 0.7754164273406089,
"grad_norm": 7.989853858947754,
"learning_rate": 1.2251579551981621e-05,
"loss": 0.4172,
"step": 1350
},
{
"epoch": 0.781160252728317,
"grad_norm": 5.878440856933594,
"learning_rate": 1.2194141298104538e-05,
"loss": 0.4723,
"step": 1360
},
{
"epoch": 0.7869040781160253,
"grad_norm": 9.140411376953125,
"learning_rate": 1.2136703044227456e-05,
"loss": 0.4012,
"step": 1370
},
{
"epoch": 0.7926479035037335,
"grad_norm": 3.9119012355804443,
"learning_rate": 1.2079264790350373e-05,
"loss": 0.3545,
"step": 1380
},
{
"epoch": 0.7983917288914417,
"grad_norm": 6.248933792114258,
"learning_rate": 1.2021826536473291e-05,
"loss": 0.4787,
"step": 1390
},
{
"epoch": 0.80413555427915,
"grad_norm": 8.478959083557129,
"learning_rate": 1.1964388282596211e-05,
"loss": 0.4077,
"step": 1400
},
{
"epoch": 0.8098793796668581,
"grad_norm": 6.234384059906006,
"learning_rate": 1.190695002871913e-05,
"loss": 0.4044,
"step": 1410
},
{
"epoch": 0.8156232050545663,
"grad_norm": 5.093031883239746,
"learning_rate": 1.1849511774842046e-05,
"loss": 0.3298,
"step": 1420
},
{
"epoch": 0.8213670304422745,
"grad_norm": 5.755350112915039,
"learning_rate": 1.1792073520964964e-05,
"loss": 0.4099,
"step": 1430
},
{
"epoch": 0.8271108558299828,
"grad_norm": 9.269704818725586,
"learning_rate": 1.1734635267087881e-05,
"loss": 0.366,
"step": 1440
},
{
"epoch": 0.832854681217691,
"grad_norm": 4.977533340454102,
"learning_rate": 1.16771970132108e-05,
"loss": 0.3156,
"step": 1450
},
{
"epoch": 0.8385985066053991,
"grad_norm": 6.767063140869141,
"learning_rate": 1.1619758759333718e-05,
"loss": 0.3966,
"step": 1460
},
{
"epoch": 0.8443423319931074,
"grad_norm": 6.855627536773682,
"learning_rate": 1.1562320505456634e-05,
"loss": 0.3488,
"step": 1470
},
{
"epoch": 0.8500861573808156,
"grad_norm": 3.408679723739624,
"learning_rate": 1.1504882251579552e-05,
"loss": 0.4016,
"step": 1480
},
{
"epoch": 0.8558299827685238,
"grad_norm": 8.43376636505127,
"learning_rate": 1.1447443997702469e-05,
"loss": 0.3951,
"step": 1490
},
{
"epoch": 0.8615738081562321,
"grad_norm": 7.106573104858398,
"learning_rate": 1.1390005743825389e-05,
"loss": 0.3056,
"step": 1500
},
{
"epoch": 0.8673176335439403,
"grad_norm": 3.373734474182129,
"learning_rate": 1.1332567489948307e-05,
"loss": 0.4548,
"step": 1510
},
{
"epoch": 0.8730614589316484,
"grad_norm": 4.657841205596924,
"learning_rate": 1.1275129236071226e-05,
"loss": 0.36,
"step": 1520
},
{
"epoch": 0.8788052843193567,
"grad_norm": 8.218329429626465,
"learning_rate": 1.1217690982194142e-05,
"loss": 0.4297,
"step": 1530
},
{
"epoch": 0.8845491097070649,
"grad_norm": 7.709052562713623,
"learning_rate": 1.116025272831706e-05,
"loss": 0.3766,
"step": 1540
},
{
"epoch": 0.8902929350947731,
"grad_norm": 6.875143527984619,
"learning_rate": 1.1102814474439977e-05,
"loss": 0.4106,
"step": 1550
},
{
"epoch": 0.8960367604824814,
"grad_norm": 5.460892200469971,
"learning_rate": 1.1045376220562896e-05,
"loss": 0.3016,
"step": 1560
},
{
"epoch": 0.9017805858701895,
"grad_norm": 3.4830429553985596,
"learning_rate": 1.0987937966685814e-05,
"loss": 0.371,
"step": 1570
},
{
"epoch": 0.9075244112578977,
"grad_norm": 8.233579635620117,
"learning_rate": 1.093049971280873e-05,
"loss": 0.4591,
"step": 1580
},
{
"epoch": 0.913268236645606,
"grad_norm": 7.001081466674805,
"learning_rate": 1.0873061458931649e-05,
"loss": 0.3856,
"step": 1590
},
{
"epoch": 0.9190120620333142,
"grad_norm": 7.473963260650635,
"learning_rate": 1.0815623205054565e-05,
"loss": 0.4937,
"step": 1600
},
{
"epoch": 0.9247558874210224,
"grad_norm": 4.046863079071045,
"learning_rate": 1.0758184951177485e-05,
"loss": 0.4141,
"step": 1610
},
{
"epoch": 0.9304997128087306,
"grad_norm": 5.425885200500488,
"learning_rate": 1.0700746697300404e-05,
"loss": 0.3545,
"step": 1620
},
{
"epoch": 0.9362435381964388,
"grad_norm": 5.255967140197754,
"learning_rate": 1.0643308443423322e-05,
"loss": 0.3881,
"step": 1630
},
{
"epoch": 0.941987363584147,
"grad_norm": 7.365703105926514,
"learning_rate": 1.0585870189546239e-05,
"loss": 0.3648,
"step": 1640
},
{
"epoch": 0.9477311889718553,
"grad_norm": 5.149658679962158,
"learning_rate": 1.0528431935669157e-05,
"loss": 0.2952,
"step": 1650
},
{
"epoch": 0.9534750143595635,
"grad_norm": 5.5968194007873535,
"learning_rate": 1.0470993681792074e-05,
"loss": 0.3604,
"step": 1660
},
{
"epoch": 0.9592188397472717,
"grad_norm": 6.176368713378906,
"learning_rate": 1.0413555427914992e-05,
"loss": 0.3584,
"step": 1670
},
{
"epoch": 0.9649626651349799,
"grad_norm": 5.876338958740234,
"learning_rate": 1.035611717403791e-05,
"loss": 0.4076,
"step": 1680
},
{
"epoch": 0.9707064905226881,
"grad_norm": 11.697908401489258,
"learning_rate": 1.0298678920160827e-05,
"loss": 0.3648,
"step": 1690
},
{
"epoch": 0.9764503159103963,
"grad_norm": 7.04163122177124,
"learning_rate": 1.0241240666283745e-05,
"loss": 0.349,
"step": 1700
},
{
"epoch": 0.9821941412981046,
"grad_norm": 6.7707133293151855,
"learning_rate": 1.0183802412406662e-05,
"loss": 0.3888,
"step": 1710
},
{
"epoch": 0.9879379666858128,
"grad_norm": 3.8108270168304443,
"learning_rate": 1.0126364158529582e-05,
"loss": 0.3753,
"step": 1720
},
{
"epoch": 0.9936817920735209,
"grad_norm": 11.013320922851562,
"learning_rate": 1.00689259046525e-05,
"loss": 0.3977,
"step": 1730
},
{
"epoch": 0.9994256174612292,
"grad_norm": 4.042791843414307,
"learning_rate": 1.0011487650775419e-05,
"loss": 0.3724,
"step": 1740
},
{
"epoch": 1.0051694428489375,
"grad_norm": 6.258309841156006,
"learning_rate": 9.954049396898335e-06,
"loss": 0.3591,
"step": 1750
},
{
"epoch": 1.0109132682366455,
"grad_norm": 7.884782314300537,
"learning_rate": 9.896611143021253e-06,
"loss": 0.4114,
"step": 1760
},
{
"epoch": 1.0166570936243537,
"grad_norm": 4.9663567543029785,
"learning_rate": 9.83917288914417e-06,
"loss": 0.4462,
"step": 1770
},
{
"epoch": 1.022400919012062,
"grad_norm": 7.046320915222168,
"learning_rate": 9.781734635267088e-06,
"loss": 0.3386,
"step": 1780
},
{
"epoch": 1.0281447443997702,
"grad_norm": 6.846945762634277,
"learning_rate": 9.724296381390007e-06,
"loss": 0.3181,
"step": 1790
},
{
"epoch": 1.0338885697874785,
"grad_norm": 5.925526142120361,
"learning_rate": 9.666858127512925e-06,
"loss": 0.3357,
"step": 1800
},
{
"epoch": 1.0396323951751867,
"grad_norm": 14.302725791931152,
"learning_rate": 9.609419873635842e-06,
"loss": 0.2859,
"step": 1810
},
{
"epoch": 1.045376220562895,
"grad_norm": 8.27291488647461,
"learning_rate": 9.55198161975876e-06,
"loss": 0.3688,
"step": 1820
},
{
"epoch": 1.0511200459506032,
"grad_norm": 5.266950607299805,
"learning_rate": 9.494543365881678e-06,
"loss": 0.3106,
"step": 1830
},
{
"epoch": 1.0568638713383114,
"grad_norm": 11.347005844116211,
"learning_rate": 9.437105112004595e-06,
"loss": 0.3389,
"step": 1840
},
{
"epoch": 1.0626076967260196,
"grad_norm": 4.7072906494140625,
"learning_rate": 9.379666858127515e-06,
"loss": 0.3519,
"step": 1850
},
{
"epoch": 1.0683515221137276,
"grad_norm": 4.05309534072876,
"learning_rate": 9.322228604250432e-06,
"loss": 0.3006,
"step": 1860
},
{
"epoch": 1.0740953475014359,
"grad_norm": 5.578520774841309,
"learning_rate": 9.26479035037335e-06,
"loss": 0.3645,
"step": 1870
},
{
"epoch": 1.079839172889144,
"grad_norm": 7.405791282653809,
"learning_rate": 9.207352096496266e-06,
"loss": 0.3104,
"step": 1880
},
{
"epoch": 1.0855829982768523,
"grad_norm": 9.269173622131348,
"learning_rate": 9.149913842619185e-06,
"loss": 0.3576,
"step": 1890
},
{
"epoch": 1.0913268236645606,
"grad_norm": 5.276297569274902,
"learning_rate": 9.092475588742103e-06,
"loss": 0.3354,
"step": 1900
},
{
"epoch": 1.0970706490522688,
"grad_norm": 8.320406913757324,
"learning_rate": 9.035037334865021e-06,
"loss": 0.3232,
"step": 1910
},
{
"epoch": 1.102814474439977,
"grad_norm": 6.023215293884277,
"learning_rate": 8.977599080987938e-06,
"loss": 0.3427,
"step": 1920
},
{
"epoch": 1.1085582998276853,
"grad_norm": 8.178590774536133,
"learning_rate": 8.920160827110856e-06,
"loss": 0.3103,
"step": 1930
},
{
"epoch": 1.1143021252153935,
"grad_norm": 6.056619644165039,
"learning_rate": 8.862722573233775e-06,
"loss": 0.3848,
"step": 1940
},
{
"epoch": 1.1200459506031017,
"grad_norm": 6.109485626220703,
"learning_rate": 8.805284319356693e-06,
"loss": 0.3102,
"step": 1950
},
{
"epoch": 1.12578977599081,
"grad_norm": 6.949984550476074,
"learning_rate": 8.747846065479611e-06,
"loss": 0.3013,
"step": 1960
},
{
"epoch": 1.1315336013785182,
"grad_norm": 4.320880889892578,
"learning_rate": 8.690407811602528e-06,
"loss": 0.3169,
"step": 1970
},
{
"epoch": 1.1372774267662262,
"grad_norm": 9.62964916229248,
"learning_rate": 8.632969557725446e-06,
"loss": 0.3577,
"step": 1980
},
{
"epoch": 1.1430212521539345,
"grad_norm": 7.1865105628967285,
"learning_rate": 8.575531303848363e-06,
"loss": 0.4245,
"step": 1990
},
{
"epoch": 1.1487650775416427,
"grad_norm": 11.42944622039795,
"learning_rate": 8.518093049971281e-06,
"loss": 0.3421,
"step": 2000
},
{
"epoch": 1.154508902929351,
"grad_norm": 10.365814208984375,
"learning_rate": 8.4606547960942e-06,
"loss": 0.2971,
"step": 2010
},
{
"epoch": 1.1602527283170592,
"grad_norm": 4.546888828277588,
"learning_rate": 8.403216542217118e-06,
"loss": 0.3762,
"step": 2020
},
{
"epoch": 1.1659965537047674,
"grad_norm": 9.672823905944824,
"learning_rate": 8.345778288340034e-06,
"loss": 0.3411,
"step": 2030
},
{
"epoch": 1.1717403790924756,
"grad_norm": 4.738915920257568,
"learning_rate": 8.288340034462953e-06,
"loss": 0.3453,
"step": 2040
},
{
"epoch": 1.1774842044801839,
"grad_norm": 10.187810897827148,
"learning_rate": 8.230901780585871e-06,
"loss": 0.3351,
"step": 2050
},
{
"epoch": 1.183228029867892,
"grad_norm": 6.290671348571777,
"learning_rate": 8.17346352670879e-06,
"loss": 0.3286,
"step": 2060
},
{
"epoch": 1.1889718552556001,
"grad_norm": 9.14261531829834,
"learning_rate": 8.116025272831708e-06,
"loss": 0.2878,
"step": 2070
},
{
"epoch": 1.1947156806433084,
"grad_norm": 7.814758777618408,
"learning_rate": 8.058587018954624e-06,
"loss": 0.3469,
"step": 2080
},
{
"epoch": 1.2004595060310166,
"grad_norm": 10.085731506347656,
"learning_rate": 8.001148765077543e-06,
"loss": 0.3025,
"step": 2090
},
{
"epoch": 1.2062033314187248,
"grad_norm": 10.734376907348633,
"learning_rate": 7.94371051120046e-06,
"loss": 0.2442,
"step": 2100
},
{
"epoch": 1.211947156806433,
"grad_norm": 13.61286735534668,
"learning_rate": 7.88627225732338e-06,
"loss": 0.3063,
"step": 2110
},
{
"epoch": 1.2176909821941413,
"grad_norm": 8.572850227355957,
"learning_rate": 7.828834003446296e-06,
"loss": 0.3863,
"step": 2120
},
{
"epoch": 1.2234348075818495,
"grad_norm": 6.247170448303223,
"learning_rate": 7.771395749569214e-06,
"loss": 0.3214,
"step": 2130
},
{
"epoch": 1.2291786329695578,
"grad_norm": 7.438636779785156,
"learning_rate": 7.71395749569213e-06,
"loss": 0.381,
"step": 2140
},
{
"epoch": 1.234922458357266,
"grad_norm": 6.11846399307251,
"learning_rate": 7.656519241815049e-06,
"loss": 0.3556,
"step": 2150
},
{
"epoch": 1.2406662837449742,
"grad_norm": 10.697092056274414,
"learning_rate": 7.5990809879379666e-06,
"loss": 0.3767,
"step": 2160
},
{
"epoch": 1.2464101091326825,
"grad_norm": 5.3118205070495605,
"learning_rate": 7.541642734060886e-06,
"loss": 0.3112,
"step": 2170
},
{
"epoch": 1.2521539345203907,
"grad_norm": 5.907925128936768,
"learning_rate": 7.484204480183803e-06,
"loss": 0.2833,
"step": 2180
},
{
"epoch": 1.2578977599080987,
"grad_norm": 7.271302223205566,
"learning_rate": 7.426766226306721e-06,
"loss": 0.1935,
"step": 2190
},
{
"epoch": 1.263641585295807,
"grad_norm": 12.389423370361328,
"learning_rate": 7.369327972429638e-06,
"loss": 0.3946,
"step": 2200
},
{
"epoch": 1.2693854106835152,
"grad_norm": 9.09422492980957,
"learning_rate": 7.3118897185525564e-06,
"loss": 0.3191,
"step": 2210
},
{
"epoch": 1.2751292360712234,
"grad_norm": 8.75156307220459,
"learning_rate": 7.254451464675475e-06,
"loss": 0.4077,
"step": 2220
},
{
"epoch": 1.2808730614589316,
"grad_norm": 7.306863784790039,
"learning_rate": 7.197013210798392e-06,
"loss": 0.3295,
"step": 2230
},
{
"epoch": 1.2866168868466399,
"grad_norm": 9.715473175048828,
"learning_rate": 7.1395749569213105e-06,
"loss": 0.4163,
"step": 2240
},
{
"epoch": 1.2923607122343481,
"grad_norm": 6.315252780914307,
"learning_rate": 7.082136703044228e-06,
"loss": 0.3817,
"step": 2250
},
{
"epoch": 1.2981045376220564,
"grad_norm": 8.821859359741211,
"learning_rate": 7.0246984491671455e-06,
"loss": 0.3265,
"step": 2260
},
{
"epoch": 1.3038483630097644,
"grad_norm": 6.838233947753906,
"learning_rate": 6.967260195290065e-06,
"loss": 0.2966,
"step": 2270
},
{
"epoch": 1.3095921883974726,
"grad_norm": 9.925073623657227,
"learning_rate": 6.909821941412982e-06,
"loss": 0.4484,
"step": 2280
},
{
"epoch": 1.3153360137851808,
"grad_norm": 5.026411056518555,
"learning_rate": 6.8523836875358996e-06,
"loss": 0.4647,
"step": 2290
},
{
"epoch": 1.321079839172889,
"grad_norm": 4.3956732749938965,
"learning_rate": 6.794945433658817e-06,
"loss": 0.2968,
"step": 2300
},
{
"epoch": 1.3268236645605973,
"grad_norm": 6.904971599578857,
"learning_rate": 6.7375071797817345e-06,
"loss": 0.2888,
"step": 2310
},
{
"epoch": 1.3325674899483055,
"grad_norm": 3.1684281826019287,
"learning_rate": 6.680068925904653e-06,
"loss": 0.2975,
"step": 2320
},
{
"epoch": 1.3383113153360138,
"grad_norm": 7.3333420753479,
"learning_rate": 6.622630672027571e-06,
"loss": 0.3911,
"step": 2330
},
{
"epoch": 1.344055140723722,
"grad_norm": 7.822445392608643,
"learning_rate": 6.565192418150489e-06,
"loss": 0.3199,
"step": 2340
},
{
"epoch": 1.3497989661114302,
"grad_norm": 9.02872371673584,
"learning_rate": 6.507754164273407e-06,
"loss": 0.4698,
"step": 2350
},
{
"epoch": 1.3555427914991385,
"grad_norm": 3.9332520961761475,
"learning_rate": 6.450315910396324e-06,
"loss": 0.3651,
"step": 2360
},
{
"epoch": 1.3612866168868467,
"grad_norm": 7.590347766876221,
"learning_rate": 6.392877656519242e-06,
"loss": 0.3121,
"step": 2370
},
{
"epoch": 1.367030442274555,
"grad_norm": 8.964584350585938,
"learning_rate": 6.335439402642161e-06,
"loss": 0.271,
"step": 2380
},
{
"epoch": 1.3727742676622632,
"grad_norm": 8.058918952941895,
"learning_rate": 6.2780011487650785e-06,
"loss": 0.3906,
"step": 2390
},
{
"epoch": 1.3785180930499714,
"grad_norm": 6.742099761962891,
"learning_rate": 6.220562894887996e-06,
"loss": 0.3072,
"step": 2400
},
{
"epoch": 1.3842619184376794,
"grad_norm": 5.961569309234619,
"learning_rate": 6.1631246410109134e-06,
"loss": 0.3673,
"step": 2410
},
{
"epoch": 1.3900057438253877,
"grad_norm": 9.705893516540527,
"learning_rate": 6.105686387133831e-06,
"loss": 0.3544,
"step": 2420
},
{
"epoch": 1.395749569213096,
"grad_norm": 4.435375690460205,
"learning_rate": 6.04824813325675e-06,
"loss": 0.2372,
"step": 2430
},
{
"epoch": 1.4014933946008041,
"grad_norm": 5.375720977783203,
"learning_rate": 5.9908098793796675e-06,
"loss": 0.2264,
"step": 2440
},
{
"epoch": 1.4072372199885124,
"grad_norm": 5.602358818054199,
"learning_rate": 5.933371625502585e-06,
"loss": 0.3449,
"step": 2450
},
{
"epoch": 1.4129810453762206,
"grad_norm": 10.811373710632324,
"learning_rate": 5.875933371625503e-06,
"loss": 0.3663,
"step": 2460
},
{
"epoch": 1.4187248707639288,
"grad_norm": 10.196518898010254,
"learning_rate": 5.818495117748421e-06,
"loss": 0.3546,
"step": 2470
},
{
"epoch": 1.424468696151637,
"grad_norm": 10.06306266784668,
"learning_rate": 5.761056863871339e-06,
"loss": 0.3282,
"step": 2480
},
{
"epoch": 1.430212521539345,
"grad_norm": 4.978325843811035,
"learning_rate": 5.703618609994257e-06,
"loss": 0.2961,
"step": 2490
},
{
"epoch": 1.4359563469270533,
"grad_norm": 10.731146812438965,
"learning_rate": 5.646180356117175e-06,
"loss": 0.3349,
"step": 2500
},
{
"epoch": 1.4417001723147616,
"grad_norm": 8.913891792297363,
"learning_rate": 5.588742102240092e-06,
"loss": 0.3131,
"step": 2510
},
{
"epoch": 1.4474439977024698,
"grad_norm": 5.1745195388793945,
"learning_rate": 5.53130384836301e-06,
"loss": 0.3842,
"step": 2520
},
{
"epoch": 1.453187823090178,
"grad_norm": 8.361491203308105,
"learning_rate": 5.473865594485927e-06,
"loss": 0.3598,
"step": 2530
},
{
"epoch": 1.4589316484778863,
"grad_norm": 6.487078666687012,
"learning_rate": 5.4164273406088464e-06,
"loss": 0.3244,
"step": 2540
},
{
"epoch": 1.4646754738655945,
"grad_norm": 4.129726409912109,
"learning_rate": 5.358989086731764e-06,
"loss": 0.3152,
"step": 2550
},
{
"epoch": 1.4704192992533027,
"grad_norm": 9.363592147827148,
"learning_rate": 5.301550832854681e-06,
"loss": 0.3321,
"step": 2560
},
{
"epoch": 1.476163124641011,
"grad_norm": 6.334773063659668,
"learning_rate": 5.2441125789776e-06,
"loss": 0.3312,
"step": 2570
},
{
"epoch": 1.4819069500287192,
"grad_norm": 7.404930114746094,
"learning_rate": 5.186674325100517e-06,
"loss": 0.3739,
"step": 2580
},
{
"epoch": 1.4876507754164274,
"grad_norm": 7.487016201019287,
"learning_rate": 5.1292360712234355e-06,
"loss": 0.2865,
"step": 2590
},
{
"epoch": 1.4933946008041357,
"grad_norm": 13.322307586669922,
"learning_rate": 5.071797817346353e-06,
"loss": 0.3444,
"step": 2600
},
{
"epoch": 1.499138426191844,
"grad_norm": 9.053878784179688,
"learning_rate": 5.014359563469271e-06,
"loss": 0.3799,
"step": 2610
},
{
"epoch": 1.5048822515795521,
"grad_norm": 4.018943786621094,
"learning_rate": 4.956921309592189e-06,
"loss": 0.2567,
"step": 2620
},
{
"epoch": 1.5106260769672601,
"grad_norm": 2.2457354068756104,
"learning_rate": 4.899483055715107e-06,
"loss": 0.2978,
"step": 2630
},
{
"epoch": 1.5163699023549684,
"grad_norm": 4.894889831542969,
"learning_rate": 4.8420448018380245e-06,
"loss": 0.2861,
"step": 2640
},
{
"epoch": 1.5221137277426766,
"grad_norm": 6.843629360198975,
"learning_rate": 4.784606547960942e-06,
"loss": 0.3254,
"step": 2650
},
{
"epoch": 1.5278575531303848,
"grad_norm": 7.173573970794678,
"learning_rate": 4.72716829408386e-06,
"loss": 0.3481,
"step": 2660
},
{
"epoch": 1.533601378518093,
"grad_norm": 12.328543663024902,
"learning_rate": 4.669730040206778e-06,
"loss": 0.3124,
"step": 2670
},
{
"epoch": 1.5393452039058013,
"grad_norm": 9.337592124938965,
"learning_rate": 4.612291786329696e-06,
"loss": 0.3005,
"step": 2680
},
{
"epoch": 1.5450890292935093,
"grad_norm": 5.477969646453857,
"learning_rate": 4.5548535324526135e-06,
"loss": 0.3457,
"step": 2690
},
{
"epoch": 1.5508328546812176,
"grad_norm": 5.083920955657959,
"learning_rate": 4.497415278575532e-06,
"loss": 0.2858,
"step": 2700
},
{
"epoch": 1.5565766800689258,
"grad_norm": 6.250855445861816,
"learning_rate": 4.439977024698449e-06,
"loss": 0.2673,
"step": 2710
},
{
"epoch": 1.562320505456634,
"grad_norm": 6.169952392578125,
"learning_rate": 4.382538770821368e-06,
"loss": 0.2971,
"step": 2720
},
{
"epoch": 1.5680643308443423,
"grad_norm": 8.261754989624023,
"learning_rate": 4.325100516944285e-06,
"loss": 0.3009,
"step": 2730
},
{
"epoch": 1.5738081562320505,
"grad_norm": 8.477384567260742,
"learning_rate": 4.267662263067203e-06,
"loss": 0.3017,
"step": 2740
},
{
"epoch": 1.5795519816197587,
"grad_norm": 8.52374267578125,
"learning_rate": 4.210224009190121e-06,
"loss": 0.4511,
"step": 2750
},
{
"epoch": 1.585295807007467,
"grad_norm": 8.646858215332031,
"learning_rate": 4.152785755313039e-06,
"loss": 0.2699,
"step": 2760
},
{
"epoch": 1.5910396323951752,
"grad_norm": 7.500174522399902,
"learning_rate": 4.095347501435957e-06,
"loss": 0.2743,
"step": 2770
},
{
"epoch": 1.5967834577828834,
"grad_norm": 5.454465389251709,
"learning_rate": 4.037909247558874e-06,
"loss": 0.3023,
"step": 2780
},
{
"epoch": 1.6025272831705917,
"grad_norm": 2.6998019218444824,
"learning_rate": 3.9804709936817925e-06,
"loss": 0.3941,
"step": 2790
},
{
"epoch": 1.6082711085583,
"grad_norm": 4.594570159912109,
"learning_rate": 3.92303273980471e-06,
"loss": 0.252,
"step": 2800
},
{
"epoch": 1.6140149339460081,
"grad_norm": 5.87538480758667,
"learning_rate": 3.865594485927628e-06,
"loss": 0.3093,
"step": 2810
},
{
"epoch": 1.6197587593337164,
"grad_norm": 5.358250617980957,
"learning_rate": 3.808156232050546e-06,
"loss": 0.3674,
"step": 2820
},
{
"epoch": 1.6255025847214246,
"grad_norm": 4.871222972869873,
"learning_rate": 3.7507179781734636e-06,
"loss": 0.2407,
"step": 2830
},
{
"epoch": 1.6312464101091326,
"grad_norm": 6.6836838722229,
"learning_rate": 3.693279724296382e-06,
"loss": 0.2978,
"step": 2840
},
{
"epoch": 1.6369902354968409,
"grad_norm": 7.67780065536499,
"learning_rate": 3.6358414704192994e-06,
"loss": 0.3655,
"step": 2850
},
{
"epoch": 1.642734060884549,
"grad_norm": 6.890137672424316,
"learning_rate": 3.5784032165422173e-06,
"loss": 0.3618,
"step": 2860
},
{
"epoch": 1.6484778862722573,
"grad_norm": 7.769665241241455,
"learning_rate": 3.5209649626651356e-06,
"loss": 0.2703,
"step": 2870
},
{
"epoch": 1.6542217116599656,
"grad_norm": 4.888299465179443,
"learning_rate": 3.463526708788053e-06,
"loss": 0.3257,
"step": 2880
},
{
"epoch": 1.6599655370476738,
"grad_norm": 4.726266384124756,
"learning_rate": 3.406088454910971e-06,
"loss": 0.3573,
"step": 2890
},
{
"epoch": 1.6657093624353818,
"grad_norm": 6.836297035217285,
"learning_rate": 3.348650201033889e-06,
"loss": 0.3325,
"step": 2900
},
{
"epoch": 1.67145318782309,
"grad_norm": 6.571508884429932,
"learning_rate": 3.2912119471568067e-06,
"loss": 0.292,
"step": 2910
},
{
"epoch": 1.6771970132107983,
"grad_norm": 9.843769073486328,
"learning_rate": 3.2337736932797246e-06,
"loss": 0.4027,
"step": 2920
},
{
"epoch": 1.6829408385985065,
"grad_norm": 6.089911937713623,
"learning_rate": 3.1763354394026425e-06,
"loss": 0.3627,
"step": 2930
},
{
"epoch": 1.6886846639862148,
"grad_norm": 6.846927165985107,
"learning_rate": 3.11889718552556e-06,
"loss": 0.3541,
"step": 2940
},
{
"epoch": 1.694428489373923,
"grad_norm": 10.23181438446045,
"learning_rate": 3.0614589316484783e-06,
"loss": 0.3418,
"step": 2950
},
{
"epoch": 1.7001723147616312,
"grad_norm": 5.403523921966553,
"learning_rate": 3.0040206777713958e-06,
"loss": 0.3498,
"step": 2960
},
{
"epoch": 1.7059161401493395,
"grad_norm": 8.252917289733887,
"learning_rate": 2.9465824238943137e-06,
"loss": 0.3401,
"step": 2970
},
{
"epoch": 1.7116599655370477,
"grad_norm": 7.06523323059082,
"learning_rate": 2.889144170017232e-06,
"loss": 0.2331,
"step": 2980
},
{
"epoch": 1.717403790924756,
"grad_norm": 7.739984035491943,
"learning_rate": 2.8317059161401494e-06,
"loss": 0.3678,
"step": 2990
},
{
"epoch": 1.7231476163124642,
"grad_norm": 4.021157741546631,
"learning_rate": 2.7742676622630677e-06,
"loss": 0.3537,
"step": 3000
}
],
"logging_steps": 10,
"max_steps": 3482,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6356845526704128.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}