{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 20.217435897435898,
  "eval_steps": 500,
  "global_step": 4928,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.020512820512820513,
      "grad_norm": 0.875,
      "learning_rate": 2.9999969395283144e-06,
      "loss": 0.6637,
      "num_input_tokens_seen": 75456,
      "step": 5
    },
    {
      "epoch": 0.041025641025641026,
      "grad_norm": 1.4140625,
      "learning_rate": 2.9999877581257458e-06,
      "loss": 0.6963,
      "num_input_tokens_seen": 160992,
      "step": 10
    },
    {
      "epoch": 0.06153846153846154,
      "grad_norm": 1.59375,
      "learning_rate": 2.9999724558297605e-06,
      "loss": 0.6919,
      "num_input_tokens_seen": 253856,
      "step": 15
    },
    {
      "epoch": 0.08205128205128205,
      "grad_norm": 2.40625,
      "learning_rate": 2.999951032702801e-06,
      "loss": 0.6605,
      "num_input_tokens_seen": 328640,
      "step": 20
    },
    {
      "epoch": 0.10256410256410256,
      "grad_norm": 0.75390625,
      "learning_rate": 2.9999234888322877e-06,
      "loss": 0.5888,
      "num_input_tokens_seen": 401504,
      "step": 25
    },
    {
      "epoch": 0.12307692307692308,
      "grad_norm": 1.578125,
      "learning_rate": 2.9998898243306162e-06,
      "loss": 0.6087,
      "num_input_tokens_seen": 475168,
      "step": 30
    },
    {
      "epoch": 0.14358974358974358,
      "grad_norm": 0.89453125,
      "learning_rate": 2.99985003933516e-06,
      "loss": 0.6405,
      "num_input_tokens_seen": 547168,
      "step": 35
    },
    {
      "epoch": 0.1641025641025641,
      "grad_norm": 1.1015625,
      "learning_rate": 2.999804134008266e-06,
      "loss": 0.6443,
      "num_input_tokens_seen": 618176,
      "step": 40
    },
    {
      "epoch": 0.18461538461538463,
      "grad_norm": 0.69921875,
      "learning_rate": 2.9997521085372565e-06,
      "loss": 0.6554,
      "num_input_tokens_seen": 701184,
      "step": 45
    },
    {
      "epoch": 0.20512820512820512,
      "grad_norm": 1.0234375,
      "learning_rate": 2.999693963134429e-06,
      "loss": 0.6903,
      "num_input_tokens_seen": 782656,
      "step": 50
    },
    {
      "epoch": 0.22564102564102564,
      "grad_norm": 0.796875,
      "learning_rate": 2.9996296980370526e-06,
      "loss": 0.6915,
      "num_input_tokens_seen": 869280,
      "step": 55
    },
    {
      "epoch": 0.24615384615384617,
      "grad_norm": 0.8515625,
      "learning_rate": 2.99955931350737e-06,
      "loss": 0.584,
      "num_input_tokens_seen": 944032,
      "step": 60
    },
    {
      "epoch": 0.26666666666666666,
      "grad_norm": 0.91796875,
      "learning_rate": 2.999482809832594e-06,
      "loss": 0.5908,
      "num_input_tokens_seen": 1025568,
      "step": 65
    },
    {
      "epoch": 0.28717948717948716,
      "grad_norm": 0.84375,
      "learning_rate": 2.9994001873249074e-06,
      "loss": 0.5811,
      "num_input_tokens_seen": 1105344,
      "step": 70
    },
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 0.6953125,
      "learning_rate": 2.999311446321462e-06,
      "loss": 0.5792,
      "num_input_tokens_seen": 1182016,
      "step": 75
    },
    {
      "epoch": 0.3282051282051282,
      "grad_norm": 0.99609375,
      "learning_rate": 2.999216587184378e-06,
      "loss": 0.5923,
      "num_input_tokens_seen": 1265440,
      "step": 80
    },
    {
      "epoch": 0.3487179487179487,
      "grad_norm": 0.91015625,
      "learning_rate": 2.9991156103007394e-06,
      "loss": 0.5896,
      "num_input_tokens_seen": 1347296,
      "step": 85
    },
    {
      "epoch": 0.36923076923076925,
      "grad_norm": 0.90234375,
      "learning_rate": 2.9990085160825954e-06,
      "loss": 0.6283,
      "num_input_tokens_seen": 1424064,
      "step": 90
    },
    {
      "epoch": 0.38974358974358975,
      "grad_norm": 0.67578125,
      "learning_rate": 2.9988953049669577e-06,
      "loss": 0.5329,
      "num_input_tokens_seen": 1492416,
      "step": 95
    },
    {
      "epoch": 0.41025641025641024,
      "grad_norm": 0.44921875,
      "learning_rate": 2.998775977415799e-06,
      "loss": 0.5582,
      "num_input_tokens_seen": 1572192,
      "step": 100
    },
    {
      "epoch": 0.4307692307692308,
      "grad_norm": 0.59375,
      "learning_rate": 2.998650533916051e-06,
      "loss": 0.6529,
      "num_input_tokens_seen": 1650592,
      "step": 105
    },
    {
      "epoch": 0.4512820512820513,
      "grad_norm": 0.515625,
      "learning_rate": 2.998518974979602e-06,
      "loss": 0.5449,
      "num_input_tokens_seen": 1728128,
      "step": 110
    },
    {
      "epoch": 0.4717948717948718,
      "grad_norm": 0.494140625,
      "learning_rate": 2.998381301143295e-06,
      "loss": 0.573,
      "num_input_tokens_seen": 1806080,
      "step": 115
    },
    {
      "epoch": 0.49230769230769234,
      "grad_norm": 0.38671875,
      "learning_rate": 2.9982375129689253e-06,
      "loss": 0.6065,
      "num_input_tokens_seen": 1884544,
      "step": 120
    },
    {
      "epoch": 0.5128205128205128,
      "grad_norm": 0.396484375,
      "learning_rate": 2.9980876110432404e-06,
      "loss": 0.6232,
      "num_input_tokens_seen": 1961376,
      "step": 125
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 0.408203125,
      "learning_rate": 2.9979315959779335e-06,
      "loss": 0.5061,
      "num_input_tokens_seen": 2036064,
      "step": 130
    },
    {
      "epoch": 0.5538461538461539,
      "grad_norm": 0.271484375,
      "learning_rate": 2.9977694684096447e-06,
      "loss": 0.5657,
      "num_input_tokens_seen": 2109376,
      "step": 135
    },
    {
      "epoch": 0.5743589743589743,
      "grad_norm": 0.27734375,
      "learning_rate": 2.997601228999956e-06,
      "loss": 0.5597,
      "num_input_tokens_seen": 2181344,
      "step": 140
    },
    {
      "epoch": 0.5948717948717949,
      "grad_norm": 0.2451171875,
      "learning_rate": 2.99742687843539e-06,
      "loss": 0.5514,
      "num_input_tokens_seen": 2256768,
      "step": 145
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 0.2265625,
      "learning_rate": 2.997246417427407e-06,
      "loss": 0.53,
      "num_input_tokens_seen": 2330144,
      "step": 150
    },
    {
      "epoch": 0.6358974358974359,
      "grad_norm": 0.19921875,
      "learning_rate": 2.9970598467124008e-06,
      "loss": 0.5365,
      "num_input_tokens_seen": 2402688,
      "step": 155
    },
    {
      "epoch": 0.6564102564102564,
      "grad_norm": 0.1923828125,
      "learning_rate": 2.9968671670516983e-06,
      "loss": 0.5578,
      "num_input_tokens_seen": 2476800,
      "step": 160
    },
    {
      "epoch": 0.676923076923077,
      "grad_norm": 0.166015625,
      "learning_rate": 2.9966683792315528e-06,
      "loss": 0.5492,
      "num_input_tokens_seen": 2565792,
      "step": 165
    },
    {
      "epoch": 0.6974358974358974,
      "grad_norm": 0.1533203125,
      "learning_rate": 2.9964634840631435e-06,
      "loss": 0.5144,
      "num_input_tokens_seen": 2637792,
      "step": 170
    },
    {
      "epoch": 0.717948717948718,
      "grad_norm": 0.154296875,
      "learning_rate": 2.9962524823825724e-06,
      "loss": 0.5741,
      "num_input_tokens_seen": 2718944,
      "step": 175
    },
    {
      "epoch": 0.7384615384615385,
      "grad_norm": 0.1513671875,
      "learning_rate": 2.9960353750508583e-06,
      "loss": 0.5846,
      "num_input_tokens_seen": 2802240,
      "step": 180
    },
    {
      "epoch": 0.7589743589743589,
      "grad_norm": 0.1474609375,
      "learning_rate": 2.995812162953936e-06,
      "loss": 0.5834,
      "num_input_tokens_seen": 2884672,
      "step": 185
    },
    {
      "epoch": 0.7794871794871795,
      "grad_norm": 0.154296875,
      "learning_rate": 2.9955828470026515e-06,
      "loss": 0.5493,
      "num_input_tokens_seen": 2953536,
      "step": 190
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.138671875,
      "learning_rate": 2.9953474281327576e-06,
      "loss": 0.5503,
      "num_input_tokens_seen": 3026496,
      "step": 195
    },
    {
      "epoch": 0.8205128205128205,
      "grad_norm": 0.244140625,
      "learning_rate": 2.995105907304912e-06,
      "loss": 0.5835,
      "num_input_tokens_seen": 3105376,
      "step": 200
    },
    {
      "epoch": 0.841025641025641,
      "grad_norm": 0.349609375,
      "learning_rate": 2.9948582855046704e-06,
      "loss": 0.5583,
      "num_input_tokens_seen": 3179776,
      "step": 205
    },
    {
      "epoch": 0.8615384615384616,
      "grad_norm": 0.1826171875,
      "learning_rate": 2.9946045637424864e-06,
      "loss": 0.6392,
      "num_input_tokens_seen": 3262336,
      "step": 210
    },
    {
      "epoch": 0.882051282051282,
      "grad_norm": 0.1640625,
      "learning_rate": 2.994344743053704e-06,
      "loss": 0.5658,
      "num_input_tokens_seen": 3343904,
      "step": 215
    },
    {
      "epoch": 0.9025641025641026,
      "grad_norm": 0.50390625,
      "learning_rate": 2.9940788244985557e-06,
      "loss": 0.5609,
      "num_input_tokens_seen": 3419264,
      "step": 220
    },
    {
      "epoch": 0.9230769230769231,
      "grad_norm": 0.458984375,
      "learning_rate": 2.9938068091621556e-06,
      "loss": 0.5305,
      "num_input_tokens_seen": 3496032,
      "step": 225
    },
    {
      "epoch": 0.9435897435897436,
      "grad_norm": 0.41015625,
      "learning_rate": 2.9935286981544975e-06,
      "loss": 0.5652,
      "num_input_tokens_seen": 3570880,
      "step": 230
    },
    {
      "epoch": 0.9641025641025641,
      "grad_norm": 0.7734375,
      "learning_rate": 2.9932444926104495e-06,
      "loss": 0.5348,
      "num_input_tokens_seen": 3644096,
      "step": 235
    },
    {
      "epoch": 0.9846153846153847,
      "grad_norm": 0.86328125,
      "learning_rate": 2.992954193689749e-06,
      "loss": 0.5344,
      "num_input_tokens_seen": 3717088,
      "step": 240
    },
    {
      "epoch": 1.005128205128205,
      "grad_norm": 0.435546875,
      "learning_rate": 2.9926578025769978e-06,
      "loss": 0.5977,
      "num_input_tokens_seen": 3783648,
      "step": 245
    },
    {
      "epoch": 1.0256410256410255,
      "grad_norm": 0.408203125,
      "learning_rate": 2.992355320481658e-06,
      "loss": 0.5304,
      "num_input_tokens_seen": 3852160,
      "step": 250
    },
    {
      "epoch": 1.0461538461538462,
      "grad_norm": 0.75390625,
      "learning_rate": 2.9920467486380475e-06,
      "loss": 0.5617,
      "num_input_tokens_seen": 3924416,
      "step": 255
    },
    {
      "epoch": 1.0666666666666667,
      "grad_norm": 0.46875,
      "learning_rate": 2.991732088305333e-06,
      "loss": 0.5367,
      "num_input_tokens_seen": 4006432,
      "step": 260
    },
    {
      "epoch": 1.087179487179487,
      "grad_norm": 0.5234375,
      "learning_rate": 2.991411340767526e-06,
      "loss": 0.5207,
      "num_input_tokens_seen": 4081888,
      "step": 265
    },
    {
      "epoch": 1.1076923076923078,
      "grad_norm": 0.41015625,
      "learning_rate": 2.9910845073334793e-06,
      "loss": 0.556,
      "num_input_tokens_seen": 4155968,
      "step": 270
    },
    {
      "epoch": 1.1282051282051282,
      "grad_norm": 0.421875,
      "learning_rate": 2.9907515893368784e-06,
      "loss": 0.537,
      "num_input_tokens_seen": 4234272,
      "step": 275
    },
    {
      "epoch": 1.1487179487179486,
      "grad_norm": 0.40234375,
      "learning_rate": 2.9904125881362378e-06,
      "loss": 0.5305,
      "num_input_tokens_seen": 4302368,
      "step": 280
    },
    {
      "epoch": 1.1692307692307693,
      "grad_norm": 0.5625,
      "learning_rate": 2.990067505114896e-06,
      "loss": 0.5337,
      "num_input_tokens_seen": 4376640,
      "step": 285
    },
    {
      "epoch": 1.1897435897435897,
      "grad_norm": 0.451171875,
      "learning_rate": 2.9897163416810084e-06,
      "loss": 0.592,
      "num_input_tokens_seen": 4458208,
      "step": 290
    },
    {
      "epoch": 1.2102564102564102,
      "grad_norm": 0.40625,
      "learning_rate": 2.9893590992675427e-06,
      "loss": 0.5808,
      "num_input_tokens_seen": 4537920,
      "step": 295
    },
    {
      "epoch": 1.2307692307692308,
      "grad_norm": 0.58984375,
      "learning_rate": 2.988995779332273e-06,
      "loss": 0.5569,
      "num_input_tokens_seen": 4606880,
      "step": 300
    },
    {
      "epoch": 1.2512820512820513,
      "grad_norm": 0.431640625,
      "learning_rate": 2.9886263833577725e-06,
      "loss": 0.5422,
      "num_input_tokens_seen": 4682816,
      "step": 305
    },
    {
      "epoch": 1.2717948717948717,
      "grad_norm": 0.5234375,
      "learning_rate": 2.98825091285141e-06,
      "loss": 0.5296,
      "num_input_tokens_seen": 4756032,
      "step": 310
    },
    {
      "epoch": 1.2923076923076924,
      "grad_norm": 0.4453125,
      "learning_rate": 2.987869369345341e-06,
      "loss": 0.5012,
      "num_input_tokens_seen": 4827232,
      "step": 315
    },
    {
      "epoch": 1.3128205128205128,
      "grad_norm": 0.392578125,
      "learning_rate": 2.987481754396502e-06,
      "loss": 0.5289,
      "num_input_tokens_seen": 4902368,
      "step": 320
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.326171875,
      "learning_rate": 2.9870880695866067e-06,
      "loss": 0.5245,
      "num_input_tokens_seen": 4978080,
      "step": 325
    },
    {
      "epoch": 1.353846153846154,
      "grad_norm": 0.62109375,
      "learning_rate": 2.986688316522136e-06,
      "loss": 0.5325,
      "num_input_tokens_seen": 5047456,
      "step": 330
    },
    {
      "epoch": 1.3743589743589744,
      "grad_norm": 0.390625,
      "learning_rate": 2.9862824968343352e-06,
      "loss": 0.5068,
      "num_input_tokens_seen": 5118720,
      "step": 335
    },
    {
      "epoch": 1.3948717948717948,
      "grad_norm": 0.345703125,
      "learning_rate": 2.9858706121792036e-06,
      "loss": 0.5165,
      "num_input_tokens_seen": 5196288,
      "step": 340
    },
    {
      "epoch": 1.4153846153846155,
      "grad_norm": 0.251953125,
      "learning_rate": 2.985452664237488e-06,
      "loss": 0.5025,
      "num_input_tokens_seen": 5272480,
      "step": 345
    },
    {
      "epoch": 1.435897435897436,
      "grad_norm": 0.302734375,
      "learning_rate": 2.98502865471468e-06,
      "loss": 0.5285,
      "num_input_tokens_seen": 5343296,
      "step": 350
    },
    {
      "epoch": 1.4564102564102563,
      "grad_norm": 0.314453125,
      "learning_rate": 2.9845985853410053e-06,
      "loss": 0.4983,
      "num_input_tokens_seen": 5415904,
      "step": 355
    },
    {
      "epoch": 1.476923076923077,
      "grad_norm": 0.236328125,
      "learning_rate": 2.9841624578714167e-06,
      "loss": 0.5789,
      "num_input_tokens_seen": 5502176,
      "step": 360
    },
    {
      "epoch": 1.4974358974358974,
      "grad_norm": 0.2060546875,
      "learning_rate": 2.9837202740855897e-06,
      "loss": 0.5394,
      "num_input_tokens_seen": 5580352,
      "step": 365
    },
    {
      "epoch": 1.5179487179487179,
      "grad_norm": 0.267578125,
      "learning_rate": 2.9832720357879107e-06,
      "loss": 0.5664,
      "num_input_tokens_seen": 5662912,
      "step": 370
    },
    {
      "epoch": 1.5384615384615383,
      "grad_norm": 0.22265625,
      "learning_rate": 2.9828177448074753e-06,
      "loss": 0.5546,
      "num_input_tokens_seen": 5743776,
      "step": 375
    },
    {
      "epoch": 1.558974358974359,
      "grad_norm": 0.20703125,
      "learning_rate": 2.9823574029980757e-06,
      "loss": 0.5412,
      "num_input_tokens_seen": 5812384,
      "step": 380
    },
    {
      "epoch": 1.5794871794871796,
      "grad_norm": 0.1630859375,
      "learning_rate": 2.981891012238196e-06,
      "loss": 0.5587,
      "num_input_tokens_seen": 5892768,
      "step": 385
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.1640625,
      "learning_rate": 2.9814185744310036e-06,
      "loss": 0.535,
      "num_input_tokens_seen": 5974592,
      "step": 390
    },
    {
      "epoch": 1.6205128205128205,
      "grad_norm": 0.1533203125,
      "learning_rate": 2.9809400915043424e-06,
      "loss": 0.512,
      "num_input_tokens_seen": 6050080,
      "step": 395
    },
    {
      "epoch": 1.641025641025641,
      "grad_norm": 0.142578125,
      "learning_rate": 2.9804555654107243e-06,
      "loss": 0.5392,
      "num_input_tokens_seen": 6137248,
      "step": 400
    },
    {
      "epoch": 1.6615384615384614,
      "grad_norm": 0.1376953125,
      "learning_rate": 2.9799649981273185e-06,
      "loss": 0.6444,
      "num_input_tokens_seen": 6232704,
      "step": 405
    },
    {
      "epoch": 1.682051282051282,
      "grad_norm": 0.1435546875,
      "learning_rate": 2.9794683916559493e-06,
      "loss": 0.5202,
      "num_input_tokens_seen": 6304064,
      "step": 410
    },
    {
      "epoch": 1.7025641025641025,
      "grad_norm": 0.1455078125,
      "learning_rate": 2.9789657480230842e-06,
      "loss": 0.5344,
      "num_input_tokens_seen": 6392928,
      "step": 415
    },
    {
      "epoch": 1.7230769230769232,
      "grad_norm": 0.16015625,
      "learning_rate": 2.9784570692798236e-06,
      "loss": 0.4614,
      "num_input_tokens_seen": 6473120,
      "step": 420
    },
    {
      "epoch": 1.7435897435897436,
      "grad_norm": 0.19921875,
      "learning_rate": 2.977942357501898e-06,
      "loss": 0.5036,
      "num_input_tokens_seen": 6545312,
      "step": 425
    },
    {
      "epoch": 1.764102564102564,
      "grad_norm": 0.41796875,
      "learning_rate": 2.977421614789655e-06,
      "loss": 0.5308,
      "num_input_tokens_seen": 6629984,
      "step": 430
    },
    {
      "epoch": 1.7846153846153845,
      "grad_norm": 0.1767578125,
      "learning_rate": 2.976894843268051e-06,
      "loss": 0.5475,
      "num_input_tokens_seen": 6715936,
      "step": 435
    },
    {
      "epoch": 1.8051282051282052,
      "grad_norm": 0.150390625,
      "learning_rate": 2.976362045086647e-06,
      "loss": 0.5704,
      "num_input_tokens_seen": 6797472,
      "step": 440
    },
    {
      "epoch": 1.8256410256410256,
      "grad_norm": 0.416015625,
      "learning_rate": 2.975823222419594e-06,
      "loss": 0.4781,
      "num_input_tokens_seen": 6867808,
      "step": 445
    },
    {
      "epoch": 1.8461538461538463,
      "grad_norm": 0.25,
      "learning_rate": 2.9752783774656267e-06,
      "loss": 0.5247,
      "num_input_tokens_seen": 6944480,
      "step": 450
    },
    {
      "epoch": 1.8666666666666667,
      "grad_norm": 0.703125,
      "learning_rate": 2.974727512448056e-06,
      "loss": 0.5161,
      "num_input_tokens_seen": 7024064,
      "step": 455
    },
    {
      "epoch": 1.8871794871794871,
      "grad_norm": 0.65234375,
      "learning_rate": 2.974170629614757e-06,
      "loss": 0.5049,
      "num_input_tokens_seen": 7103488,
      "step": 460
    },
    {
      "epoch": 1.9076923076923076,
      "grad_norm": 0.734375,
      "learning_rate": 2.9736077312381624e-06,
      "loss": 0.5712,
      "num_input_tokens_seen": 7190304,
      "step": 465
    },
    {
      "epoch": 1.9282051282051282,
      "grad_norm": 0.40234375,
      "learning_rate": 2.9730388196152513e-06,
      "loss": 0.5222,
      "num_input_tokens_seen": 7265056,
      "step": 470
    },
    {
      "epoch": 1.9487179487179487,
      "grad_norm": 0.28125,
      "learning_rate": 2.972463897067541e-06,
      "loss": 0.4829,
      "num_input_tokens_seen": 7336992,
      "step": 475
    },
    {
      "epoch": 1.9692307692307693,
      "grad_norm": 0.56640625,
      "learning_rate": 2.971882965941077e-06,
      "loss": 0.5218,
      "num_input_tokens_seen": 7413984,
      "step": 480
    },
    {
      "epoch": 1.9897435897435898,
      "grad_norm": 0.357421875,
      "learning_rate": 2.9712960286064237e-06,
      "loss": 0.5543,
      "num_input_tokens_seen": 7492768,
      "step": 485
    },
    {
      "epoch": 2.01025641025641,
      "grad_norm": 0.26171875,
      "learning_rate": 2.970703087458655e-06,
      "loss": 0.5036,
      "num_input_tokens_seen": 7563904,
      "step": 490
    },
    {
      "epoch": 2.0307692307692307,
      "grad_norm": 0.38671875,
      "learning_rate": 2.9701041449173426e-06,
      "loss": 0.556,
      "num_input_tokens_seen": 7634464,
      "step": 495
    },
    {
      "epoch": 2.051282051282051,
      "grad_norm": 0.375,
      "learning_rate": 2.96949920342655e-06,
      "loss": 0.5742,
      "num_input_tokens_seen": 7711168,
      "step": 500
    },
    {
      "epoch": 2.071794871794872,
      "grad_norm": 0.365234375,
      "learning_rate": 2.968888265454818e-06,
      "loss": 0.4905,
      "num_input_tokens_seen": 7786656,
      "step": 505
    },
    {
      "epoch": 2.0923076923076924,
      "grad_norm": 0.271484375,
      "learning_rate": 2.968271333495158e-06,
      "loss": 0.5134,
      "num_input_tokens_seen": 7858240,
      "step": 510
    },
    {
      "epoch": 2.112820512820513,
      "grad_norm": 0.373046875,
      "learning_rate": 2.967648410065041e-06,
      "loss": 0.6211,
      "num_input_tokens_seen": 7952064,
      "step": 515
    },
    {
      "epoch": 2.1333333333333333,
      "grad_norm": 0.45703125,
      "learning_rate": 2.9670194977063857e-06,
      "loss": 0.515,
      "num_input_tokens_seen": 8026688,
      "step": 520
    },
    {
      "epoch": 2.1538461538461537,
      "grad_norm": 0.400390625,
      "learning_rate": 2.96638459898555e-06,
      "loss": 0.5464,
      "num_input_tokens_seen": 8104064,
      "step": 525
    },
    {
      "epoch": 2.174358974358974,
      "grad_norm": 0.421875,
      "learning_rate": 2.9657437164933205e-06,
      "loss": 0.5331,
      "num_input_tokens_seen": 8187968,
      "step": 530
    },
    {
      "epoch": 2.194871794871795,
      "grad_norm": 0.396484375,
      "learning_rate": 2.9650968528449e-06,
      "loss": 0.5236,
      "num_input_tokens_seen": 8261312,
      "step": 535
    },
    {
      "epoch": 2.2153846153846155,
      "grad_norm": 0.34765625,
      "learning_rate": 2.9644440106799e-06,
      "loss": 0.4345,
      "num_input_tokens_seen": 8336032,
      "step": 540
    },
    {
      "epoch": 2.235897435897436,
      "grad_norm": 0.353515625,
      "learning_rate": 2.963785192662327e-06,
      "loss": 0.4853,
      "num_input_tokens_seen": 8405024,
      "step": 545
    },
    {
      "epoch": 2.2564102564102564,
      "grad_norm": 0.365234375,
      "learning_rate": 2.9631204014805716e-06,
      "loss": 0.5,
      "num_input_tokens_seen": 8483456,
      "step": 550
    },
    {
      "epoch": 2.276923076923077,
      "grad_norm": 0.38671875,
      "learning_rate": 2.9624496398474014e-06,
      "loss": 0.4863,
      "num_input_tokens_seen": 8560768,
      "step": 555
    },
    {
      "epoch": 2.2974358974358973,
      "grad_norm": 0.32421875,
      "learning_rate": 2.961772910499945e-06,
      "loss": 0.48,
      "num_input_tokens_seen": 8637888,
      "step": 560
    },
    {
      "epoch": 2.3179487179487177,
      "grad_norm": 0.376953125,
      "learning_rate": 2.9610902161996838e-06,
      "loss": 0.5768,
      "num_input_tokens_seen": 8715360,
      "step": 565
    },
    {
      "epoch": 2.3384615384615386,
      "grad_norm": 0.265625,
      "learning_rate": 2.96040155973244e-06,
      "loss": 0.4751,
      "num_input_tokens_seen": 8795200,
      "step": 570
    },
    {
      "epoch": 2.358974358974359,
      "grad_norm": 0.283203125,
      "learning_rate": 2.959706943908365e-06,
      "loss": 0.5161,
      "num_input_tokens_seen": 8883136,
      "step": 575
    },
    {
      "epoch": 2.3794871794871795,
      "grad_norm": 0.271484375,
      "learning_rate": 2.9590063715619287e-06,
      "loss": 0.5588,
      "num_input_tokens_seen": 8961568,
      "step": 580
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.2373046875,
      "learning_rate": 2.9582998455519062e-06,
      "loss": 0.5527,
      "num_input_tokens_seen": 9043360,
      "step": 585
    },
    {
      "epoch": 2.4205128205128204,
      "grad_norm": 0.21875,
      "learning_rate": 2.9575873687613676e-06,
      "loss": 0.4897,
      "num_input_tokens_seen": 9116448,
      "step": 590
    },
    {
      "epoch": 2.4410256410256412,
      "grad_norm": 0.21484375,
      "learning_rate": 2.9568689440976676e-06,
      "loss": 0.5359,
      "num_input_tokens_seen": 9193120,
      "step": 595
    },
    {
      "epoch": 2.4615384615384617,
      "grad_norm": 0.205078125,
      "learning_rate": 2.95614457449243e-06,
      "loss": 0.5763,
      "num_input_tokens_seen": 9269920,
      "step": 600
    },
    {
      "epoch": 2.482051282051282,
      "grad_norm": 0.1845703125,
      "learning_rate": 2.9554142629015382e-06,
      "loss": 0.4631,
      "num_input_tokens_seen": 9339968,
      "step": 605
    },
    {
      "epoch": 2.5025641025641026,
      "grad_norm": 0.1650390625,
      "learning_rate": 2.954678012305123e-06,
      "loss": 0.5349,
      "num_input_tokens_seen": 9426976,
      "step": 610
    },
    {
      "epoch": 2.523076923076923,
      "grad_norm": 0.1708984375,
      "learning_rate": 2.9539358257075495e-06,
      "loss": 0.5532,
      "num_input_tokens_seen": 9509056,
      "step": 615
    },
    {
      "epoch": 2.5435897435897434,
      "grad_norm": 0.1611328125,
      "learning_rate": 2.9531877061374066e-06,
      "loss": 0.4748,
      "num_input_tokens_seen": 9590720,
      "step": 620
    },
    {
      "epoch": 2.564102564102564,
      "grad_norm": 0.15234375,
      "learning_rate": 2.9524336566474915e-06,
      "loss": 0.5022,
      "num_input_tokens_seen": 9667648,
      "step": 625
    },
    {
      "epoch": 2.5846153846153848,
      "grad_norm": 0.1337890625,
      "learning_rate": 2.9516736803148014e-06,
      "loss": 0.5005,
      "num_input_tokens_seen": 9738016,
      "step": 630
    },
    {
      "epoch": 2.605128205128205,
      "grad_norm": 0.1337890625,
      "learning_rate": 2.9509077802405174e-06,
      "loss": 0.5297,
      "num_input_tokens_seen": 9816224,
      "step": 635
    },
    {
      "epoch": 2.6256410256410256,
      "grad_norm": 0.13671875,
      "learning_rate": 2.9501359595499933e-06,
      "loss": 0.5399,
      "num_input_tokens_seen": 9891104,
      "step": 640
    },
    {
      "epoch": 2.646153846153846,
      "grad_norm": 0.177734375,
      "learning_rate": 2.9493582213927425e-06,
      "loss": 0.4901,
      "num_input_tokens_seen": 9969792,
      "step": 645
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.142578125,
      "learning_rate": 2.9485745689424267e-06,
      "loss": 0.4591,
      "num_input_tokens_seen": 10044608,
      "step": 650
    },
    {
      "epoch": 2.6871794871794874,
      "grad_norm": 0.2451171875,
      "learning_rate": 2.9477850053968405e-06,
      "loss": 0.5729,
      "num_input_tokens_seen": 10132640,
      "step": 655
    },
    {
      "epoch": 2.707692307692308,
      "grad_norm": 0.150390625,
      "learning_rate": 2.9469895339778995e-06,
      "loss": 0.5405,
      "num_input_tokens_seen": 10207968,
      "step": 660
    },
    {
      "epoch": 2.7282051282051283,
      "grad_norm": 0.30859375,
      "learning_rate": 2.946188157931627e-06,
      "loss": 0.4786,
      "num_input_tokens_seen": 10277408,
      "step": 665
    },
    {
      "epoch": 2.7487179487179487,
      "grad_norm": 0.314453125,
      "learning_rate": 2.9453808805281423e-06,
      "loss": 0.5035,
      "num_input_tokens_seen": 10349184,
      "step": 670
    },
    {
      "epoch": 2.769230769230769,
      "grad_norm": 0.2490234375,
      "learning_rate": 2.944567705061644e-06,
      "loss": 0.4719,
      "num_input_tokens_seen": 10434112,
      "step": 675
    },
    {
      "epoch": 2.7897435897435896,
      "grad_norm": 0.65234375,
      "learning_rate": 2.9437486348504e-06,
      "loss": 0.5118,
      "num_input_tokens_seen": 10506208,
      "step": 680
    },
    {
      "epoch": 2.81025641025641,
      "grad_norm": 0.68359375,
      "learning_rate": 2.9429236732367318e-06,
      "loss": 0.5014,
      "num_input_tokens_seen": 10577696,
      "step": 685
    },
    {
      "epoch": 2.830769230769231,
      "grad_norm": 0.71484375,
      "learning_rate": 2.942092823587001e-06,
      "loss": 0.4827,
      "num_input_tokens_seen": 10657984,
      "step": 690
    },
    {
      "epoch": 2.8512820512820514,
      "grad_norm": 0.71484375,
      "learning_rate": 2.941256089291597e-06,
      "loss": 0.5177,
      "num_input_tokens_seen": 10734688,
      "step": 695
    },
    {
      "epoch": 2.871794871794872,
      "grad_norm": 0.6328125,
      "learning_rate": 2.940413473764923e-06,
      "loss": 0.4517,
      "num_input_tokens_seen": 10812640,
      "step": 700
    },
    {
      "epoch": 2.8923076923076922,
      "grad_norm": 0.70703125,
      "learning_rate": 2.9395649804453786e-06,
      "loss": 0.4574,
      "num_input_tokens_seen": 10884800,
      "step": 705
    },
    {
      "epoch": 2.9128205128205127,
      "grad_norm": 0.27734375,
      "learning_rate": 2.9387106127953515e-06,
      "loss": 0.5092,
      "num_input_tokens_seen": 10962016,
      "step": 710
    },
    {
      "epoch": 2.9333333333333336,
      "grad_norm": 0.34375,
      "learning_rate": 2.937850374301198e-06,
      "loss": 0.4888,
      "num_input_tokens_seen": 11033280,
      "step": 715
    },
    {
      "epoch": 2.953846153846154,
      "grad_norm": 0.375,
      "learning_rate": 2.9369842684732336e-06,
      "loss": 0.5447,
      "num_input_tokens_seen": 11113696,
      "step": 720
    },
    {
      "epoch": 2.9743589743589745,
      "grad_norm": 0.396484375,
      "learning_rate": 2.936112298845713e-06,
      "loss": 0.5438,
      "num_input_tokens_seen": 11195104,
      "step": 725
    },
    {
      "epoch": 2.994871794871795,
      "grad_norm": 0.2470703125,
      "learning_rate": 2.935234468976822e-06,
      "loss": 0.46,
      "num_input_tokens_seen": 11270304,
      "step": 730
    },
    {
      "epoch": 3.0153846153846153,
      "grad_norm": 0.392578125,
      "learning_rate": 2.934350782448658e-06,
      "loss": 0.569,
      "num_input_tokens_seen": 11350784,
      "step": 735
    },
    {
      "epoch": 3.0358974358974358,
      "grad_norm": 0.318359375,
      "learning_rate": 2.9334612428672175e-06,
      "loss": 0.5246,
      "num_input_tokens_seen": 11429568,
      "step": 740
    },
    {
      "epoch": 3.056410256410256,
      "grad_norm": 0.35546875,
      "learning_rate": 2.9325658538623822e-06,
      "loss": 0.4587,
      "num_input_tokens_seen": 11502784,
      "step": 745
    },
    {
      "epoch": 3.076923076923077,
      "grad_norm": 0.3671875,
      "learning_rate": 2.931664619087902e-06,
      "loss": 0.5095,
      "num_input_tokens_seen": 11575680,
      "step": 750
    },
    {
      "epoch": 3.0974358974358975,
      "grad_norm": 0.35546875,
      "learning_rate": 2.9307575422213813e-06,
      "loss": 0.4916,
      "num_input_tokens_seen": 11649856,
      "step": 755
    },
    {
      "epoch": 3.117948717948718,
      "grad_norm": 0.404296875,
      "learning_rate": 2.929844626964265e-06,
      "loss": 0.5647,
      "num_input_tokens_seen": 11727616,
      "step": 760
    },
    {
      "epoch": 3.1384615384615384,
      "grad_norm": 0.421875,
      "learning_rate": 2.9289258770418208e-06,
      "loss": 0.4448,
      "num_input_tokens_seen": 11806208,
      "step": 765
    },
    {
      "epoch": 3.158974358974359,
      "grad_norm": 0.39453125,
      "learning_rate": 2.9280012962031263e-06,
      "loss": 0.5086,
      "num_input_tokens_seen": 11884096,
      "step": 770
    },
    {
      "epoch": 3.1794871794871793,
      "grad_norm": 0.29296875,
      "learning_rate": 2.9270708882210525e-06,
      "loss": 0.4796,
      "num_input_tokens_seen": 11956416,
      "step": 775
    },
    {
      "epoch": 3.2,
      "grad_norm": 0.33203125,
      "learning_rate": 2.92613465689225e-06,
      "loss": 0.4797,
      "num_input_tokens_seen": 12032384,
      "step": 780
    },
    {
      "epoch": 3.2205128205128206,
      "grad_norm": 0.287109375,
      "learning_rate": 2.92519260603713e-06,
      "loss": 0.4523,
      "num_input_tokens_seen": 12107328,
      "step": 785
    },
    {
      "epoch": 3.241025641025641,
      "grad_norm": 0.326171875,
      "learning_rate": 2.9242447394998545e-06,
      "loss": 0.4795,
      "num_input_tokens_seen": 12178848,
      "step": 790
    },
    {
      "epoch": 3.2615384615384615,
      "grad_norm": 0.2734375,
      "learning_rate": 2.923291061148314e-06,
      "loss": 0.5164,
      "num_input_tokens_seen": 12252160,
      "step": 795
    },
    {
      "epoch": 3.282051282051282,
      "grad_norm": 0.271484375,
      "learning_rate": 2.9223315748741146e-06,
      "loss": 0.4949,
      "num_input_tokens_seen": 12325120,
      "step": 800
    },
    {
      "epoch": 3.3025641025641024,
      "grad_norm": 0.244140625,
      "learning_rate": 2.9213662845925662e-06,
      "loss": 0.4848,
      "num_input_tokens_seen": 12398144,
      "step": 805
    },
    {
      "epoch": 3.3230769230769233,
      "grad_norm": 0.2265625,
      "learning_rate": 2.9203951942426586e-06,
      "loss": 0.5114,
      "num_input_tokens_seen": 12475008,
      "step": 810
    },
    {
      "epoch": 3.3435897435897437,
      "grad_norm": 0.2109375,
      "learning_rate": 2.9194183077870516e-06,
      "loss": 0.6022,
      "num_input_tokens_seen": 12562336,
      "step": 815
    },
    {
      "epoch": 3.364102564102564,
      "grad_norm": 0.1923828125,
      "learning_rate": 2.9184356292120562e-06,
      "loss": 0.4922,
      "num_input_tokens_seen": 12646560,
      "step": 820
    },
    {
      "epoch": 3.3846153846153846,
      "grad_norm": 0.1962890625,
      "learning_rate": 2.9174471625276198e-06,
      "loss": 0.5707,
      "num_input_tokens_seen": 12718848,
      "step": 825
    },
    {
      "epoch": 3.405128205128205,
      "grad_norm": 0.1650390625,
      "learning_rate": 2.916452911767307e-06,
      "loss": 0.4784,
      "num_input_tokens_seen": 12798240,
      "step": 830
    },
    {
      "epoch": 3.4256410256410255,
      "grad_norm": 0.1552734375,
      "learning_rate": 2.915452880988287e-06,
      "loss": 0.4423,
      "num_input_tokens_seen": 12872608,
      "step": 835
    },
    {
      "epoch": 3.4461538461538463,
      "grad_norm": 0.1396484375,
      "learning_rate": 2.914447074271314e-06,
      "loss": 0.4809,
      "num_input_tokens_seen": 12952896,
      "step": 840
    },
    {
      "epoch": 3.466666666666667,
      "grad_norm": 0.1650390625,
      "learning_rate": 2.913435495720712e-06,
      "loss": 0.5316,
      "num_input_tokens_seen": 13036768,
      "step": 845
    },
    {
      "epoch": 3.4871794871794872,
      "grad_norm": 0.140625,
      "learning_rate": 2.9124181494643574e-06,
      "loss": 0.4592,
      "num_input_tokens_seen": 13114784,
      "step": 850
    },
    {
      "epoch": 3.5076923076923077,
      "grad_norm": 0.138671875,
      "learning_rate": 2.911395039653663e-06,
      "loss": 0.4878,
      "num_input_tokens_seen": 13188448,
      "step": 855
    },
    {
      "epoch": 3.528205128205128,
      "grad_norm": 0.1298828125,
      "learning_rate": 2.9103661704635604e-06,
      "loss": 0.5066,
      "num_input_tokens_seen": 13262592,
      "step": 860
    },
    {
      "epoch": 3.5487179487179485,
      "grad_norm": 0.11962890625,
      "learning_rate": 2.909331546092483e-06,
      "loss": 0.4649,
      "num_input_tokens_seen": 13339936,
      "step": 865
    },
    {
      "epoch": 3.569230769230769,
      "grad_norm": 0.14453125,
      "learning_rate": 2.908291170762349e-06,
      "loss": 0.5233,
      "num_input_tokens_seen": 13416256,
      "step": 870
    },
    {
      "epoch": 3.58974358974359,
      "grad_norm": 0.306640625,
      "learning_rate": 2.9072450487185434e-06,
      "loss": 0.5018,
      "num_input_tokens_seen": 13487392,
      "step": 875
    },
    {
      "epoch": 3.6102564102564103,
      "grad_norm": 0.337890625,
      "learning_rate": 2.9061931842299026e-06,
      "loss": 0.4602,
      "num_input_tokens_seen": 13569984,
      "step": 880
    },
    {
      "epoch": 3.6307692307692307,
      "grad_norm": 0.1640625,
      "learning_rate": 2.9051355815886952e-06,
      "loss": 0.5309,
      "num_input_tokens_seen": 13650944,
      "step": 885
    },
    {
      "epoch": 3.651282051282051,
      "grad_norm": 0.30859375,
      "learning_rate": 2.904072245110605e-06,
      "loss": 0.5186,
      "num_input_tokens_seen": 13720736,
      "step": 890
    },
    {
      "epoch": 3.6717948717948716,
      "grad_norm": 0.333984375,
      "learning_rate": 2.9030031791347136e-06,
      "loss": 0.4839,
      "num_input_tokens_seen": 13791616,
      "step": 895
    },
    {
      "epoch": 3.6923076923076925,
      "grad_norm": 0.25,
      "learning_rate": 2.901928388023483e-06,
      "loss": 0.5199,
      "num_input_tokens_seen": 13867488,
      "step": 900
    },
    {
      "epoch": 3.712820512820513,
      "grad_norm": 0.69921875,
      "learning_rate": 2.900847876162736e-06,
      "loss": 0.5414,
      "num_input_tokens_seen": 13954848,
      "step": 905
    },
    {
      "epoch": 3.7333333333333334,
      "grad_norm": 0.80859375,
      "learning_rate": 2.899761647961641e-06,
      "loss": 0.5451,
      "num_input_tokens_seen": 14037792,
      "step": 910
    },
    {
      "epoch": 3.753846153846154,
      "grad_norm": 1.0,
      "learning_rate": 2.898669707852692e-06,
      "loss": 0.4448,
      "num_input_tokens_seen": 14107520,
      "step": 915
    },
    {
      "epoch": 3.7743589743589743,
      "grad_norm": 0.3125,
      "learning_rate": 2.897572060291692e-06,
      "loss": 0.5213,
      "num_input_tokens_seen": 14193888,
      "step": 920
    },
    {
      "epoch": 3.7948717948717947,
      "grad_norm": 0.5,
      "learning_rate": 2.896468709757733e-06,
      "loss": 0.4968,
      "num_input_tokens_seen": 14270976,
      "step": 925
    },
    {
      "epoch": 3.815384615384615,
      "grad_norm": 0.58203125,
      "learning_rate": 2.8953596607531788e-06,
      "loss": 0.5769,
      "num_input_tokens_seen": 14351232,
      "step": 930
    },
    {
      "epoch": 3.835897435897436,
      "grad_norm": 0.26171875,
      "learning_rate": 2.894244917803647e-06,
      "loss": 0.4925,
      "num_input_tokens_seen": 14426912,
      "step": 935
    },
    {
      "epoch": 3.8564102564102565,
      "grad_norm": 0.333984375,
      "learning_rate": 2.8931244854579904e-06,
      "loss": 0.481,
      "num_input_tokens_seen": 14515776,
      "step": 940
    },
    {
      "epoch": 3.876923076923077,
      "grad_norm": 0.322265625,
      "learning_rate": 2.891998368288277e-06,
      "loss": 0.4699,
      "num_input_tokens_seen": 14587104,
      "step": 945
    },
    {
      "epoch": 3.8974358974358974,
      "grad_norm": 0.341796875,
      "learning_rate": 2.890866570889773e-06,
      "loss": 0.5206,
      "num_input_tokens_seen": 14663680,
      "step": 950
    },
    {
      "epoch": 3.917948717948718,
      "grad_norm": 0.35546875,
      "learning_rate": 2.8897290978809245e-06,
      "loss": 0.5117,
      "num_input_tokens_seen": 14747360,
      "step": 955
    },
    {
      "epoch": 3.9384615384615387,
      "grad_norm": 0.3671875,
      "learning_rate": 2.888585953903336e-06,
      "loss": 0.4891,
      "num_input_tokens_seen": 14826464,
      "step": 960
    },
    {
      "epoch": 3.958974358974359,
      "grad_norm": 0.376953125,
      "learning_rate": 2.8874371436217534e-06,
      "loss": 0.4943,
      "num_input_tokens_seen": 14916416,
      "step": 965
    },
    {
      "epoch": 3.9794871794871796,
      "grad_norm": 0.32421875,
      "learning_rate": 2.8862826717240464e-06,
      "loss": 0.5222,
      "num_input_tokens_seen": 14995072,
      "step": 970
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.337890625,
      "learning_rate": 2.8851225429211855e-06,
      "loss": 0.5197,
      "num_input_tokens_seen": 15070304,
      "step": 975
    },
    {
      "epoch": 4.02051282051282,
      "grad_norm": 0.380859375,
      "learning_rate": 2.883956761947226e-06,
      "loss": 0.501,
      "num_input_tokens_seen": 15152480,
      "step": 980
    },
    {
      "epoch": 4.041025641025641,
      "grad_norm": 0.478515625,
      "learning_rate": 2.8827853335592876e-06,
      "loss": 0.5142,
      "num_input_tokens_seen": 15229184,
      "step": 985
    },
    {
      "epoch": 4.061538461538461,
      "grad_norm": 0.376953125,
      "learning_rate": 2.8816082625375353e-06,
      "loss": 0.5239,
      "num_input_tokens_seen": 15311072,
      "step": 990
    },
    {
      "epoch": 4.082051282051282,
      "grad_norm": 0.380859375,
      "learning_rate": 2.8804255536851584e-06,
      "loss": 0.4823,
      "num_input_tokens_seen": 15383232,
      "step": 995
    },
    {
      "epoch": 4.102564102564102,
      "grad_norm": 0.34375,
      "learning_rate": 2.8792372118283528e-06,
      "loss": 0.5416,
      "num_input_tokens_seen": 15464064,
      "step": 1000
    },
    {
      "epoch": 4.123076923076923,
      "grad_norm": 0.416015625,
      "learning_rate": 2.878043241816301e-06,
      "loss": 0.4889,
      "num_input_tokens_seen": 15536480,
      "step": 1005
    },
    {
      "epoch": 4.143589743589744,
      "grad_norm": 0.3828125,
      "learning_rate": 2.876843648521152e-06,
      "loss": 0.5338,
      "num_input_tokens_seen": 15618816,
      "step": 1010
    },
    {
      "epoch": 4.164102564102564,
      "grad_norm": 0.2578125,
      "learning_rate": 2.8756384368380003e-06,
      "loss": 0.5101,
      "num_input_tokens_seen": 15694304,
      "step": 1015
    },
    {
      "epoch": 4.184615384615385,
      "grad_norm": 0.25390625,
      "learning_rate": 2.874427611684867e-06,
      "loss": 0.4792,
      "num_input_tokens_seen": 15770784,
      "step": 1020
    },
    {
      "epoch": 4.205128205128205,
      "grad_norm": 0.27734375,
      "learning_rate": 2.8732111780026813e-06,
      "loss": 0.4959,
      "num_input_tokens_seen": 15837312,
      "step": 1025
    },
    {
      "epoch": 4.225641025641026,
      "grad_norm": 0.22265625,
      "learning_rate": 2.871989140755257e-06,
      "loss": 0.5227,
      "num_input_tokens_seen": 15908096,
      "step": 1030
    },
    {
      "epoch": 4.246153846153846,
      "grad_norm": 0.2392578125,
      "learning_rate": 2.870761504929275e-06,
      "loss": 0.473,
      "num_input_tokens_seen": 15982720,
      "step": 1035
    },
    {
      "epoch": 4.266666666666667,
      "grad_norm": 0.1943359375,
      "learning_rate": 2.869528275534261e-06,
      "loss": 0.4911,
      "num_input_tokens_seen": 16056256,
      "step": 1040
    },
    {
      "epoch": 4.287179487179487,
      "grad_norm": 0.1787109375,
      "learning_rate": 2.8682894576025677e-06,
      "loss": 0.4678,
      "num_input_tokens_seen": 16128256,
      "step": 1045
    },
    {
      "epoch": 4.3076923076923075,
      "grad_norm": 0.1650390625,
      "learning_rate": 2.8670450561893498e-06,
      "loss": 0.4534,
      "num_input_tokens_seen": 16203808,
      "step": 1050
    },
    {
      "epoch": 4.328205128205128,
      "grad_norm": 0.1796875,
      "learning_rate": 2.865795076372549e-06,
      "loss": 0.5788,
      "num_input_tokens_seen": 16290464,
      "step": 1055
    },
    {
      "epoch": 4.348717948717948,
      "grad_norm": 0.1845703125,
      "learning_rate": 2.8645395232528683e-06,
      "loss": 0.4744,
      "num_input_tokens_seen": 16362688,
      "step": 1060
    },
    {
      "epoch": 4.36923076923077,
      "grad_norm": 0.146484375,
      "learning_rate": 2.863278401953754e-06,
      "loss": 0.4801,
      "num_input_tokens_seen": 16434976,
      "step": 1065
    },
    {
      "epoch": 4.38974358974359,
      "grad_norm": 0.134765625,
      "learning_rate": 2.862011717621375e-06,
      "loss": 0.5035,
      "num_input_tokens_seen": 16514880,
      "step": 1070
    },
    {
      "epoch": 4.410256410256411,
      "grad_norm": 0.15625,
      "learning_rate": 2.860739475424599e-06,
      "loss": 0.5456,
      "num_input_tokens_seen": 16592544,
      "step": 1075
    },
    {
      "epoch": 4.430769230769231,
      "grad_norm": 0.125,
      "learning_rate": 2.859461680554975e-06,
      "loss": 0.4773,
      "num_input_tokens_seen": 16675360,
      "step": 1080
    },
    {
      "epoch": 4.4512820512820515,
      "grad_norm": 0.1103515625,
      "learning_rate": 2.858178338226709e-06,
      "loss": 0.4793,
      "num_input_tokens_seen": 16753728,
      "step": 1085
    },
    {
      "epoch": 4.471794871794872,
      "grad_norm": 0.11572265625,
      "learning_rate": 2.8568894536766462e-06,
      "loss": 0.4698,
      "num_input_tokens_seen": 16829696,
      "step": 1090
    },
    {
      "epoch": 4.492307692307692,
      "grad_norm": 0.19140625,
      "learning_rate": 2.8555950321642444e-06,
      "loss": 0.4648,
      "num_input_tokens_seen": 16908128,
      "step": 1095
    },
    {
      "epoch": 4.512820512820513,
      "grad_norm": 0.2333984375,
      "learning_rate": 2.8542950789715587e-06,
      "loss": 0.4473,
      "num_input_tokens_seen": 16979136,
      "step": 1100
    },
    {
      "epoch": 4.533333333333333,
      "grad_norm": 0.216796875,
      "learning_rate": 2.8529895994032153e-06,
      "loss": 0.5128,
      "num_input_tokens_seen": 17063296,
      "step": 1105
    },
    {
      "epoch": 4.553846153846154,
      "grad_norm": 0.15234375,
      "learning_rate": 2.851678598786392e-06,
      "loss": 0.4834,
      "num_input_tokens_seen": 17145984,
      "step": 1110
    },
    {
      "epoch": 4.574358974358974,
      "grad_norm": 0.296875,
      "learning_rate": 2.8503620824707946e-06,
      "loss": 0.4581,
      "num_input_tokens_seen": 17221152,
      "step": 1115
    },
    {
      "epoch": 4.5948717948717945,
      "grad_norm": 0.267578125,
      "learning_rate": 2.8490400558286395e-06,
      "loss": 0.5189,
      "num_input_tokens_seen": 17298688,
      "step": 1120
    },
    {
      "epoch": 4.615384615384615,
      "grad_norm": 0.302734375,
      "learning_rate": 2.847712524254626e-06,
      "loss": 0.4739,
      "num_input_tokens_seen": 17374240,
      "step": 1125
    },
    {
      "epoch": 4.635897435897435,
      "grad_norm": 0.6640625,
      "learning_rate": 2.846379493165918e-06,
      "loss": 0.5376,
      "num_input_tokens_seen": 17451360,
      "step": 1130
    },
    {
      "epoch": 4.656410256410257,
      "grad_norm": 0.82421875,
      "learning_rate": 2.8450409680021204e-06,
      "loss": 0.5152,
      "num_input_tokens_seen": 17535776,
      "step": 1135
    },
    {
      "epoch": 4.676923076923077,
      "grad_norm": 0.90234375,
      "learning_rate": 2.8436969542252576e-06,
      "loss": 0.4957,
      "num_input_tokens_seen": 17616256,
      "step": 1140
    },
    {
      "epoch": 4.697435897435898,
      "grad_norm": 0.31640625,
      "learning_rate": 2.842347457319752e-06,
      "loss": 0.4963,
      "num_input_tokens_seen": 17691616,
      "step": 1145
    },
    {
      "epoch": 4.717948717948718,
      "grad_norm": 0.5078125,
      "learning_rate": 2.8409924827923985e-06,
      "loss": 0.4868,
      "num_input_tokens_seen": 17767264,
      "step": 1150
    },
    {
      "epoch": 4.7384615384615385,
      "grad_norm": 0.36328125,
      "learning_rate": 2.839632036172346e-06,
      "loss": 0.5595,
      "num_input_tokens_seen": 17847648,
      "step": 1155
    },
    {
      "epoch": 4.758974358974359,
      "grad_norm": 0.296875,
      "learning_rate": 2.8382661230110716e-06,
      "loss": 0.6248,
      "num_input_tokens_seen": 17932992,
      "step": 1160
    },
    {
      "epoch": 4.779487179487179,
      "grad_norm": 0.326171875,
      "learning_rate": 2.8368947488823613e-06,
      "loss": 0.4935,
      "num_input_tokens_seen": 18008544,
      "step": 1165
    },
    {
      "epoch": 4.8,
      "grad_norm": 0.3203125,
      "learning_rate": 2.8355179193822834e-06,
      "loss": 0.472,
      "num_input_tokens_seen": 18091904,
      "step": 1170
    },
    {
      "epoch": 4.82051282051282,
      "grad_norm": 0.396484375,
      "learning_rate": 2.834135640129168e-06,
      "loss": 0.4618,
      "num_input_tokens_seen": 18169760,
      "step": 1175
    },
    {
      "epoch": 4.841025641025641,
      "grad_norm": 0.2890625,
      "learning_rate": 2.8327479167635834e-06,
      "loss": 0.4375,
      "num_input_tokens_seen": 18242016,
      "step": 1180
    },
    {
      "epoch": 4.861538461538462,
      "grad_norm": 0.314453125,
      "learning_rate": 2.831354754948315e-06,
      "loss": 0.4954,
      "num_input_tokens_seen": 18316192,
      "step": 1185
    },
    {
      "epoch": 4.8820512820512825,
      "grad_norm": 0.330078125,
      "learning_rate": 2.829956160368338e-06,
      "loss": 0.4885,
      "num_input_tokens_seen": 18402720,
      "step": 1190
    },
    {
      "epoch": 4.902564102564103,
      "grad_norm": 0.482421875,
      "learning_rate": 2.828552138730798e-06,
      "loss": 0.4452,
      "num_input_tokens_seen": 18472768,
      "step": 1195
    },
    {
      "epoch": 4.923076923076923,
      "grad_norm": 0.33984375,
      "learning_rate": 2.8271426957649868e-06,
      "loss": 0.4602,
      "num_input_tokens_seen": 18548128,
      "step": 1200
    },
    {
      "epoch": 4.943589743589744,
      "grad_norm": 0.412109375,
      "learning_rate": 2.8257278372223177e-06,
      "loss": 0.4391,
      "num_input_tokens_seen": 18622112,
      "step": 1205
    },
    {
      "epoch": 4.964102564102564,
      "grad_norm": 0.484375,
      "learning_rate": 2.824307568876304e-06,
      "loss": 0.4614,
      "num_input_tokens_seen": 18704288,
      "step": 1210
    },
    {
      "epoch": 4.984615384615385,
      "grad_norm": 0.35546875,
      "learning_rate": 2.8228818965225326e-06,
      "loss": 0.5284,
      "num_input_tokens_seen": 18780128,
      "step": 1215
    },
    {
      "epoch": 5.005128205128205,
      "grad_norm": 0.326171875,
      "learning_rate": 2.8214508259786443e-06,
      "loss": 0.5213,
      "num_input_tokens_seen": 18850496,
      "step": 1220
    },
    {
      "epoch": 5.0256410256410255,
      "grad_norm": 0.333984375,
      "learning_rate": 2.820014363084307e-06,
      "loss": 0.5071,
      "num_input_tokens_seen": 18926816,
      "step": 1225
    },
    {
      "epoch": 5.046153846153846,
      "grad_norm": 0.341796875,
      "learning_rate": 2.8185725137011922e-06,
      "loss": 0.4964,
      "num_input_tokens_seen": 19002624,
      "step": 1230
    },
    {
      "epoch": 5.066666666666666,
      "grad_norm": 0.3359375,
      "learning_rate": 2.8171252837129523e-06,
      "loss": 0.5196,
      "num_input_tokens_seen": 19083296,
      "step": 1235
    },
    {
      "epoch": 5.087179487179487,
      "grad_norm": 0.28125,
      "learning_rate": 2.815672679025196e-06,
      "loss": 0.5272,
      "num_input_tokens_seen": 19158048,
      "step": 1240
    },
    {
      "epoch": 5.107692307692307,
      "grad_norm": 0.26171875,
      "learning_rate": 2.814214705565464e-06,
      "loss": 0.5034,
      "num_input_tokens_seen": 19233888,
      "step": 1245
    },
    {
      "epoch": 5.128205128205128,
      "grad_norm": 0.2890625,
      "learning_rate": 2.8127513692832047e-06,
      "loss": 0.5069,
      "num_input_tokens_seen": 19317472,
      "step": 1250
    },
    {
      "epoch": 5.148717948717949,
      "grad_norm": 0.2265625,
      "learning_rate": 2.8112826761497507e-06,
      "loss": 0.5116,
      "num_input_tokens_seen": 19398496,
      "step": 1255
    },
    {
      "epoch": 5.1692307692307695,
      "grad_norm": 0.21484375,
      "learning_rate": 2.8098086321582937e-06,
      "loss": 0.4286,
      "num_input_tokens_seen": 19466400,
      "step": 1260
    },
    {
      "epoch": 5.18974358974359,
      "grad_norm": 0.2119140625,
      "learning_rate": 2.8083292433238602e-06,
      "loss": 0.5058,
      "num_input_tokens_seen": 19550336,
      "step": 1265
    },
    {
      "epoch": 5.21025641025641,
      "grad_norm": 0.2177734375,
      "learning_rate": 2.8068445156832864e-06,
      "loss": 0.4587,
      "num_input_tokens_seen": 19625792,
      "step": 1270
    },
    {
      "epoch": 5.230769230769231,
      "grad_norm": 0.1943359375,
      "learning_rate": 2.805354455295196e-06,
      "loss": 0.4901,
      "num_input_tokens_seen": 19711776,
      "step": 1275
    },
    {
      "epoch": 5.251282051282051,
      "grad_norm": 0.173828125,
      "learning_rate": 2.8038590682399718e-06,
      "loss": 0.5474,
      "num_input_tokens_seen": 19794208,
      "step": 1280
    },
    {
      "epoch": 5.271794871794872,
      "grad_norm": 0.1611328125,
      "learning_rate": 2.8023583606197336e-06,
      "loss": 0.4452,
      "num_input_tokens_seen": 19872128,
      "step": 1285
    },
    {
      "epoch": 5.292307692307692,
      "grad_norm": 0.1357421875,
      "learning_rate": 2.800852338558312e-06,
      "loss": 0.5081,
      "num_input_tokens_seen": 19953856,
      "step": 1290
    },
    {
      "epoch": 5.312820512820513,
      "grad_norm": 0.1376953125,
      "learning_rate": 2.7993410082012247e-06,
      "loss": 0.4863,
      "num_input_tokens_seen": 20026848,
      "step": 1295
    },
    {
      "epoch": 5.333333333333333,
      "grad_norm": 0.138671875,
      "learning_rate": 2.79782437571565e-06,
      "loss": 0.4979,
      "num_input_tokens_seen": 20100928,
      "step": 1300
    },
    {
      "epoch": 5.3538461538461535,
      "grad_norm": 0.1318359375,
      "learning_rate": 2.7963024472904013e-06,
      "loss": 0.4676,
      "num_input_tokens_seen": 20173504,
      "step": 1305
    },
    {
      "epoch": 5.374358974358975,
      "grad_norm": 0.12451171875,
      "learning_rate": 2.7947752291359053e-06,
      "loss": 0.4256,
      "num_input_tokens_seen": 20244608,
      "step": 1310
    },
    {
      "epoch": 5.394871794871795,
      "grad_norm": 0.130859375,
      "learning_rate": 2.7932427274841715e-06,
      "loss": 0.4576,
      "num_input_tokens_seen": 20324992,
      "step": 1315
    },
    {
      "epoch": 5.415384615384616,
      "grad_norm": 0.1376953125,
      "learning_rate": 2.7917049485887705e-06,
      "loss": 0.5155,
      "num_input_tokens_seen": 20402304,
      "step": 1320
    },
    {
      "epoch": 5.435897435897436,
      "grad_norm": 0.20703125,
      "learning_rate": 2.790161898724808e-06,
      "loss": 0.4304,
      "num_input_tokens_seen": 20480800,
      "step": 1325
    },
    {
      "epoch": 5.456410256410257,
      "grad_norm": 0.138671875,
      "learning_rate": 2.7886135841888973e-06,
      "loss": 0.4759,
      "num_input_tokens_seen": 20560096,
      "step": 1330
    },
    {
      "epoch": 5.476923076923077,
      "grad_norm": 0.15234375,
      "learning_rate": 2.787060011299135e-06,
      "loss": 0.4974,
      "num_input_tokens_seen": 20645216,
      "step": 1335
    },
    {
      "epoch": 5.4974358974358974,
      "grad_norm": 0.341796875,
      "learning_rate": 2.785501186395077e-06,
      "loss": 0.5174,
      "num_input_tokens_seen": 20738688,
      "step": 1340
    },
    {
      "epoch": 5.517948717948718,
      "grad_norm": 0.271484375,
      "learning_rate": 2.7839371158377077e-06,
      "loss": 0.5272,
      "num_input_tokens_seen": 20812928,
      "step": 1345
    },
    {
      "epoch": 5.538461538461538,
      "grad_norm": 0.296875,
      "learning_rate": 2.78236780600942e-06,
      "loss": 0.5129,
      "num_input_tokens_seen": 20890592,
      "step": 1350
    },
    {
      "epoch": 5.558974358974359,
      "grad_norm": 0.57421875,
      "learning_rate": 2.780793263313984e-06,
      "loss": 0.484,
      "num_input_tokens_seen": 20961984,
      "step": 1355
    },
    {
      "epoch": 5.579487179487179,
      "grad_norm": 0.8671875,
      "learning_rate": 2.7792134941765247e-06,
      "loss": 0.4793,
      "num_input_tokens_seen": 21030784,
      "step": 1360
    },
    {
      "epoch": 5.6,
      "grad_norm": 0.267578125,
      "learning_rate": 2.7776285050434937e-06,
      "loss": 0.4521,
      "num_input_tokens_seen": 21108960,
      "step": 1365
    },
    {
      "epoch": 5.62051282051282,
      "grad_norm": 0.267578125,
      "learning_rate": 2.7760383023826425e-06,
      "loss": 0.4192,
      "num_input_tokens_seen": 21181728,
      "step": 1370
    },
    {
      "epoch": 5.641025641025641,
      "grad_norm": 0.64453125,
      "learning_rate": 2.7744428926829993e-06,
      "loss": 0.5131,
      "num_input_tokens_seen": 21255328,
      "step": 1375
    },
    {
      "epoch": 5.661538461538462,
      "grad_norm": 0.271484375,
      "learning_rate": 2.7728422824548387e-06,
      "loss": 0.483,
      "num_input_tokens_seen": 21324064,
      "step": 1380
    },
    {
      "epoch": 5.682051282051282,
      "grad_norm": 0.341796875,
      "learning_rate": 2.7712364782296567e-06,
      "loss": 0.4858,
      "num_input_tokens_seen": 21399040,
      "step": 1385
    },
    {
      "epoch": 5.702564102564103,
      "grad_norm": 0.318359375,
      "learning_rate": 2.769625486560145e-06,
      "loss": 0.4629,
      "num_input_tokens_seen": 21472640,
      "step": 1390
    },
    {
      "epoch": 5.723076923076923,
      "grad_norm": 0.39453125,
      "learning_rate": 2.7680093140201625e-06,
      "loss": 0.5023,
      "num_input_tokens_seen": 21544448,
      "step": 1395
    },
    {
      "epoch": 5.743589743589744,
      "grad_norm": 0.28515625,
      "learning_rate": 2.766387967204709e-06,
      "loss": 0.4903,
      "num_input_tokens_seen": 21611136,
      "step": 1400
    },
    {
      "epoch": 5.764102564102564,
      "grad_norm": 0.2578125,
      "learning_rate": 2.7647614527299007e-06,
      "loss": 0.5558,
      "num_input_tokens_seen": 21703040,
      "step": 1405
    },
    {
      "epoch": 5.7846153846153845,
      "grad_norm": 0.45703125,
      "learning_rate": 2.763129777232938e-06,
      "loss": 0.5612,
      "num_input_tokens_seen": 21784096,
      "step": 1410
    },
    {
      "epoch": 5.805128205128205,
      "grad_norm": 0.333984375,
      "learning_rate": 2.7614929473720847e-06,
      "loss": 0.4683,
      "num_input_tokens_seen": 21855072,
      "step": 1415
    },
    {
      "epoch": 5.825641025641025,
      "grad_norm": 0.38671875,
      "learning_rate": 2.7598509698266346e-06,
      "loss": 0.5171,
      "num_input_tokens_seen": 21933312,
      "step": 1420
    },
    {
      "epoch": 5.846153846153846,
      "grad_norm": 0.359375,
      "learning_rate": 2.758203851296889e-06,
      "loss": 0.5214,
      "num_input_tokens_seen": 22019008,
      "step": 1425
    },
    {
      "epoch": 5.866666666666667,
      "grad_norm": 0.326171875,
      "learning_rate": 2.756551598504128e-06,
      "loss": 0.4975,
      "num_input_tokens_seen": 22092864,
      "step": 1430
    },
    {
      "epoch": 5.887179487179488,
      "grad_norm": 0.392578125,
      "learning_rate": 2.7548942181905816e-06,
      "loss": 0.4853,
      "num_input_tokens_seen": 22171584,
      "step": 1435
    },
    {
      "epoch": 5.907692307692308,
| "grad_norm": 0.375, | |
| "learning_rate": 2.753231717119405e-06, | |
| "loss": 0.483, | |
| "num_input_tokens_seen": 22241376, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 5.9282051282051285, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 2.751564102074646e-06, | |
| "loss": 0.4965, | |
| "num_input_tokens_seen": 22313664, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 5.948717948717949, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 2.749891379861225e-06, | |
| "loss": 0.5342, | |
| "num_input_tokens_seen": 22397408, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 5.969230769230769, | |
| "grad_norm": 0.3359375, | |
| "learning_rate": 2.748213557304899e-06, | |
| "loss": 0.4233, | |
| "num_input_tokens_seen": 22473664, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 5.98974358974359, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 2.74653064125224e-06, | |
| "loss": 0.5244, | |
| "num_input_tokens_seen": 22553760, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 6.01025641025641, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 2.7448426385706036e-06, | |
| "loss": 0.5211, | |
| "num_input_tokens_seen": 22624608, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 6.030769230769231, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 2.7431495561481027e-06, | |
| "loss": 0.5618, | |
| "num_input_tokens_seen": 22710048, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 6.051282051282051, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 2.741451400893578e-06, | |
| "loss": 0.5172, | |
| "num_input_tokens_seen": 22787392, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 6.0717948717948715, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 2.739748179736571e-06, | |
| "loss": 0.5035, | |
| "num_input_tokens_seen": 22865120, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 6.092307692307692, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 2.7380398996272955e-06, | |
| "loss": 0.519, | |
| "num_input_tokens_seen": 22952832, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 6.112820512820512, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 2.736326567536609e-06, | |
| "loss": 0.4438, | |
| "num_input_tokens_seen": 23028544, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 6.133333333333334, | |
| "grad_norm": 0.2021484375, | |
| "learning_rate": 2.7346081904559827e-06, | |
| "loss": 0.4669, | |
| "num_input_tokens_seen": 23100096, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 6.153846153846154, | |
| "grad_norm": 0.1533203125, | |
| "learning_rate": 2.732884775397477e-06, | |
| "loss": 0.4702, | |
| "num_input_tokens_seen": 23183392, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 6.174358974358975, | |
| "grad_norm": 0.1533203125, | |
| "learning_rate": 2.731156329393709e-06, | |
| "loss": 0.5031, | |
| "num_input_tokens_seen": 23266208, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 6.194871794871795, | |
| "grad_norm": 0.1484375, | |
| "learning_rate": 2.729422859497825e-06, | |
| "loss": 0.5005, | |
| "num_input_tokens_seen": 23348064, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 6.2153846153846155, | |
| "grad_norm": 0.1552734375, | |
| "learning_rate": 2.7276843727834727e-06, | |
| "loss": 0.4798, | |
| "num_input_tokens_seen": 23420128, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 6.235897435897436, | |
| "grad_norm": 0.1337890625, | |
| "learning_rate": 2.725940876344771e-06, | |
| "loss": 0.5059, | |
| "num_input_tokens_seen": 23497056, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 6.256410256410256, | |
| "grad_norm": 0.1376953125, | |
| "learning_rate": 2.7241923772962823e-06, | |
| "loss": 0.4582, | |
| "num_input_tokens_seen": 23564928, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 6.276923076923077, | |
| "grad_norm": 0.1220703125, | |
| "learning_rate": 2.722438882772982e-06, | |
| "loss": 0.4295, | |
| "num_input_tokens_seen": 23646624, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 6.297435897435897, | |
| "grad_norm": 0.1484375, | |
| "learning_rate": 2.720680399930231e-06, | |
| "loss": 0.4682, | |
| "num_input_tokens_seen": 23716960, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 6.317948717948718, | |
| "grad_norm": 0.1611328125, | |
| "learning_rate": 2.7189169359437443e-06, | |
| "loss": 0.4944, | |
| "num_input_tokens_seen": 23796032, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 6.338461538461538, | |
| "grad_norm": 0.1318359375, | |
| "learning_rate": 2.7171484980095653e-06, | |
| "loss": 0.4405, | |
| "num_input_tokens_seen": 23868768, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 6.358974358974359, | |
| "grad_norm": 0.1953125, | |
| "learning_rate": 2.715375093344032e-06, | |
| "loss": 0.4742, | |
| "num_input_tokens_seen": 23937824, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 6.37948717948718, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 2.713596729183751e-06, | |
| "loss": 0.4654, | |
| "num_input_tokens_seen": 24009472, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 6.4, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 2.7118134127855667e-06, | |
| "loss": 0.4686, | |
| "num_input_tokens_seen": 24096256, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 6.420512820512821, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 2.7100251514265317e-06, | |
| "loss": 0.5152, | |
| "num_input_tokens_seen": 24180640, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 6.441025641025641, | |
| "grad_norm": 0.392578125, | |
| "learning_rate": 2.7082319524038764e-06, | |
| "loss": 0.4762, | |
| "num_input_tokens_seen": 24251296, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 6.461538461538462, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 2.706433823034981e-06, | |
| "loss": 0.5113, | |
| "num_input_tokens_seen": 24329760, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 6.482051282051282, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 2.7046307706573445e-06, | |
| "loss": 0.4942, | |
| "num_input_tokens_seen": 24420896, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 6.5025641025641026, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 2.702822802628554e-06, | |
| "loss": 0.475, | |
| "num_input_tokens_seen": 24495360, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 6.523076923076923, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 2.701009926326256e-06, | |
| "loss": 0.4801, | |
| "num_input_tokens_seen": 24572192, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 6.543589743589743, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 2.6991921491481267e-06, | |
| "loss": 0.4776, | |
| "num_input_tokens_seen": 24647552, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 6.564102564102564, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 2.6973694785118394e-06, | |
| "loss": 0.4878, | |
| "num_input_tokens_seen": 24719136, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 6.584615384615384, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 2.695541921855037e-06, | |
| "loss": 0.5138, | |
| "num_input_tokens_seen": 24800320, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 6.605128205128205, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 2.6937094866353006e-06, | |
| "loss": 0.4782, | |
| "num_input_tokens_seen": 24877088, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 6.625641025641025, | |
| "grad_norm": 0.498046875, | |
| "learning_rate": 2.6918721803301174e-06, | |
| "loss": 0.5043, | |
| "num_input_tokens_seen": 24954272, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 6.6461538461538465, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 2.690030010436853e-06, | |
| "loss": 0.4237, | |
| "num_input_tokens_seen": 25023744, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 6.666666666666667, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 2.688182984472719e-06, | |
| "loss": 0.5302, | |
| "num_input_tokens_seen": 25105664, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 6.687179487179487, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 2.686331109974743e-06, | |
| "loss": 0.4991, | |
| "num_input_tokens_seen": 25183680, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 6.707692307692308, | |
| "grad_norm": 0.443359375, | |
| "learning_rate": 2.684474394499738e-06, | |
| "loss": 0.5142, | |
| "num_input_tokens_seen": 25265920, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 6.728205128205128, | |
| "grad_norm": 0.4609375, | |
| "learning_rate": 2.6826128456242708e-06, | |
| "loss": 0.4651, | |
| "num_input_tokens_seen": 25343648, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 6.748717948717949, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 2.680746470944631e-06, | |
| "loss": 0.5633, | |
| "num_input_tokens_seen": 25418176, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 6.769230769230769, | |
| "grad_norm": 0.375, | |
| "learning_rate": 2.6788752780768007e-06, | |
| "loss": 0.5124, | |
| "num_input_tokens_seen": 25504832, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 6.78974358974359, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 2.6769992746564256e-06, | |
| "loss": 0.5046, | |
| "num_input_tokens_seen": 25582112, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 6.81025641025641, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 2.6751184683387777e-06, | |
| "loss": 0.484, | |
| "num_input_tokens_seen": 25656992, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 6.8307692307692305, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 2.67323286679873e-06, | |
| "loss": 0.4526, | |
| "num_input_tokens_seen": 25729600, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 6.851282051282051, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 2.671342477730723e-06, | |
| "loss": 0.4563, | |
| "num_input_tokens_seen": 25801536, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 6.871794871794872, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 2.6694473088487324e-06, | |
| "loss": 0.4951, | |
| "num_input_tokens_seen": 25882912, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 6.892307692307693, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 2.6675473678862403e-06, | |
| "loss": 0.5223, | |
| "num_input_tokens_seen": 25957952, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 6.912820512820513, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 2.6656426625961993e-06, | |
| "loss": 0.5471, | |
| "num_input_tokens_seen": 26034432, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 6.933333333333334, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 2.6637332007510063e-06, | |
| "loss": 0.4252, | |
| "num_input_tokens_seen": 26106656, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 6.953846153846154, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 2.661818990142465e-06, | |
| "loss": 0.5269, | |
| "num_input_tokens_seen": 26186976, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 6.9743589743589745, | |
| "grad_norm": 0.25, | |
| "learning_rate": 2.65990003858176e-06, | |
| "loss": 0.4487, | |
| "num_input_tokens_seen": 26259264, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 6.994871794871795, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 2.6579763538994197e-06, | |
| "loss": 0.4705, | |
| "num_input_tokens_seen": 26333248, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 7.015384615384615, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 2.656047943945287e-06, | |
| "loss": 0.4443, | |
| "num_input_tokens_seen": 26404832, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 7.035897435897436, | |
| "grad_norm": 0.1865234375, | |
| "learning_rate": 2.6541148165884885e-06, | |
| "loss": 0.4615, | |
| "num_input_tokens_seen": 26484608, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 7.056410256410256, | |
| "grad_norm": 0.1943359375, | |
| "learning_rate": 2.652176979717399e-06, | |
| "loss": 0.5042, | |
| "num_input_tokens_seen": 26556224, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 7.076923076923077, | |
| "grad_norm": 0.16796875, | |
| "learning_rate": 2.6502344412396116e-06, | |
| "loss": 0.4601, | |
| "num_input_tokens_seen": 26629632, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 7.097435897435897, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 2.6482872090819053e-06, | |
| "loss": 0.534, | |
| "num_input_tokens_seen": 26709568, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 7.1179487179487175, | |
| "grad_norm": 0.201171875, | |
| "learning_rate": 2.646335291190211e-06, | |
| "loss": 0.4875, | |
| "num_input_tokens_seen": 26785728, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 7.138461538461539, | |
| "grad_norm": 0.150390625, | |
| "learning_rate": 2.6443786955295827e-06, | |
| "loss": 0.5223, | |
| "num_input_tokens_seen": 26865024, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 7.158974358974359, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 2.6424174300841606e-06, | |
| "loss": 0.4365, | |
| "num_input_tokens_seen": 26934720, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 7.17948717948718, | |
| "grad_norm": 0.138671875, | |
| "learning_rate": 2.6404515028571406e-06, | |
| "loss": 0.4951, | |
| "num_input_tokens_seen": 27008192, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 7.2, | |
| "grad_norm": 0.11962890625, | |
| "learning_rate": 2.638480921870743e-06, | |
| "loss": 0.5132, | |
| "num_input_tokens_seen": 27092000, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 7.220512820512821, | |
| "grad_norm": 0.1298828125, | |
| "learning_rate": 2.636505695166177e-06, | |
| "loss": 0.4713, | |
| "num_input_tokens_seen": 27172160, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 7.241025641025641, | |
| "grad_norm": 0.16015625, | |
| "learning_rate": 2.63452583080361e-06, | |
| "loss": 0.479, | |
| "num_input_tokens_seen": 27255712, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 7.2615384615384615, | |
| "grad_norm": 0.1396484375, | |
| "learning_rate": 2.6325413368621337e-06, | |
| "loss": 0.4967, | |
| "num_input_tokens_seen": 27343136, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 7.282051282051282, | |
| "grad_norm": 0.2470703125, | |
| "learning_rate": 2.630552221439732e-06, | |
| "loss": 0.4843, | |
| "num_input_tokens_seen": 27417312, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 7.302564102564102, | |
| "grad_norm": 0.1396484375, | |
| "learning_rate": 2.6285584926532465e-06, | |
| "loss": 0.4738, | |
| "num_input_tokens_seen": 27505824, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 7.323076923076923, | |
| "grad_norm": 0.2890625, | |
| "learning_rate": 2.626560158638344e-06, | |
| "loss": 0.5716, | |
| "num_input_tokens_seen": 27583776, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 7.343589743589743, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 2.6245572275494845e-06, | |
| "loss": 0.515, | |
| "num_input_tokens_seen": 27658912, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 7.364102564102564, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 2.6225497075598865e-06, | |
| "loss": 0.47, | |
| "num_input_tokens_seen": 27733472, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 7.384615384615385, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 2.6205376068614943e-06, | |
| "loss": 0.4749, | |
| "num_input_tokens_seen": 27812160, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 7.4051282051282055, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 2.6185209336649438e-06, | |
| "loss": 0.4727, | |
| "num_input_tokens_seen": 27885024, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 7.425641025641026, | |
| "grad_norm": 0.75, | |
| "learning_rate": 2.61649969619953e-06, | |
| "loss": 0.478, | |
| "num_input_tokens_seen": 27956480, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 7.446153846153846, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 2.614473902713173e-06, | |
| "loss": 0.4778, | |
| "num_input_tokens_seen": 28028032, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 7.466666666666667, | |
| "grad_norm": 0.49609375, | |
| "learning_rate": 2.612443561472385e-06, | |
| "loss": 0.4443, | |
| "num_input_tokens_seen": 28112992, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 7.487179487179487, | |
| "grad_norm": 0.5, | |
| "learning_rate": 2.610408680762234e-06, | |
| "loss": 0.5186, | |
| "num_input_tokens_seen": 28191520, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 7.507692307692308, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 2.6083692688863135e-06, | |
| "loss": 0.5152, | |
| "num_input_tokens_seen": 28277440, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 7.528205128205128, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 2.6063253341667064e-06, | |
| "loss": 0.5173, | |
| "num_input_tokens_seen": 28357440, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 7.5487179487179485, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 2.604276884943953e-06, | |
| "loss": 0.4585, | |
| "num_input_tokens_seen": 28426656, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 7.569230769230769, | |
| "grad_norm": 0.279296875, | |
| "learning_rate": 2.602223929577013e-06, | |
| "loss": 0.4611, | |
| "num_input_tokens_seen": 28499968, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 7.589743589743589, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 2.6001664764432363e-06, | |
| "loss": 0.4929, | |
| "num_input_tokens_seen": 28573664, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 7.61025641025641, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 2.5981045339383244e-06, | |
| "loss": 0.5018, | |
| "num_input_tokens_seen": 28658144, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 7.63076923076923, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 2.596038110476301e-06, | |
| "loss": 0.483, | |
| "num_input_tokens_seen": 28730944, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 7.651282051282052, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 2.593967214489473e-06, | |
| "loss": 0.5111, | |
| "num_input_tokens_seen": 28816384, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 7.671794871794872, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 2.591891854428398e-06, | |
| "loss": 0.4689, | |
| "num_input_tokens_seen": 28891616, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 7.6923076923076925, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 2.5898120387618507e-06, | |
| "loss": 0.4917, | |
| "num_input_tokens_seen": 28970400, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 7.712820512820513, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 2.587727775976787e-06, | |
| "loss": 0.4956, | |
| "num_input_tokens_seen": 29051520, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 7.733333333333333, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 2.585639074578309e-06, | |
| "loss": 0.438, | |
| "num_input_tokens_seen": 29128544, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 7.753846153846154, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 2.5835459430896333e-06, | |
| "loss": 0.4644, | |
| "num_input_tokens_seen": 29210496, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 7.774358974358974, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 2.5814483900520522e-06, | |
| "loss": 0.4901, | |
| "num_input_tokens_seen": 29282400, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 7.794871794871795, | |
| "grad_norm": 0.4140625, | |
| "learning_rate": 2.5793464240249014e-06, | |
| "loss": 0.4879, | |
| "num_input_tokens_seen": 29352256, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 7.815384615384615, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 2.5772400535855242e-06, | |
| "loss": 0.4552, | |
| "num_input_tokens_seen": 29426336, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 7.835897435897436, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 2.575129287329237e-06, | |
| "loss": 0.5417, | |
| "num_input_tokens_seen": 29512224, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 7.856410256410256, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 2.5730141338692926e-06, | |
| "loss": 0.4637, | |
| "num_input_tokens_seen": 29590112, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 7.876923076923077, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 2.5708946018368487e-06, | |
| "loss": 0.4486, | |
| "num_input_tokens_seen": 29672608, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 7.897435897435898, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 2.568770699880928e-06, | |
| "loss": 0.5094, | |
| "num_input_tokens_seen": 29755520, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 7.917948717948718, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 2.566642436668387e-06, | |
| "loss": 0.5111, | |
| "num_input_tokens_seen": 29833344, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 7.938461538461539, | |
| "grad_norm": 0.1923828125, | |
| "learning_rate": 2.5645098208838774e-06, | |
| "loss": 0.4737, | |
| "num_input_tokens_seen": 29904800, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 7.958974358974359, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 2.562372861229813e-06, | |
| "loss": 0.4384, | |
| "num_input_tokens_seen": 29975488, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 7.97948717948718, | |
| "grad_norm": 0.197265625, | |
| "learning_rate": 2.5602315664263337e-06, | |
| "loss": 0.4383, | |
| "num_input_tokens_seen": 30046496, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 0.1611328125, | |
| "learning_rate": 2.5580859452112685e-06, | |
| "loss": 0.4782, | |
| "num_input_tokens_seen": 30119840, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 8.02051282051282, | |
| "grad_norm": 0.1484375, | |
| "learning_rate": 2.555936006340101e-06, | |
| "loss": 0.5371, | |
| "num_input_tokens_seen": 30207040, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 8.04102564102564, | |
| "grad_norm": 0.1376953125, | |
| "learning_rate": 2.553781758585935e-06, | |
| "loss": 0.4867, | |
| "num_input_tokens_seen": 30283968, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 8.061538461538461, | |
| "grad_norm": 0.1376953125, | |
| "learning_rate": 2.551623210739455e-06, | |
| "loss": 0.4309, | |
| "num_input_tokens_seen": 30355552, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 8.082051282051282, | |
| "grad_norm": 0.12109375, | |
| "learning_rate": 2.549460371608895e-06, | |
| "loss": 0.5087, | |
| "num_input_tokens_seen": 30435776, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 8.102564102564102, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 2.5472932500199976e-06, | |
| "loss": 0.4746, | |
| "num_input_tokens_seen": 30507616, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 8.123076923076923, | |
| "grad_norm": 0.1396484375, | |
| "learning_rate": 2.5451218548159823e-06, | |
| "loss": 0.4833, | |
| "num_input_tokens_seen": 30583456, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 8.143589743589743, | |
| "grad_norm": 0.12890625, | |
| "learning_rate": 2.5429461948575077e-06, | |
| "loss": 0.4849, | |
| "num_input_tokens_seen": 30654176, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 8.164102564102564, | |
| "grad_norm": 0.1416015625, | |
| "learning_rate": 2.540766279022634e-06, | |
| "loss": 0.4812, | |
| "num_input_tokens_seen": 30721920, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 8.184615384615384, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 2.53858211620679e-06, | |
| "loss": 0.4976, | |
| "num_input_tokens_seen": 30791264, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 8.205128205128204, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 2.536393715322732e-06, | |
| "loss": 0.4556, | |
| "num_input_tokens_seen": 30862336, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 8.225641025641025, | |
| "grad_norm": 0.150390625, | |
| "learning_rate": 2.5342010853005127e-06, | |
| "loss": 0.4496, | |
| "num_input_tokens_seen": 30940064, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 8.246153846153845, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 2.532004235087441e-06, | |
| "loss": 0.4722, | |
| "num_input_tokens_seen": 31013248, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 8.266666666666667, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 2.529803173648049e-06, | |
| "loss": 0.4875, | |
| "num_input_tokens_seen": 31094496, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 8.287179487179488, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 2.52759790996405e-06, | |
| "loss": 0.4598, | |
| "num_input_tokens_seen": 31171680, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 8.307692307692308, | |
| "grad_norm": 0.77734375, | |
| "learning_rate": 2.525388453034307e-06, | |
| "loss": 0.5069, | |
| "num_input_tokens_seen": 31252064, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 8.328205128205129, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 2.5231748118747945e-06, | |
| "loss": 0.5155, | |
| "num_input_tokens_seen": 31329696, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 8.34871794871795, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 2.5209569955185604e-06, | |
| "loss": 0.5436, | |
| "num_input_tokens_seen": 31407648, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 8.36923076923077, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 2.51873501301569e-06, | |
| "loss": 0.4953, | |
| "num_input_tokens_seen": 31475200, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 8.38974358974359, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 2.5165088734332695e-06, | |
| "loss": 0.4804, | |
| "num_input_tokens_seen": 31547104, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 8.41025641025641, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 2.5142785858553486e-06, | |
| "loss": 0.5533, | |
| "num_input_tokens_seen": 31629440, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 8.430769230769231, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 2.512044159382903e-06, | |
| "loss": 0.541, | |
| "num_input_tokens_seen": 31713024, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 8.451282051282051, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 2.5098056031337975e-06, | |
| "loss": 0.4444, | |
| "num_input_tokens_seen": 31790432, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 8.471794871794872, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 2.5075629262427507e-06, | |
| "loss": 0.4869, | |
| "num_input_tokens_seen": 31870592, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 8.492307692307692, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 2.505316137861294e-06, | |
| "loss": 0.4855, | |
| "num_input_tokens_seen": 31945344, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 8.512820512820513, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 2.503065247157737e-06, | |
| "loss": 0.5027, | |
| "num_input_tokens_seen": 32030016, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 8.533333333333333, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 2.500810263317129e-06, | |
| "loss": 0.4885, | |
| "num_input_tokens_seen": 32108160, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 8.553846153846154, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 2.4985511955412238e-06, | |
| "loss": 0.4451, | |
| "num_input_tokens_seen": 32188288, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 8.574358974358974, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 2.4962880530484375e-06, | |
| "loss": 0.4899, | |
| "num_input_tokens_seen": 32266656, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 8.594871794871795, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 2.4940208450738146e-06, | |
| "loss": 0.5083, | |
| "num_input_tokens_seen": 32356544, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 8.615384615384615, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 2.49174958086899e-06, | |
| "loss": 0.4839, | |
| "num_input_tokens_seen": 32434720, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 8.635897435897435, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 2.48947426970215e-06, | |
| "loss": 0.437, | |
| "num_input_tokens_seen": 32507712, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 8.656410256410256, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 2.487194920857995e-06, | |
| "loss": 0.435, | |
| "num_input_tokens_seen": 32577216, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 8.676923076923076, | |
| "grad_norm": 0.3359375, | |
| "learning_rate": 2.484911543637702e-06, | |
| "loss": 0.4768, | |
| "num_input_tokens_seen": 32647552, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 8.697435897435897, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 2.4826241473588855e-06, | |
| "loss": 0.4578, | |
| "num_input_tokens_seen": 32727520, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 8.717948717948717, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 2.4803327413555623e-06, | |
| "loss": 0.5142, | |
| "num_input_tokens_seen": 32805440, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 8.73846153846154, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 2.4780373349781083e-06, | |
| "loss": 0.4013, | |
| "num_input_tokens_seen": 32880480, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 8.75897435897436, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 2.4757379375932265e-06, | |
| "loss": 0.4616, | |
| "num_input_tokens_seen": 32951936, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 8.77948717948718, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 2.473434558583903e-06, | |
| "loss": 0.4791, | |
| "num_input_tokens_seen": 33027104, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 8.8, | |
| "grad_norm": 0.2451171875, | |
| "learning_rate": 2.4711272073493745e-06, | |
| "loss": 0.5163, | |
| "num_input_tokens_seen": 33111040, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 8.820512820512821, | |
| "grad_norm": 0.2470703125, | |
| "learning_rate": 2.468815893305084e-06, | |
| "loss": 0.4761, | |
| "num_input_tokens_seen": 33202304, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 8.841025641025642, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 2.466500625882646e-06, | |
| "loss": 0.4405, | |
| "num_input_tokens_seen": 33281376, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 8.861538461538462, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 2.464181414529809e-06, | |
| "loss": 0.4538, | |
| "num_input_tokens_seen": 33352640, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 8.882051282051282, | |
| "grad_norm": 0.19140625, | |
| "learning_rate": 2.4618582687104132e-06, | |
| "loss": 0.4598, | |
| "num_input_tokens_seen": 33423232, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 8.902564102564103, | |
| "grad_norm": 0.169921875, | |
| "learning_rate": 2.4595311979043545e-06, | |
| "loss": 0.4556, | |
| "num_input_tokens_seen": 33503744, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 8.923076923076923, | |
| "grad_norm": 0.16796875, | |
| "learning_rate": 2.4572002116075454e-06, | |
| "loss": 0.4665, | |
| "num_input_tokens_seen": 33576800, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 8.943589743589744, | |
| "grad_norm": 0.1748046875, | |
| "learning_rate": 2.454865319331876e-06, | |
| "loss": 0.4683, | |
| "num_input_tokens_seen": 33661120, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 8.964102564102564, | |
| "grad_norm": 0.1484375, | |
| "learning_rate": 2.4525265306051755e-06, | |
| "loss": 0.5183, | |
| "num_input_tokens_seen": 33733568, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 8.984615384615385, | |
| "grad_norm": 0.15234375, | |
| "learning_rate": 2.4501838549711723e-06, | |
| "loss": 0.4671, | |
| "num_input_tokens_seen": 33802592, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 9.005128205128205, | |
| "grad_norm": 0.126953125, | |
| "learning_rate": 2.447837301989457e-06, | |
| "loss": 0.4858, | |
| "num_input_tokens_seen": 33882272, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 9.025641025641026, | |
| "grad_norm": 0.138671875, | |
| "learning_rate": 2.4454868812354403e-06, | |
| "loss": 0.4574, | |
| "num_input_tokens_seen": 33953920, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 9.046153846153846, | |
| "grad_norm": 0.11669921875, | |
| "learning_rate": 2.4431326023003188e-06, | |
| "loss": 0.4419, | |
| "num_input_tokens_seen": 34027552, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 9.066666666666666, | |
| "grad_norm": 0.11669921875, | |
| "learning_rate": 2.44077447479103e-06, | |
| "loss": 0.5126, | |
| "num_input_tokens_seen": 34112480, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 9.087179487179487, | |
| "grad_norm": 0.1357421875, | |
| "learning_rate": 2.4384125083302178e-06, | |
| "loss": 0.4517, | |
| "num_input_tokens_seen": 34183840, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 9.107692307692307, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 2.4360467125561907e-06, | |
| "loss": 0.5161, | |
| "num_input_tokens_seen": 34258912, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 9.128205128205128, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 2.433677097122883e-06, | |
| "loss": 0.4486, | |
| "num_input_tokens_seen": 34329248, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 9.148717948717948, | |
| "grad_norm": 0.166015625, | |
| "learning_rate": 2.4313036716998154e-06, | |
| "loss": 0.5191, | |
| "num_input_tokens_seen": 34411232, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 9.169230769230769, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 2.428926445972058e-06, | |
| "loss": 0.5117, | |
| "num_input_tokens_seen": 34486144, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 9.189743589743589, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 2.4265454296401857e-06, | |
| "loss": 0.4739, | |
| "num_input_tokens_seen": 34564864, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 9.21025641025641, | |
| "grad_norm": 0.404296875, | |
| "learning_rate": 2.4241606324202426e-06, | |
| "loss": 0.4468, | |
| "num_input_tokens_seen": 34640704, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 9.23076923076923, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 2.4217720640437015e-06, | |
| "loss": 0.457, | |
| "num_input_tokens_seen": 34715744, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 9.25128205128205, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 2.4193797342574235e-06, | |
| "loss": 0.4915, | |
| "num_input_tokens_seen": 34798144, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 9.271794871794873, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 2.4169836528236187e-06, | |
| "loss": 0.4417, | |
| "num_input_tokens_seen": 34873440, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 9.292307692307693, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 2.4145838295198066e-06, | |
| "loss": 0.4999, | |
| "num_input_tokens_seen": 34951552, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 9.312820512820513, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 2.4121802741387743e-06, | |
| "loss": 0.453, | |
| "num_input_tokens_seen": 35021184, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 9.333333333333334, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 2.4097729964885407e-06, | |
| "loss": 0.4473, | |
| "num_input_tokens_seen": 35098080, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 9.353846153846154, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 2.4073620063923123e-06, | |
| "loss": 0.4749, | |
| "num_input_tokens_seen": 35170336, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 9.374358974358975, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 2.404947313688445e-06, | |
| "loss": 0.5229, | |
| "num_input_tokens_seen": 35255200, | |
| "step": 2285 | |
| }, | |
| { | |
| "epoch": 9.394871794871795, | |
| "grad_norm": 0.4140625, | |
| "learning_rate": 2.4025289282304037e-06, | |
| "loss": 0.5158, | |
| "num_input_tokens_seen": 35328928, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 9.415384615384616, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 2.4001068598867216e-06, | |
| "loss": 0.4548, | |
| "num_input_tokens_seen": 35402976, | |
| "step": 2295 | |
| }, | |
| { | |
| "epoch": 9.435897435897436, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 2.397681118540961e-06, | |
| "loss": 0.4313, | |
| "num_input_tokens_seen": 35481344, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 9.456410256410257, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 2.3952517140916724e-06, | |
| "loss": 0.4664, | |
| "num_input_tokens_seen": 35553664, | |
| "step": 2305 | |
| }, | |
| { | |
| "epoch": 9.476923076923077, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 2.392818656452354e-06, | |
| "loss": 0.4948, | |
| "num_input_tokens_seen": 35630592, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 9.497435897435897, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 2.39038195555141e-06, | |
| "loss": 0.5035, | |
| "num_input_tokens_seen": 35712864, | |
| "step": 2315 | |
| }, | |
| { | |
| "epoch": 9.517948717948718, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 2.387941621332114e-06, | |
| "loss": 0.4964, | |
| "num_input_tokens_seen": 35790784, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 9.538461538461538, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 2.3854976637525637e-06, | |
| "loss": 0.4684, | |
| "num_input_tokens_seen": 35868960, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 9.558974358974359, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 2.3830500927856433e-06, | |
| "loss": 0.5117, | |
| "num_input_tokens_seen": 35956832, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 9.57948717948718, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 2.3805989184189813e-06, | |
| "loss": 0.4574, | |
| "num_input_tokens_seen": 36027520, | |
| "step": 2335 | |
| }, | |
| { | |
| "epoch": 9.6, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 2.378144150654911e-06, | |
| "loss": 0.5291, | |
| "num_input_tokens_seen": 36109248, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 9.62051282051282, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 2.3756857995104286e-06, | |
| "loss": 0.4528, | |
| "num_input_tokens_seen": 36179584, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 9.64102564102564, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 2.3732238750171527e-06, | |
| "loss": 0.4976, | |
| "num_input_tokens_seen": 36257216, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 9.661538461538461, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 2.3707583872212837e-06, | |
| "loss": 0.4576, | |
| "num_input_tokens_seen": 36326272, | |
| "step": 2355 | |
| }, | |
| { | |
| "epoch": 9.682051282051281, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 2.3682893461835626e-06, | |
| "loss": 0.5315, | |
| "num_input_tokens_seen": 36405504, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 9.702564102564102, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 2.3658167619792294e-06, | |
| "loss": 0.5407, | |
| "num_input_tokens_seen": 36481536, | |
| "step": 2365 | |
| }, | |
| { | |
| "epoch": 9.723076923076922, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 2.363340644697983e-06, | |
| "loss": 0.4525, | |
| "num_input_tokens_seen": 36563200, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 9.743589743589745, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 2.360861004443939e-06, | |
| "loss": 0.5628, | |
| "num_input_tokens_seen": 36661120, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 9.764102564102565, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 2.358377851335589e-06, | |
| "loss": 0.4633, | |
| "num_input_tokens_seen": 36738880, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 9.784615384615385, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 2.3558911955057592e-06, | |
| "loss": 0.4438, | |
| "num_input_tokens_seen": 36820128, | |
| "step": 2385 | |
| }, | |
| { | |
| "epoch": 9.805128205128206, | |
| "grad_norm": 0.1806640625, | |
| "learning_rate": 2.35340104710157e-06, | |
| "loss": 0.4609, | |
| "num_input_tokens_seen": 36900128, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 9.825641025641026, | |
| "grad_norm": 0.1708984375, | |
| "learning_rate": 2.350907416284392e-06, | |
| "loss": 0.4843, | |
| "num_input_tokens_seen": 36969024, | |
| "step": 2395 | |
| }, | |
| { | |
| "epoch": 9.846153846153847, | |
| "grad_norm": 0.162109375, | |
| "learning_rate": 2.348410313229808e-06, | |
| "loss": 0.4607, | |
| "num_input_tokens_seen": 37053440, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 9.866666666666667, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 2.3459097481275687e-06, | |
| "loss": 0.5134, | |
| "num_input_tokens_seen": 37132128, | |
| "step": 2405 | |
| }, | |
| { | |
| "epoch": 9.887179487179488, | |
| "grad_norm": 0.1396484375, | |
| "learning_rate": 2.343405731181552e-06, | |
| "loss": 0.478, | |
| "num_input_tokens_seen": 37209664, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 9.907692307692308, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 2.3408982726097227e-06, | |
| "loss": 0.4864, | |
| "num_input_tokens_seen": 37283936, | |
| "step": 2415 | |
| }, | |
| { | |
| "epoch": 9.928205128205128, | |
| "grad_norm": 0.1455078125, | |
| "learning_rate": 2.3383873826440878e-06, | |
| "loss": 0.4876, | |
| "num_input_tokens_seen": 37359552, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 9.948717948717949, | |
| "grad_norm": 0.13671875, | |
| "learning_rate": 2.3358730715306574e-06, | |
| "loss": 0.5265, | |
| "num_input_tokens_seen": 37434112, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 9.96923076923077, | |
| "grad_norm": 0.12353515625, | |
| "learning_rate": 2.3333553495294033e-06, | |
| "loss": 0.4759, | |
| "num_input_tokens_seen": 37511456, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 9.98974358974359, | |
| "grad_norm": 0.11865234375, | |
| "learning_rate": 2.330834226914214e-06, | |
| "loss": 0.4336, | |
| "num_input_tokens_seen": 37584320, | |
| "step": 2435 | |
| }, | |
| { | |
| "epoch": 10.01025641025641, | |
| "grad_norm": 0.123046875, | |
| "learning_rate": 2.3283097139728557e-06, | |
| "loss": 0.5338, | |
| "num_input_tokens_seen": 37672864, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 10.03076923076923, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 2.3257818210069277e-06, | |
| "loss": 0.4542, | |
| "num_input_tokens_seen": 37743488, | |
| "step": 2445 | |
| }, | |
| { | |
| "epoch": 10.051282051282051, | |
| "grad_norm": 0.14453125, | |
| "learning_rate": 2.3232505583318246e-06, | |
| "loss": 0.5379, | |
| "num_input_tokens_seen": 37832256, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 10.071794871794872, | |
| "grad_norm": 0.1533203125, | |
| "learning_rate": 2.320715936276689e-06, | |
| "loss": 0.4418, | |
| "num_input_tokens_seen": 37908832, | |
| "step": 2455 | |
| }, | |
| { | |
| "epoch": 10.092307692307692, | |
| "grad_norm": 0.3984375, | |
| "learning_rate": 2.3181779651843738e-06, | |
| "loss": 0.5294, | |
| "num_input_tokens_seen": 37989088, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 10.112820512820512, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 2.3156366554113967e-06, | |
| "loss": 0.4438, | |
| "num_input_tokens_seen": 38061024, | |
| "step": 2465 | |
| }, | |
| { | |
| "epoch": 10.133333333333333, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 2.3130920173278997e-06, | |
| "loss": 0.4544, | |
| "num_input_tokens_seen": 38136480, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 10.153846153846153, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 2.310544061317607e-06, | |
| "loss": 0.4918, | |
| "num_input_tokens_seen": 38213280, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 10.174358974358974, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 2.307992797777782e-06, | |
| "loss": 0.4821, | |
| "num_input_tokens_seen": 38283296, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 10.194871794871794, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 2.3054382371191836e-06, | |
| "loss": 0.464, | |
| "num_input_tokens_seen": 38360320, | |
| "step": 2485 | |
| }, | |
| { | |
| "epoch": 10.215384615384615, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 2.3028803897660256e-06, | |
| "loss": 0.4829, | |
| "num_input_tokens_seen": 38440000, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 10.235897435897435, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 2.3003192661559346e-06, | |
| "loss": 0.4495, | |
| "num_input_tokens_seen": 38510208, | |
| "step": 2495 | |
| }, | |
| { | |
| "epoch": 10.256410256410255, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 2.297754876739905e-06, | |
| "loss": 0.5503, | |
| "num_input_tokens_seen": 38600256, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 10.276923076923078, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 2.2951872319822597e-06, | |
| "loss": 0.5341, | |
| "num_input_tokens_seen": 38679168, | |
| "step": 2505 | |
| }, | |
| { | |
| "epoch": 10.297435897435898, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 2.2926163423606027e-06, | |
| "loss": 0.4903, | |
| "num_input_tokens_seen": 38752064, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 10.317948717948719, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 2.2900422183657816e-06, | |
| "loss": 0.4543, | |
| "num_input_tokens_seen": 38829504, | |
| "step": 2515 | |
| }, | |
| { | |
| "epoch": 10.338461538461539, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 2.2874648705018403e-06, | |
| "loss": 0.5428, | |
| "num_input_tokens_seen": 38915904, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 10.35897435897436, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 2.28488430928598e-06, | |
| "loss": 0.4588, | |
| "num_input_tokens_seen": 38997760, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 10.37948717948718, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 2.282300545248512e-06, | |
| "loss": 0.4441, | |
| "num_input_tokens_seen": 39068000, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 10.4, | |
| "grad_norm": 0.375, | |
| "learning_rate": 2.27971358893282e-06, | |
| "loss": 0.4441, | |
| "num_input_tokens_seen": 39136480, | |
| "step": 2535 | |
| }, | |
| { | |
| "epoch": 10.42051282051282, | |
| "grad_norm": 0.421875, | |
| "learning_rate": 2.2771234508953116e-06, | |
| "loss": 0.442, | |
| "num_input_tokens_seen": 39219488, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 10.441025641025641, | |
| "grad_norm": 0.625, | |
| "learning_rate": 2.27453014170538e-06, | |
| "loss": 0.5199, | |
| "num_input_tokens_seen": 39295712, | |
| "step": 2545 | |
| }, | |
| { | |
| "epoch": 10.461538461538462, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 2.271933671945357e-06, | |
| "loss": 0.5147, | |
| "num_input_tokens_seen": 39371008, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 10.482051282051282, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 2.2693340522104727e-06, | |
| "loss": 0.5296, | |
| "num_input_tokens_seen": 39450944, | |
| "step": 2555 | |
| }, | |
| { | |
| "epoch": 10.502564102564103, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 2.2667312931088096e-06, | |
| "loss": 0.4707, | |
| "num_input_tokens_seen": 39530912, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 10.523076923076923, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 2.264125405261263e-06, | |
| "loss": 0.4726, | |
| "num_input_tokens_seen": 39602400, | |
| "step": 2565 | |
| }, | |
| { | |
| "epoch": 10.543589743589743, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 2.261516399301493e-06, | |
| "loss": 0.4344, | |
| "num_input_tokens_seen": 39679232, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 10.564102564102564, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 2.2589042858758853e-06, | |
| "loss": 0.4427, | |
| "num_input_tokens_seen": 39755904, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 10.584615384615384, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 2.256289075643506e-06, | |
| "loss": 0.4975, | |
| "num_input_tokens_seen": 39826368, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 10.605128205128205, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 2.2536707792760566e-06, | |
| "loss": 0.5045, | |
| "num_input_tokens_seen": 39906464, | |
| "step": 2585 | |
| }, | |
| { | |
| "epoch": 10.625641025641025, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 2.251049407457833e-06, | |
| "loss": 0.4833, | |
| "num_input_tokens_seen": 39975712, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 10.646153846153846, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 2.2484249708856823e-06, | |
| "loss": 0.4984, | |
| "num_input_tokens_seen": 40060032, | |
| "step": 2595 | |
| }, | |
| { | |
| "epoch": 10.666666666666666, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 2.2457974802689545e-06, | |
| "loss": 0.4186, | |
| "num_input_tokens_seen": 40131520, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 10.687179487179487, | |
| "grad_norm": 0.1962890625, | |
| "learning_rate": 2.2431669463294646e-06, | |
| "loss": 0.4441, | |
| "num_input_tokens_seen": 40205760, | |
| "step": 2605 | |
| }, | |
| { | |
| "epoch": 10.707692307692307, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 2.2405333798014453e-06, | |
| "loss": 0.5337, | |
| "num_input_tokens_seen": 40288992, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 10.728205128205127, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 2.237896791431505e-06, | |
| "loss": 0.4703, | |
| "num_input_tokens_seen": 40369440, | |
| "step": 2615 | |
| }, | |
| { | |
| "epoch": 10.74871794871795, | |
| "grad_norm": 0.1630859375, | |
| "learning_rate": 2.2352571919785812e-06, | |
| "loss": 0.5217, | |
| "num_input_tokens_seen": 40452288, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 10.76923076923077, | |
| "grad_norm": 0.1572265625, | |
| "learning_rate": 2.2326145922139004e-06, | |
| "loss": 0.4475, | |
| "num_input_tokens_seen": 40523808, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 10.78974358974359, | |
| "grad_norm": 0.1884765625, | |
| "learning_rate": 2.2299690029209313e-06, | |
| "loss": 0.4734, | |
| "num_input_tokens_seen": 40606496, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 10.810256410256411, | |
| "grad_norm": 0.1259765625, | |
| "learning_rate": 2.227320434895343e-06, | |
| "loss": 0.4686, | |
| "num_input_tokens_seen": 40684672, | |
| "step": 2635 | |
| }, | |
| { | |
| "epoch": 10.830769230769231, | |
| "grad_norm": 0.1279296875, | |
| "learning_rate": 2.2246688989449577e-06, | |
| "loss": 0.5027, | |
| "num_input_tokens_seen": 40762752, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 10.851282051282052, | |
| "grad_norm": 0.12109375, | |
| "learning_rate": 2.2220144058897104e-06, | |
| "loss": 0.4582, | |
| "num_input_tokens_seen": 40851776, | |
| "step": 2645 | |
| }, | |
| { | |
| "epoch": 10.871794871794872, | |
| "grad_norm": 0.1318359375, | |
| "learning_rate": 2.2193569665616017e-06, | |
| "loss": 0.4516, | |
| "num_input_tokens_seen": 40922304, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 10.892307692307693, | |
| "grad_norm": 0.12109375, | |
| "learning_rate": 2.2166965918046554e-06, | |
| "loss": 0.4346, | |
| "num_input_tokens_seen": 40998784, | |
| "step": 2655 | |
| }, | |
| { | |
| "epoch": 10.912820512820513, | |
| "grad_norm": 0.16015625, | |
| "learning_rate": 2.214033292474874e-06, | |
| "loss": 0.5093, | |
| "num_input_tokens_seen": 41075872, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 10.933333333333334, | |
| "grad_norm": 0.154296875, | |
| "learning_rate": 2.2113670794401935e-06, | |
| "loss": 0.4853, | |
| "num_input_tokens_seen": 41149728, | |
| "step": 2665 | |
| }, | |
| { | |
| "epoch": 10.953846153846154, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 2.20869796358044e-06, | |
| "loss": 0.5042, | |
| "num_input_tokens_seen": 41220160, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 10.974358974358974, | |
| "grad_norm": 0.1630859375, | |
| "learning_rate": 2.2060259557872845e-06, | |
| "loss": 0.4601, | |
| "num_input_tokens_seen": 41287712, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 10.994871794871795, | |
| "grad_norm": 0.1533203125, | |
| "learning_rate": 2.2033510669641997e-06, | |
| "loss": 0.4138, | |
| "num_input_tokens_seen": 41363264, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 11.015384615384615, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 2.2006733080264144e-06, | |
| "loss": 0.4724, | |
| "num_input_tokens_seen": 41437152, | |
| "step": 2685 | |
| }, | |
| { | |
| "epoch": 11.035897435897436, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 2.197992689900869e-06, | |
| "loss": 0.4932, | |
| "num_input_tokens_seen": 41515520, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 11.056410256410256, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 2.195309223526171e-06, | |
| "loss": 0.4299, | |
| "num_input_tokens_seen": 41587200, | |
| "step": 2695 | |
| }, | |
| { | |
| "epoch": 11.076923076923077, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 2.192622919852551e-06, | |
| "loss": 0.4774, | |
| "num_input_tokens_seen": 41663008, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 11.097435897435897, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 2.1899337898418174e-06, | |
| "loss": 0.5241, | |
| "num_input_tokens_seen": 41743264, | |
| "step": 2705 | |
| }, | |
| { | |
| "epoch": 11.117948717948718, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 2.187241844467313e-06, | |
| "loss": 0.498, | |
| "num_input_tokens_seen": 41823264, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 11.138461538461538, | |
| "grad_norm": 0.2890625, | |
| "learning_rate": 2.1845470947138658e-06, | |
| "loss": 0.5311, | |
| "num_input_tokens_seen": 41914560, | |
| "step": 2715 | |
| }, | |
| { | |
| "epoch": 11.158974358974358, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 2.181849551577751e-06, | |
| "loss": 0.4464, | |
| "num_input_tokens_seen": 41988288, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 11.179487179487179, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 2.179149226066641e-06, | |
| "loss": 0.4905, | |
| "num_input_tokens_seen": 42068416, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 11.2, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 2.1764461291995618e-06, | |
| "loss": 0.4629, | |
| "num_input_tokens_seen": 42139744, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 11.22051282051282, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 2.173740272006849e-06, | |
| "loss": 0.5278, | |
| "num_input_tokens_seen": 42228672, | |
| "step": 2735 | |
| }, | |
| { | |
| "epoch": 11.24102564102564, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 2.1710316655301016e-06, | |
| "loss": 0.4513, | |
| "num_input_tokens_seen": 42306528, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 11.261538461538462, | |
| "grad_norm": 0.2470703125, | |
| "learning_rate": 2.1683203208221375e-06, | |
| "loss": 0.4917, | |
| "num_input_tokens_seen": 42389024, | |
| "step": 2745 | |
| }, | |
| { | |
| "epoch": 11.282051282051283, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 2.165606248946948e-06, | |
| "loss": 0.4159, | |
| "num_input_tokens_seen": 42473088, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 11.302564102564103, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 2.1628894609796533e-06, | |
| "loss": 0.4917, | |
| "num_input_tokens_seen": 42551360, | |
| "step": 2755 | |
| }, | |
| { | |
| "epoch": 11.323076923076924, | |
| "grad_norm": 0.390625, | |
| "learning_rate": 2.1601699680064573e-06, | |
| "loss": 0.5037, | |
| "num_input_tokens_seen": 42626688, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 11.343589743589744, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 2.1574477811246014e-06, | |
| "loss": 0.4756, | |
| "num_input_tokens_seen": 42705056, | |
| "step": 2765 | |
| }, | |
| { | |
| "epoch": 11.364102564102565, | |
| "grad_norm": 0.421875, | |
| "learning_rate": 2.1547229114423207e-06, | |
| "loss": 0.4985, | |
| "num_input_tokens_seen": 42777632, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 11.384615384615385, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 2.1519953700787963e-06, | |
| "loss": 0.4561, | |
| "num_input_tokens_seen": 42845888, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 11.405128205128205, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 2.149265168164113e-06, | |
| "loss": 0.5091, | |
| "num_input_tokens_seen": 42922976, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 11.425641025641026, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 2.146532316839211e-06, | |
| "loss": 0.4711, | |
| "num_input_tokens_seen": 42996000, | |
| "step": 2785 | |
| }, | |
| { | |
| "epoch": 11.446153846153846, | |
| "grad_norm": 0.38671875, | |
| "learning_rate": 2.1437968272558435e-06, | |
| "loss": 0.457, | |
| "num_input_tokens_seen": 43074688, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 11.466666666666667, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 2.1410587105765275e-06, | |
| "loss": 0.541, | |
| "num_input_tokens_seen": 43157280, | |
| "step": 2795 | |
| }, | |
| { | |
| "epoch": 11.487179487179487, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 2.138317977974501e-06, | |
| "loss": 0.4279, | |
| "num_input_tokens_seen": 43234016, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 11.507692307692308, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 2.135574640633678e-06, | |
| "loss": 0.5213, | |
| "num_input_tokens_seen": 43310816, | |
| "step": 2805 | |
| }, | |
| { | |
| "epoch": 11.528205128205128, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 2.132828709748598e-06, | |
| "loss": 0.4444, | |
| "num_input_tokens_seen": 43382976, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 11.548717948717949, | |
| "grad_norm": 0.279296875, | |
| "learning_rate": 2.130080196524388e-06, | |
| "loss": 0.4768, | |
| "num_input_tokens_seen": 43462944, | |
| "step": 2815 | |
| }, | |
| { | |
| "epoch": 11.569230769230769, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 2.1273291121767094e-06, | |
| "loss": 0.4376, | |
| "num_input_tokens_seen": 43535232, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 11.58974358974359, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 2.124575467931717e-06, | |
| "loss": 0.4342, | |
| "num_input_tokens_seen": 43618528, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 11.61025641025641, | |
| "grad_norm": 0.1982421875, | |
| "learning_rate": 2.1218192750260114e-06, | |
| "loss": 0.4596, | |
| "num_input_tokens_seen": 43691904, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 11.63076923076923, | |
| "grad_norm": 0.1904296875, | |
| "learning_rate": 2.119060544706592e-06, | |
| "loss": 0.4811, | |
| "num_input_tokens_seen": 43760480, | |
| "step": 2835 | |
| }, | |
| { | |
| "epoch": 11.65128205128205, | |
| "grad_norm": 0.1923828125, | |
| "learning_rate": 2.1162992882308147e-06, | |
| "loss": 0.4864, | |
| "num_input_tokens_seen": 43829984, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 11.671794871794871, | |
| "grad_norm": 0.1875, | |
| "learning_rate": 2.1135355168663417e-06, | |
| "loss": 0.4678, | |
| "num_input_tokens_seen": 43906816, | |
| "step": 2845 | |
| }, | |
| { | |
| "epoch": 11.692307692307692, | |
| "grad_norm": 0.1484375, | |
| "learning_rate": 2.1107692418910985e-06, | |
| "loss": 0.5001, | |
| "num_input_tokens_seen": 43984960, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 11.712820512820512, | |
| "grad_norm": 0.154296875, | |
| "learning_rate": 2.1080004745932274e-06, | |
| "loss": 0.4662, | |
| "num_input_tokens_seen": 44061440, | |
| "step": 2855 | |
| }, | |
| { | |
| "epoch": 11.733333333333333, | |
| "grad_norm": 0.1611328125, | |
| "learning_rate": 2.1052292262710392e-06, | |
| "loss": 0.526, | |
| "num_input_tokens_seen": 44147008, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 11.753846153846155, | |
| "grad_norm": 0.1337890625, | |
| "learning_rate": 2.102455508232971e-06, | |
| "loss": 0.4821, | |
| "num_input_tokens_seen": 44224224, | |
| "step": 2865 | |
| }, | |
| { | |
| "epoch": 11.774358974358975, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 2.099679331797536e-06, | |
| "loss": 0.4923, | |
| "num_input_tokens_seen": 44302816, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 11.794871794871796, | |
| "grad_norm": 0.1650390625, | |
| "learning_rate": 2.0969007082932803e-06, | |
| "loss": 0.521, | |
| "num_input_tokens_seen": 44376160, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 11.815384615384616, | |
| "grad_norm": 0.125, | |
| "learning_rate": 2.0941196490587354e-06, | |
| "loss": 0.4932, | |
| "num_input_tokens_seen": 44459200, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 11.835897435897436, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 2.0913361654423723e-06, | |
| "loss": 0.4866, | |
| "num_input_tokens_seen": 44536128, | |
| "step": 2885 | |
| }, | |
| { | |
| "epoch": 11.856410256410257, | |
| "grad_norm": 0.1357421875, | |
| "learning_rate": 2.0885502688025538e-06, | |
| "loss": 0.4826, | |
| "num_input_tokens_seen": 44605088, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 11.876923076923077, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 2.0857619705074912e-06, | |
| "loss": 0.4433, | |
| "num_input_tokens_seen": 44677984, | |
| "step": 2895 | |
| }, | |
| { | |
| "epoch": 11.897435897435898, | |
| "grad_norm": 0.166015625, | |
| "learning_rate": 2.082971281935195e-06, | |
| "loss": 0.4122, | |
| "num_input_tokens_seen": 44751200, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 11.917948717948718, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 2.0801782144734295e-06, | |
| "loss": 0.4266, | |
| "num_input_tokens_seen": 44824672, | |
| "step": 2905 | |
| }, | |
| { | |
| "epoch": 11.938461538461539, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 2.0773827795196667e-06, | |
| "loss": 0.4574, | |
| "num_input_tokens_seen": 44912768, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 11.95897435897436, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 2.074584988481039e-06, | |
| "loss": 0.5026, | |
| "num_input_tokens_seen": 44993632, | |
| "step": 2915 | |
| }, | |
| { | |
| "epoch": 11.97948717948718, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 2.0717848527742935e-06, | |
| "loss": 0.5444, | |
| "num_input_tokens_seen": 45070848, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 2.0689823838257455e-06, | |
| "loss": 0.4509, | |
| "num_input_tokens_seen": 45150496, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 12.02051282051282, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 2.0661775930712297e-06, | |
| "loss": 0.4534, | |
| "num_input_tokens_seen": 45225440, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 12.04102564102564, | |
| "grad_norm": 0.3984375, | |
| "learning_rate": 2.0633704919560573e-06, | |
| "loss": 0.5264, | |
| "num_input_tokens_seen": 45300992, | |
| "step": 2935 | |
| }, | |
| { | |
| "epoch": 12.061538461538461, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 2.0605610919349658e-06, | |
| "loss": 0.4373, | |
| "num_input_tokens_seen": 45378944, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 12.082051282051282, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 2.0577494044720746e-06, | |
| "loss": 0.4779, | |
| "num_input_tokens_seen": 45453504, | |
| "step": 2945 | |
| }, | |
| { | |
| "epoch": 12.102564102564102, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 2.0549354410408364e-06, | |
| "loss": 0.556, | |
| "num_input_tokens_seen": 45532992, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 12.123076923076923, | |
| "grad_norm": 0.2890625, | |
| "learning_rate": 2.052119213123992e-06, | |
| "loss": 0.5152, | |
| "num_input_tokens_seen": 45609120, | |
| "step": 2955 | |
| }, | |
| { | |
| "epoch": 12.143589743589743, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 2.049300732213522e-06, | |
| "loss": 0.4412, | |
| "num_input_tokens_seen": 45690624, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 12.164102564102564, | |
| "grad_norm": 0.46875, | |
| "learning_rate": 2.046480009810602e-06, | |
| "loss": 0.4553, | |
| "num_input_tokens_seen": 45763264, | |
| "step": 2965 | |
| }, | |
| { | |
| "epoch": 12.184615384615384, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 2.0436570574255523e-06, | |
| "loss": 0.4913, | |
| "num_input_tokens_seen": 45849472, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 12.205128205128204, | |
| "grad_norm": 0.41015625, | |
| "learning_rate": 2.0408318865777953e-06, | |
| "loss": 0.5487, | |
| "num_input_tokens_seen": 45927552, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 12.225641025641025, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 2.0380045087958036e-06, | |
| "loss": 0.465, | |
| "num_input_tokens_seen": 46002656, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 12.246153846153845, | |
| "grad_norm": 0.404296875, | |
| "learning_rate": 2.0351749356170574e-06, | |
| "loss": 0.4854, | |
| "num_input_tokens_seen": 46087904, | |
| "step": 2985 | |
| }, | |
| { | |
| "epoch": 12.266666666666667, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 2.032343178587995e-06, | |
| "loss": 0.4568, | |
| "num_input_tokens_seen": 46165408, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 12.287179487179488, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 2.0295092492639657e-06, | |
| "loss": 0.4926, | |
| "num_input_tokens_seen": 46237344, | |
| "step": 2995 | |
| }, | |
| { | |
| "epoch": 12.307692307692308, | |
| "grad_norm": 0.5, | |
| "learning_rate": 2.0266731592091834e-06, | |
| "loss": 0.5093, | |
| "num_input_tokens_seen": 46307456, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 12.328205128205129, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 2.0238349199966793e-06, | |
| "loss": 0.5077, | |
| "num_input_tokens_seen": 46390688, | |
| "step": 3005 | |
| }, | |
| { | |
| "epoch": 12.34871794871795, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 2.020994543208254e-06, | |
| "loss": 0.4541, | |
| "num_input_tokens_seen": 46467904, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 12.36923076923077, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 2.018152040434432e-06, | |
| "loss": 0.4975, | |
| "num_input_tokens_seen": 46545632, | |
| "step": 3015 | |
| }, | |
| { | |
| "epoch": 12.38974358974359, | |
| "grad_norm": 0.40234375, | |
| "learning_rate": 2.015307423274411e-06, | |
| "loss": 0.4988, | |
| "num_input_tokens_seen": 46624768, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 12.41025641025641, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 2.0124607033360193e-06, | |
| "loss": 0.4877, | |
| "num_input_tokens_seen": 46694528, | |
| "step": 3025 | |
| }, | |
| { | |
| "epoch": 12.430769230769231, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 2.0096118922356646e-06, | |
| "loss": 0.4621, | |
| "num_input_tokens_seen": 46767520, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 12.451282051282051, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 2.0067610015982868e-06, | |
| "loss": 0.4742, | |
| "num_input_tokens_seen": 46843616, | |
| "step": 3035 | |
| }, | |
| { | |
| "epoch": 12.471794871794872, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 2.0039080430573133e-06, | |
| "loss": 0.3993, | |
| "num_input_tokens_seen": 46915136, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 12.492307692307692, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 2.0010530282546093e-06, | |
| "loss": 0.464, | |
| "num_input_tokens_seen": 46992000, | |
| "step": 3045 | |
| }, | |
| { | |
| "epoch": 12.512820512820513, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 1.9981959688404303e-06, | |
| "loss": 0.4744, | |
| "num_input_tokens_seen": 47075360, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 12.533333333333333, | |
| "grad_norm": 0.1953125, | |
| "learning_rate": 1.9953368764733763e-06, | |
| "loss": 0.4788, | |
| "num_input_tokens_seen": 47152704, | |
| "step": 3055 | |
| }, | |
| { | |
| "epoch": 12.553846153846154, | |
| "grad_norm": 0.1923828125, | |
| "learning_rate": 1.992475762820342e-06, | |
| "loss": 0.4704, | |
| "num_input_tokens_seen": 47229696, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 12.574358974358974, | |
| "grad_norm": 0.193359375, | |
| "learning_rate": 1.9896126395564695e-06, | |
| "loss": 0.4645, | |
| "num_input_tokens_seen": 47307360, | |
| "step": 3065 | |
| }, | |
| { | |
| "epoch": 12.594871794871795, | |
| "grad_norm": 0.1728515625, | |
| "learning_rate": 1.986747518365104e-06, | |
| "loss": 0.5485, | |
| "num_input_tokens_seen": 47393824, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 12.615384615384615, | |
| "grad_norm": 0.154296875, | |
| "learning_rate": 1.9838804109377405e-06, | |
| "loss": 0.511, | |
| "num_input_tokens_seen": 47475104, | |
| "step": 3075 | |
| }, | |
| { | |
| "epoch": 12.635897435897435, | |
| "grad_norm": 0.1630859375, | |
| "learning_rate": 1.9810113289739818e-06, | |
| "loss": 0.4624, | |
| "num_input_tokens_seen": 47548704, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 12.656410256410256, | |
| "grad_norm": 0.1435546875, | |
| "learning_rate": 1.9781402841814855e-06, | |
| "loss": 0.5197, | |
| "num_input_tokens_seen": 47638880, | |
| "step": 3085 | |
| }, | |
| { | |
| "epoch": 12.676923076923076, | |
| "grad_norm": 0.134765625, | |
| "learning_rate": 1.9752672882759204e-06, | |
| "loss": 0.4277, | |
| "num_input_tokens_seen": 47711456, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 12.697435897435897, | |
| "grad_norm": 0.138671875, | |
| "learning_rate": 1.972392352980917e-06, | |
| "loss": 0.5166, | |
| "num_input_tokens_seen": 47787776, | |
| "step": 3095 | |
| }, | |
| { | |
| "epoch": 12.717948717948717, | |
| "grad_norm": 0.1357421875, | |
| "learning_rate": 1.969515490028019e-06, | |
| "loss": 0.4201, | |
| "num_input_tokens_seen": 47860480, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 12.73846153846154, | |
| "grad_norm": 0.1064453125, | |
| "learning_rate": 1.966636711156636e-06, | |
| "loss": 0.4425, | |
| "num_input_tokens_seen": 47936704, | |
| "step": 3105 | |
| }, | |
| { | |
| "epoch": 12.75897435897436, | |
| "grad_norm": 0.154296875, | |
| "learning_rate": 1.9637560281139982e-06, | |
| "loss": 0.5056, | |
| "num_input_tokens_seen": 48021792, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 12.77948717948718, | |
| "grad_norm": 0.1953125, | |
| "learning_rate": 1.960873452655102e-06, | |
| "loss": 0.4929, | |
| "num_input_tokens_seen": 48101984, | |
| "step": 3115 | |
| }, | |
| { | |
| "epoch": 12.8, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1.95798899654267e-06, | |
| "loss": 0.4868, | |
| "num_input_tokens_seen": 48186688, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 12.820512820512821, | |
| "grad_norm": 0.134765625, | |
| "learning_rate": 1.9551026715470954e-06, | |
| "loss": 0.4668, | |
| "num_input_tokens_seen": 48259872, | |
| "step": 3125 | |
| }, | |
| { | |
| "epoch": 12.841025641025642, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 1.952214489446401e-06, | |
| "loss": 0.53, | |
| "num_input_tokens_seen": 48347232, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 12.861538461538462, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 1.949324462026185e-06, | |
| "loss": 0.4388, | |
| "num_input_tokens_seen": 48422624, | |
| "step": 3135 | |
| }, | |
| { | |
| "epoch": 12.882051282051282, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 1.9464326010795776e-06, | |
| "loss": 0.4246, | |
| "num_input_tokens_seen": 48492288, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 12.902564102564103, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 1.9435389184071895e-06, | |
| "loss": 0.5186, | |
| "num_input_tokens_seen": 48568224, | |
| "step": 3145 | |
| }, | |
| { | |
| "epoch": 12.923076923076923, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 1.9406434258170666e-06, | |
| "loss": 0.4351, | |
| "num_input_tokens_seen": 48637280, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 12.943589743589744, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.9377461351246395e-06, | |
| "loss": 0.5281, | |
| "num_input_tokens_seen": 48717088, | |
| "step": 3155 | |
| }, | |
| { | |
| "epoch": 12.964102564102564, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1.9348470581526763e-06, | |
| "loss": 0.4308, | |
| "num_input_tokens_seen": 48787584, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 12.984615384615385, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 1.9319462067312344e-06, | |
| "loss": 0.4133, | |
| "num_input_tokens_seen": 48864640, | |
| "step": 3165 | |
| }, | |
| { | |
| "epoch": 13.005128205128205, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 1.929043592697612e-06, | |
| "loss": 0.4802, | |
| "num_input_tokens_seen": 48938176, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 13.025641025641026, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 1.926139227896299e-06, | |
| "loss": 0.4504, | |
| "num_input_tokens_seen": 49015040, | |
| "step": 3175 | |
| }, | |
| { | |
| "epoch": 13.046153846153846, | |
| "grad_norm": 0.4140625, | |
| "learning_rate": 1.923233124178932e-06, | |
| "loss": 0.494, | |
| "num_input_tokens_seen": 49091680, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 13.066666666666666, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 1.9203252934042403e-06, | |
| "loss": 0.4628, | |
| "num_input_tokens_seen": 49165600, | |
| "step": 3185 | |
| }, | |
| { | |
| "epoch": 13.087179487179487, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 1.9174157474380034e-06, | |
| "loss": 0.4431, | |
| "num_input_tokens_seen": 49247136, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 13.107692307692307, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 1.914504498152998e-06, | |
| "loss": 0.4479, | |
| "num_input_tokens_seen": 49319648, | |
| "step": 3195 | |
| }, | |
| { | |
| "epoch": 13.128205128205128, | |
| "grad_norm": 0.40625, | |
| "learning_rate": 1.9115915574289525e-06, | |
| "loss": 0.4278, | |
| "num_input_tokens_seen": 49393216, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 13.148717948717948, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 1.9086769371524966e-06, | |
| "loss": 0.5124, | |
| "num_input_tokens_seen": 49480864, | |
| "step": 3205 | |
| }, | |
| { | |
| "epoch": 13.169230769230769, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 1.9057606492171144e-06, | |
| "loss": 0.4438, | |
| "num_input_tokens_seen": 49559904, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 13.189743589743589, | |
| "grad_norm": 0.412109375, | |
| "learning_rate": 1.9028427055230948e-06, | |
| "loss": 0.4312, | |
| "num_input_tokens_seen": 49630240, | |
| "step": 3215 | |
| }, | |
| { | |
| "epoch": 13.21025641025641, | |
| "grad_norm": 0.443359375, | |
| "learning_rate": 1.8999231179774833e-06, | |
| "loss": 0.5431, | |
| "num_input_tokens_seen": 49709888, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 13.23076923076923, | |
| "grad_norm": 0.447265625, | |
| "learning_rate": 1.897001898494033e-06, | |
| "loss": 0.4567, | |
| "num_input_tokens_seen": 49778848, | |
| "step": 3225 | |
| }, | |
| { | |
| "epoch": 13.25128205128205, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 1.8940790589931568e-06, | |
| "loss": 0.4227, | |
| "num_input_tokens_seen": 49849024, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 13.271794871794873, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 1.8911546114018775e-06, | |
| "loss": 0.4738, | |
| "num_input_tokens_seen": 49917952, | |
| "step": 3235 | |
| }, | |
| { | |
| "epoch": 13.292307692307693, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 1.888228567653781e-06, | |
| "loss": 0.4661, | |
| "num_input_tokens_seen": 50003392, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 13.312820512820513, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 1.8853009396889665e-06, | |
| "loss": 0.5039, | |
| "num_input_tokens_seen": 50085280, | |
| "step": 3245 | |
| }, | |
| { | |
| "epoch": 13.333333333333334, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 1.8823717394539966e-06, | |
| "loss": 0.4442, | |
| "num_input_tokens_seen": 50166304, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 13.353846153846154, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 1.8794409789018507e-06, | |
| "loss": 0.5048, | |
| "num_input_tokens_seen": 50247200, | |
| "step": 3255 | |
| }, | |
| { | |
| "epoch": 13.374358974358975, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 1.8765086699918747e-06, | |
| "loss": 0.4615, | |
| "num_input_tokens_seen": 50321120, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 13.394871794871795, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 1.8735748246897337e-06, | |
| "loss": 0.4838, | |
| "num_input_tokens_seen": 50403680, | |
| "step": 3265 | |
| }, | |
| { | |
| "epoch": 13.415384615384616, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 1.8706394549673615e-06, | |
| "loss": 0.4454, | |
| "num_input_tokens_seen": 50481504, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 13.435897435897436, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1.8677025728029122e-06, | |
| "loss": 0.4806, | |
| "num_input_tokens_seen": 50556448, | |
| "step": 3275 | |
| }, | |
| { | |
| "epoch": 13.456410256410257, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 1.8647641901807126e-06, | |
| "loss": 0.4601, | |
| "num_input_tokens_seen": 50630304, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 13.476923076923077, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1.861824319091212e-06, | |
| "loss": 0.4513, | |
| "num_input_tokens_seen": 50704704, | |
| "step": 3285 | |
| }, | |
| { | |
| "epoch": 13.497435897435897, | |
| "grad_norm": 0.1572265625, | |
| "learning_rate": 1.8588829715309324e-06, | |
| "loss": 0.4704, | |
| "num_input_tokens_seen": 50782464, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 13.517948717948718, | |
| "grad_norm": 0.166015625, | |
| "learning_rate": 1.855940159502423e-06, | |
| "loss": 0.4959, | |
| "num_input_tokens_seen": 50861696, | |
| "step": 3295 | |
| }, | |
| { | |
| "epoch": 13.538461538461538, | |
| "grad_norm": 0.1591796875, | |
| "learning_rate": 1.8529958950142066e-06, | |
| "loss": 0.5215, | |
| "num_input_tokens_seen": 50949568, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 13.558974358974359, | |
| "grad_norm": 0.14453125, | |
| "learning_rate": 1.8500501900807345e-06, | |
| "loss": 0.4692, | |
| "num_input_tokens_seen": 51018432, | |
| "step": 3305 | |
| }, | |
| { | |
| "epoch": 13.57948717948718, | |
| "grad_norm": 0.1435546875, | |
| "learning_rate": 1.847103056722335e-06, | |
| "loss": 0.5015, | |
| "num_input_tokens_seen": 51100352, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 13.6, | |
| "grad_norm": 0.1337890625, | |
| "learning_rate": 1.8441545069651665e-06, | |
| "loss": 0.4779, | |
| "num_input_tokens_seen": 51178144, | |
| "step": 3315 | |
| }, | |
| { | |
| "epoch": 13.62051282051282, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 1.8412045528411652e-06, | |
| "loss": 0.4971, | |
| "num_input_tokens_seen": 51262432, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 13.64102564102564, | |
| "grad_norm": 0.11279296875, | |
| "learning_rate": 1.8382532063880005e-06, | |
| "loss": 0.5063, | |
| "num_input_tokens_seen": 51342624, | |
| "step": 3325 | |
| }, | |
| { | |
| "epoch": 13.661538461538461, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 1.8353004796490212e-06, | |
| "loss": 0.4273, | |
| "num_input_tokens_seen": 51413920, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 13.682051282051281, | |
| "grad_norm": 0.14453125, | |
| "learning_rate": 1.8323463846732099e-06, | |
| "loss": 0.5491, | |
| "num_input_tokens_seen": 51501184, | |
| "step": 3335 | |
| }, | |
| { | |
| "epoch": 13.702564102564102, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 1.8293909335151316e-06, | |
| "loss": 0.4199, | |
| "num_input_tokens_seen": 51570944, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 13.723076923076922, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1.826434138234886e-06, | |
| "loss": 0.4468, | |
| "num_input_tokens_seen": 51650784, | |
| "step": 3345 | |
| }, | |
| { | |
| "epoch": 13.743589743589745, | |
| "grad_norm": 0.150390625, | |
| "learning_rate": 1.8234760108980572e-06, | |
| "loss": 0.5225, | |
| "num_input_tokens_seen": 51731008, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 13.764102564102565, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 1.820516563575665e-06, | |
| "loss": 0.4634, | |
| "num_input_tokens_seen": 51809056, | |
| "step": 3355 | |
| }, | |
| { | |
| "epoch": 13.784615384615385, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 1.8175558083441164e-06, | |
| "loss": 0.5321, | |
| "num_input_tokens_seen": 51886688, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 13.805128205128206, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 1.814593757285154e-06, | |
| "loss": 0.4633, | |
| "num_input_tokens_seen": 51959488, | |
| "step": 3365 | |
| }, | |
| { | |
| "epoch": 13.825641025641026, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 1.8116304224858092e-06, | |
| "loss": 0.4812, | |
| "num_input_tokens_seen": 52035424, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 13.846153846153847, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 1.8086658160383524e-06, | |
| "loss": 0.506, | |
| "num_input_tokens_seen": 52112576, | |
| "step": 3375 | |
| }, | |
| { | |
| "epoch": 13.866666666666667, | |
| "grad_norm": 0.83984375, | |
| "learning_rate": 1.8056999500402414e-06, | |
| "loss": 0.5091, | |
| "num_input_tokens_seen": 52200480, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 13.887179487179488, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 1.8027328365940755e-06, | |
| "loss": 0.4861, | |
| "num_input_tokens_seen": 52274368, | |
| "step": 3385 | |
| }, | |
| { | |
| "epoch": 13.907692307692308, | |
| "grad_norm": 0.462890625, | |
| "learning_rate": 1.799764487807543e-06, | |
| "loss": 0.5051, | |
| "num_input_tokens_seen": 52350656, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 13.928205128205128, | |
| "grad_norm": 0.279296875, | |
| "learning_rate": 1.7967949157933742e-06, | |
| "loss": 0.4608, | |
| "num_input_tokens_seen": 52427328, | |
| "step": 3395 | |
| }, | |
| { | |
| "epoch": 13.948717948717949, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 1.7938241326692907e-06, | |
| "loss": 0.4719, | |
| "num_input_tokens_seen": 52503840, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 13.96923076923077, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 1.7908521505579554e-06, | |
| "loss": 0.5062, | |
| "num_input_tokens_seen": 52577056, | |
| "step": 3405 | |
| }, | |
| { | |
| "epoch": 13.98974358974359, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 1.7878789815869247e-06, | |
| "loss": 0.5096, | |
| "num_input_tokens_seen": 52656000, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 14.01025641025641, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 1.7849046378885977e-06, | |
| "loss": 0.4476, | |
| "num_input_tokens_seen": 52725312, | |
| "step": 3415 | |
| }, | |
| { | |
| "epoch": 14.03076923076923, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 1.7819291316001679e-06, | |
| "loss": 0.436, | |
| "num_input_tokens_seen": 52804736, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 14.051282051282051, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 1.7789524748635717e-06, | |
| "loss": 0.4761, | |
| "num_input_tokens_seen": 52885056, | |
| "step": 3425 | |
| }, | |
| { | |
| "epoch": 14.071794871794872, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 1.775974679825441e-06, | |
| "loss": 0.5039, | |
| "num_input_tokens_seen": 52963136, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 14.092307692307692, | |
| "grad_norm": 0.470703125, | |
| "learning_rate": 1.7729957586370525e-06, | |
| "loss": 0.5039, | |
| "num_input_tokens_seen": 53049856, | |
| "step": 3435 | |
| }, | |
| { | |
| "epoch": 14.112820512820512, | |
| "grad_norm": 0.45703125, | |
| "learning_rate": 1.7700157234542773e-06, | |
| "loss": 0.5397, | |
| "num_input_tokens_seen": 53127616, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 14.133333333333333, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 1.7670345864375339e-06, | |
| "loss": 0.4747, | |
| "num_input_tokens_seen": 53201600, | |
| "step": 3445 | |
| }, | |
| { | |
| "epoch": 14.153846153846153, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 1.7640523597517357e-06, | |
| "loss": 0.4582, | |
| "num_input_tokens_seen": 53275520, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 14.174358974358974, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 1.7610690555662435e-06, | |
| "loss": 0.4747, | |
| "num_input_tokens_seen": 53346720, | |
| "step": 3455 | |
| }, | |
| { | |
| "epoch": 14.194871794871794, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 1.7580846860548146e-06, | |
| "loss": 0.4758, | |
| "num_input_tokens_seen": 53420864, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 14.215384615384615, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 1.7550992633955535e-06, | |
| "loss": 0.4305, | |
| "num_input_tokens_seen": 53490976, | |
| "step": 3465 | |
| }, | |
| { | |
| "epoch": 14.235897435897435, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 1.7521127997708621e-06, | |
| "loss": 0.4533, | |
| "num_input_tokens_seen": 53570816, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 14.256410256410255, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 1.7491253073673903e-06, | |
| "loss": 0.5019, | |
| "num_input_tokens_seen": 53647072, | |
| "step": 3475 | |
| }, | |
| { | |
| "epoch": 14.276923076923078, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 1.7461367983759862e-06, | |
| "loss": 0.4575, | |
| "num_input_tokens_seen": 53727648, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 14.297435897435898, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 1.7431472849916455e-06, | |
| "loss": 0.4728, | |
| "num_input_tokens_seen": 53800544, | |
| "step": 3485 | |
| }, | |
| { | |
| "epoch": 14.317948717948719, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 1.7401567794134636e-06, | |
| "loss": 0.4677, | |
| "num_input_tokens_seen": 53875552, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 14.338461538461539, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.737165293844583e-06, | |
| "loss": 0.456, | |
| "num_input_tokens_seen": 53954208, | |
| "step": 3495 | |
| }, | |
| { | |
| "epoch": 14.35897435897436, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 1.7341728404921471e-06, | |
| "loss": 0.4798, | |
| "num_input_tokens_seen": 54033248, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 14.37948717948718, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 1.7311794315672477e-06, | |
| "loss": 0.4166, | |
| "num_input_tokens_seen": 54109440, | |
| "step": 3505 | |
| }, | |
| { | |
| "epoch": 14.4, | |
| "grad_norm": 0.1953125, | |
| "learning_rate": 1.7281850792848752e-06, | |
| "loss": 0.4663, | |
| "num_input_tokens_seen": 54186368, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 14.42051282051282, | |
| "grad_norm": 0.1591796875, | |
| "learning_rate": 1.7251897958638704e-06, | |
| "loss": 0.4053, | |
| "num_input_tokens_seen": 54267648, | |
| "step": 3515 | |
| }, | |
| { | |
| "epoch": 14.441025641025641, | |
| "grad_norm": 0.193359375, | |
| "learning_rate": 1.7221935935268735e-06, | |
| "loss": 0.5342, | |
| "num_input_tokens_seen": 54342304, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 14.461538461538462, | |
| "grad_norm": 0.1455078125, | |
| "learning_rate": 1.719196484500274e-06, | |
| "loss": 0.4992, | |
| "num_input_tokens_seen": 54423808, | |
| "step": 3525 | |
| }, | |
| { | |
| "epoch": 14.482051282051282, | |
| "grad_norm": 0.1494140625, | |
| "learning_rate": 1.7161984810141625e-06, | |
| "loss": 0.5067, | |
| "num_input_tokens_seen": 54501760, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 14.502564102564103, | |
| "grad_norm": 0.126953125, | |
| "learning_rate": 1.7131995953022776e-06, | |
| "loss": 0.4286, | |
| "num_input_tokens_seen": 54576512, | |
| "step": 3535 | |
| }, | |
| { | |
| "epoch": 14.523076923076923, | |
| "grad_norm": 0.1611328125, | |
| "learning_rate": 1.7101998396019593e-06, | |
| "loss": 0.436, | |
| "num_input_tokens_seen": 54649696, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 14.543589743589743, | |
| "grad_norm": 0.138671875, | |
| "learning_rate": 1.7071992261540983e-06, | |
| "loss": 0.4889, | |
| "num_input_tokens_seen": 54728864, | |
| "step": 3545 | |
| }, | |
| { | |
| "epoch": 14.564102564102564, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 1.7041977672030842e-06, | |
| "loss": 0.4585, | |
| "num_input_tokens_seen": 54797856, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 14.584615384615384, | |
| "grad_norm": 0.1064453125, | |
| "learning_rate": 1.7011954749967564e-06, | |
| "loss": 0.4646, | |
| "num_input_tokens_seen": 54875168, | |
| "step": 3555 | |
| }, | |
| { | |
| "epoch": 14.605128205128205, | |
| "grad_norm": 0.1474609375, | |
| "learning_rate": 1.6981923617863566e-06, | |
| "loss": 0.4894, | |
| "num_input_tokens_seen": 54949312, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 14.625641025641025, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1.6951884398264742e-06, | |
| "loss": 0.5159, | |
| "num_input_tokens_seen": 55024768, | |
| "step": 3565 | |
| }, | |
| { | |
| "epoch": 14.646153846153846, | |
| "grad_norm": 0.150390625, | |
| "learning_rate": 1.692183721375001e-06, | |
| "loss": 0.4221, | |
| "num_input_tokens_seen": 55100128, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 14.666666666666666, | |
| "grad_norm": 0.1435546875, | |
| "learning_rate": 1.6891782186930767e-06, | |
| "loss": 0.5417, | |
| "num_input_tokens_seen": 55182080, | |
| "step": 3575 | |
| }, | |
| { | |
| "epoch": 14.687179487179487, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 1.6861719440450437e-06, | |
| "loss": 0.5072, | |
| "num_input_tokens_seen": 55274048, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 14.707692307692307, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 1.6831649096983923e-06, | |
| "loss": 0.5008, | |
| "num_input_tokens_seen": 55347648, | |
| "step": 3585 | |
| }, | |
| { | |
| "epoch": 14.728205128205127, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 1.680157127923715e-06, | |
| "loss": 0.4411, | |
| "num_input_tokens_seen": 55418368, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 14.74871794871795, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 1.677148610994652e-06, | |
| "loss": 0.4654, | |
| "num_input_tokens_seen": 55490176, | |
| "step": 3595 | |
| }, | |
| { | |
| "epoch": 14.76923076923077, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 1.6741393711878454e-06, | |
| "loss": 0.4576, | |
| "num_input_tokens_seen": 55570784, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 14.78974358974359, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1.6711294207828852e-06, | |
| "loss": 0.4488, | |
| "num_input_tokens_seen": 55643936, | |
| "step": 3605 | |
| }, | |
| { | |
| "epoch": 14.810256410256411, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 1.6681187720622627e-06, | |
| "loss": 0.5175, | |
| "num_input_tokens_seen": 55727936, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 14.830769230769231, | |
| "grad_norm": 0.484375, | |
| "learning_rate": 1.6651074373113176e-06, | |
| "loss": 0.4271, | |
| "num_input_tokens_seen": 55804992, | |
| "step": 3615 | |
| }, | |
| { | |
| "epoch": 14.851282051282052, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 1.66209542881819e-06, | |
| "loss": 0.4631, | |
| "num_input_tokens_seen": 55878976, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 14.871794871794872, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 1.6590827588737685e-06, | |
| "loss": 0.5453, | |
| "num_input_tokens_seen": 55969376, | |
| "step": 3625 | |
| }, | |
| { | |
| "epoch": 14.892307692307693, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 1.6560694397716412e-06, | |
| "loss": 0.5086, | |
| "num_input_tokens_seen": 56041024, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 14.912820512820513, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 1.6530554838080458e-06, | |
| "loss": 0.4602, | |
| "num_input_tokens_seen": 56117152, | |
| "step": 3635 | |
| }, | |
| { | |
| "epoch": 14.933333333333334, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 1.6500409032818175e-06, | |
| "loss": 0.5046, | |
| "num_input_tokens_seen": 56191264, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 14.953846153846154, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 1.6470257104943413e-06, | |
| "loss": 0.53, | |
| "num_input_tokens_seen": 56279264, | |
| "step": 3645 | |
| }, | |
| { | |
| "epoch": 14.974358974358974, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 1.6440099177494991e-06, | |
| "loss": 0.4639, | |
| "num_input_tokens_seen": 56350784, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 14.994871794871795, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 1.6409935373536227e-06, | |
| "loss": 0.5015, | |
| "num_input_tokens_seen": 56430432, | |
| "step": 3655 | |
| }, | |
| { | |
| "epoch": 15.015384615384615, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 1.6379765816154413e-06, | |
| "loss": 0.4503, | |
| "num_input_tokens_seen": 56509056, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 15.035897435897436, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 1.634959062846031e-06, | |
| "loss": 0.4688, | |
| "num_input_tokens_seen": 56578016, | |
| "step": 3665 | |
| }, | |
| { | |
| "epoch": 15.056410256410256, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 1.631940993358767e-06, | |
| "loss": 0.4761, | |
| "num_input_tokens_seen": 56653536, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 15.076923076923077, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 1.6289223854692708e-06, | |
| "loss": 0.4433, | |
| "num_input_tokens_seen": 56730560, | |
| "step": 3675 | |
| }, | |
| { | |
| "epoch": 15.097435897435897, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 1.6259032514953601e-06, | |
| "loss": 0.4315, | |
| "num_input_tokens_seen": 56813312, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 15.117948717948718, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 1.6228836037570015e-06, | |
| "loss": 0.4885, | |
| "num_input_tokens_seen": 56891264, | |
| "step": 3685 | |
| }, | |
| { | |
| "epoch": 15.138461538461538, | |
| "grad_norm": 0.400390625, | |
| "learning_rate": 1.619863454576256e-06, | |
| "loss": 0.4703, | |
| "num_input_tokens_seen": 56964864, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 15.158974358974358, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 1.6168428162772322e-06, | |
| "loss": 0.4518, | |
| "num_input_tokens_seen": 57050144, | |
| "step": 3695 | |
| }, | |
| { | |
| "epoch": 15.179487179487179, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 1.6138217011860336e-06, | |
| "loss": 0.4858, | |
| "num_input_tokens_seen": 57125632, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 15.2, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 1.6108001216307107e-06, | |
| "loss": 0.459, | |
| "num_input_tokens_seen": 57204128, | |
| "step": 3705 | |
| }, | |
| { | |
| "epoch": 15.22051282051282, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 1.6077780899412068e-06, | |
| "loss": 0.4639, | |
| "num_input_tokens_seen": 57281536, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 15.24102564102564, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 1.6047556184493133e-06, | |
| "loss": 0.4496, | |
| "num_input_tokens_seen": 57354816, | |
| "step": 3715 | |
| }, | |
| { | |
| "epoch": 15.261538461538462, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1.601732719488614e-06, | |
| "loss": 0.5043, | |
| "num_input_tokens_seen": 57435328, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 15.282051282051283, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 1.5987094053944384e-06, | |
| "loss": 0.4357, | |
| "num_input_tokens_seen": 57507424, | |
| "step": 3725 | |
| }, | |
| { | |
| "epoch": 15.302564102564103, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1.5956856885038086e-06, | |
| "loss": 0.5211, | |
| "num_input_tokens_seen": 57586880, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 15.323076923076924, | |
| "grad_norm": 0.1982421875, | |
| "learning_rate": 1.592661581155392e-06, | |
| "loss": 0.5036, | |
| "num_input_tokens_seen": 57669600, | |
| "step": 3735 | |
| }, | |
| { | |
| "epoch": 15.343589743589744, | |
| "grad_norm": 0.1708984375, | |
| "learning_rate": 1.5896370956894477e-06, | |
| "loss": 0.4751, | |
| "num_input_tokens_seen": 57744608, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 15.364102564102565, | |
| "grad_norm": 0.1533203125, | |
| "learning_rate": 1.5866122444477794e-06, | |
| "loss": 0.4764, | |
| "num_input_tokens_seen": 57818496, | |
| "step": 3745 | |
| }, | |
| { | |
| "epoch": 15.384615384615385, | |
| "grad_norm": 0.1376953125, | |
| "learning_rate": 1.5835870397736817e-06, | |
| "loss": 0.4551, | |
| "num_input_tokens_seen": 57898432, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 15.405128205128205, | |
| "grad_norm": 0.166015625, | |
| "learning_rate": 1.5805614940118928e-06, | |
| "loss": 0.5198, | |
| "num_input_tokens_seen": 57974624, | |
| "step": 3755 | |
| }, | |
| { | |
| "epoch": 15.425641025641026, | |
| "grad_norm": 0.1337890625, | |
| "learning_rate": 1.577535619508542e-06, | |
| "loss": 0.5164, | |
| "num_input_tokens_seen": 58052576, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 15.446153846153846, | |
| "grad_norm": 0.130859375, | |
| "learning_rate": 1.5745094286111004e-06, | |
| "loss": 0.4699, | |
| "num_input_tokens_seen": 58126592, | |
| "step": 3765 | |
| }, | |
| { | |
| "epoch": 15.466666666666667, | |
| "grad_norm": 0.125, | |
| "learning_rate": 1.5714829336683297e-06, | |
| "loss": 0.4804, | |
| "num_input_tokens_seen": 58214784, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 15.487179487179487, | |
| "grad_norm": 0.1259765625, | |
| "learning_rate": 1.5684561470302337e-06, | |
| "loss": 0.4731, | |
| "num_input_tokens_seen": 58284480, | |
| "step": 3775 | |
| }, | |
| { | |
| "epoch": 15.507692307692308, | |
| "grad_norm": 0.1279296875, | |
| "learning_rate": 1.5654290810480041e-06, | |
| "loss": 0.5115, | |
| "num_input_tokens_seen": 58359488, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 15.528205128205128, | |
| "grad_norm": 0.1201171875, | |
| "learning_rate": 1.562401748073975e-06, | |
| "loss": 0.4713, | |
| "num_input_tokens_seen": 58433632, | |
| "step": 3785 | |
| }, | |
| { | |
| "epoch": 15.548717948717949, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 1.5593741604615679e-06, | |
| "loss": 0.5218, | |
| "num_input_tokens_seen": 58526048, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 15.569230769230769, | |
| "grad_norm": 0.15234375, | |
| "learning_rate": 1.5563463305652454e-06, | |
| "loss": 0.4432, | |
| "num_input_tokens_seen": 58593440, | |
| "step": 3795 | |
| }, | |
| { | |
| "epoch": 15.58974358974359, | |
| "grad_norm": 0.1552734375, | |
| "learning_rate": 1.5533182707404563e-06, | |
| "loss": 0.4793, | |
| "num_input_tokens_seen": 58664832, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 15.61025641025641, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 1.550289993343591e-06, | |
| "loss": 0.4287, | |
| "num_input_tokens_seen": 58734976, | |
| "step": 3805 | |
| }, | |
| { | |
| "epoch": 15.63076923076923, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 1.547261510731924e-06, | |
| "loss": 0.5701, | |
| "num_input_tokens_seen": 58826304, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 15.65128205128205, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 1.5442328352635706e-06, | |
| "loss": 0.4858, | |
| "num_input_tokens_seen": 58897216, | |
| "step": 3815 | |
| }, | |
| { | |
| "epoch": 15.671794871794871, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 1.54120397929743e-06, | |
| "loss": 0.4541, | |
| "num_input_tokens_seen": 58970880, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 15.692307692307692, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 1.5381749551931405e-06, | |
| "loss": 0.5178, | |
| "num_input_tokens_seen": 59049408, | |
| "step": 3825 | |
| }, | |
| { | |
| "epoch": 15.712820512820512, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 1.5351457753110244e-06, | |
| "loss": 0.4799, | |
| "num_input_tokens_seen": 59122336, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 15.733333333333333, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1.5321164520120421e-06, | |
| "loss": 0.4284, | |
| "num_input_tokens_seen": 59197600, | |
| "step": 3835 | |
| }, | |
| { | |
| "epoch": 15.753846153846155, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 1.5290869976577366e-06, | |
| "loss": 0.4343, | |
| "num_input_tokens_seen": 59268224, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 15.774358974358975, | |
| "grad_norm": 0.279296875, | |
| "learning_rate": 1.5260574246101875e-06, | |
| "loss": 0.4371, | |
| "num_input_tokens_seen": 59341408, | |
| "step": 3845 | |
| }, | |
| { | |
| "epoch": 15.794871794871796, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 1.5230277452319585e-06, | |
| "loss": 0.5017, | |
| "num_input_tokens_seen": 59423648, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 15.815384615384616, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 1.519997971886046e-06, | |
| "loss": 0.4473, | |
| "num_input_tokens_seen": 59495872, | |
| "step": 3855 | |
| }, | |
| { | |
| "epoch": 15.835897435897436, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 1.5169681169358314e-06, | |
| "loss": 0.4693, | |
| "num_input_tokens_seen": 59574880, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 15.856410256410257, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 1.513938192745028e-06, | |
| "loss": 0.4955, | |
| "num_input_tokens_seen": 59657472, | |
| "step": 3865 | |
| }, | |
| { | |
| "epoch": 15.876923076923077, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 1.5109082116776328e-06, | |
| "loss": 0.4219, | |
| "num_input_tokens_seen": 59733888, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 15.897435897435898, | |
| "grad_norm": 0.423828125, | |
| "learning_rate": 1.5078781860978732e-06, | |
| "loss": 0.4462, | |
| "num_input_tokens_seen": 59811392, | |
| "step": 3875 | |
| }, | |
| { | |
| "epoch": 15.917948717948718, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 1.5048481283701594e-06, | |
| "loss": 0.4938, | |
| "num_input_tokens_seen": 59886784, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 15.938461538461539, | |
| "grad_norm": 0.390625, | |
| "learning_rate": 1.501818050859033e-06, | |
| "loss": 0.4465, | |
| "num_input_tokens_seen": 59959072, | |
| "step": 3885 | |
| }, | |
| { | |
| "epoch": 15.95897435897436, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 1.498787965929116e-06, | |
| "loss": 0.462, | |
| "num_input_tokens_seen": 60032832, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 15.97948717948718, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 1.49575788594506e-06, | |
| "loss": 0.5279, | |
| "num_input_tokens_seen": 60119456, | |
| "step": 3895 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 1.4927278232714975e-06, | |
| "loss": 0.4908, | |
| "num_input_tokens_seen": 60204448, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 16.02051282051282, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 1.4896977902729893e-06, | |
| "loss": 0.4607, | |
| "num_input_tokens_seen": 60279104, | |
| "step": 3905 | |
| }, | |
| { | |
| "epoch": 16.04102564102564, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 1.4866677993139761e-06, | |
| "loss": 0.4405, | |
| "num_input_tokens_seen": 60349504, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 16.06153846153846, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 1.4836378627587266e-06, | |
| "loss": 0.4744, | |
| "num_input_tokens_seen": 60420640, | |
| "step": 3915 | |
| }, | |
| { | |
| "epoch": 16.08205128205128, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 1.4806079929712874e-06, | |
| "loss": 0.4868, | |
| "num_input_tokens_seen": 60497728, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 16.102564102564102, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 1.477578202315433e-06, | |
| "loss": 0.5095, | |
| "num_input_tokens_seen": 60579008, | |
| "step": 3925 | |
| }, | |
| { | |
| "epoch": 16.123076923076923, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 1.4745485031546143e-06, | |
| "loss": 0.5101, | |
| "num_input_tokens_seen": 60657088, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 16.143589743589743, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 1.4715189078519094e-06, | |
| "loss": 0.4938, | |
| "num_input_tokens_seen": 60734560, | |
| "step": 3935 | |
| }, | |
| { | |
| "epoch": 16.164102564102564, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 1.468489428769973e-06, | |
| "loss": 0.4785, | |
| "num_input_tokens_seen": 60812960, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 16.184615384615384, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 1.4654600782709843e-06, | |
| "loss": 0.4953, | |
| "num_input_tokens_seen": 60892768, | |
| "step": 3945 | |
| }, | |
| { | |
| "epoch": 16.205128205128204, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 1.4624308687165985e-06, | |
| "loss": 0.4472, | |
| "num_input_tokens_seen": 60964096, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 16.225641025641025, | |
| "grad_norm": 0.1943359375, | |
| "learning_rate": 1.4594018124678965e-06, | |
| "loss": 0.4191, | |
| "num_input_tokens_seen": 61034304, | |
| "step": 3955 | |
| }, | |
| { | |
| "epoch": 16.246153846153845, | |
| "grad_norm": 0.181640625, | |
| "learning_rate": 1.4563729218853329e-06, | |
| "loss": 0.465, | |
| "num_input_tokens_seen": 61113024, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 16.266666666666666, | |
| "grad_norm": 0.1787109375, | |
| "learning_rate": 1.4533442093286853e-06, | |
| "loss": 0.4703, | |
| "num_input_tokens_seen": 61184256, | |
| "step": 3965 | |
| }, | |
| { | |
| "epoch": 16.287179487179486, | |
| "grad_norm": 0.1416015625, | |
| "learning_rate": 1.4503156871570062e-06, | |
| "loss": 0.4773, | |
| "num_input_tokens_seen": 61262208, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 16.307692307692307, | |
| "grad_norm": 0.1630859375, | |
| "learning_rate": 1.4472873677285706e-06, | |
| "loss": 0.4172, | |
| "num_input_tokens_seen": 61334560, | |
| "step": 3975 | |
| }, | |
| { | |
| "epoch": 16.328205128205127, | |
| "grad_norm": 0.1484375, | |
| "learning_rate": 1.4442592634008266e-06, | |
| "loss": 0.4326, | |
| "num_input_tokens_seen": 61405888, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 16.348717948717947, | |
| "grad_norm": 0.1240234375, | |
| "learning_rate": 1.4412313865303438e-06, | |
| "loss": 0.5303, | |
| "num_input_tokens_seen": 61489280, | |
| "step": 3985 | |
| }, | |
| { | |
| "epoch": 16.369230769230768, | |
| "grad_norm": 0.16015625, | |
| "learning_rate": 1.4382037494727649e-06, | |
| "loss": 0.4726, | |
| "num_input_tokens_seen": 61562912, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 16.38974358974359, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 1.435176364582752e-06, | |
| "loss": 0.458, | |
| "num_input_tokens_seen": 61652064, | |
| "step": 3995 | |
| }, | |
| { | |
| "epoch": 16.41025641025641, | |
| "grad_norm": 0.1171875, | |
| "learning_rate": 1.4321492442139405e-06, | |
| "loss": 0.5266, | |
| "num_input_tokens_seen": 61737536, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 16.43076923076923, | |
| "grad_norm": 0.158203125, | |
| "learning_rate": 1.4291224007188849e-06, | |
| "loss": 0.4394, | |
| "num_input_tokens_seen": 61811840, | |
| "step": 4005 | |
| }, | |
| { | |
| "epoch": 16.45128205128205, | |
| "grad_norm": 0.142578125, | |
| "learning_rate": 1.4260958464490104e-06, | |
| "loss": 0.4474, | |
| "num_input_tokens_seen": 61884288, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 16.47179487179487, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1.4230695937545616e-06, | |
| "loss": 0.5, | |
| "num_input_tokens_seen": 61958944, | |
| "step": 4015 | |
| }, | |
| { | |
| "epoch": 16.49230769230769, | |
| "grad_norm": 0.1455078125, | |
| "learning_rate": 1.420043654984553e-06, | |
| "loss": 0.4766, | |
| "num_input_tokens_seen": 62038016, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 16.51282051282051, | |
| "grad_norm": 0.2890625, | |
| "learning_rate": 1.4170180424867176e-06, | |
| "loss": 0.4593, | |
| "num_input_tokens_seen": 62107616, | |
| "step": 4025 | |
| }, | |
| { | |
| "epoch": 16.533333333333335, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 1.4139927686074577e-06, | |
| "loss": 0.4446, | |
| "num_input_tokens_seen": 62188352, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 16.553846153846155, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 1.4109678456917926e-06, | |
| "loss": 0.4658, | |
| "num_input_tokens_seen": 62259456, | |
| "step": 4035 | |
| }, | |
| { | |
| "epoch": 16.574358974358976, | |
| "grad_norm": 0.50390625, | |
| "learning_rate": 1.4079432860833104e-06, | |
| "loss": 0.5363, | |
| "num_input_tokens_seen": 62356448, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 16.594871794871796, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 1.4049191021241176e-06, | |
| "loss": 0.5006, | |
| "num_input_tokens_seen": 62429824, | |
| "step": 4045 | |
| }, | |
| { | |
| "epoch": 16.615384615384617, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 1.4018953061547853e-06, | |
| "loss": 0.4638, | |
| "num_input_tokens_seen": 62514528, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 16.635897435897437, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 1.3988719105143038e-06, | |
| "loss": 0.4413, | |
| "num_input_tokens_seen": 62593472, | |
| "step": 4055 | |
| }, | |
| { | |
| "epoch": 16.656410256410258, | |
| "grad_norm": 0.48046875, | |
| "learning_rate": 1.395848927540028e-06, | |
| "loss": 0.4506, | |
| "num_input_tokens_seen": 62671520, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 16.676923076923078, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 1.39282636956763e-06, | |
| "loss": 0.4616, | |
| "num_input_tokens_seen": 62749856, | |
| "step": 4065 | |
| }, | |
| { | |
| "epoch": 16.6974358974359, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 1.3898042489310471e-06, | |
| "loss": 0.456, | |
| "num_input_tokens_seen": 62823616, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 16.71794871794872, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 1.3867825779624325e-06, | |
| "loss": 0.45, | |
| "num_input_tokens_seen": 62908416, | |
| "step": 4075 | |
| }, | |
| { | |
| "epoch": 16.73846153846154, | |
| "grad_norm": 0.41015625, | |
| "learning_rate": 1.3837613689921037e-06, | |
| "loss": 0.4563, | |
| "num_input_tokens_seen": 62978016, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 16.75897435897436, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 1.380740634348494e-06, | |
| "loss": 0.4952, | |
| "num_input_tokens_seen": 63065952, | |
| "step": 4085 | |
| }, | |
| { | |
| "epoch": 16.77948717948718, | |
| "grad_norm": 0.3359375, | |
| "learning_rate": 1.3777203863580993e-06, | |
| "loss": 0.4327, | |
| "num_input_tokens_seen": 63141792, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 16.8, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 1.3747006373454321e-06, | |
| "loss": 0.5152, | |
| "num_input_tokens_seen": 63223520, | |
| "step": 4095 | |
| }, | |
| { | |
| "epoch": 16.82051282051282, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 1.3716813996329669e-06, | |
| "loss": 0.4952, | |
| "num_input_tokens_seen": 63297696, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 16.84102564102564, | |
| "grad_norm": 0.4609375, | |
| "learning_rate": 1.3686626855410929e-06, | |
| "loss": 0.4873, | |
| "num_input_tokens_seen": 63378304, | |
| "step": 4105 | |
| }, | |
| { | |
| "epoch": 16.861538461538462, | |
| "grad_norm": 0.388671875, | |
| "learning_rate": 1.3656445073880615e-06, | |
| "loss": 0.4461, | |
| "num_input_tokens_seen": 63454400, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 16.882051282051282, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 1.3626268774899381e-06, | |
| "loss": 0.4612, | |
| "num_input_tokens_seen": 63526400, | |
| "step": 4115 | |
| }, | |
| { | |
| "epoch": 16.902564102564103, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 1.3596098081605505e-06, | |
| "loss": 0.5061, | |
| "num_input_tokens_seen": 63612448, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 16.923076923076923, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 1.3565933117114385e-06, | |
| "loss": 0.486, | |
| "num_input_tokens_seen": 63693248, | |
| "step": 4125 | |
| }, | |
| { | |
| "epoch": 16.943589743589744, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 1.3535774004518057e-06, | |
| "loss": 0.4772, | |
| "num_input_tokens_seen": 63765504, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 16.964102564102564, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 1.3505620866884666e-06, | |
| "loss": 0.457, | |
| "num_input_tokens_seen": 63839616, | |
| "step": 4135 | |
| }, | |
| { | |
| "epoch": 16.984615384615385, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 1.347547382725797e-06, | |
| "loss": 0.4961, | |
| "num_input_tokens_seen": 63924736, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 17.005128205128205, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 1.344533300865686e-06, | |
| "loss": 0.5218, | |
| "num_input_tokens_seen": 64000256, | |
| "step": 4145 | |
| }, | |
| { | |
| "epoch": 17.025641025641026, | |
| "grad_norm": 0.44921875, | |
| "learning_rate": 1.3415198534074823e-06, | |
| "loss": 0.4501, | |
| "num_input_tokens_seen": 64077472, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 17.046153846153846, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 1.3385070526479475e-06, | |
| "loss": 0.4704, | |
| "num_input_tokens_seen": 64157024, | |
| "step": 4155 | |
| }, | |
| { | |
| "epoch": 17.066666666666666, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 1.3354949108812026e-06, | |
| "loss": 0.485, | |
| "num_input_tokens_seen": 64226656, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 17.087179487179487, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1.3324834403986815e-06, | |
| "loss": 0.4316, | |
| "num_input_tokens_seen": 64302816, | |
| "step": 4165 | |
| }, | |
| { | |
| "epoch": 17.107692307692307, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1.3294726534890766e-06, | |
| "loss": 0.4781, | |
| "num_input_tokens_seen": 64371840, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 17.128205128205128, | |
| "grad_norm": 0.189453125, | |
| "learning_rate": 1.326462562438293e-06, | |
| "loss": 0.4331, | |
| "num_input_tokens_seen": 64451776, | |
| "step": 4175 | |
| }, | |
| { | |
| "epoch": 17.148717948717948, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 1.3234531795293945e-06, | |
| "loss": 0.4754, | |
| "num_input_tokens_seen": 64523712, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 17.16923076923077, | |
| "grad_norm": 0.1904296875, | |
| "learning_rate": 1.3204445170425565e-06, | |
| "loss": 0.4983, | |
| "num_input_tokens_seen": 64594272, | |
| "step": 4185 | |
| }, | |
| { | |
| "epoch": 17.18974358974359, | |
| "grad_norm": 0.1650390625, | |
| "learning_rate": 1.3174365872550138e-06, | |
| "loss": 0.4852, | |
| "num_input_tokens_seen": 64677280, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 17.21025641025641, | |
| "grad_norm": 0.1591796875, | |
| "learning_rate": 1.3144294024410122e-06, | |
| "loss": 0.4843, | |
| "num_input_tokens_seen": 64758080, | |
| "step": 4195 | |
| }, | |
| { | |
| "epoch": 17.23076923076923, | |
| "grad_norm": 0.1435546875, | |
| "learning_rate": 1.3114229748717563e-06, | |
| "loss": 0.4564, | |
| "num_input_tokens_seen": 64830848, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 17.25128205128205, | |
| "grad_norm": 0.1474609375, | |
| "learning_rate": 1.308417316815362e-06, | |
| "loss": 0.4432, | |
| "num_input_tokens_seen": 64907296, | |
| "step": 4205 | |
| }, | |
| { | |
| "epoch": 17.27179487179487, | |
| "grad_norm": 0.134765625, | |
| "learning_rate": 1.3054124405368036e-06, | |
| "loss": 0.4901, | |
| "num_input_tokens_seen": 64987392, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 17.29230769230769, | |
| "grad_norm": 0.1318359375, | |
| "learning_rate": 1.3024083582978668e-06, | |
| "loss": 0.5076, | |
| "num_input_tokens_seen": 65077408, | |
| "step": 4215 | |
| }, | |
| { | |
| "epoch": 17.31282051282051, | |
| "grad_norm": 0.12158203125, | |
| "learning_rate": 1.2994050823570968e-06, | |
| "loss": 0.4818, | |
| "num_input_tokens_seen": 65159104, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 17.333333333333332, | |
| "grad_norm": 0.099609375, | |
| "learning_rate": 1.2964026249697475e-06, | |
| "loss": 0.4525, | |
| "num_input_tokens_seen": 65242816, | |
| "step": 4225 | |
| }, | |
| { | |
| "epoch": 17.353846153846153, | |
| "grad_norm": 0.1337890625, | |
| "learning_rate": 1.293400998387734e-06, | |
| "loss": 0.4444, | |
| "num_input_tokens_seen": 65317376, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 17.374358974358973, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1.2904002148595797e-06, | |
| "loss": 0.4897, | |
| "num_input_tokens_seen": 65391264, | |
| "step": 4235 | |
| }, | |
| { | |
| "epoch": 17.394871794871793, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 1.2874002866303695e-06, | |
| "loss": 0.5116, | |
| "num_input_tokens_seen": 65467744, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 17.415384615384614, | |
| "grad_norm": 0.1298828125, | |
| "learning_rate": 1.2844012259416965e-06, | |
| "loss": 0.4498, | |
| "num_input_tokens_seen": 65535680, | |
| "step": 4245 | |
| }, | |
| { | |
| "epoch": 17.435897435897434, | |
| "grad_norm": 0.2890625, | |
| "learning_rate": 1.2814030450316151e-06, | |
| "loss": 0.4984, | |
| "num_input_tokens_seen": 65611680, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 17.456410256410255, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 1.2784057561345885e-06, | |
| "loss": 0.5276, | |
| "num_input_tokens_seen": 65684160, | |
| "step": 4255 | |
| }, | |
| { | |
| "epoch": 17.476923076923075, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 1.2754093714814407e-06, | |
| "loss": 0.4434, | |
| "num_input_tokens_seen": 65755104, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 17.4974358974359, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 1.2724139032993057e-06, | |
| "loss": 0.4743, | |
| "num_input_tokens_seen": 65835072, | |
| "step": 4265 | |
| }, | |
| { | |
| "epoch": 17.51794871794872, | |
| "grad_norm": 0.89453125, | |
| "learning_rate": 1.269419363811577e-06, | |
| "loss": 0.4691, | |
| "num_input_tokens_seen": 65916704, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 17.53846153846154, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 1.2664257652378598e-06, | |
| "loss": 0.5129, | |
| "num_input_tokens_seen": 65989696, | |
| "step": 4275 | |
| }, | |
| { | |
| "epoch": 17.55897435897436, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 1.2634331197939183e-06, | |
| "loss": 0.4868, | |
| "num_input_tokens_seen": 66071744, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 17.57948717948718, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 1.2604414396916286e-06, | |
| "loss": 0.489, | |
| "num_input_tokens_seen": 66153024, | |
| "step": 4285 | |
| }, | |
| { | |
| "epoch": 17.6, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 1.2574507371389267e-06, | |
| "loss": 0.4855, | |
| "num_input_tokens_seen": 66220384, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 17.620512820512822, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 1.25446102433976e-06, | |
| "loss": 0.4791, | |
| "num_input_tokens_seen": 66298912, | |
| "step": 4295 | |
| }, | |
| { | |
| "epoch": 17.641025641025642, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 1.2514723134940365e-06, | |
| "loss": 0.4563, | |
| "num_input_tokens_seen": 66373120, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 17.661538461538463, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 1.2484846167975767e-06, | |
| "loss": 0.4577, | |
| "num_input_tokens_seen": 66452096, | |
| "step": 4305 | |
| }, | |
| { | |
| "epoch": 17.682051282051283, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 1.2454979464420624e-06, | |
| "loss": 0.4875, | |
| "num_input_tokens_seen": 66530784, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 17.702564102564104, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 1.2425123146149863e-06, | |
| "loss": 0.4551, | |
| "num_input_tokens_seen": 66607264, | |
| "step": 4315 | |
| }, | |
| { | |
| "epoch": 17.723076923076924, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 1.2395277334996047e-06, | |
| "loss": 0.5378, | |
| "num_input_tokens_seen": 66692608, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 17.743589743589745, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 1.2365442152748846e-06, | |
| "loss": 0.4518, | |
| "num_input_tokens_seen": 66773504, | |
| "step": 4325 | |
| }, | |
| { | |
| "epoch": 17.764102564102565, | |
| "grad_norm": 0.625, | |
| "learning_rate": 1.2335617721154577e-06, | |
| "loss": 0.4412, | |
| "num_input_tokens_seen": 66844928, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 17.784615384615385, | |
| "grad_norm": 0.443359375, | |
| "learning_rate": 1.2305804161915671e-06, | |
| "loss": 0.5214, | |
| "num_input_tokens_seen": 66918080, | |
| "step": 4335 | |
| }, | |
| { | |
| "epoch": 17.805128205128206, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 1.2276001596690205e-06, | |
| "loss": 0.47, | |
| "num_input_tokens_seen": 66988992, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 17.825641025641026, | |
| "grad_norm": 0.419921875, | |
| "learning_rate": 1.2246210147091382e-06, | |
| "loss": 0.4545, | |
| "num_input_tokens_seen": 67068288, | |
| "step": 4345 | |
| }, | |
| { | |
| "epoch": 17.846153846153847, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 1.2216429934687062e-06, | |
| "loss": 0.4574, | |
| "num_input_tokens_seen": 67142560, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 17.866666666666667, | |
| "grad_norm": 0.40234375, | |
| "learning_rate": 1.2186661080999234e-06, | |
| "loss": 0.4717, | |
| "num_input_tokens_seen": 67222560, | |
| "step": 4355 | |
| }, | |
| { | |
| "epoch": 17.887179487179488, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 1.2156903707503544e-06, | |
| "loss": 0.4662, | |
| "num_input_tokens_seen": 67301760, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 17.907692307692308, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 1.2127157935628789e-06, | |
| "loss": 0.4958, | |
| "num_input_tokens_seen": 67381856, | |
| "step": 4365 | |
| }, | |
| { | |
| "epoch": 17.92820512820513, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 1.2097423886756433e-06, | |
| "loss": 0.5327, | |
| "num_input_tokens_seen": 67467968, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 17.94871794871795, | |
| "grad_norm": 0.248046875, | |
| "learning_rate": 1.2067701682220084e-06, | |
| "loss": 0.4606, | |
| "num_input_tokens_seen": 67548672, | |
| "step": 4375 | |
| }, | |
| { | |
| "epoch": 17.96923076923077, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 1.2037991443305043e-06, | |
| "loss": 0.4451, | |
| "num_input_tokens_seen": 67625248, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 17.98974358974359, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 1.2008293291247754e-06, | |
| "loss": 0.4416, | |
| "num_input_tokens_seen": 67695808, | |
| "step": 4385 | |
| }, | |
| { | |
| "epoch": 18.01025641025641, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1.1978607347235367e-06, | |
| "loss": 0.4741, | |
| "num_input_tokens_seen": 67774400, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 18.03076923076923, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1.1948933732405205e-06, | |
| "loss": 0.446, | |
| "num_input_tokens_seen": 67846848, | |
| "step": 4395 | |
| }, | |
| { | |
| "epoch": 18.05128205128205, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 1.191927256784427e-06, | |
| "loss": 0.4315, | |
| "num_input_tokens_seen": 67917440, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 18.07179487179487, | |
| "grad_norm": 0.181640625, | |
| "learning_rate": 1.1889623974588772e-06, | |
| "loss": 0.4055, | |
| "num_input_tokens_seen": 67990656, | |
| "step": 4405 | |
| }, | |
| { | |
| "epoch": 18.092307692307692, | |
| "grad_norm": 0.1591796875, | |
| "learning_rate": 1.185998807362362e-06, | |
| "loss": 0.499, | |
| "num_input_tokens_seen": 68067872, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 18.112820512820512, | |
| "grad_norm": 0.162109375, | |
| "learning_rate": 1.1830364985881924e-06, | |
| "loss": 0.481, | |
| "num_input_tokens_seen": 68145632, | |
| "step": 4415 | |
| }, | |
| { | |
| "epoch": 18.133333333333333, | |
| "grad_norm": 0.1650390625, | |
| "learning_rate": 1.1800754832244515e-06, | |
| "loss": 0.4931, | |
| "num_input_tokens_seen": 68221472, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 18.153846153846153, | |
| "grad_norm": 0.1513671875, | |
| "learning_rate": 1.1771157733539442e-06, | |
| "loss": 0.4377, | |
| "num_input_tokens_seen": 68292352, | |
| "step": 4425 | |
| }, | |
| { | |
| "epoch": 18.174358974358974, | |
| "grad_norm": 0.150390625, | |
| "learning_rate": 1.174157381054148e-06, | |
| "loss": 0.5265, | |
| "num_input_tokens_seen": 68374880, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 18.194871794871794, | |
| "grad_norm": 0.1416015625, | |
| "learning_rate": 1.1712003183971644e-06, | |
| "loss": 0.5103, | |
| "num_input_tokens_seen": 68457280, | |
| "step": 4435 | |
| }, | |
| { | |
| "epoch": 18.215384615384615, | |
| "grad_norm": 0.119140625, | |
| "learning_rate": 1.1682445974496686e-06, | |
| "loss": 0.5133, | |
| "num_input_tokens_seen": 68540224, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 18.235897435897435, | |
| "grad_norm": 0.11865234375, | |
| "learning_rate": 1.1652902302728607e-06, | |
| "loss": 0.4805, | |
| "num_input_tokens_seen": 68621056, | |
| "step": 4445 | |
| }, | |
| { | |
| "epoch": 18.256410256410255, | |
| "grad_norm": 0.12353515625, | |
| "learning_rate": 1.1623372289224172e-06, | |
| "loss": 0.4863, | |
| "num_input_tokens_seen": 68697440, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 18.276923076923076, | |
| "grad_norm": 0.125, | |
| "learning_rate": 1.1593856054484403e-06, | |
| "loss": 0.4685, | |
| "num_input_tokens_seen": 68778944, | |
| "step": 4455 | |
| }, | |
| { | |
| "epoch": 18.297435897435896, | |
| "grad_norm": 0.1982421875, | |
| "learning_rate": 1.156435371895411e-06, | |
| "loss": 0.4934, | |
| "num_input_tokens_seen": 68859712, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 18.317948717948717, | |
| "grad_norm": 0.1962890625, | |
| "learning_rate": 1.1534865403021366e-06, | |
| "loss": 0.4439, | |
| "num_input_tokens_seen": 68932512, | |
| "step": 4465 | |
| }, | |
| { | |
| "epoch": 18.338461538461537, | |
| "grad_norm": 0.15625, | |
| "learning_rate": 1.1505391227017046e-06, | |
| "loss": 0.4295, | |
| "num_input_tokens_seen": 69006080, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 18.358974358974358, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 1.1475931311214338e-06, | |
| "loss": 0.4644, | |
| "num_input_tokens_seen": 69081984, | |
| "step": 4475 | |
| }, | |
| { | |
| "epoch": 18.379487179487178, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 1.144648577582821e-06, | |
| "loss": 0.4899, | |
| "num_input_tokens_seen": 69158240, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 18.4, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 1.141705474101498e-06, | |
| "loss": 0.4763, | |
| "num_input_tokens_seen": 69241184, | |
| "step": 4485 | |
| }, | |
| { | |
| "epoch": 18.42051282051282, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 1.138763832687177e-06, | |
| "loss": 0.4784, | |
| "num_input_tokens_seen": 69326112, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 18.44102564102564, | |
| "grad_norm": 0.765625, | |
| "learning_rate": 1.1358236653436052e-06, | |
| "loss": 0.4624, | |
| "num_input_tokens_seen": 69400832, | |
| "step": 4495 | |
| }, | |
| { | |
| "epoch": 18.46153846153846, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 1.1328849840685143e-06, | |
| "loss": 0.4523, | |
| "num_input_tokens_seen": 69478464, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 18.48205128205128, | |
| "grad_norm": 0.43359375, | |
| "learning_rate": 1.1299478008535726e-06, | |
| "loss": 0.5372, | |
| "num_input_tokens_seen": 69559712, | |
| "step": 4505 | |
| }, | |
| { | |
| "epoch": 18.5025641025641, | |
| "grad_norm": 0.458984375, | |
| "learning_rate": 1.1270121276843342e-06, | |
| "loss": 0.4224, | |
| "num_input_tokens_seen": 69638528, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 18.523076923076925, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 1.1240779765401926e-06, | |
| "loss": 0.4726, | |
| "num_input_tokens_seen": 69719264, | |
| "step": 4515 | |
| }, | |
| { | |
| "epoch": 18.543589743589745, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 1.1211453593943293e-06, | |
| "loss": 0.4407, | |
| "num_input_tokens_seen": 69798432, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 18.564102564102566, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 1.118214288213667e-06, | |
| "loss": 0.4594, | |
| "num_input_tokens_seen": 69869984, | |
| "step": 4525 | |
| }, | |
| { | |
| "epoch": 18.584615384615386, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 1.1152847749588186e-06, | |
| "loss": 0.5029, | |
| "num_input_tokens_seen": 69949088, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 18.605128205128207, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 1.1123568315840419e-06, | |
| "loss": 0.4386, | |
| "num_input_tokens_seen": 70023904, | |
| "step": 4535 | |
| }, | |
| { | |
| "epoch": 18.625641025641027, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 1.1094304700371863e-06, | |
| "loss": 0.5118, | |
| "num_input_tokens_seen": 70098400, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 18.646153846153847, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 1.1065057022596483e-06, | |
| "loss": 0.4559, | |
| "num_input_tokens_seen": 70176576, | |
| "step": 4545 | |
| }, | |
| { | |
| "epoch": 18.666666666666668, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 1.1035825401863185e-06, | |
| "loss": 0.4418, | |
| "num_input_tokens_seen": 70255264, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 18.68717948717949, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 1.100660995745538e-06, | |
| "loss": 0.435, | |
| "num_input_tokens_seen": 70324160, | |
| "step": 4555 | |
| }, | |
| { | |
| "epoch": 18.70769230769231, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 1.0977410808590437e-06, | |
| "loss": 0.5499, | |
| "num_input_tokens_seen": 70411616, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 18.72820512820513, | |
| "grad_norm": 0.470703125, | |
| "learning_rate": 1.0948228074419269e-06, | |
| "loss": 0.4705, | |
| "num_input_tokens_seen": 70482592, | |
| "step": 4565 | |
| }, | |
| { | |
| "epoch": 18.74871794871795, | |
| "grad_norm": 0.44921875, | |
| "learning_rate": 1.0919061874025774e-06, | |
| "loss": 0.4466, | |
| "num_input_tokens_seen": 70565120, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 18.76923076923077, | |
| "grad_norm": 0.3984375, | |
| "learning_rate": 1.0889912326426393e-06, | |
| "loss": 0.4323, | |
| "num_input_tokens_seen": 70635168, | |
| "step": 4575 | |
| }, | |
| { | |
| "epoch": 18.78974358974359, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 1.0860779550569609e-06, | |
| "loss": 0.5235, | |
| "num_input_tokens_seen": 70720096, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 18.81025641025641, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 1.083166366533548e-06, | |
| "loss": 0.4659, | |
| "num_input_tokens_seen": 70807648, | |
| "step": 4585 | |
| }, | |
| { | |
| "epoch": 18.83076923076923, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 1.080256478953512e-06, | |
| "loss": 0.4968, | |
| "num_input_tokens_seen": 70888832, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 18.851282051282052, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 1.0773483041910247e-06, | |
| "loss": 0.5112, | |
| "num_input_tokens_seen": 70971136, | |
| "step": 4595 | |
| }, | |
| { | |
| "epoch": 18.871794871794872, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 1.0744418541132676e-06, | |
| "loss": 0.4414, | |
| "num_input_tokens_seen": 71038816, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 18.892307692307693, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 1.0715371405803858e-06, | |
| "loss": 0.4874, | |
| "num_input_tokens_seen": 71114720, | |
| "step": 4605 | |
| }, | |
| { | |
| "epoch": 18.912820512820513, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 1.0686341754454364e-06, | |
| "loss": 0.4571, | |
| "num_input_tokens_seen": 71189856, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 18.933333333333334, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1.0657329705543439e-06, | |
| "loss": 0.4661, | |
| "num_input_tokens_seen": 71271136, | |
| "step": 4615 | |
| }, | |
| { | |
| "epoch": 18.953846153846154, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 1.0628335377458477e-06, | |
| "loss": 0.435, | |
| "num_input_tokens_seen": 71343488, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 18.974358974358974, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 1.0599358888514582e-06, | |
| "loss": 0.4845, | |
| "num_input_tokens_seen": 71419104, | |
| "step": 4625 | |
| }, | |
| { | |
| "epoch": 18.994871794871795, | |
| "grad_norm": 0.2021484375, | |
| "learning_rate": 1.0570400356954044e-06, | |
| "loss": 0.5111, | |
| "num_input_tokens_seen": 71491776, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 19.015384615384615, | |
| "grad_norm": 0.18359375, | |
| "learning_rate": 1.0541459900945892e-06, | |
| "loss": 0.5006, | |
| "num_input_tokens_seen": 71573184, | |
| "step": 4635 | |
| }, | |
| { | |
| "epoch": 19.035897435897436, | |
| "grad_norm": 0.1630859375, | |
| "learning_rate": 1.0512537638585379e-06, | |
| "loss": 0.4828, | |
| "num_input_tokens_seen": 71648768, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 19.056410256410256, | |
| "grad_norm": 0.1572265625, | |
| "learning_rate": 1.0483633687893526e-06, | |
| "loss": 0.4607, | |
| "num_input_tokens_seen": 71721824, | |
| "step": 4645 | |
| }, | |
| { | |
| "epoch": 19.076923076923077, | |
| "grad_norm": 0.1376953125, | |
| "learning_rate": 1.0454748166816645e-06, | |
| "loss": 0.497, | |
| "num_input_tokens_seen": 71797568, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 19.097435897435897, | |
| "grad_norm": 0.13671875, | |
| "learning_rate": 1.0425881193225808e-06, | |
| "loss": 0.4607, | |
| "num_input_tokens_seen": 71871136, | |
| "step": 4655 | |
| }, | |
| { | |
| "epoch": 19.117948717948718, | |
| "grad_norm": 0.1337890625, | |
| "learning_rate": 1.0397032884916438e-06, | |
| "loss": 0.4232, | |
| "num_input_tokens_seen": 71941984, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 19.138461538461538, | |
| "grad_norm": 0.123046875, | |
| "learning_rate": 1.0368203359607767e-06, | |
| "loss": 0.5216, | |
| "num_input_tokens_seen": 72024736, | |
| "step": 4665 | |
| }, | |
| { | |
| "epoch": 19.15897435897436, | |
| "grad_norm": 0.12109375, | |
| "learning_rate": 1.0339392734942393e-06, | |
| "loss": 0.4565, | |
| "num_input_tokens_seen": 72096064, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 19.17948717948718, | |
| "grad_norm": 0.1337890625, | |
| "learning_rate": 1.031060112848578e-06, | |
| "loss": 0.4488, | |
| "num_input_tokens_seen": 72171168, | |
| "step": 4675 | |
| }, | |
| { | |
| "epoch": 19.2, | |
| "grad_norm": 0.126953125, | |
| "learning_rate": 1.0281828657725798e-06, | |
| "loss": 0.4567, | |
| "num_input_tokens_seen": 72249056, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 19.22051282051282, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 1.0253075440072212e-06, | |
| "loss": 0.4556, | |
| "num_input_tokens_seen": 72338944, | |
| "step": 4685 | |
| }, | |
| { | |
| "epoch": 19.24102564102564, | |
| "grad_norm": 0.1484375, | |
| "learning_rate": 1.0224341592856245e-06, | |
| "loss": 0.4686, | |
| "num_input_tokens_seen": 72412032, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 19.26153846153846, | |
| "grad_norm": 0.1435546875, | |
| "learning_rate": 1.0195627233330052e-06, | |
| "loss": 0.4087, | |
| "num_input_tokens_seen": 72485760, | |
| "step": 4695 | |
| }, | |
| { | |
| "epoch": 19.28205128205128, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 1.0166932478666292e-06, | |
| "loss": 0.4634, | |
| "num_input_tokens_seen": 72565344, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 19.3025641025641, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 1.0138257445957601e-06, | |
| "loss": 0.5316, | |
| "num_input_tokens_seen": 72644800, | |
| "step": 4705 | |
| }, | |
| { | |
| "epoch": 19.323076923076922, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 1.0109602252216153e-06, | |
| "loss": 0.4929, | |
| "num_input_tokens_seen": 72725440, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 19.343589743589742, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 1.0080967014373152e-06, | |
| "loss": 0.442, | |
| "num_input_tokens_seen": 72796064, | |
| "step": 4715 | |
| }, | |
| { | |
| "epoch": 19.364102564102563, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 1.0052351849278385e-06, | |
| "loss": 0.4442, | |
| "num_input_tokens_seen": 72870080, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 19.384615384615383, | |
| "grad_norm": 0.2490234375, | |
| "learning_rate": 1.0023756873699723e-06, | |
| "loss": 0.4751, | |
| "num_input_tokens_seen": 72944288, | |
| "step": 4725 | |
| }, | |
| { | |
| "epoch": 19.405128205128204, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 9.995182204322637e-07, | |
| "loss": 0.5129, | |
| "num_input_tokens_seen": 73021664, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 19.425641025641024, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 9.966627957749767e-07, | |
| "loss": 0.4796, | |
| "num_input_tokens_seen": 73097920, | |
| "step": 4735 | |
| }, | |
| { | |
| "epoch": 19.446153846153845, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 9.93809425050039e-07, | |
| "loss": 0.4785, | |
| "num_input_tokens_seen": 73170112, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 19.466666666666665, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 9.909581199009971e-07, | |
| "loss": 0.4379, | |
| "num_input_tokens_seen": 73241312, | |
| "step": 4745 | |
| }, | |
| { | |
| "epoch": 19.487179487179485, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 9.8810889196297e-07, | |
| "loss": 0.5408, | |
| "num_input_tokens_seen": 73319936, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 19.50769230769231, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 9.852617528625992e-07, | |
| "loss": 0.4878, | |
| "num_input_tokens_seen": 73397824, | |
| "step": 4755 | |
| }, | |
| { | |
| "epoch": 19.52820512820513, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 9.824167142180032e-07, | |
| "loss": 0.4627, | |
| "num_input_tokens_seen": 73470688, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 19.54871794871795, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 9.795737876387285e-07, | |
| "loss": 0.4799, | |
| "num_input_tokens_seen": 73545792, | |
| "step": 4765 | |
| }, | |
| { | |
| "epoch": 19.56923076923077, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 9.76732984725704e-07, | |
| "loss": 0.428, | |
| "num_input_tokens_seen": 73617760, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 19.58974358974359, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 9.738943170711916e-07, | |
| "loss": 0.4608, | |
| "num_input_tokens_seen": 73707328, | |
| "step": 4775 | |
| }, | |
| { | |
| "epoch": 19.61025641025641, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 9.710577962587412e-07, | |
| "loss": 0.4951, | |
| "num_input_tokens_seen": 73782976, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 19.630769230769232, | |
| "grad_norm": 0.3828125, | |
| "learning_rate": 9.68223433863141e-07, | |
| "loss": 0.53, | |
| "num_input_tokens_seen": 73872256, | |
| "step": 4785 | |
| }, | |
| { | |
| "epoch": 19.651282051282053, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 9.653912414503731e-07, | |
| "loss": 0.45, | |
| "num_input_tokens_seen": 73947520, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 19.671794871794873, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 9.625612305775626e-07, | |
| "loss": 0.4218, | |
| "num_input_tokens_seen": 74023104, | |
| "step": 4795 | |
| }, | |
| { | |
| "epoch": 19.692307692307693, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 9.597334127929346e-07, | |
| "loss": 0.4531, | |
| "num_input_tokens_seen": 74105920, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 19.712820512820514, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 9.569077996357638e-07, | |
| "loss": 0.4142, | |
| "num_input_tokens_seen": 74182752, | |
| "step": 4805 | |
| }, | |
| { | |
| "epoch": 19.733333333333334, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 9.54084402636329e-07, | |
| "loss": 0.5121, | |
| "num_input_tokens_seen": 74265248, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 19.753846153846155, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 9.512632333158653e-07, | |
| "loss": 0.523, | |
| "num_input_tokens_seen": 74363232, | |
| "step": 4815 | |
| }, | |
| { | |
| "epoch": 19.774358974358975, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 9.484443031865176e-07, | |
| "loss": 0.5103, | |
| "num_input_tokens_seen": 74442848, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 19.794871794871796, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 9.456276237512949e-07, | |
| "loss": 0.4725, | |
| "num_input_tokens_seen": 74513376, | |
| "step": 4825 | |
| }, | |
| { | |
| "epoch": 19.815384615384616, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 9.428132065040198e-07, | |
| "loss": 0.4921, | |
| "num_input_tokens_seen": 74588928, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 19.835897435897436, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 9.40001062929285e-07, | |
| "loss": 0.4331, | |
| "num_input_tokens_seen": 74659040, | |
| "step": 4835 | |
| }, | |
| { | |
| "epoch": 19.856410256410257, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 9.371912045024046e-07, | |
| "loss": 0.44, | |
| "num_input_tokens_seen": 74737472, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 19.876923076923077, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 9.343836426893687e-07, | |
| "loss": 0.4831, | |
| "num_input_tokens_seen": 74810592, | |
| "step": 4845 | |
| }, | |
| { | |
| "epoch": 19.897435897435898, | |
| "grad_norm": 0.1943359375, | |
| "learning_rate": 9.315783889467943e-07, | |
| "loss": 0.4494, | |
| "num_input_tokens_seen": 74880096, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 19.91794871794872, | |
| "grad_norm": 0.1953125, | |
| "learning_rate": 9.287754547218821e-07, | |
| "loss": 0.5063, | |
| "num_input_tokens_seen": 74951168, | |
| "step": 4855 | |
| }, | |
| { | |
| "epoch": 19.93846153846154, | |
| "grad_norm": 0.1611328125, | |
| "learning_rate": 9.259748514523654e-07, | |
| "loss": 0.4975, | |
| "num_input_tokens_seen": 75034592, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 19.95897435897436, | |
| "grad_norm": 0.1728515625, | |
| "learning_rate": 9.231765905664677e-07, | |
| "loss": 0.4802, | |
| "num_input_tokens_seen": 75108992, | |
| "step": 4865 | |
| }, | |
| { | |
| "epoch": 19.97948717948718, | |
| "grad_norm": 0.166015625, | |
| "learning_rate": 9.20380683482853e-07, | |
| "loss": 0.4473, | |
| "num_input_tokens_seen": 75185824, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 9.175871416105802e-07, | |
| "loss": 0.4971, | |
| "num_input_tokens_seen": 75263936, | |
| "step": 4875 | |
| }, | |
| { | |
| "epoch": 20.02051282051282, | |
| "grad_norm": 0.134765625, | |
| "learning_rate": 9.147959763490578e-07, | |
| "loss": 0.493, | |
| "num_input_tokens_seen": 75342752, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 20.04102564102564, | |
| "grad_norm": 0.11669921875, | |
| "learning_rate": 9.120071990879949e-07, | |
| "loss": 0.4709, | |
| "num_input_tokens_seen": 75420704, | |
| "step": 4885 | |
| }, | |
| { | |
| "epoch": 20.06153846153846, | |
| "grad_norm": 0.13671875, | |
| "learning_rate": 9.092208212073569e-07, | |
| "loss": 0.5248, | |
| "num_input_tokens_seen": 75496128, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 20.08205128205128, | |
| "grad_norm": 0.1220703125, | |
| "learning_rate": 9.064368540773177e-07, | |
| "loss": 0.525, | |
| "num_input_tokens_seen": 75585984, | |
| "step": 4895 | |
| }, | |
| { | |
| "epoch": 20.102564102564102, | |
| "grad_norm": 0.1357421875, | |
| "learning_rate": 9.036553090582145e-07, | |
| "loss": 0.5186, | |
| "num_input_tokens_seen": 75668416, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 20.123076923076923, | |
| "grad_norm": 0.15234375, | |
| "learning_rate": 9.008761975004994e-07, | |
| "loss": 0.456, | |
| "num_input_tokens_seen": 75744960, | |
| "step": 4905 | |
| }, | |
| { | |
| "epoch": 20.143589743589743, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 8.98099530744696e-07, | |
| "loss": 0.4507, | |
| "num_input_tokens_seen": 75818688, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 20.164102564102564, | |
| "grad_norm": 0.150390625, | |
| "learning_rate": 8.953253201213517e-07, | |
| "loss": 0.4552, | |
| "num_input_tokens_seen": 75901824, | |
| "step": 4915 | |
| }, | |
| { | |
| "epoch": 20.184615384615384, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 8.925535769509895e-07, | |
| "loss": 0.4153, | |
| "num_input_tokens_seen": 75973248, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 20.205128205128204, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 8.897843125440653e-07, | |
| "loss": 0.4452, | |
| "num_input_tokens_seen": 76043840, | |
| "step": 4925 | |
| }, | |
| { | |
| "epoch": 20.217435897435898, | |
| "num_input_tokens_seen": 76105440, | |
| "step": 4928, | |
| "total_flos": 1.6998510508071322e+18, | |
| "train_loss": 0.4908539823365289, | |
| "train_runtime": 36017.1103, | |
| "train_samples_per_second": 3.465, | |
| "train_steps_per_second": 0.216 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 7776, | |
| "num_input_tokens_seen": 76105440, | |
| "num_train_epochs": 32, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.6998510508071322e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
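
For reference, a minimal Python sketch of how a Trainer log like the one above can be inspected once saved to disk. It assumes the standard HuggingFace trainer_state.json layout shown in this file; the file name, smoothing window, and plotting choices are illustrative assumptions, not values taken from the log.

import json

import matplotlib.pyplot as plt

# Load the serialized trainer state (assumed file name).
with open("trainer_state.json") as f:
    state = json.load(f)

# The final summary entry reports train_loss/train_runtime instead of "loss",
# so keep only the per-step logging entries.
records = [r for r in state["log_history"] if "loss" in r]
steps = [r["step"] for r in records]
losses = [r["loss"] for r in records]

# Simple trailing moving average to tame the step-to-step noise in the raw log.
window = 20
smoothed = [sum(losses[max(0, i - window + 1): i + 1]) / (i - max(0, i - window + 1) + 1)
            for i in range(len(losses))]

plt.plot(steps, losses, alpha=0.3, label="raw loss")
plt.plot(steps, smoothed, label=f"moving average (window={window})")
plt.xlabel("step")
plt.ylabel("training loss")
plt.legend()
plt.show()

The summary entry also supports quick sanity arithmetic: 76,105,440 input tokens over a 36,017-second run works out to roughly 2,113 tokens per second on average.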