diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,61563 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 8790, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00034129692832764505, + "grad_norm": 8.071894645690918, + "learning_rate": 0.0009998862343572241, + "loss": 12.1418, + "step": 1 + }, + { + "epoch": 0.0006825938566552901, + "grad_norm": 5.984577178955078, + "learning_rate": 0.0009997724687144482, + "loss": 10.9212, + "step": 2 + }, + { + "epoch": 0.0010238907849829352, + "grad_norm": 5.766450881958008, + "learning_rate": 0.0009996587030716723, + "loss": 10.3976, + "step": 3 + }, + { + "epoch": 0.0013651877133105802, + "grad_norm": 4.486149787902832, + "learning_rate": 0.0009995449374288964, + "loss": 9.6927, + "step": 4 + }, + { + "epoch": 0.0017064846416382253, + "grad_norm": 4.054387092590332, + "learning_rate": 0.0009994311717861205, + "loss": 9.4759, + "step": 5 + }, + { + "epoch": 0.0020477815699658703, + "grad_norm": 4.101088523864746, + "learning_rate": 0.0009993174061433449, + "loss": 9.2064, + "step": 6 + }, + { + "epoch": 0.002389078498293515, + "grad_norm": 4.606311798095703, + "learning_rate": 0.000999203640500569, + "loss": 8.5577, + "step": 7 + }, + { + "epoch": 0.0027303754266211604, + "grad_norm": 4.251020431518555, + "learning_rate": 0.000999089874857793, + "loss": 8.6099, + "step": 8 + }, + { + "epoch": 0.0030716723549488053, + "grad_norm": 4.296706199645996, + "learning_rate": 0.0009989761092150172, + "loss": 8.6605, + "step": 9 + }, + { + "epoch": 0.0034129692832764505, + "grad_norm": 4.185203552246094, + "learning_rate": 0.000998862343572241, + "loss": 8.4278, + "step": 10 + }, + { + "epoch": 0.0037542662116040954, + "grad_norm": 19.910564422607422, + "learning_rate": 0.0009987485779294652, + "loss": 6.7563, + "step": 11 + }, + { + "epoch": 0.004095563139931741, + "grad_norm": 4.908084392547607, + "learning_rate": 0.0009986348122866895, + "loss": 8.5025, + "step": 12 + }, + { + "epoch": 0.004436860068259386, + "grad_norm": 4.213999271392822, + "learning_rate": 0.0009985210466439136, + "loss": 8.3884, + "step": 13 + }, + { + "epoch": 0.00477815699658703, + "grad_norm": 5.283620834350586, + "learning_rate": 0.0009984072810011377, + "loss": 7.6757, + "step": 14 + }, + { + "epoch": 0.005119453924914676, + "grad_norm": 4.11642599105835, + "learning_rate": 0.0009982935153583618, + "loss": 8.2126, + "step": 15 + }, + { + "epoch": 0.005460750853242321, + "grad_norm": 3.9162049293518066, + "learning_rate": 0.000998179749715586, + "loss": 8.1421, + "step": 16 + }, + { + "epoch": 0.005802047781569966, + "grad_norm": 3.7291955947875977, + "learning_rate": 0.00099806598407281, + "loss": 8.4332, + "step": 17 + }, + { + "epoch": 0.0061433447098976105, + "grad_norm": 4.422487258911133, + "learning_rate": 0.0009979522184300341, + "loss": 7.373, + "step": 18 + }, + { + "epoch": 0.006484641638225256, + "grad_norm": 3.9871609210968018, + "learning_rate": 0.0009978384527872582, + "loss": 7.8555, + "step": 19 + }, + { + "epoch": 0.006825938566552901, + "grad_norm": 4.012209415435791, + "learning_rate": 0.0009977246871444823, + "loss": 7.9322, + "step": 20 + }, + { + "epoch": 0.007167235494880546, + "grad_norm": 3.9194388389587402, + "learning_rate": 0.0009976109215017064, + "loss": 7.5508, + "step": 21 + }, + { + "epoch": 0.007508532423208191, + "grad_norm": 3.990365743637085, + "learning_rate": 0.0009974971558589305, + "loss": 7.8027, + "step": 22 + }, + { + "epoch": 0.007849829351535836, + "grad_norm": 4.236116409301758, + "learning_rate": 0.0009973833902161549, + "loss": 7.9234, + "step": 23 + }, + { + "epoch": 0.008191126279863481, + "grad_norm": 9.924948692321777, + "learning_rate": 0.000997269624573379, + "loss": 7.0817, + "step": 24 + }, + { + "epoch": 0.008532423208191127, + "grad_norm": 7.816974639892578, + "learning_rate": 0.000997155858930603, + "loss": 6.7681, + "step": 25 + }, + { + "epoch": 0.008873720136518772, + "grad_norm": 4.295772075653076, + "learning_rate": 0.0009970420932878272, + "loss": 7.8656, + "step": 26 + }, + { + "epoch": 0.009215017064846417, + "grad_norm": 8.219182968139648, + "learning_rate": 0.0009969283276450513, + "loss": 5.6737, + "step": 27 + }, + { + "epoch": 0.00955631399317406, + "grad_norm": 6.722366809844971, + "learning_rate": 0.0009968145620022754, + "loss": 6.44, + "step": 28 + }, + { + "epoch": 0.009897610921501706, + "grad_norm": 4.0717597007751465, + "learning_rate": 0.0009967007963594995, + "loss": 7.4495, + "step": 29 + }, + { + "epoch": 0.010238907849829351, + "grad_norm": 3.9176180362701416, + "learning_rate": 0.0009965870307167236, + "loss": 7.2357, + "step": 30 + }, + { + "epoch": 0.010580204778156996, + "grad_norm": 4.101375102996826, + "learning_rate": 0.0009964732650739477, + "loss": 7.068, + "step": 31 + }, + { + "epoch": 0.010921501706484642, + "grad_norm": 3.5856902599334717, + "learning_rate": 0.0009963594994311718, + "loss": 7.6736, + "step": 32 + }, + { + "epoch": 0.011262798634812287, + "grad_norm": 4.2619194984436035, + "learning_rate": 0.000996245733788396, + "loss": 7.0183, + "step": 33 + }, + { + "epoch": 0.011604095563139932, + "grad_norm": 3.7279083728790283, + "learning_rate": 0.00099613196814562, + "loss": 7.1387, + "step": 34 + }, + { + "epoch": 0.011945392491467578, + "grad_norm": 3.478987216949463, + "learning_rate": 0.0009960182025028441, + "loss": 7.4298, + "step": 35 + }, + { + "epoch": 0.012286689419795221, + "grad_norm": 3.570570707321167, + "learning_rate": 0.0009959044368600682, + "loss": 7.0718, + "step": 36 + }, + { + "epoch": 0.012627986348122866, + "grad_norm": 3.5319583415985107, + "learning_rate": 0.0009957906712172923, + "loss": 7.0964, + "step": 37 + }, + { + "epoch": 0.012969283276450512, + "grad_norm": 3.5056099891662598, + "learning_rate": 0.0009956769055745164, + "loss": 7.1669, + "step": 38 + }, + { + "epoch": 0.013310580204778157, + "grad_norm": 3.48520827293396, + "learning_rate": 0.0009955631399317405, + "loss": 7.5944, + "step": 39 + }, + { + "epoch": 0.013651877133105802, + "grad_norm": 3.7257206439971924, + "learning_rate": 0.0009954493742889647, + "loss": 7.0551, + "step": 40 + }, + { + "epoch": 0.013993174061433447, + "grad_norm": 4.222928524017334, + "learning_rate": 0.000995335608646189, + "loss": 6.9363, + "step": 41 + }, + { + "epoch": 0.014334470989761093, + "grad_norm": 4.023824214935303, + "learning_rate": 0.000995221843003413, + "loss": 7.2418, + "step": 42 + }, + { + "epoch": 0.014675767918088738, + "grad_norm": 3.970456600189209, + "learning_rate": 0.0009951080773606372, + "loss": 6.9394, + "step": 43 + }, + { + "epoch": 0.015017064846416382, + "grad_norm": 3.697723627090454, + "learning_rate": 0.0009949943117178613, + "loss": 7.1638, + "step": 44 + }, + { + "epoch": 0.015358361774744027, + "grad_norm": 9.966476440429688, + "learning_rate": 0.0009948805460750854, + "loss": 6.5215, + "step": 45 + }, + { + "epoch": 0.015699658703071672, + "grad_norm": 4.007918357849121, + "learning_rate": 0.0009947667804323095, + "loss": 7.0861, + "step": 46 + }, + { + "epoch": 0.016040955631399317, + "grad_norm": 4.113846778869629, + "learning_rate": 0.0009946530147895336, + "loss": 7.4096, + "step": 47 + }, + { + "epoch": 0.016382252559726963, + "grad_norm": 3.8716022968292236, + "learning_rate": 0.0009945392491467577, + "loss": 7.1557, + "step": 48 + }, + { + "epoch": 0.016723549488054608, + "grad_norm": 3.5497727394104004, + "learning_rate": 0.0009944254835039818, + "loss": 7.3205, + "step": 49 + }, + { + "epoch": 0.017064846416382253, + "grad_norm": 3.501861810684204, + "learning_rate": 0.000994311717861206, + "loss": 7.1354, + "step": 50 + }, + { + "epoch": 0.0174061433447099, + "grad_norm": 3.8758769035339355, + "learning_rate": 0.00099419795221843, + "loss": 6.743, + "step": 51 + }, + { + "epoch": 0.017747440273037544, + "grad_norm": 4.309325218200684, + "learning_rate": 0.0009940841865756541, + "loss": 6.7807, + "step": 52 + }, + { + "epoch": 0.01808873720136519, + "grad_norm": 3.619455337524414, + "learning_rate": 0.0009939704209328782, + "loss": 7.1105, + "step": 53 + }, + { + "epoch": 0.018430034129692834, + "grad_norm": 3.5727694034576416, + "learning_rate": 0.0009938566552901023, + "loss": 7.1482, + "step": 54 + }, + { + "epoch": 0.01877133105802048, + "grad_norm": 11.776286125183105, + "learning_rate": 0.0009937428896473264, + "loss": 5.6497, + "step": 55 + }, + { + "epoch": 0.01911262798634812, + "grad_norm": 4.078109264373779, + "learning_rate": 0.0009936291240045505, + "loss": 7.2197, + "step": 56 + }, + { + "epoch": 0.019453924914675767, + "grad_norm": 4.522604942321777, + "learning_rate": 0.0009935153583617747, + "loss": 6.3764, + "step": 57 + }, + { + "epoch": 0.019795221843003412, + "grad_norm": 3.437112808227539, + "learning_rate": 0.000993401592718999, + "loss": 6.8425, + "step": 58 + }, + { + "epoch": 0.020136518771331057, + "grad_norm": 3.7361385822296143, + "learning_rate": 0.000993287827076223, + "loss": 7.3883, + "step": 59 + }, + { + "epoch": 0.020477815699658702, + "grad_norm": 3.4807488918304443, + "learning_rate": 0.0009931740614334472, + "loss": 7.353, + "step": 60 + }, + { + "epoch": 0.020819112627986348, + "grad_norm": 3.5185515880584717, + "learning_rate": 0.0009930602957906713, + "loss": 7.1191, + "step": 61 + }, + { + "epoch": 0.021160409556313993, + "grad_norm": 3.42722749710083, + "learning_rate": 0.0009929465301478954, + "loss": 7.0395, + "step": 62 + }, + { + "epoch": 0.021501706484641638, + "grad_norm": 3.5530056953430176, + "learning_rate": 0.0009928327645051195, + "loss": 7.005, + "step": 63 + }, + { + "epoch": 0.021843003412969283, + "grad_norm": 3.577638626098633, + "learning_rate": 0.0009927189988623436, + "loss": 7.1476, + "step": 64 + }, + { + "epoch": 0.02218430034129693, + "grad_norm": 4.04377555847168, + "learning_rate": 0.0009926052332195677, + "loss": 7.1881, + "step": 65 + }, + { + "epoch": 0.022525597269624574, + "grad_norm": 4.380084037780762, + "learning_rate": 0.0009924914675767918, + "loss": 6.6989, + "step": 66 + }, + { + "epoch": 0.02286689419795222, + "grad_norm": 4.379724502563477, + "learning_rate": 0.000992377701934016, + "loss": 7.276, + "step": 67 + }, + { + "epoch": 0.023208191126279865, + "grad_norm": 3.8870277404785156, + "learning_rate": 0.00099226393629124, + "loss": 7.2336, + "step": 68 + }, + { + "epoch": 0.02354948805460751, + "grad_norm": 3.6682233810424805, + "learning_rate": 0.0009921501706484641, + "loss": 6.7899, + "step": 69 + }, + { + "epoch": 0.023890784982935155, + "grad_norm": 3.732855796813965, + "learning_rate": 0.0009920364050056882, + "loss": 6.5689, + "step": 70 + }, + { + "epoch": 0.024232081911262797, + "grad_norm": 3.7168567180633545, + "learning_rate": 0.0009919226393629123, + "loss": 6.9181, + "step": 71 + }, + { + "epoch": 0.024573378839590442, + "grad_norm": 3.8344621658325195, + "learning_rate": 0.0009918088737201364, + "loss": 7.4924, + "step": 72 + }, + { + "epoch": 0.024914675767918087, + "grad_norm": 5.3828606605529785, + "learning_rate": 0.0009916951080773605, + "loss": 6.9739, + "step": 73 + }, + { + "epoch": 0.025255972696245733, + "grad_norm": 3.8628880977630615, + "learning_rate": 0.0009915813424345847, + "loss": 7.7034, + "step": 74 + }, + { + "epoch": 0.025597269624573378, + "grad_norm": 4.138500213623047, + "learning_rate": 0.000991467576791809, + "loss": 7.1116, + "step": 75 + }, + { + "epoch": 0.025938566552901023, + "grad_norm": 3.994074821472168, + "learning_rate": 0.000991353811149033, + "loss": 6.6436, + "step": 76 + }, + { + "epoch": 0.02627986348122867, + "grad_norm": 3.761909246444702, + "learning_rate": 0.0009912400455062572, + "loss": 6.9494, + "step": 77 + }, + { + "epoch": 0.026621160409556314, + "grad_norm": 3.97804594039917, + "learning_rate": 0.0009911262798634813, + "loss": 6.7642, + "step": 78 + }, + { + "epoch": 0.02696245733788396, + "grad_norm": 12.43903923034668, + "learning_rate": 0.0009910125142207054, + "loss": 7.3236, + "step": 79 + }, + { + "epoch": 0.027303754266211604, + "grad_norm": 4.261969566345215, + "learning_rate": 0.0009908987485779295, + "loss": 6.7744, + "step": 80 + }, + { + "epoch": 0.02764505119453925, + "grad_norm": 7.615736484527588, + "learning_rate": 0.0009907849829351536, + "loss": 6.103, + "step": 81 + }, + { + "epoch": 0.027986348122866895, + "grad_norm": 3.833221197128296, + "learning_rate": 0.0009906712172923777, + "loss": 6.8201, + "step": 82 + }, + { + "epoch": 0.02832764505119454, + "grad_norm": 3.7682363986968994, + "learning_rate": 0.0009905574516496018, + "loss": 7.2732, + "step": 83 + }, + { + "epoch": 0.028668941979522185, + "grad_norm": 3.788194179534912, + "learning_rate": 0.000990443686006826, + "loss": 6.8951, + "step": 84 + }, + { + "epoch": 0.02901023890784983, + "grad_norm": 3.743475914001465, + "learning_rate": 0.00099032992036405, + "loss": 7.4113, + "step": 85 + }, + { + "epoch": 0.029351535836177476, + "grad_norm": 6.027090072631836, + "learning_rate": 0.0009902161547212743, + "loss": 6.7511, + "step": 86 + }, + { + "epoch": 0.029692832764505118, + "grad_norm": 11.415129661560059, + "learning_rate": 0.0009901023890784985, + "loss": 6.9999, + "step": 87 + }, + { + "epoch": 0.030034129692832763, + "grad_norm": 3.9674999713897705, + "learning_rate": 0.0009899886234357223, + "loss": 7.0998, + "step": 88 + }, + { + "epoch": 0.03037542662116041, + "grad_norm": 2.7847208976745605, + "learning_rate": 0.0009898748577929464, + "loss": 3.5909, + "step": 89 + }, + { + "epoch": 0.030716723549488054, + "grad_norm": 4.207309722900391, + "learning_rate": 0.0009897610921501705, + "loss": 6.5897, + "step": 90 + }, + { + "epoch": 0.0310580204778157, + "grad_norm": 3.6647539138793945, + "learning_rate": 0.0009896473265073947, + "loss": 7.064, + "step": 91 + }, + { + "epoch": 0.031399317406143344, + "grad_norm": 3.664642572402954, + "learning_rate": 0.000989533560864619, + "loss": 6.9888, + "step": 92 + }, + { + "epoch": 0.03174061433447099, + "grad_norm": 3.5732812881469727, + "learning_rate": 0.000989419795221843, + "loss": 6.7395, + "step": 93 + }, + { + "epoch": 0.032081911262798635, + "grad_norm": 3.5871763229370117, + "learning_rate": 0.0009893060295790672, + "loss": 7.214, + "step": 94 + }, + { + "epoch": 0.032423208191126277, + "grad_norm": 7.773305892944336, + "learning_rate": 0.0009891922639362913, + "loss": 6.2924, + "step": 95 + }, + { + "epoch": 0.032764505119453925, + "grad_norm": 4.1082563400268555, + "learning_rate": 0.0009890784982935154, + "loss": 7.0112, + "step": 96 + }, + { + "epoch": 0.03310580204778157, + "grad_norm": 3.709672212600708, + "learning_rate": 0.0009889647326507395, + "loss": 6.6266, + "step": 97 + }, + { + "epoch": 0.033447098976109216, + "grad_norm": 5.293731212615967, + "learning_rate": 0.0009888509670079636, + "loss": 5.1656, + "step": 98 + }, + { + "epoch": 0.03378839590443686, + "grad_norm": 3.9541916847229004, + "learning_rate": 0.0009887372013651877, + "loss": 7.2169, + "step": 99 + }, + { + "epoch": 0.034129692832764506, + "grad_norm": 4.045575141906738, + "learning_rate": 0.0009886234357224118, + "loss": 6.9998, + "step": 100 + }, + { + "epoch": 0.03447098976109215, + "grad_norm": 3.9652297496795654, + "learning_rate": 0.000988509670079636, + "loss": 7.1183, + "step": 101 + }, + { + "epoch": 0.0348122866894198, + "grad_norm": 3.934983730316162, + "learning_rate": 0.00098839590443686, + "loss": 7.133, + "step": 102 + }, + { + "epoch": 0.03515358361774744, + "grad_norm": 3.6193904876708984, + "learning_rate": 0.0009882821387940843, + "loss": 7.1528, + "step": 103 + }, + { + "epoch": 0.03549488054607509, + "grad_norm": 3.532212018966675, + "learning_rate": 0.0009881683731513085, + "loss": 7.3217, + "step": 104 + }, + { + "epoch": 0.03583617747440273, + "grad_norm": 5.649550437927246, + "learning_rate": 0.0009880546075085326, + "loss": 6.6923, + "step": 105 + }, + { + "epoch": 0.03617747440273038, + "grad_norm": 3.867431402206421, + "learning_rate": 0.0009879408418657567, + "loss": 6.7362, + "step": 106 + }, + { + "epoch": 0.03651877133105802, + "grad_norm": 3.762444019317627, + "learning_rate": 0.0009878270762229806, + "loss": 7.6837, + "step": 107 + }, + { + "epoch": 0.03686006825938567, + "grad_norm": 3.932126522064209, + "learning_rate": 0.0009877133105802047, + "loss": 7.0018, + "step": 108 + }, + { + "epoch": 0.03720136518771331, + "grad_norm": 5.752960205078125, + "learning_rate": 0.000987599544937429, + "loss": 6.8083, + "step": 109 + }, + { + "epoch": 0.03754266211604096, + "grad_norm": 3.6438000202178955, + "learning_rate": 0.000987485779294653, + "loss": 6.7968, + "step": 110 + }, + { + "epoch": 0.0378839590443686, + "grad_norm": 3.9477944374084473, + "learning_rate": 0.0009873720136518772, + "loss": 6.9717, + "step": 111 + }, + { + "epoch": 0.03822525597269624, + "grad_norm": 3.776455879211426, + "learning_rate": 0.0009872582480091013, + "loss": 6.5417, + "step": 112 + }, + { + "epoch": 0.03856655290102389, + "grad_norm": 4.05007791519165, + "learning_rate": 0.0009871444823663254, + "loss": 7.0061, + "step": 113 + }, + { + "epoch": 0.03890784982935153, + "grad_norm": 3.8773951530456543, + "learning_rate": 0.0009870307167235495, + "loss": 7.1258, + "step": 114 + }, + { + "epoch": 0.03924914675767918, + "grad_norm": 3.6618783473968506, + "learning_rate": 0.0009869169510807736, + "loss": 6.6578, + "step": 115 + }, + { + "epoch": 0.039590443686006824, + "grad_norm": 3.705451726913452, + "learning_rate": 0.0009868031854379977, + "loss": 7.15, + "step": 116 + }, + { + "epoch": 0.03993174061433447, + "grad_norm": 3.6103193759918213, + "learning_rate": 0.0009866894197952218, + "loss": 7.2358, + "step": 117 + }, + { + "epoch": 0.040273037542662114, + "grad_norm": 3.8052639961242676, + "learning_rate": 0.000986575654152446, + "loss": 6.7413, + "step": 118 + }, + { + "epoch": 0.04061433447098976, + "grad_norm": 3.7359206676483154, + "learning_rate": 0.00098646188850967, + "loss": 6.8834, + "step": 119 + }, + { + "epoch": 0.040955631399317405, + "grad_norm": 3.7333805561065674, + "learning_rate": 0.0009863481228668941, + "loss": 6.7141, + "step": 120 + }, + { + "epoch": 0.041296928327645054, + "grad_norm": 3.592845916748047, + "learning_rate": 0.0009862343572241185, + "loss": 6.9167, + "step": 121 + }, + { + "epoch": 0.041638225255972695, + "grad_norm": 3.8049752712249756, + "learning_rate": 0.0009861205915813426, + "loss": 7.1732, + "step": 122 + }, + { + "epoch": 0.041979522184300344, + "grad_norm": 3.7017672061920166, + "learning_rate": 0.0009860068259385667, + "loss": 7.1409, + "step": 123 + }, + { + "epoch": 0.042320819112627986, + "grad_norm": 3.581944704055786, + "learning_rate": 0.0009858930602957908, + "loss": 7.401, + "step": 124 + }, + { + "epoch": 0.042662116040955635, + "grad_norm": 3.60552716255188, + "learning_rate": 0.0009857792946530149, + "loss": 7.1492, + "step": 125 + }, + { + "epoch": 0.043003412969283276, + "grad_norm": 3.5588576793670654, + "learning_rate": 0.000985665529010239, + "loss": 7.2996, + "step": 126 + }, + { + "epoch": 0.04334470989761092, + "grad_norm": 3.566678524017334, + "learning_rate": 0.000985551763367463, + "loss": 6.7319, + "step": 127 + }, + { + "epoch": 0.04368600682593857, + "grad_norm": 5.839478492736816, + "learning_rate": 0.0009854379977246872, + "loss": 6.2529, + "step": 128 + }, + { + "epoch": 0.04402730375426621, + "grad_norm": 5.420695781707764, + "learning_rate": 0.0009853242320819113, + "loss": 6.3413, + "step": 129 + }, + { + "epoch": 0.04436860068259386, + "grad_norm": 4.142327785491943, + "learning_rate": 0.0009852104664391354, + "loss": 7.3027, + "step": 130 + }, + { + "epoch": 0.0447098976109215, + "grad_norm": 3.7694830894470215, + "learning_rate": 0.0009850967007963595, + "loss": 6.5612, + "step": 131 + }, + { + "epoch": 0.04505119453924915, + "grad_norm": 3.7021262645721436, + "learning_rate": 0.0009849829351535836, + "loss": 7.1486, + "step": 132 + }, + { + "epoch": 0.04539249146757679, + "grad_norm": 3.4137067794799805, + "learning_rate": 0.0009848691695108077, + "loss": 7.3512, + "step": 133 + }, + { + "epoch": 0.04573378839590444, + "grad_norm": 3.9189088344573975, + "learning_rate": 0.0009847554038680318, + "loss": 6.877, + "step": 134 + }, + { + "epoch": 0.04607508532423208, + "grad_norm": 3.598252534866333, + "learning_rate": 0.000984641638225256, + "loss": 7.1513, + "step": 135 + }, + { + "epoch": 0.04641638225255973, + "grad_norm": 3.869384527206421, + "learning_rate": 0.00098452787258248, + "loss": 6.7868, + "step": 136 + }, + { + "epoch": 0.04675767918088737, + "grad_norm": 4.416536808013916, + "learning_rate": 0.0009844141069397041, + "loss": 6.3465, + "step": 137 + }, + { + "epoch": 0.04709897610921502, + "grad_norm": 3.654963493347168, + "learning_rate": 0.0009843003412969285, + "loss": 7.4472, + "step": 138 + }, + { + "epoch": 0.04744027303754266, + "grad_norm": 3.6647439002990723, + "learning_rate": 0.0009841865756541526, + "loss": 7.0507, + "step": 139 + }, + { + "epoch": 0.04778156996587031, + "grad_norm": 3.4522414207458496, + "learning_rate": 0.0009840728100113767, + "loss": 7.1764, + "step": 140 + }, + { + "epoch": 0.04812286689419795, + "grad_norm": 3.3954365253448486, + "learning_rate": 0.0009839590443686008, + "loss": 6.9876, + "step": 141 + }, + { + "epoch": 0.048464163822525594, + "grad_norm": 3.482323169708252, + "learning_rate": 0.0009838452787258249, + "loss": 6.5746, + "step": 142 + }, + { + "epoch": 0.04880546075085324, + "grad_norm": 3.477513551712036, + "learning_rate": 0.000983731513083049, + "loss": 7.2068, + "step": 143 + }, + { + "epoch": 0.049146757679180884, + "grad_norm": 3.6850838661193848, + "learning_rate": 0.000983617747440273, + "loss": 6.8805, + "step": 144 + }, + { + "epoch": 0.04948805460750853, + "grad_norm": 6.7132673263549805, + "learning_rate": 0.0009835039817974972, + "loss": 6.5654, + "step": 145 + }, + { + "epoch": 0.049829351535836175, + "grad_norm": 3.8716394901275635, + "learning_rate": 0.0009833902161547213, + "loss": 6.9398, + "step": 146 + }, + { + "epoch": 0.050170648464163824, + "grad_norm": 3.562126874923706, + "learning_rate": 0.0009832764505119454, + "loss": 6.804, + "step": 147 + }, + { + "epoch": 0.050511945392491465, + "grad_norm": 3.6806352138519287, + "learning_rate": 0.0009831626848691695, + "loss": 7.0423, + "step": 148 + }, + { + "epoch": 0.050853242320819114, + "grad_norm": 5.191007614135742, + "learning_rate": 0.0009830489192263936, + "loss": 5.9842, + "step": 149 + }, + { + "epoch": 0.051194539249146756, + "grad_norm": 3.838003396987915, + "learning_rate": 0.0009829351535836177, + "loss": 7.0365, + "step": 150 + }, + { + "epoch": 0.051535836177474405, + "grad_norm": 4.209732532501221, + "learning_rate": 0.0009828213879408418, + "loss": 6.7704, + "step": 151 + }, + { + "epoch": 0.05187713310580205, + "grad_norm": 3.973942995071411, + "learning_rate": 0.000982707622298066, + "loss": 6.5295, + "step": 152 + }, + { + "epoch": 0.052218430034129695, + "grad_norm": 3.3907649517059326, + "learning_rate": 0.00098259385665529, + "loss": 7.0298, + "step": 153 + }, + { + "epoch": 0.05255972696245734, + "grad_norm": 3.6388776302337646, + "learning_rate": 0.0009824800910125141, + "loss": 6.7746, + "step": 154 + }, + { + "epoch": 0.052901023890784986, + "grad_norm": 3.418466091156006, + "learning_rate": 0.0009823663253697385, + "loss": 6.9274, + "step": 155 + }, + { + "epoch": 0.05324232081911263, + "grad_norm": 4.016181945800781, + "learning_rate": 0.0009822525597269626, + "loss": 6.647, + "step": 156 + }, + { + "epoch": 0.053583617747440276, + "grad_norm": 3.4281997680664062, + "learning_rate": 0.0009821387940841867, + "loss": 6.594, + "step": 157 + }, + { + "epoch": 0.05392491467576792, + "grad_norm": 3.6327078342437744, + "learning_rate": 0.0009820250284414108, + "loss": 6.8371, + "step": 158 + }, + { + "epoch": 0.05426621160409556, + "grad_norm": 3.937331438064575, + "learning_rate": 0.0009819112627986349, + "loss": 6.6783, + "step": 159 + }, + { + "epoch": 0.05460750853242321, + "grad_norm": 5.26201057434082, + "learning_rate": 0.000981797497155859, + "loss": 5.6406, + "step": 160 + }, + { + "epoch": 0.05494880546075085, + "grad_norm": 3.728435516357422, + "learning_rate": 0.000981683731513083, + "loss": 7.1284, + "step": 161 + }, + { + "epoch": 0.0552901023890785, + "grad_norm": 3.79921555519104, + "learning_rate": 0.0009815699658703072, + "loss": 6.7235, + "step": 162 + }, + { + "epoch": 0.05563139931740614, + "grad_norm": 3.5397472381591797, + "learning_rate": 0.0009814562002275313, + "loss": 6.6777, + "step": 163 + }, + { + "epoch": 0.05597269624573379, + "grad_norm": 3.5530202388763428, + "learning_rate": 0.0009813424345847554, + "loss": 6.4635, + "step": 164 + }, + { + "epoch": 0.05631399317406143, + "grad_norm": 4.582598686218262, + "learning_rate": 0.0009812286689419795, + "loss": 6.3199, + "step": 165 + }, + { + "epoch": 0.05665529010238908, + "grad_norm": 4.460880279541016, + "learning_rate": 0.0009811149032992036, + "loss": 5.5054, + "step": 166 + }, + { + "epoch": 0.05699658703071672, + "grad_norm": 3.812800168991089, + "learning_rate": 0.0009810011376564277, + "loss": 6.8298, + "step": 167 + }, + { + "epoch": 0.05733788395904437, + "grad_norm": 3.747919797897339, + "learning_rate": 0.0009808873720136518, + "loss": 6.7751, + "step": 168 + }, + { + "epoch": 0.05767918088737201, + "grad_norm": 6.039458274841309, + "learning_rate": 0.000980773606370876, + "loss": 5.7825, + "step": 169 + }, + { + "epoch": 0.05802047781569966, + "grad_norm": 3.8591084480285645, + "learning_rate": 0.0009806598407281, + "loss": 6.8238, + "step": 170 + }, + { + "epoch": 0.0583617747440273, + "grad_norm": 3.8271124362945557, + "learning_rate": 0.0009805460750853241, + "loss": 6.3032, + "step": 171 + }, + { + "epoch": 0.05870307167235495, + "grad_norm": 3.730949640274048, + "learning_rate": 0.0009804323094425485, + "loss": 6.9856, + "step": 172 + }, + { + "epoch": 0.059044368600682594, + "grad_norm": 5.848387718200684, + "learning_rate": 0.0009803185437997726, + "loss": 5.8503, + "step": 173 + }, + { + "epoch": 0.059385665529010236, + "grad_norm": 5.451254844665527, + "learning_rate": 0.0009802047781569967, + "loss": 6.6104, + "step": 174 + }, + { + "epoch": 0.059726962457337884, + "grad_norm": 4.356447219848633, + "learning_rate": 0.0009800910125142208, + "loss": 7.2519, + "step": 175 + }, + { + "epoch": 0.060068259385665526, + "grad_norm": 3.5853893756866455, + "learning_rate": 0.0009799772468714449, + "loss": 6.8436, + "step": 176 + }, + { + "epoch": 0.060409556313993175, + "grad_norm": 3.9135236740112305, + "learning_rate": 0.000979863481228669, + "loss": 6.7005, + "step": 177 + }, + { + "epoch": 0.06075085324232082, + "grad_norm": 3.59726881980896, + "learning_rate": 0.000979749715585893, + "loss": 6.747, + "step": 178 + }, + { + "epoch": 0.061092150170648465, + "grad_norm": 4.7263593673706055, + "learning_rate": 0.0009796359499431172, + "loss": 5.5305, + "step": 179 + }, + { + "epoch": 0.06143344709897611, + "grad_norm": 3.808228015899658, + "learning_rate": 0.0009795221843003413, + "loss": 7.0182, + "step": 180 + }, + { + "epoch": 0.061774744027303756, + "grad_norm": 3.7145042419433594, + "learning_rate": 0.0009794084186575654, + "loss": 6.8614, + "step": 181 + }, + { + "epoch": 0.0621160409556314, + "grad_norm": 3.560530424118042, + "learning_rate": 0.0009792946530147895, + "loss": 6.8768, + "step": 182 + }, + { + "epoch": 0.062457337883959047, + "grad_norm": 3.480214834213257, + "learning_rate": 0.0009791808873720136, + "loss": 6.9845, + "step": 183 + }, + { + "epoch": 0.06279863481228669, + "grad_norm": 3.4854085445404053, + "learning_rate": 0.000979067121729238, + "loss": 6.9272, + "step": 184 + }, + { + "epoch": 0.06313993174061433, + "grad_norm": 3.636730432510376, + "learning_rate": 0.0009789533560864618, + "loss": 6.84, + "step": 185 + }, + { + "epoch": 0.06348122866894199, + "grad_norm": 3.543924331665039, + "learning_rate": 0.000978839590443686, + "loss": 6.9787, + "step": 186 + }, + { + "epoch": 0.06382252559726963, + "grad_norm": 3.629248857498169, + "learning_rate": 0.00097872582480091, + "loss": 6.7263, + "step": 187 + }, + { + "epoch": 0.06416382252559727, + "grad_norm": 3.5820744037628174, + "learning_rate": 0.0009786120591581341, + "loss": 6.6454, + "step": 188 + }, + { + "epoch": 0.06450511945392491, + "grad_norm": 6.293400764465332, + "learning_rate": 0.0009784982935153585, + "loss": 4.73, + "step": 189 + }, + { + "epoch": 0.06484641638225255, + "grad_norm": 6.925587177276611, + "learning_rate": 0.0009783845278725826, + "loss": 4.6797, + "step": 190 + }, + { + "epoch": 0.06518771331058021, + "grad_norm": 4.270597457885742, + "learning_rate": 0.0009782707622298067, + "loss": 6.9059, + "step": 191 + }, + { + "epoch": 0.06552901023890785, + "grad_norm": 3.773348808288574, + "learning_rate": 0.0009781569965870308, + "loss": 7.0965, + "step": 192 + }, + { + "epoch": 0.06587030716723549, + "grad_norm": 3.767024517059326, + "learning_rate": 0.0009780432309442549, + "loss": 6.7144, + "step": 193 + }, + { + "epoch": 0.06621160409556313, + "grad_norm": 3.9505908489227295, + "learning_rate": 0.000977929465301479, + "loss": 6.8606, + "step": 194 + }, + { + "epoch": 0.06655290102389079, + "grad_norm": 3.617802381515503, + "learning_rate": 0.000977815699658703, + "loss": 6.7616, + "step": 195 + }, + { + "epoch": 0.06689419795221843, + "grad_norm": 3.433324098587036, + "learning_rate": 0.0009777019340159272, + "loss": 6.7047, + "step": 196 + }, + { + "epoch": 0.06723549488054607, + "grad_norm": 3.5975823402404785, + "learning_rate": 0.0009775881683731513, + "loss": 6.5858, + "step": 197 + }, + { + "epoch": 0.06757679180887372, + "grad_norm": 3.548548936843872, + "learning_rate": 0.0009774744027303754, + "loss": 7.1213, + "step": 198 + }, + { + "epoch": 0.06791808873720137, + "grad_norm": 3.8470911979675293, + "learning_rate": 0.0009773606370875995, + "loss": 6.3278, + "step": 199 + }, + { + "epoch": 0.06825938566552901, + "grad_norm": 3.527210235595703, + "learning_rate": 0.0009772468714448236, + "loss": 6.8947, + "step": 200 + }, + { + "epoch": 0.06860068259385665, + "grad_norm": 3.4403419494628906, + "learning_rate": 0.000977133105802048, + "loss": 6.738, + "step": 201 + }, + { + "epoch": 0.0689419795221843, + "grad_norm": 3.733076810836792, + "learning_rate": 0.000977019340159272, + "loss": 6.9729, + "step": 202 + }, + { + "epoch": 0.06928327645051195, + "grad_norm": 3.4491777420043945, + "learning_rate": 0.0009769055745164961, + "loss": 7.0452, + "step": 203 + }, + { + "epoch": 0.0696245733788396, + "grad_norm": 12.765624046325684, + "learning_rate": 0.0009767918088737202, + "loss": 6.6967, + "step": 204 + }, + { + "epoch": 0.06996587030716724, + "grad_norm": 4.026791095733643, + "learning_rate": 0.0009766780432309441, + "loss": 6.9201, + "step": 205 + }, + { + "epoch": 0.07030716723549488, + "grad_norm": 6.290328502655029, + "learning_rate": 0.0009765642775881682, + "loss": 6.3519, + "step": 206 + }, + { + "epoch": 0.07064846416382252, + "grad_norm": 3.867109775543213, + "learning_rate": 0.0009764505119453925, + "loss": 7.309, + "step": 207 + }, + { + "epoch": 0.07098976109215017, + "grad_norm": 3.9246108531951904, + "learning_rate": 0.0009763367463026166, + "loss": 6.7805, + "step": 208 + }, + { + "epoch": 0.07133105802047782, + "grad_norm": 3.378211498260498, + "learning_rate": 0.0009762229806598408, + "loss": 7.007, + "step": 209 + }, + { + "epoch": 0.07167235494880546, + "grad_norm": 3.3665435314178467, + "learning_rate": 0.0009761092150170649, + "loss": 6.7892, + "step": 210 + }, + { + "epoch": 0.0720136518771331, + "grad_norm": 3.457585096359253, + "learning_rate": 0.000975995449374289, + "loss": 6.8816, + "step": 211 + }, + { + "epoch": 0.07235494880546076, + "grad_norm": 3.859353542327881, + "learning_rate": 0.0009758816837315131, + "loss": 5.9202, + "step": 212 + }, + { + "epoch": 0.0726962457337884, + "grad_norm": 5.0818190574646, + "learning_rate": 0.0009757679180887372, + "loss": 6.0906, + "step": 213 + }, + { + "epoch": 0.07303754266211604, + "grad_norm": 3.940415620803833, + "learning_rate": 0.0009756541524459613, + "loss": 7.1606, + "step": 214 + }, + { + "epoch": 0.07337883959044368, + "grad_norm": 9.891190528869629, + "learning_rate": 0.0009755403868031855, + "loss": 4.2678, + "step": 215 + }, + { + "epoch": 0.07372013651877134, + "grad_norm": 4.019010543823242, + "learning_rate": 0.0009754266211604096, + "loss": 6.3178, + "step": 216 + }, + { + "epoch": 0.07406143344709898, + "grad_norm": 3.7409043312072754, + "learning_rate": 0.0009753128555176337, + "loss": 6.9846, + "step": 217 + }, + { + "epoch": 0.07440273037542662, + "grad_norm": 3.7472164630889893, + "learning_rate": 0.0009751990898748578, + "loss": 6.3952, + "step": 218 + }, + { + "epoch": 0.07474402730375426, + "grad_norm": 3.981301784515381, + "learning_rate": 0.0009750853242320819, + "loss": 6.888, + "step": 219 + }, + { + "epoch": 0.07508532423208192, + "grad_norm": 7.142719268798828, + "learning_rate": 0.000974971558589306, + "loss": 6.4133, + "step": 220 + }, + { + "epoch": 0.07542662116040956, + "grad_norm": 3.712938070297241, + "learning_rate": 0.0009748577929465302, + "loss": 7.2119, + "step": 221 + }, + { + "epoch": 0.0757679180887372, + "grad_norm": 3.7851550579071045, + "learning_rate": 0.0009747440273037544, + "loss": 6.5805, + "step": 222 + }, + { + "epoch": 0.07610921501706484, + "grad_norm": 3.5392749309539795, + "learning_rate": 0.0009746302616609785, + "loss": 6.6098, + "step": 223 + }, + { + "epoch": 0.07645051194539249, + "grad_norm": 4.032394886016846, + "learning_rate": 0.0009745164960182025, + "loss": 6.7128, + "step": 224 + }, + { + "epoch": 0.07679180887372014, + "grad_norm": 3.6618125438690186, + "learning_rate": 0.0009744027303754266, + "loss": 6.6606, + "step": 225 + }, + { + "epoch": 0.07713310580204778, + "grad_norm": 3.458958148956299, + "learning_rate": 0.0009742889647326507, + "loss": 6.8682, + "step": 226 + }, + { + "epoch": 0.07747440273037542, + "grad_norm": 4.641584396362305, + "learning_rate": 0.0009741751990898749, + "loss": 6.1076, + "step": 227 + }, + { + "epoch": 0.07781569965870307, + "grad_norm": 3.7606310844421387, + "learning_rate": 0.000974061433447099, + "loss": 6.9889, + "step": 228 + }, + { + "epoch": 0.07815699658703072, + "grad_norm": 3.448821783065796, + "learning_rate": 0.0009739476678043231, + "loss": 6.9313, + "step": 229 + }, + { + "epoch": 0.07849829351535836, + "grad_norm": 3.49123477935791, + "learning_rate": 0.0009738339021615472, + "loss": 6.9526, + "step": 230 + }, + { + "epoch": 0.078839590443686, + "grad_norm": 3.5688812732696533, + "learning_rate": 0.0009737201365187713, + "loss": 6.5122, + "step": 231 + }, + { + "epoch": 0.07918088737201365, + "grad_norm": 3.416281223297119, + "learning_rate": 0.0009736063708759955, + "loss": 6.7054, + "step": 232 + }, + { + "epoch": 0.0795221843003413, + "grad_norm": 3.3496880531311035, + "learning_rate": 0.0009734926052332196, + "loss": 6.6948, + "step": 233 + }, + { + "epoch": 0.07986348122866894, + "grad_norm": 3.9867589473724365, + "learning_rate": 0.0009733788395904437, + "loss": 6.8821, + "step": 234 + }, + { + "epoch": 0.08020477815699659, + "grad_norm": 3.6225593090057373, + "learning_rate": 0.0009732650739476678, + "loss": 6.706, + "step": 235 + }, + { + "epoch": 0.08054607508532423, + "grad_norm": 3.7255892753601074, + "learning_rate": 0.0009731513083048919, + "loss": 7.0775, + "step": 236 + }, + { + "epoch": 0.08088737201365187, + "grad_norm": 8.664634704589844, + "learning_rate": 0.000973037542662116, + "loss": 5.4271, + "step": 237 + }, + { + "epoch": 0.08122866894197953, + "grad_norm": 4.9100117683410645, + "learning_rate": 0.0009729237770193402, + "loss": 6.4401, + "step": 238 + }, + { + "epoch": 0.08156996587030717, + "grad_norm": 3.7028720378875732, + "learning_rate": 0.0009728100113765644, + "loss": 6.3939, + "step": 239 + }, + { + "epoch": 0.08191126279863481, + "grad_norm": 3.713730573654175, + "learning_rate": 0.0009726962457337885, + "loss": 6.8395, + "step": 240 + }, + { + "epoch": 0.08225255972696245, + "grad_norm": 3.7303857803344727, + "learning_rate": 0.0009725824800910126, + "loss": 6.5482, + "step": 241 + }, + { + "epoch": 0.08259385665529011, + "grad_norm": 3.4762961864471436, + "learning_rate": 0.0009724687144482367, + "loss": 7.0664, + "step": 242 + }, + { + "epoch": 0.08293515358361775, + "grad_norm": 3.4974660873413086, + "learning_rate": 0.0009723549488054608, + "loss": 6.7011, + "step": 243 + }, + { + "epoch": 0.08327645051194539, + "grad_norm": 3.515742540359497, + "learning_rate": 0.0009722411831626849, + "loss": 7.2582, + "step": 244 + }, + { + "epoch": 0.08361774744027303, + "grad_norm": 3.654632806777954, + "learning_rate": 0.000972127417519909, + "loss": 6.6722, + "step": 245 + }, + { + "epoch": 0.08395904436860069, + "grad_norm": 3.580826759338379, + "learning_rate": 0.0009720136518771331, + "loss": 6.6658, + "step": 246 + }, + { + "epoch": 0.08430034129692833, + "grad_norm": 3.44158673286438, + "learning_rate": 0.0009718998862343572, + "loss": 6.6069, + "step": 247 + }, + { + "epoch": 0.08464163822525597, + "grad_norm": 3.856389045715332, + "learning_rate": 0.0009717861205915813, + "loss": 6.7927, + "step": 248 + }, + { + "epoch": 0.08498293515358361, + "grad_norm": 3.6192235946655273, + "learning_rate": 0.0009716723549488055, + "loss": 6.9655, + "step": 249 + }, + { + "epoch": 0.08532423208191127, + "grad_norm": 3.7248499393463135, + "learning_rate": 0.0009715585893060296, + "loss": 7.0733, + "step": 250 + }, + { + "epoch": 0.08566552901023891, + "grad_norm": 3.485734701156616, + "learning_rate": 0.0009714448236632537, + "loss": 6.7725, + "step": 251 + }, + { + "epoch": 0.08600682593856655, + "grad_norm": 3.442128896713257, + "learning_rate": 0.0009713310580204778, + "loss": 6.8307, + "step": 252 + }, + { + "epoch": 0.0863481228668942, + "grad_norm": 4.0273308753967285, + "learning_rate": 0.0009712172923777019, + "loss": 6.2919, + "step": 253 + }, + { + "epoch": 0.08668941979522184, + "grad_norm": 3.7515664100646973, + "learning_rate": 0.000971103526734926, + "loss": 6.7646, + "step": 254 + }, + { + "epoch": 0.08703071672354949, + "grad_norm": 3.6255035400390625, + "learning_rate": 0.0009709897610921503, + "loss": 7.1922, + "step": 255 + }, + { + "epoch": 0.08737201365187713, + "grad_norm": 3.9632773399353027, + "learning_rate": 0.0009708759954493744, + "loss": 6.6199, + "step": 256 + }, + { + "epoch": 0.08771331058020478, + "grad_norm": 3.5362298488616943, + "learning_rate": 0.0009707622298065985, + "loss": 6.9201, + "step": 257 + }, + { + "epoch": 0.08805460750853242, + "grad_norm": 3.4530086517333984, + "learning_rate": 0.0009706484641638226, + "loss": 6.8544, + "step": 258 + }, + { + "epoch": 0.08839590443686007, + "grad_norm": 3.586812973022461, + "learning_rate": 0.0009705346985210467, + "loss": 6.6807, + "step": 259 + }, + { + "epoch": 0.08873720136518772, + "grad_norm": 3.553516387939453, + "learning_rate": 0.0009704209328782708, + "loss": 7.0369, + "step": 260 + }, + { + "epoch": 0.08907849829351536, + "grad_norm": 3.6874141693115234, + "learning_rate": 0.000970307167235495, + "loss": 6.7392, + "step": 261 + }, + { + "epoch": 0.089419795221843, + "grad_norm": 3.5124759674072266, + "learning_rate": 0.0009701934015927191, + "loss": 6.6912, + "step": 262 + }, + { + "epoch": 0.08976109215017065, + "grad_norm": 3.7072601318359375, + "learning_rate": 0.0009700796359499431, + "loss": 6.5064, + "step": 263 + }, + { + "epoch": 0.0901023890784983, + "grad_norm": 4.079402446746826, + "learning_rate": 0.0009699658703071672, + "loss": 6.487, + "step": 264 + }, + { + "epoch": 0.09044368600682594, + "grad_norm": 3.6425037384033203, + "learning_rate": 0.0009698521046643913, + "loss": 6.7205, + "step": 265 + }, + { + "epoch": 0.09078498293515358, + "grad_norm": 3.5927422046661377, + "learning_rate": 0.0009697383390216154, + "loss": 6.6607, + "step": 266 + }, + { + "epoch": 0.09112627986348124, + "grad_norm": 3.606675624847412, + "learning_rate": 0.0009696245733788396, + "loss": 7.0384, + "step": 267 + }, + { + "epoch": 0.09146757679180888, + "grad_norm": 4.241934776306152, + "learning_rate": 0.0009695108077360637, + "loss": 6.8748, + "step": 268 + }, + { + "epoch": 0.09180887372013652, + "grad_norm": 3.801959276199341, + "learning_rate": 0.0009693970420932878, + "loss": 7.1313, + "step": 269 + }, + { + "epoch": 0.09215017064846416, + "grad_norm": 3.597132921218872, + "learning_rate": 0.0009692832764505119, + "loss": 6.954, + "step": 270 + }, + { + "epoch": 0.0924914675767918, + "grad_norm": 3.9115519523620605, + "learning_rate": 0.000969169510807736, + "loss": 6.5418, + "step": 271 + }, + { + "epoch": 0.09283276450511946, + "grad_norm": 4.029146671295166, + "learning_rate": 0.0009690557451649603, + "loss": 6.626, + "step": 272 + }, + { + "epoch": 0.0931740614334471, + "grad_norm": 3.7334775924682617, + "learning_rate": 0.0009689419795221844, + "loss": 6.3663, + "step": 273 + }, + { + "epoch": 0.09351535836177474, + "grad_norm": 3.6482112407684326, + "learning_rate": 0.0009688282138794085, + "loss": 6.9092, + "step": 274 + }, + { + "epoch": 0.09385665529010238, + "grad_norm": 8.627721786499023, + "learning_rate": 0.0009687144482366326, + "loss": 5.9086, + "step": 275 + }, + { + "epoch": 0.09419795221843004, + "grad_norm": 3.9426357746124268, + "learning_rate": 0.0009686006825938567, + "loss": 7.169, + "step": 276 + }, + { + "epoch": 0.09453924914675768, + "grad_norm": 4.135240077972412, + "learning_rate": 0.0009684869169510808, + "loss": 6.7811, + "step": 277 + }, + { + "epoch": 0.09488054607508532, + "grad_norm": 4.837364673614502, + "learning_rate": 0.000968373151308305, + "loss": 6.8342, + "step": 278 + }, + { + "epoch": 0.09522184300341296, + "grad_norm": 3.7553555965423584, + "learning_rate": 0.0009682593856655291, + "loss": 6.4767, + "step": 279 + }, + { + "epoch": 0.09556313993174062, + "grad_norm": 3.947141647338867, + "learning_rate": 0.0009681456200227532, + "loss": 6.2992, + "step": 280 + }, + { + "epoch": 0.09590443686006826, + "grad_norm": 4.0404486656188965, + "learning_rate": 0.0009680318543799773, + "loss": 6.9428, + "step": 281 + }, + { + "epoch": 0.0962457337883959, + "grad_norm": 3.603900194168091, + "learning_rate": 0.0009679180887372013, + "loss": 7.2719, + "step": 282 + }, + { + "epoch": 0.09658703071672355, + "grad_norm": 5.454912185668945, + "learning_rate": 0.0009678043230944254, + "loss": 6.1625, + "step": 283 + }, + { + "epoch": 0.09692832764505119, + "grad_norm": 4.6887736320495605, + "learning_rate": 0.0009676905574516496, + "loss": 4.2823, + "step": 284 + }, + { + "epoch": 0.09726962457337884, + "grad_norm": 4.27936315536499, + "learning_rate": 0.0009675767918088737, + "loss": 6.8959, + "step": 285 + }, + { + "epoch": 0.09761092150170649, + "grad_norm": 3.739509105682373, + "learning_rate": 0.0009674630261660978, + "loss": 7.2816, + "step": 286 + }, + { + "epoch": 0.09795221843003413, + "grad_norm": 3.4555413722991943, + "learning_rate": 0.0009673492605233219, + "loss": 7.0285, + "step": 287 + }, + { + "epoch": 0.09829351535836177, + "grad_norm": 3.6354432106018066, + "learning_rate": 0.000967235494880546, + "loss": 6.5368, + "step": 288 + }, + { + "epoch": 0.09863481228668942, + "grad_norm": 3.558715343475342, + "learning_rate": 0.0009671217292377701, + "loss": 7.1075, + "step": 289 + }, + { + "epoch": 0.09897610921501707, + "grad_norm": 9.740402221679688, + "learning_rate": 0.0009670079635949944, + "loss": 6.4126, + "step": 290 + }, + { + "epoch": 0.09931740614334471, + "grad_norm": 3.963900089263916, + "learning_rate": 0.0009668941979522185, + "loss": 6.8788, + "step": 291 + }, + { + "epoch": 0.09965870307167235, + "grad_norm": 3.8590564727783203, + "learning_rate": 0.0009667804323094426, + "loss": 7.1995, + "step": 292 + }, + { + "epoch": 0.1, + "grad_norm": 3.5563805103302, + "learning_rate": 0.0009666666666666667, + "loss": 7.3654, + "step": 293 + }, + { + "epoch": 0.10034129692832765, + "grad_norm": 3.6024012565612793, + "learning_rate": 0.0009665529010238908, + "loss": 6.7006, + "step": 294 + }, + { + "epoch": 0.10068259385665529, + "grad_norm": 3.463623523712158, + "learning_rate": 0.000966439135381115, + "loss": 6.8185, + "step": 295 + }, + { + "epoch": 0.10102389078498293, + "grad_norm": 5.106781005859375, + "learning_rate": 0.0009663253697383391, + "loss": 5.5867, + "step": 296 + }, + { + "epoch": 0.10136518771331059, + "grad_norm": 3.8914577960968018, + "learning_rate": 0.0009662116040955632, + "loss": 7.6307, + "step": 297 + }, + { + "epoch": 0.10170648464163823, + "grad_norm": 4.023569583892822, + "learning_rate": 0.0009660978384527873, + "loss": 6.4419, + "step": 298 + }, + { + "epoch": 0.10204778156996587, + "grad_norm": 4.755953788757324, + "learning_rate": 0.0009659840728100114, + "loss": 6.7173, + "step": 299 + }, + { + "epoch": 0.10238907849829351, + "grad_norm": 3.9473185539245605, + "learning_rate": 0.0009658703071672355, + "loss": 4.9165, + "step": 300 + }, + { + "epoch": 0.10273037542662115, + "grad_norm": 4.2228684425354, + "learning_rate": 0.0009657565415244597, + "loss": 6.7526, + "step": 301 + }, + { + "epoch": 0.10307167235494881, + "grad_norm": 3.5894923210144043, + "learning_rate": 0.0009656427758816837, + "loss": 7.1576, + "step": 302 + }, + { + "epoch": 0.10341296928327645, + "grad_norm": 4.090893745422363, + "learning_rate": 0.0009655290102389078, + "loss": 6.4097, + "step": 303 + }, + { + "epoch": 0.1037542662116041, + "grad_norm": 3.7067759037017822, + "learning_rate": 0.0009654152445961319, + "loss": 7.3378, + "step": 304 + }, + { + "epoch": 0.10409556313993173, + "grad_norm": 3.431490182876587, + "learning_rate": 0.000965301478953356, + "loss": 7.1981, + "step": 305 + }, + { + "epoch": 0.10443686006825939, + "grad_norm": 3.6670467853546143, + "learning_rate": 0.0009651877133105801, + "loss": 6.5654, + "step": 306 + }, + { + "epoch": 0.10477815699658703, + "grad_norm": 3.554365634918213, + "learning_rate": 0.0009650739476678044, + "loss": 6.481, + "step": 307 + }, + { + "epoch": 0.10511945392491467, + "grad_norm": 3.5014779567718506, + "learning_rate": 0.0009649601820250285, + "loss": 6.8385, + "step": 308 + }, + { + "epoch": 0.10546075085324232, + "grad_norm": 10.873976707458496, + "learning_rate": 0.0009648464163822526, + "loss": 6.5274, + "step": 309 + }, + { + "epoch": 0.10580204778156997, + "grad_norm": 3.9372551441192627, + "learning_rate": 0.0009647326507394767, + "loss": 7.2976, + "step": 310 + }, + { + "epoch": 0.10614334470989761, + "grad_norm": 3.7556705474853516, + "learning_rate": 0.0009646188850967008, + "loss": 7.2823, + "step": 311 + }, + { + "epoch": 0.10648464163822526, + "grad_norm": 5.166213512420654, + "learning_rate": 0.000964505119453925, + "loss": 6.1935, + "step": 312 + }, + { + "epoch": 0.1068259385665529, + "grad_norm": 4.041624546051025, + "learning_rate": 0.0009643913538111491, + "loss": 7.1861, + "step": 313 + }, + { + "epoch": 0.10716723549488055, + "grad_norm": 4.318609714508057, + "learning_rate": 0.0009642775881683732, + "loss": 6.4683, + "step": 314 + }, + { + "epoch": 0.1075085324232082, + "grad_norm": 3.769338607788086, + "learning_rate": 0.0009641638225255973, + "loss": 6.7583, + "step": 315 + }, + { + "epoch": 0.10784982935153584, + "grad_norm": 3.6845760345458984, + "learning_rate": 0.0009640500568828214, + "loss": 6.8184, + "step": 316 + }, + { + "epoch": 0.10819112627986348, + "grad_norm": 3.5176687240600586, + "learning_rate": 0.0009639362912400455, + "loss": 6.7214, + "step": 317 + }, + { + "epoch": 0.10853242320819112, + "grad_norm": 3.728588581085205, + "learning_rate": 0.0009638225255972697, + "loss": 7.0418, + "step": 318 + }, + { + "epoch": 0.10887372013651878, + "grad_norm": 3.9855856895446777, + "learning_rate": 0.0009637087599544938, + "loss": 6.6959, + "step": 319 + }, + { + "epoch": 0.10921501706484642, + "grad_norm": 4.362178802490234, + "learning_rate": 0.0009635949943117179, + "loss": 6.0196, + "step": 320 + }, + { + "epoch": 0.10955631399317406, + "grad_norm": 5.292174816131592, + "learning_rate": 0.0009634812286689419, + "loss": 5.8822, + "step": 321 + }, + { + "epoch": 0.1098976109215017, + "grad_norm": 4.121539115905762, + "learning_rate": 0.000963367463026166, + "loss": 7.0859, + "step": 322 + }, + { + "epoch": 0.11023890784982936, + "grad_norm": 4.680109024047852, + "learning_rate": 0.0009632536973833901, + "loss": 6.4441, + "step": 323 + }, + { + "epoch": 0.110580204778157, + "grad_norm": 3.6892054080963135, + "learning_rate": 0.0009631399317406144, + "loss": 7.0459, + "step": 324 + }, + { + "epoch": 0.11092150170648464, + "grad_norm": 3.639540672302246, + "learning_rate": 0.0009630261660978385, + "loss": 7.1954, + "step": 325 + }, + { + "epoch": 0.11126279863481228, + "grad_norm": 3.4080095291137695, + "learning_rate": 0.0009629124004550626, + "loss": 7.0554, + "step": 326 + }, + { + "epoch": 0.11160409556313994, + "grad_norm": 3.4967737197875977, + "learning_rate": 0.0009627986348122867, + "loss": 7.17, + "step": 327 + }, + { + "epoch": 0.11194539249146758, + "grad_norm": 3.8794071674346924, + "learning_rate": 0.0009626848691695108, + "loss": 6.759, + "step": 328 + }, + { + "epoch": 0.11228668941979522, + "grad_norm": 3.408348321914673, + "learning_rate": 0.0009625711035267349, + "loss": 6.7917, + "step": 329 + }, + { + "epoch": 0.11262798634812286, + "grad_norm": 3.614760637283325, + "learning_rate": 0.0009624573378839591, + "loss": 6.6399, + "step": 330 + }, + { + "epoch": 0.1129692832764505, + "grad_norm": 3.6130752563476562, + "learning_rate": 0.0009623435722411832, + "loss": 6.8108, + "step": 331 + }, + { + "epoch": 0.11331058020477816, + "grad_norm": 3.7819409370422363, + "learning_rate": 0.0009622298065984073, + "loss": 7.3848, + "step": 332 + }, + { + "epoch": 0.1136518771331058, + "grad_norm": 3.626450300216675, + "learning_rate": 0.0009621160409556314, + "loss": 7.0753, + "step": 333 + }, + { + "epoch": 0.11399317406143344, + "grad_norm": 3.5669236183166504, + "learning_rate": 0.0009620022753128555, + "loss": 6.5975, + "step": 334 + }, + { + "epoch": 0.11433447098976109, + "grad_norm": 3.4628782272338867, + "learning_rate": 0.0009618885096700797, + "loss": 6.7684, + "step": 335 + }, + { + "epoch": 0.11467576791808874, + "grad_norm": 7.344601154327393, + "learning_rate": 0.0009617747440273038, + "loss": 6.0625, + "step": 336 + }, + { + "epoch": 0.11501706484641638, + "grad_norm": 3.9138948917388916, + "learning_rate": 0.0009616609783845279, + "loss": 6.9233, + "step": 337 + }, + { + "epoch": 0.11535836177474403, + "grad_norm": 3.7263455390930176, + "learning_rate": 0.000961547212741752, + "loss": 7.1069, + "step": 338 + }, + { + "epoch": 0.11569965870307167, + "grad_norm": 3.638993740081787, + "learning_rate": 0.0009614334470989762, + "loss": 6.5446, + "step": 339 + }, + { + "epoch": 0.11604095563139932, + "grad_norm": 3.507223129272461, + "learning_rate": 0.0009613196814562003, + "loss": 6.8566, + "step": 340 + }, + { + "epoch": 0.11638225255972696, + "grad_norm": 3.5542092323303223, + "learning_rate": 0.0009612059158134244, + "loss": 7.013, + "step": 341 + }, + { + "epoch": 0.1167235494880546, + "grad_norm": 3.579059600830078, + "learning_rate": 0.0009610921501706485, + "loss": 7.4816, + "step": 342 + }, + { + "epoch": 0.11706484641638225, + "grad_norm": 3.462669849395752, + "learning_rate": 0.0009609783845278726, + "loss": 6.7983, + "step": 343 + }, + { + "epoch": 0.1174061433447099, + "grad_norm": 3.597429037094116, + "learning_rate": 0.0009608646188850967, + "loss": 6.9345, + "step": 344 + }, + { + "epoch": 0.11774744027303755, + "grad_norm": 15.800237655639648, + "learning_rate": 0.0009607508532423208, + "loss": 6.5234, + "step": 345 + }, + { + "epoch": 0.11808873720136519, + "grad_norm": 4.462514400482178, + "learning_rate": 0.0009606370875995449, + "loss": 7.2368, + "step": 346 + }, + { + "epoch": 0.11843003412969283, + "grad_norm": 4.498167514801025, + "learning_rate": 0.0009605233219567691, + "loss": 6.6885, + "step": 347 + }, + { + "epoch": 0.11877133105802047, + "grad_norm": 4.717837810516357, + "learning_rate": 0.0009604095563139932, + "loss": 5.435, + "step": 348 + }, + { + "epoch": 0.11911262798634813, + "grad_norm": 3.9620487689971924, + "learning_rate": 0.0009602957906712173, + "loss": 7.6661, + "step": 349 + }, + { + "epoch": 0.11945392491467577, + "grad_norm": 3.7153236865997314, + "learning_rate": 0.0009601820250284414, + "loss": 6.6377, + "step": 350 + }, + { + "epoch": 0.11979522184300341, + "grad_norm": 4.2678632736206055, + "learning_rate": 0.0009600682593856655, + "loss": 6.5929, + "step": 351 + }, + { + "epoch": 0.12013651877133105, + "grad_norm": 3.488931894302368, + "learning_rate": 0.0009599544937428897, + "loss": 6.7701, + "step": 352 + }, + { + "epoch": 0.12047781569965871, + "grad_norm": 3.7950785160064697, + "learning_rate": 0.0009598407281001138, + "loss": 6.3283, + "step": 353 + }, + { + "epoch": 0.12081911262798635, + "grad_norm": 3.611812114715576, + "learning_rate": 0.0009597269624573379, + "loss": 7.0418, + "step": 354 + }, + { + "epoch": 0.12116040955631399, + "grad_norm": 3.4913434982299805, + "learning_rate": 0.000959613196814562, + "loss": 7.1764, + "step": 355 + }, + { + "epoch": 0.12150170648464163, + "grad_norm": 3.7057766914367676, + "learning_rate": 0.0009594994311717862, + "loss": 6.6347, + "step": 356 + }, + { + "epoch": 0.12184300341296929, + "grad_norm": 3.8991594314575195, + "learning_rate": 0.0009593856655290103, + "loss": 6.7696, + "step": 357 + }, + { + "epoch": 0.12218430034129693, + "grad_norm": 3.863154172897339, + "learning_rate": 0.0009592718998862345, + "loss": 7.317, + "step": 358 + }, + { + "epoch": 0.12252559726962457, + "grad_norm": 3.521376848220825, + "learning_rate": 0.0009591581342434586, + "loss": 7.5248, + "step": 359 + }, + { + "epoch": 0.12286689419795221, + "grad_norm": 3.6891329288482666, + "learning_rate": 0.0009590443686006826, + "loss": 6.9287, + "step": 360 + }, + { + "epoch": 0.12320819112627987, + "grad_norm": 3.7071328163146973, + "learning_rate": 0.0009589306029579067, + "loss": 7.086, + "step": 361 + }, + { + "epoch": 0.12354948805460751, + "grad_norm": 5.102900981903076, + "learning_rate": 0.0009588168373151308, + "loss": 6.361, + "step": 362 + }, + { + "epoch": 0.12389078498293515, + "grad_norm": 4.640048503875732, + "learning_rate": 0.0009587030716723549, + "loss": 5.9508, + "step": 363 + }, + { + "epoch": 0.1242320819112628, + "grad_norm": 7.667442321777344, + "learning_rate": 0.0009585893060295791, + "loss": 7.4653, + "step": 364 + }, + { + "epoch": 0.12457337883959044, + "grad_norm": 4.129282474517822, + "learning_rate": 0.0009584755403868032, + "loss": 7.1397, + "step": 365 + }, + { + "epoch": 0.12491467576791809, + "grad_norm": 3.7277729511260986, + "learning_rate": 0.0009583617747440273, + "loss": 6.2993, + "step": 366 + }, + { + "epoch": 0.12525597269624572, + "grad_norm": 3.472531795501709, + "learning_rate": 0.0009582480091012514, + "loss": 6.7378, + "step": 367 + }, + { + "epoch": 0.12559726962457338, + "grad_norm": 4.953517913818359, + "learning_rate": 0.0009581342434584755, + "loss": 6.3897, + "step": 368 + }, + { + "epoch": 0.12593856655290103, + "grad_norm": 3.745403289794922, + "learning_rate": 0.0009580204778156996, + "loss": 6.8016, + "step": 369 + }, + { + "epoch": 0.12627986348122866, + "grad_norm": 3.9581098556518555, + "learning_rate": 0.0009579067121729238, + "loss": 6.9358, + "step": 370 + }, + { + "epoch": 0.12662116040955632, + "grad_norm": 3.588125467300415, + "learning_rate": 0.0009577929465301479, + "loss": 6.5954, + "step": 371 + }, + { + "epoch": 0.12696245733788397, + "grad_norm": 3.592097282409668, + "learning_rate": 0.000957679180887372, + "loss": 7.2236, + "step": 372 + }, + { + "epoch": 0.1273037542662116, + "grad_norm": 3.409364700317383, + "learning_rate": 0.0009575654152445962, + "loss": 7.0846, + "step": 373 + }, + { + "epoch": 0.12764505119453926, + "grad_norm": 4.143648147583008, + "learning_rate": 0.0009574516496018203, + "loss": 6.6434, + "step": 374 + }, + { + "epoch": 0.12798634812286688, + "grad_norm": 4.170565128326416, + "learning_rate": 0.0009573378839590445, + "loss": 6.5485, + "step": 375 + }, + { + "epoch": 0.12832764505119454, + "grad_norm": 3.7150259017944336, + "learning_rate": 0.0009572241183162686, + "loss": 6.7944, + "step": 376 + }, + { + "epoch": 0.1286689419795222, + "grad_norm": 9.187422752380371, + "learning_rate": 0.0009571103526734927, + "loss": 5.1412, + "step": 377 + }, + { + "epoch": 0.12901023890784982, + "grad_norm": 3.895047187805176, + "learning_rate": 0.0009569965870307168, + "loss": 6.4227, + "step": 378 + }, + { + "epoch": 0.12935153583617748, + "grad_norm": 3.7599642276763916, + "learning_rate": 0.0009568828213879409, + "loss": 6.9761, + "step": 379 + }, + { + "epoch": 0.1296928327645051, + "grad_norm": 3.6376614570617676, + "learning_rate": 0.0009567690557451649, + "loss": 7.0387, + "step": 380 + }, + { + "epoch": 0.13003412969283276, + "grad_norm": 3.673825263977051, + "learning_rate": 0.0009566552901023891, + "loss": 6.6699, + "step": 381 + }, + { + "epoch": 0.13037542662116042, + "grad_norm": 3.4586920738220215, + "learning_rate": 0.0009565415244596132, + "loss": 6.8109, + "step": 382 + }, + { + "epoch": 0.13071672354948805, + "grad_norm": 3.3592073917388916, + "learning_rate": 0.0009564277588168373, + "loss": 7.0395, + "step": 383 + }, + { + "epoch": 0.1310580204778157, + "grad_norm": 3.409188985824585, + "learning_rate": 0.0009563139931740614, + "loss": 6.7086, + "step": 384 + }, + { + "epoch": 0.13139931740614336, + "grad_norm": 3.571364164352417, + "learning_rate": 0.0009562002275312855, + "loss": 7.0613, + "step": 385 + }, + { + "epoch": 0.13174061433447098, + "grad_norm": 3.6313486099243164, + "learning_rate": 0.0009560864618885096, + "loss": 6.8737, + "step": 386 + }, + { + "epoch": 0.13208191126279864, + "grad_norm": 5.386178970336914, + "learning_rate": 0.0009559726962457338, + "loss": 6.8494, + "step": 387 + }, + { + "epoch": 0.13242320819112627, + "grad_norm": 3.736448287963867, + "learning_rate": 0.0009558589306029579, + "loss": 6.805, + "step": 388 + }, + { + "epoch": 0.13276450511945392, + "grad_norm": 4.280710697174072, + "learning_rate": 0.000955745164960182, + "loss": 6.5263, + "step": 389 + }, + { + "epoch": 0.13310580204778158, + "grad_norm": 3.7368850708007812, + "learning_rate": 0.0009556313993174062, + "loss": 6.8247, + "step": 390 + }, + { + "epoch": 0.1334470989761092, + "grad_norm": 3.579983711242676, + "learning_rate": 0.0009555176336746303, + "loss": 7.1728, + "step": 391 + }, + { + "epoch": 0.13378839590443686, + "grad_norm": 3.969280242919922, + "learning_rate": 0.0009554038680318545, + "loss": 6.2745, + "step": 392 + }, + { + "epoch": 0.13412969283276452, + "grad_norm": 3.6488418579101562, + "learning_rate": 0.0009552901023890786, + "loss": 7.0, + "step": 393 + }, + { + "epoch": 0.13447098976109215, + "grad_norm": 3.3817570209503174, + "learning_rate": 0.0009551763367463027, + "loss": 6.2211, + "step": 394 + }, + { + "epoch": 0.1348122866894198, + "grad_norm": 3.5612990856170654, + "learning_rate": 0.0009550625711035268, + "loss": 6.9262, + "step": 395 + }, + { + "epoch": 0.13515358361774743, + "grad_norm": 3.475904703140259, + "learning_rate": 0.0009549488054607509, + "loss": 7.1462, + "step": 396 + }, + { + "epoch": 0.13549488054607509, + "grad_norm": 3.9977428913116455, + "learning_rate": 0.000954835039817975, + "loss": 6.8524, + "step": 397 + }, + { + "epoch": 0.13583617747440274, + "grad_norm": 3.5686051845550537, + "learning_rate": 0.0009547212741751992, + "loss": 7.1196, + "step": 398 + }, + { + "epoch": 0.13617747440273037, + "grad_norm": 3.616010904312134, + "learning_rate": 0.0009546075085324232, + "loss": 6.8889, + "step": 399 + }, + { + "epoch": 0.13651877133105803, + "grad_norm": 3.5153353214263916, + "learning_rate": 0.0009544937428896473, + "loss": 7.0783, + "step": 400 + }, + { + "epoch": 0.13686006825938565, + "grad_norm": 3.46384596824646, + "learning_rate": 0.0009543799772468714, + "loss": 7.2027, + "step": 401 + }, + { + "epoch": 0.1372013651877133, + "grad_norm": 3.6632463932037354, + "learning_rate": 0.0009542662116040955, + "loss": 6.8579, + "step": 402 + }, + { + "epoch": 0.13754266211604096, + "grad_norm": 3.654392719268799, + "learning_rate": 0.0009541524459613196, + "loss": 6.9971, + "step": 403 + }, + { + "epoch": 0.1378839590443686, + "grad_norm": 3.7651548385620117, + "learning_rate": 0.0009540386803185438, + "loss": 6.9558, + "step": 404 + }, + { + "epoch": 0.13822525597269625, + "grad_norm": 3.590897560119629, + "learning_rate": 0.0009539249146757679, + "loss": 6.9441, + "step": 405 + }, + { + "epoch": 0.1385665529010239, + "grad_norm": 3.5330758094787598, + "learning_rate": 0.000953811149032992, + "loss": 7.3093, + "step": 406 + }, + { + "epoch": 0.13890784982935153, + "grad_norm": 3.9493212699890137, + "learning_rate": 0.0009536973833902162, + "loss": 6.1789, + "step": 407 + }, + { + "epoch": 0.1392491467576792, + "grad_norm": 3.6030216217041016, + "learning_rate": 0.0009535836177474403, + "loss": 6.7268, + "step": 408 + }, + { + "epoch": 0.13959044368600682, + "grad_norm": 3.7132041454315186, + "learning_rate": 0.0009534698521046644, + "loss": 6.8771, + "step": 409 + }, + { + "epoch": 0.13993174061433447, + "grad_norm": 3.7219643592834473, + "learning_rate": 0.0009533560864618886, + "loss": 6.7741, + "step": 410 + }, + { + "epoch": 0.14027303754266213, + "grad_norm": 3.658827066421509, + "learning_rate": 0.0009532423208191127, + "loss": 7.0821, + "step": 411 + }, + { + "epoch": 0.14061433447098975, + "grad_norm": 3.503436326980591, + "learning_rate": 0.0009531285551763368, + "loss": 7.0266, + "step": 412 + }, + { + "epoch": 0.1409556313993174, + "grad_norm": 3.4202098846435547, + "learning_rate": 0.0009530147895335609, + "loss": 6.7716, + "step": 413 + }, + { + "epoch": 0.14129692832764504, + "grad_norm": 3.497220993041992, + "learning_rate": 0.000952901023890785, + "loss": 7.1303, + "step": 414 + }, + { + "epoch": 0.1416382252559727, + "grad_norm": 3.7073585987091064, + "learning_rate": 0.0009527872582480092, + "loss": 6.9418, + "step": 415 + }, + { + "epoch": 0.14197952218430035, + "grad_norm": 3.451277256011963, + "learning_rate": 0.0009526734926052333, + "loss": 6.9495, + "step": 416 + }, + { + "epoch": 0.14232081911262798, + "grad_norm": 3.564136505126953, + "learning_rate": 0.0009525597269624574, + "loss": 6.8177, + "step": 417 + }, + { + "epoch": 0.14266211604095563, + "grad_norm": 4.293824672698975, + "learning_rate": 0.0009524459613196815, + "loss": 7.1537, + "step": 418 + }, + { + "epoch": 0.1430034129692833, + "grad_norm": 4.002580165863037, + "learning_rate": 0.0009523321956769055, + "loss": 7.0574, + "step": 419 + }, + { + "epoch": 0.14334470989761092, + "grad_norm": 3.631091833114624, + "learning_rate": 0.0009522184300341296, + "loss": 6.552, + "step": 420 + }, + { + "epoch": 0.14368600682593857, + "grad_norm": 3.5741636753082275, + "learning_rate": 0.0009521046643913538, + "loss": 6.7028, + "step": 421 + }, + { + "epoch": 0.1440273037542662, + "grad_norm": 3.8410189151763916, + "learning_rate": 0.0009519908987485779, + "loss": 7.0462, + "step": 422 + }, + { + "epoch": 0.14436860068259386, + "grad_norm": 3.5977299213409424, + "learning_rate": 0.000951877133105802, + "loss": 7.1202, + "step": 423 + }, + { + "epoch": 0.1447098976109215, + "grad_norm": 3.541287899017334, + "learning_rate": 0.0009517633674630262, + "loss": 6.6661, + "step": 424 + }, + { + "epoch": 0.14505119453924914, + "grad_norm": 3.6210668087005615, + "learning_rate": 0.0009516496018202503, + "loss": 7.1838, + "step": 425 + }, + { + "epoch": 0.1453924914675768, + "grad_norm": 4.462888717651367, + "learning_rate": 0.0009515358361774744, + "loss": 5.8725, + "step": 426 + }, + { + "epoch": 0.14573378839590442, + "grad_norm": 3.566049814224243, + "learning_rate": 0.0009514220705346986, + "loss": 6.7938, + "step": 427 + }, + { + "epoch": 0.14607508532423208, + "grad_norm": 3.4675862789154053, + "learning_rate": 0.0009513083048919227, + "loss": 6.5886, + "step": 428 + }, + { + "epoch": 0.14641638225255973, + "grad_norm": 3.4165515899658203, + "learning_rate": 0.0009511945392491468, + "loss": 6.9825, + "step": 429 + }, + { + "epoch": 0.14675767918088736, + "grad_norm": 3.5711846351623535, + "learning_rate": 0.0009510807736063709, + "loss": 6.9881, + "step": 430 + }, + { + "epoch": 0.14709897610921502, + "grad_norm": 3.523361921310425, + "learning_rate": 0.000950967007963595, + "loss": 6.6169, + "step": 431 + }, + { + "epoch": 0.14744027303754267, + "grad_norm": 3.6487302780151367, + "learning_rate": 0.0009508532423208191, + "loss": 7.0033, + "step": 432 + }, + { + "epoch": 0.1477815699658703, + "grad_norm": 4.30921745300293, + "learning_rate": 0.0009507394766780433, + "loss": 6.4524, + "step": 433 + }, + { + "epoch": 0.14812286689419796, + "grad_norm": 3.7454891204833984, + "learning_rate": 0.0009506257110352674, + "loss": 6.6826, + "step": 434 + }, + { + "epoch": 0.14846416382252559, + "grad_norm": 3.676464796066284, + "learning_rate": 0.0009505119453924915, + "loss": 6.8758, + "step": 435 + }, + { + "epoch": 0.14880546075085324, + "grad_norm": 3.738009452819824, + "learning_rate": 0.0009503981797497156, + "loss": 6.9021, + "step": 436 + }, + { + "epoch": 0.1491467576791809, + "grad_norm": 3.388455629348755, + "learning_rate": 0.0009502844141069397, + "loss": 6.7852, + "step": 437 + }, + { + "epoch": 0.14948805460750852, + "grad_norm": 5.163676738739014, + "learning_rate": 0.0009501706484641638, + "loss": 6.4292, + "step": 438 + }, + { + "epoch": 0.14982935153583618, + "grad_norm": 3.6834332942962646, + "learning_rate": 0.0009500568828213879, + "loss": 6.8296, + "step": 439 + }, + { + "epoch": 0.15017064846416384, + "grad_norm": 3.6888158321380615, + "learning_rate": 0.000949943117178612, + "loss": 6.7824, + "step": 440 + }, + { + "epoch": 0.15051194539249146, + "grad_norm": 3.5720479488372803, + "learning_rate": 0.0009498293515358362, + "loss": 6.973, + "step": 441 + }, + { + "epoch": 0.15085324232081912, + "grad_norm": 3.5570552349090576, + "learning_rate": 0.0009497155858930603, + "loss": 6.264, + "step": 442 + }, + { + "epoch": 0.15119453924914675, + "grad_norm": 3.610084056854248, + "learning_rate": 0.0009496018202502844, + "loss": 6.2526, + "step": 443 + }, + { + "epoch": 0.1515358361774744, + "grad_norm": 3.445772647857666, + "learning_rate": 0.0009494880546075086, + "loss": 6.888, + "step": 444 + }, + { + "epoch": 0.15187713310580206, + "grad_norm": 3.443215847015381, + "learning_rate": 0.0009493742889647327, + "loss": 7.2213, + "step": 445 + }, + { + "epoch": 0.1522184300341297, + "grad_norm": 3.4744131565093994, + "learning_rate": 0.0009492605233219568, + "loss": 6.1855, + "step": 446 + }, + { + "epoch": 0.15255972696245734, + "grad_norm": 3.616422176361084, + "learning_rate": 0.0009491467576791809, + "loss": 6.7537, + "step": 447 + }, + { + "epoch": 0.15290102389078497, + "grad_norm": 3.427217483520508, + "learning_rate": 0.000949032992036405, + "loss": 6.5635, + "step": 448 + }, + { + "epoch": 0.15324232081911263, + "grad_norm": 3.655622720718384, + "learning_rate": 0.0009489192263936291, + "loss": 6.4325, + "step": 449 + }, + { + "epoch": 0.15358361774744028, + "grad_norm": 3.9160969257354736, + "learning_rate": 0.0009488054607508533, + "loss": 6.4151, + "step": 450 + }, + { + "epoch": 0.1539249146757679, + "grad_norm": 4.333205223083496, + "learning_rate": 0.0009486916951080774, + "loss": 6.0077, + "step": 451 + }, + { + "epoch": 0.15426621160409557, + "grad_norm": 3.7891650199890137, + "learning_rate": 0.0009485779294653015, + "loss": 6.3463, + "step": 452 + }, + { + "epoch": 0.15460750853242322, + "grad_norm": 3.907245635986328, + "learning_rate": 0.0009484641638225256, + "loss": 5.503, + "step": 453 + }, + { + "epoch": 0.15494880546075085, + "grad_norm": 3.9546966552734375, + "learning_rate": 0.0009483503981797497, + "loss": 6.6111, + "step": 454 + }, + { + "epoch": 0.1552901023890785, + "grad_norm": 3.712440252304077, + "learning_rate": 0.000948236632536974, + "loss": 6.6064, + "step": 455 + }, + { + "epoch": 0.15563139931740613, + "grad_norm": 3.5256028175354004, + "learning_rate": 0.0009481228668941981, + "loss": 6.9209, + "step": 456 + }, + { + "epoch": 0.1559726962457338, + "grad_norm": 4.240937232971191, + "learning_rate": 0.0009480091012514222, + "loss": 6.3623, + "step": 457 + }, + { + "epoch": 0.15631399317406144, + "grad_norm": 3.398322820663452, + "learning_rate": 0.0009478953356086462, + "loss": 6.574, + "step": 458 + }, + { + "epoch": 0.15665529010238907, + "grad_norm": 3.7059226036071777, + "learning_rate": 0.0009477815699658703, + "loss": 6.9253, + "step": 459 + }, + { + "epoch": 0.15699658703071673, + "grad_norm": 4.11593770980835, + "learning_rate": 0.0009476678043230944, + "loss": 6.4204, + "step": 460 + }, + { + "epoch": 0.15733788395904436, + "grad_norm": 3.5810751914978027, + "learning_rate": 0.0009475540386803186, + "loss": 6.8083, + "step": 461 + }, + { + "epoch": 0.157679180887372, + "grad_norm": 3.538257360458374, + "learning_rate": 0.0009474402730375427, + "loss": 7.1334, + "step": 462 + }, + { + "epoch": 0.15802047781569967, + "grad_norm": 3.456045627593994, + "learning_rate": 0.0009473265073947668, + "loss": 7.0928, + "step": 463 + }, + { + "epoch": 0.1583617747440273, + "grad_norm": 3.6278865337371826, + "learning_rate": 0.0009472127417519909, + "loss": 6.8821, + "step": 464 + }, + { + "epoch": 0.15870307167235495, + "grad_norm": 3.6520791053771973, + "learning_rate": 0.000947098976109215, + "loss": 6.5972, + "step": 465 + }, + { + "epoch": 0.1590443686006826, + "grad_norm": 3.5707054138183594, + "learning_rate": 0.0009469852104664391, + "loss": 7.1507, + "step": 466 + }, + { + "epoch": 0.15938566552901023, + "grad_norm": 3.7022809982299805, + "learning_rate": 0.0009468714448236633, + "loss": 6.5752, + "step": 467 + }, + { + "epoch": 0.1597269624573379, + "grad_norm": 3.426180839538574, + "learning_rate": 0.0009467576791808874, + "loss": 6.6443, + "step": 468 + }, + { + "epoch": 0.16006825938566552, + "grad_norm": 3.5428926944732666, + "learning_rate": 0.0009466439135381115, + "loss": 6.5234, + "step": 469 + }, + { + "epoch": 0.16040955631399317, + "grad_norm": 3.7080366611480713, + "learning_rate": 0.0009465301478953356, + "loss": 7.1171, + "step": 470 + }, + { + "epoch": 0.16075085324232083, + "grad_norm": 3.589245080947876, + "learning_rate": 0.0009464163822525597, + "loss": 6.8297, + "step": 471 + }, + { + "epoch": 0.16109215017064846, + "grad_norm": 3.5839226245880127, + "learning_rate": 0.0009463026166097838, + "loss": 6.6766, + "step": 472 + }, + { + "epoch": 0.1614334470989761, + "grad_norm": 3.460501194000244, + "learning_rate": 0.0009461888509670081, + "loss": 6.9474, + "step": 473 + }, + { + "epoch": 0.16177474402730374, + "grad_norm": 3.6447677612304688, + "learning_rate": 0.0009460750853242322, + "loss": 7.2087, + "step": 474 + }, + { + "epoch": 0.1621160409556314, + "grad_norm": 6.186277866363525, + "learning_rate": 0.0009459613196814563, + "loss": 5.5688, + "step": 475 + }, + { + "epoch": 0.16245733788395905, + "grad_norm": 3.7447400093078613, + "learning_rate": 0.0009458475540386804, + "loss": 6.5, + "step": 476 + }, + { + "epoch": 0.16279863481228668, + "grad_norm": 3.6991195678710938, + "learning_rate": 0.0009457337883959044, + "loss": 6.7563, + "step": 477 + }, + { + "epoch": 0.16313993174061434, + "grad_norm": 3.4451916217803955, + "learning_rate": 0.0009456200227531286, + "loss": 7.0189, + "step": 478 + }, + { + "epoch": 0.163481228668942, + "grad_norm": 17.707456588745117, + "learning_rate": 0.0009455062571103527, + "loss": 5.872, + "step": 479 + }, + { + "epoch": 0.16382252559726962, + "grad_norm": 4.85144567489624, + "learning_rate": 0.0009453924914675768, + "loss": 6.0977, + "step": 480 + }, + { + "epoch": 0.16416382252559727, + "grad_norm": 3.997605085372925, + "learning_rate": 0.0009452787258248009, + "loss": 7.2862, + "step": 481 + }, + { + "epoch": 0.1645051194539249, + "grad_norm": 4.731887340545654, + "learning_rate": 0.000945164960182025, + "loss": 7.1271, + "step": 482 + }, + { + "epoch": 0.16484641638225256, + "grad_norm": 3.7682464122772217, + "learning_rate": 0.0009450511945392491, + "loss": 7.1285, + "step": 483 + }, + { + "epoch": 0.16518771331058021, + "grad_norm": 3.5759470462799072, + "learning_rate": 0.0009449374288964733, + "loss": 7.0162, + "step": 484 + }, + { + "epoch": 0.16552901023890784, + "grad_norm": 3.409883499145508, + "learning_rate": 0.0009448236632536974, + "loss": 7.237, + "step": 485 + }, + { + "epoch": 0.1658703071672355, + "grad_norm": 3.6246159076690674, + "learning_rate": 0.0009447098976109215, + "loss": 6.7235, + "step": 486 + }, + { + "epoch": 0.16621160409556315, + "grad_norm": 3.5993361473083496, + "learning_rate": 0.0009445961319681456, + "loss": 6.7617, + "step": 487 + }, + { + "epoch": 0.16655290102389078, + "grad_norm": 3.4608912467956543, + "learning_rate": 0.0009444823663253697, + "loss": 6.7189, + "step": 488 + }, + { + "epoch": 0.16689419795221844, + "grad_norm": 4.949087619781494, + "learning_rate": 0.0009443686006825938, + "loss": 6.1499, + "step": 489 + }, + { + "epoch": 0.16723549488054607, + "grad_norm": 3.6994855403900146, + "learning_rate": 0.0009442548350398181, + "loss": 7.1528, + "step": 490 + }, + { + "epoch": 0.16757679180887372, + "grad_norm": 3.868940591812134, + "learning_rate": 0.0009441410693970422, + "loss": 7.0506, + "step": 491 + }, + { + "epoch": 0.16791808873720138, + "grad_norm": 3.722750663757324, + "learning_rate": 0.0009440273037542663, + "loss": 7.0174, + "step": 492 + }, + { + "epoch": 0.168259385665529, + "grad_norm": 7.372501373291016, + "learning_rate": 0.0009439135381114904, + "loss": 5.1276, + "step": 493 + }, + { + "epoch": 0.16860068259385666, + "grad_norm": 5.211971282958984, + "learning_rate": 0.0009437997724687145, + "loss": 6.1224, + "step": 494 + }, + { + "epoch": 0.1689419795221843, + "grad_norm": 3.662050485610962, + "learning_rate": 0.0009436860068259387, + "loss": 6.9951, + "step": 495 + }, + { + "epoch": 0.16928327645051194, + "grad_norm": 3.8950295448303223, + "learning_rate": 0.0009435722411831627, + "loss": 6.9776, + "step": 496 + }, + { + "epoch": 0.1696245733788396, + "grad_norm": 3.697416067123413, + "learning_rate": 0.0009434584755403868, + "loss": 6.5273, + "step": 497 + }, + { + "epoch": 0.16996587030716723, + "grad_norm": 3.4824891090393066, + "learning_rate": 0.0009433447098976109, + "loss": 6.9454, + "step": 498 + }, + { + "epoch": 0.17030716723549488, + "grad_norm": 3.859316825866699, + "learning_rate": 0.000943230944254835, + "loss": 5.7186, + "step": 499 + }, + { + "epoch": 0.17064846416382254, + "grad_norm": 8.799308776855469, + "learning_rate": 0.0009431171786120591, + "loss": 6.3941, + "step": 500 + }, + { + "epoch": 0.17098976109215017, + "grad_norm": 3.745943069458008, + "learning_rate": 0.0009430034129692833, + "loss": 7.0287, + "step": 501 + }, + { + "epoch": 0.17133105802047782, + "grad_norm": 3.7031750679016113, + "learning_rate": 0.0009428896473265074, + "loss": 6.6921, + "step": 502 + }, + { + "epoch": 0.17167235494880545, + "grad_norm": 3.5170814990997314, + "learning_rate": 0.0009427758816837315, + "loss": 7.0773, + "step": 503 + }, + { + "epoch": 0.1720136518771331, + "grad_norm": 3.4774396419525146, + "learning_rate": 0.0009426621160409556, + "loss": 6.9226, + "step": 504 + }, + { + "epoch": 0.17235494880546076, + "grad_norm": 3.5223052501678467, + "learning_rate": 0.0009425483503981797, + "loss": 6.7229, + "step": 505 + }, + { + "epoch": 0.1726962457337884, + "grad_norm": 4.429044723510742, + "learning_rate": 0.0009424345847554038, + "loss": 6.1489, + "step": 506 + }, + { + "epoch": 0.17303754266211605, + "grad_norm": 3.946762800216675, + "learning_rate": 0.0009423208191126281, + "loss": 6.3177, + "step": 507 + }, + { + "epoch": 0.17337883959044367, + "grad_norm": 4.3716278076171875, + "learning_rate": 0.0009422070534698522, + "loss": 7.4142, + "step": 508 + }, + { + "epoch": 0.17372013651877133, + "grad_norm": 4.144838809967041, + "learning_rate": 0.0009420932878270763, + "loss": 6.5615, + "step": 509 + }, + { + "epoch": 0.17406143344709898, + "grad_norm": 3.442716121673584, + "learning_rate": 0.0009419795221843004, + "loss": 7.1919, + "step": 510 + }, + { + "epoch": 0.1744027303754266, + "grad_norm": 3.616173505783081, + "learning_rate": 0.0009418657565415245, + "loss": 7.0364, + "step": 511 + }, + { + "epoch": 0.17474402730375427, + "grad_norm": 3.2651257514953613, + "learning_rate": 0.0009417519908987486, + "loss": 7.1869, + "step": 512 + }, + { + "epoch": 0.17508532423208192, + "grad_norm": 3.429654836654663, + "learning_rate": 0.0009416382252559728, + "loss": 7.3759, + "step": 513 + }, + { + "epoch": 0.17542662116040955, + "grad_norm": 3.8366029262542725, + "learning_rate": 0.0009415244596131969, + "loss": 6.5422, + "step": 514 + }, + { + "epoch": 0.1757679180887372, + "grad_norm": 6.700514793395996, + "learning_rate": 0.000941410693970421, + "loss": 5.9699, + "step": 515 + }, + { + "epoch": 0.17610921501706484, + "grad_norm": 3.9318792819976807, + "learning_rate": 0.000941296928327645, + "loss": 7.1496, + "step": 516 + }, + { + "epoch": 0.1764505119453925, + "grad_norm": 3.7701475620269775, + "learning_rate": 0.0009411831626848691, + "loss": 7.1363, + "step": 517 + }, + { + "epoch": 0.17679180887372015, + "grad_norm": 3.5054924488067627, + "learning_rate": 0.0009410693970420933, + "loss": 6.9795, + "step": 518 + }, + { + "epoch": 0.17713310580204777, + "grad_norm": 3.7713727951049805, + "learning_rate": 0.0009409556313993174, + "loss": 6.6172, + "step": 519 + }, + { + "epoch": 0.17747440273037543, + "grad_norm": 8.710354804992676, + "learning_rate": 0.0009408418657565415, + "loss": 6.0337, + "step": 520 + }, + { + "epoch": 0.17781569965870306, + "grad_norm": 4.916598320007324, + "learning_rate": 0.0009407281001137656, + "loss": 6.0356, + "step": 521 + }, + { + "epoch": 0.1781569965870307, + "grad_norm": 3.8350720405578613, + "learning_rate": 0.0009406143344709897, + "loss": 7.201, + "step": 522 + }, + { + "epoch": 0.17849829351535837, + "grad_norm": 3.7248024940490723, + "learning_rate": 0.0009405005688282138, + "loss": 6.5138, + "step": 523 + }, + { + "epoch": 0.178839590443686, + "grad_norm": 3.730102062225342, + "learning_rate": 0.0009403868031854381, + "loss": 6.3692, + "step": 524 + }, + { + "epoch": 0.17918088737201365, + "grad_norm": 3.4296181201934814, + "learning_rate": 0.0009402730375426622, + "loss": 6.7243, + "step": 525 + }, + { + "epoch": 0.1795221843003413, + "grad_norm": 3.4603865146636963, + "learning_rate": 0.0009401592718998863, + "loss": 7.2087, + "step": 526 + }, + { + "epoch": 0.17986348122866894, + "grad_norm": 3.4255411624908447, + "learning_rate": 0.0009400455062571104, + "loss": 6.6606, + "step": 527 + }, + { + "epoch": 0.1802047781569966, + "grad_norm": 4.901156902313232, + "learning_rate": 0.0009399317406143345, + "loss": 6.4545, + "step": 528 + }, + { + "epoch": 0.18054607508532422, + "grad_norm": 3.5273005962371826, + "learning_rate": 0.0009398179749715586, + "loss": 6.8521, + "step": 529 + }, + { + "epoch": 0.18088737201365188, + "grad_norm": 3.794410467147827, + "learning_rate": 0.0009397042093287828, + "loss": 6.5531, + "step": 530 + }, + { + "epoch": 0.18122866894197953, + "grad_norm": 3.8027398586273193, + "learning_rate": 0.0009395904436860069, + "loss": 6.9897, + "step": 531 + }, + { + "epoch": 0.18156996587030716, + "grad_norm": 3.6400845050811768, + "learning_rate": 0.000939476678043231, + "loss": 6.4599, + "step": 532 + }, + { + "epoch": 0.18191126279863482, + "grad_norm": 3.6795458793640137, + "learning_rate": 0.0009393629124004551, + "loss": 7.0436, + "step": 533 + }, + { + "epoch": 0.18225255972696247, + "grad_norm": 3.549872398376465, + "learning_rate": 0.0009392491467576792, + "loss": 6.8872, + "step": 534 + }, + { + "epoch": 0.1825938566552901, + "grad_norm": 3.6060047149658203, + "learning_rate": 0.0009391353811149032, + "loss": 6.8889, + "step": 535 + }, + { + "epoch": 0.18293515358361775, + "grad_norm": 3.5164926052093506, + "learning_rate": 0.0009390216154721274, + "loss": 7.2238, + "step": 536 + }, + { + "epoch": 0.18327645051194538, + "grad_norm": 4.204543590545654, + "learning_rate": 0.0009389078498293515, + "loss": 6.8331, + "step": 537 + }, + { + "epoch": 0.18361774744027304, + "grad_norm": 3.713944673538208, + "learning_rate": 0.0009387940841865756, + "loss": 6.3832, + "step": 538 + }, + { + "epoch": 0.1839590443686007, + "grad_norm": 3.504955530166626, + "learning_rate": 0.0009386803185437997, + "loss": 6.9558, + "step": 539 + }, + { + "epoch": 0.18430034129692832, + "grad_norm": 3.7779481410980225, + "learning_rate": 0.0009385665529010238, + "loss": 6.7189, + "step": 540 + }, + { + "epoch": 0.18464163822525598, + "grad_norm": 3.7830755710601807, + "learning_rate": 0.0009384527872582481, + "loss": 7.0508, + "step": 541 + }, + { + "epoch": 0.1849829351535836, + "grad_norm": 3.6746010780334473, + "learning_rate": 0.0009383390216154722, + "loss": 6.6615, + "step": 542 + }, + { + "epoch": 0.18532423208191126, + "grad_norm": 3.6559464931488037, + "learning_rate": 0.0009382252559726963, + "loss": 7.2021, + "step": 543 + }, + { + "epoch": 0.18566552901023892, + "grad_norm": 3.4099745750427246, + "learning_rate": 0.0009381114903299204, + "loss": 6.8867, + "step": 544 + }, + { + "epoch": 0.18600682593856654, + "grad_norm": 3.7949607372283936, + "learning_rate": 0.0009379977246871445, + "loss": 6.3174, + "step": 545 + }, + { + "epoch": 0.1863481228668942, + "grad_norm": 3.6113579273223877, + "learning_rate": 0.0009378839590443686, + "loss": 7.2956, + "step": 546 + }, + { + "epoch": 0.18668941979522186, + "grad_norm": 4.320201396942139, + "learning_rate": 0.0009377701934015928, + "loss": 7.0727, + "step": 547 + }, + { + "epoch": 0.18703071672354948, + "grad_norm": 3.824106454849243, + "learning_rate": 0.0009376564277588169, + "loss": 6.5791, + "step": 548 + }, + { + "epoch": 0.18737201365187714, + "grad_norm": 3.5648560523986816, + "learning_rate": 0.000937542662116041, + "loss": 7.0372, + "step": 549 + }, + { + "epoch": 0.18771331058020477, + "grad_norm": 6.90482234954834, + "learning_rate": 0.0009374288964732651, + "loss": 6.3724, + "step": 550 + }, + { + "epoch": 0.18805460750853242, + "grad_norm": 3.8881046772003174, + "learning_rate": 0.0009373151308304892, + "loss": 6.7275, + "step": 551 + }, + { + "epoch": 0.18839590443686008, + "grad_norm": 3.4772567749023438, + "learning_rate": 0.0009372013651877133, + "loss": 7.1439, + "step": 552 + }, + { + "epoch": 0.1887372013651877, + "grad_norm": 3.674238681793213, + "learning_rate": 0.0009370875995449375, + "loss": 7.3201, + "step": 553 + }, + { + "epoch": 0.18907849829351536, + "grad_norm": 3.3719608783721924, + "learning_rate": 0.0009369738339021616, + "loss": 6.646, + "step": 554 + }, + { + "epoch": 0.189419795221843, + "grad_norm": 3.6703298091888428, + "learning_rate": 0.0009368600682593856, + "loss": 6.7161, + "step": 555 + }, + { + "epoch": 0.18976109215017065, + "grad_norm": 3.6245388984680176, + "learning_rate": 0.0009367463026166097, + "loss": 6.3849, + "step": 556 + }, + { + "epoch": 0.1901023890784983, + "grad_norm": 3.5778818130493164, + "learning_rate": 0.0009366325369738338, + "loss": 7.4046, + "step": 557 + }, + { + "epoch": 0.19044368600682593, + "grad_norm": 4.9418253898620605, + "learning_rate": 0.000936518771331058, + "loss": 6.5024, + "step": 558 + }, + { + "epoch": 0.19078498293515359, + "grad_norm": 3.6862545013427734, + "learning_rate": 0.0009364050056882822, + "loss": 6.7619, + "step": 559 + }, + { + "epoch": 0.19112627986348124, + "grad_norm": 4.22722864151001, + "learning_rate": 0.0009362912400455063, + "loss": 6.5308, + "step": 560 + }, + { + "epoch": 0.19146757679180887, + "grad_norm": 3.733459949493408, + "learning_rate": 0.0009361774744027304, + "loss": 6.5565, + "step": 561 + }, + { + "epoch": 0.19180887372013652, + "grad_norm": 9.069499015808105, + "learning_rate": 0.0009360637087599545, + "loss": 4.9708, + "step": 562 + }, + { + "epoch": 0.19215017064846415, + "grad_norm": 3.968690872192383, + "learning_rate": 0.0009359499431171786, + "loss": 7.3263, + "step": 563 + }, + { + "epoch": 0.1924914675767918, + "grad_norm": 3.5820865631103516, + "learning_rate": 0.0009358361774744028, + "loss": 6.7364, + "step": 564 + }, + { + "epoch": 0.19283276450511946, + "grad_norm": 3.6914074420928955, + "learning_rate": 0.0009357224118316269, + "loss": 6.8361, + "step": 565 + }, + { + "epoch": 0.1931740614334471, + "grad_norm": 3.580321788787842, + "learning_rate": 0.000935608646188851, + "loss": 6.4783, + "step": 566 + }, + { + "epoch": 0.19351535836177475, + "grad_norm": 3.5576207637786865, + "learning_rate": 0.0009354948805460751, + "loss": 6.5367, + "step": 567 + }, + { + "epoch": 0.19385665529010238, + "grad_norm": 3.515730619430542, + "learning_rate": 0.0009353811149032992, + "loss": 6.481, + "step": 568 + }, + { + "epoch": 0.19419795221843003, + "grad_norm": 3.8840041160583496, + "learning_rate": 0.0009352673492605233, + "loss": 6.4732, + "step": 569 + }, + { + "epoch": 0.1945392491467577, + "grad_norm": 3.75571346282959, + "learning_rate": 0.0009351535836177475, + "loss": 7.1131, + "step": 570 + }, + { + "epoch": 0.19488054607508531, + "grad_norm": 4.804229259490967, + "learning_rate": 0.0009350398179749716, + "loss": 6.1646, + "step": 571 + }, + { + "epoch": 0.19522184300341297, + "grad_norm": 3.6777968406677246, + "learning_rate": 0.0009349260523321957, + "loss": 6.8409, + "step": 572 + }, + { + "epoch": 0.19556313993174063, + "grad_norm": 3.6401546001434326, + "learning_rate": 0.0009348122866894199, + "loss": 7.1515, + "step": 573 + }, + { + "epoch": 0.19590443686006825, + "grad_norm": 3.532172679901123, + "learning_rate": 0.0009346985210466438, + "loss": 6.3835, + "step": 574 + }, + { + "epoch": 0.1962457337883959, + "grad_norm": 3.4800662994384766, + "learning_rate": 0.000934584755403868, + "loss": 7.2508, + "step": 575 + }, + { + "epoch": 0.19658703071672354, + "grad_norm": 3.7157084941864014, + "learning_rate": 0.0009344709897610922, + "loss": 6.508, + "step": 576 + }, + { + "epoch": 0.1969283276450512, + "grad_norm": 3.7525947093963623, + "learning_rate": 0.0009343572241183163, + "loss": 6.3877, + "step": 577 + }, + { + "epoch": 0.19726962457337885, + "grad_norm": 3.6418375968933105, + "learning_rate": 0.0009342434584755404, + "loss": 6.7508, + "step": 578 + }, + { + "epoch": 0.19761092150170648, + "grad_norm": 4.113407135009766, + "learning_rate": 0.0009341296928327645, + "loss": 6.676, + "step": 579 + }, + { + "epoch": 0.19795221843003413, + "grad_norm": 3.6704797744750977, + "learning_rate": 0.0009340159271899886, + "loss": 6.9985, + "step": 580 + }, + { + "epoch": 0.1982935153583618, + "grad_norm": 3.590165138244629, + "learning_rate": 0.0009339021615472128, + "loss": 6.6784, + "step": 581 + }, + { + "epoch": 0.19863481228668942, + "grad_norm": 3.575233221054077, + "learning_rate": 0.0009337883959044369, + "loss": 6.4992, + "step": 582 + }, + { + "epoch": 0.19897610921501707, + "grad_norm": 3.404895782470703, + "learning_rate": 0.000933674630261661, + "loss": 6.9338, + "step": 583 + }, + { + "epoch": 0.1993174061433447, + "grad_norm": 3.5058460235595703, + "learning_rate": 0.0009335608646188851, + "loss": 7.2116, + "step": 584 + }, + { + "epoch": 0.19965870307167236, + "grad_norm": 3.462622880935669, + "learning_rate": 0.0009334470989761092, + "loss": 7.0462, + "step": 585 + }, + { + "epoch": 0.2, + "grad_norm": 4.042704105377197, + "learning_rate": 0.0009333333333333333, + "loss": 5.7069, + "step": 586 + }, + { + "epoch": 0.20034129692832764, + "grad_norm": 3.7907888889312744, + "learning_rate": 0.0009332195676905575, + "loss": 6.7271, + "step": 587 + }, + { + "epoch": 0.2006825938566553, + "grad_norm": 3.531925916671753, + "learning_rate": 0.0009331058020477816, + "loss": 7.0678, + "step": 588 + }, + { + "epoch": 0.20102389078498292, + "grad_norm": 3.567275285720825, + "learning_rate": 0.0009329920364050057, + "loss": 6.9811, + "step": 589 + }, + { + "epoch": 0.20136518771331058, + "grad_norm": 3.8175978660583496, + "learning_rate": 0.0009328782707622299, + "loss": 6.6779, + "step": 590 + }, + { + "epoch": 0.20170648464163823, + "grad_norm": 3.672842502593994, + "learning_rate": 0.000932764505119454, + "loss": 6.6118, + "step": 591 + }, + { + "epoch": 0.20204778156996586, + "grad_norm": 3.625286102294922, + "learning_rate": 0.0009326507394766781, + "loss": 7.1495, + "step": 592 + }, + { + "epoch": 0.20238907849829352, + "grad_norm": 3.4718549251556396, + "learning_rate": 0.0009325369738339023, + "loss": 7.5187, + "step": 593 + }, + { + "epoch": 0.20273037542662117, + "grad_norm": 3.5792765617370605, + "learning_rate": 0.0009324232081911263, + "loss": 6.7316, + "step": 594 + }, + { + "epoch": 0.2030716723549488, + "grad_norm": 19.332000732421875, + "learning_rate": 0.0009323094425483504, + "loss": 6.0635, + "step": 595 + }, + { + "epoch": 0.20341296928327646, + "grad_norm": 3.741669178009033, + "learning_rate": 0.0009321956769055745, + "loss": 7.1053, + "step": 596 + }, + { + "epoch": 0.20375426621160408, + "grad_norm": 4.053689002990723, + "learning_rate": 0.0009320819112627986, + "loss": 6.9079, + "step": 597 + }, + { + "epoch": 0.20409556313993174, + "grad_norm": 3.5358211994171143, + "learning_rate": 0.0009319681456200227, + "loss": 6.9465, + "step": 598 + }, + { + "epoch": 0.2044368600682594, + "grad_norm": 3.4627411365509033, + "learning_rate": 0.0009318543799772469, + "loss": 7.1842, + "step": 599 + }, + { + "epoch": 0.20477815699658702, + "grad_norm": 3.443898916244507, + "learning_rate": 0.000931740614334471, + "loss": 6.5099, + "step": 600 + }, + { + "epoch": 0.20511945392491468, + "grad_norm": 3.4302115440368652, + "learning_rate": 0.0009316268486916951, + "loss": 6.896, + "step": 601 + }, + { + "epoch": 0.2054607508532423, + "grad_norm": 3.346616268157959, + "learning_rate": 0.0009315130830489192, + "loss": 6.8985, + "step": 602 + }, + { + "epoch": 0.20580204778156996, + "grad_norm": 3.753207206726074, + "learning_rate": 0.0009313993174061433, + "loss": 6.4971, + "step": 603 + }, + { + "epoch": 0.20614334470989762, + "grad_norm": 3.675739049911499, + "learning_rate": 0.0009312855517633675, + "loss": 6.6612, + "step": 604 + }, + { + "epoch": 0.20648464163822525, + "grad_norm": 3.6538069248199463, + "learning_rate": 0.0009311717861205916, + "loss": 7.0008, + "step": 605 + }, + { + "epoch": 0.2068259385665529, + "grad_norm": 3.7137982845306396, + "learning_rate": 0.0009310580204778157, + "loss": 7.0696, + "step": 606 + }, + { + "epoch": 0.20716723549488056, + "grad_norm": 3.7337138652801514, + "learning_rate": 0.0009309442548350399, + "loss": 6.686, + "step": 607 + }, + { + "epoch": 0.2075085324232082, + "grad_norm": 7.5925374031066895, + "learning_rate": 0.000930830489192264, + "loss": 6.9714, + "step": 608 + }, + { + "epoch": 0.20784982935153584, + "grad_norm": 3.7731120586395264, + "learning_rate": 0.0009307167235494881, + "loss": 6.924, + "step": 609 + }, + { + "epoch": 0.20819112627986347, + "grad_norm": 6.881464004516602, + "learning_rate": 0.0009306029579067123, + "loss": 6.9183, + "step": 610 + }, + { + "epoch": 0.20853242320819113, + "grad_norm": 3.973215103149414, + "learning_rate": 0.0009304891922639364, + "loss": 6.7891, + "step": 611 + }, + { + "epoch": 0.20887372013651878, + "grad_norm": 3.4739882946014404, + "learning_rate": 0.0009303754266211605, + "loss": 7.1653, + "step": 612 + }, + { + "epoch": 0.2092150170648464, + "grad_norm": 3.3610455989837646, + "learning_rate": 0.0009302616609783845, + "loss": 6.6231, + "step": 613 + }, + { + "epoch": 0.20955631399317406, + "grad_norm": 3.4226975440979004, + "learning_rate": 0.0009301478953356086, + "loss": 7.0169, + "step": 614 + }, + { + "epoch": 0.2098976109215017, + "grad_norm": 4.903579235076904, + "learning_rate": 0.0009300341296928327, + "loss": 6.194, + "step": 615 + }, + { + "epoch": 0.21023890784982935, + "grad_norm": 3.5574898719787598, + "learning_rate": 0.0009299203640500569, + "loss": 7.0527, + "step": 616 + }, + { + "epoch": 0.210580204778157, + "grad_norm": 3.566174030303955, + "learning_rate": 0.000929806598407281, + "loss": 6.8922, + "step": 617 + }, + { + "epoch": 0.21092150170648463, + "grad_norm": 4.031206130981445, + "learning_rate": 0.0009296928327645051, + "loss": 5.8383, + "step": 618 + }, + { + "epoch": 0.2112627986348123, + "grad_norm": 3.552882671356201, + "learning_rate": 0.0009295790671217292, + "loss": 6.6364, + "step": 619 + }, + { + "epoch": 0.21160409556313994, + "grad_norm": 6.967950820922852, + "learning_rate": 0.0009294653014789533, + "loss": 6.5468, + "step": 620 + }, + { + "epoch": 0.21194539249146757, + "grad_norm": 3.7092936038970947, + "learning_rate": 0.0009293515358361775, + "loss": 7.2022, + "step": 621 + }, + { + "epoch": 0.21228668941979523, + "grad_norm": 3.346576690673828, + "learning_rate": 0.0009292377701934016, + "loss": 6.4981, + "step": 622 + }, + { + "epoch": 0.21262798634812285, + "grad_norm": 3.6055033206939697, + "learning_rate": 0.0009291240045506257, + "loss": 6.9378, + "step": 623 + }, + { + "epoch": 0.2129692832764505, + "grad_norm": 3.5705137252807617, + "learning_rate": 0.0009290102389078499, + "loss": 7.0167, + "step": 624 + }, + { + "epoch": 0.21331058020477817, + "grad_norm": 4.07545280456543, + "learning_rate": 0.000928896473265074, + "loss": 6.4161, + "step": 625 + }, + { + "epoch": 0.2136518771331058, + "grad_norm": 3.458583116531372, + "learning_rate": 0.0009287827076222981, + "loss": 6.7419, + "step": 626 + }, + { + "epoch": 0.21399317406143345, + "grad_norm": 3.4590044021606445, + "learning_rate": 0.0009286689419795223, + "loss": 6.5505, + "step": 627 + }, + { + "epoch": 0.2143344709897611, + "grad_norm": 3.5659596920013428, + "learning_rate": 0.0009285551763367464, + "loss": 6.6099, + "step": 628 + }, + { + "epoch": 0.21467576791808873, + "grad_norm": 3.5950722694396973, + "learning_rate": 0.0009284414106939705, + "loss": 6.6846, + "step": 629 + }, + { + "epoch": 0.2150170648464164, + "grad_norm": 3.622309923171997, + "learning_rate": 0.0009283276450511946, + "loss": 6.7184, + "step": 630 + }, + { + "epoch": 0.21535836177474402, + "grad_norm": 3.5090525150299072, + "learning_rate": 0.0009282138794084187, + "loss": 7.0747, + "step": 631 + }, + { + "epoch": 0.21569965870307167, + "grad_norm": 3.4692866802215576, + "learning_rate": 0.0009281001137656428, + "loss": 7.4671, + "step": 632 + }, + { + "epoch": 0.21604095563139933, + "grad_norm": 3.454890489578247, + "learning_rate": 0.0009279863481228669, + "loss": 7.0538, + "step": 633 + }, + { + "epoch": 0.21638225255972696, + "grad_norm": 5.090976715087891, + "learning_rate": 0.000927872582480091, + "loss": 6.5268, + "step": 634 + }, + { + "epoch": 0.2167235494880546, + "grad_norm": 3.852503776550293, + "learning_rate": 0.0009277588168373151, + "loss": 6.9396, + "step": 635 + }, + { + "epoch": 0.21706484641638224, + "grad_norm": 3.7737314701080322, + "learning_rate": 0.0009276450511945392, + "loss": 6.7992, + "step": 636 + }, + { + "epoch": 0.2174061433447099, + "grad_norm": 3.6202504634857178, + "learning_rate": 0.0009275312855517633, + "loss": 7.4372, + "step": 637 + }, + { + "epoch": 0.21774744027303755, + "grad_norm": 3.5939218997955322, + "learning_rate": 0.0009274175199089874, + "loss": 6.6953, + "step": 638 + }, + { + "epoch": 0.21808873720136518, + "grad_norm": 3.323547601699829, + "learning_rate": 0.0009273037542662116, + "loss": 6.7816, + "step": 639 + }, + { + "epoch": 0.21843003412969283, + "grad_norm": 3.6134841442108154, + "learning_rate": 0.0009271899886234357, + "loss": 6.6858, + "step": 640 + }, + { + "epoch": 0.2187713310580205, + "grad_norm": 3.660320520401001, + "learning_rate": 0.0009270762229806599, + "loss": 7.0058, + "step": 641 + }, + { + "epoch": 0.21911262798634812, + "grad_norm": 3.380748748779297, + "learning_rate": 0.000926962457337884, + "loss": 6.735, + "step": 642 + }, + { + "epoch": 0.21945392491467577, + "grad_norm": 6.264308452606201, + "learning_rate": 0.0009268486916951081, + "loss": 5.8211, + "step": 643 + }, + { + "epoch": 0.2197952218430034, + "grad_norm": 3.7562389373779297, + "learning_rate": 0.0009267349260523323, + "loss": 6.7083, + "step": 644 + }, + { + "epoch": 0.22013651877133106, + "grad_norm": 3.7538766860961914, + "learning_rate": 0.0009266211604095564, + "loss": 6.8536, + "step": 645 + }, + { + "epoch": 0.2204778156996587, + "grad_norm": 3.648890972137451, + "learning_rate": 0.0009265073947667805, + "loss": 6.6039, + "step": 646 + }, + { + "epoch": 0.22081911262798634, + "grad_norm": 3.616142988204956, + "learning_rate": 0.0009263936291240046, + "loss": 6.0385, + "step": 647 + }, + { + "epoch": 0.221160409556314, + "grad_norm": 3.6815671920776367, + "learning_rate": 0.0009262798634812287, + "loss": 7.124, + "step": 648 + }, + { + "epoch": 0.22150170648464163, + "grad_norm": 3.596876382827759, + "learning_rate": 0.0009261660978384528, + "loss": 6.5317, + "step": 649 + }, + { + "epoch": 0.22184300341296928, + "grad_norm": 3.4325168132781982, + "learning_rate": 0.000926052332195677, + "loss": 6.7055, + "step": 650 + }, + { + "epoch": 0.22218430034129694, + "grad_norm": 3.43967604637146, + "learning_rate": 0.0009259385665529011, + "loss": 6.9872, + "step": 651 + }, + { + "epoch": 0.22252559726962456, + "grad_norm": 3.4720215797424316, + "learning_rate": 0.0009258248009101251, + "loss": 6.807, + "step": 652 + }, + { + "epoch": 0.22286689419795222, + "grad_norm": 4.370832920074463, + "learning_rate": 0.0009257110352673492, + "loss": 5.8649, + "step": 653 + }, + { + "epoch": 0.22320819112627988, + "grad_norm": 3.566333293914795, + "learning_rate": 0.0009255972696245733, + "loss": 6.6208, + "step": 654 + }, + { + "epoch": 0.2235494880546075, + "grad_norm": 3.5970399379730225, + "learning_rate": 0.0009254835039817974, + "loss": 7.0027, + "step": 655 + }, + { + "epoch": 0.22389078498293516, + "grad_norm": 3.698763370513916, + "learning_rate": 0.0009253697383390216, + "loss": 7.0716, + "step": 656 + }, + { + "epoch": 0.2242320819112628, + "grad_norm": 3.4119129180908203, + "learning_rate": 0.0009252559726962458, + "loss": 6.8046, + "step": 657 + }, + { + "epoch": 0.22457337883959044, + "grad_norm": 3.6655516624450684, + "learning_rate": 0.0009251422070534699, + "loss": 6.9696, + "step": 658 + }, + { + "epoch": 0.2249146757679181, + "grad_norm": 5.750580787658691, + "learning_rate": 0.000925028441410694, + "loss": 5.6451, + "step": 659 + }, + { + "epoch": 0.22525597269624573, + "grad_norm": 3.9216561317443848, + "learning_rate": 0.0009249146757679181, + "loss": 6.4134, + "step": 660 + }, + { + "epoch": 0.22559726962457338, + "grad_norm": 4.239558696746826, + "learning_rate": 0.0009248009101251423, + "loss": 5.6066, + "step": 661 + }, + { + "epoch": 0.225938566552901, + "grad_norm": 3.7342917919158936, + "learning_rate": 0.0009246871444823664, + "loss": 7.0818, + "step": 662 + }, + { + "epoch": 0.22627986348122867, + "grad_norm": 5.248586177825928, + "learning_rate": 0.0009245733788395905, + "loss": 5.4571, + "step": 663 + }, + { + "epoch": 0.22662116040955632, + "grad_norm": 3.6075022220611572, + "learning_rate": 0.0009244596131968146, + "loss": 6.9643, + "step": 664 + }, + { + "epoch": 0.22696245733788395, + "grad_norm": 3.528815269470215, + "learning_rate": 0.0009243458475540387, + "loss": 7.1822, + "step": 665 + }, + { + "epoch": 0.2273037542662116, + "grad_norm": 3.4905171394348145, + "learning_rate": 0.0009242320819112628, + "loss": 7.0671, + "step": 666 + }, + { + "epoch": 0.22764505119453926, + "grad_norm": 3.64202880859375, + "learning_rate": 0.000924118316268487, + "loss": 6.6831, + "step": 667 + }, + { + "epoch": 0.2279863481228669, + "grad_norm": 6.533376216888428, + "learning_rate": 0.0009240045506257111, + "loss": 6.1337, + "step": 668 + }, + { + "epoch": 0.22832764505119454, + "grad_norm": 3.9284870624542236, + "learning_rate": 0.0009238907849829352, + "loss": 6.5213, + "step": 669 + }, + { + "epoch": 0.22866894197952217, + "grad_norm": 3.7156975269317627, + "learning_rate": 0.0009237770193401593, + "loss": 6.5675, + "step": 670 + }, + { + "epoch": 0.22901023890784983, + "grad_norm": 4.064757347106934, + "learning_rate": 0.0009236632536973833, + "loss": 7.1138, + "step": 671 + }, + { + "epoch": 0.22935153583617748, + "grad_norm": 5.838778018951416, + "learning_rate": 0.0009235494880546074, + "loss": 6.6873, + "step": 672 + }, + { + "epoch": 0.2296928327645051, + "grad_norm": 3.700157642364502, + "learning_rate": 0.0009234357224118316, + "loss": 6.5795, + "step": 673 + }, + { + "epoch": 0.23003412969283277, + "grad_norm": 3.6335175037384033, + "learning_rate": 0.0009233219567690558, + "loss": 6.5419, + "step": 674 + }, + { + "epoch": 0.23037542662116042, + "grad_norm": 3.073715925216675, + "learning_rate": 0.0009232081911262799, + "loss": 6.6049, + "step": 675 + }, + { + "epoch": 0.23071672354948805, + "grad_norm": 3.373309850692749, + "learning_rate": 0.000923094425483504, + "loss": 6.7097, + "step": 676 + }, + { + "epoch": 0.2310580204778157, + "grad_norm": 3.3596081733703613, + "learning_rate": 0.0009229806598407281, + "loss": 6.8117, + "step": 677 + }, + { + "epoch": 0.23139931740614333, + "grad_norm": 3.4036943912506104, + "learning_rate": 0.0009228668941979522, + "loss": 7.1459, + "step": 678 + }, + { + "epoch": 0.231740614334471, + "grad_norm": 5.635969638824463, + "learning_rate": 0.0009227531285551764, + "loss": 5.9685, + "step": 679 + }, + { + "epoch": 0.23208191126279865, + "grad_norm": 4.183934688568115, + "learning_rate": 0.0009226393629124005, + "loss": 6.5457, + "step": 680 + }, + { + "epoch": 0.23242320819112627, + "grad_norm": 3.8760788440704346, + "learning_rate": 0.0009225255972696246, + "loss": 7.0518, + "step": 681 + }, + { + "epoch": 0.23276450511945393, + "grad_norm": 3.5754668712615967, + "learning_rate": 0.0009224118316268487, + "loss": 6.5852, + "step": 682 + }, + { + "epoch": 0.23310580204778156, + "grad_norm": 3.5699808597564697, + "learning_rate": 0.0009222980659840728, + "loss": 7.1735, + "step": 683 + }, + { + "epoch": 0.2334470989761092, + "grad_norm": 3.3643417358398438, + "learning_rate": 0.000922184300341297, + "loss": 6.8809, + "step": 684 + }, + { + "epoch": 0.23378839590443687, + "grad_norm": 4.393078804016113, + "learning_rate": 0.0009220705346985211, + "loss": 6.2643, + "step": 685 + }, + { + "epoch": 0.2341296928327645, + "grad_norm": 3.7724368572235107, + "learning_rate": 0.0009219567690557452, + "loss": 6.6099, + "step": 686 + }, + { + "epoch": 0.23447098976109215, + "grad_norm": 3.7385144233703613, + "learning_rate": 0.0009218430034129693, + "loss": 6.8247, + "step": 687 + }, + { + "epoch": 0.2348122866894198, + "grad_norm": 3.5062074661254883, + "learning_rate": 0.0009217292377701934, + "loss": 7.1594, + "step": 688 + }, + { + "epoch": 0.23515358361774744, + "grad_norm": 4.198519229888916, + "learning_rate": 0.0009216154721274175, + "loss": 6.0545, + "step": 689 + }, + { + "epoch": 0.2354948805460751, + "grad_norm": 3.814720392227173, + "learning_rate": 0.0009215017064846418, + "loss": 7.1292, + "step": 690 + }, + { + "epoch": 0.23583617747440272, + "grad_norm": 3.642256498336792, + "learning_rate": 0.0009213879408418658, + "loss": 7.0849, + "step": 691 + }, + { + "epoch": 0.23617747440273038, + "grad_norm": 4.872190475463867, + "learning_rate": 0.0009212741751990899, + "loss": 6.8024, + "step": 692 + }, + { + "epoch": 0.23651877133105803, + "grad_norm": 3.5935611724853516, + "learning_rate": 0.000921160409556314, + "loss": 6.6592, + "step": 693 + }, + { + "epoch": 0.23686006825938566, + "grad_norm": 4.270242214202881, + "learning_rate": 0.0009210466439135381, + "loss": 6.636, + "step": 694 + }, + { + "epoch": 0.23720136518771331, + "grad_norm": 3.73964524269104, + "learning_rate": 0.0009209328782707622, + "loss": 7.3186, + "step": 695 + }, + { + "epoch": 0.23754266211604094, + "grad_norm": 3.55539870262146, + "learning_rate": 0.0009208191126279864, + "loss": 7.008, + "step": 696 + }, + { + "epoch": 0.2378839590443686, + "grad_norm": 3.5897581577301025, + "learning_rate": 0.0009207053469852105, + "loss": 6.8969, + "step": 697 + }, + { + "epoch": 0.23822525597269625, + "grad_norm": 3.5224783420562744, + "learning_rate": 0.0009205915813424346, + "loss": 6.6032, + "step": 698 + }, + { + "epoch": 0.23856655290102388, + "grad_norm": 3.5760772228240967, + "learning_rate": 0.0009204778156996587, + "loss": 6.8774, + "step": 699 + }, + { + "epoch": 0.23890784982935154, + "grad_norm": 3.4625370502471924, + "learning_rate": 0.0009203640500568828, + "loss": 6.934, + "step": 700 + }, + { + "epoch": 0.2392491467576792, + "grad_norm": 3.7003684043884277, + "learning_rate": 0.0009202502844141069, + "loss": 6.7796, + "step": 701 + }, + { + "epoch": 0.23959044368600682, + "grad_norm": 3.4892773628234863, + "learning_rate": 0.0009201365187713311, + "loss": 6.6771, + "step": 702 + }, + { + "epoch": 0.23993174061433448, + "grad_norm": 3.626009941101074, + "learning_rate": 0.0009200227531285552, + "loss": 6.4784, + "step": 703 + }, + { + "epoch": 0.2402730375426621, + "grad_norm": 4.080516815185547, + "learning_rate": 0.0009199089874857793, + "loss": 6.844, + "step": 704 + }, + { + "epoch": 0.24061433447098976, + "grad_norm": 3.7570433616638184, + "learning_rate": 0.0009197952218430034, + "loss": 6.3753, + "step": 705 + }, + { + "epoch": 0.24095563139931742, + "grad_norm": 3.6478946208953857, + "learning_rate": 0.0009196814562002275, + "loss": 6.4808, + "step": 706 + }, + { + "epoch": 0.24129692832764504, + "grad_norm": 3.6579360961914062, + "learning_rate": 0.0009195676905574518, + "loss": 6.345, + "step": 707 + }, + { + "epoch": 0.2416382252559727, + "grad_norm": 3.3532590866088867, + "learning_rate": 0.0009194539249146759, + "loss": 6.8791, + "step": 708 + }, + { + "epoch": 0.24197952218430033, + "grad_norm": 3.424499273300171, + "learning_rate": 0.0009193401592719, + "loss": 6.5488, + "step": 709 + }, + { + "epoch": 0.24232081911262798, + "grad_norm": 3.450228452682495, + "learning_rate": 0.000919226393629124, + "loss": 6.8405, + "step": 710 + }, + { + "epoch": 0.24266211604095564, + "grad_norm": 3.4876630306243896, + "learning_rate": 0.0009191126279863481, + "loss": 6.9152, + "step": 711 + }, + { + "epoch": 0.24300341296928327, + "grad_norm": 3.635850429534912, + "learning_rate": 0.0009189988623435722, + "loss": 6.6264, + "step": 712 + }, + { + "epoch": 0.24334470989761092, + "grad_norm": 3.780963897705078, + "learning_rate": 0.0009188850967007964, + "loss": 6.1287, + "step": 713 + }, + { + "epoch": 0.24368600682593858, + "grad_norm": 3.4603798389434814, + "learning_rate": 0.0009187713310580205, + "loss": 6.6569, + "step": 714 + }, + { + "epoch": 0.2440273037542662, + "grad_norm": 3.917168378829956, + "learning_rate": 0.0009186575654152446, + "loss": 6.2551, + "step": 715 + }, + { + "epoch": 0.24436860068259386, + "grad_norm": 3.537386417388916, + "learning_rate": 0.0009185437997724687, + "loss": 7.0423, + "step": 716 + }, + { + "epoch": 0.2447098976109215, + "grad_norm": 3.577162027359009, + "learning_rate": 0.0009184300341296928, + "loss": 6.9338, + "step": 717 + }, + { + "epoch": 0.24505119453924915, + "grad_norm": 3.8202831745147705, + "learning_rate": 0.0009183162684869169, + "loss": 6.0808, + "step": 718 + }, + { + "epoch": 0.2453924914675768, + "grad_norm": 3.6418538093566895, + "learning_rate": 0.0009182025028441411, + "loss": 7.1966, + "step": 719 + }, + { + "epoch": 0.24573378839590443, + "grad_norm": 3.42926025390625, + "learning_rate": 0.0009180887372013652, + "loss": 7.0308, + "step": 720 + }, + { + "epoch": 0.24607508532423208, + "grad_norm": 3.68467378616333, + "learning_rate": 0.0009179749715585893, + "loss": 6.9812, + "step": 721 + }, + { + "epoch": 0.24641638225255974, + "grad_norm": 3.54465651512146, + "learning_rate": 0.0009178612059158134, + "loss": 6.8175, + "step": 722 + }, + { + "epoch": 0.24675767918088737, + "grad_norm": 3.897510051727295, + "learning_rate": 0.0009177474402730375, + "loss": 6.6669, + "step": 723 + }, + { + "epoch": 0.24709897610921502, + "grad_norm": 3.5380194187164307, + "learning_rate": 0.0009176336746302618, + "loss": 6.7135, + "step": 724 + }, + { + "epoch": 0.24744027303754265, + "grad_norm": 4.017455577850342, + "learning_rate": 0.0009175199089874859, + "loss": 6.2163, + "step": 725 + }, + { + "epoch": 0.2477815699658703, + "grad_norm": 3.646085739135742, + "learning_rate": 0.00091740614334471, + "loss": 6.6526, + "step": 726 + }, + { + "epoch": 0.24812286689419796, + "grad_norm": 3.8709123134613037, + "learning_rate": 0.0009172923777019341, + "loss": 6.7876, + "step": 727 + }, + { + "epoch": 0.2484641638225256, + "grad_norm": 3.960822582244873, + "learning_rate": 0.0009171786120591582, + "loss": 6.7348, + "step": 728 + }, + { + "epoch": 0.24880546075085325, + "grad_norm": 3.8198039531707764, + "learning_rate": 0.0009170648464163823, + "loss": 6.4908, + "step": 729 + }, + { + "epoch": 0.24914675767918087, + "grad_norm": 4.048033237457275, + "learning_rate": 0.0009169510807736064, + "loss": 7.083, + "step": 730 + }, + { + "epoch": 0.24948805460750853, + "grad_norm": 3.5977671146392822, + "learning_rate": 0.0009168373151308305, + "loss": 7.1354, + "step": 731 + }, + { + "epoch": 0.24982935153583619, + "grad_norm": 3.9307358264923096, + "learning_rate": 0.0009167235494880546, + "loss": 6.8698, + "step": 732 + }, + { + "epoch": 0.2501706484641638, + "grad_norm": 3.6373484134674072, + "learning_rate": 0.0009166097838452787, + "loss": 6.9548, + "step": 733 + }, + { + "epoch": 0.25051194539249144, + "grad_norm": 3.8144493103027344, + "learning_rate": 0.0009164960182025028, + "loss": 6.875, + "step": 734 + }, + { + "epoch": 0.2508532423208191, + "grad_norm": 3.9613707065582275, + "learning_rate": 0.0009163822525597269, + "loss": 6.3705, + "step": 735 + }, + { + "epoch": 0.25119453924914675, + "grad_norm": 4.828775882720947, + "learning_rate": 0.0009162684869169511, + "loss": 6.0426, + "step": 736 + }, + { + "epoch": 0.2515358361774744, + "grad_norm": 3.8135440349578857, + "learning_rate": 0.0009161547212741752, + "loss": 6.7698, + "step": 737 + }, + { + "epoch": 0.25187713310580206, + "grad_norm": 3.477957248687744, + "learning_rate": 0.0009160409556313993, + "loss": 6.7878, + "step": 738 + }, + { + "epoch": 0.2522184300341297, + "grad_norm": 3.6808841228485107, + "learning_rate": 0.0009159271899886234, + "loss": 6.9381, + "step": 739 + }, + { + "epoch": 0.2525597269624573, + "grad_norm": 3.500927448272705, + "learning_rate": 0.0009158134243458475, + "loss": 6.7481, + "step": 740 + }, + { + "epoch": 0.252901023890785, + "grad_norm": 3.291139602661133, + "learning_rate": 0.0009156996587030717, + "loss": 6.6917, + "step": 741 + }, + { + "epoch": 0.25324232081911263, + "grad_norm": 3.652759552001953, + "learning_rate": 0.0009155858930602959, + "loss": 7.108, + "step": 742 + }, + { + "epoch": 0.25358361774744026, + "grad_norm": 3.6527302265167236, + "learning_rate": 0.00091547212741752, + "loss": 6.6711, + "step": 743 + }, + { + "epoch": 0.25392491467576794, + "grad_norm": 3.4871606826782227, + "learning_rate": 0.0009153583617747441, + "loss": 7.0744, + "step": 744 + }, + { + "epoch": 0.25426621160409557, + "grad_norm": 5.465394973754883, + "learning_rate": 0.0009152445961319682, + "loss": 6.4962, + "step": 745 + }, + { + "epoch": 0.2546075085324232, + "grad_norm": 3.8844666481018066, + "learning_rate": 0.0009151308304891923, + "loss": 6.8589, + "step": 746 + }, + { + "epoch": 0.2549488054607508, + "grad_norm": 3.7383275032043457, + "learning_rate": 0.0009150170648464165, + "loss": 7.1996, + "step": 747 + }, + { + "epoch": 0.2552901023890785, + "grad_norm": 3.918121814727783, + "learning_rate": 0.0009149032992036406, + "loss": 6.5667, + "step": 748 + }, + { + "epoch": 0.25563139931740614, + "grad_norm": 3.829987049102783, + "learning_rate": 0.0009147895335608646, + "loss": 6.5303, + "step": 749 + }, + { + "epoch": 0.25597269624573377, + "grad_norm": 3.3914873600006104, + "learning_rate": 0.0009146757679180887, + "loss": 7.1671, + "step": 750 + }, + { + "epoch": 0.25631399317406145, + "grad_norm": 3.518707275390625, + "learning_rate": 0.0009145620022753128, + "loss": 6.7905, + "step": 751 + }, + { + "epoch": 0.2566552901023891, + "grad_norm": 3.641456127166748, + "learning_rate": 0.0009144482366325369, + "loss": 6.7043, + "step": 752 + }, + { + "epoch": 0.2569965870307167, + "grad_norm": 3.5336403846740723, + "learning_rate": 0.0009143344709897611, + "loss": 6.4704, + "step": 753 + }, + { + "epoch": 0.2573378839590444, + "grad_norm": 3.45283579826355, + "learning_rate": 0.0009142207053469852, + "loss": 6.7629, + "step": 754 + }, + { + "epoch": 0.257679180887372, + "grad_norm": 3.4973959922790527, + "learning_rate": 0.0009141069397042093, + "loss": 6.82, + "step": 755 + }, + { + "epoch": 0.25802047781569964, + "grad_norm": 3.7503435611724854, + "learning_rate": 0.0009139931740614334, + "loss": 6.2377, + "step": 756 + }, + { + "epoch": 0.25836177474402733, + "grad_norm": 3.7249867916107178, + "learning_rate": 0.0009138794084186575, + "loss": 7.3, + "step": 757 + }, + { + "epoch": 0.25870307167235496, + "grad_norm": 3.5649001598358154, + "learning_rate": 0.0009137656427758817, + "loss": 6.6075, + "step": 758 + }, + { + "epoch": 0.2590443686006826, + "grad_norm": 4.057753562927246, + "learning_rate": 0.0009136518771331059, + "loss": 6.6559, + "step": 759 + }, + { + "epoch": 0.2593856655290102, + "grad_norm": 3.949220657348633, + "learning_rate": 0.00091353811149033, + "loss": 6.8383, + "step": 760 + }, + { + "epoch": 0.2597269624573379, + "grad_norm": 3.460750102996826, + "learning_rate": 0.0009134243458475541, + "loss": 6.892, + "step": 761 + }, + { + "epoch": 0.2600682593856655, + "grad_norm": 7.605457305908203, + "learning_rate": 0.0009133105802047782, + "loss": 6.766, + "step": 762 + }, + { + "epoch": 0.26040955631399315, + "grad_norm": 3.6430115699768066, + "learning_rate": 0.0009131968145620023, + "loss": 6.9554, + "step": 763 + }, + { + "epoch": 0.26075085324232083, + "grad_norm": 3.728748083114624, + "learning_rate": 0.0009130830489192265, + "loss": 7.2722, + "step": 764 + }, + { + "epoch": 0.26109215017064846, + "grad_norm": 3.6627445220947266, + "learning_rate": 0.0009129692832764506, + "loss": 6.8527, + "step": 765 + }, + { + "epoch": 0.2614334470989761, + "grad_norm": 3.310154676437378, + "learning_rate": 0.0009128555176336747, + "loss": 7.1432, + "step": 766 + }, + { + "epoch": 0.2617747440273038, + "grad_norm": 3.2910642623901367, + "learning_rate": 0.0009127417519908988, + "loss": 7.0282, + "step": 767 + }, + { + "epoch": 0.2621160409556314, + "grad_norm": 3.301368236541748, + "learning_rate": 0.0009126279863481229, + "loss": 6.8306, + "step": 768 + }, + { + "epoch": 0.26245733788395903, + "grad_norm": 6.695905685424805, + "learning_rate": 0.0009125142207053469, + "loss": 5.8919, + "step": 769 + }, + { + "epoch": 0.2627986348122867, + "grad_norm": 3.5930447578430176, + "learning_rate": 0.0009124004550625711, + "loss": 6.3776, + "step": 770 + }, + { + "epoch": 0.26313993174061434, + "grad_norm": 3.598477363586426, + "learning_rate": 0.0009122866894197952, + "loss": 6.9116, + "step": 771 + }, + { + "epoch": 0.26348122866894197, + "grad_norm": 3.46647572517395, + "learning_rate": 0.0009121729237770193, + "loss": 6.6275, + "step": 772 + }, + { + "epoch": 0.2638225255972696, + "grad_norm": 3.6713485717773438, + "learning_rate": 0.0009120591581342434, + "loss": 6.3535, + "step": 773 + }, + { + "epoch": 0.2641638225255973, + "grad_norm": 4.576082229614258, + "learning_rate": 0.0009119453924914675, + "loss": 6.585, + "step": 774 + }, + { + "epoch": 0.2645051194539249, + "grad_norm": 3.976733922958374, + "learning_rate": 0.0009118316268486917, + "loss": 6.479, + "step": 775 + }, + { + "epoch": 0.26484641638225254, + "grad_norm": 3.6250696182250977, + "learning_rate": 0.0009117178612059159, + "loss": 6.4861, + "step": 776 + }, + { + "epoch": 0.2651877133105802, + "grad_norm": 3.517406702041626, + "learning_rate": 0.00091160409556314, + "loss": 7.2256, + "step": 777 + }, + { + "epoch": 0.26552901023890785, + "grad_norm": 3.710671901702881, + "learning_rate": 0.0009114903299203641, + "loss": 6.5771, + "step": 778 + }, + { + "epoch": 0.2658703071672355, + "grad_norm": 3.4462730884552, + "learning_rate": 0.0009113765642775882, + "loss": 6.6247, + "step": 779 + }, + { + "epoch": 0.26621160409556316, + "grad_norm": 3.5399367809295654, + "learning_rate": 0.0009112627986348123, + "loss": 6.4941, + "step": 780 + }, + { + "epoch": 0.2665529010238908, + "grad_norm": 3.5168278217315674, + "learning_rate": 0.0009111490329920364, + "loss": 6.8313, + "step": 781 + }, + { + "epoch": 0.2668941979522184, + "grad_norm": 3.445139169692993, + "learning_rate": 0.0009110352673492606, + "loss": 6.7253, + "step": 782 + }, + { + "epoch": 0.2672354948805461, + "grad_norm": 3.9971537590026855, + "learning_rate": 0.0009109215017064847, + "loss": 6.8024, + "step": 783 + }, + { + "epoch": 0.2675767918088737, + "grad_norm": 3.8834123611450195, + "learning_rate": 0.0009108077360637088, + "loss": 6.729, + "step": 784 + }, + { + "epoch": 0.26791808873720135, + "grad_norm": 4.05747652053833, + "learning_rate": 0.0009106939704209329, + "loss": 6.3846, + "step": 785 + }, + { + "epoch": 0.26825938566552904, + "grad_norm": 3.5583267211914062, + "learning_rate": 0.000910580204778157, + "loss": 7.0615, + "step": 786 + }, + { + "epoch": 0.26860068259385667, + "grad_norm": 3.9900383949279785, + "learning_rate": 0.0009104664391353812, + "loss": 6.3194, + "step": 787 + }, + { + "epoch": 0.2689419795221843, + "grad_norm": 3.564735174179077, + "learning_rate": 0.0009103526734926052, + "loss": 6.736, + "step": 788 + }, + { + "epoch": 0.2692832764505119, + "grad_norm": 3.4203550815582275, + "learning_rate": 0.0009102389078498293, + "loss": 6.8528, + "step": 789 + }, + { + "epoch": 0.2696245733788396, + "grad_norm": 3.507297992706299, + "learning_rate": 0.0009101251422070534, + "loss": 6.7446, + "step": 790 + }, + { + "epoch": 0.26996587030716723, + "grad_norm": 3.489607095718384, + "learning_rate": 0.0009100113765642775, + "loss": 7.1027, + "step": 791 + }, + { + "epoch": 0.27030716723549486, + "grad_norm": 3.3406436443328857, + "learning_rate": 0.0009098976109215017, + "loss": 6.9971, + "step": 792 + }, + { + "epoch": 0.27064846416382254, + "grad_norm": 3.441478967666626, + "learning_rate": 0.0009097838452787259, + "loss": 6.8602, + "step": 793 + }, + { + "epoch": 0.27098976109215017, + "grad_norm": 3.7188546657562256, + "learning_rate": 0.00090967007963595, + "loss": 6.4872, + "step": 794 + }, + { + "epoch": 0.2713310580204778, + "grad_norm": 3.50281023979187, + "learning_rate": 0.0009095563139931741, + "loss": 6.6612, + "step": 795 + }, + { + "epoch": 0.2716723549488055, + "grad_norm": 3.8761279582977295, + "learning_rate": 0.0009094425483503982, + "loss": 6.5065, + "step": 796 + }, + { + "epoch": 0.2720136518771331, + "grad_norm": 5.031599521636963, + "learning_rate": 0.0009093287827076223, + "loss": 5.0877, + "step": 797 + }, + { + "epoch": 0.27235494880546074, + "grad_norm": 3.8323252201080322, + "learning_rate": 0.0009092150170648464, + "loss": 6.7722, + "step": 798 + }, + { + "epoch": 0.2726962457337884, + "grad_norm": 5.560990333557129, + "learning_rate": 0.0009091012514220706, + "loss": 5.4916, + "step": 799 + }, + { + "epoch": 0.27303754266211605, + "grad_norm": 3.516458511352539, + "learning_rate": 0.0009089874857792947, + "loss": 6.7099, + "step": 800 + }, + { + "epoch": 0.2733788395904437, + "grad_norm": 3.5895726680755615, + "learning_rate": 0.0009088737201365188, + "loss": 6.3197, + "step": 801 + }, + { + "epoch": 0.2737201365187713, + "grad_norm": 3.489635467529297, + "learning_rate": 0.0009087599544937429, + "loss": 7.0153, + "step": 802 + }, + { + "epoch": 0.274061433447099, + "grad_norm": 3.5357353687286377, + "learning_rate": 0.000908646188850967, + "loss": 6.392, + "step": 803 + }, + { + "epoch": 0.2744027303754266, + "grad_norm": 3.435361623764038, + "learning_rate": 0.0009085324232081912, + "loss": 6.7289, + "step": 804 + }, + { + "epoch": 0.27474402730375425, + "grad_norm": 3.4798481464385986, + "learning_rate": 0.0009084186575654153, + "loss": 7.1502, + "step": 805 + }, + { + "epoch": 0.27508532423208193, + "grad_norm": 4.012742042541504, + "learning_rate": 0.0009083048919226394, + "loss": 5.9575, + "step": 806 + }, + { + "epoch": 0.27542662116040956, + "grad_norm": 3.503770112991333, + "learning_rate": 0.0009081911262798636, + "loss": 6.7986, + "step": 807 + }, + { + "epoch": 0.2757679180887372, + "grad_norm": 3.8158676624298096, + "learning_rate": 0.0009080773606370875, + "loss": 6.6379, + "step": 808 + }, + { + "epoch": 0.27610921501706487, + "grad_norm": 3.9384515285491943, + "learning_rate": 0.0009079635949943117, + "loss": 6.6877, + "step": 809 + }, + { + "epoch": 0.2764505119453925, + "grad_norm": 4.289785861968994, + "learning_rate": 0.0009078498293515359, + "loss": 6.4728, + "step": 810 + }, + { + "epoch": 0.2767918088737201, + "grad_norm": 3.6413400173187256, + "learning_rate": 0.00090773606370876, + "loss": 6.9374, + "step": 811 + }, + { + "epoch": 0.2771331058020478, + "grad_norm": 3.771023988723755, + "learning_rate": 0.0009076222980659841, + "loss": 7.0035, + "step": 812 + }, + { + "epoch": 0.27747440273037544, + "grad_norm": 3.7076187133789062, + "learning_rate": 0.0009075085324232082, + "loss": 6.5376, + "step": 813 + }, + { + "epoch": 0.27781569965870306, + "grad_norm": 4.858947277069092, + "learning_rate": 0.0009073947667804323, + "loss": 6.2922, + "step": 814 + }, + { + "epoch": 0.2781569965870307, + "grad_norm": 3.7647488117218018, + "learning_rate": 0.0009072810011376564, + "loss": 7.0143, + "step": 815 + }, + { + "epoch": 0.2784982935153584, + "grad_norm": 4.246391773223877, + "learning_rate": 0.0009071672354948806, + "loss": 6.8741, + "step": 816 + }, + { + "epoch": 0.278839590443686, + "grad_norm": 3.448207139968872, + "learning_rate": 0.0009070534698521047, + "loss": 7.1626, + "step": 817 + }, + { + "epoch": 0.27918088737201363, + "grad_norm": 3.4277663230895996, + "learning_rate": 0.0009069397042093288, + "loss": 7.1784, + "step": 818 + }, + { + "epoch": 0.2795221843003413, + "grad_norm": 3.3655548095703125, + "learning_rate": 0.0009068259385665529, + "loss": 6.8989, + "step": 819 + }, + { + "epoch": 0.27986348122866894, + "grad_norm": 15.125733375549316, + "learning_rate": 0.000906712172923777, + "loss": 6.2618, + "step": 820 + }, + { + "epoch": 0.28020477815699657, + "grad_norm": 3.803508996963501, + "learning_rate": 0.0009065984072810011, + "loss": 7.2742, + "step": 821 + }, + { + "epoch": 0.28054607508532425, + "grad_norm": 4.941516399383545, + "learning_rate": 0.0009064846416382253, + "loss": 6.3995, + "step": 822 + }, + { + "epoch": 0.2808873720136519, + "grad_norm": 4.072309494018555, + "learning_rate": 0.0009063708759954494, + "loss": 6.865, + "step": 823 + }, + { + "epoch": 0.2812286689419795, + "grad_norm": 3.5273547172546387, + "learning_rate": 0.0009062571103526736, + "loss": 6.8719, + "step": 824 + }, + { + "epoch": 0.2815699658703072, + "grad_norm": 3.4187939167022705, + "learning_rate": 0.0009061433447098977, + "loss": 6.6182, + "step": 825 + }, + { + "epoch": 0.2819112627986348, + "grad_norm": 5.67510461807251, + "learning_rate": 0.0009060295790671218, + "loss": 5.9724, + "step": 826 + }, + { + "epoch": 0.28225255972696245, + "grad_norm": 3.5598597526550293, + "learning_rate": 0.0009059158134243459, + "loss": 7.0266, + "step": 827 + }, + { + "epoch": 0.2825938566552901, + "grad_norm": 4.407433986663818, + "learning_rate": 0.00090580204778157, + "loss": 6.3405, + "step": 828 + }, + { + "epoch": 0.28293515358361776, + "grad_norm": 3.8481311798095703, + "learning_rate": 0.0009056882821387941, + "loss": 6.6051, + "step": 829 + }, + { + "epoch": 0.2832764505119454, + "grad_norm": 3.4790523052215576, + "learning_rate": 0.0009055745164960182, + "loss": 6.5085, + "step": 830 + }, + { + "epoch": 0.283617747440273, + "grad_norm": 3.397590398788452, + "learning_rate": 0.0009054607508532423, + "loss": 7.1648, + "step": 831 + }, + { + "epoch": 0.2839590443686007, + "grad_norm": 3.374161958694458, + "learning_rate": 0.0009053469852104664, + "loss": 6.8825, + "step": 832 + }, + { + "epoch": 0.2843003412969283, + "grad_norm": 3.5871407985687256, + "learning_rate": 0.0009052332195676906, + "loss": 6.7742, + "step": 833 + }, + { + "epoch": 0.28464163822525596, + "grad_norm": 5.533285140991211, + "learning_rate": 0.0009051194539249147, + "loss": 6.1539, + "step": 834 + }, + { + "epoch": 0.28498293515358364, + "grad_norm": 4.258869647979736, + "learning_rate": 0.0009050056882821388, + "loss": 6.2765, + "step": 835 + }, + { + "epoch": 0.28532423208191127, + "grad_norm": 3.785416603088379, + "learning_rate": 0.0009048919226393629, + "loss": 7.2967, + "step": 836 + }, + { + "epoch": 0.2856655290102389, + "grad_norm": 3.6271724700927734, + "learning_rate": 0.000904778156996587, + "loss": 7.1172, + "step": 837 + }, + { + "epoch": 0.2860068259385666, + "grad_norm": 3.934699296951294, + "learning_rate": 0.0009046643913538111, + "loss": 6.4742, + "step": 838 + }, + { + "epoch": 0.2863481228668942, + "grad_norm": 3.78275990486145, + "learning_rate": 0.0009045506257110353, + "loss": 6.3398, + "step": 839 + }, + { + "epoch": 0.28668941979522183, + "grad_norm": 4.096293926239014, + "learning_rate": 0.0009044368600682594, + "loss": 6.7516, + "step": 840 + }, + { + "epoch": 0.28703071672354946, + "grad_norm": 9.30138111114502, + "learning_rate": 0.0009043230944254836, + "loss": 7.8543, + "step": 841 + }, + { + "epoch": 0.28737201365187715, + "grad_norm": 3.7339835166931152, + "learning_rate": 0.0009042093287827077, + "loss": 7.1888, + "step": 842 + }, + { + "epoch": 0.2877133105802048, + "grad_norm": 3.5420281887054443, + "learning_rate": 0.0009040955631399318, + "loss": 6.9893, + "step": 843 + }, + { + "epoch": 0.2880546075085324, + "grad_norm": 3.5827724933624268, + "learning_rate": 0.0009039817974971559, + "loss": 6.8226, + "step": 844 + }, + { + "epoch": 0.2883959044368601, + "grad_norm": 22.785751342773438, + "learning_rate": 0.0009038680318543801, + "loss": 6.6574, + "step": 845 + }, + { + "epoch": 0.2887372013651877, + "grad_norm": 3.686525583267212, + "learning_rate": 0.0009037542662116041, + "loss": 6.6054, + "step": 846 + }, + { + "epoch": 0.28907849829351534, + "grad_norm": 3.8692002296447754, + "learning_rate": 0.0009036405005688282, + "loss": 6.785, + "step": 847 + }, + { + "epoch": 0.289419795221843, + "grad_norm": 5.953486919403076, + "learning_rate": 0.0009035267349260523, + "loss": 4.0383, + "step": 848 + }, + { + "epoch": 0.28976109215017065, + "grad_norm": 3.760619878768921, + "learning_rate": 0.0009034129692832764, + "loss": 6.9499, + "step": 849 + }, + { + "epoch": 0.2901023890784983, + "grad_norm": 4.419182300567627, + "learning_rate": 0.0009032992036405006, + "loss": 6.7118, + "step": 850 + }, + { + "epoch": 0.29044368600682596, + "grad_norm": 3.5752365589141846, + "learning_rate": 0.0009031854379977247, + "loss": 6.9143, + "step": 851 + }, + { + "epoch": 0.2907849829351536, + "grad_norm": 3.5920283794403076, + "learning_rate": 0.0009030716723549488, + "loss": 7.0298, + "step": 852 + }, + { + "epoch": 0.2911262798634812, + "grad_norm": 4.226795196533203, + "learning_rate": 0.0009029579067121729, + "loss": 5.9417, + "step": 853 + }, + { + "epoch": 0.29146757679180885, + "grad_norm": 3.6170947551727295, + "learning_rate": 0.000902844141069397, + "loss": 6.926, + "step": 854 + }, + { + "epoch": 0.29180887372013653, + "grad_norm": 3.58585786819458, + "learning_rate": 0.0009027303754266211, + "loss": 6.7011, + "step": 855 + }, + { + "epoch": 0.29215017064846416, + "grad_norm": 3.7158362865448, + "learning_rate": 0.0009026166097838453, + "loss": 6.3346, + "step": 856 + }, + { + "epoch": 0.2924914675767918, + "grad_norm": 3.4805827140808105, + "learning_rate": 0.0009025028441410694, + "loss": 7.1524, + "step": 857 + }, + { + "epoch": 0.29283276450511947, + "grad_norm": 4.309206485748291, + "learning_rate": 0.0009023890784982936, + "loss": 6.79, + "step": 858 + }, + { + "epoch": 0.2931740614334471, + "grad_norm": 4.295877456665039, + "learning_rate": 0.0009022753128555177, + "loss": 6.5865, + "step": 859 + }, + { + "epoch": 0.2935153583617747, + "grad_norm": 3.572010040283203, + "learning_rate": 0.0009021615472127418, + "loss": 6.9353, + "step": 860 + }, + { + "epoch": 0.2938566552901024, + "grad_norm": 3.4424901008605957, + "learning_rate": 0.0009020477815699659, + "loss": 6.7194, + "step": 861 + }, + { + "epoch": 0.29419795221843004, + "grad_norm": 3.855348825454712, + "learning_rate": 0.0009019340159271901, + "loss": 6.9222, + "step": 862 + }, + { + "epoch": 0.29453924914675766, + "grad_norm": 6.261025428771973, + "learning_rate": 0.0009018202502844142, + "loss": 5.8834, + "step": 863 + }, + { + "epoch": 0.29488054607508535, + "grad_norm": 4.185757160186768, + "learning_rate": 0.0009017064846416383, + "loss": 6.8247, + "step": 864 + }, + { + "epoch": 0.295221843003413, + "grad_norm": 3.7310314178466797, + "learning_rate": 0.0009015927189988624, + "loss": 7.1868, + "step": 865 + }, + { + "epoch": 0.2955631399317406, + "grad_norm": 3.530855655670166, + "learning_rate": 0.0009014789533560864, + "loss": 7.2622, + "step": 866 + }, + { + "epoch": 0.29590443686006823, + "grad_norm": 7.029987335205078, + "learning_rate": 0.0009013651877133105, + "loss": 6.3624, + "step": 867 + }, + { + "epoch": 0.2962457337883959, + "grad_norm": 3.599027633666992, + "learning_rate": 0.0009012514220705347, + "loss": 7.0364, + "step": 868 + }, + { + "epoch": 0.29658703071672354, + "grad_norm": 3.781937599182129, + "learning_rate": 0.0009011376564277588, + "loss": 6.8376, + "step": 869 + }, + { + "epoch": 0.29692832764505117, + "grad_norm": 3.625401258468628, + "learning_rate": 0.0009010238907849829, + "loss": 7.1427, + "step": 870 + }, + { + "epoch": 0.29726962457337885, + "grad_norm": 3.468726396560669, + "learning_rate": 0.000900910125142207, + "loss": 7.1632, + "step": 871 + }, + { + "epoch": 0.2976109215017065, + "grad_norm": 5.193005084991455, + "learning_rate": 0.0009007963594994311, + "loss": 6.2791, + "step": 872 + }, + { + "epoch": 0.2979522184300341, + "grad_norm": 3.5303850173950195, + "learning_rate": 0.0009006825938566553, + "loss": 6.9128, + "step": 873 + }, + { + "epoch": 0.2982935153583618, + "grad_norm": 3.5984513759613037, + "learning_rate": 0.0009005688282138795, + "loss": 6.8501, + "step": 874 + }, + { + "epoch": 0.2986348122866894, + "grad_norm": 13.128771781921387, + "learning_rate": 0.0009004550625711036, + "loss": 4.9908, + "step": 875 + }, + { + "epoch": 0.29897610921501705, + "grad_norm": 4.513108730316162, + "learning_rate": 0.0009003412969283277, + "loss": 6.2214, + "step": 876 + }, + { + "epoch": 0.29931740614334473, + "grad_norm": 3.9287192821502686, + "learning_rate": 0.0009002275312855518, + "loss": 6.5736, + "step": 877 + }, + { + "epoch": 0.29965870307167236, + "grad_norm": 3.6785993576049805, + "learning_rate": 0.0009001137656427759, + "loss": 6.5955, + "step": 878 + }, + { + "epoch": 0.3, + "grad_norm": 3.5705087184906006, + "learning_rate": 0.0009000000000000001, + "loss": 6.6474, + "step": 879 + }, + { + "epoch": 0.3003412969283277, + "grad_norm": 3.4256176948547363, + "learning_rate": 0.0008998862343572242, + "loss": 7.1297, + "step": 880 + }, + { + "epoch": 0.3006825938566553, + "grad_norm": 3.45796275138855, + "learning_rate": 0.0008997724687144483, + "loss": 7.0026, + "step": 881 + }, + { + "epoch": 0.30102389078498293, + "grad_norm": 3.569291353225708, + "learning_rate": 0.0008996587030716724, + "loss": 6.8185, + "step": 882 + }, + { + "epoch": 0.30136518771331056, + "grad_norm": 4.46466588973999, + "learning_rate": 0.0008995449374288965, + "loss": 6.4208, + "step": 883 + }, + { + "epoch": 0.30170648464163824, + "grad_norm": 3.295574903488159, + "learning_rate": 0.0008994311717861206, + "loss": 6.6947, + "step": 884 + }, + { + "epoch": 0.30204778156996587, + "grad_norm": 3.447225570678711, + "learning_rate": 0.0008993174061433447, + "loss": 7.3734, + "step": 885 + }, + { + "epoch": 0.3023890784982935, + "grad_norm": 3.348236322402954, + "learning_rate": 0.0008992036405005688, + "loss": 6.6871, + "step": 886 + }, + { + "epoch": 0.3027303754266212, + "grad_norm": 3.5283188819885254, + "learning_rate": 0.0008990898748577929, + "loss": 6.9201, + "step": 887 + }, + { + "epoch": 0.3030716723549488, + "grad_norm": 3.4299886226654053, + "learning_rate": 0.000898976109215017, + "loss": 6.8192, + "step": 888 + }, + { + "epoch": 0.30341296928327643, + "grad_norm": 3.3818650245666504, + "learning_rate": 0.0008988623435722411, + "loss": 6.4265, + "step": 889 + }, + { + "epoch": 0.3037542662116041, + "grad_norm": 3.4895620346069336, + "learning_rate": 0.0008987485779294653, + "loss": 6.9777, + "step": 890 + }, + { + "epoch": 0.30409556313993175, + "grad_norm": 3.668978691101074, + "learning_rate": 0.0008986348122866895, + "loss": 6.9788, + "step": 891 + }, + { + "epoch": 0.3044368600682594, + "grad_norm": 3.538581132888794, + "learning_rate": 0.0008985210466439136, + "loss": 6.7965, + "step": 892 + }, + { + "epoch": 0.30477815699658706, + "grad_norm": 3.531942844390869, + "learning_rate": 0.0008984072810011377, + "loss": 7.2188, + "step": 893 + }, + { + "epoch": 0.3051194539249147, + "grad_norm": 3.566673755645752, + "learning_rate": 0.0008982935153583618, + "loss": 6.6428, + "step": 894 + }, + { + "epoch": 0.3054607508532423, + "grad_norm": 3.5514180660247803, + "learning_rate": 0.0008981797497155859, + "loss": 7.0462, + "step": 895 + }, + { + "epoch": 0.30580204778156994, + "grad_norm": 9.032792091369629, + "learning_rate": 0.0008980659840728101, + "loss": 8.609, + "step": 896 + }, + { + "epoch": 0.3061433447098976, + "grad_norm": 3.692873001098633, + "learning_rate": 0.0008979522184300342, + "loss": 6.7491, + "step": 897 + }, + { + "epoch": 0.30648464163822525, + "grad_norm": 3.670551061630249, + "learning_rate": 0.0008978384527872583, + "loss": 6.3838, + "step": 898 + }, + { + "epoch": 0.3068259385665529, + "grad_norm": 5.768166542053223, + "learning_rate": 0.0008977246871444824, + "loss": 6.4646, + "step": 899 + }, + { + "epoch": 0.30716723549488056, + "grad_norm": 3.830822706222534, + "learning_rate": 0.0008976109215017065, + "loss": 6.4905, + "step": 900 + }, + { + "epoch": 0.3075085324232082, + "grad_norm": 3.606807231903076, + "learning_rate": 0.0008974971558589306, + "loss": 6.8074, + "step": 901 + }, + { + "epoch": 0.3078498293515358, + "grad_norm": 3.6105079650878906, + "learning_rate": 0.0008973833902161548, + "loss": 6.8121, + "step": 902 + }, + { + "epoch": 0.3081911262798635, + "grad_norm": 3.3105735778808594, + "learning_rate": 0.0008972696245733789, + "loss": 6.5316, + "step": 903 + }, + { + "epoch": 0.30853242320819113, + "grad_norm": 4.859966278076172, + "learning_rate": 0.000897155858930603, + "loss": 6.0759, + "step": 904 + }, + { + "epoch": 0.30887372013651876, + "grad_norm": 3.6393473148345947, + "learning_rate": 0.000897042093287827, + "loss": 6.8395, + "step": 905 + }, + { + "epoch": 0.30921501706484644, + "grad_norm": 3.572521686553955, + "learning_rate": 0.0008969283276450511, + "loss": 6.8854, + "step": 906 + }, + { + "epoch": 0.30955631399317407, + "grad_norm": 3.761270046234131, + "learning_rate": 0.0008968145620022752, + "loss": 6.2211, + "step": 907 + }, + { + "epoch": 0.3098976109215017, + "grad_norm": 4.0054192543029785, + "learning_rate": 0.0008967007963594995, + "loss": 6.4794, + "step": 908 + }, + { + "epoch": 0.3102389078498293, + "grad_norm": 3.401998519897461, + "learning_rate": 0.0008965870307167236, + "loss": 7.0889, + "step": 909 + }, + { + "epoch": 0.310580204778157, + "grad_norm": 7.148408889770508, + "learning_rate": 0.0008964732650739477, + "loss": 6.3543, + "step": 910 + }, + { + "epoch": 0.31092150170648464, + "grad_norm": 3.945793628692627, + "learning_rate": 0.0008963594994311718, + "loss": 7.0554, + "step": 911 + }, + { + "epoch": 0.31126279863481227, + "grad_norm": 4.088592052459717, + "learning_rate": 0.0008962457337883959, + "loss": 6.5995, + "step": 912 + }, + { + "epoch": 0.31160409556313995, + "grad_norm": 4.394853591918945, + "learning_rate": 0.0008961319681456201, + "loss": 7.0796, + "step": 913 + }, + { + "epoch": 0.3119453924914676, + "grad_norm": 3.678943395614624, + "learning_rate": 0.0008960182025028442, + "loss": 7.3075, + "step": 914 + }, + { + "epoch": 0.3122866894197952, + "grad_norm": 3.424969434738159, + "learning_rate": 0.0008959044368600683, + "loss": 6.9823, + "step": 915 + }, + { + "epoch": 0.3126279863481229, + "grad_norm": 3.4337804317474365, + "learning_rate": 0.0008957906712172924, + "loss": 6.8313, + "step": 916 + }, + { + "epoch": 0.3129692832764505, + "grad_norm": 3.492877960205078, + "learning_rate": 0.0008956769055745165, + "loss": 6.7776, + "step": 917 + }, + { + "epoch": 0.31331058020477814, + "grad_norm": 3.4652328491210938, + "learning_rate": 0.0008955631399317406, + "loss": 6.5177, + "step": 918 + }, + { + "epoch": 0.3136518771331058, + "grad_norm": 3.4624969959259033, + "learning_rate": 0.0008954493742889648, + "loss": 6.8393, + "step": 919 + }, + { + "epoch": 0.31399317406143346, + "grad_norm": 3.3327879905700684, + "learning_rate": 0.0008953356086461889, + "loss": 6.9991, + "step": 920 + }, + { + "epoch": 0.3143344709897611, + "grad_norm": 3.756620168685913, + "learning_rate": 0.000895221843003413, + "loss": 6.8865, + "step": 921 + }, + { + "epoch": 0.3146757679180887, + "grad_norm": 8.067540168762207, + "learning_rate": 0.0008951080773606371, + "loss": 6.0514, + "step": 922 + }, + { + "epoch": 0.3150170648464164, + "grad_norm": 3.782060384750366, + "learning_rate": 0.0008949943117178612, + "loss": 7.1974, + "step": 923 + }, + { + "epoch": 0.315358361774744, + "grad_norm": 3.9047977924346924, + "learning_rate": 0.0008948805460750852, + "loss": 6.9444, + "step": 924 + }, + { + "epoch": 0.31569965870307165, + "grad_norm": 3.4596681594848633, + "learning_rate": 0.0008947667804323095, + "loss": 6.9334, + "step": 925 + }, + { + "epoch": 0.31604095563139933, + "grad_norm": 3.472452163696289, + "learning_rate": 0.0008946530147895336, + "loss": 6.5, + "step": 926 + }, + { + "epoch": 0.31638225255972696, + "grad_norm": 3.3752787113189697, + "learning_rate": 0.0008945392491467577, + "loss": 6.3694, + "step": 927 + }, + { + "epoch": 0.3167235494880546, + "grad_norm": 15.716917037963867, + "learning_rate": 0.0008944254835039818, + "loss": 9.5123, + "step": 928 + }, + { + "epoch": 0.3170648464163823, + "grad_norm": 4.5218353271484375, + "learning_rate": 0.0008943117178612059, + "loss": 6.7683, + "step": 929 + }, + { + "epoch": 0.3174061433447099, + "grad_norm": 3.797961711883545, + "learning_rate": 0.0008941979522184301, + "loss": 6.4766, + "step": 930 + }, + { + "epoch": 0.31774744027303753, + "grad_norm": 5.070075035095215, + "learning_rate": 0.0008940841865756542, + "loss": 6.3685, + "step": 931 + }, + { + "epoch": 0.3180887372013652, + "grad_norm": 3.9309263229370117, + "learning_rate": 0.0008939704209328783, + "loss": 5.8371, + "step": 932 + }, + { + "epoch": 0.31843003412969284, + "grad_norm": 4.169372081756592, + "learning_rate": 0.0008938566552901024, + "loss": 7.4385, + "step": 933 + }, + { + "epoch": 0.31877133105802047, + "grad_norm": 3.563783884048462, + "learning_rate": 0.0008937428896473265, + "loss": 6.448, + "step": 934 + }, + { + "epoch": 0.3191126279863481, + "grad_norm": 3.728722333908081, + "learning_rate": 0.0008936291240045506, + "loss": 6.3533, + "step": 935 + }, + { + "epoch": 0.3194539249146758, + "grad_norm": 3.3905997276306152, + "learning_rate": 0.0008935153583617748, + "loss": 6.6507, + "step": 936 + }, + { + "epoch": 0.3197952218430034, + "grad_norm": 3.2747488021850586, + "learning_rate": 0.0008934015927189989, + "loss": 7.0518, + "step": 937 + }, + { + "epoch": 0.32013651877133104, + "grad_norm": 3.3199024200439453, + "learning_rate": 0.000893287827076223, + "loss": 6.4047, + "step": 938 + }, + { + "epoch": 0.3204778156996587, + "grad_norm": 6.756965637207031, + "learning_rate": 0.0008931740614334471, + "loss": 6.8004, + "step": 939 + }, + { + "epoch": 0.32081911262798635, + "grad_norm": 3.5635828971862793, + "learning_rate": 0.0008930602957906712, + "loss": 6.7656, + "step": 940 + }, + { + "epoch": 0.321160409556314, + "grad_norm": 3.9271492958068848, + "learning_rate": 0.0008929465301478953, + "loss": 6.823, + "step": 941 + }, + { + "epoch": 0.32150170648464166, + "grad_norm": 3.3149237632751465, + "learning_rate": 0.0008928327645051196, + "loss": 4.3894, + "step": 942 + }, + { + "epoch": 0.3218430034129693, + "grad_norm": 4.048464298248291, + "learning_rate": 0.0008927189988623437, + "loss": 7.0178, + "step": 943 + }, + { + "epoch": 0.3221843003412969, + "grad_norm": 4.173979759216309, + "learning_rate": 0.0008926052332195677, + "loss": 6.5905, + "step": 944 + }, + { + "epoch": 0.3225255972696246, + "grad_norm": 3.4944701194763184, + "learning_rate": 0.0008924914675767918, + "loss": 6.7574, + "step": 945 + }, + { + "epoch": 0.3228668941979522, + "grad_norm": 3.6318819522857666, + "learning_rate": 0.0008923777019340159, + "loss": 6.7549, + "step": 946 + }, + { + "epoch": 0.32320819112627985, + "grad_norm": 3.452150821685791, + "learning_rate": 0.00089226393629124, + "loss": 6.6196, + "step": 947 + }, + { + "epoch": 0.3235494880546075, + "grad_norm": 5.811069488525391, + "learning_rate": 0.0008921501706484642, + "loss": 5.2026, + "step": 948 + }, + { + "epoch": 0.32389078498293516, + "grad_norm": 3.539780378341675, + "learning_rate": 0.0008920364050056883, + "loss": 6.6506, + "step": 949 + }, + { + "epoch": 0.3242320819112628, + "grad_norm": 4.116481304168701, + "learning_rate": 0.0008919226393629124, + "loss": 6.993, + "step": 950 + }, + { + "epoch": 0.3245733788395904, + "grad_norm": 3.6477298736572266, + "learning_rate": 0.0008918088737201365, + "loss": 6.3687, + "step": 951 + }, + { + "epoch": 0.3249146757679181, + "grad_norm": 3.5045762062072754, + "learning_rate": 0.0008916951080773606, + "loss": 6.5074, + "step": 952 + }, + { + "epoch": 0.32525597269624573, + "grad_norm": 4.331373691558838, + "learning_rate": 0.0008915813424345848, + "loss": 6.8514, + "step": 953 + }, + { + "epoch": 0.32559726962457336, + "grad_norm": 3.383375644683838, + "learning_rate": 0.0008914675767918089, + "loss": 6.7091, + "step": 954 + }, + { + "epoch": 0.32593856655290104, + "grad_norm": 3.8869664669036865, + "learning_rate": 0.000891353811149033, + "loss": 6.7753, + "step": 955 + }, + { + "epoch": 0.32627986348122867, + "grad_norm": 5.349417209625244, + "learning_rate": 0.0008912400455062571, + "loss": 6.3789, + "step": 956 + }, + { + "epoch": 0.3266211604095563, + "grad_norm": 3.5549848079681396, + "learning_rate": 0.0008911262798634812, + "loss": 6.8974, + "step": 957 + }, + { + "epoch": 0.326962457337884, + "grad_norm": 3.521125555038452, + "learning_rate": 0.0008910125142207054, + "loss": 6.947, + "step": 958 + }, + { + "epoch": 0.3273037542662116, + "grad_norm": 5.296824932098389, + "learning_rate": 0.0008908987485779296, + "loss": 5.9691, + "step": 959 + }, + { + "epoch": 0.32764505119453924, + "grad_norm": 3.733386516571045, + "learning_rate": 0.0008907849829351537, + "loss": 6.672, + "step": 960 + }, + { + "epoch": 0.32798634812286687, + "grad_norm": 4.317715644836426, + "learning_rate": 0.0008906712172923778, + "loss": 6.1089, + "step": 961 + }, + { + "epoch": 0.32832764505119455, + "grad_norm": 3.5799124240875244, + "learning_rate": 0.0008905574516496019, + "loss": 6.7116, + "step": 962 + }, + { + "epoch": 0.3286689419795222, + "grad_norm": 3.54365873336792, + "learning_rate": 0.0008904436860068259, + "loss": 6.6934, + "step": 963 + }, + { + "epoch": 0.3290102389078498, + "grad_norm": 3.48354172706604, + "learning_rate": 0.00089032992036405, + "loss": 6.7771, + "step": 964 + }, + { + "epoch": 0.3293515358361775, + "grad_norm": 3.8075222969055176, + "learning_rate": 0.0008902161547212742, + "loss": 6.726, + "step": 965 + }, + { + "epoch": 0.3296928327645051, + "grad_norm": 3.549077033996582, + "learning_rate": 0.0008901023890784983, + "loss": 6.8491, + "step": 966 + }, + { + "epoch": 0.33003412969283275, + "grad_norm": 3.487042188644409, + "learning_rate": 0.0008899886234357224, + "loss": 6.9951, + "step": 967 + }, + { + "epoch": 0.33037542662116043, + "grad_norm": 3.4379420280456543, + "learning_rate": 0.0008898748577929465, + "loss": 6.7213, + "step": 968 + }, + { + "epoch": 0.33071672354948806, + "grad_norm": 3.3688578605651855, + "learning_rate": 0.0008897610921501706, + "loss": 6.7468, + "step": 969 + }, + { + "epoch": 0.3310580204778157, + "grad_norm": 4.88122034072876, + "learning_rate": 0.0008896473265073947, + "loss": 6.8637, + "step": 970 + }, + { + "epoch": 0.33139931740614337, + "grad_norm": 3.590115547180176, + "learning_rate": 0.0008895335608646189, + "loss": 6.4559, + "step": 971 + }, + { + "epoch": 0.331740614334471, + "grad_norm": 3.8333377838134766, + "learning_rate": 0.000889419795221843, + "loss": 6.0111, + "step": 972 + }, + { + "epoch": 0.3320819112627986, + "grad_norm": 3.6459643840789795, + "learning_rate": 0.0008893060295790671, + "loss": 5.3953, + "step": 973 + }, + { + "epoch": 0.3324232081911263, + "grad_norm": 3.799060583114624, + "learning_rate": 0.0008891922639362912, + "loss": 6.8401, + "step": 974 + }, + { + "epoch": 0.33276450511945393, + "grad_norm": 3.553802728652954, + "learning_rate": 0.0008890784982935154, + "loss": 6.763, + "step": 975 + }, + { + "epoch": 0.33310580204778156, + "grad_norm": 5.171194553375244, + "learning_rate": 0.0008889647326507396, + "loss": 6.4421, + "step": 976 + }, + { + "epoch": 0.3334470989761092, + "grad_norm": 4.729731559753418, + "learning_rate": 0.0008888509670079637, + "loss": 4.2545, + "step": 977 + }, + { + "epoch": 0.3337883959044369, + "grad_norm": 4.018588066101074, + "learning_rate": 0.0008887372013651878, + "loss": 6.8516, + "step": 978 + }, + { + "epoch": 0.3341296928327645, + "grad_norm": 3.8841042518615723, + "learning_rate": 0.0008886234357224119, + "loss": 7.2299, + "step": 979 + }, + { + "epoch": 0.33447098976109213, + "grad_norm": 3.4637234210968018, + "learning_rate": 0.000888509670079636, + "loss": 6.5481, + "step": 980 + }, + { + "epoch": 0.3348122866894198, + "grad_norm": 3.2626147270202637, + "learning_rate": 0.0008883959044368601, + "loss": 7.2605, + "step": 981 + }, + { + "epoch": 0.33515358361774744, + "grad_norm": 3.390256643295288, + "learning_rate": 0.0008882821387940843, + "loss": 6.8236, + "step": 982 + }, + { + "epoch": 0.33549488054607507, + "grad_norm": 3.666612386703491, + "learning_rate": 0.0008881683731513083, + "loss": 6.838, + "step": 983 + }, + { + "epoch": 0.33583617747440275, + "grad_norm": 3.5521507263183594, + "learning_rate": 0.0008880546075085324, + "loss": 6.6424, + "step": 984 + }, + { + "epoch": 0.3361774744027304, + "grad_norm": 4.2973551750183105, + "learning_rate": 0.0008879408418657565, + "loss": 6.7528, + "step": 985 + }, + { + "epoch": 0.336518771331058, + "grad_norm": 3.663874864578247, + "learning_rate": 0.0008878270762229806, + "loss": 6.7709, + "step": 986 + }, + { + "epoch": 0.3368600682593857, + "grad_norm": 3.8758318424224854, + "learning_rate": 0.0008877133105802047, + "loss": 6.5531, + "step": 987 + }, + { + "epoch": 0.3372013651877133, + "grad_norm": 3.551164388656616, + "learning_rate": 0.0008875995449374289, + "loss": 6.9101, + "step": 988 + }, + { + "epoch": 0.33754266211604095, + "grad_norm": 3.607560873031616, + "learning_rate": 0.000887485779294653, + "loss": 6.5752, + "step": 989 + }, + { + "epoch": 0.3378839590443686, + "grad_norm": 3.6682469844818115, + "learning_rate": 0.0008873720136518771, + "loss": 6.3309, + "step": 990 + }, + { + "epoch": 0.33822525597269626, + "grad_norm": 3.57940411567688, + "learning_rate": 0.0008872582480091012, + "loss": 7.2817, + "step": 991 + }, + { + "epoch": 0.3385665529010239, + "grad_norm": 3.5571844577789307, + "learning_rate": 0.0008871444823663254, + "loss": 6.6641, + "step": 992 + }, + { + "epoch": 0.3389078498293515, + "grad_norm": 3.4492499828338623, + "learning_rate": 0.0008870307167235496, + "loss": 6.9242, + "step": 993 + }, + { + "epoch": 0.3392491467576792, + "grad_norm": 4.031134128570557, + "learning_rate": 0.0008869169510807737, + "loss": 6.8263, + "step": 994 + }, + { + "epoch": 0.3395904436860068, + "grad_norm": 3.3851895332336426, + "learning_rate": 0.0008868031854379978, + "loss": 6.847, + "step": 995 + }, + { + "epoch": 0.33993174061433445, + "grad_norm": 4.721614837646484, + "learning_rate": 0.0008866894197952219, + "loss": 6.7878, + "step": 996 + }, + { + "epoch": 0.34027303754266214, + "grad_norm": 3.3920321464538574, + "learning_rate": 0.000886575654152446, + "loss": 6.6165, + "step": 997 + }, + { + "epoch": 0.34061433447098977, + "grad_norm": 3.5880556106567383, + "learning_rate": 0.0008864618885096701, + "loss": 6.3648, + "step": 998 + }, + { + "epoch": 0.3409556313993174, + "grad_norm": 3.7084357738494873, + "learning_rate": 0.0008863481228668943, + "loss": 6.5777, + "step": 999 + }, + { + "epoch": 0.3412969283276451, + "grad_norm": 3.5012943744659424, + "learning_rate": 0.0008862343572241184, + "loss": 6.7863, + "step": 1000 + }, + { + "epoch": 0.3416382252559727, + "grad_norm": 3.5657830238342285, + "learning_rate": 0.0008861205915813425, + "loss": 7.3395, + "step": 1001 + }, + { + "epoch": 0.34197952218430033, + "grad_norm": 4.714262008666992, + "learning_rate": 0.0008860068259385665, + "loss": 6.4638, + "step": 1002 + }, + { + "epoch": 0.34232081911262796, + "grad_norm": 3.793743848800659, + "learning_rate": 0.0008858930602957906, + "loss": 6.6217, + "step": 1003 + }, + { + "epoch": 0.34266211604095564, + "grad_norm": 6.239543914794922, + "learning_rate": 0.0008857792946530147, + "loss": 6.3399, + "step": 1004 + }, + { + "epoch": 0.3430034129692833, + "grad_norm": 3.838210344314575, + "learning_rate": 0.0008856655290102389, + "loss": 7.1438, + "step": 1005 + }, + { + "epoch": 0.3433447098976109, + "grad_norm": 4.131673812866211, + "learning_rate": 0.000885551763367463, + "loss": 6.3229, + "step": 1006 + }, + { + "epoch": 0.3436860068259386, + "grad_norm": 3.4897634983062744, + "learning_rate": 0.0008854379977246871, + "loss": 7.3418, + "step": 1007 + }, + { + "epoch": 0.3440273037542662, + "grad_norm": 3.3156917095184326, + "learning_rate": 0.0008853242320819112, + "loss": 6.6131, + "step": 1008 + }, + { + "epoch": 0.34436860068259384, + "grad_norm": 5.101655006408691, + "learning_rate": 0.0008852104664391354, + "loss": 6.4129, + "step": 1009 + }, + { + "epoch": 0.3447098976109215, + "grad_norm": 3.4084413051605225, + "learning_rate": 0.0008850967007963595, + "loss": 6.8648, + "step": 1010 + }, + { + "epoch": 0.34505119453924915, + "grad_norm": 3.558993101119995, + "learning_rate": 0.0008849829351535837, + "loss": 6.5622, + "step": 1011 + }, + { + "epoch": 0.3453924914675768, + "grad_norm": 3.5753562450408936, + "learning_rate": 0.0008848691695108078, + "loss": 6.7141, + "step": 1012 + }, + { + "epoch": 0.34573378839590446, + "grad_norm": 3.5802392959594727, + "learning_rate": 0.0008847554038680319, + "loss": 6.9107, + "step": 1013 + }, + { + "epoch": 0.3460750853242321, + "grad_norm": 3.4724674224853516, + "learning_rate": 0.000884641638225256, + "loss": 6.6967, + "step": 1014 + }, + { + "epoch": 0.3464163822525597, + "grad_norm": 3.293440103530884, + "learning_rate": 0.0008845278725824801, + "loss": 6.4138, + "step": 1015 + }, + { + "epoch": 0.34675767918088735, + "grad_norm": 3.4762954711914062, + "learning_rate": 0.0008844141069397043, + "loss": 6.8232, + "step": 1016 + }, + { + "epoch": 0.34709897610921503, + "grad_norm": 3.6370632648468018, + "learning_rate": 0.0008843003412969284, + "loss": 6.9722, + "step": 1017 + }, + { + "epoch": 0.34744027303754266, + "grad_norm": 3.511749029159546, + "learning_rate": 0.0008841865756541525, + "loss": 6.8501, + "step": 1018 + }, + { + "epoch": 0.3477815699658703, + "grad_norm": 5.163912296295166, + "learning_rate": 0.0008840728100113766, + "loss": 6.1093, + "step": 1019 + }, + { + "epoch": 0.34812286689419797, + "grad_norm": 8.192817687988281, + "learning_rate": 0.0008839590443686007, + "loss": 6.5018, + "step": 1020 + }, + { + "epoch": 0.3484641638225256, + "grad_norm": 3.8490147590637207, + "learning_rate": 0.0008838452787258247, + "loss": 6.8653, + "step": 1021 + }, + { + "epoch": 0.3488054607508532, + "grad_norm": 3.7160000801086426, + "learning_rate": 0.0008837315130830489, + "loss": 6.9512, + "step": 1022 + }, + { + "epoch": 0.3491467576791809, + "grad_norm": 3.8101956844329834, + "learning_rate": 0.000883617747440273, + "loss": 6.2376, + "step": 1023 + }, + { + "epoch": 0.34948805460750854, + "grad_norm": 3.3378794193267822, + "learning_rate": 0.0008835039817974971, + "loss": 6.8794, + "step": 1024 + }, + { + "epoch": 0.34982935153583616, + "grad_norm": 3.5934832096099854, + "learning_rate": 0.0008833902161547212, + "loss": 6.1176, + "step": 1025 + }, + { + "epoch": 0.35017064846416385, + "grad_norm": 3.461996078491211, + "learning_rate": 0.0008832764505119454, + "loss": 6.5096, + "step": 1026 + }, + { + "epoch": 0.3505119453924915, + "grad_norm": 3.272123098373413, + "learning_rate": 0.0008831626848691695, + "loss": 6.7895, + "step": 1027 + }, + { + "epoch": 0.3508532423208191, + "grad_norm": 3.3164021968841553, + "learning_rate": 0.0008830489192263937, + "loss": 6.7558, + "step": 1028 + }, + { + "epoch": 0.35119453924914673, + "grad_norm": 3.2223832607269287, + "learning_rate": 0.0008829351535836178, + "loss": 6.5788, + "step": 1029 + }, + { + "epoch": 0.3515358361774744, + "grad_norm": 5.8226237297058105, + "learning_rate": 0.0008828213879408419, + "loss": 3.4241, + "step": 1030 + }, + { + "epoch": 0.35187713310580204, + "grad_norm": 3.6470441818237305, + "learning_rate": 0.000882707622298066, + "loss": 6.7667, + "step": 1031 + }, + { + "epoch": 0.35221843003412967, + "grad_norm": 3.8023180961608887, + "learning_rate": 0.0008825938566552901, + "loss": 6.8807, + "step": 1032 + }, + { + "epoch": 0.35255972696245735, + "grad_norm": 3.8079946041107178, + "learning_rate": 0.0008824800910125143, + "loss": 6.7188, + "step": 1033 + }, + { + "epoch": 0.352901023890785, + "grad_norm": 3.5138330459594727, + "learning_rate": 0.0008823663253697384, + "loss": 7.1953, + "step": 1034 + }, + { + "epoch": 0.3532423208191126, + "grad_norm": 3.8133950233459473, + "learning_rate": 0.0008822525597269625, + "loss": 7.1188, + "step": 1035 + }, + { + "epoch": 0.3535836177474403, + "grad_norm": 3.640202522277832, + "learning_rate": 0.0008821387940841866, + "loss": 6.739, + "step": 1036 + }, + { + "epoch": 0.3539249146757679, + "grad_norm": 3.518843173980713, + "learning_rate": 0.0008820250284414107, + "loss": 6.8397, + "step": 1037 + }, + { + "epoch": 0.35426621160409555, + "grad_norm": 3.5482702255249023, + "learning_rate": 0.0008819112627986348, + "loss": 6.618, + "step": 1038 + }, + { + "epoch": 0.35460750853242323, + "grad_norm": 3.419344902038574, + "learning_rate": 0.000881797497155859, + "loss": 6.9977, + "step": 1039 + }, + { + "epoch": 0.35494880546075086, + "grad_norm": 3.4673240184783936, + "learning_rate": 0.0008816837315130831, + "loss": 6.4868, + "step": 1040 + }, + { + "epoch": 0.3552901023890785, + "grad_norm": 8.60751724243164, + "learning_rate": 0.0008815699658703071, + "loss": 4.6019, + "step": 1041 + }, + { + "epoch": 0.3556313993174061, + "grad_norm": 4.347542762756348, + "learning_rate": 0.0008814562002275312, + "loss": 6.7114, + "step": 1042 + }, + { + "epoch": 0.3559726962457338, + "grad_norm": 5.797741413116455, + "learning_rate": 0.0008813424345847554, + "loss": 4.9746, + "step": 1043 + }, + { + "epoch": 0.3563139931740614, + "grad_norm": 4.145489692687988, + "learning_rate": 0.0008812286689419795, + "loss": 6.8995, + "step": 1044 + }, + { + "epoch": 0.35665529010238906, + "grad_norm": 4.579039096832275, + "learning_rate": 0.0008811149032992037, + "loss": 7.1214, + "step": 1045 + }, + { + "epoch": 0.35699658703071674, + "grad_norm": 3.6013543605804443, + "learning_rate": 0.0008810011376564278, + "loss": 6.9495, + "step": 1046 + }, + { + "epoch": 0.35733788395904437, + "grad_norm": 3.570963144302368, + "learning_rate": 0.0008808873720136519, + "loss": 7.0384, + "step": 1047 + }, + { + "epoch": 0.357679180887372, + "grad_norm": 3.448312759399414, + "learning_rate": 0.000880773606370876, + "loss": 6.8964, + "step": 1048 + }, + { + "epoch": 0.3580204778156997, + "grad_norm": 3.4679174423217773, + "learning_rate": 0.0008806598407281001, + "loss": 6.5713, + "step": 1049 + }, + { + "epoch": 0.3583617747440273, + "grad_norm": 3.8704769611358643, + "learning_rate": 0.0008805460750853242, + "loss": 6.424, + "step": 1050 + }, + { + "epoch": 0.35870307167235493, + "grad_norm": 3.3437047004699707, + "learning_rate": 0.0008804323094425484, + "loss": 7.1021, + "step": 1051 + }, + { + "epoch": 0.3590443686006826, + "grad_norm": 7.765682220458984, + "learning_rate": 0.0008803185437997725, + "loss": 5.6959, + "step": 1052 + }, + { + "epoch": 0.35938566552901025, + "grad_norm": 4.041772842407227, + "learning_rate": 0.0008802047781569966, + "loss": 6.5353, + "step": 1053 + }, + { + "epoch": 0.3597269624573379, + "grad_norm": 4.038043022155762, + "learning_rate": 0.0008800910125142207, + "loss": 6.8547, + "step": 1054 + }, + { + "epoch": 0.36006825938566556, + "grad_norm": 3.7071590423583984, + "learning_rate": 0.0008799772468714448, + "loss": 6.4563, + "step": 1055 + }, + { + "epoch": 0.3604095563139932, + "grad_norm": 4.166858196258545, + "learning_rate": 0.000879863481228669, + "loss": 6.431, + "step": 1056 + }, + { + "epoch": 0.3607508532423208, + "grad_norm": 3.6435773372650146, + "learning_rate": 0.0008797497155858931, + "loss": 6.6037, + "step": 1057 + }, + { + "epoch": 0.36109215017064844, + "grad_norm": 14.491841316223145, + "learning_rate": 0.0008796359499431173, + "loss": 7.0323, + "step": 1058 + }, + { + "epoch": 0.3614334470989761, + "grad_norm": 3.81062388420105, + "learning_rate": 0.0008795221843003414, + "loss": 7.0748, + "step": 1059 + }, + { + "epoch": 0.36177474402730375, + "grad_norm": 3.9405431747436523, + "learning_rate": 0.0008794084186575654, + "loss": 6.9883, + "step": 1060 + }, + { + "epoch": 0.3621160409556314, + "grad_norm": 3.7249462604522705, + "learning_rate": 0.0008792946530147895, + "loss": 6.7057, + "step": 1061 + }, + { + "epoch": 0.36245733788395906, + "grad_norm": 4.359280109405518, + "learning_rate": 0.0008791808873720137, + "loss": 6.2134, + "step": 1062 + }, + { + "epoch": 0.3627986348122867, + "grad_norm": 3.8895323276519775, + "learning_rate": 0.0008790671217292378, + "loss": 6.3199, + "step": 1063 + }, + { + "epoch": 0.3631399317406143, + "grad_norm": 3.3901989459991455, + "learning_rate": 0.0008789533560864619, + "loss": 6.822, + "step": 1064 + }, + { + "epoch": 0.363481228668942, + "grad_norm": 3.3694682121276855, + "learning_rate": 0.000878839590443686, + "loss": 6.8209, + "step": 1065 + }, + { + "epoch": 0.36382252559726963, + "grad_norm": 3.7551913261413574, + "learning_rate": 0.0008787258248009101, + "loss": 6.7343, + "step": 1066 + }, + { + "epoch": 0.36416382252559726, + "grad_norm": 3.3077569007873535, + "learning_rate": 0.0008786120591581342, + "loss": 6.944, + "step": 1067 + }, + { + "epoch": 0.36450511945392494, + "grad_norm": 3.292142629623413, + "learning_rate": 0.0008784982935153584, + "loss": 7.1756, + "step": 1068 + }, + { + "epoch": 0.36484641638225257, + "grad_norm": 4.133096218109131, + "learning_rate": 0.0008783845278725825, + "loss": 6.7308, + "step": 1069 + }, + { + "epoch": 0.3651877133105802, + "grad_norm": 3.361743927001953, + "learning_rate": 0.0008782707622298066, + "loss": 7.252, + "step": 1070 + }, + { + "epoch": 0.3655290102389078, + "grad_norm": 3.582728624343872, + "learning_rate": 0.0008781569965870307, + "loss": 6.8102, + "step": 1071 + }, + { + "epoch": 0.3658703071672355, + "grad_norm": 4.287242412567139, + "learning_rate": 0.0008780432309442548, + "loss": 6.4504, + "step": 1072 + }, + { + "epoch": 0.36621160409556314, + "grad_norm": 3.59372878074646, + "learning_rate": 0.000877929465301479, + "loss": 7.1913, + "step": 1073 + }, + { + "epoch": 0.36655290102389076, + "grad_norm": 3.4123799800872803, + "learning_rate": 0.0008778156996587031, + "loss": 6.5238, + "step": 1074 + }, + { + "epoch": 0.36689419795221845, + "grad_norm": 3.3152260780334473, + "learning_rate": 0.0008777019340159273, + "loss": 6.9748, + "step": 1075 + }, + { + "epoch": 0.3672354948805461, + "grad_norm": 3.3980987071990967, + "learning_rate": 0.0008775881683731514, + "loss": 6.8588, + "step": 1076 + }, + { + "epoch": 0.3675767918088737, + "grad_norm": 3.447199583053589, + "learning_rate": 0.0008774744027303755, + "loss": 6.3821, + "step": 1077 + }, + { + "epoch": 0.3679180887372014, + "grad_norm": 3.4618401527404785, + "learning_rate": 0.0008773606370875996, + "loss": 6.95, + "step": 1078 + }, + { + "epoch": 0.368259385665529, + "grad_norm": 3.597135543823242, + "learning_rate": 0.0008772468714448238, + "loss": 6.4812, + "step": 1079 + }, + { + "epoch": 0.36860068259385664, + "grad_norm": 3.396935224533081, + "learning_rate": 0.0008771331058020478, + "loss": 6.7553, + "step": 1080 + }, + { + "epoch": 0.3689419795221843, + "grad_norm": 3.4151859283447266, + "learning_rate": 0.0008770193401592719, + "loss": 6.4082, + "step": 1081 + }, + { + "epoch": 0.36928327645051195, + "grad_norm": 3.4455440044403076, + "learning_rate": 0.000876905574516496, + "loss": 6.7333, + "step": 1082 + }, + { + "epoch": 0.3696245733788396, + "grad_norm": 3.539586067199707, + "learning_rate": 0.0008767918088737201, + "loss": 6.5843, + "step": 1083 + }, + { + "epoch": 0.3699658703071672, + "grad_norm": 6.984851360321045, + "learning_rate": 0.0008766780432309442, + "loss": 5.8875, + "step": 1084 + }, + { + "epoch": 0.3703071672354949, + "grad_norm": 3.949312448501587, + "learning_rate": 0.0008765642775881684, + "loss": 6.2822, + "step": 1085 + }, + { + "epoch": 0.3706484641638225, + "grad_norm": 3.787013530731201, + "learning_rate": 0.0008764505119453925, + "loss": 6.7024, + "step": 1086 + }, + { + "epoch": 0.37098976109215015, + "grad_norm": 3.797456741333008, + "learning_rate": 0.0008763367463026166, + "loss": 6.6641, + "step": 1087 + }, + { + "epoch": 0.37133105802047783, + "grad_norm": 3.9768006801605225, + "learning_rate": 0.0008762229806598407, + "loss": 6.8357, + "step": 1088 + }, + { + "epoch": 0.37167235494880546, + "grad_norm": 3.4279286861419678, + "learning_rate": 0.0008761092150170648, + "loss": 6.5971, + "step": 1089 + }, + { + "epoch": 0.3720136518771331, + "grad_norm": 3.417083501815796, + "learning_rate": 0.0008759954493742889, + "loss": 6.818, + "step": 1090 + }, + { + "epoch": 0.3723549488054608, + "grad_norm": 3.553903818130493, + "learning_rate": 0.0008758816837315132, + "loss": 6.8396, + "step": 1091 + }, + { + "epoch": 0.3726962457337884, + "grad_norm": 3.620215892791748, + "learning_rate": 0.0008757679180887373, + "loss": 6.2553, + "step": 1092 + }, + { + "epoch": 0.37303754266211603, + "grad_norm": 5.6876935958862305, + "learning_rate": 0.0008756541524459614, + "loss": 6.035, + "step": 1093 + }, + { + "epoch": 0.3733788395904437, + "grad_norm": 3.6916663646698, + "learning_rate": 0.0008755403868031855, + "loss": 6.9809, + "step": 1094 + }, + { + "epoch": 0.37372013651877134, + "grad_norm": 3.829458236694336, + "learning_rate": 0.0008754266211604096, + "loss": 6.3949, + "step": 1095 + }, + { + "epoch": 0.37406143344709897, + "grad_norm": 3.719882011413574, + "learning_rate": 0.0008753128555176338, + "loss": 7.0602, + "step": 1096 + }, + { + "epoch": 0.3744027303754266, + "grad_norm": 4.063767433166504, + "learning_rate": 0.0008751990898748579, + "loss": 6.713, + "step": 1097 + }, + { + "epoch": 0.3747440273037543, + "grad_norm": 3.6009387969970703, + "learning_rate": 0.000875085324232082, + "loss": 6.6959, + "step": 1098 + }, + { + "epoch": 0.3750853242320819, + "grad_norm": 3.6646664142608643, + "learning_rate": 0.000874971558589306, + "loss": 6.8591, + "step": 1099 + }, + { + "epoch": 0.37542662116040953, + "grad_norm": 3.4815189838409424, + "learning_rate": 0.0008748577929465301, + "loss": 6.6657, + "step": 1100 + }, + { + "epoch": 0.3757679180887372, + "grad_norm": 3.6855881214141846, + "learning_rate": 0.0008747440273037542, + "loss": 6.7854, + "step": 1101 + }, + { + "epoch": 0.37610921501706485, + "grad_norm": 3.5180046558380127, + "learning_rate": 0.0008746302616609784, + "loss": 6.8922, + "step": 1102 + }, + { + "epoch": 0.3764505119453925, + "grad_norm": 3.3709070682525635, + "learning_rate": 0.0008745164960182025, + "loss": 7.0109, + "step": 1103 + }, + { + "epoch": 0.37679180887372016, + "grad_norm": 3.4868783950805664, + "learning_rate": 0.0008744027303754266, + "loss": 7.1033, + "step": 1104 + }, + { + "epoch": 0.3771331058020478, + "grad_norm": 3.5890045166015625, + "learning_rate": 0.0008742889647326507, + "loss": 6.5034, + "step": 1105 + }, + { + "epoch": 0.3774744027303754, + "grad_norm": 5.1739726066589355, + "learning_rate": 0.0008741751990898748, + "loss": 6.2063, + "step": 1106 + }, + { + "epoch": 0.3778156996587031, + "grad_norm": 3.525132179260254, + "learning_rate": 0.0008740614334470989, + "loss": 7.3972, + "step": 1107 + }, + { + "epoch": 0.3781569965870307, + "grad_norm": 3.622143268585205, + "learning_rate": 0.0008739476678043232, + "loss": 7.0089, + "step": 1108 + }, + { + "epoch": 0.37849829351535835, + "grad_norm": 3.494596481323242, + "learning_rate": 0.0008738339021615473, + "loss": 6.5724, + "step": 1109 + }, + { + "epoch": 0.378839590443686, + "grad_norm": 3.3533830642700195, + "learning_rate": 0.0008737201365187714, + "loss": 7.1964, + "step": 1110 + }, + { + "epoch": 0.37918088737201366, + "grad_norm": 3.518562078475952, + "learning_rate": 0.0008736063708759955, + "loss": 7.2307, + "step": 1111 + }, + { + "epoch": 0.3795221843003413, + "grad_norm": 3.4765775203704834, + "learning_rate": 0.0008734926052332196, + "loss": 6.6891, + "step": 1112 + }, + { + "epoch": 0.3798634812286689, + "grad_norm": 3.379734754562378, + "learning_rate": 0.0008733788395904437, + "loss": 6.883, + "step": 1113 + }, + { + "epoch": 0.3802047781569966, + "grad_norm": 3.9166646003723145, + "learning_rate": 0.0008732650739476679, + "loss": 6.7689, + "step": 1114 + }, + { + "epoch": 0.38054607508532423, + "grad_norm": 3.344590663909912, + "learning_rate": 0.000873151308304892, + "loss": 6.579, + "step": 1115 + }, + { + "epoch": 0.38088737201365186, + "grad_norm": 3.7421436309814453, + "learning_rate": 0.0008730375426621161, + "loss": 7.2903, + "step": 1116 + }, + { + "epoch": 0.38122866894197954, + "grad_norm": 3.9341204166412354, + "learning_rate": 0.0008729237770193402, + "loss": 6.4053, + "step": 1117 + }, + { + "epoch": 0.38156996587030717, + "grad_norm": 3.6717798709869385, + "learning_rate": 0.0008728100113765643, + "loss": 6.56, + "step": 1118 + }, + { + "epoch": 0.3819112627986348, + "grad_norm": 3.7355589866638184, + "learning_rate": 0.0008726962457337884, + "loss": 6.1227, + "step": 1119 + }, + { + "epoch": 0.3822525597269625, + "grad_norm": 3.6662402153015137, + "learning_rate": 0.0008725824800910125, + "loss": 6.8174, + "step": 1120 + }, + { + "epoch": 0.3825938566552901, + "grad_norm": 3.7742650508880615, + "learning_rate": 0.0008724687144482366, + "loss": 6.9281, + "step": 1121 + }, + { + "epoch": 0.38293515358361774, + "grad_norm": 3.4731431007385254, + "learning_rate": 0.0008723549488054607, + "loss": 6.5173, + "step": 1122 + }, + { + "epoch": 0.38327645051194537, + "grad_norm": 3.55572247505188, + "learning_rate": 0.0008722411831626848, + "loss": 6.8068, + "step": 1123 + }, + { + "epoch": 0.38361774744027305, + "grad_norm": 3.307126522064209, + "learning_rate": 0.0008721274175199089, + "loss": 6.2692, + "step": 1124 + }, + { + "epoch": 0.3839590443686007, + "grad_norm": 3.444089889526367, + "learning_rate": 0.0008720136518771332, + "loss": 6.4222, + "step": 1125 + }, + { + "epoch": 0.3843003412969283, + "grad_norm": 3.731123685836792, + "learning_rate": 0.0008718998862343573, + "loss": 6.7032, + "step": 1126 + }, + { + "epoch": 0.384641638225256, + "grad_norm": 3.5673561096191406, + "learning_rate": 0.0008717861205915814, + "loss": 6.7617, + "step": 1127 + }, + { + "epoch": 0.3849829351535836, + "grad_norm": 4.335822105407715, + "learning_rate": 0.0008716723549488055, + "loss": 6.0413, + "step": 1128 + }, + { + "epoch": 0.38532423208191124, + "grad_norm": 3.483842372894287, + "learning_rate": 0.0008715585893060296, + "loss": 6.902, + "step": 1129 + }, + { + "epoch": 0.3856655290102389, + "grad_norm": 3.3141043186187744, + "learning_rate": 0.0008714448236632537, + "loss": 6.5926, + "step": 1130 + }, + { + "epoch": 0.38600682593856656, + "grad_norm": 3.5295164585113525, + "learning_rate": 0.0008713310580204779, + "loss": 7.411, + "step": 1131 + }, + { + "epoch": 0.3863481228668942, + "grad_norm": 3.6749589443206787, + "learning_rate": 0.000871217292377702, + "loss": 6.4381, + "step": 1132 + }, + { + "epoch": 0.38668941979522187, + "grad_norm": 3.7729690074920654, + "learning_rate": 0.0008711035267349261, + "loss": 6.2913, + "step": 1133 + }, + { + "epoch": 0.3870307167235495, + "grad_norm": 3.4314284324645996, + "learning_rate": 0.0008709897610921502, + "loss": 7.1253, + "step": 1134 + }, + { + "epoch": 0.3873720136518771, + "grad_norm": 3.633718252182007, + "learning_rate": 0.0008708759954493743, + "loss": 6.378, + "step": 1135 + }, + { + "epoch": 0.38771331058020475, + "grad_norm": 3.4309566020965576, + "learning_rate": 0.0008707622298065985, + "loss": 6.7567, + "step": 1136 + }, + { + "epoch": 0.38805460750853243, + "grad_norm": 5.337145805358887, + "learning_rate": 0.0008706484641638226, + "loss": 6.1113, + "step": 1137 + }, + { + "epoch": 0.38839590443686006, + "grad_norm": 3.6700313091278076, + "learning_rate": 0.0008705346985210466, + "loss": 7.0188, + "step": 1138 + }, + { + "epoch": 0.3887372013651877, + "grad_norm": 3.756471633911133, + "learning_rate": 0.0008704209328782707, + "loss": 6.3016, + "step": 1139 + }, + { + "epoch": 0.3890784982935154, + "grad_norm": 3.3960928916931152, + "learning_rate": 0.0008703071672354948, + "loss": 6.6751, + "step": 1140 + }, + { + "epoch": 0.389419795221843, + "grad_norm": 3.354475736618042, + "learning_rate": 0.0008701934015927189, + "loss": 6.9203, + "step": 1141 + }, + { + "epoch": 0.38976109215017063, + "grad_norm": 4.095553398132324, + "learning_rate": 0.0008700796359499432, + "loss": 6.2154, + "step": 1142 + }, + { + "epoch": 0.3901023890784983, + "grad_norm": 3.592214584350586, + "learning_rate": 0.0008699658703071673, + "loss": 6.5693, + "step": 1143 + }, + { + "epoch": 0.39044368600682594, + "grad_norm": 3.5223283767700195, + "learning_rate": 0.0008698521046643914, + "loss": 7.3168, + "step": 1144 + }, + { + "epoch": 0.39078498293515357, + "grad_norm": 3.3714346885681152, + "learning_rate": 0.0008697383390216155, + "loss": 6.5233, + "step": 1145 + }, + { + "epoch": 0.39112627986348125, + "grad_norm": 4.475141525268555, + "learning_rate": 0.0008696245733788396, + "loss": 6.3844, + "step": 1146 + }, + { + "epoch": 0.3914675767918089, + "grad_norm": 3.5419039726257324, + "learning_rate": 0.0008695108077360637, + "loss": 6.718, + "step": 1147 + }, + { + "epoch": 0.3918088737201365, + "grad_norm": 3.6940994262695312, + "learning_rate": 0.0008693970420932879, + "loss": 6.9183, + "step": 1148 + }, + { + "epoch": 0.3921501706484642, + "grad_norm": 3.406237840652466, + "learning_rate": 0.000869283276450512, + "loss": 6.817, + "step": 1149 + }, + { + "epoch": 0.3924914675767918, + "grad_norm": 3.5237555503845215, + "learning_rate": 0.0008691695108077361, + "loss": 6.565, + "step": 1150 + }, + { + "epoch": 0.39283276450511945, + "grad_norm": 3.400852918624878, + "learning_rate": 0.0008690557451649602, + "loss": 6.5612, + "step": 1151 + }, + { + "epoch": 0.3931740614334471, + "grad_norm": 3.451266050338745, + "learning_rate": 0.0008689419795221843, + "loss": 6.5092, + "step": 1152 + }, + { + "epoch": 0.39351535836177476, + "grad_norm": 4.814857006072998, + "learning_rate": 0.0008688282138794084, + "loss": 6.1531, + "step": 1153 + }, + { + "epoch": 0.3938566552901024, + "grad_norm": 3.619635820388794, + "learning_rate": 0.0008687144482366326, + "loss": 6.4214, + "step": 1154 + }, + { + "epoch": 0.39419795221843, + "grad_norm": 2.678661346435547, + "learning_rate": 0.0008686006825938567, + "loss": 3.9171, + "step": 1155 + }, + { + "epoch": 0.3945392491467577, + "grad_norm": 4.169159889221191, + "learning_rate": 0.0008684869169510808, + "loss": 6.2438, + "step": 1156 + }, + { + "epoch": 0.3948805460750853, + "grad_norm": 3.618849515914917, + "learning_rate": 0.0008683731513083049, + "loss": 6.7297, + "step": 1157 + }, + { + "epoch": 0.39522184300341295, + "grad_norm": 5.10459566116333, + "learning_rate": 0.0008682593856655289, + "loss": 6.2313, + "step": 1158 + }, + { + "epoch": 0.39556313993174064, + "grad_norm": 3.637228012084961, + "learning_rate": 0.0008681456200227532, + "loss": 6.6383, + "step": 1159 + }, + { + "epoch": 0.39590443686006827, + "grad_norm": 3.539729356765747, + "learning_rate": 0.0008680318543799773, + "loss": 6.6497, + "step": 1160 + }, + { + "epoch": 0.3962457337883959, + "grad_norm": 3.3988003730773926, + "learning_rate": 0.0008679180887372014, + "loss": 6.2502, + "step": 1161 + }, + { + "epoch": 0.3965870307167236, + "grad_norm": 3.7295820713043213, + "learning_rate": 0.0008678043230944255, + "loss": 6.354, + "step": 1162 + }, + { + "epoch": 0.3969283276450512, + "grad_norm": 3.539332151412964, + "learning_rate": 0.0008676905574516496, + "loss": 7.1846, + "step": 1163 + }, + { + "epoch": 0.39726962457337883, + "grad_norm": 3.5238754749298096, + "learning_rate": 0.0008675767918088737, + "loss": 6.2985, + "step": 1164 + }, + { + "epoch": 0.39761092150170646, + "grad_norm": 7.77903938293457, + "learning_rate": 0.0008674630261660979, + "loss": 4.739, + "step": 1165 + }, + { + "epoch": 0.39795221843003414, + "grad_norm": 3.561558485031128, + "learning_rate": 0.000867349260523322, + "loss": 6.7075, + "step": 1166 + }, + { + "epoch": 0.39829351535836177, + "grad_norm": 3.6842966079711914, + "learning_rate": 0.0008672354948805461, + "loss": 6.934, + "step": 1167 + }, + { + "epoch": 0.3986348122866894, + "grad_norm": 3.6015913486480713, + "learning_rate": 0.0008671217292377702, + "loss": 7.0431, + "step": 1168 + }, + { + "epoch": 0.3989761092150171, + "grad_norm": 3.3261101245880127, + "learning_rate": 0.0008670079635949943, + "loss": 6.5901, + "step": 1169 + }, + { + "epoch": 0.3993174061433447, + "grad_norm": 3.6874048709869385, + "learning_rate": 0.0008668941979522184, + "loss": 6.8816, + "step": 1170 + }, + { + "epoch": 0.39965870307167234, + "grad_norm": 3.367588996887207, + "learning_rate": 0.0008667804323094426, + "loss": 6.8432, + "step": 1171 + }, + { + "epoch": 0.4, + "grad_norm": 3.2957537174224854, + "learning_rate": 0.0008666666666666667, + "loss": 6.7904, + "step": 1172 + }, + { + "epoch": 0.40034129692832765, + "grad_norm": 3.3574087619781494, + "learning_rate": 0.0008665529010238908, + "loss": 6.7804, + "step": 1173 + }, + { + "epoch": 0.4006825938566553, + "grad_norm": 3.550178050994873, + "learning_rate": 0.0008664391353811149, + "loss": 6.5741, + "step": 1174 + }, + { + "epoch": 0.40102389078498296, + "grad_norm": 3.4762377738952637, + "learning_rate": 0.000866325369738339, + "loss": 7.1227, + "step": 1175 + }, + { + "epoch": 0.4013651877133106, + "grad_norm": 3.523291826248169, + "learning_rate": 0.0008662116040955633, + "loss": 6.8638, + "step": 1176 + }, + { + "epoch": 0.4017064846416382, + "grad_norm": 3.267287492752075, + "learning_rate": 0.0008660978384527873, + "loss": 6.6402, + "step": 1177 + }, + { + "epoch": 0.40204778156996585, + "grad_norm": 3.369250774383545, + "learning_rate": 0.0008659840728100114, + "loss": 6.683, + "step": 1178 + }, + { + "epoch": 0.40238907849829353, + "grad_norm": 3.4328765869140625, + "learning_rate": 0.0008658703071672355, + "loss": 6.6393, + "step": 1179 + }, + { + "epoch": 0.40273037542662116, + "grad_norm": 3.6571414470672607, + "learning_rate": 0.0008657565415244596, + "loss": 6.5947, + "step": 1180 + }, + { + "epoch": 0.4030716723549488, + "grad_norm": 3.311383008956909, + "learning_rate": 0.0008656427758816837, + "loss": 6.7028, + "step": 1181 + }, + { + "epoch": 0.40341296928327647, + "grad_norm": 3.5350563526153564, + "learning_rate": 0.0008655290102389079, + "loss": 6.2399, + "step": 1182 + }, + { + "epoch": 0.4037542662116041, + "grad_norm": 3.58445143699646, + "learning_rate": 0.000865415244596132, + "loss": 6.4364, + "step": 1183 + }, + { + "epoch": 0.4040955631399317, + "grad_norm": 3.439987897872925, + "learning_rate": 0.0008653014789533561, + "loss": 6.5951, + "step": 1184 + }, + { + "epoch": 0.4044368600682594, + "grad_norm": 8.74139404296875, + "learning_rate": 0.0008651877133105802, + "loss": 6.3451, + "step": 1185 + }, + { + "epoch": 0.40477815699658704, + "grad_norm": 3.636200428009033, + "learning_rate": 0.0008650739476678043, + "loss": 7.1951, + "step": 1186 + }, + { + "epoch": 0.40511945392491466, + "grad_norm": 3.539865493774414, + "learning_rate": 0.0008649601820250284, + "loss": 7.0252, + "step": 1187 + }, + { + "epoch": 0.40546075085324235, + "grad_norm": 3.5502421855926514, + "learning_rate": 0.0008648464163822526, + "loss": 6.9277, + "step": 1188 + }, + { + "epoch": 0.40580204778157, + "grad_norm": 6.065408706665039, + "learning_rate": 0.0008647326507394767, + "loss": 6.2756, + "step": 1189 + }, + { + "epoch": 0.4061433447098976, + "grad_norm": 3.5880606174468994, + "learning_rate": 0.0008646188850967008, + "loss": 6.609, + "step": 1190 + }, + { + "epoch": 0.40648464163822523, + "grad_norm": 3.5269694328308105, + "learning_rate": 0.000864505119453925, + "loss": 6.6192, + "step": 1191 + }, + { + "epoch": 0.4068259385665529, + "grad_norm": 3.216411828994751, + "learning_rate": 0.000864391353811149, + "loss": 6.6329, + "step": 1192 + }, + { + "epoch": 0.40716723549488054, + "grad_norm": 3.540984630584717, + "learning_rate": 0.0008642775881683732, + "loss": 6.8023, + "step": 1193 + }, + { + "epoch": 0.40750853242320817, + "grad_norm": 3.430344581604004, + "learning_rate": 0.0008641638225255974, + "loss": 6.4494, + "step": 1194 + }, + { + "epoch": 0.40784982935153585, + "grad_norm": 3.64399790763855, + "learning_rate": 0.0008640500568828215, + "loss": 6.0932, + "step": 1195 + }, + { + "epoch": 0.4081911262798635, + "grad_norm": 3.293858528137207, + "learning_rate": 0.0008639362912400455, + "loss": 6.4266, + "step": 1196 + }, + { + "epoch": 0.4085324232081911, + "grad_norm": 3.498997449874878, + "learning_rate": 0.0008638225255972696, + "loss": 6.4859, + "step": 1197 + }, + { + "epoch": 0.4088737201365188, + "grad_norm": 3.547935724258423, + "learning_rate": 0.0008637087599544937, + "loss": 7.0903, + "step": 1198 + }, + { + "epoch": 0.4092150170648464, + "grad_norm": 3.3775041103363037, + "learning_rate": 0.0008635949943117179, + "loss": 6.6445, + "step": 1199 + }, + { + "epoch": 0.40955631399317405, + "grad_norm": 3.678861379623413, + "learning_rate": 0.000863481228668942, + "loss": 6.3223, + "step": 1200 + }, + { + "epoch": 0.40989761092150173, + "grad_norm": 5.478918075561523, + "learning_rate": 0.0008633674630261661, + "loss": 6.726, + "step": 1201 + }, + { + "epoch": 0.41023890784982936, + "grad_norm": 5.6262407302856445, + "learning_rate": 0.0008632536973833902, + "loss": 6.1447, + "step": 1202 + }, + { + "epoch": 0.410580204778157, + "grad_norm": 3.8595380783081055, + "learning_rate": 0.0008631399317406143, + "loss": 6.7759, + "step": 1203 + }, + { + "epoch": 0.4109215017064846, + "grad_norm": 3.590745449066162, + "learning_rate": 0.0008630261660978384, + "loss": 6.6065, + "step": 1204 + }, + { + "epoch": 0.4112627986348123, + "grad_norm": 3.531480550765991, + "learning_rate": 0.0008629124004550626, + "loss": 6.3746, + "step": 1205 + }, + { + "epoch": 0.4116040955631399, + "grad_norm": 3.606606960296631, + "learning_rate": 0.0008627986348122867, + "loss": 6.3677, + "step": 1206 + }, + { + "epoch": 0.41194539249146755, + "grad_norm": 3.5986878871917725, + "learning_rate": 0.0008626848691695108, + "loss": 6.2718, + "step": 1207 + }, + { + "epoch": 0.41228668941979524, + "grad_norm": 3.399188995361328, + "learning_rate": 0.000862571103526735, + "loss": 6.8167, + "step": 1208 + }, + { + "epoch": 0.41262798634812287, + "grad_norm": 3.2899904251098633, + "learning_rate": 0.000862457337883959, + "loss": 7.0081, + "step": 1209 + }, + { + "epoch": 0.4129692832764505, + "grad_norm": 3.522502899169922, + "learning_rate": 0.0008623435722411832, + "loss": 6.6561, + "step": 1210 + }, + { + "epoch": 0.4133105802047782, + "grad_norm": 3.5413362979888916, + "learning_rate": 0.0008622298065984074, + "loss": 6.8759, + "step": 1211 + }, + { + "epoch": 0.4136518771331058, + "grad_norm": 3.4126598834991455, + "learning_rate": 0.0008621160409556315, + "loss": 7.0551, + "step": 1212 + }, + { + "epoch": 0.41399317406143343, + "grad_norm": 3.658048391342163, + "learning_rate": 0.0008620022753128556, + "loss": 6.7476, + "step": 1213 + }, + { + "epoch": 0.4143344709897611, + "grad_norm": 3.4803097248077393, + "learning_rate": 0.0008618885096700797, + "loss": 6.717, + "step": 1214 + }, + { + "epoch": 0.41467576791808874, + "grad_norm": 3.52303147315979, + "learning_rate": 0.0008617747440273038, + "loss": 6.6623, + "step": 1215 + }, + { + "epoch": 0.4150170648464164, + "grad_norm": 3.362966537475586, + "learning_rate": 0.0008616609783845278, + "loss": 6.8483, + "step": 1216 + }, + { + "epoch": 0.415358361774744, + "grad_norm": 4.339871883392334, + "learning_rate": 0.000861547212741752, + "loss": 6.3404, + "step": 1217 + }, + { + "epoch": 0.4156996587030717, + "grad_norm": 3.4630043506622314, + "learning_rate": 0.0008614334470989761, + "loss": 7.0581, + "step": 1218 + }, + { + "epoch": 0.4160409556313993, + "grad_norm": 4.27215576171875, + "learning_rate": 0.0008613196814562002, + "loss": 6.5148, + "step": 1219 + }, + { + "epoch": 0.41638225255972694, + "grad_norm": 3.6531379222869873, + "learning_rate": 0.0008612059158134243, + "loss": 6.1321, + "step": 1220 + }, + { + "epoch": 0.4167235494880546, + "grad_norm": 3.4010026454925537, + "learning_rate": 0.0008610921501706484, + "loss": 6.827, + "step": 1221 + }, + { + "epoch": 0.41706484641638225, + "grad_norm": 3.467449903488159, + "learning_rate": 0.0008609783845278726, + "loss": 7.0066, + "step": 1222 + }, + { + "epoch": 0.4174061433447099, + "grad_norm": 3.3873372077941895, + "learning_rate": 0.0008608646188850967, + "loss": 6.1582, + "step": 1223 + }, + { + "epoch": 0.41774744027303756, + "grad_norm": 3.483137607574463, + "learning_rate": 0.0008607508532423208, + "loss": 6.3962, + "step": 1224 + }, + { + "epoch": 0.4180887372013652, + "grad_norm": 3.4427895545959473, + "learning_rate": 0.000860637087599545, + "loss": 6.5905, + "step": 1225 + }, + { + "epoch": 0.4184300341296928, + "grad_norm": 3.545475721359253, + "learning_rate": 0.000860523321956769, + "loss": 7.2328, + "step": 1226 + }, + { + "epoch": 0.4187713310580205, + "grad_norm": 3.3596596717834473, + "learning_rate": 0.0008604095563139932, + "loss": 6.6329, + "step": 1227 + }, + { + "epoch": 0.41911262798634813, + "grad_norm": 3.347614049911499, + "learning_rate": 0.0008602957906712174, + "loss": 6.6236, + "step": 1228 + }, + { + "epoch": 0.41945392491467576, + "grad_norm": 3.284127950668335, + "learning_rate": 0.0008601820250284415, + "loss": 6.7784, + "step": 1229 + }, + { + "epoch": 0.4197952218430034, + "grad_norm": 3.3968138694763184, + "learning_rate": 0.0008600682593856656, + "loss": 6.8942, + "step": 1230 + }, + { + "epoch": 0.42013651877133107, + "grad_norm": 3.375905752182007, + "learning_rate": 0.0008599544937428897, + "loss": 6.6425, + "step": 1231 + }, + { + "epoch": 0.4204778156996587, + "grad_norm": 3.409838914871216, + "learning_rate": 0.0008598407281001138, + "loss": 6.6804, + "step": 1232 + }, + { + "epoch": 0.4208191126279863, + "grad_norm": 4.324724197387695, + "learning_rate": 0.0008597269624573379, + "loss": 6.1592, + "step": 1233 + }, + { + "epoch": 0.421160409556314, + "grad_norm": 5.707208156585693, + "learning_rate": 0.0008596131968145621, + "loss": 5.9943, + "step": 1234 + }, + { + "epoch": 0.42150170648464164, + "grad_norm": 3.6807117462158203, + "learning_rate": 0.0008594994311717861, + "loss": 6.7764, + "step": 1235 + }, + { + "epoch": 0.42184300341296926, + "grad_norm": 3.467085361480713, + "learning_rate": 0.0008593856655290102, + "loss": 6.5814, + "step": 1236 + }, + { + "epoch": 0.42218430034129695, + "grad_norm": 3.4565463066101074, + "learning_rate": 0.0008592718998862343, + "loss": 6.9017, + "step": 1237 + }, + { + "epoch": 0.4225255972696246, + "grad_norm": 4.344884395599365, + "learning_rate": 0.0008591581342434584, + "loss": 6.5987, + "step": 1238 + }, + { + "epoch": 0.4228668941979522, + "grad_norm": 4.411989212036133, + "learning_rate": 0.0008590443686006826, + "loss": 6.741, + "step": 1239 + }, + { + "epoch": 0.4232081911262799, + "grad_norm": 3.775130271911621, + "learning_rate": 0.0008589306029579067, + "loss": 6.3065, + "step": 1240 + }, + { + "epoch": 0.4235494880546075, + "grad_norm": 3.5393505096435547, + "learning_rate": 0.0008588168373151308, + "loss": 6.9163, + "step": 1241 + }, + { + "epoch": 0.42389078498293514, + "grad_norm": 3.7094571590423584, + "learning_rate": 0.000858703071672355, + "loss": 5.9449, + "step": 1242 + }, + { + "epoch": 0.4242320819112628, + "grad_norm": 3.3689160346984863, + "learning_rate": 0.000858589306029579, + "loss": 6.5366, + "step": 1243 + }, + { + "epoch": 0.42457337883959045, + "grad_norm": 4.066009521484375, + "learning_rate": 0.0008584755403868032, + "loss": 6.2055, + "step": 1244 + }, + { + "epoch": 0.4249146757679181, + "grad_norm": 3.611774444580078, + "learning_rate": 0.0008583617747440274, + "loss": 6.6451, + "step": 1245 + }, + { + "epoch": 0.4252559726962457, + "grad_norm": 3.587705373764038, + "learning_rate": 0.0008582480091012515, + "loss": 6.6259, + "step": 1246 + }, + { + "epoch": 0.4255972696245734, + "grad_norm": 3.442603349685669, + "learning_rate": 0.0008581342434584756, + "loss": 6.8989, + "step": 1247 + }, + { + "epoch": 0.425938566552901, + "grad_norm": 4.580451488494873, + "learning_rate": 0.0008580204778156997, + "loss": 6.2377, + "step": 1248 + }, + { + "epoch": 0.42627986348122865, + "grad_norm": 3.4543490409851074, + "learning_rate": 0.0008579067121729238, + "loss": 6.2214, + "step": 1249 + }, + { + "epoch": 0.42662116040955633, + "grad_norm": 5.877060890197754, + "learning_rate": 0.0008577929465301479, + "loss": 5.4147, + "step": 1250 + }, + { + "epoch": 0.42696245733788396, + "grad_norm": 4.635700225830078, + "learning_rate": 0.0008576791808873721, + "loss": 6.1787, + "step": 1251 + }, + { + "epoch": 0.4273037542662116, + "grad_norm": 3.6666946411132812, + "learning_rate": 0.0008575654152445962, + "loss": 7.006, + "step": 1252 + }, + { + "epoch": 0.42764505119453927, + "grad_norm": 3.5804553031921387, + "learning_rate": 0.0008574516496018203, + "loss": 6.8813, + "step": 1253 + }, + { + "epoch": 0.4279863481228669, + "grad_norm": 3.4612207412719727, + "learning_rate": 0.0008573378839590444, + "loss": 6.6291, + "step": 1254 + }, + { + "epoch": 0.4283276450511945, + "grad_norm": 3.6497762203216553, + "learning_rate": 0.0008572241183162684, + "loss": 6.9736, + "step": 1255 + }, + { + "epoch": 0.4286689419795222, + "grad_norm": 3.296250104904175, + "learning_rate": 0.0008571103526734925, + "loss": 6.5129, + "step": 1256 + }, + { + "epoch": 0.42901023890784984, + "grad_norm": 3.4279532432556152, + "learning_rate": 0.0008569965870307167, + "loss": 6.4369, + "step": 1257 + }, + { + "epoch": 0.42935153583617747, + "grad_norm": 3.448242664337158, + "learning_rate": 0.0008568828213879408, + "loss": 6.6171, + "step": 1258 + }, + { + "epoch": 0.4296928327645051, + "grad_norm": 3.353353977203369, + "learning_rate": 0.000856769055745165, + "loss": 6.5279, + "step": 1259 + }, + { + "epoch": 0.4300341296928328, + "grad_norm": 3.488964319229126, + "learning_rate": 0.0008566552901023891, + "loss": 6.8257, + "step": 1260 + }, + { + "epoch": 0.4303754266211604, + "grad_norm": 3.5172197818756104, + "learning_rate": 0.0008565415244596132, + "loss": 6.1531, + "step": 1261 + }, + { + "epoch": 0.43071672354948803, + "grad_norm": 3.3878061771392822, + "learning_rate": 0.0008564277588168374, + "loss": 6.8535, + "step": 1262 + }, + { + "epoch": 0.4310580204778157, + "grad_norm": 3.500743865966797, + "learning_rate": 0.0008563139931740615, + "loss": 6.9892, + "step": 1263 + }, + { + "epoch": 0.43139931740614335, + "grad_norm": 7.8660736083984375, + "learning_rate": 0.0008562002275312856, + "loss": 5.9798, + "step": 1264 + }, + { + "epoch": 0.431740614334471, + "grad_norm": 3.737638473510742, + "learning_rate": 0.0008560864618885097, + "loss": 6.8881, + "step": 1265 + }, + { + "epoch": 0.43208191126279866, + "grad_norm": 3.8500828742980957, + "learning_rate": 0.0008559726962457338, + "loss": 6.8304, + "step": 1266 + }, + { + "epoch": 0.4324232081911263, + "grad_norm": 3.5614750385284424, + "learning_rate": 0.0008558589306029579, + "loss": 6.4568, + "step": 1267 + }, + { + "epoch": 0.4327645051194539, + "grad_norm": 3.2859628200531006, + "learning_rate": 0.0008557451649601821, + "loss": 6.977, + "step": 1268 + }, + { + "epoch": 0.4331058020477816, + "grad_norm": 3.4138572216033936, + "learning_rate": 0.0008556313993174062, + "loss": 5.7698, + "step": 1269 + }, + { + "epoch": 0.4334470989761092, + "grad_norm": 3.3377716541290283, + "learning_rate": 0.0008555176336746303, + "loss": 6.5715, + "step": 1270 + }, + { + "epoch": 0.43378839590443685, + "grad_norm": 3.5361649990081787, + "learning_rate": 0.0008554038680318544, + "loss": 6.546, + "step": 1271 + }, + { + "epoch": 0.4341296928327645, + "grad_norm": 3.4464402198791504, + "learning_rate": 0.0008552901023890785, + "loss": 6.2318, + "step": 1272 + }, + { + "epoch": 0.43447098976109216, + "grad_norm": 3.289402484893799, + "learning_rate": 0.0008551763367463026, + "loss": 6.7913, + "step": 1273 + }, + { + "epoch": 0.4348122866894198, + "grad_norm": 3.665785312652588, + "learning_rate": 0.0008550625711035267, + "loss": 6.6228, + "step": 1274 + }, + { + "epoch": 0.4351535836177474, + "grad_norm": 5.519403457641602, + "learning_rate": 0.0008549488054607508, + "loss": 6.0677, + "step": 1275 + }, + { + "epoch": 0.4354948805460751, + "grad_norm": 3.639775276184082, + "learning_rate": 0.000854835039817975, + "loss": 6.8561, + "step": 1276 + }, + { + "epoch": 0.43583617747440273, + "grad_norm": 3.529599905014038, + "learning_rate": 0.0008547212741751991, + "loss": 6.6314, + "step": 1277 + }, + { + "epoch": 0.43617747440273036, + "grad_norm": 3.8909149169921875, + "learning_rate": 0.0008546075085324232, + "loss": 6.7449, + "step": 1278 + }, + { + "epoch": 0.43651877133105804, + "grad_norm": 4.182558059692383, + "learning_rate": 0.0008544937428896473, + "loss": 6.1402, + "step": 1279 + }, + { + "epoch": 0.43686006825938567, + "grad_norm": 3.6773123741149902, + "learning_rate": 0.0008543799772468715, + "loss": 6.5567, + "step": 1280 + }, + { + "epoch": 0.4372013651877133, + "grad_norm": 3.5389983654022217, + "learning_rate": 0.0008542662116040956, + "loss": 6.847, + "step": 1281 + }, + { + "epoch": 0.437542662116041, + "grad_norm": 3.5662002563476562, + "learning_rate": 0.0008541524459613197, + "loss": 6.242, + "step": 1282 + }, + { + "epoch": 0.4378839590443686, + "grad_norm": 3.623596668243408, + "learning_rate": 0.0008540386803185438, + "loss": 6.4278, + "step": 1283 + }, + { + "epoch": 0.43822525597269624, + "grad_norm": 3.6582117080688477, + "learning_rate": 0.0008539249146757679, + "loss": 6.266, + "step": 1284 + }, + { + "epoch": 0.43856655290102387, + "grad_norm": 5.415266990661621, + "learning_rate": 0.0008538111490329921, + "loss": 4.8563, + "step": 1285 + }, + { + "epoch": 0.43890784982935155, + "grad_norm": 3.88291335105896, + "learning_rate": 0.0008536973833902162, + "loss": 7.0351, + "step": 1286 + }, + { + "epoch": 0.4392491467576792, + "grad_norm": 3.7798752784729004, + "learning_rate": 0.0008535836177474403, + "loss": 6.9712, + "step": 1287 + }, + { + "epoch": 0.4395904436860068, + "grad_norm": 4.600677490234375, + "learning_rate": 0.0008534698521046644, + "loss": 6.4982, + "step": 1288 + }, + { + "epoch": 0.4399317406143345, + "grad_norm": 3.6498169898986816, + "learning_rate": 0.0008533560864618885, + "loss": 6.7925, + "step": 1289 + }, + { + "epoch": 0.4402730375426621, + "grad_norm": 3.712674617767334, + "learning_rate": 0.0008532423208191126, + "loss": 6.5274, + "step": 1290 + }, + { + "epoch": 0.44061433447098974, + "grad_norm": 3.2611215114593506, + "learning_rate": 0.0008531285551763368, + "loss": 6.954, + "step": 1291 + }, + { + "epoch": 0.4409556313993174, + "grad_norm": 3.1427042484283447, + "learning_rate": 0.000853014789533561, + "loss": 6.6816, + "step": 1292 + }, + { + "epoch": 0.44129692832764505, + "grad_norm": 3.4534733295440674, + "learning_rate": 0.0008529010238907851, + "loss": 6.4714, + "step": 1293 + }, + { + "epoch": 0.4416382252559727, + "grad_norm": 3.25132155418396, + "learning_rate": 0.0008527872582480091, + "loss": 6.3441, + "step": 1294 + }, + { + "epoch": 0.44197952218430037, + "grad_norm": 3.3785088062286377, + "learning_rate": 0.0008526734926052332, + "loss": 6.8156, + "step": 1295 + }, + { + "epoch": 0.442320819112628, + "grad_norm": 3.79790997505188, + "learning_rate": 0.0008525597269624573, + "loss": 6.3941, + "step": 1296 + }, + { + "epoch": 0.4426621160409556, + "grad_norm": 10.423567771911621, + "learning_rate": 0.0008524459613196815, + "loss": 5.7454, + "step": 1297 + }, + { + "epoch": 0.44300341296928325, + "grad_norm": 6.871647357940674, + "learning_rate": 0.0008523321956769056, + "loss": 6.844, + "step": 1298 + }, + { + "epoch": 0.44334470989761093, + "grad_norm": 4.372817039489746, + "learning_rate": 0.0008522184300341297, + "loss": 6.4617, + "step": 1299 + }, + { + "epoch": 0.44368600682593856, + "grad_norm": 3.725336790084839, + "learning_rate": 0.0008521046643913538, + "loss": 6.9746, + "step": 1300 + }, + { + "epoch": 0.4440273037542662, + "grad_norm": 3.9999923706054688, + "learning_rate": 0.0008519908987485779, + "loss": 6.1832, + "step": 1301 + }, + { + "epoch": 0.4443686006825939, + "grad_norm": 8.845232963562012, + "learning_rate": 0.0008518771331058021, + "loss": 4.6078, + "step": 1302 + }, + { + "epoch": 0.4447098976109215, + "grad_norm": 3.548273801803589, + "learning_rate": 0.0008517633674630262, + "loss": 7.1117, + "step": 1303 + }, + { + "epoch": 0.44505119453924913, + "grad_norm": 3.508362054824829, + "learning_rate": 0.0008516496018202503, + "loss": 7.0278, + "step": 1304 + }, + { + "epoch": 0.4453924914675768, + "grad_norm": 5.351772785186768, + "learning_rate": 0.0008515358361774744, + "loss": 6.3568, + "step": 1305 + }, + { + "epoch": 0.44573378839590444, + "grad_norm": 3.509801149368286, + "learning_rate": 0.0008514220705346985, + "loss": 6.6052, + "step": 1306 + }, + { + "epoch": 0.44607508532423207, + "grad_norm": 3.358114719390869, + "learning_rate": 0.0008513083048919226, + "loss": 6.3912, + "step": 1307 + }, + { + "epoch": 0.44641638225255975, + "grad_norm": 5.949875354766846, + "learning_rate": 0.0008511945392491469, + "loss": 6.5835, + "step": 1308 + }, + { + "epoch": 0.4467576791808874, + "grad_norm": 3.4979476928710938, + "learning_rate": 0.000851080773606371, + "loss": 6.1771, + "step": 1309 + }, + { + "epoch": 0.447098976109215, + "grad_norm": 3.5438613891601562, + "learning_rate": 0.0008509670079635951, + "loss": 6.2541, + "step": 1310 + }, + { + "epoch": 0.44744027303754264, + "grad_norm": 3.3947629928588867, + "learning_rate": 0.0008508532423208192, + "loss": 6.2373, + "step": 1311 + }, + { + "epoch": 0.4477815699658703, + "grad_norm": 3.5880777835845947, + "learning_rate": 0.0008507394766780433, + "loss": 6.3093, + "step": 1312 + }, + { + "epoch": 0.44812286689419795, + "grad_norm": 3.6189653873443604, + "learning_rate": 0.0008506257110352673, + "loss": 6.1852, + "step": 1313 + }, + { + "epoch": 0.4484641638225256, + "grad_norm": 3.5925283432006836, + "learning_rate": 0.0008505119453924915, + "loss": 6.7835, + "step": 1314 + }, + { + "epoch": 0.44880546075085326, + "grad_norm": 3.7551400661468506, + "learning_rate": 0.0008503981797497156, + "loss": 6.3399, + "step": 1315 + }, + { + "epoch": 0.4491467576791809, + "grad_norm": 3.489022731781006, + "learning_rate": 0.0008502844141069397, + "loss": 6.8905, + "step": 1316 + }, + { + "epoch": 0.4494880546075085, + "grad_norm": 3.6786651611328125, + "learning_rate": 0.0008501706484641638, + "loss": 6.74, + "step": 1317 + }, + { + "epoch": 0.4498293515358362, + "grad_norm": 3.3341808319091797, + "learning_rate": 0.0008500568828213879, + "loss": 6.4478, + "step": 1318 + }, + { + "epoch": 0.4501706484641638, + "grad_norm": 8.599390983581543, + "learning_rate": 0.000849943117178612, + "loss": 6.5547, + "step": 1319 + }, + { + "epoch": 0.45051194539249145, + "grad_norm": 4.72025728225708, + "learning_rate": 0.0008498293515358362, + "loss": 6.7761, + "step": 1320 + }, + { + "epoch": 0.45085324232081914, + "grad_norm": 3.8403704166412354, + "learning_rate": 0.0008497155858930603, + "loss": 6.6878, + "step": 1321 + }, + { + "epoch": 0.45119453924914676, + "grad_norm": 3.509477376937866, + "learning_rate": 0.0008496018202502844, + "loss": 7.0436, + "step": 1322 + }, + { + "epoch": 0.4515358361774744, + "grad_norm": 3.821585178375244, + "learning_rate": 0.0008494880546075085, + "loss": 6.573, + "step": 1323 + }, + { + "epoch": 0.451877133105802, + "grad_norm": 3.739370584487915, + "learning_rate": 0.0008493742889647326, + "loss": 6.3931, + "step": 1324 + }, + { + "epoch": 0.4522184300341297, + "grad_norm": 3.2959489822387695, + "learning_rate": 0.0008492605233219569, + "loss": 6.8188, + "step": 1325 + }, + { + "epoch": 0.45255972696245733, + "grad_norm": 3.6209750175476074, + "learning_rate": 0.000849146757679181, + "loss": 6.6649, + "step": 1326 + }, + { + "epoch": 0.45290102389078496, + "grad_norm": 3.411741018295288, + "learning_rate": 0.0008490329920364051, + "loss": 6.2213, + "step": 1327 + }, + { + "epoch": 0.45324232081911264, + "grad_norm": 3.494786262512207, + "learning_rate": 0.0008489192263936292, + "loss": 6.9988, + "step": 1328 + }, + { + "epoch": 0.45358361774744027, + "grad_norm": 3.3790512084960938, + "learning_rate": 0.0008488054607508533, + "loss": 6.7258, + "step": 1329 + }, + { + "epoch": 0.4539249146757679, + "grad_norm": 3.412260055541992, + "learning_rate": 0.0008486916951080774, + "loss": 6.8724, + "step": 1330 + }, + { + "epoch": 0.4542662116040956, + "grad_norm": 3.655362844467163, + "learning_rate": 0.0008485779294653016, + "loss": 6.644, + "step": 1331 + }, + { + "epoch": 0.4546075085324232, + "grad_norm": 3.687267780303955, + "learning_rate": 0.0008484641638225257, + "loss": 6.2437, + "step": 1332 + }, + { + "epoch": 0.45494880546075084, + "grad_norm": 3.81823468208313, + "learning_rate": 0.0008483503981797497, + "loss": 6.98, + "step": 1333 + }, + { + "epoch": 0.4552901023890785, + "grad_norm": 4.4424967765808105, + "learning_rate": 0.0008482366325369738, + "loss": 6.6895, + "step": 1334 + }, + { + "epoch": 0.45563139931740615, + "grad_norm": 3.6372294425964355, + "learning_rate": 0.0008481228668941979, + "loss": 6.4528, + "step": 1335 + }, + { + "epoch": 0.4559726962457338, + "grad_norm": 4.353541851043701, + "learning_rate": 0.000848009101251422, + "loss": 6.8672, + "step": 1336 + }, + { + "epoch": 0.45631399317406146, + "grad_norm": 3.73286509513855, + "learning_rate": 0.0008478953356086462, + "loss": 6.2034, + "step": 1337 + }, + { + "epoch": 0.4566552901023891, + "grad_norm": 3.300739049911499, + "learning_rate": 0.0008477815699658703, + "loss": 6.6602, + "step": 1338 + }, + { + "epoch": 0.4569965870307167, + "grad_norm": 3.4340107440948486, + "learning_rate": 0.0008476678043230944, + "loss": 6.3554, + "step": 1339 + }, + { + "epoch": 0.45733788395904434, + "grad_norm": 3.3781628608703613, + "learning_rate": 0.0008475540386803185, + "loss": 6.4238, + "step": 1340 + }, + { + "epoch": 0.45767918088737203, + "grad_norm": 3.5280208587646484, + "learning_rate": 0.0008474402730375426, + "loss": 6.8517, + "step": 1341 + }, + { + "epoch": 0.45802047781569966, + "grad_norm": 3.4973056316375732, + "learning_rate": 0.0008473265073947669, + "loss": 6.3462, + "step": 1342 + }, + { + "epoch": 0.4583617747440273, + "grad_norm": 3.7946386337280273, + "learning_rate": 0.000847212741751991, + "loss": 6.7902, + "step": 1343 + }, + { + "epoch": 0.45870307167235497, + "grad_norm": 3.5301756858825684, + "learning_rate": 0.0008470989761092151, + "loss": 6.9557, + "step": 1344 + }, + { + "epoch": 0.4590443686006826, + "grad_norm": 3.576582431793213, + "learning_rate": 0.0008469852104664392, + "loss": 7.1152, + "step": 1345 + }, + { + "epoch": 0.4593856655290102, + "grad_norm": 3.645031452178955, + "learning_rate": 0.0008468714448236633, + "loss": 6.5057, + "step": 1346 + }, + { + "epoch": 0.4597269624573379, + "grad_norm": 3.4032137393951416, + "learning_rate": 0.0008467576791808874, + "loss": 6.8364, + "step": 1347 + }, + { + "epoch": 0.46006825938566553, + "grad_norm": 3.5544800758361816, + "learning_rate": 0.0008466439135381116, + "loss": 6.8789, + "step": 1348 + }, + { + "epoch": 0.46040955631399316, + "grad_norm": 3.475367784500122, + "learning_rate": 0.0008465301478953357, + "loss": 6.9583, + "step": 1349 + }, + { + "epoch": 0.46075085324232085, + "grad_norm": 3.5934853553771973, + "learning_rate": 0.0008464163822525598, + "loss": 7.1558, + "step": 1350 + }, + { + "epoch": 0.4610921501706485, + "grad_norm": 3.3591737747192383, + "learning_rate": 0.0008463026166097839, + "loss": 6.709, + "step": 1351 + }, + { + "epoch": 0.4614334470989761, + "grad_norm": 3.4733426570892334, + "learning_rate": 0.0008461888509670079, + "loss": 6.7051, + "step": 1352 + }, + { + "epoch": 0.46177474402730373, + "grad_norm": 3.335206985473633, + "learning_rate": 0.000846075085324232, + "loss": 6.4163, + "step": 1353 + }, + { + "epoch": 0.4621160409556314, + "grad_norm": 3.442255973815918, + "learning_rate": 0.0008459613196814562, + "loss": 6.4429, + "step": 1354 + }, + { + "epoch": 0.46245733788395904, + "grad_norm": 3.5777900218963623, + "learning_rate": 0.0008458475540386803, + "loss": 6.9031, + "step": 1355 + }, + { + "epoch": 0.46279863481228667, + "grad_norm": 3.4293618202209473, + "learning_rate": 0.0008457337883959044, + "loss": 6.5918, + "step": 1356 + }, + { + "epoch": 0.46313993174061435, + "grad_norm": 3.3813259601593018, + "learning_rate": 0.0008456200227531285, + "loss": 6.4456, + "step": 1357 + }, + { + "epoch": 0.463481228668942, + "grad_norm": 4.471978664398193, + "learning_rate": 0.0008455062571103526, + "loss": 5.7945, + "step": 1358 + }, + { + "epoch": 0.4638225255972696, + "grad_norm": 4.107702255249023, + "learning_rate": 0.0008453924914675767, + "loss": 6.4136, + "step": 1359 + }, + { + "epoch": 0.4641638225255973, + "grad_norm": 3.807112216949463, + "learning_rate": 0.000845278725824801, + "loss": 6.8553, + "step": 1360 + }, + { + "epoch": 0.4645051194539249, + "grad_norm": 3.7887256145477295, + "learning_rate": 0.0008451649601820251, + "loss": 6.4577, + "step": 1361 + }, + { + "epoch": 0.46484641638225255, + "grad_norm": 3.356083393096924, + "learning_rate": 0.0008450511945392492, + "loss": 6.4253, + "step": 1362 + }, + { + "epoch": 0.46518771331058023, + "grad_norm": 3.5079097747802734, + "learning_rate": 0.0008449374288964733, + "loss": 6.9637, + "step": 1363 + }, + { + "epoch": 0.46552901023890786, + "grad_norm": 3.414142370223999, + "learning_rate": 0.0008448236632536974, + "loss": 6.5443, + "step": 1364 + }, + { + "epoch": 0.4658703071672355, + "grad_norm": 7.6356587409973145, + "learning_rate": 0.0008447098976109216, + "loss": 6.1728, + "step": 1365 + }, + { + "epoch": 0.4662116040955631, + "grad_norm": 3.9965884685516357, + "learning_rate": 0.0008445961319681457, + "loss": 6.5908, + "step": 1366 + }, + { + "epoch": 0.4665529010238908, + "grad_norm": 3.6052424907684326, + "learning_rate": 0.0008444823663253698, + "loss": 6.8923, + "step": 1367 + }, + { + "epoch": 0.4668941979522184, + "grad_norm": 3.6835129261016846, + "learning_rate": 0.0008443686006825939, + "loss": 6.4614, + "step": 1368 + }, + { + "epoch": 0.46723549488054605, + "grad_norm": 3.5373635292053223, + "learning_rate": 0.000844254835039818, + "loss": 6.6617, + "step": 1369 + }, + { + "epoch": 0.46757679180887374, + "grad_norm": 4.537879467010498, + "learning_rate": 0.0008441410693970421, + "loss": 5.4222, + "step": 1370 + }, + { + "epoch": 0.46791808873720137, + "grad_norm": 3.356199026107788, + "learning_rate": 0.0008440273037542663, + "loss": 6.792, + "step": 1371 + }, + { + "epoch": 0.468259385665529, + "grad_norm": 3.830232858657837, + "learning_rate": 0.0008439135381114903, + "loss": 6.6182, + "step": 1372 + }, + { + "epoch": 0.4686006825938567, + "grad_norm": 3.4203712940216064, + "learning_rate": 0.0008437997724687144, + "loss": 6.5533, + "step": 1373 + }, + { + "epoch": 0.4689419795221843, + "grad_norm": 3.5441665649414062, + "learning_rate": 0.0008436860068259385, + "loss": 6.5479, + "step": 1374 + }, + { + "epoch": 0.46928327645051193, + "grad_norm": 3.4909310340881348, + "learning_rate": 0.0008435722411831626, + "loss": 6.1958, + "step": 1375 + }, + { + "epoch": 0.4696245733788396, + "grad_norm": 3.6683506965637207, + "learning_rate": 0.0008434584755403867, + "loss": 6.6273, + "step": 1376 + }, + { + "epoch": 0.46996587030716724, + "grad_norm": 4.192461013793945, + "learning_rate": 0.000843344709897611, + "loss": 6.0141, + "step": 1377 + }, + { + "epoch": 0.47030716723549487, + "grad_norm": 3.5298233032226562, + "learning_rate": 0.0008432309442548351, + "loss": 6.9406, + "step": 1378 + }, + { + "epoch": 0.4706484641638225, + "grad_norm": 3.549668550491333, + "learning_rate": 0.0008431171786120592, + "loss": 6.8979, + "step": 1379 + }, + { + "epoch": 0.4709897610921502, + "grad_norm": 3.3617653846740723, + "learning_rate": 0.0008430034129692833, + "loss": 6.4427, + "step": 1380 + }, + { + "epoch": 0.4713310580204778, + "grad_norm": 3.317580461502075, + "learning_rate": 0.0008428896473265074, + "loss": 7.0131, + "step": 1381 + }, + { + "epoch": 0.47167235494880544, + "grad_norm": 3.481401205062866, + "learning_rate": 0.0008427758816837315, + "loss": 6.6665, + "step": 1382 + }, + { + "epoch": 0.4720136518771331, + "grad_norm": 3.4528825283050537, + "learning_rate": 0.0008426621160409557, + "loss": 6.6673, + "step": 1383 + }, + { + "epoch": 0.47235494880546075, + "grad_norm": 4.272742748260498, + "learning_rate": 0.0008425483503981798, + "loss": 5.9544, + "step": 1384 + }, + { + "epoch": 0.4726962457337884, + "grad_norm": 3.5958762168884277, + "learning_rate": 0.0008424345847554039, + "loss": 6.8734, + "step": 1385 + }, + { + "epoch": 0.47303754266211606, + "grad_norm": 3.564652681350708, + "learning_rate": 0.000842320819112628, + "loss": 6.846, + "step": 1386 + }, + { + "epoch": 0.4733788395904437, + "grad_norm": 3.3910999298095703, + "learning_rate": 0.0008422070534698521, + "loss": 6.3894, + "step": 1387 + }, + { + "epoch": 0.4737201365187713, + "grad_norm": 4.498713493347168, + "learning_rate": 0.0008420932878270763, + "loss": 6.7253, + "step": 1388 + }, + { + "epoch": 0.474061433447099, + "grad_norm": 3.539781332015991, + "learning_rate": 0.0008419795221843004, + "loss": 7.1748, + "step": 1389 + }, + { + "epoch": 0.47440273037542663, + "grad_norm": 3.6499907970428467, + "learning_rate": 0.0008418657565415245, + "loss": 7.0704, + "step": 1390 + }, + { + "epoch": 0.47474402730375426, + "grad_norm": 3.4689838886260986, + "learning_rate": 0.0008417519908987485, + "loss": 6.6226, + "step": 1391 + }, + { + "epoch": 0.4750853242320819, + "grad_norm": 3.3279478549957275, + "learning_rate": 0.0008416382252559726, + "loss": 6.6018, + "step": 1392 + }, + { + "epoch": 0.47542662116040957, + "grad_norm": 3.3303651809692383, + "learning_rate": 0.0008415244596131967, + "loss": 6.6632, + "step": 1393 + }, + { + "epoch": 0.4757679180887372, + "grad_norm": 3.5589680671691895, + "learning_rate": 0.000841410693970421, + "loss": 6.7845, + "step": 1394 + }, + { + "epoch": 0.4761092150170648, + "grad_norm": 3.484956741333008, + "learning_rate": 0.0008412969283276451, + "loss": 6.8862, + "step": 1395 + }, + { + "epoch": 0.4764505119453925, + "grad_norm": 4.03617525100708, + "learning_rate": 0.0008411831626848692, + "loss": 6.6096, + "step": 1396 + }, + { + "epoch": 0.47679180887372014, + "grad_norm": 3.343886375427246, + "learning_rate": 0.0008410693970420933, + "loss": 6.322, + "step": 1397 + }, + { + "epoch": 0.47713310580204776, + "grad_norm": 3.6128275394439697, + "learning_rate": 0.0008409556313993174, + "loss": 6.9015, + "step": 1398 + }, + { + "epoch": 0.47747440273037545, + "grad_norm": 3.4812707901000977, + "learning_rate": 0.0008408418657565415, + "loss": 6.8565, + "step": 1399 + }, + { + "epoch": 0.4778156996587031, + "grad_norm": 3.590444326400757, + "learning_rate": 0.0008407281001137657, + "loss": 6.534, + "step": 1400 + }, + { + "epoch": 0.4781569965870307, + "grad_norm": 3.251206159591675, + "learning_rate": 0.0008406143344709898, + "loss": 6.7066, + "step": 1401 + }, + { + "epoch": 0.4784982935153584, + "grad_norm": 3.3419792652130127, + "learning_rate": 0.0008405005688282139, + "loss": 6.9371, + "step": 1402 + }, + { + "epoch": 0.478839590443686, + "grad_norm": 3.3254618644714355, + "learning_rate": 0.000840386803185438, + "loss": 6.8618, + "step": 1403 + }, + { + "epoch": 0.47918088737201364, + "grad_norm": 5.28602409362793, + "learning_rate": 0.0008402730375426621, + "loss": 5.7656, + "step": 1404 + }, + { + "epoch": 0.47952218430034127, + "grad_norm": 4.599923610687256, + "learning_rate": 0.0008401592718998863, + "loss": 5.8085, + "step": 1405 + }, + { + "epoch": 0.47986348122866895, + "grad_norm": 3.606318712234497, + "learning_rate": 0.0008400455062571104, + "loss": 6.6817, + "step": 1406 + }, + { + "epoch": 0.4802047781569966, + "grad_norm": 3.839956283569336, + "learning_rate": 0.0008399317406143345, + "loss": 7.2384, + "step": 1407 + }, + { + "epoch": 0.4805460750853242, + "grad_norm": 3.5408711433410645, + "learning_rate": 0.0008398179749715586, + "loss": 6.7327, + "step": 1408 + }, + { + "epoch": 0.4808873720136519, + "grad_norm": 3.496644973754883, + "learning_rate": 0.0008397042093287828, + "loss": 6.9381, + "step": 1409 + }, + { + "epoch": 0.4812286689419795, + "grad_norm": 3.4285078048706055, + "learning_rate": 0.0008395904436860067, + "loss": 6.5183, + "step": 1410 + }, + { + "epoch": 0.48156996587030715, + "grad_norm": 3.322774648666382, + "learning_rate": 0.000839476678043231, + "loss": 6.3513, + "step": 1411 + }, + { + "epoch": 0.48191126279863483, + "grad_norm": 3.334423542022705, + "learning_rate": 0.0008393629124004551, + "loss": 6.9284, + "step": 1412 + }, + { + "epoch": 0.48225255972696246, + "grad_norm": 3.4194326400756836, + "learning_rate": 0.0008392491467576792, + "loss": 7.0258, + "step": 1413 + }, + { + "epoch": 0.4825938566552901, + "grad_norm": 5.742277145385742, + "learning_rate": 0.0008391353811149033, + "loss": 5.9465, + "step": 1414 + }, + { + "epoch": 0.48293515358361777, + "grad_norm": 3.6702628135681152, + "learning_rate": 0.0008390216154721274, + "loss": 6.7509, + "step": 1415 + }, + { + "epoch": 0.4832764505119454, + "grad_norm": 3.5624711513519287, + "learning_rate": 0.0008389078498293515, + "loss": 7.3361, + "step": 1416 + }, + { + "epoch": 0.483617747440273, + "grad_norm": 3.3893463611602783, + "learning_rate": 0.0008387940841865757, + "loss": 6.5935, + "step": 1417 + }, + { + "epoch": 0.48395904436860065, + "grad_norm": 4.454052448272705, + "learning_rate": 0.0008386803185437998, + "loss": 6.352, + "step": 1418 + }, + { + "epoch": 0.48430034129692834, + "grad_norm": 3.6366562843322754, + "learning_rate": 0.0008385665529010239, + "loss": 6.7991, + "step": 1419 + }, + { + "epoch": 0.48464163822525597, + "grad_norm": 3.4529809951782227, + "learning_rate": 0.000838452787258248, + "loss": 6.6057, + "step": 1420 + }, + { + "epoch": 0.4849829351535836, + "grad_norm": 3.475712776184082, + "learning_rate": 0.0008383390216154721, + "loss": 5.9796, + "step": 1421 + }, + { + "epoch": 0.4853242320819113, + "grad_norm": 3.537789821624756, + "learning_rate": 0.0008382252559726962, + "loss": 7.112, + "step": 1422 + }, + { + "epoch": 0.4856655290102389, + "grad_norm": 3.504626750946045, + "learning_rate": 0.0008381114903299204, + "loss": 6.8114, + "step": 1423 + }, + { + "epoch": 0.48600682593856653, + "grad_norm": 3.580801486968994, + "learning_rate": 0.0008379977246871445, + "loss": 6.4932, + "step": 1424 + }, + { + "epoch": 0.4863481228668942, + "grad_norm": 3.4925484657287598, + "learning_rate": 0.0008378839590443686, + "loss": 6.7872, + "step": 1425 + }, + { + "epoch": 0.48668941979522184, + "grad_norm": 3.814373254776001, + "learning_rate": 0.0008377701934015928, + "loss": 6.9086, + "step": 1426 + }, + { + "epoch": 0.4870307167235495, + "grad_norm": 3.2810380458831787, + "learning_rate": 0.0008376564277588169, + "loss": 6.6771, + "step": 1427 + }, + { + "epoch": 0.48737201365187716, + "grad_norm": 3.3674893379211426, + "learning_rate": 0.0008375426621160411, + "loss": 6.7597, + "step": 1428 + }, + { + "epoch": 0.4877133105802048, + "grad_norm": 3.3947198390960693, + "learning_rate": 0.0008374288964732652, + "loss": 6.5867, + "step": 1429 + }, + { + "epoch": 0.4880546075085324, + "grad_norm": 3.6930675506591797, + "learning_rate": 0.0008373151308304892, + "loss": 6.7218, + "step": 1430 + }, + { + "epoch": 0.4883959044368601, + "grad_norm": 3.436976909637451, + "learning_rate": 0.0008372013651877133, + "loss": 6.906, + "step": 1431 + }, + { + "epoch": 0.4887372013651877, + "grad_norm": 3.462581157684326, + "learning_rate": 0.0008370875995449374, + "loss": 6.7478, + "step": 1432 + }, + { + "epoch": 0.48907849829351535, + "grad_norm": 3.483604907989502, + "learning_rate": 0.0008369738339021615, + "loss": 6.3906, + "step": 1433 + }, + { + "epoch": 0.489419795221843, + "grad_norm": 3.3858816623687744, + "learning_rate": 0.0008368600682593857, + "loss": 6.3999, + "step": 1434 + }, + { + "epoch": 0.48976109215017066, + "grad_norm": 3.541221857070923, + "learning_rate": 0.0008367463026166098, + "loss": 7.3143, + "step": 1435 + }, + { + "epoch": 0.4901023890784983, + "grad_norm": 3.380033254623413, + "learning_rate": 0.0008366325369738339, + "loss": 6.7311, + "step": 1436 + }, + { + "epoch": 0.4904436860068259, + "grad_norm": 3.423128604888916, + "learning_rate": 0.000836518771331058, + "loss": 6.5591, + "step": 1437 + }, + { + "epoch": 0.4907849829351536, + "grad_norm": 3.3924520015716553, + "learning_rate": 0.0008364050056882821, + "loss": 6.8674, + "step": 1438 + }, + { + "epoch": 0.49112627986348123, + "grad_norm": 3.7471725940704346, + "learning_rate": 0.0008362912400455062, + "loss": 6.6909, + "step": 1439 + }, + { + "epoch": 0.49146757679180886, + "grad_norm": 3.535061836242676, + "learning_rate": 0.0008361774744027304, + "loss": 6.5537, + "step": 1440 + }, + { + "epoch": 0.49180887372013654, + "grad_norm": 3.317924976348877, + "learning_rate": 0.0008360637087599545, + "loss": 6.6799, + "step": 1441 + }, + { + "epoch": 0.49215017064846417, + "grad_norm": 5.021495342254639, + "learning_rate": 0.0008359499431171786, + "loss": 5.363, + "step": 1442 + }, + { + "epoch": 0.4924914675767918, + "grad_norm": 3.5335588455200195, + "learning_rate": 0.0008358361774744028, + "loss": 6.8348, + "step": 1443 + }, + { + "epoch": 0.4928327645051195, + "grad_norm": 3.5306732654571533, + "learning_rate": 0.0008357224118316269, + "loss": 6.6957, + "step": 1444 + }, + { + "epoch": 0.4931740614334471, + "grad_norm": 3.4954781532287598, + "learning_rate": 0.0008356086461888511, + "loss": 6.8805, + "step": 1445 + }, + { + "epoch": 0.49351535836177474, + "grad_norm": 3.4456214904785156, + "learning_rate": 0.0008354948805460752, + "loss": 6.6546, + "step": 1446 + }, + { + "epoch": 0.49385665529010236, + "grad_norm": 3.323965311050415, + "learning_rate": 0.0008353811149032993, + "loss": 6.8501, + "step": 1447 + }, + { + "epoch": 0.49419795221843005, + "grad_norm": 3.692003011703491, + "learning_rate": 0.0008352673492605234, + "loss": 6.282, + "step": 1448 + }, + { + "epoch": 0.4945392491467577, + "grad_norm": 3.3409910202026367, + "learning_rate": 0.0008351535836177474, + "loss": 6.6912, + "step": 1449 + }, + { + "epoch": 0.4948805460750853, + "grad_norm": 3.5133414268493652, + "learning_rate": 0.0008350398179749715, + "loss": 6.0178, + "step": 1450 + }, + { + "epoch": 0.495221843003413, + "grad_norm": 4.341811656951904, + "learning_rate": 0.0008349260523321957, + "loss": 6.4182, + "step": 1451 + }, + { + "epoch": 0.4955631399317406, + "grad_norm": 4.636251926422119, + "learning_rate": 0.0008348122866894198, + "loss": 6.824, + "step": 1452 + }, + { + "epoch": 0.49590443686006824, + "grad_norm": 5.0915632247924805, + "learning_rate": 0.0008346985210466439, + "loss": 5.3702, + "step": 1453 + }, + { + "epoch": 0.4962457337883959, + "grad_norm": 3.465383529663086, + "learning_rate": 0.000834584755403868, + "loss": 6.642, + "step": 1454 + }, + { + "epoch": 0.49658703071672355, + "grad_norm": 3.462819814682007, + "learning_rate": 0.0008344709897610921, + "loss": 6.5499, + "step": 1455 + }, + { + "epoch": 0.4969283276450512, + "grad_norm": 3.4404056072235107, + "learning_rate": 0.0008343572241183162, + "loss": 6.8499, + "step": 1456 + }, + { + "epoch": 0.49726962457337887, + "grad_norm": 3.342381238937378, + "learning_rate": 0.0008342434584755404, + "loss": 6.5637, + "step": 1457 + }, + { + "epoch": 0.4976109215017065, + "grad_norm": 3.599907875061035, + "learning_rate": 0.0008341296928327645, + "loss": 6.5076, + "step": 1458 + }, + { + "epoch": 0.4979522184300341, + "grad_norm": 3.521761178970337, + "learning_rate": 0.0008340159271899886, + "loss": 6.557, + "step": 1459 + }, + { + "epoch": 0.49829351535836175, + "grad_norm": 4.715250015258789, + "learning_rate": 0.0008339021615472128, + "loss": 5.1495, + "step": 1460 + }, + { + "epoch": 0.49863481228668943, + "grad_norm": 3.4854562282562256, + "learning_rate": 0.0008337883959044369, + "loss": 6.7593, + "step": 1461 + }, + { + "epoch": 0.49897610921501706, + "grad_norm": 3.903296947479248, + "learning_rate": 0.000833674630261661, + "loss": 6.1174, + "step": 1462 + }, + { + "epoch": 0.4993174061433447, + "grad_norm": 3.4270193576812744, + "learning_rate": 0.0008335608646188852, + "loss": 6.5968, + "step": 1463 + }, + { + "epoch": 0.49965870307167237, + "grad_norm": 8.496833801269531, + "learning_rate": 0.0008334470989761093, + "loss": 6.2964, + "step": 1464 + }, + { + "epoch": 0.5, + "grad_norm": 3.6175074577331543, + "learning_rate": 0.0008333333333333334, + "loss": 6.989, + "step": 1465 + }, + { + "epoch": 0.5003412969283276, + "grad_norm": 3.5683434009552, + "learning_rate": 0.0008332195676905575, + "loss": 7.0006, + "step": 1466 + }, + { + "epoch": 0.5006825938566553, + "grad_norm": 3.887437582015991, + "learning_rate": 0.0008331058020477816, + "loss": 6.1925, + "step": 1467 + }, + { + "epoch": 0.5010238907849829, + "grad_norm": 3.302295207977295, + "learning_rate": 0.0008329920364050058, + "loss": 6.4899, + "step": 1468 + }, + { + "epoch": 0.5013651877133106, + "grad_norm": 3.6436431407928467, + "learning_rate": 0.0008328782707622298, + "loss": 6.103, + "step": 1469 + }, + { + "epoch": 0.5017064846416383, + "grad_norm": 3.531428337097168, + "learning_rate": 0.0008327645051194539, + "loss": 6.1611, + "step": 1470 + }, + { + "epoch": 0.5020477815699659, + "grad_norm": 3.350801944732666, + "learning_rate": 0.000832650739476678, + "loss": 6.4045, + "step": 1471 + }, + { + "epoch": 0.5023890784982935, + "grad_norm": 4.109683513641357, + "learning_rate": 0.0008325369738339021, + "loss": 3.9738, + "step": 1472 + }, + { + "epoch": 0.5027303754266211, + "grad_norm": 3.4948461055755615, + "learning_rate": 0.0008324232081911262, + "loss": 6.8257, + "step": 1473 + }, + { + "epoch": 0.5030716723549488, + "grad_norm": 3.710834264755249, + "learning_rate": 0.0008323094425483504, + "loss": 6.6345, + "step": 1474 + }, + { + "epoch": 0.5034129692832765, + "grad_norm": 3.6197144985198975, + "learning_rate": 0.0008321956769055745, + "loss": 6.386, + "step": 1475 + }, + { + "epoch": 0.5037542662116041, + "grad_norm": 3.6927449703216553, + "learning_rate": 0.0008320819112627986, + "loss": 6.3214, + "step": 1476 + }, + { + "epoch": 0.5040955631399318, + "grad_norm": 3.6893727779388428, + "learning_rate": 0.0008319681456200228, + "loss": 6.8485, + "step": 1477 + }, + { + "epoch": 0.5044368600682594, + "grad_norm": 4.398456573486328, + "learning_rate": 0.0008318543799772469, + "loss": 6.3134, + "step": 1478 + }, + { + "epoch": 0.504778156996587, + "grad_norm": 3.5371034145355225, + "learning_rate": 0.000831740614334471, + "loss": 6.685, + "step": 1479 + }, + { + "epoch": 0.5051194539249146, + "grad_norm": 3.924985885620117, + "learning_rate": 0.0008316268486916952, + "loss": 6.4405, + "step": 1480 + }, + { + "epoch": 0.5054607508532423, + "grad_norm": 3.409550666809082, + "learning_rate": 0.0008315130830489193, + "loss": 6.5593, + "step": 1481 + }, + { + "epoch": 0.50580204778157, + "grad_norm": 3.491574287414551, + "learning_rate": 0.0008313993174061434, + "loss": 6.2359, + "step": 1482 + }, + { + "epoch": 0.5061433447098976, + "grad_norm": 3.836836338043213, + "learning_rate": 0.0008312855517633675, + "loss": 6.3855, + "step": 1483 + }, + { + "epoch": 0.5064846416382253, + "grad_norm": 5.168152809143066, + "learning_rate": 0.0008311717861205916, + "loss": 4.9784, + "step": 1484 + }, + { + "epoch": 0.5068259385665529, + "grad_norm": 9.451974868774414, + "learning_rate": 0.0008310580204778158, + "loss": 6.8587, + "step": 1485 + }, + { + "epoch": 0.5071672354948805, + "grad_norm": 3.954226493835449, + "learning_rate": 0.0008309442548350399, + "loss": 7.1898, + "step": 1486 + }, + { + "epoch": 0.5075085324232081, + "grad_norm": 5.50667142868042, + "learning_rate": 0.000830830489192264, + "loss": 6.3041, + "step": 1487 + }, + { + "epoch": 0.5078498293515359, + "grad_norm": 3.840693235397339, + "learning_rate": 0.000830716723549488, + "loss": 6.9871, + "step": 1488 + }, + { + "epoch": 0.5081911262798635, + "grad_norm": 8.774356842041016, + "learning_rate": 0.0008306029579067121, + "loss": 8.2422, + "step": 1489 + }, + { + "epoch": 0.5085324232081911, + "grad_norm": 3.4625966548919678, + "learning_rate": 0.0008304891922639362, + "loss": 6.9081, + "step": 1490 + }, + { + "epoch": 0.5088737201365188, + "grad_norm": 3.4914445877075195, + "learning_rate": 0.0008303754266211604, + "loss": 6.7927, + "step": 1491 + }, + { + "epoch": 0.5092150170648464, + "grad_norm": 3.4084482192993164, + "learning_rate": 0.0008302616609783845, + "loss": 7.4133, + "step": 1492 + }, + { + "epoch": 0.509556313993174, + "grad_norm": 7.066652774810791, + "learning_rate": 0.0008301478953356087, + "loss": 6.348, + "step": 1493 + }, + { + "epoch": 0.5098976109215017, + "grad_norm": 4.37141227722168, + "learning_rate": 0.0008300341296928328, + "loss": 6.462, + "step": 1494 + }, + { + "epoch": 0.5102389078498294, + "grad_norm": 5.614762783050537, + "learning_rate": 0.0008299203640500569, + "loss": 6.7, + "step": 1495 + }, + { + "epoch": 0.510580204778157, + "grad_norm": 4.1260271072387695, + "learning_rate": 0.000829806598407281, + "loss": 6.0831, + "step": 1496 + }, + { + "epoch": 0.5109215017064846, + "grad_norm": 3.777806282043457, + "learning_rate": 0.0008296928327645052, + "loss": 7.0977, + "step": 1497 + }, + { + "epoch": 0.5112627986348123, + "grad_norm": 3.448523759841919, + "learning_rate": 0.0008295790671217293, + "loss": 6.7309, + "step": 1498 + }, + { + "epoch": 0.5116040955631399, + "grad_norm": 3.3855607509613037, + "learning_rate": 0.0008294653014789534, + "loss": 6.2709, + "step": 1499 + }, + { + "epoch": 0.5119453924914675, + "grad_norm": 3.435232162475586, + "learning_rate": 0.0008293515358361775, + "loss": 6.3555, + "step": 1500 + }, + { + "epoch": 0.5122866894197953, + "grad_norm": 3.2398738861083984, + "learning_rate": 0.0008292377701934016, + "loss": 3.8611, + "step": 1501 + }, + { + "epoch": 0.5126279863481229, + "grad_norm": 4.865748882293701, + "learning_rate": 0.0008291240045506257, + "loss": 6.4543, + "step": 1502 + }, + { + "epoch": 0.5129692832764505, + "grad_norm": 4.460921287536621, + "learning_rate": 0.0008290102389078499, + "loss": 6.4598, + "step": 1503 + }, + { + "epoch": 0.5133105802047782, + "grad_norm": 4.03560209274292, + "learning_rate": 0.000828896473265074, + "loss": 6.5586, + "step": 1504 + }, + { + "epoch": 0.5136518771331058, + "grad_norm": 3.64315128326416, + "learning_rate": 0.0008287827076222981, + "loss": 7.1097, + "step": 1505 + }, + { + "epoch": 0.5139931740614334, + "grad_norm": 4.334589004516602, + "learning_rate": 0.0008286689419795222, + "loss": 6.402, + "step": 1506 + }, + { + "epoch": 0.514334470989761, + "grad_norm": 27.29024314880371, + "learning_rate": 0.0008285551763367463, + "loss": 7.7064, + "step": 1507 + }, + { + "epoch": 0.5146757679180888, + "grad_norm": 4.106762886047363, + "learning_rate": 0.0008284414106939704, + "loss": 6.5886, + "step": 1508 + }, + { + "epoch": 0.5150170648464164, + "grad_norm": 4.14035701751709, + "learning_rate": 0.0008283276450511945, + "loss": 7.2057, + "step": 1509 + }, + { + "epoch": 0.515358361774744, + "grad_norm": 3.5756776332855225, + "learning_rate": 0.0008282138794084187, + "loss": 6.7014, + "step": 1510 + }, + { + "epoch": 0.5156996587030717, + "grad_norm": 3.2851498126983643, + "learning_rate": 0.0008281001137656428, + "loss": 6.6437, + "step": 1511 + }, + { + "epoch": 0.5160409556313993, + "grad_norm": 3.490445375442505, + "learning_rate": 0.0008279863481228669, + "loss": 6.464, + "step": 1512 + }, + { + "epoch": 0.5163822525597269, + "grad_norm": 3.3551950454711914, + "learning_rate": 0.000827872582480091, + "loss": 6.5594, + "step": 1513 + }, + { + "epoch": 0.5167235494880547, + "grad_norm": 3.4405341148376465, + "learning_rate": 0.0008277588168373152, + "loss": 6.8159, + "step": 1514 + }, + { + "epoch": 0.5170648464163823, + "grad_norm": 3.30784010887146, + "learning_rate": 0.0008276450511945393, + "loss": 6.8372, + "step": 1515 + }, + { + "epoch": 0.5174061433447099, + "grad_norm": 3.6393938064575195, + "learning_rate": 0.0008275312855517634, + "loss": 6.9488, + "step": 1516 + }, + { + "epoch": 0.5177474402730375, + "grad_norm": 3.558758497238159, + "learning_rate": 0.0008274175199089875, + "loss": 6.452, + "step": 1517 + }, + { + "epoch": 0.5180887372013652, + "grad_norm": 3.8486268520355225, + "learning_rate": 0.0008273037542662116, + "loss": 6.601, + "step": 1518 + }, + { + "epoch": 0.5184300341296928, + "grad_norm": 3.421262741088867, + "learning_rate": 0.0008271899886234357, + "loss": 6.6491, + "step": 1519 + }, + { + "epoch": 0.5187713310580204, + "grad_norm": 3.4998815059661865, + "learning_rate": 0.0008270762229806599, + "loss": 6.5504, + "step": 1520 + }, + { + "epoch": 0.5191126279863482, + "grad_norm": 3.533386707305908, + "learning_rate": 0.000826962457337884, + "loss": 6.797, + "step": 1521 + }, + { + "epoch": 0.5194539249146758, + "grad_norm": 3.6013705730438232, + "learning_rate": 0.0008268486916951081, + "loss": 7.1215, + "step": 1522 + }, + { + "epoch": 0.5197952218430034, + "grad_norm": 3.9131691455841064, + "learning_rate": 0.0008267349260523322, + "loss": 6.4156, + "step": 1523 + }, + { + "epoch": 0.520136518771331, + "grad_norm": 3.614016532897949, + "learning_rate": 0.0008266211604095563, + "loss": 6.9913, + "step": 1524 + }, + { + "epoch": 0.5204778156996587, + "grad_norm": 4.081565856933594, + "learning_rate": 0.0008265073947667804, + "loss": 6.7432, + "step": 1525 + }, + { + "epoch": 0.5208191126279863, + "grad_norm": 3.426879405975342, + "learning_rate": 0.0008263936291240047, + "loss": 7.093, + "step": 1526 + }, + { + "epoch": 0.521160409556314, + "grad_norm": 3.377384662628174, + "learning_rate": 0.0008262798634812287, + "loss": 6.8175, + "step": 1527 + }, + { + "epoch": 0.5215017064846417, + "grad_norm": 3.400791883468628, + "learning_rate": 0.0008261660978384528, + "loss": 6.8663, + "step": 1528 + }, + { + "epoch": 0.5218430034129693, + "grad_norm": 3.3176791667938232, + "learning_rate": 0.0008260523321956769, + "loss": 7.0355, + "step": 1529 + }, + { + "epoch": 0.5221843003412969, + "grad_norm": 3.3764326572418213, + "learning_rate": 0.000825938566552901, + "loss": 6.8315, + "step": 1530 + }, + { + "epoch": 0.5225255972696246, + "grad_norm": 3.2921946048736572, + "learning_rate": 0.0008258248009101252, + "loss": 6.869, + "step": 1531 + }, + { + "epoch": 0.5228668941979522, + "grad_norm": 3.3354740142822266, + "learning_rate": 0.0008257110352673493, + "loss": 6.8916, + "step": 1532 + }, + { + "epoch": 0.5232081911262798, + "grad_norm": 7.8725199699401855, + "learning_rate": 0.0008255972696245734, + "loss": 5.065, + "step": 1533 + }, + { + "epoch": 0.5235494880546075, + "grad_norm": 3.697795867919922, + "learning_rate": 0.0008254835039817975, + "loss": 6.9579, + "step": 1534 + }, + { + "epoch": 0.5238907849829352, + "grad_norm": 8.069738388061523, + "learning_rate": 0.0008253697383390216, + "loss": 6.3024, + "step": 1535 + }, + { + "epoch": 0.5242320819112628, + "grad_norm": 4.431558132171631, + "learning_rate": 0.0008252559726962457, + "loss": 6.1488, + "step": 1536 + }, + { + "epoch": 0.5245733788395904, + "grad_norm": 4.185642719268799, + "learning_rate": 0.0008251422070534699, + "loss": 7.1044, + "step": 1537 + }, + { + "epoch": 0.5249146757679181, + "grad_norm": 3.705733060836792, + "learning_rate": 0.000825028441410694, + "loss": 6.8109, + "step": 1538 + }, + { + "epoch": 0.5252559726962457, + "grad_norm": 3.4032537937164307, + "learning_rate": 0.0008249146757679181, + "loss": 6.6804, + "step": 1539 + }, + { + "epoch": 0.5255972696245734, + "grad_norm": 3.183061122894287, + "learning_rate": 0.0008248009101251422, + "loss": 6.3562, + "step": 1540 + }, + { + "epoch": 0.525938566552901, + "grad_norm": 3.406890630722046, + "learning_rate": 0.0008246871444823663, + "loss": 6.4419, + "step": 1541 + }, + { + "epoch": 0.5262798634812287, + "grad_norm": 3.5061683654785156, + "learning_rate": 0.0008245733788395904, + "loss": 6.7721, + "step": 1542 + }, + { + "epoch": 0.5266211604095563, + "grad_norm": 3.441354751586914, + "learning_rate": 0.0008244596131968147, + "loss": 6.1766, + "step": 1543 + }, + { + "epoch": 0.5269624573378839, + "grad_norm": 4.2335429191589355, + "learning_rate": 0.0008243458475540388, + "loss": 6.2502, + "step": 1544 + }, + { + "epoch": 0.5273037542662116, + "grad_norm": 3.5456383228302, + "learning_rate": 0.0008242320819112629, + "loss": 6.5149, + "step": 1545 + }, + { + "epoch": 0.5276450511945392, + "grad_norm": 3.58383846282959, + "learning_rate": 0.000824118316268487, + "loss": 6.4269, + "step": 1546 + }, + { + "epoch": 0.5279863481228669, + "grad_norm": 4.371920585632324, + "learning_rate": 0.000824004550625711, + "loss": 6.4354, + "step": 1547 + }, + { + "epoch": 0.5283276450511946, + "grad_norm": 3.588916063308716, + "learning_rate": 0.0008238907849829351, + "loss": 6.6855, + "step": 1548 + }, + { + "epoch": 0.5286689419795222, + "grad_norm": 4.130184650421143, + "learning_rate": 0.0008237770193401593, + "loss": 6.1499, + "step": 1549 + }, + { + "epoch": 0.5290102389078498, + "grad_norm": 4.080915927886963, + "learning_rate": 0.0008236632536973834, + "loss": 5.4587, + "step": 1550 + }, + { + "epoch": 0.5293515358361774, + "grad_norm": 3.7122843265533447, + "learning_rate": 0.0008235494880546075, + "loss": 6.3315, + "step": 1551 + }, + { + "epoch": 0.5296928327645051, + "grad_norm": 3.8710949420928955, + "learning_rate": 0.0008234357224118316, + "loss": 6.4841, + "step": 1552 + }, + { + "epoch": 0.5300341296928328, + "grad_norm": 3.440357208251953, + "learning_rate": 0.0008233219567690557, + "loss": 6.9585, + "step": 1553 + }, + { + "epoch": 0.5303754266211604, + "grad_norm": 3.3393759727478027, + "learning_rate": 0.0008232081911262799, + "loss": 6.8411, + "step": 1554 + }, + { + "epoch": 0.5307167235494881, + "grad_norm": 3.5455715656280518, + "learning_rate": 0.000823094425483504, + "loss": 6.6823, + "step": 1555 + }, + { + "epoch": 0.5310580204778157, + "grad_norm": 3.815316677093506, + "learning_rate": 0.0008229806598407281, + "loss": 6.1607, + "step": 1556 + }, + { + "epoch": 0.5313993174061433, + "grad_norm": 3.953657388687134, + "learning_rate": 0.0008228668941979522, + "loss": 6.6442, + "step": 1557 + }, + { + "epoch": 0.531740614334471, + "grad_norm": 3.4265329837799072, + "learning_rate": 0.0008227531285551763, + "loss": 6.6399, + "step": 1558 + }, + { + "epoch": 0.5320819112627987, + "grad_norm": 3.6078526973724365, + "learning_rate": 0.0008226393629124004, + "loss": 6.654, + "step": 1559 + }, + { + "epoch": 0.5324232081911263, + "grad_norm": 3.5532915592193604, + "learning_rate": 0.0008225255972696247, + "loss": 6.0045, + "step": 1560 + }, + { + "epoch": 0.532764505119454, + "grad_norm": 3.4579410552978516, + "learning_rate": 0.0008224118316268488, + "loss": 6.6127, + "step": 1561 + }, + { + "epoch": 0.5331058020477816, + "grad_norm": 3.489680051803589, + "learning_rate": 0.0008222980659840729, + "loss": 7.2289, + "step": 1562 + }, + { + "epoch": 0.5334470989761092, + "grad_norm": 3.981506824493408, + "learning_rate": 0.000822184300341297, + "loss": 6.8315, + "step": 1563 + }, + { + "epoch": 0.5337883959044368, + "grad_norm": 3.5434958934783936, + "learning_rate": 0.0008220705346985211, + "loss": 6.4467, + "step": 1564 + }, + { + "epoch": 0.5341296928327645, + "grad_norm": 20.380250930786133, + "learning_rate": 0.0008219567690557452, + "loss": 5.3638, + "step": 1565 + }, + { + "epoch": 0.5344709897610922, + "grad_norm": 3.482201337814331, + "learning_rate": 0.0008218430034129693, + "loss": 6.8673, + "step": 1566 + }, + { + "epoch": 0.5348122866894198, + "grad_norm": 3.6969149112701416, + "learning_rate": 0.0008217292377701934, + "loss": 6.7083, + "step": 1567 + }, + { + "epoch": 0.5351535836177475, + "grad_norm": 3.4104528427124023, + "learning_rate": 0.0008216154721274175, + "loss": 6.7722, + "step": 1568 + }, + { + "epoch": 0.5354948805460751, + "grad_norm": 3.611492872238159, + "learning_rate": 0.0008215017064846416, + "loss": 6.3124, + "step": 1569 + }, + { + "epoch": 0.5358361774744027, + "grad_norm": 16.186843872070312, + "learning_rate": 0.0008213879408418657, + "loss": 8.7558, + "step": 1570 + }, + { + "epoch": 0.5361774744027303, + "grad_norm": 3.5169475078582764, + "learning_rate": 0.0008212741751990899, + "loss": 7.0439, + "step": 1571 + }, + { + "epoch": 0.5365187713310581, + "grad_norm": 3.61405611038208, + "learning_rate": 0.000821160409556314, + "loss": 6.2176, + "step": 1572 + }, + { + "epoch": 0.5368600682593857, + "grad_norm": 3.3224093914031982, + "learning_rate": 0.0008210466439135381, + "loss": 6.693, + "step": 1573 + }, + { + "epoch": 0.5372013651877133, + "grad_norm": 3.339376211166382, + "learning_rate": 0.0008209328782707622, + "loss": 6.3451, + "step": 1574 + }, + { + "epoch": 0.537542662116041, + "grad_norm": 3.3219499588012695, + "learning_rate": 0.0008208191126279863, + "loss": 6.9464, + "step": 1575 + }, + { + "epoch": 0.5378839590443686, + "grad_norm": 3.3583548069000244, + "learning_rate": 0.0008207053469852104, + "loss": 6.9545, + "step": 1576 + }, + { + "epoch": 0.5382252559726962, + "grad_norm": 3.8789596557617188, + "learning_rate": 0.0008205915813424347, + "loss": 6.5906, + "step": 1577 + }, + { + "epoch": 0.5385665529010238, + "grad_norm": 3.4868991374969482, + "learning_rate": 0.0008204778156996588, + "loss": 6.8179, + "step": 1578 + }, + { + "epoch": 0.5389078498293516, + "grad_norm": 3.5278048515319824, + "learning_rate": 0.0008203640500568829, + "loss": 6.8537, + "step": 1579 + }, + { + "epoch": 0.5392491467576792, + "grad_norm": 3.1563076972961426, + "learning_rate": 0.000820250284414107, + "loss": 6.7337, + "step": 1580 + }, + { + "epoch": 0.5395904436860068, + "grad_norm": 3.347745418548584, + "learning_rate": 0.0008201365187713311, + "loss": 6.8519, + "step": 1581 + }, + { + "epoch": 0.5399317406143345, + "grad_norm": 3.3623850345611572, + "learning_rate": 0.0008200227531285552, + "loss": 6.7128, + "step": 1582 + }, + { + "epoch": 0.5402730375426621, + "grad_norm": 3.4226489067077637, + "learning_rate": 0.0008199089874857794, + "loss": 6.9467, + "step": 1583 + }, + { + "epoch": 0.5406143344709897, + "grad_norm": 3.467519521713257, + "learning_rate": 0.0008197952218430035, + "loss": 7.0993, + "step": 1584 + }, + { + "epoch": 0.5409556313993175, + "grad_norm": 4.463557720184326, + "learning_rate": 0.0008196814562002275, + "loss": 6.207, + "step": 1585 + }, + { + "epoch": 0.5412969283276451, + "grad_norm": 7.1371846199035645, + "learning_rate": 0.0008195676905574516, + "loss": 6.115, + "step": 1586 + }, + { + "epoch": 0.5416382252559727, + "grad_norm": 4.31298828125, + "learning_rate": 0.0008194539249146757, + "loss": 6.4027, + "step": 1587 + }, + { + "epoch": 0.5419795221843003, + "grad_norm": 3.982283115386963, + "learning_rate": 0.0008193401592718998, + "loss": 6.9022, + "step": 1588 + }, + { + "epoch": 0.542320819112628, + "grad_norm": 3.6711089611053467, + "learning_rate": 0.000819226393629124, + "loss": 6.7532, + "step": 1589 + }, + { + "epoch": 0.5426621160409556, + "grad_norm": 3.598482370376587, + "learning_rate": 0.0008191126279863481, + "loss": 6.0211, + "step": 1590 + }, + { + "epoch": 0.5430034129692832, + "grad_norm": 3.518918514251709, + "learning_rate": 0.0008189988623435722, + "loss": 6.7096, + "step": 1591 + }, + { + "epoch": 0.543344709897611, + "grad_norm": 3.3870673179626465, + "learning_rate": 0.0008188850967007963, + "loss": 6.8977, + "step": 1592 + }, + { + "epoch": 0.5436860068259386, + "grad_norm": 3.3564047813415527, + "learning_rate": 0.0008187713310580204, + "loss": 6.5459, + "step": 1593 + }, + { + "epoch": 0.5440273037542662, + "grad_norm": 3.4745521545410156, + "learning_rate": 0.0008186575654152447, + "loss": 6.3275, + "step": 1594 + }, + { + "epoch": 0.5443686006825939, + "grad_norm": 3.4397785663604736, + "learning_rate": 0.0008185437997724688, + "loss": 6.644, + "step": 1595 + }, + { + "epoch": 0.5447098976109215, + "grad_norm": 3.6341989040374756, + "learning_rate": 0.0008184300341296929, + "loss": 6.3724, + "step": 1596 + }, + { + "epoch": 0.5450511945392491, + "grad_norm": 3.4546432495117188, + "learning_rate": 0.000818316268486917, + "loss": 6.0687, + "step": 1597 + }, + { + "epoch": 0.5453924914675768, + "grad_norm": 3.2902228832244873, + "learning_rate": 0.0008182025028441411, + "loss": 6.7731, + "step": 1598 + }, + { + "epoch": 0.5457337883959045, + "grad_norm": 21.545167922973633, + "learning_rate": 0.0008180887372013652, + "loss": 9.0693, + "step": 1599 + }, + { + "epoch": 0.5460750853242321, + "grad_norm": 4.197141170501709, + "learning_rate": 0.0008179749715585894, + "loss": 6.2883, + "step": 1600 + }, + { + "epoch": 0.5464163822525597, + "grad_norm": 3.763883113861084, + "learning_rate": 0.0008178612059158135, + "loss": 6.7896, + "step": 1601 + }, + { + "epoch": 0.5467576791808874, + "grad_norm": 3.678478717803955, + "learning_rate": 0.0008177474402730376, + "loss": 6.6968, + "step": 1602 + }, + { + "epoch": 0.547098976109215, + "grad_norm": 4.786699295043945, + "learning_rate": 0.0008176336746302617, + "loss": 6.342, + "step": 1603 + }, + { + "epoch": 0.5474402730375426, + "grad_norm": 2.2955498695373535, + "learning_rate": 0.0008175199089874858, + "loss": 3.6378, + "step": 1604 + }, + { + "epoch": 0.5477815699658704, + "grad_norm": 3.439753293991089, + "learning_rate": 0.0008174061433447098, + "loss": 6.7798, + "step": 1605 + }, + { + "epoch": 0.548122866894198, + "grad_norm": 4.1457839012146, + "learning_rate": 0.000817292377701934, + "loss": 6.3028, + "step": 1606 + }, + { + "epoch": 0.5484641638225256, + "grad_norm": 3.4417805671691895, + "learning_rate": 0.0008171786120591581, + "loss": 7.0572, + "step": 1607 + }, + { + "epoch": 0.5488054607508532, + "grad_norm": 3.348179578781128, + "learning_rate": 0.0008170648464163822, + "loss": 6.7471, + "step": 1608 + }, + { + "epoch": 0.5491467576791809, + "grad_norm": 3.4743990898132324, + "learning_rate": 0.0008169510807736063, + "loss": 7.0257, + "step": 1609 + }, + { + "epoch": 0.5494880546075085, + "grad_norm": 3.894490957260132, + "learning_rate": 0.0008168373151308304, + "loss": 6.1222, + "step": 1610 + }, + { + "epoch": 0.5498293515358362, + "grad_norm": 3.6673929691314697, + "learning_rate": 0.0008167235494880547, + "loss": 6.5822, + "step": 1611 + }, + { + "epoch": 0.5501706484641639, + "grad_norm": 3.5163426399230957, + "learning_rate": 0.0008166097838452788, + "loss": 6.7198, + "step": 1612 + }, + { + "epoch": 0.5505119453924915, + "grad_norm": 4.025699138641357, + "learning_rate": 0.0008164960182025029, + "loss": 6.3125, + "step": 1613 + }, + { + "epoch": 0.5508532423208191, + "grad_norm": 4.222406387329102, + "learning_rate": 0.000816382252559727, + "loss": 7.1646, + "step": 1614 + }, + { + "epoch": 0.5511945392491467, + "grad_norm": 6.100660800933838, + "learning_rate": 0.0008162684869169511, + "loss": 6.3055, + "step": 1615 + }, + { + "epoch": 0.5515358361774744, + "grad_norm": 3.6640098094940186, + "learning_rate": 0.0008161547212741752, + "loss": 6.3726, + "step": 1616 + }, + { + "epoch": 0.551877133105802, + "grad_norm": 3.549154281616211, + "learning_rate": 0.0008160409556313994, + "loss": 6.8686, + "step": 1617 + }, + { + "epoch": 0.5522184300341297, + "grad_norm": 3.3445968627929688, + "learning_rate": 0.0008159271899886235, + "loss": 6.7752, + "step": 1618 + }, + { + "epoch": 0.5525597269624574, + "grad_norm": 3.3909804821014404, + "learning_rate": 0.0008158134243458476, + "loss": 6.7392, + "step": 1619 + }, + { + "epoch": 0.552901023890785, + "grad_norm": 3.3068623542785645, + "learning_rate": 0.0008156996587030717, + "loss": 7.0023, + "step": 1620 + }, + { + "epoch": 0.5532423208191126, + "grad_norm": 5.286759376525879, + "learning_rate": 0.0008155858930602958, + "loss": 5.7817, + "step": 1621 + }, + { + "epoch": 0.5535836177474402, + "grad_norm": 4.21416711807251, + "learning_rate": 0.0008154721274175199, + "loss": 6.6314, + "step": 1622 + }, + { + "epoch": 0.5539249146757679, + "grad_norm": 3.586740016937256, + "learning_rate": 0.0008153583617747441, + "loss": 6.7685, + "step": 1623 + }, + { + "epoch": 0.5542662116040956, + "grad_norm": 3.7087645530700684, + "learning_rate": 0.0008152445961319681, + "loss": 5.8739, + "step": 1624 + }, + { + "epoch": 0.5546075085324232, + "grad_norm": 3.9392848014831543, + "learning_rate": 0.0008151308304891922, + "loss": 6.6611, + "step": 1625 + }, + { + "epoch": 0.5549488054607509, + "grad_norm": 3.521493673324585, + "learning_rate": 0.0008150170648464163, + "loss": 7.1034, + "step": 1626 + }, + { + "epoch": 0.5552901023890785, + "grad_norm": 4.224700927734375, + "learning_rate": 0.0008149032992036404, + "loss": 5.8669, + "step": 1627 + }, + { + "epoch": 0.5556313993174061, + "grad_norm": 3.757660388946533, + "learning_rate": 0.0008147895335608646, + "loss": 6.5412, + "step": 1628 + }, + { + "epoch": 0.5559726962457338, + "grad_norm": 3.58388614654541, + "learning_rate": 0.0008146757679180888, + "loss": 6.7925, + "step": 1629 + }, + { + "epoch": 0.5563139931740614, + "grad_norm": 3.373109817504883, + "learning_rate": 0.0008145620022753129, + "loss": 6.5338, + "step": 1630 + }, + { + "epoch": 0.5566552901023891, + "grad_norm": 3.3776378631591797, + "learning_rate": 0.000814448236632537, + "loss": 6.6037, + "step": 1631 + }, + { + "epoch": 0.5569965870307167, + "grad_norm": 3.9116930961608887, + "learning_rate": 0.0008143344709897611, + "loss": 5.6358, + "step": 1632 + }, + { + "epoch": 0.5573378839590444, + "grad_norm": 3.494377851486206, + "learning_rate": 0.0008142207053469852, + "loss": 6.9675, + "step": 1633 + }, + { + "epoch": 0.557679180887372, + "grad_norm": 3.6268115043640137, + "learning_rate": 0.0008141069397042094, + "loss": 6.7061, + "step": 1634 + }, + { + "epoch": 0.5580204778156996, + "grad_norm": 4.0463409423828125, + "learning_rate": 0.0008139931740614335, + "loss": 5.8411, + "step": 1635 + }, + { + "epoch": 0.5583617747440273, + "grad_norm": 3.519866466522217, + "learning_rate": 0.0008138794084186576, + "loss": 6.6951, + "step": 1636 + }, + { + "epoch": 0.558703071672355, + "grad_norm": 3.481750965118408, + "learning_rate": 0.0008137656427758817, + "loss": 7.011, + "step": 1637 + }, + { + "epoch": 0.5590443686006826, + "grad_norm": 3.268798828125, + "learning_rate": 0.0008136518771331058, + "loss": 6.7461, + "step": 1638 + }, + { + "epoch": 0.5593856655290103, + "grad_norm": 3.8919827938079834, + "learning_rate": 0.0008135381114903299, + "loss": 6.4497, + "step": 1639 + }, + { + "epoch": 0.5597269624573379, + "grad_norm": 3.408388614654541, + "learning_rate": 0.0008134243458475541, + "loss": 7.0488, + "step": 1640 + }, + { + "epoch": 0.5600682593856655, + "grad_norm": 3.337852954864502, + "learning_rate": 0.0008133105802047782, + "loss": 6.6489, + "step": 1641 + }, + { + "epoch": 0.5604095563139931, + "grad_norm": 3.176896095275879, + "learning_rate": 0.0008131968145620023, + "loss": 7.0354, + "step": 1642 + }, + { + "epoch": 0.5607508532423208, + "grad_norm": 3.4171290397644043, + "learning_rate": 0.0008130830489192265, + "loss": 6.6106, + "step": 1643 + }, + { + "epoch": 0.5610921501706485, + "grad_norm": 3.348039150238037, + "learning_rate": 0.0008129692832764504, + "loss": 6.4115, + "step": 1644 + }, + { + "epoch": 0.5614334470989761, + "grad_norm": 3.897597074508667, + "learning_rate": 0.0008128555176336746, + "loss": 6.5893, + "step": 1645 + }, + { + "epoch": 0.5617747440273038, + "grad_norm": 3.424443006515503, + "learning_rate": 0.0008127417519908988, + "loss": 6.4034, + "step": 1646 + }, + { + "epoch": 0.5621160409556314, + "grad_norm": 3.577786445617676, + "learning_rate": 0.0008126279863481229, + "loss": 6.916, + "step": 1647 + }, + { + "epoch": 0.562457337883959, + "grad_norm": 3.442438840866089, + "learning_rate": 0.000812514220705347, + "loss": 7.1346, + "step": 1648 + }, + { + "epoch": 0.5627986348122866, + "grad_norm": 3.3798537254333496, + "learning_rate": 0.0008124004550625711, + "loss": 6.4942, + "step": 1649 + }, + { + "epoch": 0.5631399317406144, + "grad_norm": 3.885824680328369, + "learning_rate": 0.0008122866894197952, + "loss": 6.8912, + "step": 1650 + }, + { + "epoch": 0.563481228668942, + "grad_norm": 3.910099744796753, + "learning_rate": 0.0008121729237770193, + "loss": 6.3676, + "step": 1651 + }, + { + "epoch": 0.5638225255972696, + "grad_norm": 3.5344743728637695, + "learning_rate": 0.0008120591581342435, + "loss": 6.8041, + "step": 1652 + }, + { + "epoch": 0.5641638225255973, + "grad_norm": 7.247477054595947, + "learning_rate": 0.0008119453924914676, + "loss": 5.6654, + "step": 1653 + }, + { + "epoch": 0.5645051194539249, + "grad_norm": 3.664292097091675, + "learning_rate": 0.0008118316268486917, + "loss": 7.0846, + "step": 1654 + }, + { + "epoch": 0.5648464163822525, + "grad_norm": 5.331287384033203, + "learning_rate": 0.0008117178612059158, + "loss": 5.1035, + "step": 1655 + }, + { + "epoch": 0.5651877133105802, + "grad_norm": 4.2963175773620605, + "learning_rate": 0.0008116040955631399, + "loss": 6.543, + "step": 1656 + }, + { + "epoch": 0.5655290102389079, + "grad_norm": 3.7694435119628906, + "learning_rate": 0.0008114903299203641, + "loss": 6.5782, + "step": 1657 + }, + { + "epoch": 0.5658703071672355, + "grad_norm": 3.3305928707122803, + "learning_rate": 0.0008113765642775882, + "loss": 6.992, + "step": 1658 + }, + { + "epoch": 0.5662116040955631, + "grad_norm": 3.3793208599090576, + "learning_rate": 0.0008112627986348123, + "loss": 6.3376, + "step": 1659 + }, + { + "epoch": 0.5665529010238908, + "grad_norm": 3.6285388469696045, + "learning_rate": 0.0008111490329920365, + "loss": 6.6463, + "step": 1660 + }, + { + "epoch": 0.5668941979522184, + "grad_norm": 3.3783798217773438, + "learning_rate": 0.0008110352673492606, + "loss": 6.4926, + "step": 1661 + }, + { + "epoch": 0.567235494880546, + "grad_norm": 3.489332675933838, + "learning_rate": 0.0008109215017064847, + "loss": 6.9067, + "step": 1662 + }, + { + "epoch": 0.5675767918088738, + "grad_norm": 3.4197375774383545, + "learning_rate": 0.0008108077360637088, + "loss": 6.8896, + "step": 1663 + }, + { + "epoch": 0.5679180887372014, + "grad_norm": 5.391114234924316, + "learning_rate": 0.0008106939704209329, + "loss": 6.6774, + "step": 1664 + }, + { + "epoch": 0.568259385665529, + "grad_norm": 3.594270706176758, + "learning_rate": 0.000810580204778157, + "loss": 7.0118, + "step": 1665 + }, + { + "epoch": 0.5686006825938567, + "grad_norm": 3.6209936141967773, + "learning_rate": 0.0008104664391353811, + "loss": 5.8403, + "step": 1666 + }, + { + "epoch": 0.5689419795221843, + "grad_norm": 3.4565608501434326, + "learning_rate": 0.0008103526734926052, + "loss": 7.2969, + "step": 1667 + }, + { + "epoch": 0.5692832764505119, + "grad_norm": 3.351238489151001, + "learning_rate": 0.0008102389078498293, + "loss": 7.1469, + "step": 1668 + }, + { + "epoch": 0.5696245733788395, + "grad_norm": 3.340491771697998, + "learning_rate": 0.0008101251422070535, + "loss": 6.5831, + "step": 1669 + }, + { + "epoch": 0.5699658703071673, + "grad_norm": 3.3697030544281006, + "learning_rate": 0.0008100113765642776, + "loss": 6.5447, + "step": 1670 + }, + { + "epoch": 0.5703071672354949, + "grad_norm": 3.3213913440704346, + "learning_rate": 0.0008098976109215017, + "loss": 6.7253, + "step": 1671 + }, + { + "epoch": 0.5706484641638225, + "grad_norm": 3.31199312210083, + "learning_rate": 0.0008097838452787258, + "loss": 6.7037, + "step": 1672 + }, + { + "epoch": 0.5709897610921502, + "grad_norm": 3.239823341369629, + "learning_rate": 0.0008096700796359499, + "loss": 6.2837, + "step": 1673 + }, + { + "epoch": 0.5713310580204778, + "grad_norm": 3.233081102371216, + "learning_rate": 0.0008095563139931741, + "loss": 6.4038, + "step": 1674 + }, + { + "epoch": 0.5716723549488054, + "grad_norm": 3.2402610778808594, + "learning_rate": 0.0008094425483503982, + "loss": 6.8486, + "step": 1675 + }, + { + "epoch": 0.5720136518771332, + "grad_norm": 3.5634925365448, + "learning_rate": 0.0008093287827076223, + "loss": 5.8242, + "step": 1676 + }, + { + "epoch": 0.5723549488054608, + "grad_norm": 3.4806721210479736, + "learning_rate": 0.0008092150170648465, + "loss": 7.0927, + "step": 1677 + }, + { + "epoch": 0.5726962457337884, + "grad_norm": 3.4350132942199707, + "learning_rate": 0.0008091012514220706, + "loss": 6.6534, + "step": 1678 + }, + { + "epoch": 0.573037542662116, + "grad_norm": 3.3814663887023926, + "learning_rate": 0.0008089874857792947, + "loss": 6.8434, + "step": 1679 + }, + { + "epoch": 0.5733788395904437, + "grad_norm": 3.420792579650879, + "learning_rate": 0.0008088737201365189, + "loss": 6.8301, + "step": 1680 + }, + { + "epoch": 0.5737201365187713, + "grad_norm": 3.6617331504821777, + "learning_rate": 0.000808759954493743, + "loss": 7.1931, + "step": 1681 + }, + { + "epoch": 0.5740614334470989, + "grad_norm": 3.417377233505249, + "learning_rate": 0.0008086461888509671, + "loss": 6.8856, + "step": 1682 + }, + { + "epoch": 0.5744027303754267, + "grad_norm": 3.4047882556915283, + "learning_rate": 0.0008085324232081911, + "loss": 6.7193, + "step": 1683 + }, + { + "epoch": 0.5747440273037543, + "grad_norm": 5.160583019256592, + "learning_rate": 0.0008084186575654152, + "loss": 6.3314, + "step": 1684 + }, + { + "epoch": 0.5750853242320819, + "grad_norm": 3.442270517349243, + "learning_rate": 0.0008083048919226393, + "loss": 6.5523, + "step": 1685 + }, + { + "epoch": 0.5754266211604095, + "grad_norm": 3.4969444274902344, + "learning_rate": 0.0008081911262798635, + "loss": 6.1442, + "step": 1686 + }, + { + "epoch": 0.5757679180887372, + "grad_norm": 3.283874988555908, + "learning_rate": 0.0008080773606370876, + "loss": 6.5447, + "step": 1687 + }, + { + "epoch": 0.5761092150170648, + "grad_norm": 3.373081922531128, + "learning_rate": 0.0008079635949943117, + "loss": 6.9056, + "step": 1688 + }, + { + "epoch": 0.5764505119453925, + "grad_norm": 3.2919609546661377, + "learning_rate": 0.0008078498293515358, + "loss": 6.6918, + "step": 1689 + }, + { + "epoch": 0.5767918088737202, + "grad_norm": 3.302234411239624, + "learning_rate": 0.0008077360637087599, + "loss": 6.5728, + "step": 1690 + }, + { + "epoch": 0.5771331058020478, + "grad_norm": 3.6976983547210693, + "learning_rate": 0.000807622298065984, + "loss": 6.1227, + "step": 1691 + }, + { + "epoch": 0.5774744027303754, + "grad_norm": 3.5474138259887695, + "learning_rate": 0.0008075085324232082, + "loss": 6.2463, + "step": 1692 + }, + { + "epoch": 0.577815699658703, + "grad_norm": 3.680959701538086, + "learning_rate": 0.0008073947667804323, + "loss": 6.6472, + "step": 1693 + }, + { + "epoch": 0.5781569965870307, + "grad_norm": 3.4441518783569336, + "learning_rate": 0.0008072810011376565, + "loss": 6.5921, + "step": 1694 + }, + { + "epoch": 0.5784982935153583, + "grad_norm": 4.723526477813721, + "learning_rate": 0.0008071672354948806, + "loss": 6.2168, + "step": 1695 + }, + { + "epoch": 0.578839590443686, + "grad_norm": 3.377566337585449, + "learning_rate": 0.0008070534698521047, + "loss": 6.7912, + "step": 1696 + }, + { + "epoch": 0.5791808873720137, + "grad_norm": 3.3538033962249756, + "learning_rate": 0.0008069397042093289, + "loss": 6.7811, + "step": 1697 + }, + { + "epoch": 0.5795221843003413, + "grad_norm": 3.2516143321990967, + "learning_rate": 0.000806825938566553, + "loss": 6.6028, + "step": 1698 + }, + { + "epoch": 0.5798634812286689, + "grad_norm": 3.5221121311187744, + "learning_rate": 0.0008067121729237771, + "loss": 6.8532, + "step": 1699 + }, + { + "epoch": 0.5802047781569966, + "grad_norm": 3.5948259830474854, + "learning_rate": 0.0008065984072810012, + "loss": 6.5451, + "step": 1700 + }, + { + "epoch": 0.5805460750853242, + "grad_norm": 3.5543367862701416, + "learning_rate": 0.0008064846416382253, + "loss": 6.5479, + "step": 1701 + }, + { + "epoch": 0.5808873720136519, + "grad_norm": 3.5869266986846924, + "learning_rate": 0.0008063708759954493, + "loss": 6.4638, + "step": 1702 + }, + { + "epoch": 0.5812286689419796, + "grad_norm": 3.233880043029785, + "learning_rate": 0.0008062571103526735, + "loss": 6.6942, + "step": 1703 + }, + { + "epoch": 0.5815699658703072, + "grad_norm": 3.558081865310669, + "learning_rate": 0.0008061433447098976, + "loss": 6.5371, + "step": 1704 + }, + { + "epoch": 0.5819112627986348, + "grad_norm": 3.4222469329833984, + "learning_rate": 0.0008060295790671217, + "loss": 6.667, + "step": 1705 + }, + { + "epoch": 0.5822525597269624, + "grad_norm": 3.447845697402954, + "learning_rate": 0.0008059158134243458, + "loss": 6.8876, + "step": 1706 + }, + { + "epoch": 0.5825938566552901, + "grad_norm": 5.436509132385254, + "learning_rate": 0.0008058020477815699, + "loss": 6.4207, + "step": 1707 + }, + { + "epoch": 0.5829351535836177, + "grad_norm": 3.4778072834014893, + "learning_rate": 0.000805688282138794, + "loss": 6.5555, + "step": 1708 + }, + { + "epoch": 0.5832764505119454, + "grad_norm": 3.7661755084991455, + "learning_rate": 0.0008055745164960182, + "loss": 6.6308, + "step": 1709 + }, + { + "epoch": 0.5836177474402731, + "grad_norm": 3.373845100402832, + "learning_rate": 0.0008054607508532424, + "loss": 7.044, + "step": 1710 + }, + { + "epoch": 0.5839590443686007, + "grad_norm": 3.3945181369781494, + "learning_rate": 0.0008053469852104665, + "loss": 6.8728, + "step": 1711 + }, + { + "epoch": 0.5843003412969283, + "grad_norm": 3.244920015335083, + "learning_rate": 0.0008052332195676906, + "loss": 6.4727, + "step": 1712 + }, + { + "epoch": 0.5846416382252559, + "grad_norm": 3.3084375858306885, + "learning_rate": 0.0008051194539249147, + "loss": 6.6563, + "step": 1713 + }, + { + "epoch": 0.5849829351535836, + "grad_norm": 3.3623743057250977, + "learning_rate": 0.0008050056882821389, + "loss": 6.8908, + "step": 1714 + }, + { + "epoch": 0.5853242320819113, + "grad_norm": 3.3226821422576904, + "learning_rate": 0.000804891922639363, + "loss": 6.7584, + "step": 1715 + }, + { + "epoch": 0.5856655290102389, + "grad_norm": 3.3474953174591064, + "learning_rate": 0.0008047781569965871, + "loss": 7.0349, + "step": 1716 + }, + { + "epoch": 0.5860068259385666, + "grad_norm": 3.4366891384124756, + "learning_rate": 0.0008046643913538112, + "loss": 6.7988, + "step": 1717 + }, + { + "epoch": 0.5863481228668942, + "grad_norm": 3.3742077350616455, + "learning_rate": 0.0008045506257110353, + "loss": 6.3303, + "step": 1718 + }, + { + "epoch": 0.5866894197952218, + "grad_norm": 3.367166042327881, + "learning_rate": 0.0008044368600682594, + "loss": 6.9229, + "step": 1719 + }, + { + "epoch": 0.5870307167235495, + "grad_norm": 3.3801698684692383, + "learning_rate": 0.0008043230944254836, + "loss": 6.4958, + "step": 1720 + }, + { + "epoch": 0.5873720136518771, + "grad_norm": 3.513958692550659, + "learning_rate": 0.0008042093287827077, + "loss": 6.4753, + "step": 1721 + }, + { + "epoch": 0.5877133105802048, + "grad_norm": 3.514188051223755, + "learning_rate": 0.0008040955631399317, + "loss": 6.6555, + "step": 1722 + }, + { + "epoch": 0.5880546075085324, + "grad_norm": 3.736156702041626, + "learning_rate": 0.0008039817974971558, + "loss": 6.2383, + "step": 1723 + }, + { + "epoch": 0.5883959044368601, + "grad_norm": 3.569303274154663, + "learning_rate": 0.0008038680318543799, + "loss": 6.6994, + "step": 1724 + }, + { + "epoch": 0.5887372013651877, + "grad_norm": 3.693387031555176, + "learning_rate": 0.000803754266211604, + "loss": 7.2297, + "step": 1725 + }, + { + "epoch": 0.5890784982935153, + "grad_norm": 3.4730892181396484, + "learning_rate": 0.0008036405005688282, + "loss": 6.9965, + "step": 1726 + }, + { + "epoch": 0.589419795221843, + "grad_norm": 3.8345372676849365, + "learning_rate": 0.0008035267349260524, + "loss": 6.1234, + "step": 1727 + }, + { + "epoch": 0.5897610921501707, + "grad_norm": 3.4538676738739014, + "learning_rate": 0.0008034129692832765, + "loss": 6.5552, + "step": 1728 + }, + { + "epoch": 0.5901023890784983, + "grad_norm": 3.578291654586792, + "learning_rate": 0.0008032992036405006, + "loss": 7.2901, + "step": 1729 + }, + { + "epoch": 0.590443686006826, + "grad_norm": 3.2786262035369873, + "learning_rate": 0.0008031854379977247, + "loss": 6.9642, + "step": 1730 + }, + { + "epoch": 0.5907849829351536, + "grad_norm": 3.3056552410125732, + "learning_rate": 0.0008030716723549488, + "loss": 6.7187, + "step": 1731 + }, + { + "epoch": 0.5911262798634812, + "grad_norm": 3.6989173889160156, + "learning_rate": 0.000802957906712173, + "loss": 6.7993, + "step": 1732 + }, + { + "epoch": 0.5914675767918088, + "grad_norm": 3.3782973289489746, + "learning_rate": 0.0008028441410693971, + "loss": 6.8576, + "step": 1733 + }, + { + "epoch": 0.5918088737201365, + "grad_norm": 3.2711713314056396, + "learning_rate": 0.0008027303754266212, + "loss": 6.8393, + "step": 1734 + }, + { + "epoch": 0.5921501706484642, + "grad_norm": 3.265575885772705, + "learning_rate": 0.0008026166097838453, + "loss": 6.5095, + "step": 1735 + }, + { + "epoch": 0.5924914675767918, + "grad_norm": 3.287745237350464, + "learning_rate": 0.0008025028441410694, + "loss": 6.8591, + "step": 1736 + }, + { + "epoch": 0.5928327645051195, + "grad_norm": 4.958430290222168, + "learning_rate": 0.0008023890784982936, + "loss": 6.7966, + "step": 1737 + }, + { + "epoch": 0.5931740614334471, + "grad_norm": 4.398024082183838, + "learning_rate": 0.0008022753128555177, + "loss": 5.6212, + "step": 1738 + }, + { + "epoch": 0.5935153583617747, + "grad_norm": 3.486335277557373, + "learning_rate": 0.0008021615472127418, + "loss": 6.9339, + "step": 1739 + }, + { + "epoch": 0.5938566552901023, + "grad_norm": 3.2780847549438477, + "learning_rate": 0.0008020477815699659, + "loss": 6.4291, + "step": 1740 + }, + { + "epoch": 0.5941979522184301, + "grad_norm": 8.394953727722168, + "learning_rate": 0.0008019340159271899, + "loss": 6.1876, + "step": 1741 + }, + { + "epoch": 0.5945392491467577, + "grad_norm": 3.4329168796539307, + "learning_rate": 0.000801820250284414, + "loss": 4.1804, + "step": 1742 + }, + { + "epoch": 0.5948805460750853, + "grad_norm": 4.278625011444092, + "learning_rate": 0.0008017064846416382, + "loss": 6.3612, + "step": 1743 + }, + { + "epoch": 0.595221843003413, + "grad_norm": 3.5409955978393555, + "learning_rate": 0.0008015927189988624, + "loss": 6.4157, + "step": 1744 + }, + { + "epoch": 0.5955631399317406, + "grad_norm": 5.1169891357421875, + "learning_rate": 0.0008014789533560865, + "loss": 5.6104, + "step": 1745 + }, + { + "epoch": 0.5959044368600682, + "grad_norm": 3.445643901824951, + "learning_rate": 0.0008013651877133106, + "loss": 6.0491, + "step": 1746 + }, + { + "epoch": 0.596245733788396, + "grad_norm": 3.8756022453308105, + "learning_rate": 0.0008012514220705347, + "loss": 5.9952, + "step": 1747 + }, + { + "epoch": 0.5965870307167236, + "grad_norm": 3.3911144733428955, + "learning_rate": 0.0008011376564277588, + "loss": 7.2222, + "step": 1748 + }, + { + "epoch": 0.5969283276450512, + "grad_norm": 3.516190528869629, + "learning_rate": 0.000801023890784983, + "loss": 6.4552, + "step": 1749 + }, + { + "epoch": 0.5972696245733788, + "grad_norm": 3.3756418228149414, + "learning_rate": 0.0008009101251422071, + "loss": 6.643, + "step": 1750 + }, + { + "epoch": 0.5976109215017065, + "grad_norm": 6.02211856842041, + "learning_rate": 0.0008007963594994312, + "loss": 6.1575, + "step": 1751 + }, + { + "epoch": 0.5979522184300341, + "grad_norm": 3.416879177093506, + "learning_rate": 0.0008006825938566553, + "loss": 6.7652, + "step": 1752 + }, + { + "epoch": 0.5982935153583617, + "grad_norm": 4.541624069213867, + "learning_rate": 0.0008005688282138794, + "loss": 6.4852, + "step": 1753 + }, + { + "epoch": 0.5986348122866895, + "grad_norm": 12.214677810668945, + "learning_rate": 0.0008004550625711036, + "loss": 6.6869, + "step": 1754 + }, + { + "epoch": 0.5989761092150171, + "grad_norm": 3.7498533725738525, + "learning_rate": 0.0008003412969283277, + "loss": 6.6797, + "step": 1755 + }, + { + "epoch": 0.5993174061433447, + "grad_norm": 7.0607523918151855, + "learning_rate": 0.0008002275312855518, + "loss": 6.0239, + "step": 1756 + }, + { + "epoch": 0.5996587030716723, + "grad_norm": 5.875295639038086, + "learning_rate": 0.0008001137656427759, + "loss": 6.777, + "step": 1757 + }, + { + "epoch": 0.6, + "grad_norm": 3.4712796211242676, + "learning_rate": 0.0008, + "loss": 6.5113, + "step": 1758 + }, + { + "epoch": 0.6003412969283276, + "grad_norm": 3.289867877960205, + "learning_rate": 0.0007998862343572241, + "loss": 6.271, + "step": 1759 + }, + { + "epoch": 0.6006825938566553, + "grad_norm": 3.281529664993286, + "learning_rate": 0.0007997724687144482, + "loss": 6.068, + "step": 1760 + }, + { + "epoch": 0.601023890784983, + "grad_norm": 3.141273260116577, + "learning_rate": 0.0007996587030716724, + "loss": 6.6534, + "step": 1761 + }, + { + "epoch": 0.6013651877133106, + "grad_norm": 3.538984537124634, + "learning_rate": 0.0007995449374288965, + "loss": 6.7176, + "step": 1762 + }, + { + "epoch": 0.6017064846416382, + "grad_norm": 3.208937644958496, + "learning_rate": 0.0007994311717861206, + "loss": 6.2085, + "step": 1763 + }, + { + "epoch": 0.6020477815699659, + "grad_norm": 3.4167351722717285, + "learning_rate": 0.0007993174061433447, + "loss": 6.4306, + "step": 1764 + }, + { + "epoch": 0.6023890784982935, + "grad_norm": 3.342594861984253, + "learning_rate": 0.0007992036405005688, + "loss": 6.9312, + "step": 1765 + }, + { + "epoch": 0.6027303754266211, + "grad_norm": 3.4137015342712402, + "learning_rate": 0.000799089874857793, + "loss": 6.6204, + "step": 1766 + }, + { + "epoch": 0.6030716723549489, + "grad_norm": 3.446514368057251, + "learning_rate": 0.0007989761092150171, + "loss": 6.866, + "step": 1767 + }, + { + "epoch": 0.6034129692832765, + "grad_norm": 3.3566482067108154, + "learning_rate": 0.0007988623435722412, + "loss": 6.1879, + "step": 1768 + }, + { + "epoch": 0.6037542662116041, + "grad_norm": 3.4932174682617188, + "learning_rate": 0.0007987485779294653, + "loss": 7.2527, + "step": 1769 + }, + { + "epoch": 0.6040955631399317, + "grad_norm": 3.385852098464966, + "learning_rate": 0.0007986348122866894, + "loss": 7.0579, + "step": 1770 + }, + { + "epoch": 0.6044368600682594, + "grad_norm": 3.311905860900879, + "learning_rate": 0.0007985210466439135, + "loss": 7.0279, + "step": 1771 + }, + { + "epoch": 0.604778156996587, + "grad_norm": 3.295506238937378, + "learning_rate": 0.0007984072810011377, + "loss": 6.3771, + "step": 1772 + }, + { + "epoch": 0.6051194539249147, + "grad_norm": 3.4223151206970215, + "learning_rate": 0.0007982935153583618, + "loss": 6.1649, + "step": 1773 + }, + { + "epoch": 0.6054607508532424, + "grad_norm": 3.4446609020233154, + "learning_rate": 0.0007981797497155859, + "loss": 5.7184, + "step": 1774 + }, + { + "epoch": 0.60580204778157, + "grad_norm": 3.3032233715057373, + "learning_rate": 0.00079806598407281, + "loss": 6.4993, + "step": 1775 + }, + { + "epoch": 0.6061433447098976, + "grad_norm": 4.309022426605225, + "learning_rate": 0.0007979522184300341, + "loss": 5.8551, + "step": 1776 + }, + { + "epoch": 0.6064846416382252, + "grad_norm": 5.289666175842285, + "learning_rate": 0.0007978384527872584, + "loss": 6.1384, + "step": 1777 + }, + { + "epoch": 0.6068259385665529, + "grad_norm": 3.589069366455078, + "learning_rate": 0.0007977246871444825, + "loss": 6.6301, + "step": 1778 + }, + { + "epoch": 0.6071672354948805, + "grad_norm": 4.165103435516357, + "learning_rate": 0.0007976109215017066, + "loss": 6.2209, + "step": 1779 + }, + { + "epoch": 0.6075085324232082, + "grad_norm": 3.654651641845703, + "learning_rate": 0.0007974971558589306, + "loss": 6.8522, + "step": 1780 + }, + { + "epoch": 0.6078498293515359, + "grad_norm": 3.5410244464874268, + "learning_rate": 0.0007973833902161547, + "loss": 6.7159, + "step": 1781 + }, + { + "epoch": 0.6081911262798635, + "grad_norm": 3.301004648208618, + "learning_rate": 0.0007972696245733788, + "loss": 6.7727, + "step": 1782 + }, + { + "epoch": 0.6085324232081911, + "grad_norm": 8.049127578735352, + "learning_rate": 0.000797155858930603, + "loss": 5.6872, + "step": 1783 + }, + { + "epoch": 0.6088737201365187, + "grad_norm": 3.7068381309509277, + "learning_rate": 0.0007970420932878271, + "loss": 6.6709, + "step": 1784 + }, + { + "epoch": 0.6092150170648464, + "grad_norm": 3.3772456645965576, + "learning_rate": 0.0007969283276450512, + "loss": 6.8441, + "step": 1785 + }, + { + "epoch": 0.6095563139931741, + "grad_norm": 3.3938705921173096, + "learning_rate": 0.0007968145620022753, + "loss": 6.7171, + "step": 1786 + }, + { + "epoch": 0.6098976109215017, + "grad_norm": 3.674920082092285, + "learning_rate": 0.0007967007963594994, + "loss": 6.424, + "step": 1787 + }, + { + "epoch": 0.6102389078498294, + "grad_norm": 5.99582576751709, + "learning_rate": 0.0007965870307167235, + "loss": 6.1769, + "step": 1788 + }, + { + "epoch": 0.610580204778157, + "grad_norm": 3.5924954414367676, + "learning_rate": 0.0007964732650739477, + "loss": 6.5125, + "step": 1789 + }, + { + "epoch": 0.6109215017064846, + "grad_norm": 3.4204015731811523, + "learning_rate": 0.0007963594994311718, + "loss": 6.6597, + "step": 1790 + }, + { + "epoch": 0.6112627986348123, + "grad_norm": 3.3368844985961914, + "learning_rate": 0.0007962457337883959, + "loss": 6.2667, + "step": 1791 + }, + { + "epoch": 0.6116040955631399, + "grad_norm": 3.3641443252563477, + "learning_rate": 0.00079613196814562, + "loss": 6.5083, + "step": 1792 + }, + { + "epoch": 0.6119453924914676, + "grad_norm": 8.113044738769531, + "learning_rate": 0.0007960182025028441, + "loss": 6.1718, + "step": 1793 + }, + { + "epoch": 0.6122866894197952, + "grad_norm": 3.4572935104370117, + "learning_rate": 0.0007959044368600682, + "loss": 6.3586, + "step": 1794 + }, + { + "epoch": 0.6126279863481229, + "grad_norm": 3.6194283962249756, + "learning_rate": 0.0007957906712172925, + "loss": 6.8134, + "step": 1795 + }, + { + "epoch": 0.6129692832764505, + "grad_norm": 3.5017333030700684, + "learning_rate": 0.0007956769055745166, + "loss": 7.0632, + "step": 1796 + }, + { + "epoch": 0.6133105802047781, + "grad_norm": 3.5431251525878906, + "learning_rate": 0.0007955631399317407, + "loss": 7.0487, + "step": 1797 + }, + { + "epoch": 0.6136518771331058, + "grad_norm": 3.3673455715179443, + "learning_rate": 0.0007954493742889648, + "loss": 6.7595, + "step": 1798 + }, + { + "epoch": 0.6139931740614335, + "grad_norm": 5.616215705871582, + "learning_rate": 0.0007953356086461888, + "loss": 5.7906, + "step": 1799 + }, + { + "epoch": 0.6143344709897611, + "grad_norm": 3.828660488128662, + "learning_rate": 0.000795221843003413, + "loss": 6.9296, + "step": 1800 + }, + { + "epoch": 0.6146757679180888, + "grad_norm": 3.748854637145996, + "learning_rate": 0.0007951080773606371, + "loss": 6.3692, + "step": 1801 + }, + { + "epoch": 0.6150170648464164, + "grad_norm": 3.4253838062286377, + "learning_rate": 0.0007949943117178612, + "loss": 6.5113, + "step": 1802 + }, + { + "epoch": 0.615358361774744, + "grad_norm": 5.2447357177734375, + "learning_rate": 0.0007948805460750853, + "loss": 6.4785, + "step": 1803 + }, + { + "epoch": 0.6156996587030716, + "grad_norm": 3.3891453742980957, + "learning_rate": 0.0007947667804323094, + "loss": 7.1645, + "step": 1804 + }, + { + "epoch": 0.6160409556313993, + "grad_norm": 3.5664119720458984, + "learning_rate": 0.0007946530147895335, + "loss": 6.733, + "step": 1805 + }, + { + "epoch": 0.616382252559727, + "grad_norm": 4.1726298332214355, + "learning_rate": 0.0007945392491467577, + "loss": 5.8798, + "step": 1806 + }, + { + "epoch": 0.6167235494880546, + "grad_norm": 4.10631799697876, + "learning_rate": 0.0007944254835039818, + "loss": 6.4036, + "step": 1807 + }, + { + "epoch": 0.6170648464163823, + "grad_norm": 4.00150728225708, + "learning_rate": 0.0007943117178612059, + "loss": 6.7822, + "step": 1808 + }, + { + "epoch": 0.6174061433447099, + "grad_norm": 3.687084674835205, + "learning_rate": 0.00079419795221843, + "loss": 6.653, + "step": 1809 + }, + { + "epoch": 0.6177474402730375, + "grad_norm": 3.6721670627593994, + "learning_rate": 0.0007940841865756541, + "loss": 6.7528, + "step": 1810 + }, + { + "epoch": 0.6180887372013651, + "grad_norm": 3.374575138092041, + "learning_rate": 0.0007939704209328783, + "loss": 6.6044, + "step": 1811 + }, + { + "epoch": 0.6184300341296929, + "grad_norm": 3.891878843307495, + "learning_rate": 0.0007938566552901025, + "loss": 6.495, + "step": 1812 + }, + { + "epoch": 0.6187713310580205, + "grad_norm": 3.5438005924224854, + "learning_rate": 0.0007937428896473266, + "loss": 6.2664, + "step": 1813 + }, + { + "epoch": 0.6191126279863481, + "grad_norm": 3.501797676086426, + "learning_rate": 0.0007936291240045507, + "loss": 6.7247, + "step": 1814 + }, + { + "epoch": 0.6194539249146758, + "grad_norm": 3.600700855255127, + "learning_rate": 0.0007935153583617748, + "loss": 6.8725, + "step": 1815 + }, + { + "epoch": 0.6197952218430034, + "grad_norm": 3.210486650466919, + "learning_rate": 0.0007934015927189989, + "loss": 6.9718, + "step": 1816 + }, + { + "epoch": 0.620136518771331, + "grad_norm": 3.1832642555236816, + "learning_rate": 0.0007932878270762231, + "loss": 6.8543, + "step": 1817 + }, + { + "epoch": 0.6204778156996587, + "grad_norm": 4.302059173583984, + "learning_rate": 0.0007931740614334472, + "loss": 5.7968, + "step": 1818 + }, + { + "epoch": 0.6208191126279864, + "grad_norm": 3.688284158706665, + "learning_rate": 0.0007930602957906712, + "loss": 6.4013, + "step": 1819 + }, + { + "epoch": 0.621160409556314, + "grad_norm": 3.592975616455078, + "learning_rate": 0.0007929465301478953, + "loss": 7.0964, + "step": 1820 + }, + { + "epoch": 0.6215017064846416, + "grad_norm": 3.5165815353393555, + "learning_rate": 0.0007928327645051194, + "loss": 7.1356, + "step": 1821 + }, + { + "epoch": 0.6218430034129693, + "grad_norm": 3.4153037071228027, + "learning_rate": 0.0007927189988623435, + "loss": 6.2598, + "step": 1822 + }, + { + "epoch": 0.6221843003412969, + "grad_norm": 3.3510231971740723, + "learning_rate": 0.0007926052332195677, + "loss": 6.149, + "step": 1823 + }, + { + "epoch": 0.6225255972696245, + "grad_norm": 3.2553350925445557, + "learning_rate": 0.0007924914675767918, + "loss": 6.9447, + "step": 1824 + }, + { + "epoch": 0.6228668941979523, + "grad_norm": 3.4982099533081055, + "learning_rate": 0.0007923777019340159, + "loss": 6.8259, + "step": 1825 + }, + { + "epoch": 0.6232081911262799, + "grad_norm": 3.446209192276001, + "learning_rate": 0.00079226393629124, + "loss": 6.4785, + "step": 1826 + }, + { + "epoch": 0.6235494880546075, + "grad_norm": 3.815865993499756, + "learning_rate": 0.0007921501706484641, + "loss": 6.1361, + "step": 1827 + }, + { + "epoch": 0.6238907849829352, + "grad_norm": 3.4647111892700195, + "learning_rate": 0.0007920364050056883, + "loss": 6.4204, + "step": 1828 + }, + { + "epoch": 0.6242320819112628, + "grad_norm": 5.167119026184082, + "learning_rate": 0.0007919226393629125, + "loss": 6.0301, + "step": 1829 + }, + { + "epoch": 0.6245733788395904, + "grad_norm": 3.3964574337005615, + "learning_rate": 0.0007918088737201366, + "loss": 6.1847, + "step": 1830 + }, + { + "epoch": 0.624914675767918, + "grad_norm": 3.4840471744537354, + "learning_rate": 0.0007916951080773607, + "loss": 6.8604, + "step": 1831 + }, + { + "epoch": 0.6252559726962458, + "grad_norm": 3.4988765716552734, + "learning_rate": 0.0007915813424345848, + "loss": 6.5158, + "step": 1832 + }, + { + "epoch": 0.6255972696245734, + "grad_norm": 3.5883636474609375, + "learning_rate": 0.0007914675767918089, + "loss": 6.6659, + "step": 1833 + }, + { + "epoch": 0.625938566552901, + "grad_norm": 3.3029944896698, + "learning_rate": 0.000791353811149033, + "loss": 6.7029, + "step": 1834 + }, + { + "epoch": 0.6262798634812287, + "grad_norm": 3.6475093364715576, + "learning_rate": 0.0007912400455062572, + "loss": 6.6529, + "step": 1835 + }, + { + "epoch": 0.6266211604095563, + "grad_norm": 3.320028305053711, + "learning_rate": 0.0007911262798634813, + "loss": 6.3984, + "step": 1836 + }, + { + "epoch": 0.6269624573378839, + "grad_norm": 3.4347753524780273, + "learning_rate": 0.0007910125142207054, + "loss": 6.7391, + "step": 1837 + }, + { + "epoch": 0.6273037542662117, + "grad_norm": 3.7779273986816406, + "learning_rate": 0.0007908987485779294, + "loss": 6.4521, + "step": 1838 + }, + { + "epoch": 0.6276450511945393, + "grad_norm": 3.466188907623291, + "learning_rate": 0.0007907849829351535, + "loss": 6.3165, + "step": 1839 + }, + { + "epoch": 0.6279863481228669, + "grad_norm": 3.9799110889434814, + "learning_rate": 0.0007906712172923777, + "loss": 6.4507, + "step": 1840 + }, + { + "epoch": 0.6283276450511945, + "grad_norm": 3.497555732727051, + "learning_rate": 0.0007905574516496018, + "loss": 6.819, + "step": 1841 + }, + { + "epoch": 0.6286689419795222, + "grad_norm": 3.5930631160736084, + "learning_rate": 0.0007904436860068259, + "loss": 6.5014, + "step": 1842 + }, + { + "epoch": 0.6290102389078498, + "grad_norm": 3.517838478088379, + "learning_rate": 0.00079032992036405, + "loss": 6.4191, + "step": 1843 + }, + { + "epoch": 0.6293515358361774, + "grad_norm": 3.4117469787597656, + "learning_rate": 0.0007902161547212741, + "loss": 6.6453, + "step": 1844 + }, + { + "epoch": 0.6296928327645052, + "grad_norm": 3.3217053413391113, + "learning_rate": 0.0007901023890784983, + "loss": 6.5161, + "step": 1845 + }, + { + "epoch": 0.6300341296928328, + "grad_norm": 3.3784968852996826, + "learning_rate": 0.0007899886234357225, + "loss": 6.4929, + "step": 1846 + }, + { + "epoch": 0.6303754266211604, + "grad_norm": 3.2478907108306885, + "learning_rate": 0.0007898748577929466, + "loss": 6.4198, + "step": 1847 + }, + { + "epoch": 0.630716723549488, + "grad_norm": 3.4220783710479736, + "learning_rate": 0.0007897610921501707, + "loss": 6.9195, + "step": 1848 + }, + { + "epoch": 0.6310580204778157, + "grad_norm": 3.5167126655578613, + "learning_rate": 0.0007896473265073948, + "loss": 6.4107, + "step": 1849 + }, + { + "epoch": 0.6313993174061433, + "grad_norm": 3.388826608657837, + "learning_rate": 0.0007895335608646189, + "loss": 6.7458, + "step": 1850 + }, + { + "epoch": 0.631740614334471, + "grad_norm": 3.440612316131592, + "learning_rate": 0.000789419795221843, + "loss": 6.2142, + "step": 1851 + }, + { + "epoch": 0.6320819112627987, + "grad_norm": 3.389021873474121, + "learning_rate": 0.0007893060295790672, + "loss": 6.7827, + "step": 1852 + }, + { + "epoch": 0.6324232081911263, + "grad_norm": 3.3972480297088623, + "learning_rate": 0.0007891922639362913, + "loss": 6.7742, + "step": 1853 + }, + { + "epoch": 0.6327645051194539, + "grad_norm": 4.108426094055176, + "learning_rate": 0.0007890784982935154, + "loss": 6.5065, + "step": 1854 + }, + { + "epoch": 0.6331058020477816, + "grad_norm": 3.433762311935425, + "learning_rate": 0.0007889647326507395, + "loss": 6.8187, + "step": 1855 + }, + { + "epoch": 0.6334470989761092, + "grad_norm": 4.6445746421813965, + "learning_rate": 0.0007888509670079636, + "loss": 5.8341, + "step": 1856 + }, + { + "epoch": 0.6337883959044368, + "grad_norm": 4.046677589416504, + "learning_rate": 0.0007887372013651878, + "loss": 5.9867, + "step": 1857 + }, + { + "epoch": 0.6341296928327645, + "grad_norm": 3.4539291858673096, + "learning_rate": 0.0007886234357224118, + "loss": 6.8548, + "step": 1858 + }, + { + "epoch": 0.6344709897610922, + "grad_norm": 8.063450813293457, + "learning_rate": 0.0007885096700796359, + "loss": 7.0799, + "step": 1859 + }, + { + "epoch": 0.6348122866894198, + "grad_norm": 4.95775032043457, + "learning_rate": 0.00078839590443686, + "loss": 6.2372, + "step": 1860 + }, + { + "epoch": 0.6351535836177474, + "grad_norm": 3.3882105350494385, + "learning_rate": 0.0007882821387940841, + "loss": 6.8213, + "step": 1861 + }, + { + "epoch": 0.6354948805460751, + "grad_norm": 3.2759015560150146, + "learning_rate": 0.0007881683731513083, + "loss": 6.717, + "step": 1862 + }, + { + "epoch": 0.6358361774744027, + "grad_norm": 3.300447463989258, + "learning_rate": 0.0007880546075085325, + "loss": 6.194, + "step": 1863 + }, + { + "epoch": 0.6361774744027304, + "grad_norm": 3.613112688064575, + "learning_rate": 0.0007879408418657566, + "loss": 6.6778, + "step": 1864 + }, + { + "epoch": 0.636518771331058, + "grad_norm": 3.3438189029693604, + "learning_rate": 0.0007878270762229807, + "loss": 6.2512, + "step": 1865 + }, + { + "epoch": 0.6368600682593857, + "grad_norm": 3.257805347442627, + "learning_rate": 0.0007877133105802048, + "loss": 6.7676, + "step": 1866 + }, + { + "epoch": 0.6372013651877133, + "grad_norm": 3.257349729537964, + "learning_rate": 0.0007875995449374289, + "loss": 6.7774, + "step": 1867 + }, + { + "epoch": 0.6375426621160409, + "grad_norm": 9.708579063415527, + "learning_rate": 0.000787485779294653, + "loss": 6.7028, + "step": 1868 + }, + { + "epoch": 0.6378839590443686, + "grad_norm": 3.499809503555298, + "learning_rate": 0.0007873720136518772, + "loss": 6.5104, + "step": 1869 + }, + { + "epoch": 0.6382252559726962, + "grad_norm": 3.5874903202056885, + "learning_rate": 0.0007872582480091013, + "loss": 7.1038, + "step": 1870 + }, + { + "epoch": 0.6385665529010239, + "grad_norm": 3.483231544494629, + "learning_rate": 0.0007871444823663254, + "loss": 6.7159, + "step": 1871 + }, + { + "epoch": 0.6389078498293516, + "grad_norm": 3.406229257583618, + "learning_rate": 0.0007870307167235495, + "loss": 6.5912, + "step": 1872 + }, + { + "epoch": 0.6392491467576792, + "grad_norm": 3.2390456199645996, + "learning_rate": 0.0007869169510807736, + "loss": 6.7661, + "step": 1873 + }, + { + "epoch": 0.6395904436860068, + "grad_norm": 5.70449161529541, + "learning_rate": 0.0007868031854379977, + "loss": 5.6305, + "step": 1874 + }, + { + "epoch": 0.6399317406143344, + "grad_norm": 3.646918535232544, + "learning_rate": 0.0007866894197952219, + "loss": 6.772, + "step": 1875 + }, + { + "epoch": 0.6402730375426621, + "grad_norm": 3.5219454765319824, + "learning_rate": 0.000786575654152446, + "loss": 6.6647, + "step": 1876 + }, + { + "epoch": 0.6406143344709898, + "grad_norm": 3.4784984588623047, + "learning_rate": 0.00078646188850967, + "loss": 6.8152, + "step": 1877 + }, + { + "epoch": 0.6409556313993174, + "grad_norm": 3.503469467163086, + "learning_rate": 0.0007863481228668941, + "loss": 6.5573, + "step": 1878 + }, + { + "epoch": 0.6412969283276451, + "grad_norm": 3.3382554054260254, + "learning_rate": 0.0007862343572241183, + "loss": 7.0561, + "step": 1879 + }, + { + "epoch": 0.6416382252559727, + "grad_norm": 3.2195024490356445, + "learning_rate": 0.0007861205915813425, + "loss": 6.5636, + "step": 1880 + }, + { + "epoch": 0.6419795221843003, + "grad_norm": 3.3817391395568848, + "learning_rate": 0.0007860068259385666, + "loss": 6.0123, + "step": 1881 + }, + { + "epoch": 0.642320819112628, + "grad_norm": 5.903642177581787, + "learning_rate": 0.0007858930602957907, + "loss": 5.0916, + "step": 1882 + }, + { + "epoch": 0.6426621160409556, + "grad_norm": 3.8153889179229736, + "learning_rate": 0.0007857792946530148, + "loss": 6.7254, + "step": 1883 + }, + { + "epoch": 0.6430034129692833, + "grad_norm": 3.6918563842773438, + "learning_rate": 0.0007856655290102389, + "loss": 6.8791, + "step": 1884 + }, + { + "epoch": 0.643344709897611, + "grad_norm": 3.476710796356201, + "learning_rate": 0.000785551763367463, + "loss": 6.2294, + "step": 1885 + }, + { + "epoch": 0.6436860068259386, + "grad_norm": 3.537196397781372, + "learning_rate": 0.0007854379977246872, + "loss": 6.5357, + "step": 1886 + }, + { + "epoch": 0.6440273037542662, + "grad_norm": 4.697029113769531, + "learning_rate": 0.0007853242320819113, + "loss": 6.3114, + "step": 1887 + }, + { + "epoch": 0.6443686006825938, + "grad_norm": 3.5125951766967773, + "learning_rate": 0.0007852104664391354, + "loss": 6.6103, + "step": 1888 + }, + { + "epoch": 0.6447098976109215, + "grad_norm": 3.5048978328704834, + "learning_rate": 0.0007850967007963595, + "loss": 7.0019, + "step": 1889 + }, + { + "epoch": 0.6450511945392492, + "grad_norm": 3.2914698123931885, + "learning_rate": 0.0007849829351535836, + "loss": 6.8293, + "step": 1890 + }, + { + "epoch": 0.6453924914675768, + "grad_norm": 5.254654407501221, + "learning_rate": 0.0007848691695108077, + "loss": 6.2791, + "step": 1891 + }, + { + "epoch": 0.6457337883959045, + "grad_norm": 3.4875900745391846, + "learning_rate": 0.0007847554038680319, + "loss": 7.0699, + "step": 1892 + }, + { + "epoch": 0.6460750853242321, + "grad_norm": 7.53113317489624, + "learning_rate": 0.000784641638225256, + "loss": 5.7548, + "step": 1893 + }, + { + "epoch": 0.6464163822525597, + "grad_norm": 3.497556209564209, + "learning_rate": 0.0007845278725824802, + "loss": 6.2634, + "step": 1894 + }, + { + "epoch": 0.6467576791808873, + "grad_norm": 5.653488636016846, + "learning_rate": 0.0007844141069397043, + "loss": 5.9736, + "step": 1895 + }, + { + "epoch": 0.647098976109215, + "grad_norm": 3.3323814868927, + "learning_rate": 0.0007843003412969284, + "loss": 6.4086, + "step": 1896 + }, + { + "epoch": 0.6474402730375427, + "grad_norm": 5.373404026031494, + "learning_rate": 0.0007841865756541524, + "loss": 6.2502, + "step": 1897 + }, + { + "epoch": 0.6477815699658703, + "grad_norm": 4.002843379974365, + "learning_rate": 0.0007840728100113766, + "loss": 6.3939, + "step": 1898 + }, + { + "epoch": 0.648122866894198, + "grad_norm": 3.4836678504943848, + "learning_rate": 0.0007839590443686007, + "loss": 6.5778, + "step": 1899 + }, + { + "epoch": 0.6484641638225256, + "grad_norm": 5.268570899963379, + "learning_rate": 0.0007838452787258248, + "loss": 6.1417, + "step": 1900 + }, + { + "epoch": 0.6488054607508532, + "grad_norm": 3.3893330097198486, + "learning_rate": 0.0007837315130830489, + "loss": 6.579, + "step": 1901 + }, + { + "epoch": 0.6491467576791808, + "grad_norm": 3.871875762939453, + "learning_rate": 0.000783617747440273, + "loss": 6.7902, + "step": 1902 + }, + { + "epoch": 0.6494880546075086, + "grad_norm": 3.5524959564208984, + "learning_rate": 0.0007835039817974972, + "loss": 6.6698, + "step": 1903 + }, + { + "epoch": 0.6498293515358362, + "grad_norm": 3.273620128631592, + "learning_rate": 0.0007833902161547213, + "loss": 6.2347, + "step": 1904 + }, + { + "epoch": 0.6501706484641638, + "grad_norm": 3.283205986022949, + "learning_rate": 0.0007832764505119454, + "loss": 5.978, + "step": 1905 + }, + { + "epoch": 0.6505119453924915, + "grad_norm": 3.333651304244995, + "learning_rate": 0.0007831626848691695, + "loss": 6.5876, + "step": 1906 + }, + { + "epoch": 0.6508532423208191, + "grad_norm": 6.1999030113220215, + "learning_rate": 0.0007830489192263936, + "loss": 6.6631, + "step": 1907 + }, + { + "epoch": 0.6511945392491467, + "grad_norm": 3.410543203353882, + "learning_rate": 0.0007829351535836177, + "loss": 6.5243, + "step": 1908 + }, + { + "epoch": 0.6515358361774743, + "grad_norm": 4.422999858856201, + "learning_rate": 0.0007828213879408419, + "loss": 6.4633, + "step": 1909 + }, + { + "epoch": 0.6518771331058021, + "grad_norm": 3.5655972957611084, + "learning_rate": 0.000782707622298066, + "loss": 6.524, + "step": 1910 + }, + { + "epoch": 0.6522184300341297, + "grad_norm": 3.3885929584503174, + "learning_rate": 0.0007825938566552902, + "loss": 6.7883, + "step": 1911 + }, + { + "epoch": 0.6525597269624573, + "grad_norm": 3.3488664627075195, + "learning_rate": 0.0007824800910125143, + "loss": 6.917, + "step": 1912 + }, + { + "epoch": 0.652901023890785, + "grad_norm": 3.5068624019622803, + "learning_rate": 0.0007823663253697384, + "loss": 6.6262, + "step": 1913 + }, + { + "epoch": 0.6532423208191126, + "grad_norm": 3.233506202697754, + "learning_rate": 0.0007822525597269625, + "loss": 6.6697, + "step": 1914 + }, + { + "epoch": 0.6535836177474402, + "grad_norm": 3.320382833480835, + "learning_rate": 0.0007821387940841867, + "loss": 6.9184, + "step": 1915 + }, + { + "epoch": 0.653924914675768, + "grad_norm": 3.2479734420776367, + "learning_rate": 0.0007820250284414107, + "loss": 6.8682, + "step": 1916 + }, + { + "epoch": 0.6542662116040956, + "grad_norm": 3.2496206760406494, + "learning_rate": 0.0007819112627986348, + "loss": 6.945, + "step": 1917 + }, + { + "epoch": 0.6546075085324232, + "grad_norm": 3.3230035305023193, + "learning_rate": 0.0007817974971558589, + "loss": 6.6453, + "step": 1918 + }, + { + "epoch": 0.6549488054607508, + "grad_norm": 4.4280619621276855, + "learning_rate": 0.000781683731513083, + "loss": 5.9717, + "step": 1919 + }, + { + "epoch": 0.6552901023890785, + "grad_norm": 3.414978265762329, + "learning_rate": 0.0007815699658703072, + "loss": 7.0985, + "step": 1920 + }, + { + "epoch": 0.6556313993174061, + "grad_norm": 3.3314380645751953, + "learning_rate": 0.0007814562002275313, + "loss": 6.9524, + "step": 1921 + }, + { + "epoch": 0.6559726962457337, + "grad_norm": 3.4111015796661377, + "learning_rate": 0.0007813424345847554, + "loss": 6.455, + "step": 1922 + }, + { + "epoch": 0.6563139931740615, + "grad_norm": 3.4409379959106445, + "learning_rate": 0.0007812286689419795, + "loss": 6.6308, + "step": 1923 + }, + { + "epoch": 0.6566552901023891, + "grad_norm": 3.246084451675415, + "learning_rate": 0.0007811149032992036, + "loss": 6.8811, + "step": 1924 + }, + { + "epoch": 0.6569965870307167, + "grad_norm": 3.421894073486328, + "learning_rate": 0.0007810011376564277, + "loss": 6.9111, + "step": 1925 + }, + { + "epoch": 0.6573378839590444, + "grad_norm": 3.2279820442199707, + "learning_rate": 0.0007808873720136519, + "loss": 6.8361, + "step": 1926 + }, + { + "epoch": 0.657679180887372, + "grad_norm": 3.337752103805542, + "learning_rate": 0.000780773606370876, + "loss": 7.0658, + "step": 1927 + }, + { + "epoch": 0.6580204778156996, + "grad_norm": 3.4158434867858887, + "learning_rate": 0.0007806598407281002, + "loss": 6.6466, + "step": 1928 + }, + { + "epoch": 0.6583617747440274, + "grad_norm": 3.2831523418426514, + "learning_rate": 0.0007805460750853243, + "loss": 6.6213, + "step": 1929 + }, + { + "epoch": 0.658703071672355, + "grad_norm": 3.8880624771118164, + "learning_rate": 0.0007804323094425484, + "loss": 6.2975, + "step": 1930 + }, + { + "epoch": 0.6590443686006826, + "grad_norm": 3.35299015045166, + "learning_rate": 0.0007803185437997725, + "loss": 7.1754, + "step": 1931 + }, + { + "epoch": 0.6593856655290102, + "grad_norm": 3.4394142627716064, + "learning_rate": 0.0007802047781569967, + "loss": 6.6023, + "step": 1932 + }, + { + "epoch": 0.6597269624573379, + "grad_norm": 3.386638641357422, + "learning_rate": 0.0007800910125142208, + "loss": 6.9677, + "step": 1933 + }, + { + "epoch": 0.6600682593856655, + "grad_norm": 3.344113826751709, + "learning_rate": 0.0007799772468714449, + "loss": 6.9545, + "step": 1934 + }, + { + "epoch": 0.6604095563139932, + "grad_norm": 3.3587405681610107, + "learning_rate": 0.0007798634812286689, + "loss": 7.0659, + "step": 1935 + }, + { + "epoch": 0.6607508532423209, + "grad_norm": 3.9267964363098145, + "learning_rate": 0.000779749715585893, + "loss": 4.5715, + "step": 1936 + }, + { + "epoch": 0.6610921501706485, + "grad_norm": 3.7344627380371094, + "learning_rate": 0.0007796359499431171, + "loss": 6.0021, + "step": 1937 + }, + { + "epoch": 0.6614334470989761, + "grad_norm": 3.6474874019622803, + "learning_rate": 0.0007795221843003413, + "loss": 6.2885, + "step": 1938 + }, + { + "epoch": 0.6617747440273037, + "grad_norm": 6.553812026977539, + "learning_rate": 0.0007794084186575654, + "loss": 5.7511, + "step": 1939 + }, + { + "epoch": 0.6621160409556314, + "grad_norm": 3.5463573932647705, + "learning_rate": 0.0007792946530147895, + "loss": 6.4974, + "step": 1940 + }, + { + "epoch": 0.662457337883959, + "grad_norm": 3.536761999130249, + "learning_rate": 0.0007791808873720136, + "loss": 6.4656, + "step": 1941 + }, + { + "epoch": 0.6627986348122867, + "grad_norm": 3.480790615081787, + "learning_rate": 0.0007790671217292377, + "loss": 5.9657, + "step": 1942 + }, + { + "epoch": 0.6631399317406144, + "grad_norm": 3.640864849090576, + "learning_rate": 0.000778953356086462, + "loss": 7.0483, + "step": 1943 + }, + { + "epoch": 0.663481228668942, + "grad_norm": 3.7452423572540283, + "learning_rate": 0.000778839590443686, + "loss": 6.3805, + "step": 1944 + }, + { + "epoch": 0.6638225255972696, + "grad_norm": 3.4025330543518066, + "learning_rate": 0.0007787258248009102, + "loss": 6.5517, + "step": 1945 + }, + { + "epoch": 0.6641638225255972, + "grad_norm": 3.4373252391815186, + "learning_rate": 0.0007786120591581343, + "loss": 6.8531, + "step": 1946 + }, + { + "epoch": 0.6645051194539249, + "grad_norm": 3.367783546447754, + "learning_rate": 0.0007784982935153584, + "loss": 6.8904, + "step": 1947 + }, + { + "epoch": 0.6648464163822526, + "grad_norm": 3.365324020385742, + "learning_rate": 0.0007783845278725825, + "loss": 6.4844, + "step": 1948 + }, + { + "epoch": 0.6651877133105802, + "grad_norm": 3.2752928733825684, + "learning_rate": 0.0007782707622298067, + "loss": 6.9888, + "step": 1949 + }, + { + "epoch": 0.6655290102389079, + "grad_norm": 3.411865711212158, + "learning_rate": 0.0007781569965870308, + "loss": 6.9369, + "step": 1950 + }, + { + "epoch": 0.6658703071672355, + "grad_norm": 3.8111231327056885, + "learning_rate": 0.0007780432309442549, + "loss": 5.9984, + "step": 1951 + }, + { + "epoch": 0.6662116040955631, + "grad_norm": 3.5093679428100586, + "learning_rate": 0.000777929465301479, + "loss": 7.1369, + "step": 1952 + }, + { + "epoch": 0.6665529010238908, + "grad_norm": 8.49619197845459, + "learning_rate": 0.0007778156996587031, + "loss": 6.2056, + "step": 1953 + }, + { + "epoch": 0.6668941979522184, + "grad_norm": 3.572247266769409, + "learning_rate": 0.0007777019340159272, + "loss": 7.0313, + "step": 1954 + }, + { + "epoch": 0.6672354948805461, + "grad_norm": 6.142834663391113, + "learning_rate": 0.0007775881683731513, + "loss": 4.7668, + "step": 1955 + }, + { + "epoch": 0.6675767918088737, + "grad_norm": 3.7518608570098877, + "learning_rate": 0.0007774744027303754, + "loss": 7.0454, + "step": 1956 + }, + { + "epoch": 0.6679180887372014, + "grad_norm": 3.596379280090332, + "learning_rate": 0.0007773606370875995, + "loss": 6.8423, + "step": 1957 + }, + { + "epoch": 0.668259385665529, + "grad_norm": 3.454772710800171, + "learning_rate": 0.0007772468714448236, + "loss": 6.3305, + "step": 1958 + }, + { + "epoch": 0.6686006825938566, + "grad_norm": 3.3043911457061768, + "learning_rate": 0.0007771331058020477, + "loss": 6.575, + "step": 1959 + }, + { + "epoch": 0.6689419795221843, + "grad_norm": 3.314772844314575, + "learning_rate": 0.0007770193401592718, + "loss": 6.6896, + "step": 1960 + }, + { + "epoch": 0.669283276450512, + "grad_norm": 3.414109230041504, + "learning_rate": 0.000776905574516496, + "loss": 6.3657, + "step": 1961 + }, + { + "epoch": 0.6696245733788396, + "grad_norm": 8.866144180297852, + "learning_rate": 0.0007767918088737202, + "loss": 6.4915, + "step": 1962 + }, + { + "epoch": 0.6699658703071673, + "grad_norm": 5.978751182556152, + "learning_rate": 0.0007766780432309443, + "loss": 4.808, + "step": 1963 + }, + { + "epoch": 0.6703071672354949, + "grad_norm": 4.224075794219971, + "learning_rate": 0.0007765642775881684, + "loss": 6.2096, + "step": 1964 + }, + { + "epoch": 0.6706484641638225, + "grad_norm": 3.7597286701202393, + "learning_rate": 0.0007764505119453925, + "loss": 6.6962, + "step": 1965 + }, + { + "epoch": 0.6709897610921501, + "grad_norm": 3.6143290996551514, + "learning_rate": 0.0007763367463026167, + "loss": 6.2249, + "step": 1966 + }, + { + "epoch": 0.6713310580204778, + "grad_norm": 3.733955144882202, + "learning_rate": 0.0007762229806598408, + "loss": 6.1922, + "step": 1967 + }, + { + "epoch": 0.6716723549488055, + "grad_norm": 3.268876791000366, + "learning_rate": 0.0007761092150170649, + "loss": 6.6224, + "step": 1968 + }, + { + "epoch": 0.6720136518771331, + "grad_norm": 3.7937519550323486, + "learning_rate": 0.000775995449374289, + "loss": 6.1537, + "step": 1969 + }, + { + "epoch": 0.6723549488054608, + "grad_norm": 3.96634578704834, + "learning_rate": 0.0007758816837315131, + "loss": 5.2715, + "step": 1970 + }, + { + "epoch": 0.6726962457337884, + "grad_norm": 3.32902455329895, + "learning_rate": 0.0007757679180887372, + "loss": 7.1139, + "step": 1971 + }, + { + "epoch": 0.673037542662116, + "grad_norm": 3.3868730068206787, + "learning_rate": 0.0007756541524459614, + "loss": 7.0923, + "step": 1972 + }, + { + "epoch": 0.6733788395904436, + "grad_norm": 3.4353487491607666, + "learning_rate": 0.0007755403868031855, + "loss": 6.6146, + "step": 1973 + }, + { + "epoch": 0.6737201365187714, + "grad_norm": 4.369273662567139, + "learning_rate": 0.0007754266211604095, + "loss": 6.2917, + "step": 1974 + }, + { + "epoch": 0.674061433447099, + "grad_norm": 3.72794508934021, + "learning_rate": 0.0007753128555176336, + "loss": 6.1588, + "step": 1975 + }, + { + "epoch": 0.6744027303754266, + "grad_norm": 3.5048928260803223, + "learning_rate": 0.0007751990898748577, + "loss": 6.603, + "step": 1976 + }, + { + "epoch": 0.6747440273037543, + "grad_norm": 4.048648834228516, + "learning_rate": 0.0007750853242320818, + "loss": 5.5782, + "step": 1977 + }, + { + "epoch": 0.6750853242320819, + "grad_norm": 3.9075467586517334, + "learning_rate": 0.000774971558589306, + "loss": 5.8681, + "step": 1978 + }, + { + "epoch": 0.6754266211604095, + "grad_norm": 3.3288211822509766, + "learning_rate": 0.0007748577929465302, + "loss": 6.6653, + "step": 1979 + }, + { + "epoch": 0.6757679180887372, + "grad_norm": 3.535240650177002, + "learning_rate": 0.0007747440273037543, + "loss": 6.4881, + "step": 1980 + }, + { + "epoch": 0.6761092150170649, + "grad_norm": 3.2548129558563232, + "learning_rate": 0.0007746302616609784, + "loss": 6.792, + "step": 1981 + }, + { + "epoch": 0.6764505119453925, + "grad_norm": 3.5258853435516357, + "learning_rate": 0.0007745164960182025, + "loss": 6.5358, + "step": 1982 + }, + { + "epoch": 0.6767918088737201, + "grad_norm": 5.013880729675293, + "learning_rate": 0.0007744027303754267, + "loss": 6.1146, + "step": 1983 + }, + { + "epoch": 0.6771331058020478, + "grad_norm": 3.550227165222168, + "learning_rate": 0.0007742889647326508, + "loss": 6.9435, + "step": 1984 + }, + { + "epoch": 0.6774744027303754, + "grad_norm": 3.3671066761016846, + "learning_rate": 0.0007741751990898749, + "loss": 6.4325, + "step": 1985 + }, + { + "epoch": 0.677815699658703, + "grad_norm": 4.051577091217041, + "learning_rate": 0.000774061433447099, + "loss": 6.3684, + "step": 1986 + }, + { + "epoch": 0.6781569965870308, + "grad_norm": 3.442668914794922, + "learning_rate": 0.0007739476678043231, + "loss": 6.7317, + "step": 1987 + }, + { + "epoch": 0.6784982935153584, + "grad_norm": 3.2804269790649414, + "learning_rate": 0.0007738339021615472, + "loss": 6.6793, + "step": 1988 + }, + { + "epoch": 0.678839590443686, + "grad_norm": 4.920018196105957, + "learning_rate": 0.0007737201365187714, + "loss": 5.9878, + "step": 1989 + }, + { + "epoch": 0.6791808873720137, + "grad_norm": 3.5679967403411865, + "learning_rate": 0.0007736063708759955, + "loss": 6.9321, + "step": 1990 + }, + { + "epoch": 0.6795221843003413, + "grad_norm": 3.628213405609131, + "learning_rate": 0.0007734926052332196, + "loss": 6.5297, + "step": 1991 + }, + { + "epoch": 0.6798634812286689, + "grad_norm": 3.5422585010528564, + "learning_rate": 0.0007733788395904437, + "loss": 6.1211, + "step": 1992 + }, + { + "epoch": 0.6802047781569965, + "grad_norm": 3.3573875427246094, + "learning_rate": 0.0007732650739476678, + "loss": 7.0201, + "step": 1993 + }, + { + "epoch": 0.6805460750853243, + "grad_norm": 3.2984368801116943, + "learning_rate": 0.0007731513083048918, + "loss": 6.5626, + "step": 1994 + }, + { + "epoch": 0.6808873720136519, + "grad_norm": 3.394038200378418, + "learning_rate": 0.000773037542662116, + "loss": 6.9126, + "step": 1995 + }, + { + "epoch": 0.6812286689419795, + "grad_norm": 3.2731823921203613, + "learning_rate": 0.0007729237770193402, + "loss": 6.8367, + "step": 1996 + }, + { + "epoch": 0.6815699658703072, + "grad_norm": 3.3242337703704834, + "learning_rate": 0.0007728100113765643, + "loss": 6.6835, + "step": 1997 + }, + { + "epoch": 0.6819112627986348, + "grad_norm": 3.444890260696411, + "learning_rate": 0.0007726962457337884, + "loss": 6.7718, + "step": 1998 + }, + { + "epoch": 0.6822525597269624, + "grad_norm": 3.4120771884918213, + "learning_rate": 0.0007725824800910125, + "loss": 6.6572, + "step": 1999 + }, + { + "epoch": 0.6825938566552902, + "grad_norm": 4.183150768280029, + "learning_rate": 0.0007724687144482366, + "loss": 6.0941, + "step": 2000 + }, + { + "epoch": 0.6829351535836178, + "grad_norm": 3.6355671882629395, + "learning_rate": 0.0007723549488054608, + "loss": 5.8741, + "step": 2001 + }, + { + "epoch": 0.6832764505119454, + "grad_norm": 6.971808433532715, + "learning_rate": 0.0007722411831626849, + "loss": 4.9892, + "step": 2002 + }, + { + "epoch": 0.683617747440273, + "grad_norm": 4.916726589202881, + "learning_rate": 0.000772127417519909, + "loss": 6.2013, + "step": 2003 + }, + { + "epoch": 0.6839590443686007, + "grad_norm": 3.724529981613159, + "learning_rate": 0.0007720136518771331, + "loss": 6.086, + "step": 2004 + }, + { + "epoch": 0.6843003412969283, + "grad_norm": 3.7401139736175537, + "learning_rate": 0.0007718998862343572, + "loss": 6.875, + "step": 2005 + }, + { + "epoch": 0.6846416382252559, + "grad_norm": 3.41847562789917, + "learning_rate": 0.0007717861205915814, + "loss": 6.9408, + "step": 2006 + }, + { + "epoch": 0.6849829351535837, + "grad_norm": 3.425471782684326, + "learning_rate": 0.0007716723549488055, + "loss": 6.7579, + "step": 2007 + }, + { + "epoch": 0.6853242320819113, + "grad_norm": 3.197573661804199, + "learning_rate": 0.0007715585893060296, + "loss": 6.6664, + "step": 2008 + }, + { + "epoch": 0.6856655290102389, + "grad_norm": 3.4145679473876953, + "learning_rate": 0.0007714448236632537, + "loss": 6.515, + "step": 2009 + }, + { + "epoch": 0.6860068259385665, + "grad_norm": 3.9534404277801514, + "learning_rate": 0.0007713310580204778, + "loss": 6.2037, + "step": 2010 + }, + { + "epoch": 0.6863481228668942, + "grad_norm": 3.274080514907837, + "learning_rate": 0.000771217292377702, + "loss": 6.4559, + "step": 2011 + }, + { + "epoch": 0.6866894197952218, + "grad_norm": 3.604434013366699, + "learning_rate": 0.0007711035267349262, + "loss": 7.1227, + "step": 2012 + }, + { + "epoch": 0.6870307167235495, + "grad_norm": 3.635999917984009, + "learning_rate": 0.0007709897610921502, + "loss": 6.5486, + "step": 2013 + }, + { + "epoch": 0.6873720136518772, + "grad_norm": 3.22017765045166, + "learning_rate": 0.0007708759954493743, + "loss": 6.9555, + "step": 2014 + }, + { + "epoch": 0.6877133105802048, + "grad_norm": 3.5242180824279785, + "learning_rate": 0.0007707622298065984, + "loss": 6.6827, + "step": 2015 + }, + { + "epoch": 0.6880546075085324, + "grad_norm": 3.7955944538116455, + "learning_rate": 0.0007706484641638225, + "loss": 5.7521, + "step": 2016 + }, + { + "epoch": 0.68839590443686, + "grad_norm": 3.422865629196167, + "learning_rate": 0.0007705346985210466, + "loss": 6.7181, + "step": 2017 + }, + { + "epoch": 0.6887372013651877, + "grad_norm": 3.7344348430633545, + "learning_rate": 0.0007704209328782708, + "loss": 6.6848, + "step": 2018 + }, + { + "epoch": 0.6890784982935153, + "grad_norm": 3.6247267723083496, + "learning_rate": 0.0007703071672354949, + "loss": 6.3779, + "step": 2019 + }, + { + "epoch": 0.689419795221843, + "grad_norm": 3.669395923614502, + "learning_rate": 0.000770193401592719, + "loss": 6.2553, + "step": 2020 + }, + { + "epoch": 0.6897610921501707, + "grad_norm": 3.4517054557800293, + "learning_rate": 0.0007700796359499431, + "loss": 6.3794, + "step": 2021 + }, + { + "epoch": 0.6901023890784983, + "grad_norm": 3.477140188217163, + "learning_rate": 0.0007699658703071672, + "loss": 7.0087, + "step": 2022 + }, + { + "epoch": 0.6904436860068259, + "grad_norm": 3.354229688644409, + "learning_rate": 0.0007698521046643914, + "loss": 6.3737, + "step": 2023 + }, + { + "epoch": 0.6907849829351536, + "grad_norm": 3.4395453929901123, + "learning_rate": 0.0007697383390216155, + "loss": 6.6592, + "step": 2024 + }, + { + "epoch": 0.6911262798634812, + "grad_norm": 3.5157663822174072, + "learning_rate": 0.0007696245733788396, + "loss": 6.6477, + "step": 2025 + }, + { + "epoch": 0.6914675767918089, + "grad_norm": 3.3607208728790283, + "learning_rate": 0.0007695108077360637, + "loss": 6.3353, + "step": 2026 + }, + { + "epoch": 0.6918088737201366, + "grad_norm": 5.396403789520264, + "learning_rate": 0.0007693970420932878, + "loss": 6.0246, + "step": 2027 + }, + { + "epoch": 0.6921501706484642, + "grad_norm": 3.4905757904052734, + "learning_rate": 0.000769283276450512, + "loss": 6.1745, + "step": 2028 + }, + { + "epoch": 0.6924914675767918, + "grad_norm": 3.5405080318450928, + "learning_rate": 0.0007691695108077362, + "loss": 6.5764, + "step": 2029 + }, + { + "epoch": 0.6928327645051194, + "grad_norm": 4.243968963623047, + "learning_rate": 0.0007690557451649603, + "loss": 6.2635, + "step": 2030 + }, + { + "epoch": 0.6931740614334471, + "grad_norm": 6.531406402587891, + "learning_rate": 0.0007689419795221844, + "loss": 5.7165, + "step": 2031 + }, + { + "epoch": 0.6935153583617747, + "grad_norm": 3.5809173583984375, + "learning_rate": 0.0007688282138794085, + "loss": 6.8084, + "step": 2032 + }, + { + "epoch": 0.6938566552901024, + "grad_norm": 3.7270376682281494, + "learning_rate": 0.0007687144482366325, + "loss": 6.7852, + "step": 2033 + }, + { + "epoch": 0.6941979522184301, + "grad_norm": 3.384864330291748, + "learning_rate": 0.0007686006825938566, + "loss": 6.6244, + "step": 2034 + }, + { + "epoch": 0.6945392491467577, + "grad_norm": 4.850109100341797, + "learning_rate": 0.0007684869169510808, + "loss": 6.7117, + "step": 2035 + }, + { + "epoch": 0.6948805460750853, + "grad_norm": 6.860106468200684, + "learning_rate": 0.0007683731513083049, + "loss": 6.1666, + "step": 2036 + }, + { + "epoch": 0.6952218430034129, + "grad_norm": 3.350128412246704, + "learning_rate": 0.000768259385665529, + "loss": 6.9472, + "step": 2037 + }, + { + "epoch": 0.6955631399317406, + "grad_norm": 3.3778512477874756, + "learning_rate": 0.0007681456200227531, + "loss": 6.8923, + "step": 2038 + }, + { + "epoch": 0.6959044368600683, + "grad_norm": 3.257622003555298, + "learning_rate": 0.0007680318543799772, + "loss": 6.6449, + "step": 2039 + }, + { + "epoch": 0.6962457337883959, + "grad_norm": 3.383254051208496, + "learning_rate": 0.0007679180887372013, + "loss": 5.932, + "step": 2040 + }, + { + "epoch": 0.6965870307167236, + "grad_norm": 3.2423672676086426, + "learning_rate": 0.0007678043230944255, + "loss": 6.2983, + "step": 2041 + }, + { + "epoch": 0.6969283276450512, + "grad_norm": 3.3049263954162598, + "learning_rate": 0.0007676905574516496, + "loss": 6.4907, + "step": 2042 + }, + { + "epoch": 0.6972696245733788, + "grad_norm": 3.3618760108947754, + "learning_rate": 0.0007675767918088737, + "loss": 6.322, + "step": 2043 + }, + { + "epoch": 0.6976109215017064, + "grad_norm": 3.2208092212677, + "learning_rate": 0.0007674630261660978, + "loss": 6.8044, + "step": 2044 + }, + { + "epoch": 0.6979522184300341, + "grad_norm": 3.3235998153686523, + "learning_rate": 0.000767349260523322, + "loss": 5.9484, + "step": 2045 + }, + { + "epoch": 0.6982935153583618, + "grad_norm": 3.5273094177246094, + "learning_rate": 0.0007672354948805462, + "loss": 6.9184, + "step": 2046 + }, + { + "epoch": 0.6986348122866894, + "grad_norm": 3.4016122817993164, + "learning_rate": 0.0007671217292377703, + "loss": 6.5703, + "step": 2047 + }, + { + "epoch": 0.6989761092150171, + "grad_norm": 5.7711405754089355, + "learning_rate": 0.0007670079635949944, + "loss": 5.5968, + "step": 2048 + }, + { + "epoch": 0.6993174061433447, + "grad_norm": 7.544925689697266, + "learning_rate": 0.0007668941979522185, + "loss": 6.2352, + "step": 2049 + }, + { + "epoch": 0.6996587030716723, + "grad_norm": 3.5243000984191895, + "learning_rate": 0.0007667804323094426, + "loss": 6.3483, + "step": 2050 + }, + { + "epoch": 0.7, + "grad_norm": 3.510413885116577, + "learning_rate": 0.0007666666666666667, + "loss": 6.7339, + "step": 2051 + }, + { + "epoch": 0.7003412969283277, + "grad_norm": 3.272040843963623, + "learning_rate": 0.0007665529010238908, + "loss": 6.3808, + "step": 2052 + }, + { + "epoch": 0.7006825938566553, + "grad_norm": 3.230138063430786, + "learning_rate": 0.0007664391353811149, + "loss": 6.4907, + "step": 2053 + }, + { + "epoch": 0.701023890784983, + "grad_norm": 3.2666988372802734, + "learning_rate": 0.000766325369738339, + "loss": 6.5341, + "step": 2054 + }, + { + "epoch": 0.7013651877133106, + "grad_norm": 3.246366262435913, + "learning_rate": 0.0007662116040955631, + "loss": 6.4526, + "step": 2055 + }, + { + "epoch": 0.7017064846416382, + "grad_norm": 3.516317844390869, + "learning_rate": 0.0007660978384527872, + "loss": 7.0432, + "step": 2056 + }, + { + "epoch": 0.7020477815699658, + "grad_norm": 3.530691146850586, + "learning_rate": 0.0007659840728100113, + "loss": 6.7314, + "step": 2057 + }, + { + "epoch": 0.7023890784982935, + "grad_norm": 3.3396496772766113, + "learning_rate": 0.0007658703071672355, + "loss": 6.9605, + "step": 2058 + }, + { + "epoch": 0.7027303754266212, + "grad_norm": 3.306985378265381, + "learning_rate": 0.0007657565415244596, + "loss": 6.488, + "step": 2059 + }, + { + "epoch": 0.7030716723549488, + "grad_norm": 3.3087706565856934, + "learning_rate": 0.0007656427758816837, + "loss": 5.9937, + "step": 2060 + }, + { + "epoch": 0.7034129692832765, + "grad_norm": 3.159797191619873, + "learning_rate": 0.0007655290102389078, + "loss": 6.5233, + "step": 2061 + }, + { + "epoch": 0.7037542662116041, + "grad_norm": 3.4056601524353027, + "learning_rate": 0.000765415244596132, + "loss": 6.7224, + "step": 2062 + }, + { + "epoch": 0.7040955631399317, + "grad_norm": 3.676870822906494, + "learning_rate": 0.0007653014789533561, + "loss": 6.3446, + "step": 2063 + }, + { + "epoch": 0.7044368600682593, + "grad_norm": 3.3793466091156006, + "learning_rate": 0.0007651877133105803, + "loss": 6.8083, + "step": 2064 + }, + { + "epoch": 0.7047781569965871, + "grad_norm": 3.507800340652466, + "learning_rate": 0.0007650739476678044, + "loss": 7.0639, + "step": 2065 + }, + { + "epoch": 0.7051194539249147, + "grad_norm": 6.222537994384766, + "learning_rate": 0.0007649601820250285, + "loss": 6.6979, + "step": 2066 + }, + { + "epoch": 0.7054607508532423, + "grad_norm": 3.477949380874634, + "learning_rate": 0.0007648464163822526, + "loss": 6.6638, + "step": 2067 + }, + { + "epoch": 0.70580204778157, + "grad_norm": 3.4923906326293945, + "learning_rate": 0.0007647326507394767, + "loss": 6.8871, + "step": 2068 + }, + { + "epoch": 0.7061433447098976, + "grad_norm": 3.818601608276367, + "learning_rate": 0.0007646188850967009, + "loss": 6.0797, + "step": 2069 + }, + { + "epoch": 0.7064846416382252, + "grad_norm": 3.1661059856414795, + "learning_rate": 0.000764505119453925, + "loss": 6.2969, + "step": 2070 + }, + { + "epoch": 0.7068259385665528, + "grad_norm": 3.237128496170044, + "learning_rate": 0.0007643913538111491, + "loss": 6.4024, + "step": 2071 + }, + { + "epoch": 0.7071672354948806, + "grad_norm": 3.7169055938720703, + "learning_rate": 0.0007642775881683731, + "loss": 5.803, + "step": 2072 + }, + { + "epoch": 0.7075085324232082, + "grad_norm": 3.3643290996551514, + "learning_rate": 0.0007641638225255972, + "loss": 6.5204, + "step": 2073 + }, + { + "epoch": 0.7078498293515358, + "grad_norm": 3.5020577907562256, + "learning_rate": 0.0007640500568828213, + "loss": 6.4471, + "step": 2074 + }, + { + "epoch": 0.7081911262798635, + "grad_norm": 3.174459934234619, + "learning_rate": 0.0007639362912400455, + "loss": 6.4608, + "step": 2075 + }, + { + "epoch": 0.7085324232081911, + "grad_norm": 3.408348321914673, + "learning_rate": 0.0007638225255972696, + "loss": 6.2846, + "step": 2076 + }, + { + "epoch": 0.7088737201365187, + "grad_norm": 3.218968152999878, + "learning_rate": 0.0007637087599544937, + "loss": 6.5383, + "step": 2077 + }, + { + "epoch": 0.7092150170648465, + "grad_norm": 3.2198712825775146, + "learning_rate": 0.0007635949943117178, + "loss": 6.9809, + "step": 2078 + }, + { + "epoch": 0.7095563139931741, + "grad_norm": 3.364302635192871, + "learning_rate": 0.000763481228668942, + "loss": 6.0809, + "step": 2079 + }, + { + "epoch": 0.7098976109215017, + "grad_norm": 3.22308611869812, + "learning_rate": 0.0007633674630261661, + "loss": 6.7824, + "step": 2080 + }, + { + "epoch": 0.7102389078498293, + "grad_norm": 3.4846742153167725, + "learning_rate": 0.0007632536973833903, + "loss": 6.8048, + "step": 2081 + }, + { + "epoch": 0.710580204778157, + "grad_norm": 3.32481050491333, + "learning_rate": 0.0007631399317406144, + "loss": 6.4347, + "step": 2082 + }, + { + "epoch": 0.7109215017064846, + "grad_norm": 3.340768814086914, + "learning_rate": 0.0007630261660978385, + "loss": 6.7707, + "step": 2083 + }, + { + "epoch": 0.7112627986348122, + "grad_norm": 5.2757086753845215, + "learning_rate": 0.0007629124004550626, + "loss": 5.6813, + "step": 2084 + }, + { + "epoch": 0.71160409556314, + "grad_norm": 3.4196841716766357, + "learning_rate": 0.0007627986348122867, + "loss": 6.8647, + "step": 2085 + }, + { + "epoch": 0.7119453924914676, + "grad_norm": 3.5735483169555664, + "learning_rate": 0.0007626848691695109, + "loss": 6.4418, + "step": 2086 + }, + { + "epoch": 0.7122866894197952, + "grad_norm": 3.366319179534912, + "learning_rate": 0.000762571103526735, + "loss": 6.5476, + "step": 2087 + }, + { + "epoch": 0.7126279863481229, + "grad_norm": 3.2403366565704346, + "learning_rate": 0.0007624573378839591, + "loss": 6.5685, + "step": 2088 + }, + { + "epoch": 0.7129692832764505, + "grad_norm": 3.317497730255127, + "learning_rate": 0.0007623435722411832, + "loss": 6.5655, + "step": 2089 + }, + { + "epoch": 0.7133105802047781, + "grad_norm": 3.3788607120513916, + "learning_rate": 0.0007622298065984073, + "loss": 6.5866, + "step": 2090 + }, + { + "epoch": 0.7136518771331058, + "grad_norm": 7.484004020690918, + "learning_rate": 0.0007621160409556313, + "loss": 5.0795, + "step": 2091 + }, + { + "epoch": 0.7139931740614335, + "grad_norm": 3.5781850814819336, + "learning_rate": 0.0007620022753128555, + "loss": 6.0668, + "step": 2092 + }, + { + "epoch": 0.7143344709897611, + "grad_norm": 3.5423996448516846, + "learning_rate": 0.0007618885096700796, + "loss": 6.4591, + "step": 2093 + }, + { + "epoch": 0.7146757679180887, + "grad_norm": 3.2912189960479736, + "learning_rate": 0.0007617747440273037, + "loss": 6.3944, + "step": 2094 + }, + { + "epoch": 0.7150170648464164, + "grad_norm": 3.3964452743530273, + "learning_rate": 0.0007616609783845278, + "loss": 6.0996, + "step": 2095 + }, + { + "epoch": 0.715358361774744, + "grad_norm": 3.4383544921875, + "learning_rate": 0.000761547212741752, + "loss": 6.8264, + "step": 2096 + }, + { + "epoch": 0.7156996587030716, + "grad_norm": 3.80399227142334, + "learning_rate": 0.0007614334470989761, + "loss": 5.6848, + "step": 2097 + }, + { + "epoch": 0.7160409556313994, + "grad_norm": 3.4346396923065186, + "learning_rate": 0.0007613196814562003, + "loss": 6.3825, + "step": 2098 + }, + { + "epoch": 0.716382252559727, + "grad_norm": 4.535429000854492, + "learning_rate": 0.0007612059158134244, + "loss": 5.9877, + "step": 2099 + }, + { + "epoch": 0.7167235494880546, + "grad_norm": 4.623420238494873, + "learning_rate": 0.0007610921501706485, + "loss": 6.6088, + "step": 2100 + }, + { + "epoch": 0.7170648464163822, + "grad_norm": 3.520456552505493, + "learning_rate": 0.0007609783845278726, + "loss": 6.8445, + "step": 2101 + }, + { + "epoch": 0.7174061433447099, + "grad_norm": 3.4850854873657227, + "learning_rate": 0.0007608646188850967, + "loss": 6.5181, + "step": 2102 + }, + { + "epoch": 0.7177474402730375, + "grad_norm": 3.35722279548645, + "learning_rate": 0.0007607508532423208, + "loss": 7.1329, + "step": 2103 + }, + { + "epoch": 0.7180887372013652, + "grad_norm": 3.2173945903778076, + "learning_rate": 0.000760637087599545, + "loss": 6.9774, + "step": 2104 + }, + { + "epoch": 0.7184300341296929, + "grad_norm": 3.337705373764038, + "learning_rate": 0.0007605233219567691, + "loss": 7.1209, + "step": 2105 + }, + { + "epoch": 0.7187713310580205, + "grad_norm": 3.361863136291504, + "learning_rate": 0.0007604095563139932, + "loss": 6.4386, + "step": 2106 + }, + { + "epoch": 0.7191126279863481, + "grad_norm": 3.371455669403076, + "learning_rate": 0.0007602957906712173, + "loss": 7.0502, + "step": 2107 + }, + { + "epoch": 0.7194539249146757, + "grad_norm": 3.6178102493286133, + "learning_rate": 0.0007601820250284414, + "loss": 6.33, + "step": 2108 + }, + { + "epoch": 0.7197952218430034, + "grad_norm": 3.337318181991577, + "learning_rate": 0.0007600682593856656, + "loss": 7.0613, + "step": 2109 + }, + { + "epoch": 0.7201365187713311, + "grad_norm": 3.516834020614624, + "learning_rate": 0.0007599544937428896, + "loss": 6.2131, + "step": 2110 + }, + { + "epoch": 0.7204778156996587, + "grad_norm": 3.4340057373046875, + "learning_rate": 0.0007598407281001137, + "loss": 6.4354, + "step": 2111 + }, + { + "epoch": 0.7208191126279864, + "grad_norm": 3.5667688846588135, + "learning_rate": 0.0007597269624573379, + "loss": 6.841, + "step": 2112 + }, + { + "epoch": 0.721160409556314, + "grad_norm": 3.2933144569396973, + "learning_rate": 0.000759613196814562, + "loss": 6.542, + "step": 2113 + }, + { + "epoch": 0.7215017064846416, + "grad_norm": 3.4875848293304443, + "learning_rate": 0.0007594994311717861, + "loss": 6.9857, + "step": 2114 + }, + { + "epoch": 0.7218430034129693, + "grad_norm": 4.947911739349365, + "learning_rate": 0.0007593856655290103, + "loss": 5.5756, + "step": 2115 + }, + { + "epoch": 0.7221843003412969, + "grad_norm": 3.511719226837158, + "learning_rate": 0.0007592718998862344, + "loss": 6.6189, + "step": 2116 + }, + { + "epoch": 0.7225255972696246, + "grad_norm": 3.4902215003967285, + "learning_rate": 0.0007591581342434585, + "loss": 6.9783, + "step": 2117 + }, + { + "epoch": 0.7228668941979522, + "grad_norm": 3.5169901847839355, + "learning_rate": 0.0007590443686006826, + "loss": 6.8164, + "step": 2118 + }, + { + "epoch": 0.7232081911262799, + "grad_norm": 3.3471617698669434, + "learning_rate": 0.0007589306029579067, + "loss": 6.8412, + "step": 2119 + }, + { + "epoch": 0.7235494880546075, + "grad_norm": 3.3342645168304443, + "learning_rate": 0.0007588168373151308, + "loss": 6.4882, + "step": 2120 + }, + { + "epoch": 0.7238907849829351, + "grad_norm": 3.29974627494812, + "learning_rate": 0.000758703071672355, + "loss": 6.5342, + "step": 2121 + }, + { + "epoch": 0.7242320819112628, + "grad_norm": 5.739980697631836, + "learning_rate": 0.0007585893060295791, + "loss": 6.6463, + "step": 2122 + }, + { + "epoch": 0.7245733788395905, + "grad_norm": 4.314258575439453, + "learning_rate": 0.0007584755403868032, + "loss": 5.9307, + "step": 2123 + }, + { + "epoch": 0.7249146757679181, + "grad_norm": 5.394867897033691, + "learning_rate": 0.0007583617747440273, + "loss": 6.2618, + "step": 2124 + }, + { + "epoch": 0.7252559726962458, + "grad_norm": 3.7781617641448975, + "learning_rate": 0.0007582480091012514, + "loss": 6.4021, + "step": 2125 + }, + { + "epoch": 0.7255972696245734, + "grad_norm": 3.883280038833618, + "learning_rate": 0.0007581342434584756, + "loss": 6.2862, + "step": 2126 + }, + { + "epoch": 0.725938566552901, + "grad_norm": 3.35640025138855, + "learning_rate": 0.0007580204778156997, + "loss": 6.9783, + "step": 2127 + }, + { + "epoch": 0.7262798634812286, + "grad_norm": 3.3483872413635254, + "learning_rate": 0.0007579067121729239, + "loss": 6.6387, + "step": 2128 + }, + { + "epoch": 0.7266211604095563, + "grad_norm": 3.295193672180176, + "learning_rate": 0.000757792946530148, + "loss": 6.7598, + "step": 2129 + }, + { + "epoch": 0.726962457337884, + "grad_norm": 5.115669250488281, + "learning_rate": 0.000757679180887372, + "loss": 5.9118, + "step": 2130 + }, + { + "epoch": 0.7273037542662116, + "grad_norm": 3.72269606590271, + "learning_rate": 0.0007575654152445961, + "loss": 6.5912, + "step": 2131 + }, + { + "epoch": 0.7276450511945393, + "grad_norm": 3.6045024394989014, + "learning_rate": 0.0007574516496018203, + "loss": 6.7721, + "step": 2132 + }, + { + "epoch": 0.7279863481228669, + "grad_norm": 3.3702828884124756, + "learning_rate": 0.0007573378839590444, + "loss": 6.1741, + "step": 2133 + }, + { + "epoch": 0.7283276450511945, + "grad_norm": 3.3547866344451904, + "learning_rate": 0.0007572241183162685, + "loss": 6.5913, + "step": 2134 + }, + { + "epoch": 0.7286689419795221, + "grad_norm": 3.366942882537842, + "learning_rate": 0.0007571103526734926, + "loss": 6.519, + "step": 2135 + }, + { + "epoch": 0.7290102389078499, + "grad_norm": 3.3425605297088623, + "learning_rate": 0.0007569965870307167, + "loss": 7.0345, + "step": 2136 + }, + { + "epoch": 0.7293515358361775, + "grad_norm": 6.479069232940674, + "learning_rate": 0.0007568828213879408, + "loss": 6.3204, + "step": 2137 + }, + { + "epoch": 0.7296928327645051, + "grad_norm": 3.5541889667510986, + "learning_rate": 0.000756769055745165, + "loss": 6.6065, + "step": 2138 + }, + { + "epoch": 0.7300341296928328, + "grad_norm": 3.4596619606018066, + "learning_rate": 0.0007566552901023891, + "loss": 6.6794, + "step": 2139 + }, + { + "epoch": 0.7303754266211604, + "grad_norm": 3.4551479816436768, + "learning_rate": 0.0007565415244596132, + "loss": 6.7369, + "step": 2140 + }, + { + "epoch": 0.730716723549488, + "grad_norm": 3.431898832321167, + "learning_rate": 0.0007564277588168373, + "loss": 6.1183, + "step": 2141 + }, + { + "epoch": 0.7310580204778157, + "grad_norm": 3.323843002319336, + "learning_rate": 0.0007563139931740614, + "loss": 6.919, + "step": 2142 + }, + { + "epoch": 0.7313993174061434, + "grad_norm": 3.4173998832702637, + "learning_rate": 0.0007562002275312855, + "loss": 6.7127, + "step": 2143 + }, + { + "epoch": 0.731740614334471, + "grad_norm": 3.3746798038482666, + "learning_rate": 0.0007560864618885098, + "loss": 6.8423, + "step": 2144 + }, + { + "epoch": 0.7320819112627986, + "grad_norm": 3.6130669116973877, + "learning_rate": 0.0007559726962457339, + "loss": 6.3245, + "step": 2145 + }, + { + "epoch": 0.7324232081911263, + "grad_norm": 3.3271877765655518, + "learning_rate": 0.000755858930602958, + "loss": 6.885, + "step": 2146 + }, + { + "epoch": 0.7327645051194539, + "grad_norm": 4.328009605407715, + "learning_rate": 0.0007557451649601821, + "loss": 6.3472, + "step": 2147 + }, + { + "epoch": 0.7331058020477815, + "grad_norm": 3.4987595081329346, + "learning_rate": 0.0007556313993174062, + "loss": 6.632, + "step": 2148 + }, + { + "epoch": 0.7334470989761093, + "grad_norm": 3.967522144317627, + "learning_rate": 0.0007555176336746303, + "loss": 6.3009, + "step": 2149 + }, + { + "epoch": 0.7337883959044369, + "grad_norm": 4.190941333770752, + "learning_rate": 0.0007554038680318544, + "loss": 5.315, + "step": 2150 + }, + { + "epoch": 0.7341296928327645, + "grad_norm": 3.2750062942504883, + "learning_rate": 0.0007552901023890785, + "loss": 6.4132, + "step": 2151 + }, + { + "epoch": 0.7344709897610922, + "grad_norm": 3.1912786960601807, + "learning_rate": 0.0007551763367463026, + "loss": 6.4303, + "step": 2152 + }, + { + "epoch": 0.7348122866894198, + "grad_norm": 4.60476016998291, + "learning_rate": 0.0007550625711035267, + "loss": 5.7562, + "step": 2153 + }, + { + "epoch": 0.7351535836177474, + "grad_norm": 3.1308608055114746, + "learning_rate": 0.0007549488054607508, + "loss": 6.6797, + "step": 2154 + }, + { + "epoch": 0.735494880546075, + "grad_norm": 3.2094202041625977, + "learning_rate": 0.000754835039817975, + "loss": 6.6746, + "step": 2155 + }, + { + "epoch": 0.7358361774744028, + "grad_norm": 3.3604657649993896, + "learning_rate": 0.0007547212741751991, + "loss": 7.061, + "step": 2156 + }, + { + "epoch": 0.7361774744027304, + "grad_norm": 3.310800075531006, + "learning_rate": 0.0007546075085324232, + "loss": 6.4595, + "step": 2157 + }, + { + "epoch": 0.736518771331058, + "grad_norm": 4.224414348602295, + "learning_rate": 0.0007544937428896473, + "loss": 6.3834, + "step": 2158 + }, + { + "epoch": 0.7368600682593857, + "grad_norm": 3.4265730381011963, + "learning_rate": 0.0007543799772468714, + "loss": 6.8672, + "step": 2159 + }, + { + "epoch": 0.7372013651877133, + "grad_norm": 3.1829991340637207, + "learning_rate": 0.0007542662116040955, + "loss": 6.7422, + "step": 2160 + }, + { + "epoch": 0.7375426621160409, + "grad_norm": 3.3258779048919678, + "learning_rate": 0.0007541524459613198, + "loss": 6.8123, + "step": 2161 + }, + { + "epoch": 0.7378839590443687, + "grad_norm": 3.4259634017944336, + "learning_rate": 0.0007540386803185439, + "loss": 7.0327, + "step": 2162 + }, + { + "epoch": 0.7382252559726963, + "grad_norm": 6.536485195159912, + "learning_rate": 0.000753924914675768, + "loss": 6.6945, + "step": 2163 + }, + { + "epoch": 0.7385665529010239, + "grad_norm": 3.6783645153045654, + "learning_rate": 0.0007538111490329921, + "loss": 7.1871, + "step": 2164 + }, + { + "epoch": 0.7389078498293515, + "grad_norm": 3.5718960762023926, + "learning_rate": 0.0007536973833902162, + "loss": 6.322, + "step": 2165 + }, + { + "epoch": 0.7392491467576792, + "grad_norm": 3.513333320617676, + "learning_rate": 0.0007535836177474404, + "loss": 6.5015, + "step": 2166 + }, + { + "epoch": 0.7395904436860068, + "grad_norm": 3.306096315383911, + "learning_rate": 0.0007534698521046645, + "loss": 6.2813, + "step": 2167 + }, + { + "epoch": 0.7399317406143344, + "grad_norm": 4.2289557456970215, + "learning_rate": 0.0007533560864618886, + "loss": 6.128, + "step": 2168 + }, + { + "epoch": 0.7402730375426622, + "grad_norm": 3.4186182022094727, + "learning_rate": 0.0007532423208191126, + "loss": 6.4153, + "step": 2169 + }, + { + "epoch": 0.7406143344709898, + "grad_norm": 3.4397335052490234, + "learning_rate": 0.0007531285551763367, + "loss": 6.4691, + "step": 2170 + }, + { + "epoch": 0.7409556313993174, + "grad_norm": 3.2782340049743652, + "learning_rate": 0.0007530147895335608, + "loss": 6.8505, + "step": 2171 + }, + { + "epoch": 0.741296928327645, + "grad_norm": 3.247593879699707, + "learning_rate": 0.000752901023890785, + "loss": 6.429, + "step": 2172 + }, + { + "epoch": 0.7416382252559727, + "grad_norm": 3.29437518119812, + "learning_rate": 0.0007527872582480091, + "loss": 6.7258, + "step": 2173 + }, + { + "epoch": 0.7419795221843003, + "grad_norm": 3.4087376594543457, + "learning_rate": 0.0007526734926052332, + "loss": 7.0698, + "step": 2174 + }, + { + "epoch": 0.742320819112628, + "grad_norm": 3.289987564086914, + "learning_rate": 0.0007525597269624573, + "loss": 6.3238, + "step": 2175 + }, + { + "epoch": 0.7426621160409557, + "grad_norm": 3.370927095413208, + "learning_rate": 0.0007524459613196814, + "loss": 6.6566, + "step": 2176 + }, + { + "epoch": 0.7430034129692833, + "grad_norm": 3.518704891204834, + "learning_rate": 0.0007523321956769055, + "loss": 6.1031, + "step": 2177 + }, + { + "epoch": 0.7433447098976109, + "grad_norm": 3.2359862327575684, + "learning_rate": 0.0007522184300341298, + "loss": 6.4414, + "step": 2178 + }, + { + "epoch": 0.7436860068259386, + "grad_norm": 3.503610134124756, + "learning_rate": 0.0007521046643913539, + "loss": 6.473, + "step": 2179 + }, + { + "epoch": 0.7440273037542662, + "grad_norm": 3.644157886505127, + "learning_rate": 0.000751990898748578, + "loss": 6.7016, + "step": 2180 + }, + { + "epoch": 0.7443686006825938, + "grad_norm": 3.6388585567474365, + "learning_rate": 0.0007518771331058021, + "loss": 6.0456, + "step": 2181 + }, + { + "epoch": 0.7447098976109215, + "grad_norm": 3.494711399078369, + "learning_rate": 0.0007517633674630262, + "loss": 6.3996, + "step": 2182 + }, + { + "epoch": 0.7450511945392492, + "grad_norm": 3.3580422401428223, + "learning_rate": 0.0007516496018202503, + "loss": 6.791, + "step": 2183 + }, + { + "epoch": 0.7453924914675768, + "grad_norm": 3.2778096199035645, + "learning_rate": 0.0007515358361774745, + "loss": 6.8585, + "step": 2184 + }, + { + "epoch": 0.7457337883959044, + "grad_norm": 4.171144485473633, + "learning_rate": 0.0007514220705346986, + "loss": 6.5811, + "step": 2185 + }, + { + "epoch": 0.7460750853242321, + "grad_norm": 3.2704648971557617, + "learning_rate": 0.0007513083048919227, + "loss": 6.4436, + "step": 2186 + }, + { + "epoch": 0.7464163822525597, + "grad_norm": 3.2986881732940674, + "learning_rate": 0.0007511945392491468, + "loss": 6.3978, + "step": 2187 + }, + { + "epoch": 0.7467576791808874, + "grad_norm": 3.2707314491271973, + "learning_rate": 0.0007510807736063708, + "loss": 6.8469, + "step": 2188 + }, + { + "epoch": 0.747098976109215, + "grad_norm": 3.3278586864471436, + "learning_rate": 0.000750967007963595, + "loss": 6.7223, + "step": 2189 + }, + { + "epoch": 0.7474402730375427, + "grad_norm": 3.3085014820098877, + "learning_rate": 0.0007508532423208191, + "loss": 6.732, + "step": 2190 + }, + { + "epoch": 0.7477815699658703, + "grad_norm": 3.2527832984924316, + "learning_rate": 0.0007507394766780432, + "loss": 6.7635, + "step": 2191 + }, + { + "epoch": 0.7481228668941979, + "grad_norm": 4.035043239593506, + "learning_rate": 0.0007506257110352673, + "loss": 6.5922, + "step": 2192 + }, + { + "epoch": 0.7484641638225256, + "grad_norm": 3.383103370666504, + "learning_rate": 0.0007505119453924914, + "loss": 6.6097, + "step": 2193 + }, + { + "epoch": 0.7488054607508532, + "grad_norm": 3.518967628479004, + "learning_rate": 0.0007503981797497155, + "loss": 6.7073, + "step": 2194 + }, + { + "epoch": 0.7491467576791809, + "grad_norm": 3.142347574234009, + "learning_rate": 0.0007502844141069398, + "loss": 6.7196, + "step": 2195 + }, + { + "epoch": 0.7494880546075086, + "grad_norm": 3.3072586059570312, + "learning_rate": 0.0007501706484641639, + "loss": 7.2352, + "step": 2196 + }, + { + "epoch": 0.7498293515358362, + "grad_norm": 3.449594020843506, + "learning_rate": 0.000750056882821388, + "loss": 6.746, + "step": 2197 + }, + { + "epoch": 0.7501706484641638, + "grad_norm": 3.179112672805786, + "learning_rate": 0.0007499431171786121, + "loss": 6.3185, + "step": 2198 + }, + { + "epoch": 0.7505119453924914, + "grad_norm": 3.2998242378234863, + "learning_rate": 0.0007498293515358362, + "loss": 6.7703, + "step": 2199 + }, + { + "epoch": 0.7508532423208191, + "grad_norm": 5.338634967803955, + "learning_rate": 0.0007497155858930603, + "loss": 5.8161, + "step": 2200 + }, + { + "epoch": 0.7511945392491468, + "grad_norm": 4.291819095611572, + "learning_rate": 0.0007496018202502845, + "loss": 6.3253, + "step": 2201 + }, + { + "epoch": 0.7515358361774744, + "grad_norm": 3.579894542694092, + "learning_rate": 0.0007494880546075086, + "loss": 6.7618, + "step": 2202 + }, + { + "epoch": 0.7518771331058021, + "grad_norm": 3.397998809814453, + "learning_rate": 0.0007493742889647327, + "loss": 6.4193, + "step": 2203 + }, + { + "epoch": 0.7522184300341297, + "grad_norm": 3.181931972503662, + "learning_rate": 0.0007492605233219568, + "loss": 6.1566, + "step": 2204 + }, + { + "epoch": 0.7525597269624573, + "grad_norm": 3.316357374191284, + "learning_rate": 0.0007491467576791809, + "loss": 6.9517, + "step": 2205 + }, + { + "epoch": 0.752901023890785, + "grad_norm": 3.6711394786834717, + "learning_rate": 0.000749032992036405, + "loss": 6.699, + "step": 2206 + }, + { + "epoch": 0.7532423208191126, + "grad_norm": 3.2248027324676514, + "learning_rate": 0.0007489192263936292, + "loss": 6.7637, + "step": 2207 + }, + { + "epoch": 0.7535836177474403, + "grad_norm": 6.611885070800781, + "learning_rate": 0.0007488054607508532, + "loss": 5.6556, + "step": 2208 + }, + { + "epoch": 0.7539249146757679, + "grad_norm": 3.5705573558807373, + "learning_rate": 0.0007486916951080773, + "loss": 6.6186, + "step": 2209 + }, + { + "epoch": 0.7542662116040956, + "grad_norm": 3.3954081535339355, + "learning_rate": 0.0007485779294653014, + "loss": 6.6294, + "step": 2210 + }, + { + "epoch": 0.7546075085324232, + "grad_norm": 4.374399185180664, + "learning_rate": 0.0007484641638225255, + "loss": 5.2589, + "step": 2211 + }, + { + "epoch": 0.7549488054607508, + "grad_norm": 3.399153232574463, + "learning_rate": 0.0007483503981797498, + "loss": 7.0186, + "step": 2212 + }, + { + "epoch": 0.7552901023890785, + "grad_norm": 3.427119016647339, + "learning_rate": 0.0007482366325369739, + "loss": 6.4384, + "step": 2213 + }, + { + "epoch": 0.7556313993174062, + "grad_norm": 3.3054943084716797, + "learning_rate": 0.000748122866894198, + "loss": 6.8928, + "step": 2214 + }, + { + "epoch": 0.7559726962457338, + "grad_norm": 3.1258208751678467, + "learning_rate": 0.0007480091012514221, + "loss": 7.0228, + "step": 2215 + }, + { + "epoch": 0.7563139931740614, + "grad_norm": 3.197582721710205, + "learning_rate": 0.0007478953356086462, + "loss": 6.4303, + "step": 2216 + }, + { + "epoch": 0.7566552901023891, + "grad_norm": 3.482532262802124, + "learning_rate": 0.0007477815699658703, + "loss": 6.5166, + "step": 2217 + }, + { + "epoch": 0.7569965870307167, + "grad_norm": 3.3333423137664795, + "learning_rate": 0.0007476678043230945, + "loss": 6.8339, + "step": 2218 + }, + { + "epoch": 0.7573378839590443, + "grad_norm": 3.8367319107055664, + "learning_rate": 0.0007475540386803186, + "loss": 4.8938, + "step": 2219 + }, + { + "epoch": 0.757679180887372, + "grad_norm": 3.483358383178711, + "learning_rate": 0.0007474402730375427, + "loss": 6.3541, + "step": 2220 + }, + { + "epoch": 0.7580204778156997, + "grad_norm": 3.646254539489746, + "learning_rate": 0.0007473265073947668, + "loss": 6.2515, + "step": 2221 + }, + { + "epoch": 0.7583617747440273, + "grad_norm": 3.3919055461883545, + "learning_rate": 0.0007472127417519909, + "loss": 6.4064, + "step": 2222 + }, + { + "epoch": 0.758703071672355, + "grad_norm": 3.8102240562438965, + "learning_rate": 0.000747098976109215, + "loss": 6.0644, + "step": 2223 + }, + { + "epoch": 0.7590443686006826, + "grad_norm": 4.608096599578857, + "learning_rate": 0.0007469852104664392, + "loss": 6.7343, + "step": 2224 + }, + { + "epoch": 0.7593856655290102, + "grad_norm": 4.500402450561523, + "learning_rate": 0.0007468714448236633, + "loss": 6.7885, + "step": 2225 + }, + { + "epoch": 0.7597269624573378, + "grad_norm": 3.641897439956665, + "learning_rate": 0.0007467576791808874, + "loss": 7.0954, + "step": 2226 + }, + { + "epoch": 0.7600682593856656, + "grad_norm": 3.584850788116455, + "learning_rate": 0.0007466439135381114, + "loss": 6.7046, + "step": 2227 + }, + { + "epoch": 0.7604095563139932, + "grad_norm": 3.864504337310791, + "learning_rate": 0.0007465301478953355, + "loss": 6.564, + "step": 2228 + }, + { + "epoch": 0.7607508532423208, + "grad_norm": 3.3200836181640625, + "learning_rate": 0.0007464163822525596, + "loss": 6.8344, + "step": 2229 + }, + { + "epoch": 0.7610921501706485, + "grad_norm": 3.698155641555786, + "learning_rate": 0.0007463026166097839, + "loss": 6.0784, + "step": 2230 + }, + { + "epoch": 0.7614334470989761, + "grad_norm": 3.2318496704101562, + "learning_rate": 0.000746188850967008, + "loss": 6.6715, + "step": 2231 + }, + { + "epoch": 0.7617747440273037, + "grad_norm": 3.3073530197143555, + "learning_rate": 0.0007460750853242321, + "loss": 6.779, + "step": 2232 + }, + { + "epoch": 0.7621160409556313, + "grad_norm": 3.1656546592712402, + "learning_rate": 0.0007459613196814562, + "loss": 6.1155, + "step": 2233 + }, + { + "epoch": 0.7624573378839591, + "grad_norm": 3.251887798309326, + "learning_rate": 0.0007458475540386803, + "loss": 6.747, + "step": 2234 + }, + { + "epoch": 0.7627986348122867, + "grad_norm": 3.452404022216797, + "learning_rate": 0.0007457337883959045, + "loss": 6.3328, + "step": 2235 + }, + { + "epoch": 0.7631399317406143, + "grad_norm": 10.201861381530762, + "learning_rate": 0.0007456200227531286, + "loss": 5.8861, + "step": 2236 + }, + { + "epoch": 0.763481228668942, + "grad_norm": 6.577554225921631, + "learning_rate": 0.0007455062571103527, + "loss": 6.1329, + "step": 2237 + }, + { + "epoch": 0.7638225255972696, + "grad_norm": 4.222815036773682, + "learning_rate": 0.0007453924914675768, + "loss": 6.9536, + "step": 2238 + }, + { + "epoch": 0.7641638225255972, + "grad_norm": 4.045281887054443, + "learning_rate": 0.0007452787258248009, + "loss": 6.7163, + "step": 2239 + }, + { + "epoch": 0.764505119453925, + "grad_norm": 3.9914534091949463, + "learning_rate": 0.000745164960182025, + "loss": 6.5436, + "step": 2240 + }, + { + "epoch": 0.7648464163822526, + "grad_norm": 3.4769484996795654, + "learning_rate": 0.0007450511945392492, + "loss": 6.695, + "step": 2241 + }, + { + "epoch": 0.7651877133105802, + "grad_norm": 3.3896267414093018, + "learning_rate": 0.0007449374288964733, + "loss": 6.5388, + "step": 2242 + }, + { + "epoch": 0.7655290102389078, + "grad_norm": 3.0643441677093506, + "learning_rate": 0.0007448236632536974, + "loss": 6.0324, + "step": 2243 + }, + { + "epoch": 0.7658703071672355, + "grad_norm": 3.558314800262451, + "learning_rate": 0.0007447098976109215, + "loss": 6.1635, + "step": 2244 + }, + { + "epoch": 0.7662116040955631, + "grad_norm": 3.5593554973602295, + "learning_rate": 0.0007445961319681457, + "loss": 7.1386, + "step": 2245 + }, + { + "epoch": 0.7665529010238907, + "grad_norm": 7.941109657287598, + "learning_rate": 0.0007444823663253698, + "loss": 5.8192, + "step": 2246 + }, + { + "epoch": 0.7668941979522185, + "grad_norm": 3.61783504486084, + "learning_rate": 0.0007443686006825939, + "loss": 6.484, + "step": 2247 + }, + { + "epoch": 0.7672354948805461, + "grad_norm": 3.6058130264282227, + "learning_rate": 0.000744254835039818, + "loss": 6.908, + "step": 2248 + }, + { + "epoch": 0.7675767918088737, + "grad_norm": 3.377002477645874, + "learning_rate": 0.0007441410693970421, + "loss": 6.7491, + "step": 2249 + }, + { + "epoch": 0.7679180887372014, + "grad_norm": 3.3072400093078613, + "learning_rate": 0.0007440273037542662, + "loss": 6.5535, + "step": 2250 + }, + { + "epoch": 0.768259385665529, + "grad_norm": 5.5078020095825195, + "learning_rate": 0.0007439135381114903, + "loss": 6.8116, + "step": 2251 + }, + { + "epoch": 0.7686006825938566, + "grad_norm": 3.3346757888793945, + "learning_rate": 0.0007437997724687145, + "loss": 6.6911, + "step": 2252 + }, + { + "epoch": 0.7689419795221843, + "grad_norm": 3.3978588581085205, + "learning_rate": 0.0007436860068259386, + "loss": 6.6, + "step": 2253 + }, + { + "epoch": 0.769283276450512, + "grad_norm": 3.0739238262176514, + "learning_rate": 0.0007435722411831627, + "loss": 6.3782, + "step": 2254 + }, + { + "epoch": 0.7696245733788396, + "grad_norm": 3.2551000118255615, + "learning_rate": 0.0007434584755403868, + "loss": 6.6849, + "step": 2255 + }, + { + "epoch": 0.7699658703071672, + "grad_norm": 3.300963878631592, + "learning_rate": 0.0007433447098976109, + "loss": 6.8587, + "step": 2256 + }, + { + "epoch": 0.7703071672354949, + "grad_norm": 3.314493179321289, + "learning_rate": 0.000743230944254835, + "loss": 6.3657, + "step": 2257 + }, + { + "epoch": 0.7706484641638225, + "grad_norm": 3.863924264907837, + "learning_rate": 0.0007431171786120592, + "loss": 6.2232, + "step": 2258 + }, + { + "epoch": 0.7709897610921501, + "grad_norm": 3.727370023727417, + "learning_rate": 0.0007430034129692833, + "loss": 6.1062, + "step": 2259 + }, + { + "epoch": 0.7713310580204779, + "grad_norm": 3.3532567024230957, + "learning_rate": 0.0007428896473265074, + "loss": 6.0705, + "step": 2260 + }, + { + "epoch": 0.7716723549488055, + "grad_norm": 3.544203281402588, + "learning_rate": 0.0007427758816837315, + "loss": 6.8618, + "step": 2261 + }, + { + "epoch": 0.7720136518771331, + "grad_norm": 3.5925631523132324, + "learning_rate": 0.0007426621160409557, + "loss": 6.7984, + "step": 2262 + }, + { + "epoch": 0.7723549488054607, + "grad_norm": 11.028677940368652, + "learning_rate": 0.0007425483503981798, + "loss": 7.319, + "step": 2263 + }, + { + "epoch": 0.7726962457337884, + "grad_norm": 4.054958343505859, + "learning_rate": 0.000742434584755404, + "loss": 3.7261, + "step": 2264 + }, + { + "epoch": 0.773037542662116, + "grad_norm": 3.978557825088501, + "learning_rate": 0.0007423208191126281, + "loss": 6.3767, + "step": 2265 + }, + { + "epoch": 0.7733788395904437, + "grad_norm": 3.5264103412628174, + "learning_rate": 0.0007422070534698521, + "loss": 6.3895, + "step": 2266 + }, + { + "epoch": 0.7737201365187714, + "grad_norm": 5.270613193511963, + "learning_rate": 0.0007420932878270762, + "loss": 7.091, + "step": 2267 + }, + { + "epoch": 0.774061433447099, + "grad_norm": 9.161282539367676, + "learning_rate": 0.0007419795221843003, + "loss": 6.3023, + "step": 2268 + }, + { + "epoch": 0.7744027303754266, + "grad_norm": 3.5067977905273438, + "learning_rate": 0.0007418657565415244, + "loss": 7.0963, + "step": 2269 + }, + { + "epoch": 0.7747440273037542, + "grad_norm": 3.9335129261016846, + "learning_rate": 0.0007417519908987486, + "loss": 6.6442, + "step": 2270 + }, + { + "epoch": 0.7750853242320819, + "grad_norm": 3.2506704330444336, + "learning_rate": 0.0007416382252559727, + "loss": 6.592, + "step": 2271 + }, + { + "epoch": 0.7754266211604095, + "grad_norm": 3.383512258529663, + "learning_rate": 0.0007415244596131968, + "loss": 6.3122, + "step": 2272 + }, + { + "epoch": 0.7757679180887372, + "grad_norm": 4.869744300842285, + "learning_rate": 0.0007414106939704209, + "loss": 6.1108, + "step": 2273 + }, + { + "epoch": 0.7761092150170649, + "grad_norm": 9.724971771240234, + "learning_rate": 0.000741296928327645, + "loss": 6.1679, + "step": 2274 + }, + { + "epoch": 0.7764505119453925, + "grad_norm": 3.5860729217529297, + "learning_rate": 0.0007411831626848692, + "loss": 6.8647, + "step": 2275 + }, + { + "epoch": 0.7767918088737201, + "grad_norm": 3.8942580223083496, + "learning_rate": 0.0007410693970420933, + "loss": 6.8968, + "step": 2276 + }, + { + "epoch": 0.7771331058020478, + "grad_norm": 3.5359604358673096, + "learning_rate": 0.0007409556313993174, + "loss": 6.515, + "step": 2277 + }, + { + "epoch": 0.7774744027303754, + "grad_norm": 5.767436981201172, + "learning_rate": 0.0007408418657565415, + "loss": 6.4321, + "step": 2278 + }, + { + "epoch": 0.7778156996587031, + "grad_norm": 3.3587779998779297, + "learning_rate": 0.0007407281001137657, + "loss": 6.6002, + "step": 2279 + }, + { + "epoch": 0.7781569965870307, + "grad_norm": 3.4271020889282227, + "learning_rate": 0.0007406143344709898, + "loss": 6.5293, + "step": 2280 + }, + { + "epoch": 0.7784982935153584, + "grad_norm": 3.1101016998291016, + "learning_rate": 0.000740500568828214, + "loss": 6.7376, + "step": 2281 + }, + { + "epoch": 0.778839590443686, + "grad_norm": 9.123248100280762, + "learning_rate": 0.0007403868031854381, + "loss": 6.5933, + "step": 2282 + }, + { + "epoch": 0.7791808873720136, + "grad_norm": 3.5987279415130615, + "learning_rate": 0.0007402730375426622, + "loss": 5.8846, + "step": 2283 + }, + { + "epoch": 0.7795221843003413, + "grad_norm": 3.3567636013031006, + "learning_rate": 0.0007401592718998863, + "loss": 6.7901, + "step": 2284 + }, + { + "epoch": 0.7798634812286689, + "grad_norm": 3.2160773277282715, + "learning_rate": 0.0007400455062571104, + "loss": 6.4096, + "step": 2285 + }, + { + "epoch": 0.7802047781569966, + "grad_norm": 3.893007278442383, + "learning_rate": 0.0007399317406143344, + "loss": 6.9421, + "step": 2286 + }, + { + "epoch": 0.7805460750853243, + "grad_norm": 4.227537631988525, + "learning_rate": 0.0007398179749715586, + "loss": 6.5151, + "step": 2287 + }, + { + "epoch": 0.7808873720136519, + "grad_norm": 3.350599527359009, + "learning_rate": 0.0007397042093287827, + "loss": 6.3571, + "step": 2288 + }, + { + "epoch": 0.7812286689419795, + "grad_norm": 3.571381092071533, + "learning_rate": 0.0007395904436860068, + "loss": 6.446, + "step": 2289 + }, + { + "epoch": 0.7815699658703071, + "grad_norm": 3.227069139480591, + "learning_rate": 0.0007394766780432309, + "loss": 6.7134, + "step": 2290 + }, + { + "epoch": 0.7819112627986348, + "grad_norm": 5.10248327255249, + "learning_rate": 0.000739362912400455, + "loss": 6.0105, + "step": 2291 + }, + { + "epoch": 0.7822525597269625, + "grad_norm": 12.228768348693848, + "learning_rate": 0.0007392491467576792, + "loss": 6.8161, + "step": 2292 + }, + { + "epoch": 0.7825938566552901, + "grad_norm": 3.5377376079559326, + "learning_rate": 0.0007391353811149033, + "loss": 6.5361, + "step": 2293 + }, + { + "epoch": 0.7829351535836178, + "grad_norm": 3.385505437850952, + "learning_rate": 0.0007390216154721274, + "loss": 6.5231, + "step": 2294 + }, + { + "epoch": 0.7832764505119454, + "grad_norm": 3.796246290206909, + "learning_rate": 0.0007389078498293515, + "loss": 6.8123, + "step": 2295 + }, + { + "epoch": 0.783617747440273, + "grad_norm": 3.4816348552703857, + "learning_rate": 0.0007387940841865757, + "loss": 6.6924, + "step": 2296 + }, + { + "epoch": 0.7839590443686006, + "grad_norm": 3.1583478450775146, + "learning_rate": 0.0007386803185437998, + "loss": 6.6573, + "step": 2297 + }, + { + "epoch": 0.7843003412969284, + "grad_norm": 3.1136934757232666, + "learning_rate": 0.000738566552901024, + "loss": 6.679, + "step": 2298 + }, + { + "epoch": 0.784641638225256, + "grad_norm": 3.1441729068756104, + "learning_rate": 0.0007384527872582481, + "loss": 6.7502, + "step": 2299 + }, + { + "epoch": 0.7849829351535836, + "grad_norm": 3.35772442817688, + "learning_rate": 0.0007383390216154722, + "loss": 6.6187, + "step": 2300 + }, + { + "epoch": 0.7853242320819113, + "grad_norm": 3.171649694442749, + "learning_rate": 0.0007382252559726963, + "loss": 6.6412, + "step": 2301 + }, + { + "epoch": 0.7856655290102389, + "grad_norm": 3.1992034912109375, + "learning_rate": 0.0007381114903299204, + "loss": 6.5126, + "step": 2302 + }, + { + "epoch": 0.7860068259385665, + "grad_norm": 3.29266619682312, + "learning_rate": 0.0007379977246871445, + "loss": 6.8787, + "step": 2303 + }, + { + "epoch": 0.7863481228668942, + "grad_norm": 3.2478480339050293, + "learning_rate": 0.0007378839590443687, + "loss": 6.6584, + "step": 2304 + }, + { + "epoch": 0.7866894197952219, + "grad_norm": 4.051445007324219, + "learning_rate": 0.0007377701934015927, + "loss": 5.3193, + "step": 2305 + }, + { + "epoch": 0.7870307167235495, + "grad_norm": 3.5747928619384766, + "learning_rate": 0.0007376564277588168, + "loss": 6.6164, + "step": 2306 + }, + { + "epoch": 0.7873720136518771, + "grad_norm": 3.6790921688079834, + "learning_rate": 0.0007375426621160409, + "loss": 6.2069, + "step": 2307 + }, + { + "epoch": 0.7877133105802048, + "grad_norm": 3.410109519958496, + "learning_rate": 0.000737428896473265, + "loss": 6.6221, + "step": 2308 + }, + { + "epoch": 0.7880546075085324, + "grad_norm": 3.273460626602173, + "learning_rate": 0.0007373151308304891, + "loss": 6.8308, + "step": 2309 + }, + { + "epoch": 0.78839590443686, + "grad_norm": 3.7715392112731934, + "learning_rate": 0.0007372013651877133, + "loss": 6.3757, + "step": 2310 + }, + { + "epoch": 0.7887372013651878, + "grad_norm": 3.6163434982299805, + "learning_rate": 0.0007370875995449374, + "loss": 6.4616, + "step": 2311 + }, + { + "epoch": 0.7890784982935154, + "grad_norm": 3.2773282527923584, + "learning_rate": 0.0007369738339021615, + "loss": 6.6696, + "step": 2312 + }, + { + "epoch": 0.789419795221843, + "grad_norm": 3.1374948024749756, + "learning_rate": 0.0007368600682593857, + "loss": 6.7808, + "step": 2313 + }, + { + "epoch": 0.7897610921501707, + "grad_norm": 3.342297315597534, + "learning_rate": 0.0007367463026166098, + "loss": 6.3873, + "step": 2314 + }, + { + "epoch": 0.7901023890784983, + "grad_norm": 3.1699624061584473, + "learning_rate": 0.000736632536973834, + "loss": 6.458, + "step": 2315 + }, + { + "epoch": 0.7904436860068259, + "grad_norm": 3.4422099590301514, + "learning_rate": 0.0007365187713310581, + "loss": 6.8495, + "step": 2316 + }, + { + "epoch": 0.7907849829351535, + "grad_norm": 3.30281400680542, + "learning_rate": 0.0007364050056882822, + "loss": 6.2791, + "step": 2317 + }, + { + "epoch": 0.7911262798634813, + "grad_norm": 3.281485080718994, + "learning_rate": 0.0007362912400455063, + "loss": 6.733, + "step": 2318 + }, + { + "epoch": 0.7914675767918089, + "grad_norm": 3.2476160526275635, + "learning_rate": 0.0007361774744027304, + "loss": 7.0849, + "step": 2319 + }, + { + "epoch": 0.7918088737201365, + "grad_norm": 5.255532264709473, + "learning_rate": 0.0007360637087599545, + "loss": 5.6159, + "step": 2320 + }, + { + "epoch": 0.7921501706484642, + "grad_norm": 3.1768462657928467, + "learning_rate": 0.0007359499431171787, + "loss": 6.4978, + "step": 2321 + }, + { + "epoch": 0.7924914675767918, + "grad_norm": 3.386536121368408, + "learning_rate": 0.0007358361774744028, + "loss": 6.5782, + "step": 2322 + }, + { + "epoch": 0.7928327645051194, + "grad_norm": 3.406545877456665, + "learning_rate": 0.0007357224118316269, + "loss": 6.5096, + "step": 2323 + }, + { + "epoch": 0.7931740614334472, + "grad_norm": 3.1887378692626953, + "learning_rate": 0.0007356086461888509, + "loss": 6.5397, + "step": 2324 + }, + { + "epoch": 0.7935153583617748, + "grad_norm": 3.7656588554382324, + "learning_rate": 0.000735494880546075, + "loss": 6.4162, + "step": 2325 + }, + { + "epoch": 0.7938566552901024, + "grad_norm": 4.206430912017822, + "learning_rate": 0.0007353811149032991, + "loss": 6.1527, + "step": 2326 + }, + { + "epoch": 0.79419795221843, + "grad_norm": 3.5633111000061035, + "learning_rate": 0.0007352673492605233, + "loss": 6.1671, + "step": 2327 + }, + { + "epoch": 0.7945392491467577, + "grad_norm": 3.431990623474121, + "learning_rate": 0.0007351535836177474, + "loss": 6.4424, + "step": 2328 + }, + { + "epoch": 0.7948805460750853, + "grad_norm": 3.3288986682891846, + "learning_rate": 0.0007350398179749716, + "loss": 6.4327, + "step": 2329 + }, + { + "epoch": 0.7952218430034129, + "grad_norm": 5.098928928375244, + "learning_rate": 0.0007349260523321957, + "loss": 6.9967, + "step": 2330 + }, + { + "epoch": 0.7955631399317407, + "grad_norm": 10.33311939239502, + "learning_rate": 0.0007348122866894198, + "loss": 7.4892, + "step": 2331 + }, + { + "epoch": 0.7959044368600683, + "grad_norm": 3.6178691387176514, + "learning_rate": 0.000734698521046644, + "loss": 6.2764, + "step": 2332 + }, + { + "epoch": 0.7962457337883959, + "grad_norm": 3.6678335666656494, + "learning_rate": 0.0007345847554038681, + "loss": 6.4699, + "step": 2333 + }, + { + "epoch": 0.7965870307167235, + "grad_norm": 3.2506070137023926, + "learning_rate": 0.0007344709897610922, + "loss": 6.8747, + "step": 2334 + }, + { + "epoch": 0.7969283276450512, + "grad_norm": 12.323904037475586, + "learning_rate": 0.0007343572241183163, + "loss": 7.0181, + "step": 2335 + }, + { + "epoch": 0.7972696245733788, + "grad_norm": 3.266211986541748, + "learning_rate": 0.0007342434584755404, + "loss": 6.9827, + "step": 2336 + }, + { + "epoch": 0.7976109215017065, + "grad_norm": 3.527411699295044, + "learning_rate": 0.0007341296928327645, + "loss": 7.1384, + "step": 2337 + }, + { + "epoch": 0.7979522184300342, + "grad_norm": 3.395819902420044, + "learning_rate": 0.0007340159271899887, + "loss": 6.9804, + "step": 2338 + }, + { + "epoch": 0.7982935153583618, + "grad_norm": 3.189687728881836, + "learning_rate": 0.0007339021615472128, + "loss": 6.748, + "step": 2339 + }, + { + "epoch": 0.7986348122866894, + "grad_norm": 3.2057018280029297, + "learning_rate": 0.0007337883959044369, + "loss": 7.0284, + "step": 2340 + }, + { + "epoch": 0.798976109215017, + "grad_norm": 3.4170916080474854, + "learning_rate": 0.000733674630261661, + "loss": 6.6718, + "step": 2341 + }, + { + "epoch": 0.7993174061433447, + "grad_norm": 5.019681930541992, + "learning_rate": 0.0007335608646188851, + "loss": 4.0973, + "step": 2342 + }, + { + "epoch": 0.7996587030716723, + "grad_norm": 3.8981494903564453, + "learning_rate": 0.0007334470989761092, + "loss": 6.8531, + "step": 2343 + }, + { + "epoch": 0.8, + "grad_norm": 3.444851875305176, + "learning_rate": 0.0007333333333333333, + "loss": 6.7379, + "step": 2344 + }, + { + "epoch": 0.8003412969283277, + "grad_norm": 3.404766082763672, + "learning_rate": 0.0007332195676905574, + "loss": 6.8825, + "step": 2345 + }, + { + "epoch": 0.8006825938566553, + "grad_norm": 3.660458564758301, + "learning_rate": 0.0007331058020477816, + "loss": 6.5001, + "step": 2346 + }, + { + "epoch": 0.8010238907849829, + "grad_norm": 3.563992738723755, + "learning_rate": 0.0007329920364050057, + "loss": 7.2701, + "step": 2347 + }, + { + "epoch": 0.8013651877133106, + "grad_norm": 3.2425544261932373, + "learning_rate": 0.0007328782707622298, + "loss": 6.3414, + "step": 2348 + }, + { + "epoch": 0.8017064846416382, + "grad_norm": 3.2002296447753906, + "learning_rate": 0.0007327645051194539, + "loss": 6.5811, + "step": 2349 + }, + { + "epoch": 0.8020477815699659, + "grad_norm": 6.365078926086426, + "learning_rate": 0.0007326507394766781, + "loss": 5.5698, + "step": 2350 + }, + { + "epoch": 0.8023890784982936, + "grad_norm": 3.3614845275878906, + "learning_rate": 0.0007325369738339022, + "loss": 6.7194, + "step": 2351 + }, + { + "epoch": 0.8027303754266212, + "grad_norm": 3.386559247970581, + "learning_rate": 0.0007324232081911263, + "loss": 6.7674, + "step": 2352 + }, + { + "epoch": 0.8030716723549488, + "grad_norm": 3.318051338195801, + "learning_rate": 0.0007323094425483504, + "loss": 6.8814, + "step": 2353 + }, + { + "epoch": 0.8034129692832764, + "grad_norm": 3.2678332328796387, + "learning_rate": 0.0007321956769055745, + "loss": 6.7354, + "step": 2354 + }, + { + "epoch": 0.8037542662116041, + "grad_norm": 3.1607394218444824, + "learning_rate": 0.0007320819112627987, + "loss": 6.4444, + "step": 2355 + }, + { + "epoch": 0.8040955631399317, + "grad_norm": 3.887561798095703, + "learning_rate": 0.0007319681456200228, + "loss": 6.402, + "step": 2356 + }, + { + "epoch": 0.8044368600682594, + "grad_norm": 3.33502459526062, + "learning_rate": 0.0007318543799772469, + "loss": 6.8184, + "step": 2357 + }, + { + "epoch": 0.8047781569965871, + "grad_norm": 3.411867380142212, + "learning_rate": 0.000731740614334471, + "loss": 6.712, + "step": 2358 + }, + { + "epoch": 0.8051194539249147, + "grad_norm": 3.4380221366882324, + "learning_rate": 0.0007316268486916951, + "loss": 6.7389, + "step": 2359 + }, + { + "epoch": 0.8054607508532423, + "grad_norm": 4.988135814666748, + "learning_rate": 0.0007315130830489192, + "loss": 5.3108, + "step": 2360 + }, + { + "epoch": 0.8058020477815699, + "grad_norm": 3.4688353538513184, + "learning_rate": 0.0007313993174061435, + "loss": 6.6813, + "step": 2361 + }, + { + "epoch": 0.8061433447098976, + "grad_norm": 3.7119762897491455, + "learning_rate": 0.0007312855517633676, + "loss": 6.419, + "step": 2362 + }, + { + "epoch": 0.8064846416382253, + "grad_norm": 4.025672912597656, + "learning_rate": 0.0007311717861205916, + "loss": 6.0607, + "step": 2363 + }, + { + "epoch": 0.8068259385665529, + "grad_norm": 3.4323062896728516, + "learning_rate": 0.0007310580204778157, + "loss": 6.8558, + "step": 2364 + }, + { + "epoch": 0.8071672354948806, + "grad_norm": 3.3362526893615723, + "learning_rate": 0.0007309442548350398, + "loss": 6.2351, + "step": 2365 + }, + { + "epoch": 0.8075085324232082, + "grad_norm": 3.44800066947937, + "learning_rate": 0.0007308304891922639, + "loss": 6.2169, + "step": 2366 + }, + { + "epoch": 0.8078498293515358, + "grad_norm": 3.3122382164001465, + "learning_rate": 0.0007307167235494881, + "loss": 7.1405, + "step": 2367 + }, + { + "epoch": 0.8081911262798634, + "grad_norm": 3.3105790615081787, + "learning_rate": 0.0007306029579067122, + "loss": 6.7058, + "step": 2368 + }, + { + "epoch": 0.8085324232081911, + "grad_norm": 3.940232038497925, + "learning_rate": 0.0007304891922639363, + "loss": 6.7658, + "step": 2369 + }, + { + "epoch": 0.8088737201365188, + "grad_norm": 3.7123522758483887, + "learning_rate": 0.0007303754266211604, + "loss": 6.2908, + "step": 2370 + }, + { + "epoch": 0.8092150170648464, + "grad_norm": 3.907029151916504, + "learning_rate": 0.0007302616609783845, + "loss": 6.2083, + "step": 2371 + }, + { + "epoch": 0.8095563139931741, + "grad_norm": 3.455446481704712, + "learning_rate": 0.0007301478953356086, + "loss": 6.6809, + "step": 2372 + }, + { + "epoch": 0.8098976109215017, + "grad_norm": 3.3250789642333984, + "learning_rate": 0.0007300341296928328, + "loss": 6.6732, + "step": 2373 + }, + { + "epoch": 0.8102389078498293, + "grad_norm": 3.413405656814575, + "learning_rate": 0.0007299203640500569, + "loss": 6.0811, + "step": 2374 + }, + { + "epoch": 0.810580204778157, + "grad_norm": 3.436432361602783, + "learning_rate": 0.000729806598407281, + "loss": 6.8951, + "step": 2375 + }, + { + "epoch": 0.8109215017064847, + "grad_norm": 3.539832592010498, + "learning_rate": 0.0007296928327645051, + "loss": 6.9036, + "step": 2376 + }, + { + "epoch": 0.8112627986348123, + "grad_norm": 3.5589230060577393, + "learning_rate": 0.0007295790671217292, + "loss": 6.6657, + "step": 2377 + }, + { + "epoch": 0.81160409556314, + "grad_norm": 5.298614025115967, + "learning_rate": 0.0007294653014789535, + "loss": 5.9221, + "step": 2378 + }, + { + "epoch": 0.8119453924914676, + "grad_norm": 3.4280951023101807, + "learning_rate": 0.0007293515358361776, + "loss": 6.775, + "step": 2379 + }, + { + "epoch": 0.8122866894197952, + "grad_norm": 3.2987239360809326, + "learning_rate": 0.0007292377701934017, + "loss": 6.5316, + "step": 2380 + }, + { + "epoch": 0.8126279863481228, + "grad_norm": 4.67313289642334, + "learning_rate": 0.0007291240045506258, + "loss": 6.9242, + "step": 2381 + }, + { + "epoch": 0.8129692832764505, + "grad_norm": 3.3883895874023438, + "learning_rate": 0.0007290102389078499, + "loss": 6.9792, + "step": 2382 + }, + { + "epoch": 0.8133105802047782, + "grad_norm": 3.6097097396850586, + "learning_rate": 0.0007288964732650739, + "loss": 6.0067, + "step": 2383 + }, + { + "epoch": 0.8136518771331058, + "grad_norm": 3.708559989929199, + "learning_rate": 0.0007287827076222981, + "loss": 6.6121, + "step": 2384 + }, + { + "epoch": 0.8139931740614335, + "grad_norm": 3.4102509021759033, + "learning_rate": 0.0007286689419795222, + "loss": 6.7611, + "step": 2385 + }, + { + "epoch": 0.8143344709897611, + "grad_norm": 3.181281566619873, + "learning_rate": 0.0007285551763367463, + "loss": 6.5479, + "step": 2386 + }, + { + "epoch": 0.8146757679180887, + "grad_norm": 3.2941017150878906, + "learning_rate": 0.0007284414106939704, + "loss": 6.9345, + "step": 2387 + }, + { + "epoch": 0.8150170648464163, + "grad_norm": 3.4857585430145264, + "learning_rate": 0.0007283276450511945, + "loss": 6.2137, + "step": 2388 + }, + { + "epoch": 0.8153583617747441, + "grad_norm": 3.3792757987976074, + "learning_rate": 0.0007282138794084186, + "loss": 6.9875, + "step": 2389 + }, + { + "epoch": 0.8156996587030717, + "grad_norm": 3.3115673065185547, + "learning_rate": 0.0007281001137656428, + "loss": 6.9524, + "step": 2390 + }, + { + "epoch": 0.8160409556313993, + "grad_norm": 3.5123884677886963, + "learning_rate": 0.0007279863481228669, + "loss": 6.3792, + "step": 2391 + }, + { + "epoch": 0.816382252559727, + "grad_norm": 3.4690353870391846, + "learning_rate": 0.000727872582480091, + "loss": 7.0668, + "step": 2392 + }, + { + "epoch": 0.8167235494880546, + "grad_norm": 3.470229387283325, + "learning_rate": 0.0007277588168373151, + "loss": 6.5134, + "step": 2393 + }, + { + "epoch": 0.8170648464163822, + "grad_norm": 3.3032960891723633, + "learning_rate": 0.0007276450511945392, + "loss": 7.0573, + "step": 2394 + }, + { + "epoch": 0.8174061433447098, + "grad_norm": 6.7078094482421875, + "learning_rate": 0.0007275312855517635, + "loss": 6.4783, + "step": 2395 + }, + { + "epoch": 0.8177474402730376, + "grad_norm": 3.469536304473877, + "learning_rate": 0.0007274175199089876, + "loss": 6.6776, + "step": 2396 + }, + { + "epoch": 0.8180887372013652, + "grad_norm": 3.5298237800598145, + "learning_rate": 0.0007273037542662117, + "loss": 6.3748, + "step": 2397 + }, + { + "epoch": 0.8184300341296928, + "grad_norm": 3.280069589614868, + "learning_rate": 0.0007271899886234358, + "loss": 6.5264, + "step": 2398 + }, + { + "epoch": 0.8187713310580205, + "grad_norm": 3.1752591133117676, + "learning_rate": 0.0007270762229806599, + "loss": 6.656, + "step": 2399 + }, + { + "epoch": 0.8191126279863481, + "grad_norm": 3.6475753784179688, + "learning_rate": 0.000726962457337884, + "loss": 6.703, + "step": 2400 + }, + { + "epoch": 0.8194539249146757, + "grad_norm": 3.4017770290374756, + "learning_rate": 0.0007268486916951082, + "loss": 6.6566, + "step": 2401 + }, + { + "epoch": 0.8197952218430035, + "grad_norm": 3.364267349243164, + "learning_rate": 0.0007267349260523322, + "loss": 6.5412, + "step": 2402 + }, + { + "epoch": 0.8201365187713311, + "grad_norm": 3.2577364444732666, + "learning_rate": 0.0007266211604095563, + "loss": 6.4592, + "step": 2403 + }, + { + "epoch": 0.8204778156996587, + "grad_norm": 3.6079461574554443, + "learning_rate": 0.0007265073947667804, + "loss": 6.4137, + "step": 2404 + }, + { + "epoch": 0.8208191126279863, + "grad_norm": 3.259861946105957, + "learning_rate": 0.0007263936291240045, + "loss": 6.5796, + "step": 2405 + }, + { + "epoch": 0.821160409556314, + "grad_norm": 3.173610210418701, + "learning_rate": 0.0007262798634812286, + "loss": 6.9958, + "step": 2406 + }, + { + "epoch": 0.8215017064846416, + "grad_norm": 3.753018856048584, + "learning_rate": 0.0007261660978384528, + "loss": 6.7856, + "step": 2407 + }, + { + "epoch": 0.8218430034129692, + "grad_norm": 4.427280426025391, + "learning_rate": 0.0007260523321956769, + "loss": 6.5343, + "step": 2408 + }, + { + "epoch": 0.822184300341297, + "grad_norm": 3.921279191970825, + "learning_rate": 0.000725938566552901, + "loss": 6.1591, + "step": 2409 + }, + { + "epoch": 0.8225255972696246, + "grad_norm": 3.6298179626464844, + "learning_rate": 0.0007258248009101251, + "loss": 6.6855, + "step": 2410 + }, + { + "epoch": 0.8228668941979522, + "grad_norm": 4.846485137939453, + "learning_rate": 0.0007257110352673492, + "loss": 5.8797, + "step": 2411 + }, + { + "epoch": 0.8232081911262799, + "grad_norm": 3.467717409133911, + "learning_rate": 0.0007255972696245733, + "loss": 6.3911, + "step": 2412 + }, + { + "epoch": 0.8235494880546075, + "grad_norm": 3.697725534439087, + "learning_rate": 0.0007254835039817976, + "loss": 6.4286, + "step": 2413 + }, + { + "epoch": 0.8238907849829351, + "grad_norm": 3.694342851638794, + "learning_rate": 0.0007253697383390217, + "loss": 6.1186, + "step": 2414 + }, + { + "epoch": 0.8242320819112628, + "grad_norm": 3.3936257362365723, + "learning_rate": 0.0007252559726962458, + "loss": 6.7766, + "step": 2415 + }, + { + "epoch": 0.8245733788395905, + "grad_norm": 3.418151617050171, + "learning_rate": 0.0007251422070534699, + "loss": 6.3386, + "step": 2416 + }, + { + "epoch": 0.8249146757679181, + "grad_norm": 3.6463325023651123, + "learning_rate": 0.000725028441410694, + "loss": 6.1039, + "step": 2417 + }, + { + "epoch": 0.8252559726962457, + "grad_norm": 3.3325319290161133, + "learning_rate": 0.0007249146757679182, + "loss": 6.4488, + "step": 2418 + }, + { + "epoch": 0.8255972696245734, + "grad_norm": 3.4702980518341064, + "learning_rate": 0.0007248009101251423, + "loss": 5.9765, + "step": 2419 + }, + { + "epoch": 0.825938566552901, + "grad_norm": 3.6678733825683594, + "learning_rate": 0.0007246871444823664, + "loss": 6.4048, + "step": 2420 + }, + { + "epoch": 0.8262798634812286, + "grad_norm": 3.4993784427642822, + "learning_rate": 0.0007245733788395905, + "loss": 7.3037, + "step": 2421 + }, + { + "epoch": 0.8266211604095564, + "grad_norm": 3.750591516494751, + "learning_rate": 0.0007244596131968145, + "loss": 6.4685, + "step": 2422 + }, + { + "epoch": 0.826962457337884, + "grad_norm": 3.406001567840576, + "learning_rate": 0.0007243458475540386, + "loss": 6.5744, + "step": 2423 + }, + { + "epoch": 0.8273037542662116, + "grad_norm": 3.754840135574341, + "learning_rate": 0.0007242320819112628, + "loss": 6.0161, + "step": 2424 + }, + { + "epoch": 0.8276450511945392, + "grad_norm": 4.734508514404297, + "learning_rate": 0.0007241183162684869, + "loss": 6.0864, + "step": 2425 + }, + { + "epoch": 0.8279863481228669, + "grad_norm": 3.3035054206848145, + "learning_rate": 0.000724004550625711, + "loss": 6.7669, + "step": 2426 + }, + { + "epoch": 0.8283276450511945, + "grad_norm": 3.275688648223877, + "learning_rate": 0.0007238907849829351, + "loss": 6.5271, + "step": 2427 + }, + { + "epoch": 0.8286689419795222, + "grad_norm": 3.2510900497436523, + "learning_rate": 0.0007237770193401592, + "loss": 6.3089, + "step": 2428 + }, + { + "epoch": 0.8290102389078499, + "grad_norm": 4.378868579864502, + "learning_rate": 0.0007236632536973833, + "loss": 5.3725, + "step": 2429 + }, + { + "epoch": 0.8293515358361775, + "grad_norm": 3.4092857837677, + "learning_rate": 0.0007235494880546076, + "loss": 6.4763, + "step": 2430 + }, + { + "epoch": 0.8296928327645051, + "grad_norm": 3.2129056453704834, + "learning_rate": 0.0007234357224118317, + "loss": 6.4405, + "step": 2431 + }, + { + "epoch": 0.8300341296928327, + "grad_norm": 3.3604624271392822, + "learning_rate": 0.0007233219567690558, + "loss": 6.5199, + "step": 2432 + }, + { + "epoch": 0.8303754266211604, + "grad_norm": 3.6818511486053467, + "learning_rate": 0.0007232081911262799, + "loss": 6.1862, + "step": 2433 + }, + { + "epoch": 0.830716723549488, + "grad_norm": 3.174171209335327, + "learning_rate": 0.000723094425483504, + "loss": 6.5523, + "step": 2434 + }, + { + "epoch": 0.8310580204778157, + "grad_norm": 3.3606784343719482, + "learning_rate": 0.0007229806598407282, + "loss": 6.6429, + "step": 2435 + }, + { + "epoch": 0.8313993174061434, + "grad_norm": 3.436796188354492, + "learning_rate": 0.0007228668941979523, + "loss": 6.749, + "step": 2436 + }, + { + "epoch": 0.831740614334471, + "grad_norm": 3.20074462890625, + "learning_rate": 0.0007227531285551764, + "loss": 6.6947, + "step": 2437 + }, + { + "epoch": 0.8320819112627986, + "grad_norm": 3.3540759086608887, + "learning_rate": 0.0007226393629124005, + "loss": 6.7936, + "step": 2438 + }, + { + "epoch": 0.8324232081911263, + "grad_norm": 12.003594398498535, + "learning_rate": 0.0007225255972696246, + "loss": 6.054, + "step": 2439 + }, + { + "epoch": 0.8327645051194539, + "grad_norm": 3.418896198272705, + "learning_rate": 0.0007224118316268487, + "loss": 6.8313, + "step": 2440 + }, + { + "epoch": 0.8331058020477816, + "grad_norm": 3.5334503650665283, + "learning_rate": 0.0007222980659840728, + "loss": 6.7967, + "step": 2441 + }, + { + "epoch": 0.8334470989761092, + "grad_norm": 3.854482412338257, + "learning_rate": 0.0007221843003412969, + "loss": 5.6593, + "step": 2442 + }, + { + "epoch": 0.8337883959044369, + "grad_norm": 3.3690783977508545, + "learning_rate": 0.000722070534698521, + "loss": 6.9381, + "step": 2443 + }, + { + "epoch": 0.8341296928327645, + "grad_norm": 4.998669624328613, + "learning_rate": 0.0007219567690557451, + "loss": 5.7073, + "step": 2444 + }, + { + "epoch": 0.8344709897610921, + "grad_norm": 3.3754959106445312, + "learning_rate": 0.0007218430034129692, + "loss": 6.9455, + "step": 2445 + }, + { + "epoch": 0.8348122866894198, + "grad_norm": 3.3395328521728516, + "learning_rate": 0.0007217292377701933, + "loss": 6.6405, + "step": 2446 + }, + { + "epoch": 0.8351535836177474, + "grad_norm": 3.234178066253662, + "learning_rate": 0.0007216154721274176, + "loss": 7.0086, + "step": 2447 + }, + { + "epoch": 0.8354948805460751, + "grad_norm": 3.2380173206329346, + "learning_rate": 0.0007215017064846417, + "loss": 6.4784, + "step": 2448 + }, + { + "epoch": 0.8358361774744028, + "grad_norm": 3.333740472793579, + "learning_rate": 0.0007213879408418658, + "loss": 6.5965, + "step": 2449 + }, + { + "epoch": 0.8361774744027304, + "grad_norm": 3.263247489929199, + "learning_rate": 0.0007212741751990899, + "loss": 6.589, + "step": 2450 + }, + { + "epoch": 0.836518771331058, + "grad_norm": 3.6883716583251953, + "learning_rate": 0.000721160409556314, + "loss": 6.2031, + "step": 2451 + }, + { + "epoch": 0.8368600682593856, + "grad_norm": 3.3364152908325195, + "learning_rate": 0.0007210466439135381, + "loss": 6.6068, + "step": 2452 + }, + { + "epoch": 0.8372013651877133, + "grad_norm": 3.7093443870544434, + "learning_rate": 0.0007209328782707623, + "loss": 6.4188, + "step": 2453 + }, + { + "epoch": 0.837542662116041, + "grad_norm": 3.4039225578308105, + "learning_rate": 0.0007208191126279864, + "loss": 6.5616, + "step": 2454 + }, + { + "epoch": 0.8378839590443686, + "grad_norm": 3.357640027999878, + "learning_rate": 0.0007207053469852105, + "loss": 6.5854, + "step": 2455 + }, + { + "epoch": 0.8382252559726963, + "grad_norm": 4.375115871429443, + "learning_rate": 0.0007205915813424346, + "loss": 5.801, + "step": 2456 + }, + { + "epoch": 0.8385665529010239, + "grad_norm": 3.4280662536621094, + "learning_rate": 0.0007204778156996587, + "loss": 6.8118, + "step": 2457 + }, + { + "epoch": 0.8389078498293515, + "grad_norm": 3.6396632194519043, + "learning_rate": 0.0007203640500568829, + "loss": 6.6882, + "step": 2458 + }, + { + "epoch": 0.8392491467576791, + "grad_norm": 3.238856792449951, + "learning_rate": 0.000720250284414107, + "loss": 6.3282, + "step": 2459 + }, + { + "epoch": 0.8395904436860068, + "grad_norm": 3.151153087615967, + "learning_rate": 0.0007201365187713311, + "loss": 6.816, + "step": 2460 + }, + { + "epoch": 0.8399317406143345, + "grad_norm": 4.557154178619385, + "learning_rate": 0.0007200227531285551, + "loss": 6.6596, + "step": 2461 + }, + { + "epoch": 0.8402730375426621, + "grad_norm": 3.225273609161377, + "learning_rate": 0.0007199089874857792, + "loss": 6.5537, + "step": 2462 + }, + { + "epoch": 0.8406143344709898, + "grad_norm": 3.4817261695861816, + "learning_rate": 0.0007197952218430033, + "loss": 6.567, + "step": 2463 + }, + { + "epoch": 0.8409556313993174, + "grad_norm": 3.3386945724487305, + "learning_rate": 0.0007196814562002276, + "loss": 7.1524, + "step": 2464 + }, + { + "epoch": 0.841296928327645, + "grad_norm": 3.4559755325317383, + "learning_rate": 0.0007195676905574517, + "loss": 6.4858, + "step": 2465 + }, + { + "epoch": 0.8416382252559726, + "grad_norm": 3.230203866958618, + "learning_rate": 0.0007194539249146758, + "loss": 6.6456, + "step": 2466 + }, + { + "epoch": 0.8419795221843004, + "grad_norm": 3.321943998336792, + "learning_rate": 0.0007193401592718999, + "loss": 6.6494, + "step": 2467 + }, + { + "epoch": 0.842320819112628, + "grad_norm": 3.304291248321533, + "learning_rate": 0.000719226393629124, + "loss": 6.7956, + "step": 2468 + }, + { + "epoch": 0.8426621160409556, + "grad_norm": 3.3571488857269287, + "learning_rate": 0.0007191126279863481, + "loss": 6.8112, + "step": 2469 + }, + { + "epoch": 0.8430034129692833, + "grad_norm": 3.388721466064453, + "learning_rate": 0.0007189988623435723, + "loss": 6.8564, + "step": 2470 + }, + { + "epoch": 0.8433447098976109, + "grad_norm": 3.430446147918701, + "learning_rate": 0.0007188850967007964, + "loss": 6.7175, + "step": 2471 + }, + { + "epoch": 0.8436860068259385, + "grad_norm": 3.3103549480438232, + "learning_rate": 0.0007187713310580205, + "loss": 6.5384, + "step": 2472 + }, + { + "epoch": 0.8440273037542663, + "grad_norm": 7.369909763336182, + "learning_rate": 0.0007186575654152446, + "loss": 5.398, + "step": 2473 + }, + { + "epoch": 0.8443686006825939, + "grad_norm": 3.683732748031616, + "learning_rate": 0.0007185437997724687, + "loss": 6.4451, + "step": 2474 + }, + { + "epoch": 0.8447098976109215, + "grad_norm": 4.542445659637451, + "learning_rate": 0.0007184300341296928, + "loss": 6.101, + "step": 2475 + }, + { + "epoch": 0.8450511945392492, + "grad_norm": 4.248722553253174, + "learning_rate": 0.000718316268486917, + "loss": 6.0299, + "step": 2476 + }, + { + "epoch": 0.8453924914675768, + "grad_norm": 3.940197467803955, + "learning_rate": 0.0007182025028441411, + "loss": 5.5218, + "step": 2477 + }, + { + "epoch": 0.8457337883959044, + "grad_norm": 3.645324945449829, + "learning_rate": 0.0007180887372013652, + "loss": 6.9645, + "step": 2478 + }, + { + "epoch": 0.846075085324232, + "grad_norm": 5.713097095489502, + "learning_rate": 0.0007179749715585894, + "loss": 5.7437, + "step": 2479 + }, + { + "epoch": 0.8464163822525598, + "grad_norm": 3.8885574340820312, + "learning_rate": 0.0007178612059158133, + "loss": 6.7958, + "step": 2480 + }, + { + "epoch": 0.8467576791808874, + "grad_norm": 3.430229902267456, + "learning_rate": 0.0007177474402730376, + "loss": 6.7616, + "step": 2481 + }, + { + "epoch": 0.847098976109215, + "grad_norm": 3.286057472229004, + "learning_rate": 0.0007176336746302617, + "loss": 6.6936, + "step": 2482 + }, + { + "epoch": 0.8474402730375427, + "grad_norm": 6.800687313079834, + "learning_rate": 0.0007175199089874858, + "loss": 5.4928, + "step": 2483 + }, + { + "epoch": 0.8477815699658703, + "grad_norm": 6.051274299621582, + "learning_rate": 0.0007174061433447099, + "loss": 6.3143, + "step": 2484 + }, + { + "epoch": 0.8481228668941979, + "grad_norm": 3.3994340896606445, + "learning_rate": 0.000717292377701934, + "loss": 6.5889, + "step": 2485 + }, + { + "epoch": 0.8484641638225257, + "grad_norm": 3.665647029876709, + "learning_rate": 0.0007171786120591581, + "loss": 5.8222, + "step": 2486 + }, + { + "epoch": 0.8488054607508533, + "grad_norm": 3.3410794734954834, + "learning_rate": 0.0007170648464163823, + "loss": 6.8781, + "step": 2487 + }, + { + "epoch": 0.8491467576791809, + "grad_norm": 3.42598819732666, + "learning_rate": 0.0007169510807736064, + "loss": 6.8923, + "step": 2488 + }, + { + "epoch": 0.8494880546075085, + "grad_norm": 7.492861270904541, + "learning_rate": 0.0007168373151308305, + "loss": 7.0957, + "step": 2489 + }, + { + "epoch": 0.8498293515358362, + "grad_norm": 3.137038469314575, + "learning_rate": 0.0007167235494880546, + "loss": 6.6471, + "step": 2490 + }, + { + "epoch": 0.8501706484641638, + "grad_norm": 3.3882358074188232, + "learning_rate": 0.0007166097838452787, + "loss": 6.782, + "step": 2491 + }, + { + "epoch": 0.8505119453924914, + "grad_norm": 3.2121896743774414, + "learning_rate": 0.0007164960182025028, + "loss": 7.047, + "step": 2492 + }, + { + "epoch": 0.8508532423208192, + "grad_norm": 3.2568767070770264, + "learning_rate": 0.000716382252559727, + "loss": 6.6304, + "step": 2493 + }, + { + "epoch": 0.8511945392491468, + "grad_norm": 3.2692127227783203, + "learning_rate": 0.0007162684869169511, + "loss": 6.4022, + "step": 2494 + }, + { + "epoch": 0.8515358361774744, + "grad_norm": 3.5089681148529053, + "learning_rate": 0.0007161547212741752, + "loss": 6.5384, + "step": 2495 + }, + { + "epoch": 0.851877133105802, + "grad_norm": 3.226477861404419, + "learning_rate": 0.0007160409556313994, + "loss": 6.6573, + "step": 2496 + }, + { + "epoch": 0.8522184300341297, + "grad_norm": 3.2870302200317383, + "learning_rate": 0.0007159271899886235, + "loss": 6.9091, + "step": 2497 + }, + { + "epoch": 0.8525597269624573, + "grad_norm": 3.368769884109497, + "learning_rate": 0.0007158134243458477, + "loss": 6.54, + "step": 2498 + }, + { + "epoch": 0.852901023890785, + "grad_norm": 7.26986837387085, + "learning_rate": 0.0007156996587030717, + "loss": 5.2813, + "step": 2499 + }, + { + "epoch": 0.8532423208191127, + "grad_norm": 3.4235429763793945, + "learning_rate": 0.0007155858930602958, + "loss": 6.5904, + "step": 2500 + }, + { + "epoch": 0.8535836177474403, + "grad_norm": 3.4683103561401367, + "learning_rate": 0.0007154721274175199, + "loss": 6.8818, + "step": 2501 + }, + { + "epoch": 0.8539249146757679, + "grad_norm": 3.259256362915039, + "learning_rate": 0.000715358361774744, + "loss": 6.5643, + "step": 2502 + }, + { + "epoch": 0.8542662116040955, + "grad_norm": 3.19038724899292, + "learning_rate": 0.0007152445961319681, + "loss": 6.4574, + "step": 2503 + }, + { + "epoch": 0.8546075085324232, + "grad_norm": 4.036156177520752, + "learning_rate": 0.0007151308304891923, + "loss": 6.0228, + "step": 2504 + }, + { + "epoch": 0.8549488054607508, + "grad_norm": 3.6171653270721436, + "learning_rate": 0.0007150170648464164, + "loss": 6.4731, + "step": 2505 + }, + { + "epoch": 0.8552901023890785, + "grad_norm": 3.450333595275879, + "learning_rate": 0.0007149032992036405, + "loss": 6.3563, + "step": 2506 + }, + { + "epoch": 0.8556313993174062, + "grad_norm": 3.4450273513793945, + "learning_rate": 0.0007147895335608646, + "loss": 6.9957, + "step": 2507 + }, + { + "epoch": 0.8559726962457338, + "grad_norm": 3.228724241256714, + "learning_rate": 0.0007146757679180887, + "loss": 6.4982, + "step": 2508 + }, + { + "epoch": 0.8563139931740614, + "grad_norm": 3.283799409866333, + "learning_rate": 0.0007145620022753128, + "loss": 6.301, + "step": 2509 + }, + { + "epoch": 0.856655290102389, + "grad_norm": 3.258451223373413, + "learning_rate": 0.000714448236632537, + "loss": 6.2914, + "step": 2510 + }, + { + "epoch": 0.8569965870307167, + "grad_norm": 5.3775153160095215, + "learning_rate": 0.0007143344709897611, + "loss": 6.2267, + "step": 2511 + }, + { + "epoch": 0.8573378839590444, + "grad_norm": 3.5916664600372314, + "learning_rate": 0.0007142207053469852, + "loss": 6.7227, + "step": 2512 + }, + { + "epoch": 0.857679180887372, + "grad_norm": 5.10945987701416, + "learning_rate": 0.0007141069397042094, + "loss": 6.2029, + "step": 2513 + }, + { + "epoch": 0.8580204778156997, + "grad_norm": 3.6190226078033447, + "learning_rate": 0.0007139931740614335, + "loss": 6.0994, + "step": 2514 + }, + { + "epoch": 0.8583617747440273, + "grad_norm": 3.3382725715637207, + "learning_rate": 0.0007138794084186576, + "loss": 6.891, + "step": 2515 + }, + { + "epoch": 0.8587030716723549, + "grad_norm": 3.399120569229126, + "learning_rate": 0.0007137656427758818, + "loss": 6.8522, + "step": 2516 + }, + { + "epoch": 0.8590443686006826, + "grad_norm": 3.6390433311462402, + "learning_rate": 0.0007136518771331059, + "loss": 6.1696, + "step": 2517 + }, + { + "epoch": 0.8593856655290102, + "grad_norm": 9.467489242553711, + "learning_rate": 0.00071353811149033, + "loss": 6.8595, + "step": 2518 + }, + { + "epoch": 0.8597269624573379, + "grad_norm": 3.805907726287842, + "learning_rate": 0.000713424345847554, + "loss": 6.7477, + "step": 2519 + }, + { + "epoch": 0.8600682593856656, + "grad_norm": 3.396374464035034, + "learning_rate": 0.0007133105802047781, + "loss": 6.3055, + "step": 2520 + }, + { + "epoch": 0.8604095563139932, + "grad_norm": 3.43021821975708, + "learning_rate": 0.0007131968145620023, + "loss": 6.5407, + "step": 2521 + }, + { + "epoch": 0.8607508532423208, + "grad_norm": 6.201378345489502, + "learning_rate": 0.0007130830489192264, + "loss": 5.8928, + "step": 2522 + }, + { + "epoch": 0.8610921501706484, + "grad_norm": 3.4402365684509277, + "learning_rate": 0.0007129692832764505, + "loss": 6.5171, + "step": 2523 + }, + { + "epoch": 0.8614334470989761, + "grad_norm": 3.8701541423797607, + "learning_rate": 0.0007128555176336746, + "loss": 6.4958, + "step": 2524 + }, + { + "epoch": 0.8617747440273038, + "grad_norm": 3.2781641483306885, + "learning_rate": 0.0007127417519908987, + "loss": 6.8188, + "step": 2525 + }, + { + "epoch": 0.8621160409556314, + "grad_norm": 3.67041277885437, + "learning_rate": 0.0007126279863481228, + "loss": 5.8762, + "step": 2526 + }, + { + "epoch": 0.8624573378839591, + "grad_norm": 3.6243369579315186, + "learning_rate": 0.000712514220705347, + "loss": 6.3403, + "step": 2527 + }, + { + "epoch": 0.8627986348122867, + "grad_norm": 3.2596120834350586, + "learning_rate": 0.0007124004550625711, + "loss": 6.812, + "step": 2528 + }, + { + "epoch": 0.8631399317406143, + "grad_norm": 4.878437519073486, + "learning_rate": 0.0007122866894197952, + "loss": 6.3827, + "step": 2529 + }, + { + "epoch": 0.863481228668942, + "grad_norm": 2.523153305053711, + "learning_rate": 0.0007121729237770194, + "loss": 3.6518, + "step": 2530 + }, + { + "epoch": 0.8638225255972696, + "grad_norm": 3.9644103050231934, + "learning_rate": 0.0007120591581342435, + "loss": 5.9803, + "step": 2531 + }, + { + "epoch": 0.8641638225255973, + "grad_norm": 3.464620590209961, + "learning_rate": 0.0007119453924914676, + "loss": 6.6308, + "step": 2532 + }, + { + "epoch": 0.8645051194539249, + "grad_norm": 3.587636709213257, + "learning_rate": 0.0007118316268486918, + "loss": 7.2469, + "step": 2533 + }, + { + "epoch": 0.8648464163822526, + "grad_norm": 3.188854217529297, + "learning_rate": 0.0007117178612059159, + "loss": 6.3745, + "step": 2534 + }, + { + "epoch": 0.8651877133105802, + "grad_norm": 3.3936569690704346, + "learning_rate": 0.00071160409556314, + "loss": 7.0373, + "step": 2535 + }, + { + "epoch": 0.8655290102389078, + "grad_norm": 3.3409032821655273, + "learning_rate": 0.0007114903299203641, + "loss": 6.6337, + "step": 2536 + }, + { + "epoch": 0.8658703071672355, + "grad_norm": 5.52333402633667, + "learning_rate": 0.0007113765642775882, + "loss": 4.6133, + "step": 2537 + }, + { + "epoch": 0.8662116040955632, + "grad_norm": 3.3822033405303955, + "learning_rate": 0.0007112627986348122, + "loss": 6.1598, + "step": 2538 + }, + { + "epoch": 0.8665529010238908, + "grad_norm": 3.8111062049865723, + "learning_rate": 0.0007111490329920364, + "loss": 6.0937, + "step": 2539 + }, + { + "epoch": 0.8668941979522184, + "grad_norm": 4.7204909324646, + "learning_rate": 0.0007110352673492605, + "loss": 5.8796, + "step": 2540 + }, + { + "epoch": 0.8672354948805461, + "grad_norm": 3.3463664054870605, + "learning_rate": 0.0007109215017064846, + "loss": 6.7194, + "step": 2541 + }, + { + "epoch": 0.8675767918088737, + "grad_norm": 3.3839924335479736, + "learning_rate": 0.0007108077360637087, + "loss": 6.8464, + "step": 2542 + }, + { + "epoch": 0.8679180887372013, + "grad_norm": 4.8690571784973145, + "learning_rate": 0.0007106939704209328, + "loss": 5.8016, + "step": 2543 + }, + { + "epoch": 0.868259385665529, + "grad_norm": 3.3318896293640137, + "learning_rate": 0.000710580204778157, + "loss": 6.4526, + "step": 2544 + }, + { + "epoch": 0.8686006825938567, + "grad_norm": 3.221559762954712, + "learning_rate": 0.0007104664391353811, + "loss": 6.6463, + "step": 2545 + }, + { + "epoch": 0.8689419795221843, + "grad_norm": 3.1767921447753906, + "learning_rate": 0.0007103526734926053, + "loss": 6.4654, + "step": 2546 + }, + { + "epoch": 0.869283276450512, + "grad_norm": 3.4493680000305176, + "learning_rate": 0.0007102389078498294, + "loss": 5.9502, + "step": 2547 + }, + { + "epoch": 0.8696245733788396, + "grad_norm": 3.9862265586853027, + "learning_rate": 0.0007101251422070535, + "loss": 6.3229, + "step": 2548 + }, + { + "epoch": 0.8699658703071672, + "grad_norm": 3.4914379119873047, + "learning_rate": 0.0007100113765642776, + "loss": 6.9688, + "step": 2549 + }, + { + "epoch": 0.8703071672354948, + "grad_norm": 3.332296371459961, + "learning_rate": 0.0007098976109215018, + "loss": 6.8452, + "step": 2550 + }, + { + "epoch": 0.8706484641638226, + "grad_norm": 4.199538230895996, + "learning_rate": 0.0007097838452787259, + "loss": 6.881, + "step": 2551 + }, + { + "epoch": 0.8709897610921502, + "grad_norm": 3.5314183235168457, + "learning_rate": 0.00070967007963595, + "loss": 5.8921, + "step": 2552 + }, + { + "epoch": 0.8713310580204778, + "grad_norm": 3.459620237350464, + "learning_rate": 0.0007095563139931741, + "loss": 6.258, + "step": 2553 + }, + { + "epoch": 0.8716723549488055, + "grad_norm": 3.3095757961273193, + "learning_rate": 0.0007094425483503982, + "loss": 6.7562, + "step": 2554 + }, + { + "epoch": 0.8720136518771331, + "grad_norm": 3.2606141567230225, + "learning_rate": 0.0007093287827076223, + "loss": 6.7756, + "step": 2555 + }, + { + "epoch": 0.8723549488054607, + "grad_norm": 3.6110377311706543, + "learning_rate": 0.0007092150170648465, + "loss": 6.3181, + "step": 2556 + }, + { + "epoch": 0.8726962457337883, + "grad_norm": 3.273369550704956, + "learning_rate": 0.0007091012514220706, + "loss": 6.9579, + "step": 2557 + }, + { + "epoch": 0.8730375426621161, + "grad_norm": 5.260756492614746, + "learning_rate": 0.0007089874857792946, + "loss": 5.2098, + "step": 2558 + }, + { + "epoch": 0.8733788395904437, + "grad_norm": 3.3652751445770264, + "learning_rate": 0.0007088737201365187, + "loss": 6.8058, + "step": 2559 + }, + { + "epoch": 0.8737201365187713, + "grad_norm": 3.266148805618286, + "learning_rate": 0.0007087599544937428, + "loss": 7.0574, + "step": 2560 + }, + { + "epoch": 0.874061433447099, + "grad_norm": 3.58514666557312, + "learning_rate": 0.000708646188850967, + "loss": 6.6256, + "step": 2561 + }, + { + "epoch": 0.8744027303754266, + "grad_norm": 3.195068597793579, + "learning_rate": 0.0007085324232081911, + "loss": 6.6367, + "step": 2562 + }, + { + "epoch": 0.8747440273037542, + "grad_norm": 4.282649040222168, + "learning_rate": 0.0007084186575654153, + "loss": 4.9499, + "step": 2563 + }, + { + "epoch": 0.875085324232082, + "grad_norm": 3.2996411323547363, + "learning_rate": 0.0007083048919226394, + "loss": 6.4127, + "step": 2564 + }, + { + "epoch": 0.8754266211604096, + "grad_norm": 3.3839046955108643, + "learning_rate": 0.0007081911262798635, + "loss": 6.6752, + "step": 2565 + }, + { + "epoch": 0.8757679180887372, + "grad_norm": 3.211970806121826, + "learning_rate": 0.0007080773606370876, + "loss": 6.1393, + "step": 2566 + }, + { + "epoch": 0.8761092150170648, + "grad_norm": 3.1798558235168457, + "learning_rate": 0.0007079635949943118, + "loss": 6.6951, + "step": 2567 + }, + { + "epoch": 0.8764505119453925, + "grad_norm": 3.2988522052764893, + "learning_rate": 0.0007078498293515359, + "loss": 6.2898, + "step": 2568 + }, + { + "epoch": 0.8767918088737201, + "grad_norm": 3.8183929920196533, + "learning_rate": 0.00070773606370876, + "loss": 6.2697, + "step": 2569 + }, + { + "epoch": 0.8771331058020477, + "grad_norm": 3.168560266494751, + "learning_rate": 0.0007076222980659841, + "loss": 6.2589, + "step": 2570 + }, + { + "epoch": 0.8774744027303755, + "grad_norm": 3.605886459350586, + "learning_rate": 0.0007075085324232082, + "loss": 5.4431, + "step": 2571 + }, + { + "epoch": 0.8778156996587031, + "grad_norm": 3.5225412845611572, + "learning_rate": 0.0007073947667804323, + "loss": 6.9552, + "step": 2572 + }, + { + "epoch": 0.8781569965870307, + "grad_norm": 3.3253746032714844, + "learning_rate": 0.0007072810011376565, + "loss": 6.5634, + "step": 2573 + }, + { + "epoch": 0.8784982935153584, + "grad_norm": 3.3605477809906006, + "learning_rate": 0.0007071672354948806, + "loss": 6.4666, + "step": 2574 + }, + { + "epoch": 0.878839590443686, + "grad_norm": 3.1568098068237305, + "learning_rate": 0.0007070534698521047, + "loss": 6.6565, + "step": 2575 + }, + { + "epoch": 0.8791808873720136, + "grad_norm": 3.7349367141723633, + "learning_rate": 0.0007069397042093288, + "loss": 6.1776, + "step": 2576 + }, + { + "epoch": 0.8795221843003413, + "grad_norm": 3.3942160606384277, + "learning_rate": 0.0007068259385665528, + "loss": 6.32, + "step": 2577 + }, + { + "epoch": 0.879863481228669, + "grad_norm": 3.2961456775665283, + "learning_rate": 0.0007067121729237769, + "loss": 6.7801, + "step": 2578 + }, + { + "epoch": 0.8802047781569966, + "grad_norm": 3.3948254585266113, + "learning_rate": 0.0007065984072810011, + "loss": 6.0769, + "step": 2579 + }, + { + "epoch": 0.8805460750853242, + "grad_norm": 3.3471062183380127, + "learning_rate": 0.0007064846416382253, + "loss": 5.8324, + "step": 2580 + }, + { + "epoch": 0.8808873720136519, + "grad_norm": 3.5792579650878906, + "learning_rate": 0.0007063708759954494, + "loss": 6.313, + "step": 2581 + }, + { + "epoch": 0.8812286689419795, + "grad_norm": 3.430663824081421, + "learning_rate": 0.0007062571103526735, + "loss": 6.2877, + "step": 2582 + }, + { + "epoch": 0.8815699658703071, + "grad_norm": 5.736737251281738, + "learning_rate": 0.0007061433447098976, + "loss": 6.0753, + "step": 2583 + }, + { + "epoch": 0.8819112627986349, + "grad_norm": 3.3799870014190674, + "learning_rate": 0.0007060295790671218, + "loss": 6.7596, + "step": 2584 + }, + { + "epoch": 0.8822525597269625, + "grad_norm": 6.616389751434326, + "learning_rate": 0.0007059158134243459, + "loss": 5.9919, + "step": 2585 + }, + { + "epoch": 0.8825938566552901, + "grad_norm": 3.429597854614258, + "learning_rate": 0.00070580204778157, + "loss": 6.3736, + "step": 2586 + }, + { + "epoch": 0.8829351535836177, + "grad_norm": 3.608384847640991, + "learning_rate": 0.0007056882821387941, + "loss": 6.3896, + "step": 2587 + }, + { + "epoch": 0.8832764505119454, + "grad_norm": 3.358494281768799, + "learning_rate": 0.0007055745164960182, + "loss": 7.1418, + "step": 2588 + }, + { + "epoch": 0.883617747440273, + "grad_norm": 3.164523124694824, + "learning_rate": 0.0007054607508532423, + "loss": 6.8558, + "step": 2589 + }, + { + "epoch": 0.8839590443686007, + "grad_norm": 3.5418014526367188, + "learning_rate": 0.0007053469852104665, + "loss": 6.2485, + "step": 2590 + }, + { + "epoch": 0.8843003412969284, + "grad_norm": 3.181857109069824, + "learning_rate": 0.0007052332195676906, + "loss": 6.9945, + "step": 2591 + }, + { + "epoch": 0.884641638225256, + "grad_norm": 3.445446014404297, + "learning_rate": 0.0007051194539249147, + "loss": 6.6083, + "step": 2592 + }, + { + "epoch": 0.8849829351535836, + "grad_norm": 3.159823417663574, + "learning_rate": 0.0007050056882821388, + "loss": 6.6446, + "step": 2593 + }, + { + "epoch": 0.8853242320819112, + "grad_norm": 5.284470081329346, + "learning_rate": 0.0007048919226393629, + "loss": 5.6566, + "step": 2594 + }, + { + "epoch": 0.8856655290102389, + "grad_norm": 5.081126689910889, + "learning_rate": 0.000704778156996587, + "loss": 5.952, + "step": 2595 + }, + { + "epoch": 0.8860068259385665, + "grad_norm": 3.5319552421569824, + "learning_rate": 0.0007046643913538113, + "loss": 6.2608, + "step": 2596 + }, + { + "epoch": 0.8863481228668942, + "grad_norm": 3.5452115535736084, + "learning_rate": 0.0007045506257110353, + "loss": 7.0805, + "step": 2597 + }, + { + "epoch": 0.8866894197952219, + "grad_norm": 3.5976240634918213, + "learning_rate": 0.0007044368600682594, + "loss": 6.3286, + "step": 2598 + }, + { + "epoch": 0.8870307167235495, + "grad_norm": 3.242631435394287, + "learning_rate": 0.0007043230944254835, + "loss": 6.8221, + "step": 2599 + }, + { + "epoch": 0.8873720136518771, + "grad_norm": 3.2306206226348877, + "learning_rate": 0.0007042093287827076, + "loss": 6.2416, + "step": 2600 + }, + { + "epoch": 0.8877133105802048, + "grad_norm": 3.222269296646118, + "learning_rate": 0.0007040955631399318, + "loss": 6.9457, + "step": 2601 + }, + { + "epoch": 0.8880546075085324, + "grad_norm": 3.364189863204956, + "learning_rate": 0.0007039817974971559, + "loss": 6.3123, + "step": 2602 + }, + { + "epoch": 0.8883959044368601, + "grad_norm": 7.864152908325195, + "learning_rate": 0.00070386803185438, + "loss": 5.3723, + "step": 2603 + }, + { + "epoch": 0.8887372013651877, + "grad_norm": 3.3091676235198975, + "learning_rate": 0.0007037542662116041, + "loss": 6.5701, + "step": 2604 + }, + { + "epoch": 0.8890784982935154, + "grad_norm": 3.4476070404052734, + "learning_rate": 0.0007036405005688282, + "loss": 6.4836, + "step": 2605 + }, + { + "epoch": 0.889419795221843, + "grad_norm": 3.41062593460083, + "learning_rate": 0.0007035267349260523, + "loss": 6.7782, + "step": 2606 + }, + { + "epoch": 0.8897610921501706, + "grad_norm": 3.3153281211853027, + "learning_rate": 0.0007034129692832765, + "loss": 6.5789, + "step": 2607 + }, + { + "epoch": 0.8901023890784983, + "grad_norm": 3.2125444412231445, + "learning_rate": 0.0007032992036405006, + "loss": 6.57, + "step": 2608 + }, + { + "epoch": 0.8904436860068259, + "grad_norm": 3.210056781768799, + "learning_rate": 0.0007031854379977247, + "loss": 6.4377, + "step": 2609 + }, + { + "epoch": 0.8907849829351536, + "grad_norm": 3.1639184951782227, + "learning_rate": 0.0007030716723549488, + "loss": 6.4598, + "step": 2610 + }, + { + "epoch": 0.8911262798634813, + "grad_norm": 3.7884199619293213, + "learning_rate": 0.0007029579067121729, + "loss": 6.4981, + "step": 2611 + }, + { + "epoch": 0.8914675767918089, + "grad_norm": 3.2961177825927734, + "learning_rate": 0.000702844141069397, + "loss": 6.8189, + "step": 2612 + }, + { + "epoch": 0.8918088737201365, + "grad_norm": 3.3489444255828857, + "learning_rate": 0.0007027303754266213, + "loss": 6.3774, + "step": 2613 + }, + { + "epoch": 0.8921501706484641, + "grad_norm": 3.563915967941284, + "learning_rate": 0.0007026166097838454, + "loss": 6.4961, + "step": 2614 + }, + { + "epoch": 0.8924914675767918, + "grad_norm": 3.3189163208007812, + "learning_rate": 0.0007025028441410695, + "loss": 6.6371, + "step": 2615 + }, + { + "epoch": 0.8928327645051195, + "grad_norm": 3.447849750518799, + "learning_rate": 0.0007023890784982935, + "loss": 5.8723, + "step": 2616 + }, + { + "epoch": 0.8931740614334471, + "grad_norm": 3.2559688091278076, + "learning_rate": 0.0007022753128555176, + "loss": 6.4424, + "step": 2617 + }, + { + "epoch": 0.8935153583617748, + "grad_norm": 4.270260334014893, + "learning_rate": 0.0007021615472127417, + "loss": 5.9736, + "step": 2618 + }, + { + "epoch": 0.8938566552901024, + "grad_norm": 3.3759734630584717, + "learning_rate": 0.0007020477815699659, + "loss": 6.6424, + "step": 2619 + }, + { + "epoch": 0.89419795221843, + "grad_norm": 3.839401960372925, + "learning_rate": 0.00070193401592719, + "loss": 5.8343, + "step": 2620 + }, + { + "epoch": 0.8945392491467576, + "grad_norm": 3.4402377605438232, + "learning_rate": 0.0007018202502844141, + "loss": 5.9067, + "step": 2621 + }, + { + "epoch": 0.8948805460750853, + "grad_norm": 3.8046011924743652, + "learning_rate": 0.0007017064846416382, + "loss": 6.205, + "step": 2622 + }, + { + "epoch": 0.895221843003413, + "grad_norm": 3.5362050533294678, + "learning_rate": 0.0007015927189988623, + "loss": 6.9003, + "step": 2623 + }, + { + "epoch": 0.8955631399317406, + "grad_norm": 3.417820930480957, + "learning_rate": 0.0007014789533560865, + "loss": 6.9618, + "step": 2624 + }, + { + "epoch": 0.8959044368600683, + "grad_norm": 3.3132388591766357, + "learning_rate": 0.0007013651877133106, + "loss": 6.8951, + "step": 2625 + }, + { + "epoch": 0.8962457337883959, + "grad_norm": 3.25205659866333, + "learning_rate": 0.0007012514220705347, + "loss": 6.361, + "step": 2626 + }, + { + "epoch": 0.8965870307167235, + "grad_norm": 3.5494885444641113, + "learning_rate": 0.0007011376564277588, + "loss": 6.2906, + "step": 2627 + }, + { + "epoch": 0.8969283276450511, + "grad_norm": 3.837991714477539, + "learning_rate": 0.0007010238907849829, + "loss": 5.26, + "step": 2628 + }, + { + "epoch": 0.8972696245733789, + "grad_norm": 3.9714996814727783, + "learning_rate": 0.000700910125142207, + "loss": 6.2808, + "step": 2629 + }, + { + "epoch": 0.8976109215017065, + "grad_norm": 3.556457042694092, + "learning_rate": 0.0007007963594994313, + "loss": 6.3822, + "step": 2630 + }, + { + "epoch": 0.8979522184300341, + "grad_norm": 3.464637517929077, + "learning_rate": 0.0007006825938566554, + "loss": 6.9314, + "step": 2631 + }, + { + "epoch": 0.8982935153583618, + "grad_norm": 6.831556797027588, + "learning_rate": 0.0007005688282138795, + "loss": 6.4999, + "step": 2632 + }, + { + "epoch": 0.8986348122866894, + "grad_norm": 3.62048602104187, + "learning_rate": 0.0007004550625711036, + "loss": 6.0907, + "step": 2633 + }, + { + "epoch": 0.898976109215017, + "grad_norm": 3.393540620803833, + "learning_rate": 0.0007003412969283277, + "loss": 6.192, + "step": 2634 + }, + { + "epoch": 0.8993174061433447, + "grad_norm": 3.4911301136016846, + "learning_rate": 0.0007002275312855518, + "loss": 6.8963, + "step": 2635 + }, + { + "epoch": 0.8996587030716724, + "grad_norm": 3.324739694595337, + "learning_rate": 0.0007001137656427759, + "loss": 6.5604, + "step": 2636 + }, + { + "epoch": 0.9, + "grad_norm": 3.236996650695801, + "learning_rate": 0.0007, + "loss": 6.8306, + "step": 2637 + }, + { + "epoch": 0.9003412969283277, + "grad_norm": 6.527902603149414, + "learning_rate": 0.0006998862343572241, + "loss": 4.9167, + "step": 2638 + }, + { + "epoch": 0.9006825938566553, + "grad_norm": 3.5526578426361084, + "learning_rate": 0.0006997724687144482, + "loss": 6.7842, + "step": 2639 + }, + { + "epoch": 0.9010238907849829, + "grad_norm": 3.3249645233154297, + "learning_rate": 0.0006996587030716723, + "loss": 6.6712, + "step": 2640 + }, + { + "epoch": 0.9013651877133105, + "grad_norm": 3.207120656967163, + "learning_rate": 0.0006995449374288964, + "loss": 6.8788, + "step": 2641 + }, + { + "epoch": 0.9017064846416383, + "grad_norm": 3.128549098968506, + "learning_rate": 0.0006994311717861206, + "loss": 6.5011, + "step": 2642 + }, + { + "epoch": 0.9020477815699659, + "grad_norm": 3.4323368072509766, + "learning_rate": 0.0006993174061433447, + "loss": 6.3345, + "step": 2643 + }, + { + "epoch": 0.9023890784982935, + "grad_norm": 3.2746779918670654, + "learning_rate": 0.0006992036405005688, + "loss": 6.3755, + "step": 2644 + }, + { + "epoch": 0.9027303754266212, + "grad_norm": 8.446745872497559, + "learning_rate": 0.0006990898748577929, + "loss": 6.7535, + "step": 2645 + }, + { + "epoch": 0.9030716723549488, + "grad_norm": 3.5905816555023193, + "learning_rate": 0.000698976109215017, + "loss": 6.7017, + "step": 2646 + }, + { + "epoch": 0.9034129692832764, + "grad_norm": 3.572148084640503, + "learning_rate": 0.0006988623435722413, + "loss": 6.6064, + "step": 2647 + }, + { + "epoch": 0.903754266211604, + "grad_norm": 5.641802787780762, + "learning_rate": 0.0006987485779294654, + "loss": 5.9658, + "step": 2648 + }, + { + "epoch": 0.9040955631399318, + "grad_norm": 3.906459331512451, + "learning_rate": 0.0006986348122866895, + "loss": 6.1134, + "step": 2649 + }, + { + "epoch": 0.9044368600682594, + "grad_norm": 3.3307929039001465, + "learning_rate": 0.0006985210466439136, + "loss": 6.8376, + "step": 2650 + }, + { + "epoch": 0.904778156996587, + "grad_norm": 3.302304267883301, + "learning_rate": 0.0006984072810011377, + "loss": 6.762, + "step": 2651 + }, + { + "epoch": 0.9051194539249147, + "grad_norm": 3.249640941619873, + "learning_rate": 0.0006982935153583618, + "loss": 6.7481, + "step": 2652 + }, + { + "epoch": 0.9054607508532423, + "grad_norm": 5.366897106170654, + "learning_rate": 0.000698179749715586, + "loss": 6.1722, + "step": 2653 + }, + { + "epoch": 0.9058020477815699, + "grad_norm": 3.280503749847412, + "learning_rate": 0.0006980659840728101, + "loss": 6.5592, + "step": 2654 + }, + { + "epoch": 0.9061433447098977, + "grad_norm": 3.4887964725494385, + "learning_rate": 0.0006979522184300341, + "loss": 6.6677, + "step": 2655 + }, + { + "epoch": 0.9064846416382253, + "grad_norm": 3.333489418029785, + "learning_rate": 0.0006978384527872582, + "loss": 5.8702, + "step": 2656 + }, + { + "epoch": 0.9068259385665529, + "grad_norm": 3.1572794914245605, + "learning_rate": 0.0006977246871444823, + "loss": 6.6554, + "step": 2657 + }, + { + "epoch": 0.9071672354948805, + "grad_norm": 3.2989954948425293, + "learning_rate": 0.0006976109215017064, + "loss": 6.5196, + "step": 2658 + }, + { + "epoch": 0.9075085324232082, + "grad_norm": 5.108197212219238, + "learning_rate": 0.0006974971558589306, + "loss": 5.6353, + "step": 2659 + }, + { + "epoch": 0.9078498293515358, + "grad_norm": 3.3741981983184814, + "learning_rate": 0.0006973833902161547, + "loss": 6.3594, + "step": 2660 + }, + { + "epoch": 0.9081911262798635, + "grad_norm": 3.180333137512207, + "learning_rate": 0.0006972696245733788, + "loss": 6.3406, + "step": 2661 + }, + { + "epoch": 0.9085324232081912, + "grad_norm": 3.803738832473755, + "learning_rate": 0.0006971558589306029, + "loss": 6.3439, + "step": 2662 + }, + { + "epoch": 0.9088737201365188, + "grad_norm": 4.112486362457275, + "learning_rate": 0.000697042093287827, + "loss": 5.6222, + "step": 2663 + }, + { + "epoch": 0.9092150170648464, + "grad_norm": 3.6349542140960693, + "learning_rate": 0.0006969283276450513, + "loss": 6.0037, + "step": 2664 + }, + { + "epoch": 0.909556313993174, + "grad_norm": 3.6072964668273926, + "learning_rate": 0.0006968145620022754, + "loss": 6.6496, + "step": 2665 + }, + { + "epoch": 0.9098976109215017, + "grad_norm": 3.841954469680786, + "learning_rate": 0.0006967007963594995, + "loss": 7.091, + "step": 2666 + }, + { + "epoch": 0.9102389078498293, + "grad_norm": 3.223397970199585, + "learning_rate": 0.0006965870307167236, + "loss": 6.9502, + "step": 2667 + }, + { + "epoch": 0.910580204778157, + "grad_norm": 3.3219428062438965, + "learning_rate": 0.0006964732650739477, + "loss": 6.7914, + "step": 2668 + }, + { + "epoch": 0.9109215017064847, + "grad_norm": 3.22790265083313, + "learning_rate": 0.0006963594994311718, + "loss": 6.8888, + "step": 2669 + }, + { + "epoch": 0.9112627986348123, + "grad_norm": 3.58799409866333, + "learning_rate": 0.000696245733788396, + "loss": 6.5231, + "step": 2670 + }, + { + "epoch": 0.9116040955631399, + "grad_norm": 3.4284210205078125, + "learning_rate": 0.0006961319681456201, + "loss": 6.4324, + "step": 2671 + }, + { + "epoch": 0.9119453924914676, + "grad_norm": 3.6052560806274414, + "learning_rate": 0.0006960182025028442, + "loss": 6.6546, + "step": 2672 + }, + { + "epoch": 0.9122866894197952, + "grad_norm": 3.6261966228485107, + "learning_rate": 0.0006959044368600683, + "loss": 6.6569, + "step": 2673 + }, + { + "epoch": 0.9126279863481229, + "grad_norm": 3.32460880279541, + "learning_rate": 0.0006957906712172923, + "loss": 6.5463, + "step": 2674 + }, + { + "epoch": 0.9129692832764505, + "grad_norm": 3.767965078353882, + "learning_rate": 0.0006956769055745164, + "loss": 6.153, + "step": 2675 + }, + { + "epoch": 0.9133105802047782, + "grad_norm": 3.2888388633728027, + "learning_rate": 0.0006955631399317406, + "loss": 6.5947, + "step": 2676 + }, + { + "epoch": 0.9136518771331058, + "grad_norm": 3.235499620437622, + "learning_rate": 0.0006954493742889647, + "loss": 6.6224, + "step": 2677 + }, + { + "epoch": 0.9139931740614334, + "grad_norm": 3.1884522438049316, + "learning_rate": 0.0006953356086461888, + "loss": 6.8578, + "step": 2678 + }, + { + "epoch": 0.9143344709897611, + "grad_norm": 3.4016973972320557, + "learning_rate": 0.0006952218430034129, + "loss": 6.9575, + "step": 2679 + }, + { + "epoch": 0.9146757679180887, + "grad_norm": 3.2565393447875977, + "learning_rate": 0.000695108077360637, + "loss": 7.086, + "step": 2680 + }, + { + "epoch": 0.9150170648464164, + "grad_norm": 3.359285593032837, + "learning_rate": 0.0006949943117178612, + "loss": 6.695, + "step": 2681 + }, + { + "epoch": 0.9153583617747441, + "grad_norm": 3.5385429859161377, + "learning_rate": 0.0006948805460750854, + "loss": 6.2872, + "step": 2682 + }, + { + "epoch": 0.9156996587030717, + "grad_norm": 3.284391164779663, + "learning_rate": 0.0006947667804323095, + "loss": 6.5209, + "step": 2683 + }, + { + "epoch": 0.9160409556313993, + "grad_norm": 3.260636329650879, + "learning_rate": 0.0006946530147895336, + "loss": 6.4423, + "step": 2684 + }, + { + "epoch": 0.9163822525597269, + "grad_norm": 5.266411304473877, + "learning_rate": 0.0006945392491467577, + "loss": 5.6748, + "step": 2685 + }, + { + "epoch": 0.9167235494880546, + "grad_norm": 3.5976195335388184, + "learning_rate": 0.0006944254835039818, + "loss": 6.3951, + "step": 2686 + }, + { + "epoch": 0.9170648464163823, + "grad_norm": 3.503249406814575, + "learning_rate": 0.000694311717861206, + "loss": 6.8257, + "step": 2687 + }, + { + "epoch": 0.9174061433447099, + "grad_norm": 3.3149096965789795, + "learning_rate": 0.0006941979522184301, + "loss": 6.1159, + "step": 2688 + }, + { + "epoch": 0.9177474402730376, + "grad_norm": 3.2986981868743896, + "learning_rate": 0.0006940841865756542, + "loss": 6.1175, + "step": 2689 + }, + { + "epoch": 0.9180887372013652, + "grad_norm": 3.357754707336426, + "learning_rate": 0.0006939704209328783, + "loss": 6.8085, + "step": 2690 + }, + { + "epoch": 0.9184300341296928, + "grad_norm": 3.491800308227539, + "learning_rate": 0.0006938566552901024, + "loss": 6.4161, + "step": 2691 + }, + { + "epoch": 0.9187713310580204, + "grad_norm": 3.2512035369873047, + "learning_rate": 0.0006937428896473265, + "loss": 6.8517, + "step": 2692 + }, + { + "epoch": 0.9191126279863481, + "grad_norm": 3.1842398643493652, + "learning_rate": 0.0006936291240045507, + "loss": 7.1578, + "step": 2693 + }, + { + "epoch": 0.9194539249146758, + "grad_norm": 3.4947540760040283, + "learning_rate": 0.0006935153583617747, + "loss": 6.0914, + "step": 2694 + }, + { + "epoch": 0.9197952218430034, + "grad_norm": 8.686001777648926, + "learning_rate": 0.0006934015927189988, + "loss": 5.7208, + "step": 2695 + }, + { + "epoch": 0.9201365187713311, + "grad_norm": 3.50386643409729, + "learning_rate": 0.0006932878270762229, + "loss": 6.766, + "step": 2696 + }, + { + "epoch": 0.9204778156996587, + "grad_norm": 3.275815486907959, + "learning_rate": 0.000693174061433447, + "loss": 6.6614, + "step": 2697 + }, + { + "epoch": 0.9208191126279863, + "grad_norm": 3.2420060634613037, + "learning_rate": 0.0006930602957906712, + "loss": 6.4655, + "step": 2698 + }, + { + "epoch": 0.921160409556314, + "grad_norm": 3.386936902999878, + "learning_rate": 0.0006929465301478954, + "loss": 6.318, + "step": 2699 + }, + { + "epoch": 0.9215017064846417, + "grad_norm": 5.493778705596924, + "learning_rate": 0.0006928327645051195, + "loss": 5.9732, + "step": 2700 + }, + { + "epoch": 0.9218430034129693, + "grad_norm": 3.2191455364227295, + "learning_rate": 0.0006927189988623436, + "loss": 6.5306, + "step": 2701 + }, + { + "epoch": 0.922184300341297, + "grad_norm": 3.3493263721466064, + "learning_rate": 0.0006926052332195677, + "loss": 6.4363, + "step": 2702 + }, + { + "epoch": 0.9225255972696246, + "grad_norm": 3.3574280738830566, + "learning_rate": 0.0006924914675767918, + "loss": 6.9716, + "step": 2703 + }, + { + "epoch": 0.9228668941979522, + "grad_norm": 3.3387627601623535, + "learning_rate": 0.000692377701934016, + "loss": 6.9231, + "step": 2704 + }, + { + "epoch": 0.9232081911262798, + "grad_norm": 3.2208516597747803, + "learning_rate": 0.0006922639362912401, + "loss": 6.4118, + "step": 2705 + }, + { + "epoch": 0.9235494880546075, + "grad_norm": 3.2908411026000977, + "learning_rate": 0.0006921501706484642, + "loss": 6.4098, + "step": 2706 + }, + { + "epoch": 0.9238907849829352, + "grad_norm": 3.3244807720184326, + "learning_rate": 0.0006920364050056883, + "loss": 6.8572, + "step": 2707 + }, + { + "epoch": 0.9242320819112628, + "grad_norm": 3.2083566188812256, + "learning_rate": 0.0006919226393629124, + "loss": 6.8654, + "step": 2708 + }, + { + "epoch": 0.9245733788395905, + "grad_norm": 3.230299472808838, + "learning_rate": 0.0006918088737201365, + "loss": 6.6024, + "step": 2709 + }, + { + "epoch": 0.9249146757679181, + "grad_norm": 3.518018960952759, + "learning_rate": 0.0006916951080773607, + "loss": 5.8393, + "step": 2710 + }, + { + "epoch": 0.9252559726962457, + "grad_norm": 3.445234775543213, + "learning_rate": 0.0006915813424345848, + "loss": 6.0757, + "step": 2711 + }, + { + "epoch": 0.9255972696245733, + "grad_norm": 3.329686403274536, + "learning_rate": 0.000691467576791809, + "loss": 6.8527, + "step": 2712 + }, + { + "epoch": 0.9259385665529011, + "grad_norm": 3.224597215652466, + "learning_rate": 0.0006913538111490329, + "loss": 6.5644, + "step": 2713 + }, + { + "epoch": 0.9262798634812287, + "grad_norm": 3.4149997234344482, + "learning_rate": 0.000691240045506257, + "loss": 6.7847, + "step": 2714 + }, + { + "epoch": 0.9266211604095563, + "grad_norm": 3.3255600929260254, + "learning_rate": 0.0006911262798634812, + "loss": 6.5404, + "step": 2715 + }, + { + "epoch": 0.926962457337884, + "grad_norm": 5.029857635498047, + "learning_rate": 0.0006910125142207054, + "loss": 5.6074, + "step": 2716 + }, + { + "epoch": 0.9273037542662116, + "grad_norm": 4.0226149559021, + "learning_rate": 0.0006908987485779295, + "loss": 6.6629, + "step": 2717 + }, + { + "epoch": 0.9276450511945392, + "grad_norm": 3.5831539630889893, + "learning_rate": 0.0006907849829351536, + "loss": 5.8483, + "step": 2718 + }, + { + "epoch": 0.9279863481228668, + "grad_norm": 3.3607494831085205, + "learning_rate": 0.0006906712172923777, + "loss": 6.7091, + "step": 2719 + }, + { + "epoch": 0.9283276450511946, + "grad_norm": 5.548701763153076, + "learning_rate": 0.0006905574516496018, + "loss": 5.6986, + "step": 2720 + }, + { + "epoch": 0.9286689419795222, + "grad_norm": 3.317486047744751, + "learning_rate": 0.0006904436860068259, + "loss": 6.4468, + "step": 2721 + }, + { + "epoch": 0.9290102389078498, + "grad_norm": 3.3964345455169678, + "learning_rate": 0.0006903299203640501, + "loss": 6.7458, + "step": 2722 + }, + { + "epoch": 0.9293515358361775, + "grad_norm": 3.141096591949463, + "learning_rate": 0.0006902161547212742, + "loss": 6.8357, + "step": 2723 + }, + { + "epoch": 0.9296928327645051, + "grad_norm": 3.1272401809692383, + "learning_rate": 0.0006901023890784983, + "loss": 6.2205, + "step": 2724 + }, + { + "epoch": 0.9300341296928327, + "grad_norm": 3.82816743850708, + "learning_rate": 0.0006899886234357224, + "loss": 6.151, + "step": 2725 + }, + { + "epoch": 0.9303754266211605, + "grad_norm": 3.0288915634155273, + "learning_rate": 0.0006898748577929465, + "loss": 6.5263, + "step": 2726 + }, + { + "epoch": 0.9307167235494881, + "grad_norm": 3.344312906265259, + "learning_rate": 0.0006897610921501707, + "loss": 6.6451, + "step": 2727 + }, + { + "epoch": 0.9310580204778157, + "grad_norm": 3.386958122253418, + "learning_rate": 0.0006896473265073948, + "loss": 6.5277, + "step": 2728 + }, + { + "epoch": 0.9313993174061433, + "grad_norm": 3.474781036376953, + "learning_rate": 0.000689533560864619, + "loss": 6.0832, + "step": 2729 + }, + { + "epoch": 0.931740614334471, + "grad_norm": 3.3939759731292725, + "learning_rate": 0.000689419795221843, + "loss": 7.0273, + "step": 2730 + }, + { + "epoch": 0.9320819112627986, + "grad_norm": 3.276217460632324, + "learning_rate": 0.0006893060295790672, + "loss": 5.9832, + "step": 2731 + }, + { + "epoch": 0.9324232081911262, + "grad_norm": 3.2596402168273926, + "learning_rate": 0.0006891922639362913, + "loss": 6.8106, + "step": 2732 + }, + { + "epoch": 0.932764505119454, + "grad_norm": 3.262134313583374, + "learning_rate": 0.0006890784982935154, + "loss": 6.5364, + "step": 2733 + }, + { + "epoch": 0.9331058020477816, + "grad_norm": 4.899016380310059, + "learning_rate": 0.0006889647326507395, + "loss": 6.1792, + "step": 2734 + }, + { + "epoch": 0.9334470989761092, + "grad_norm": 4.251522064208984, + "learning_rate": 0.0006888509670079636, + "loss": 6.3958, + "step": 2735 + }, + { + "epoch": 0.9337883959044369, + "grad_norm": 4.013020992279053, + "learning_rate": 0.0006887372013651877, + "loss": 6.1985, + "step": 2736 + }, + { + "epoch": 0.9341296928327645, + "grad_norm": 3.3481905460357666, + "learning_rate": 0.0006886234357224118, + "loss": 7.0557, + "step": 2737 + }, + { + "epoch": 0.9344709897610921, + "grad_norm": 3.201247215270996, + "learning_rate": 0.0006885096700796359, + "loss": 6.2043, + "step": 2738 + }, + { + "epoch": 0.9348122866894198, + "grad_norm": 3.4401636123657227, + "learning_rate": 0.0006883959044368601, + "loss": 7.097, + "step": 2739 + }, + { + "epoch": 0.9351535836177475, + "grad_norm": 3.1851375102996826, + "learning_rate": 0.0006882821387940842, + "loss": 6.761, + "step": 2740 + }, + { + "epoch": 0.9354948805460751, + "grad_norm": 3.280695915222168, + "learning_rate": 0.0006881683731513083, + "loss": 7.2953, + "step": 2741 + }, + { + "epoch": 0.9358361774744027, + "grad_norm": 3.2717418670654297, + "learning_rate": 0.0006880546075085324, + "loss": 6.71, + "step": 2742 + }, + { + "epoch": 0.9361774744027304, + "grad_norm": 3.8960654735565186, + "learning_rate": 0.0006879408418657565, + "loss": 5.5993, + "step": 2743 + }, + { + "epoch": 0.936518771331058, + "grad_norm": 3.261302947998047, + "learning_rate": 0.0006878270762229807, + "loss": 6.2783, + "step": 2744 + }, + { + "epoch": 0.9368600682593856, + "grad_norm": 3.3171653747558594, + "learning_rate": 0.0006877133105802048, + "loss": 5.9773, + "step": 2745 + }, + { + "epoch": 0.9372013651877134, + "grad_norm": 3.2578046321868896, + "learning_rate": 0.000687599544937429, + "loss": 6.0005, + "step": 2746 + }, + { + "epoch": 0.937542662116041, + "grad_norm": 3.587801933288574, + "learning_rate": 0.0006874857792946531, + "loss": 6.7843, + "step": 2747 + }, + { + "epoch": 0.9378839590443686, + "grad_norm": 3.3838870525360107, + "learning_rate": 0.0006873720136518772, + "loss": 6.8512, + "step": 2748 + }, + { + "epoch": 0.9382252559726962, + "grad_norm": 3.3424386978149414, + "learning_rate": 0.0006872582480091013, + "loss": 6.5035, + "step": 2749 + }, + { + "epoch": 0.9385665529010239, + "grad_norm": 4.673618793487549, + "learning_rate": 0.0006871444823663255, + "loss": 4.9139, + "step": 2750 + }, + { + "epoch": 0.9389078498293515, + "grad_norm": 3.2579727172851562, + "learning_rate": 0.0006870307167235496, + "loss": 6.4092, + "step": 2751 + }, + { + "epoch": 0.9392491467576792, + "grad_norm": 3.674421787261963, + "learning_rate": 0.0006869169510807736, + "loss": 6.4869, + "step": 2752 + }, + { + "epoch": 0.9395904436860069, + "grad_norm": 3.2421398162841797, + "learning_rate": 0.0006868031854379977, + "loss": 6.3196, + "step": 2753 + }, + { + "epoch": 0.9399317406143345, + "grad_norm": 3.4086074829101562, + "learning_rate": 0.0006866894197952218, + "loss": 6.2654, + "step": 2754 + }, + { + "epoch": 0.9402730375426621, + "grad_norm": 3.3109261989593506, + "learning_rate": 0.0006865756541524459, + "loss": 6.3046, + "step": 2755 + }, + { + "epoch": 0.9406143344709897, + "grad_norm": 3.2505524158477783, + "learning_rate": 0.0006864618885096701, + "loss": 6.3068, + "step": 2756 + }, + { + "epoch": 0.9409556313993174, + "grad_norm": 3.0877838134765625, + "learning_rate": 0.0006863481228668942, + "loss": 6.4801, + "step": 2757 + }, + { + "epoch": 0.941296928327645, + "grad_norm": 3.232034206390381, + "learning_rate": 0.0006862343572241183, + "loss": 6.1997, + "step": 2758 + }, + { + "epoch": 0.9416382252559727, + "grad_norm": 3.7556653022766113, + "learning_rate": 0.0006861205915813424, + "loss": 6.0221, + "step": 2759 + }, + { + "epoch": 0.9419795221843004, + "grad_norm": 3.408632278442383, + "learning_rate": 0.0006860068259385665, + "loss": 6.4331, + "step": 2760 + }, + { + "epoch": 0.942320819112628, + "grad_norm": 6.045160293579102, + "learning_rate": 0.0006858930602957906, + "loss": 5.5115, + "step": 2761 + }, + { + "epoch": 0.9426621160409556, + "grad_norm": 3.6110312938690186, + "learning_rate": 0.0006857792946530148, + "loss": 6.957, + "step": 2762 + }, + { + "epoch": 0.9430034129692833, + "grad_norm": 3.6176974773406982, + "learning_rate": 0.000685665529010239, + "loss": 6.2635, + "step": 2763 + }, + { + "epoch": 0.9433447098976109, + "grad_norm": 3.2292110919952393, + "learning_rate": 0.0006855517633674631, + "loss": 6.4325, + "step": 2764 + }, + { + "epoch": 0.9436860068259386, + "grad_norm": 3.226062536239624, + "learning_rate": 0.0006854379977246872, + "loss": 6.1578, + "step": 2765 + }, + { + "epoch": 0.9440273037542662, + "grad_norm": 3.0933258533477783, + "learning_rate": 0.0006853242320819113, + "loss": 6.3193, + "step": 2766 + }, + { + "epoch": 0.9443686006825939, + "grad_norm": 3.3063440322875977, + "learning_rate": 0.0006852104664391355, + "loss": 6.4354, + "step": 2767 + }, + { + "epoch": 0.9447098976109215, + "grad_norm": 7.300811290740967, + "learning_rate": 0.0006850967007963596, + "loss": 5.611, + "step": 2768 + }, + { + "epoch": 0.9450511945392491, + "grad_norm": 3.557973623275757, + "learning_rate": 0.0006849829351535837, + "loss": 6.5173, + "step": 2769 + }, + { + "epoch": 0.9453924914675768, + "grad_norm": 2.229107141494751, + "learning_rate": 0.0006848691695108078, + "loss": 3.489, + "step": 2770 + }, + { + "epoch": 0.9457337883959044, + "grad_norm": 3.4767768383026123, + "learning_rate": 0.0006847554038680319, + "loss": 6.4509, + "step": 2771 + }, + { + "epoch": 0.9460750853242321, + "grad_norm": 3.309904098510742, + "learning_rate": 0.0006846416382252559, + "loss": 6.3209, + "step": 2772 + }, + { + "epoch": 0.9464163822525598, + "grad_norm": 3.6992416381835938, + "learning_rate": 0.0006845278725824801, + "loss": 6.4455, + "step": 2773 + }, + { + "epoch": 0.9467576791808874, + "grad_norm": 3.4120285511016846, + "learning_rate": 0.0006844141069397042, + "loss": 6.5287, + "step": 2774 + }, + { + "epoch": 0.947098976109215, + "grad_norm": 3.3059194087982178, + "learning_rate": 0.0006843003412969283, + "loss": 6.4193, + "step": 2775 + }, + { + "epoch": 0.9474402730375426, + "grad_norm": 3.727429151535034, + "learning_rate": 0.0006841865756541524, + "loss": 6.0036, + "step": 2776 + }, + { + "epoch": 0.9477815699658703, + "grad_norm": 5.650639057159424, + "learning_rate": 0.0006840728100113765, + "loss": 5.4394, + "step": 2777 + }, + { + "epoch": 0.948122866894198, + "grad_norm": 3.2986202239990234, + "learning_rate": 0.0006839590443686006, + "loss": 6.7952, + "step": 2778 + }, + { + "epoch": 0.9484641638225256, + "grad_norm": 4.086126804351807, + "learning_rate": 0.0006838452787258248, + "loss": 4.7384, + "step": 2779 + }, + { + "epoch": 0.9488054607508533, + "grad_norm": 3.3148763179779053, + "learning_rate": 0.000683731513083049, + "loss": 6.6121, + "step": 2780 + }, + { + "epoch": 0.9491467576791809, + "grad_norm": 3.115473508834839, + "learning_rate": 0.0006836177474402731, + "loss": 6.375, + "step": 2781 + }, + { + "epoch": 0.9494880546075085, + "grad_norm": 4.739993572235107, + "learning_rate": 0.0006835039817974972, + "loss": 6.0525, + "step": 2782 + }, + { + "epoch": 0.9498293515358361, + "grad_norm": 3.0348520278930664, + "learning_rate": 0.0006833902161547213, + "loss": 6.6161, + "step": 2783 + }, + { + "epoch": 0.9501706484641638, + "grad_norm": 4.478902339935303, + "learning_rate": 0.0006832764505119454, + "loss": 6.0643, + "step": 2784 + }, + { + "epoch": 0.9505119453924915, + "grad_norm": 3.3169867992401123, + "learning_rate": 0.0006831626848691696, + "loss": 6.4104, + "step": 2785 + }, + { + "epoch": 0.9508532423208191, + "grad_norm": 3.3145945072174072, + "learning_rate": 0.0006830489192263937, + "loss": 6.6644, + "step": 2786 + }, + { + "epoch": 0.9511945392491468, + "grad_norm": 3.361513614654541, + "learning_rate": 0.0006829351535836178, + "loss": 6.4348, + "step": 2787 + }, + { + "epoch": 0.9515358361774744, + "grad_norm": 3.221954584121704, + "learning_rate": 0.0006828213879408419, + "loss": 6.8069, + "step": 2788 + }, + { + "epoch": 0.951877133105802, + "grad_norm": 3.3777694702148438, + "learning_rate": 0.000682707622298066, + "loss": 5.9127, + "step": 2789 + }, + { + "epoch": 0.9522184300341296, + "grad_norm": 4.066497325897217, + "learning_rate": 0.0006825938566552902, + "loss": 5.7392, + "step": 2790 + }, + { + "epoch": 0.9525597269624574, + "grad_norm": 3.3121917247772217, + "learning_rate": 0.0006824800910125142, + "loss": 6.9516, + "step": 2791 + }, + { + "epoch": 0.952901023890785, + "grad_norm": 3.3323752880096436, + "learning_rate": 0.0006823663253697383, + "loss": 6.7219, + "step": 2792 + }, + { + "epoch": 0.9532423208191126, + "grad_norm": 3.7000746726989746, + "learning_rate": 0.0006822525597269624, + "loss": 6.541, + "step": 2793 + }, + { + "epoch": 0.9535836177474403, + "grad_norm": 3.4040660858154297, + "learning_rate": 0.0006821387940841865, + "loss": 6.7015, + "step": 2794 + }, + { + "epoch": 0.9539249146757679, + "grad_norm": 3.9156670570373535, + "learning_rate": 0.0006820250284414106, + "loss": 6.447, + "step": 2795 + }, + { + "epoch": 0.9542662116040955, + "grad_norm": 3.4194421768188477, + "learning_rate": 0.0006819112627986348, + "loss": 6.6974, + "step": 2796 + }, + { + "epoch": 0.9546075085324232, + "grad_norm": 4.750198841094971, + "learning_rate": 0.000681797497155859, + "loss": 6.2898, + "step": 2797 + }, + { + "epoch": 0.9549488054607509, + "grad_norm": 3.528273582458496, + "learning_rate": 0.0006816837315130831, + "loss": 6.2098, + "step": 2798 + }, + { + "epoch": 0.9552901023890785, + "grad_norm": 3.286182403564453, + "learning_rate": 0.0006815699658703072, + "loss": 6.7786, + "step": 2799 + }, + { + "epoch": 0.9556313993174061, + "grad_norm": 3.299617052078247, + "learning_rate": 0.0006814562002275313, + "loss": 6.6818, + "step": 2800 + }, + { + "epoch": 0.9559726962457338, + "grad_norm": 3.2190589904785156, + "learning_rate": 0.0006813424345847554, + "loss": 6.15, + "step": 2801 + }, + { + "epoch": 0.9563139931740614, + "grad_norm": 3.2283973693847656, + "learning_rate": 0.0006812286689419796, + "loss": 5.7856, + "step": 2802 + }, + { + "epoch": 0.956655290102389, + "grad_norm": 3.121652364730835, + "learning_rate": 0.0006811149032992037, + "loss": 6.5968, + "step": 2803 + }, + { + "epoch": 0.9569965870307168, + "grad_norm": 3.343250274658203, + "learning_rate": 0.0006810011376564278, + "loss": 6.9726, + "step": 2804 + }, + { + "epoch": 0.9573378839590444, + "grad_norm": 3.670442819595337, + "learning_rate": 0.0006808873720136519, + "loss": 5.8369, + "step": 2805 + }, + { + "epoch": 0.957679180887372, + "grad_norm": 3.4261021614074707, + "learning_rate": 0.000680773606370876, + "loss": 6.7033, + "step": 2806 + }, + { + "epoch": 0.9580204778156997, + "grad_norm": 3.3535799980163574, + "learning_rate": 0.0006806598407281002, + "loss": 5.2909, + "step": 2807 + }, + { + "epoch": 0.9583617747440273, + "grad_norm": 3.462440252304077, + "learning_rate": 0.0006805460750853243, + "loss": 5.9633, + "step": 2808 + }, + { + "epoch": 0.9587030716723549, + "grad_norm": 3.2325925827026367, + "learning_rate": 0.0006804323094425484, + "loss": 6.4796, + "step": 2809 + }, + { + "epoch": 0.9590443686006825, + "grad_norm": 3.415300130844116, + "learning_rate": 0.0006803185437997725, + "loss": 6.298, + "step": 2810 + }, + { + "epoch": 0.9593856655290103, + "grad_norm": 3.2603628635406494, + "learning_rate": 0.0006802047781569965, + "loss": 6.9542, + "step": 2811 + }, + { + "epoch": 0.9597269624573379, + "grad_norm": 3.158876657485962, + "learning_rate": 0.0006800910125142206, + "loss": 6.7321, + "step": 2812 + }, + { + "epoch": 0.9600682593856655, + "grad_norm": 3.137831687927246, + "learning_rate": 0.0006799772468714448, + "loss": 6.1844, + "step": 2813 + }, + { + "epoch": 0.9604095563139932, + "grad_norm": 3.162550687789917, + "learning_rate": 0.000679863481228669, + "loss": 6.5377, + "step": 2814 + }, + { + "epoch": 0.9607508532423208, + "grad_norm": 3.1445960998535156, + "learning_rate": 0.0006797497155858931, + "loss": 6.4721, + "step": 2815 + }, + { + "epoch": 0.9610921501706484, + "grad_norm": 3.4892373085021973, + "learning_rate": 0.0006796359499431172, + "loss": 6.4251, + "step": 2816 + }, + { + "epoch": 0.9614334470989762, + "grad_norm": 3.36893892288208, + "learning_rate": 0.0006795221843003413, + "loss": 5.9855, + "step": 2817 + }, + { + "epoch": 0.9617747440273038, + "grad_norm": 3.3693227767944336, + "learning_rate": 0.0006794084186575654, + "loss": 6.8197, + "step": 2818 + }, + { + "epoch": 0.9621160409556314, + "grad_norm": 3.2168221473693848, + "learning_rate": 0.0006792946530147896, + "loss": 6.6614, + "step": 2819 + }, + { + "epoch": 0.962457337883959, + "grad_norm": 3.355677604675293, + "learning_rate": 0.0006791808873720137, + "loss": 6.1324, + "step": 2820 + }, + { + "epoch": 0.9627986348122867, + "grad_norm": 3.28233003616333, + "learning_rate": 0.0006790671217292378, + "loss": 6.5953, + "step": 2821 + }, + { + "epoch": 0.9631399317406143, + "grad_norm": 3.175426959991455, + "learning_rate": 0.0006789533560864619, + "loss": 6.4139, + "step": 2822 + }, + { + "epoch": 0.9634812286689419, + "grad_norm": 3.2525055408477783, + "learning_rate": 0.000678839590443686, + "loss": 6.8946, + "step": 2823 + }, + { + "epoch": 0.9638225255972697, + "grad_norm": 3.2364585399627686, + "learning_rate": 0.0006787258248009101, + "loss": 6.5195, + "step": 2824 + }, + { + "epoch": 0.9641638225255973, + "grad_norm": 3.8817360401153564, + "learning_rate": 0.0006786120591581343, + "loss": 5.2455, + "step": 2825 + }, + { + "epoch": 0.9645051194539249, + "grad_norm": 3.36077618598938, + "learning_rate": 0.0006784982935153584, + "loss": 5.7979, + "step": 2826 + }, + { + "epoch": 0.9648464163822525, + "grad_norm": 3.421419620513916, + "learning_rate": 0.0006783845278725825, + "loss": 6.6912, + "step": 2827 + }, + { + "epoch": 0.9651877133105802, + "grad_norm": 3.2399160861968994, + "learning_rate": 0.0006782707622298066, + "loss": 6.7359, + "step": 2828 + }, + { + "epoch": 0.9655290102389078, + "grad_norm": 3.5829226970672607, + "learning_rate": 0.0006781569965870307, + "loss": 6.447, + "step": 2829 + }, + { + "epoch": 0.9658703071672355, + "grad_norm": 3.9516849517822266, + "learning_rate": 0.0006780432309442548, + "loss": 6.1873, + "step": 2830 + }, + { + "epoch": 0.9662116040955632, + "grad_norm": 3.371225595474243, + "learning_rate": 0.000677929465301479, + "loss": 6.8074, + "step": 2831 + }, + { + "epoch": 0.9665529010238908, + "grad_norm": 3.1678028106689453, + "learning_rate": 0.0006778156996587031, + "loss": 6.8476, + "step": 2832 + }, + { + "epoch": 0.9668941979522184, + "grad_norm": 3.185497283935547, + "learning_rate": 0.0006777019340159272, + "loss": 6.7461, + "step": 2833 + }, + { + "epoch": 0.967235494880546, + "grad_norm": 6.401412487030029, + "learning_rate": 0.0006775881683731513, + "loss": 6.6854, + "step": 2834 + }, + { + "epoch": 0.9675767918088737, + "grad_norm": 4.533454418182373, + "learning_rate": 0.0006774744027303754, + "loss": 5.6879, + "step": 2835 + }, + { + "epoch": 0.9679180887372013, + "grad_norm": 4.112688064575195, + "learning_rate": 0.0006773606370875996, + "loss": 6.5312, + "step": 2836 + }, + { + "epoch": 0.968259385665529, + "grad_norm": 3.4008233547210693, + "learning_rate": 0.0006772468714448237, + "loss": 6.5382, + "step": 2837 + }, + { + "epoch": 0.9686006825938567, + "grad_norm": 3.319769859313965, + "learning_rate": 0.0006771331058020478, + "loss": 6.3743, + "step": 2838 + }, + { + "epoch": 0.9689419795221843, + "grad_norm": 3.506112813949585, + "learning_rate": 0.0006770193401592719, + "loss": 5.7598, + "step": 2839 + }, + { + "epoch": 0.9692832764505119, + "grad_norm": 3.2987313270568848, + "learning_rate": 0.000676905574516496, + "loss": 6.7194, + "step": 2840 + }, + { + "epoch": 0.9696245733788396, + "grad_norm": 3.148381233215332, + "learning_rate": 0.0006767918088737201, + "loss": 6.5178, + "step": 2841 + }, + { + "epoch": 0.9699658703071672, + "grad_norm": 3.224257230758667, + "learning_rate": 0.0006766780432309443, + "loss": 6.1293, + "step": 2842 + }, + { + "epoch": 0.9703071672354949, + "grad_norm": 3.166309356689453, + "learning_rate": 0.0006765642775881684, + "loss": 6.7261, + "step": 2843 + }, + { + "epoch": 0.9706484641638226, + "grad_norm": 3.201484441757202, + "learning_rate": 0.0006764505119453925, + "loss": 6.6831, + "step": 2844 + }, + { + "epoch": 0.9709897610921502, + "grad_norm": 3.366837501525879, + "learning_rate": 0.0006763367463026166, + "loss": 6.6644, + "step": 2845 + }, + { + "epoch": 0.9713310580204778, + "grad_norm": 3.205770969390869, + "learning_rate": 0.0006762229806598407, + "loss": 6.787, + "step": 2846 + }, + { + "epoch": 0.9716723549488054, + "grad_norm": 3.400615692138672, + "learning_rate": 0.000676109215017065, + "loss": 6.4538, + "step": 2847 + }, + { + "epoch": 0.9720136518771331, + "grad_norm": 3.295478343963623, + "learning_rate": 0.0006759954493742891, + "loss": 7.0216, + "step": 2848 + }, + { + "epoch": 0.9723549488054608, + "grad_norm": 3.9448795318603516, + "learning_rate": 0.0006758816837315131, + "loss": 6.3134, + "step": 2849 + }, + { + "epoch": 0.9726962457337884, + "grad_norm": 3.3810625076293945, + "learning_rate": 0.0006757679180887372, + "loss": 6.5216, + "step": 2850 + }, + { + "epoch": 0.9730375426621161, + "grad_norm": 3.364837169647217, + "learning_rate": 0.0006756541524459613, + "loss": 6.4476, + "step": 2851 + }, + { + "epoch": 0.9733788395904437, + "grad_norm": 3.2336301803588867, + "learning_rate": 0.0006755403868031854, + "loss": 6.19, + "step": 2852 + }, + { + "epoch": 0.9737201365187713, + "grad_norm": 3.081376314163208, + "learning_rate": 0.0006754266211604096, + "loss": 6.6487, + "step": 2853 + }, + { + "epoch": 0.974061433447099, + "grad_norm": 3.2751305103302, + "learning_rate": 0.0006753128555176337, + "loss": 6.3007, + "step": 2854 + }, + { + "epoch": 0.9744027303754266, + "grad_norm": 6.972997665405273, + "learning_rate": 0.0006751990898748578, + "loss": 5.3055, + "step": 2855 + }, + { + "epoch": 0.9747440273037543, + "grad_norm": 3.2816948890686035, + "learning_rate": 0.0006750853242320819, + "loss": 6.5868, + "step": 2856 + }, + { + "epoch": 0.9750853242320819, + "grad_norm": 3.3620402812957764, + "learning_rate": 0.000674971558589306, + "loss": 6.5817, + "step": 2857 + }, + { + "epoch": 0.9754266211604096, + "grad_norm": 3.404201030731201, + "learning_rate": 0.0006748577929465301, + "loss": 6.3418, + "step": 2858 + }, + { + "epoch": 0.9757679180887372, + "grad_norm": 3.4882445335388184, + "learning_rate": 0.0006747440273037543, + "loss": 5.5313, + "step": 2859 + }, + { + "epoch": 0.9761092150170648, + "grad_norm": 3.3639590740203857, + "learning_rate": 0.0006746302616609784, + "loss": 6.7607, + "step": 2860 + }, + { + "epoch": 0.9764505119453925, + "grad_norm": 3.0582730770111084, + "learning_rate": 0.0006745164960182025, + "loss": 6.4016, + "step": 2861 + }, + { + "epoch": 0.9767918088737202, + "grad_norm": 3.409369945526123, + "learning_rate": 0.0006744027303754266, + "loss": 6.8788, + "step": 2862 + }, + { + "epoch": 0.9771331058020478, + "grad_norm": 3.20253849029541, + "learning_rate": 0.0006742889647326507, + "loss": 6.3911, + "step": 2863 + }, + { + "epoch": 0.9774744027303754, + "grad_norm": 3.5735726356506348, + "learning_rate": 0.0006741751990898749, + "loss": 6.0568, + "step": 2864 + }, + { + "epoch": 0.9778156996587031, + "grad_norm": 3.4013800621032715, + "learning_rate": 0.0006740614334470991, + "loss": 6.2177, + "step": 2865 + }, + { + "epoch": 0.9781569965870307, + "grad_norm": 5.85659646987915, + "learning_rate": 0.0006739476678043232, + "loss": 5.692, + "step": 2866 + }, + { + "epoch": 0.9784982935153583, + "grad_norm": 3.534287214279175, + "learning_rate": 0.0006738339021615473, + "loss": 6.3442, + "step": 2867 + }, + { + "epoch": 0.978839590443686, + "grad_norm": 3.368276357650757, + "learning_rate": 0.0006737201365187714, + "loss": 6.6924, + "step": 2868 + }, + { + "epoch": 0.9791808873720137, + "grad_norm": 3.3042497634887695, + "learning_rate": 0.0006736063708759954, + "loss": 6.2767, + "step": 2869 + }, + { + "epoch": 0.9795221843003413, + "grad_norm": 3.2396910190582275, + "learning_rate": 0.0006734926052332196, + "loss": 6.6282, + "step": 2870 + }, + { + "epoch": 0.979863481228669, + "grad_norm": 3.2214739322662354, + "learning_rate": 0.0006733788395904437, + "loss": 6.5097, + "step": 2871 + }, + { + "epoch": 0.9802047781569966, + "grad_norm": 3.2005984783172607, + "learning_rate": 0.0006732650739476678, + "loss": 6.2032, + "step": 2872 + }, + { + "epoch": 0.9805460750853242, + "grad_norm": 4.3526105880737305, + "learning_rate": 0.0006731513083048919, + "loss": 6.1474, + "step": 2873 + }, + { + "epoch": 0.9808873720136518, + "grad_norm": 3.500005006790161, + "learning_rate": 0.000673037542662116, + "loss": 6.3671, + "step": 2874 + }, + { + "epoch": 0.9812286689419796, + "grad_norm": 4.856510162353516, + "learning_rate": 0.0006729237770193401, + "loss": 5.3342, + "step": 2875 + }, + { + "epoch": 0.9815699658703072, + "grad_norm": 3.3631317615509033, + "learning_rate": 0.0006728100113765643, + "loss": 6.9552, + "step": 2876 + }, + { + "epoch": 0.9819112627986348, + "grad_norm": 4.488306999206543, + "learning_rate": 0.0006726962457337884, + "loss": 5.2524, + "step": 2877 + }, + { + "epoch": 0.9822525597269625, + "grad_norm": 3.190113067626953, + "learning_rate": 0.0006725824800910125, + "loss": 6.5939, + "step": 2878 + }, + { + "epoch": 0.9825938566552901, + "grad_norm": 3.2531468868255615, + "learning_rate": 0.0006724687144482366, + "loss": 6.8033, + "step": 2879 + }, + { + "epoch": 0.9829351535836177, + "grad_norm": 3.396819591522217, + "learning_rate": 0.0006723549488054607, + "loss": 6.5321, + "step": 2880 + }, + { + "epoch": 0.9832764505119453, + "grad_norm": 3.1268954277038574, + "learning_rate": 0.0006722411831626849, + "loss": 6.7329, + "step": 2881 + }, + { + "epoch": 0.9836177474402731, + "grad_norm": 3.4589643478393555, + "learning_rate": 0.0006721274175199091, + "loss": 6.5176, + "step": 2882 + }, + { + "epoch": 0.9839590443686007, + "grad_norm": 3.242070198059082, + "learning_rate": 0.0006720136518771332, + "loss": 6.6396, + "step": 2883 + }, + { + "epoch": 0.9843003412969283, + "grad_norm": 3.2771873474121094, + "learning_rate": 0.0006718998862343573, + "loss": 6.3999, + "step": 2884 + }, + { + "epoch": 0.984641638225256, + "grad_norm": 3.497062921524048, + "learning_rate": 0.0006717861205915814, + "loss": 6.2288, + "step": 2885 + }, + { + "epoch": 0.9849829351535836, + "grad_norm": 3.2468268871307373, + "learning_rate": 0.0006716723549488055, + "loss": 6.7244, + "step": 2886 + }, + { + "epoch": 0.9853242320819112, + "grad_norm": 3.181048631668091, + "learning_rate": 0.0006715585893060296, + "loss": 6.9261, + "step": 2887 + }, + { + "epoch": 0.985665529010239, + "grad_norm": 3.220104455947876, + "learning_rate": 0.0006714448236632537, + "loss": 6.4713, + "step": 2888 + }, + { + "epoch": 0.9860068259385666, + "grad_norm": 3.5669116973876953, + "learning_rate": 0.0006713310580204778, + "loss": 5.612, + "step": 2889 + }, + { + "epoch": 0.9863481228668942, + "grad_norm": 3.239811897277832, + "learning_rate": 0.0006712172923777019, + "loss": 6.8196, + "step": 2890 + }, + { + "epoch": 0.9866894197952218, + "grad_norm": 3.4331865310668945, + "learning_rate": 0.000671103526734926, + "loss": 6.6693, + "step": 2891 + }, + { + "epoch": 0.9870307167235495, + "grad_norm": 3.3848814964294434, + "learning_rate": 0.0006709897610921501, + "loss": 7.0898, + "step": 2892 + }, + { + "epoch": 0.9873720136518771, + "grad_norm": 3.2575674057006836, + "learning_rate": 0.0006708759954493743, + "loss": 6.7033, + "step": 2893 + }, + { + "epoch": 0.9877133105802047, + "grad_norm": 3.440713882446289, + "learning_rate": 0.0006707622298065984, + "loss": 6.4287, + "step": 2894 + }, + { + "epoch": 0.9880546075085325, + "grad_norm": 3.2217304706573486, + "learning_rate": 0.0006706484641638225, + "loss": 6.721, + "step": 2895 + }, + { + "epoch": 0.9883959044368601, + "grad_norm": 3.2484257221221924, + "learning_rate": 0.0006705346985210466, + "loss": 6.8919, + "step": 2896 + }, + { + "epoch": 0.9887372013651877, + "grad_norm": 3.206747055053711, + "learning_rate": 0.0006704209328782707, + "loss": 6.5002, + "step": 2897 + }, + { + "epoch": 0.9890784982935154, + "grad_norm": 3.30210280418396, + "learning_rate": 0.0006703071672354949, + "loss": 6.0187, + "step": 2898 + }, + { + "epoch": 0.989419795221843, + "grad_norm": 2.3465044498443604, + "learning_rate": 0.0006701934015927191, + "loss": 3.2803, + "step": 2899 + }, + { + "epoch": 0.9897610921501706, + "grad_norm": 3.455019950866699, + "learning_rate": 0.0006700796359499432, + "loss": 6.937, + "step": 2900 + }, + { + "epoch": 0.9901023890784983, + "grad_norm": 3.3712611198425293, + "learning_rate": 0.0006699658703071673, + "loss": 6.6826, + "step": 2901 + }, + { + "epoch": 0.990443686006826, + "grad_norm": 3.372312068939209, + "learning_rate": 0.0006698521046643914, + "loss": 6.529, + "step": 2902 + }, + { + "epoch": 0.9907849829351536, + "grad_norm": 3.750188112258911, + "learning_rate": 0.0006697383390216155, + "loss": 6.4456, + "step": 2903 + }, + { + "epoch": 0.9911262798634812, + "grad_norm": 3.316756010055542, + "learning_rate": 0.0006696245733788396, + "loss": 6.311, + "step": 2904 + }, + { + "epoch": 0.9914675767918089, + "grad_norm": 3.249242067337036, + "learning_rate": 0.0006695108077360638, + "loss": 6.9939, + "step": 2905 + }, + { + "epoch": 0.9918088737201365, + "grad_norm": 3.5320539474487305, + "learning_rate": 0.0006693970420932879, + "loss": 6.3807, + "step": 2906 + }, + { + "epoch": 0.9921501706484641, + "grad_norm": 3.2891733646392822, + "learning_rate": 0.000669283276450512, + "loss": 6.5872, + "step": 2907 + }, + { + "epoch": 0.9924914675767919, + "grad_norm": 3.310703754425049, + "learning_rate": 0.000669169510807736, + "loss": 6.7432, + "step": 2908 + }, + { + "epoch": 0.9928327645051195, + "grad_norm": 3.4947831630706787, + "learning_rate": 0.0006690557451649601, + "loss": 6.6943, + "step": 2909 + }, + { + "epoch": 0.9931740614334471, + "grad_norm": 6.251656532287598, + "learning_rate": 0.0006689419795221842, + "loss": 5.9758, + "step": 2910 + }, + { + "epoch": 0.9935153583617747, + "grad_norm": 3.236522912979126, + "learning_rate": 0.0006688282138794084, + "loss": 6.7063, + "step": 2911 + }, + { + "epoch": 0.9938566552901024, + "grad_norm": 3.4270472526550293, + "learning_rate": 0.0006687144482366325, + "loss": 6.9056, + "step": 2912 + }, + { + "epoch": 0.99419795221843, + "grad_norm": 3.3885722160339355, + "learning_rate": 0.0006686006825938566, + "loss": 6.6033, + "step": 2913 + }, + { + "epoch": 0.9945392491467577, + "grad_norm": 3.3846211433410645, + "learning_rate": 0.0006684869169510807, + "loss": 6.8574, + "step": 2914 + }, + { + "epoch": 0.9948805460750854, + "grad_norm": 3.4704039096832275, + "learning_rate": 0.0006683731513083049, + "loss": 6.2599, + "step": 2915 + }, + { + "epoch": 0.995221843003413, + "grad_norm": 3.076455593109131, + "learning_rate": 0.0006682593856655291, + "loss": 6.8536, + "step": 2916 + }, + { + "epoch": 0.9955631399317406, + "grad_norm": 3.1758480072021484, + "learning_rate": 0.0006681456200227532, + "loss": 6.7996, + "step": 2917 + }, + { + "epoch": 0.9959044368600682, + "grad_norm": 4.0002851486206055, + "learning_rate": 0.0006680318543799773, + "loss": 5.2598, + "step": 2918 + }, + { + "epoch": 0.9962457337883959, + "grad_norm": 5.545552730560303, + "learning_rate": 0.0006679180887372014, + "loss": 5.8838, + "step": 2919 + }, + { + "epoch": 0.9965870307167235, + "grad_norm": 3.345064163208008, + "learning_rate": 0.0006678043230944255, + "loss": 6.566, + "step": 2920 + }, + { + "epoch": 0.9969283276450512, + "grad_norm": 3.4302754402160645, + "learning_rate": 0.0006676905574516496, + "loss": 7.0484, + "step": 2921 + }, + { + "epoch": 0.9972696245733789, + "grad_norm": 3.445356607437134, + "learning_rate": 0.0006675767918088738, + "loss": 6.7516, + "step": 2922 + }, + { + "epoch": 0.9976109215017065, + "grad_norm": 3.258596181869507, + "learning_rate": 0.0006674630261660979, + "loss": 6.5921, + "step": 2923 + }, + { + "epoch": 0.9979522184300341, + "grad_norm": 3.594090700149536, + "learning_rate": 0.000667349260523322, + "loss": 6.6817, + "step": 2924 + }, + { + "epoch": 0.9982935153583617, + "grad_norm": 4.468551158905029, + "learning_rate": 0.0006672354948805461, + "loss": 6.0565, + "step": 2925 + }, + { + "epoch": 0.9986348122866894, + "grad_norm": 3.330815076828003, + "learning_rate": 0.0006671217292377702, + "loss": 6.2681, + "step": 2926 + }, + { + "epoch": 0.9989761092150171, + "grad_norm": 3.3018605709075928, + "learning_rate": 0.0006670079635949942, + "loss": 6.4861, + "step": 2927 + }, + { + "epoch": 0.9993174061433447, + "grad_norm": 3.5475411415100098, + "learning_rate": 0.0006668941979522184, + "loss": 6.3887, + "step": 2928 + }, + { + "epoch": 0.9996587030716724, + "grad_norm": 3.245288133621216, + "learning_rate": 0.0006667804323094425, + "loss": 6.7998, + "step": 2929 + }, + { + "epoch": 1.0, + "grad_norm": 3.472442388534546, + "learning_rate": 0.0006666666666666666, + "loss": 6.3197, + "step": 2930 + }, + { + "epoch": 1.0003412969283276, + "grad_norm": 3.2570908069610596, + "learning_rate": 0.0006665529010238907, + "loss": 6.4722, + "step": 2931 + }, + { + "epoch": 1.0006825938566553, + "grad_norm": 3.1390857696533203, + "learning_rate": 0.0006664391353811149, + "loss": 6.8016, + "step": 2932 + }, + { + "epoch": 1.0010238907849829, + "grad_norm": 3.2926814556121826, + "learning_rate": 0.0006663253697383391, + "loss": 6.5122, + "step": 2933 + }, + { + "epoch": 1.0013651877133105, + "grad_norm": 4.434762477874756, + "learning_rate": 0.0006662116040955632, + "loss": 4.9442, + "step": 2934 + }, + { + "epoch": 1.0017064846416381, + "grad_norm": 3.5702288150787354, + "learning_rate": 0.0006660978384527873, + "loss": 5.8109, + "step": 2935 + }, + { + "epoch": 1.0020477815699658, + "grad_norm": 3.399153709411621, + "learning_rate": 0.0006659840728100114, + "loss": 7.0274, + "step": 2936 + }, + { + "epoch": 1.0023890784982936, + "grad_norm": 3.211782455444336, + "learning_rate": 0.0006658703071672355, + "loss": 6.4682, + "step": 2937 + }, + { + "epoch": 1.0027303754266212, + "grad_norm": 3.3782007694244385, + "learning_rate": 0.0006657565415244596, + "loss": 5.5324, + "step": 2938 + }, + { + "epoch": 1.0030716723549489, + "grad_norm": 3.238039493560791, + "learning_rate": 0.0006656427758816838, + "loss": 6.6212, + "step": 2939 + }, + { + "epoch": 1.0034129692832765, + "grad_norm": 3.5137596130371094, + "learning_rate": 0.0006655290102389079, + "loss": 6.15, + "step": 2940 + }, + { + "epoch": 1.0037542662116041, + "grad_norm": 4.969235897064209, + "learning_rate": 0.000665415244596132, + "loss": 6.0722, + "step": 2941 + }, + { + "epoch": 1.0040955631399318, + "grad_norm": 3.3056838512420654, + "learning_rate": 0.0006653014789533561, + "loss": 7.2703, + "step": 2942 + }, + { + "epoch": 1.0044368600682594, + "grad_norm": 3.27779221534729, + "learning_rate": 0.0006651877133105802, + "loss": 6.3094, + "step": 2943 + }, + { + "epoch": 1.004778156996587, + "grad_norm": 3.4663007259368896, + "learning_rate": 0.0006650739476678043, + "loss": 6.4624, + "step": 2944 + }, + { + "epoch": 1.0051194539249146, + "grad_norm": 3.294342279434204, + "learning_rate": 0.0006649601820250285, + "loss": 6.1171, + "step": 2945 + }, + { + "epoch": 1.0054607508532423, + "grad_norm": 3.324336051940918, + "learning_rate": 0.0006648464163822526, + "loss": 6.8682, + "step": 2946 + }, + { + "epoch": 1.00580204778157, + "grad_norm": 3.631251335144043, + "learning_rate": 0.0006647326507394766, + "loss": 5.8187, + "step": 2947 + }, + { + "epoch": 1.0061433447098975, + "grad_norm": 4.85386848449707, + "learning_rate": 0.0006646188850967008, + "loss": 5.7936, + "step": 2948 + }, + { + "epoch": 1.0064846416382252, + "grad_norm": 3.450941324234009, + "learning_rate": 0.0006645051194539249, + "loss": 6.8983, + "step": 2949 + }, + { + "epoch": 1.006825938566553, + "grad_norm": 3.294748544692993, + "learning_rate": 0.000664391353811149, + "loss": 5.9347, + "step": 2950 + }, + { + "epoch": 1.0071672354948806, + "grad_norm": 3.2384605407714844, + "learning_rate": 0.0006642775881683732, + "loss": 6.8953, + "step": 2951 + }, + { + "epoch": 1.0075085324232083, + "grad_norm": 3.294921875, + "learning_rate": 0.0006641638225255973, + "loss": 6.7133, + "step": 2952 + }, + { + "epoch": 1.0078498293515359, + "grad_norm": 3.22776198387146, + "learning_rate": 0.0006640500568828214, + "loss": 6.7304, + "step": 2953 + }, + { + "epoch": 1.0081911262798635, + "grad_norm": 4.246631145477295, + "learning_rate": 0.0006639362912400455, + "loss": 4.3664, + "step": 2954 + }, + { + "epoch": 1.0085324232081911, + "grad_norm": 3.2894020080566406, + "learning_rate": 0.0006638225255972696, + "loss": 6.795, + "step": 2955 + }, + { + "epoch": 1.0088737201365188, + "grad_norm": 3.3010354042053223, + "learning_rate": 0.0006637087599544938, + "loss": 6.5039, + "step": 2956 + }, + { + "epoch": 1.0092150170648464, + "grad_norm": 3.403155565261841, + "learning_rate": 0.0006635949943117179, + "loss": 6.5983, + "step": 2957 + }, + { + "epoch": 1.009556313993174, + "grad_norm": 3.287907123565674, + "learning_rate": 0.000663481228668942, + "loss": 5.8215, + "step": 2958 + }, + { + "epoch": 1.0098976109215017, + "grad_norm": 3.528432607650757, + "learning_rate": 0.0006633674630261661, + "loss": 5.889, + "step": 2959 + }, + { + "epoch": 1.0102389078498293, + "grad_norm": 3.363466739654541, + "learning_rate": 0.0006632536973833902, + "loss": 6.8933, + "step": 2960 + }, + { + "epoch": 1.010580204778157, + "grad_norm": 3.36714243888855, + "learning_rate": 0.0006631399317406143, + "loss": 6.4269, + "step": 2961 + }, + { + "epoch": 1.0109215017064845, + "grad_norm": 3.2045083045959473, + "learning_rate": 0.0006630261660978385, + "loss": 6.6594, + "step": 2962 + }, + { + "epoch": 1.0112627986348124, + "grad_norm": 3.256004571914673, + "learning_rate": 0.0006629124004550626, + "loss": 6.5809, + "step": 2963 + }, + { + "epoch": 1.01160409556314, + "grad_norm": 3.3133018016815186, + "learning_rate": 0.0006627986348122868, + "loss": 6.3003, + "step": 2964 + }, + { + "epoch": 1.0119453924914676, + "grad_norm": 3.8093559741973877, + "learning_rate": 0.0006626848691695109, + "loss": 6.2441, + "step": 2965 + }, + { + "epoch": 1.0122866894197953, + "grad_norm": 3.227069139480591, + "learning_rate": 0.0006625711035267349, + "loss": 6.5614, + "step": 2966 + }, + { + "epoch": 1.012627986348123, + "grad_norm": 3.215060234069824, + "learning_rate": 0.000662457337883959, + "loss": 6.2936, + "step": 2967 + }, + { + "epoch": 1.0129692832764505, + "grad_norm": 5.511234283447266, + "learning_rate": 0.0006623435722411832, + "loss": 6.3446, + "step": 2968 + }, + { + "epoch": 1.0133105802047782, + "grad_norm": 3.6093709468841553, + "learning_rate": 0.0006622298065984073, + "loss": 7.026, + "step": 2969 + }, + { + "epoch": 1.0136518771331058, + "grad_norm": 9.460335731506348, + "learning_rate": 0.0006621160409556314, + "loss": 8.476, + "step": 2970 + }, + { + "epoch": 1.0139931740614334, + "grad_norm": 3.619230031967163, + "learning_rate": 0.0006620022753128555, + "loss": 6.4041, + "step": 2971 + }, + { + "epoch": 1.014334470989761, + "grad_norm": 3.284637928009033, + "learning_rate": 0.0006618885096700796, + "loss": 6.6486, + "step": 2972 + }, + { + "epoch": 1.0146757679180887, + "grad_norm": 3.4304845333099365, + "learning_rate": 0.0006617747440273038, + "loss": 5.978, + "step": 2973 + }, + { + "epoch": 1.0150170648464163, + "grad_norm": 3.1258368492126465, + "learning_rate": 0.0006616609783845279, + "loss": 6.6721, + "step": 2974 + }, + { + "epoch": 1.015358361774744, + "grad_norm": 3.1004960536956787, + "learning_rate": 0.000661547212741752, + "loss": 6.6683, + "step": 2975 + }, + { + "epoch": 1.0156996587030718, + "grad_norm": 3.346923351287842, + "learning_rate": 0.0006614334470989761, + "loss": 6.4257, + "step": 2976 + }, + { + "epoch": 1.0160409556313994, + "grad_norm": 3.8876302242279053, + "learning_rate": 0.0006613196814562002, + "loss": 6.0568, + "step": 2977 + }, + { + "epoch": 1.016382252559727, + "grad_norm": 3.5111100673675537, + "learning_rate": 0.0006612059158134243, + "loss": 6.9239, + "step": 2978 + }, + { + "epoch": 1.0167235494880547, + "grad_norm": 3.2837634086608887, + "learning_rate": 0.0006610921501706485, + "loss": 6.5992, + "step": 2979 + }, + { + "epoch": 1.0170648464163823, + "grad_norm": 3.1712963581085205, + "learning_rate": 0.0006609783845278727, + "loss": 6.7537, + "step": 2980 + }, + { + "epoch": 1.01740614334471, + "grad_norm": 3.12092924118042, + "learning_rate": 0.0006608646188850968, + "loss": 6.6126, + "step": 2981 + }, + { + "epoch": 1.0177474402730375, + "grad_norm": 3.164799928665161, + "learning_rate": 0.0006607508532423209, + "loss": 6.2507, + "step": 2982 + }, + { + "epoch": 1.0180887372013652, + "grad_norm": 3.2638497352600098, + "learning_rate": 0.000660637087599545, + "loss": 6.1015, + "step": 2983 + }, + { + "epoch": 1.0184300341296928, + "grad_norm": 3.560190200805664, + "learning_rate": 0.0006605233219567691, + "loss": 6.388, + "step": 2984 + }, + { + "epoch": 1.0187713310580204, + "grad_norm": 3.2741174697875977, + "learning_rate": 0.0006604095563139933, + "loss": 6.9384, + "step": 2985 + }, + { + "epoch": 1.019112627986348, + "grad_norm": 6.3302788734436035, + "learning_rate": 0.0006602957906712173, + "loss": 5.3906, + "step": 2986 + }, + { + "epoch": 1.0194539249146757, + "grad_norm": 3.5244359970092773, + "learning_rate": 0.0006601820250284414, + "loss": 6.3372, + "step": 2987 + }, + { + "epoch": 1.0197952218430033, + "grad_norm": 3.4077272415161133, + "learning_rate": 0.0006600682593856655, + "loss": 6.0502, + "step": 2988 + }, + { + "epoch": 1.0201365187713312, + "grad_norm": 3.206965446472168, + "learning_rate": 0.0006599544937428896, + "loss": 6.3437, + "step": 2989 + }, + { + "epoch": 1.0204778156996588, + "grad_norm": 3.215886116027832, + "learning_rate": 0.0006598407281001137, + "loss": 6.6096, + "step": 2990 + }, + { + "epoch": 1.0208191126279864, + "grad_norm": 3.386878728866577, + "learning_rate": 0.0006597269624573379, + "loss": 6.5003, + "step": 2991 + }, + { + "epoch": 1.021160409556314, + "grad_norm": 3.419936180114746, + "learning_rate": 0.000659613196814562, + "loss": 6.5968, + "step": 2992 + }, + { + "epoch": 1.0215017064846417, + "grad_norm": 3.197819471359253, + "learning_rate": 0.0006594994311717861, + "loss": 6.9043, + "step": 2993 + }, + { + "epoch": 1.0218430034129693, + "grad_norm": 3.7721807956695557, + "learning_rate": 0.0006593856655290102, + "loss": 5.7042, + "step": 2994 + }, + { + "epoch": 1.022184300341297, + "grad_norm": 3.4664015769958496, + "learning_rate": 0.0006592718998862343, + "loss": 6.1596, + "step": 2995 + }, + { + "epoch": 1.0225255972696246, + "grad_norm": 3.259398937225342, + "learning_rate": 0.0006591581342434585, + "loss": 6.705, + "step": 2996 + }, + { + "epoch": 1.0228668941979522, + "grad_norm": 3.5420732498168945, + "learning_rate": 0.0006590443686006827, + "loss": 5.5468, + "step": 2997 + }, + { + "epoch": 1.0232081911262798, + "grad_norm": 3.332111120223999, + "learning_rate": 0.0006589306029579068, + "loss": 6.9041, + "step": 2998 + }, + { + "epoch": 1.0235494880546074, + "grad_norm": 3.29581618309021, + "learning_rate": 0.0006588168373151309, + "loss": 6.0205, + "step": 2999 + }, + { + "epoch": 1.023890784982935, + "grad_norm": 3.214153289794922, + "learning_rate": 0.000658703071672355, + "loss": 6.4507, + "step": 3000 + }, + { + "epoch": 1.0242320819112627, + "grad_norm": 3.5588912963867188, + "learning_rate": 0.0006585893060295791, + "loss": 6.4715, + "step": 3001 + }, + { + "epoch": 1.0245733788395905, + "grad_norm": 3.5324950218200684, + "learning_rate": 0.0006584755403868033, + "loss": 6.2474, + "step": 3002 + }, + { + "epoch": 1.0249146757679182, + "grad_norm": 3.397630453109741, + "learning_rate": 0.0006583617747440274, + "loss": 6.9605, + "step": 3003 + }, + { + "epoch": 1.0252559726962458, + "grad_norm": 3.475452423095703, + "learning_rate": 0.0006582480091012515, + "loss": 6.6207, + "step": 3004 + }, + { + "epoch": 1.0255972696245734, + "grad_norm": 3.4406540393829346, + "learning_rate": 0.0006581342434584755, + "loss": 6.6004, + "step": 3005 + }, + { + "epoch": 1.025938566552901, + "grad_norm": 5.332218170166016, + "learning_rate": 0.0006580204778156996, + "loss": 5.9674, + "step": 3006 + }, + { + "epoch": 1.0262798634812287, + "grad_norm": 3.4023447036743164, + "learning_rate": 0.0006579067121729237, + "loss": 6.3335, + "step": 3007 + }, + { + "epoch": 1.0266211604095563, + "grad_norm": 3.25628399848938, + "learning_rate": 0.0006577929465301479, + "loss": 6.2624, + "step": 3008 + }, + { + "epoch": 1.026962457337884, + "grad_norm": 3.3259971141815186, + "learning_rate": 0.000657679180887372, + "loss": 6.3255, + "step": 3009 + }, + { + "epoch": 1.0273037542662116, + "grad_norm": 4.986135482788086, + "learning_rate": 0.0006575654152445961, + "loss": 5.4787, + "step": 3010 + }, + { + "epoch": 1.0276450511945392, + "grad_norm": 3.862529754638672, + "learning_rate": 0.0006574516496018202, + "loss": 5.2939, + "step": 3011 + }, + { + "epoch": 1.0279863481228668, + "grad_norm": 3.2690622806549072, + "learning_rate": 0.0006573378839590443, + "loss": 6.5039, + "step": 3012 + }, + { + "epoch": 1.0283276450511944, + "grad_norm": 3.9144980907440186, + "learning_rate": 0.0006572241183162685, + "loss": 5.961, + "step": 3013 + }, + { + "epoch": 1.028668941979522, + "grad_norm": 3.58562970161438, + "learning_rate": 0.0006571103526734927, + "loss": 5.8142, + "step": 3014 + }, + { + "epoch": 1.02901023890785, + "grad_norm": 3.2888505458831787, + "learning_rate": 0.0006569965870307168, + "loss": 6.5148, + "step": 3015 + }, + { + "epoch": 1.0293515358361776, + "grad_norm": 3.2708396911621094, + "learning_rate": 0.0006568828213879409, + "loss": 6.4559, + "step": 3016 + }, + { + "epoch": 1.0296928327645052, + "grad_norm": 3.5158915519714355, + "learning_rate": 0.000656769055745165, + "loss": 6.3544, + "step": 3017 + }, + { + "epoch": 1.0300341296928328, + "grad_norm": 3.263108968734741, + "learning_rate": 0.0006566552901023891, + "loss": 6.443, + "step": 3018 + }, + { + "epoch": 1.0303754266211604, + "grad_norm": 3.9215052127838135, + "learning_rate": 0.0006565415244596133, + "loss": 5.9249, + "step": 3019 + }, + { + "epoch": 1.030716723549488, + "grad_norm": 3.202061653137207, + "learning_rate": 0.0006564277588168374, + "loss": 6.5584, + "step": 3020 + }, + { + "epoch": 1.0310580204778157, + "grad_norm": 14.988799095153809, + "learning_rate": 0.0006563139931740615, + "loss": 6.4804, + "step": 3021 + }, + { + "epoch": 1.0313993174061433, + "grad_norm": 11.352145195007324, + "learning_rate": 0.0006562002275312856, + "loss": 7.3238, + "step": 3022 + }, + { + "epoch": 1.031740614334471, + "grad_norm": 3.537874221801758, + "learning_rate": 0.0006560864618885097, + "loss": 7.0763, + "step": 3023 + }, + { + "epoch": 1.0320819112627986, + "grad_norm": 3.5525970458984375, + "learning_rate": 0.0006559726962457337, + "loss": 6.6374, + "step": 3024 + }, + { + "epoch": 1.0324232081911262, + "grad_norm": 3.3168790340423584, + "learning_rate": 0.0006558589306029579, + "loss": 6.5269, + "step": 3025 + }, + { + "epoch": 1.0327645051194538, + "grad_norm": 5.2062835693359375, + "learning_rate": 0.000655745164960182, + "loss": 6.9295, + "step": 3026 + }, + { + "epoch": 1.0331058020477815, + "grad_norm": 3.208059310913086, + "learning_rate": 0.0006556313993174061, + "loss": 7.0108, + "step": 3027 + }, + { + "epoch": 1.0334470989761093, + "grad_norm": 3.500671148300171, + "learning_rate": 0.0006555176336746302, + "loss": 6.1865, + "step": 3028 + }, + { + "epoch": 1.033788395904437, + "grad_norm": 5.409757137298584, + "learning_rate": 0.0006554038680318543, + "loss": 6.2085, + "step": 3029 + }, + { + "epoch": 1.0341296928327646, + "grad_norm": 3.258284091949463, + "learning_rate": 0.0006552901023890784, + "loss": 6.0585, + "step": 3030 + }, + { + "epoch": 1.0344709897610922, + "grad_norm": 3.4100899696350098, + "learning_rate": 0.0006551763367463027, + "loss": 6.069, + "step": 3031 + }, + { + "epoch": 1.0348122866894198, + "grad_norm": 3.146129846572876, + "learning_rate": 0.0006550625711035268, + "loss": 5.9704, + "step": 3032 + }, + { + "epoch": 1.0351535836177475, + "grad_norm": 3.3420863151550293, + "learning_rate": 0.0006549488054607509, + "loss": 7.3534, + "step": 3033 + }, + { + "epoch": 1.035494880546075, + "grad_norm": 3.4072580337524414, + "learning_rate": 0.000654835039817975, + "loss": 6.6606, + "step": 3034 + }, + { + "epoch": 1.0358361774744027, + "grad_norm": 3.3108816146850586, + "learning_rate": 0.0006547212741751991, + "loss": 6.3785, + "step": 3035 + }, + { + "epoch": 1.0361774744027303, + "grad_norm": 3.2887344360351562, + "learning_rate": 0.0006546075085324233, + "loss": 7.0955, + "step": 3036 + }, + { + "epoch": 1.036518771331058, + "grad_norm": 6.6958842277526855, + "learning_rate": 0.0006544937428896474, + "loss": 5.6834, + "step": 3037 + }, + { + "epoch": 1.0368600682593856, + "grad_norm": 3.3041765689849854, + "learning_rate": 0.0006543799772468715, + "loss": 6.5706, + "step": 3038 + }, + { + "epoch": 1.0372013651877132, + "grad_norm": 3.6999776363372803, + "learning_rate": 0.0006542662116040956, + "loss": 6.5919, + "step": 3039 + }, + { + "epoch": 1.0375426621160408, + "grad_norm": 3.507978916168213, + "learning_rate": 0.0006541524459613197, + "loss": 6.4322, + "step": 3040 + }, + { + "epoch": 1.0378839590443687, + "grad_norm": 3.151499032974243, + "learning_rate": 0.0006540386803185438, + "loss": 6.5041, + "step": 3041 + }, + { + "epoch": 1.0382252559726963, + "grad_norm": 3.339322805404663, + "learning_rate": 0.000653924914675768, + "loss": 6.2779, + "step": 3042 + }, + { + "epoch": 1.038566552901024, + "grad_norm": 4.641025543212891, + "learning_rate": 0.0006538111490329921, + "loss": 5.4317, + "step": 3043 + }, + { + "epoch": 1.0389078498293516, + "grad_norm": 3.3344995975494385, + "learning_rate": 0.0006536973833902161, + "loss": 6.5923, + "step": 3044 + }, + { + "epoch": 1.0392491467576792, + "grad_norm": 3.2587029933929443, + "learning_rate": 0.0006535836177474402, + "loss": 6.4993, + "step": 3045 + }, + { + "epoch": 1.0395904436860068, + "grad_norm": 4.462654113769531, + "learning_rate": 0.0006534698521046643, + "loss": 6.0649, + "step": 3046 + }, + { + "epoch": 1.0399317406143345, + "grad_norm": 3.293430805206299, + "learning_rate": 0.0006533560864618884, + "loss": 5.8758, + "step": 3047 + }, + { + "epoch": 1.040273037542662, + "grad_norm": 3.2306947708129883, + "learning_rate": 0.0006532423208191127, + "loss": 6.3128, + "step": 3048 + }, + { + "epoch": 1.0406143344709897, + "grad_norm": 3.3286349773406982, + "learning_rate": 0.0006531285551763368, + "loss": 6.7096, + "step": 3049 + }, + { + "epoch": 1.0409556313993173, + "grad_norm": 3.1706297397613525, + "learning_rate": 0.0006530147895335609, + "loss": 6.7695, + "step": 3050 + }, + { + "epoch": 1.041296928327645, + "grad_norm": 3.0948450565338135, + "learning_rate": 0.000652901023890785, + "loss": 6.4126, + "step": 3051 + }, + { + "epoch": 1.0416382252559726, + "grad_norm": 3.1173272132873535, + "learning_rate": 0.0006527872582480091, + "loss": 6.961, + "step": 3052 + }, + { + "epoch": 1.0419795221843002, + "grad_norm": 3.5899710655212402, + "learning_rate": 0.0006526734926052332, + "loss": 5.9567, + "step": 3053 + }, + { + "epoch": 1.042320819112628, + "grad_norm": 3.2167882919311523, + "learning_rate": 0.0006525597269624574, + "loss": 6.2195, + "step": 3054 + }, + { + "epoch": 1.0426621160409557, + "grad_norm": 3.2807223796844482, + "learning_rate": 0.0006524459613196815, + "loss": 6.4943, + "step": 3055 + }, + { + "epoch": 1.0430034129692833, + "grad_norm": 3.2400906085968018, + "learning_rate": 0.0006523321956769056, + "loss": 6.4222, + "step": 3056 + }, + { + "epoch": 1.043344709897611, + "grad_norm": 3.420193672180176, + "learning_rate": 0.0006522184300341297, + "loss": 6.3227, + "step": 3057 + }, + { + "epoch": 1.0436860068259386, + "grad_norm": 3.231858015060425, + "learning_rate": 0.0006521046643913538, + "loss": 7.0397, + "step": 3058 + }, + { + "epoch": 1.0440273037542662, + "grad_norm": 3.291337490081787, + "learning_rate": 0.000651990898748578, + "loss": 5.907, + "step": 3059 + }, + { + "epoch": 1.0443686006825939, + "grad_norm": 3.354321241378784, + "learning_rate": 0.0006518771331058021, + "loss": 6.532, + "step": 3060 + }, + { + "epoch": 1.0447098976109215, + "grad_norm": 3.303192377090454, + "learning_rate": 0.0006517633674630262, + "loss": 6.2837, + "step": 3061 + }, + { + "epoch": 1.045051194539249, + "grad_norm": 3.2552897930145264, + "learning_rate": 0.0006516496018202503, + "loss": 6.9191, + "step": 3062 + }, + { + "epoch": 1.0453924914675767, + "grad_norm": 3.7623484134674072, + "learning_rate": 0.0006515358361774743, + "loss": 6.0883, + "step": 3063 + }, + { + "epoch": 1.0457337883959044, + "grad_norm": 3.314925193786621, + "learning_rate": 0.0006514220705346984, + "loss": 6.4738, + "step": 3064 + }, + { + "epoch": 1.046075085324232, + "grad_norm": 3.2855207920074463, + "learning_rate": 0.0006513083048919227, + "loss": 5.971, + "step": 3065 + }, + { + "epoch": 1.0464163822525596, + "grad_norm": 3.406338930130005, + "learning_rate": 0.0006511945392491468, + "loss": 6.8355, + "step": 3066 + }, + { + "epoch": 1.0467576791808875, + "grad_norm": 3.4004054069519043, + "learning_rate": 0.0006510807736063709, + "loss": 6.2127, + "step": 3067 + }, + { + "epoch": 1.047098976109215, + "grad_norm": 3.392861843109131, + "learning_rate": 0.000650967007963595, + "loss": 6.3045, + "step": 3068 + }, + { + "epoch": 1.0474402730375427, + "grad_norm": 3.3509092330932617, + "learning_rate": 0.0006508532423208191, + "loss": 6.2746, + "step": 3069 + }, + { + "epoch": 1.0477815699658704, + "grad_norm": 3.2432587146759033, + "learning_rate": 0.0006507394766780432, + "loss": 6.21, + "step": 3070 + }, + { + "epoch": 1.048122866894198, + "grad_norm": 8.192042350769043, + "learning_rate": 0.0006506257110352674, + "loss": 6.7381, + "step": 3071 + }, + { + "epoch": 1.0484641638225256, + "grad_norm": 5.284655570983887, + "learning_rate": 0.0006505119453924915, + "loss": 6.5578, + "step": 3072 + }, + { + "epoch": 1.0488054607508532, + "grad_norm": 3.650186538696289, + "learning_rate": 0.0006503981797497156, + "loss": 6.2315, + "step": 3073 + }, + { + "epoch": 1.0491467576791809, + "grad_norm": 3.323975086212158, + "learning_rate": 0.0006502844141069397, + "loss": 6.7739, + "step": 3074 + }, + { + "epoch": 1.0494880546075085, + "grad_norm": 3.2510526180267334, + "learning_rate": 0.0006501706484641638, + "loss": 6.4504, + "step": 3075 + }, + { + "epoch": 1.0498293515358361, + "grad_norm": 7.123908996582031, + "learning_rate": 0.000650056882821388, + "loss": 7.3726, + "step": 3076 + }, + { + "epoch": 1.0501706484641637, + "grad_norm": 3.3244235515594482, + "learning_rate": 0.0006499431171786121, + "loss": 5.9337, + "step": 3077 + }, + { + "epoch": 1.0505119453924914, + "grad_norm": 4.019181728363037, + "learning_rate": 0.0006498293515358362, + "loss": 5.851, + "step": 3078 + }, + { + "epoch": 1.050853242320819, + "grad_norm": 3.2663567066192627, + "learning_rate": 0.0006497155858930603, + "loss": 6.808, + "step": 3079 + }, + { + "epoch": 1.0511945392491469, + "grad_norm": 3.2824909687042236, + "learning_rate": 0.0006496018202502844, + "loss": 6.8706, + "step": 3080 + }, + { + "epoch": 1.0515358361774745, + "grad_norm": 3.404869794845581, + "learning_rate": 0.0006494880546075086, + "loss": 6.5542, + "step": 3081 + }, + { + "epoch": 1.051877133105802, + "grad_norm": 3.04349684715271, + "learning_rate": 0.0006493742889647328, + "loss": 6.3294, + "step": 3082 + }, + { + "epoch": 1.0522184300341297, + "grad_norm": 3.5680627822875977, + "learning_rate": 0.0006492605233219568, + "loss": 6.0419, + "step": 3083 + }, + { + "epoch": 1.0525597269624574, + "grad_norm": 3.081714153289795, + "learning_rate": 0.0006491467576791809, + "loss": 6.4956, + "step": 3084 + }, + { + "epoch": 1.052901023890785, + "grad_norm": 3.4470624923706055, + "learning_rate": 0.000649032992036405, + "loss": 6.9409, + "step": 3085 + }, + { + "epoch": 1.0532423208191126, + "grad_norm": 3.1926770210266113, + "learning_rate": 0.0006489192263936291, + "loss": 6.461, + "step": 3086 + }, + { + "epoch": 1.0535836177474402, + "grad_norm": 10.409677505493164, + "learning_rate": 0.0006488054607508532, + "loss": 5.6452, + "step": 3087 + }, + { + "epoch": 1.0539249146757679, + "grad_norm": 3.4879887104034424, + "learning_rate": 0.0006486916951080774, + "loss": 6.7937, + "step": 3088 + }, + { + "epoch": 1.0542662116040955, + "grad_norm": 3.3193447589874268, + "learning_rate": 0.0006485779294653015, + "loss": 6.813, + "step": 3089 + }, + { + "epoch": 1.0546075085324231, + "grad_norm": 3.5224204063415527, + "learning_rate": 0.0006484641638225256, + "loss": 6.5331, + "step": 3090 + }, + { + "epoch": 1.0549488054607508, + "grad_norm": 3.5668528079986572, + "learning_rate": 0.0006483503981797497, + "loss": 5.8162, + "step": 3091 + }, + { + "epoch": 1.0552901023890784, + "grad_norm": 3.3257477283477783, + "learning_rate": 0.0006482366325369738, + "loss": 5.6231, + "step": 3092 + }, + { + "epoch": 1.0556313993174062, + "grad_norm": 3.734152317047119, + "learning_rate": 0.0006481228668941979, + "loss": 6.3863, + "step": 3093 + }, + { + "epoch": 1.0559726962457339, + "grad_norm": 3.2832605838775635, + "learning_rate": 0.0006480091012514221, + "loss": 6.9209, + "step": 3094 + }, + { + "epoch": 1.0563139931740615, + "grad_norm": 3.4488635063171387, + "learning_rate": 0.0006478953356086462, + "loss": 6.6284, + "step": 3095 + }, + { + "epoch": 1.0566552901023891, + "grad_norm": 3.6133999824523926, + "learning_rate": 0.0006477815699658703, + "loss": 6.6132, + "step": 3096 + }, + { + "epoch": 1.0569965870307167, + "grad_norm": 3.2564053535461426, + "learning_rate": 0.0006476678043230944, + "loss": 6.4362, + "step": 3097 + }, + { + "epoch": 1.0573378839590444, + "grad_norm": 3.2632007598876953, + "learning_rate": 0.0006475540386803186, + "loss": 6.6581, + "step": 3098 + }, + { + "epoch": 1.057679180887372, + "grad_norm": 3.2787528038024902, + "learning_rate": 0.0006474402730375428, + "loss": 6.1405, + "step": 3099 + }, + { + "epoch": 1.0580204778156996, + "grad_norm": 3.1381187438964844, + "learning_rate": 0.0006473265073947669, + "loss": 6.5018, + "step": 3100 + }, + { + "epoch": 1.0583617747440273, + "grad_norm": 6.568216800689697, + "learning_rate": 0.000647212741751991, + "loss": 5.6483, + "step": 3101 + }, + { + "epoch": 1.058703071672355, + "grad_norm": 3.281235694885254, + "learning_rate": 0.000647098976109215, + "loss": 6.5081, + "step": 3102 + }, + { + "epoch": 1.0590443686006825, + "grad_norm": 3.3020272254943848, + "learning_rate": 0.0006469852104664391, + "loss": 6.0661, + "step": 3103 + }, + { + "epoch": 1.0593856655290101, + "grad_norm": 3.2067642211914062, + "learning_rate": 0.0006468714448236632, + "loss": 6.4818, + "step": 3104 + }, + { + "epoch": 1.0597269624573378, + "grad_norm": 8.392196655273438, + "learning_rate": 0.0006467576791808874, + "loss": 6.1493, + "step": 3105 + }, + { + "epoch": 1.0600682593856656, + "grad_norm": 4.222563743591309, + "learning_rate": 0.0006466439135381115, + "loss": 6.2424, + "step": 3106 + }, + { + "epoch": 1.0604095563139933, + "grad_norm": 3.43169903755188, + "learning_rate": 0.0006465301478953356, + "loss": 6.2598, + "step": 3107 + }, + { + "epoch": 1.0607508532423209, + "grad_norm": 3.4820284843444824, + "learning_rate": 0.0006464163822525597, + "loss": 6.6193, + "step": 3108 + }, + { + "epoch": 1.0610921501706485, + "grad_norm": 3.227473497390747, + "learning_rate": 0.0006463026166097838, + "loss": 6.5877, + "step": 3109 + }, + { + "epoch": 1.0614334470989761, + "grad_norm": 3.3051085472106934, + "learning_rate": 0.0006461888509670079, + "loss": 6.1434, + "step": 3110 + }, + { + "epoch": 1.0617747440273038, + "grad_norm": 4.006912708282471, + "learning_rate": 0.0006460750853242321, + "loss": 6.5416, + "step": 3111 + }, + { + "epoch": 1.0621160409556314, + "grad_norm": 3.184049129486084, + "learning_rate": 0.0006459613196814562, + "loss": 6.7029, + "step": 3112 + }, + { + "epoch": 1.062457337883959, + "grad_norm": 3.2582833766937256, + "learning_rate": 0.0006458475540386803, + "loss": 6.4838, + "step": 3113 + }, + { + "epoch": 1.0627986348122866, + "grad_norm": 3.350537061691284, + "learning_rate": 0.0006457337883959044, + "loss": 6.6411, + "step": 3114 + }, + { + "epoch": 1.0631399317406143, + "grad_norm": 4.094681739807129, + "learning_rate": 0.0006456200227531286, + "loss": 5.957, + "step": 3115 + }, + { + "epoch": 1.063481228668942, + "grad_norm": 3.8731682300567627, + "learning_rate": 0.0006455062571103528, + "loss": 6.3783, + "step": 3116 + }, + { + "epoch": 1.0638225255972695, + "grad_norm": 3.384065866470337, + "learning_rate": 0.0006453924914675769, + "loss": 6.6837, + "step": 3117 + }, + { + "epoch": 1.0641638225255972, + "grad_norm": 3.379866361618042, + "learning_rate": 0.000645278725824801, + "loss": 6.4046, + "step": 3118 + }, + { + "epoch": 1.064505119453925, + "grad_norm": 3.2628586292266846, + "learning_rate": 0.0006451649601820251, + "loss": 6.209, + "step": 3119 + }, + { + "epoch": 1.0648464163822526, + "grad_norm": 3.4711813926696777, + "learning_rate": 0.0006450511945392492, + "loss": 6.1973, + "step": 3120 + }, + { + "epoch": 1.0651877133105803, + "grad_norm": 3.7682406902313232, + "learning_rate": 0.0006449374288964733, + "loss": 6.1473, + "step": 3121 + }, + { + "epoch": 1.065529010238908, + "grad_norm": 8.02403450012207, + "learning_rate": 0.0006448236632536974, + "loss": 5.49, + "step": 3122 + }, + { + "epoch": 1.0658703071672355, + "grad_norm": 3.402454376220703, + "learning_rate": 0.0006447098976109215, + "loss": 6.6287, + "step": 3123 + }, + { + "epoch": 1.0662116040955631, + "grad_norm": 4.036313056945801, + "learning_rate": 0.0006445961319681456, + "loss": 5.7235, + "step": 3124 + }, + { + "epoch": 1.0665529010238908, + "grad_norm": 7.888774394989014, + "learning_rate": 0.0006444823663253697, + "loss": 4.691, + "step": 3125 + }, + { + "epoch": 1.0668941979522184, + "grad_norm": 3.3642637729644775, + "learning_rate": 0.0006443686006825938, + "loss": 6.7421, + "step": 3126 + }, + { + "epoch": 1.067235494880546, + "grad_norm": 3.545217990875244, + "learning_rate": 0.0006442548350398179, + "loss": 5.7046, + "step": 3127 + }, + { + "epoch": 1.0675767918088737, + "grad_norm": 3.4235167503356934, + "learning_rate": 0.0006441410693970421, + "loss": 6.7823, + "step": 3128 + }, + { + "epoch": 1.0679180887372013, + "grad_norm": 3.252849578857422, + "learning_rate": 0.0006440273037542662, + "loss": 7.001, + "step": 3129 + }, + { + "epoch": 1.068259385665529, + "grad_norm": 5.600972652435303, + "learning_rate": 0.0006439135381114903, + "loss": 6.5534, + "step": 3130 + }, + { + "epoch": 1.0686006825938565, + "grad_norm": 3.1004841327667236, + "learning_rate": 0.0006437997724687144, + "loss": 6.7081, + "step": 3131 + }, + { + "epoch": 1.0689419795221844, + "grad_norm": 8.813851356506348, + "learning_rate": 0.0006436860068259386, + "loss": 4.8907, + "step": 3132 + }, + { + "epoch": 1.069283276450512, + "grad_norm": 3.3041679859161377, + "learning_rate": 0.0006435722411831627, + "loss": 6.594, + "step": 3133 + }, + { + "epoch": 1.0696245733788396, + "grad_norm": 3.500278949737549, + "learning_rate": 0.0006434584755403869, + "loss": 6.3211, + "step": 3134 + }, + { + "epoch": 1.0699658703071673, + "grad_norm": 3.4029455184936523, + "learning_rate": 0.000643344709897611, + "loss": 6.3552, + "step": 3135 + }, + { + "epoch": 1.070307167235495, + "grad_norm": 3.182002305984497, + "learning_rate": 0.0006432309442548351, + "loss": 6.8919, + "step": 3136 + }, + { + "epoch": 1.0706484641638225, + "grad_norm": 3.758815050125122, + "learning_rate": 0.0006431171786120592, + "loss": 5.6043, + "step": 3137 + }, + { + "epoch": 1.0709897610921502, + "grad_norm": 3.294699192047119, + "learning_rate": 0.0006430034129692833, + "loss": 6.3708, + "step": 3138 + }, + { + "epoch": 1.0713310580204778, + "grad_norm": 3.3556272983551025, + "learning_rate": 0.0006428896473265075, + "loss": 6.6939, + "step": 3139 + }, + { + "epoch": 1.0716723549488054, + "grad_norm": 3.1853625774383545, + "learning_rate": 0.0006427758816837316, + "loss": 6.1872, + "step": 3140 + }, + { + "epoch": 1.072013651877133, + "grad_norm": 3.300252676010132, + "learning_rate": 0.0006426621160409556, + "loss": 6.771, + "step": 3141 + }, + { + "epoch": 1.0723549488054607, + "grad_norm": 3.111130714416504, + "learning_rate": 0.0006425483503981797, + "loss": 6.3194, + "step": 3142 + }, + { + "epoch": 1.0726962457337883, + "grad_norm": 3.808727741241455, + "learning_rate": 0.0006424345847554038, + "loss": 5.9753, + "step": 3143 + }, + { + "epoch": 1.073037542662116, + "grad_norm": 3.226867437362671, + "learning_rate": 0.0006423208191126279, + "loss": 6.5513, + "step": 3144 + }, + { + "epoch": 1.0733788395904438, + "grad_norm": 3.2645046710968018, + "learning_rate": 0.0006422070534698521, + "loss": 6.7096, + "step": 3145 + }, + { + "epoch": 1.0737201365187714, + "grad_norm": 3.1545019149780273, + "learning_rate": 0.0006420932878270762, + "loss": 6.5052, + "step": 3146 + }, + { + "epoch": 1.074061433447099, + "grad_norm": 3.2468883991241455, + "learning_rate": 0.0006419795221843003, + "loss": 6.8057, + "step": 3147 + }, + { + "epoch": 1.0744027303754267, + "grad_norm": 5.185894966125488, + "learning_rate": 0.0006418657565415244, + "loss": 4.2064, + "step": 3148 + }, + { + "epoch": 1.0747440273037543, + "grad_norm": 3.218177318572998, + "learning_rate": 0.0006417519908987486, + "loss": 6.056, + "step": 3149 + }, + { + "epoch": 1.075085324232082, + "grad_norm": 3.383833646774292, + "learning_rate": 0.0006416382252559727, + "loss": 6.7475, + "step": 3150 + }, + { + "epoch": 1.0754266211604095, + "grad_norm": 3.697319507598877, + "learning_rate": 0.0006415244596131969, + "loss": 6.663, + "step": 3151 + }, + { + "epoch": 1.0757679180887372, + "grad_norm": 3.122490644454956, + "learning_rate": 0.000641410693970421, + "loss": 6.6991, + "step": 3152 + }, + { + "epoch": 1.0761092150170648, + "grad_norm": 3.148451089859009, + "learning_rate": 0.0006412969283276451, + "loss": 6.6486, + "step": 3153 + }, + { + "epoch": 1.0764505119453924, + "grad_norm": 3.321945905685425, + "learning_rate": 0.0006411831626848692, + "loss": 6.1962, + "step": 3154 + }, + { + "epoch": 1.07679180887372, + "grad_norm": 3.590642213821411, + "learning_rate": 0.0006410693970420933, + "loss": 5.5254, + "step": 3155 + }, + { + "epoch": 1.0771331058020477, + "grad_norm": 3.300290584564209, + "learning_rate": 0.0006409556313993174, + "loss": 5.7448, + "step": 3156 + }, + { + "epoch": 1.0774744027303753, + "grad_norm": 3.4239137172698975, + "learning_rate": 0.0006408418657565416, + "loss": 6.2855, + "step": 3157 + }, + { + "epoch": 1.0778156996587032, + "grad_norm": 3.3473432064056396, + "learning_rate": 0.0006407281001137657, + "loss": 6.6531, + "step": 3158 + }, + { + "epoch": 1.0781569965870308, + "grad_norm": 3.0467193126678467, + "learning_rate": 0.0006406143344709898, + "loss": 6.3434, + "step": 3159 + }, + { + "epoch": 1.0784982935153584, + "grad_norm": 3.6349399089813232, + "learning_rate": 0.0006405005688282139, + "loss": 6.3252, + "step": 3160 + }, + { + "epoch": 1.078839590443686, + "grad_norm": 3.4417903423309326, + "learning_rate": 0.0006403868031854379, + "loss": 6.9098, + "step": 3161 + }, + { + "epoch": 1.0791808873720137, + "grad_norm": 3.258446455001831, + "learning_rate": 0.0006402730375426621, + "loss": 6.3267, + "step": 3162 + }, + { + "epoch": 1.0795221843003413, + "grad_norm": 3.3519585132598877, + "learning_rate": 0.0006401592718998862, + "loss": 5.89, + "step": 3163 + }, + { + "epoch": 1.079863481228669, + "grad_norm": 3.009504795074463, + "learning_rate": 0.0006400455062571103, + "loss": 6.2854, + "step": 3164 + }, + { + "epoch": 1.0802047781569966, + "grad_norm": 3.346721649169922, + "learning_rate": 0.0006399317406143345, + "loss": 6.9105, + "step": 3165 + }, + { + "epoch": 1.0805460750853242, + "grad_norm": 3.1614081859588623, + "learning_rate": 0.0006398179749715586, + "loss": 6.4212, + "step": 3166 + }, + { + "epoch": 1.0808873720136518, + "grad_norm": 3.280531406402588, + "learning_rate": 0.0006397042093287827, + "loss": 6.22, + "step": 3167 + }, + { + "epoch": 1.0812286689419794, + "grad_norm": 3.3301196098327637, + "learning_rate": 0.0006395904436860069, + "loss": 6.7717, + "step": 3168 + }, + { + "epoch": 1.081569965870307, + "grad_norm": 3.3172807693481445, + "learning_rate": 0.000639476678043231, + "loss": 6.6153, + "step": 3169 + }, + { + "epoch": 1.0819112627986347, + "grad_norm": 3.3986637592315674, + "learning_rate": 0.0006393629124004551, + "loss": 6.0319, + "step": 3170 + }, + { + "epoch": 1.0822525597269625, + "grad_norm": 3.199392318725586, + "learning_rate": 0.0006392491467576792, + "loss": 6.2352, + "step": 3171 + }, + { + "epoch": 1.0825938566552902, + "grad_norm": 5.347891330718994, + "learning_rate": 0.0006391353811149033, + "loss": 5.6896, + "step": 3172 + }, + { + "epoch": 1.0829351535836178, + "grad_norm": 3.7180368900299072, + "learning_rate": 0.0006390216154721274, + "loss": 6.2303, + "step": 3173 + }, + { + "epoch": 1.0832764505119454, + "grad_norm": 3.3773393630981445, + "learning_rate": 0.0006389078498293516, + "loss": 6.0189, + "step": 3174 + }, + { + "epoch": 1.083617747440273, + "grad_norm": 6.927037715911865, + "learning_rate": 0.0006387940841865757, + "loss": 4.5765, + "step": 3175 + }, + { + "epoch": 1.0839590443686007, + "grad_norm": 4.274540901184082, + "learning_rate": 0.0006386803185437998, + "loss": 5.871, + "step": 3176 + }, + { + "epoch": 1.0843003412969283, + "grad_norm": 3.52830171585083, + "learning_rate": 0.0006385665529010239, + "loss": 6.474, + "step": 3177 + }, + { + "epoch": 1.084641638225256, + "grad_norm": 3.9919192790985107, + "learning_rate": 0.000638452787258248, + "loss": 6.2379, + "step": 3178 + }, + { + "epoch": 1.0849829351535836, + "grad_norm": 3.2457633018493652, + "learning_rate": 0.0006383390216154722, + "loss": 7.1027, + "step": 3179 + }, + { + "epoch": 1.0853242320819112, + "grad_norm": 3.2085020542144775, + "learning_rate": 0.0006382252559726962, + "loss": 6.7974, + "step": 3180 + }, + { + "epoch": 1.0856655290102388, + "grad_norm": 3.2093658447265625, + "learning_rate": 0.0006381114903299203, + "loss": 6.5593, + "step": 3181 + }, + { + "epoch": 1.0860068259385665, + "grad_norm": 4.833531856536865, + "learning_rate": 0.0006379977246871445, + "loss": 6.0521, + "step": 3182 + }, + { + "epoch": 1.086348122866894, + "grad_norm": 3.516695499420166, + "learning_rate": 0.0006378839590443686, + "loss": 6.3321, + "step": 3183 + }, + { + "epoch": 1.086689419795222, + "grad_norm": 3.375440835952759, + "learning_rate": 0.0006377701934015927, + "loss": 6.4715, + "step": 3184 + }, + { + "epoch": 1.0870307167235496, + "grad_norm": 3.308617115020752, + "learning_rate": 0.0006376564277588169, + "loss": 6.5871, + "step": 3185 + }, + { + "epoch": 1.0873720136518772, + "grad_norm": 3.2866013050079346, + "learning_rate": 0.000637542662116041, + "loss": 6.5732, + "step": 3186 + }, + { + "epoch": 1.0877133105802048, + "grad_norm": 3.3857619762420654, + "learning_rate": 0.0006374288964732651, + "loss": 6.3731, + "step": 3187 + }, + { + "epoch": 1.0880546075085324, + "grad_norm": 3.51200795173645, + "learning_rate": 0.0006373151308304892, + "loss": 5.6781, + "step": 3188 + }, + { + "epoch": 1.08839590443686, + "grad_norm": 3.297863006591797, + "learning_rate": 0.0006372013651877133, + "loss": 6.4584, + "step": 3189 + }, + { + "epoch": 1.0887372013651877, + "grad_norm": 3.261662244796753, + "learning_rate": 0.0006370875995449374, + "loss": 6.2231, + "step": 3190 + }, + { + "epoch": 1.0890784982935153, + "grad_norm": 3.8961715698242188, + "learning_rate": 0.0006369738339021616, + "loss": 6.607, + "step": 3191 + }, + { + "epoch": 1.089419795221843, + "grad_norm": 3.399348020553589, + "learning_rate": 0.0006368600682593857, + "loss": 4.4051, + "step": 3192 + }, + { + "epoch": 1.0897610921501706, + "grad_norm": 4.216414928436279, + "learning_rate": 0.0006367463026166098, + "loss": 4.1361, + "step": 3193 + }, + { + "epoch": 1.0901023890784982, + "grad_norm": 3.367846727371216, + "learning_rate": 0.0006366325369738339, + "loss": 6.8591, + "step": 3194 + }, + { + "epoch": 1.0904436860068258, + "grad_norm": 3.9558677673339844, + "learning_rate": 0.000636518771331058, + "loss": 6.0432, + "step": 3195 + }, + { + "epoch": 1.0907849829351535, + "grad_norm": 3.5588676929473877, + "learning_rate": 0.0006364050056882821, + "loss": 6.5362, + "step": 3196 + }, + { + "epoch": 1.0911262798634813, + "grad_norm": 4.590184688568115, + "learning_rate": 0.0006362912400455064, + "loss": 6.3793, + "step": 3197 + }, + { + "epoch": 1.091467576791809, + "grad_norm": 3.1657865047454834, + "learning_rate": 0.0006361774744027305, + "loss": 6.3938, + "step": 3198 + }, + { + "epoch": 1.0918088737201366, + "grad_norm": 3.1333377361297607, + "learning_rate": 0.0006360637087599546, + "loss": 6.9106, + "step": 3199 + }, + { + "epoch": 1.0921501706484642, + "grad_norm": 3.446474313735962, + "learning_rate": 0.0006359499431171786, + "loss": 6.3406, + "step": 3200 + }, + { + "epoch": 1.0924914675767918, + "grad_norm": 4.603214263916016, + "learning_rate": 0.0006358361774744027, + "loss": 5.921, + "step": 3201 + }, + { + "epoch": 1.0928327645051195, + "grad_norm": 3.366414785385132, + "learning_rate": 0.0006357224118316269, + "loss": 6.5776, + "step": 3202 + }, + { + "epoch": 1.093174061433447, + "grad_norm": 3.351638078689575, + "learning_rate": 0.000635608646188851, + "loss": 6.0564, + "step": 3203 + }, + { + "epoch": 1.0935153583617747, + "grad_norm": 3.204946517944336, + "learning_rate": 0.0006354948805460751, + "loss": 6.5415, + "step": 3204 + }, + { + "epoch": 1.0938566552901023, + "grad_norm": 3.1665844917297363, + "learning_rate": 0.0006353811149032992, + "loss": 6.6033, + "step": 3205 + }, + { + "epoch": 1.09419795221843, + "grad_norm": 3.453909158706665, + "learning_rate": 0.0006352673492605233, + "loss": 6.2195, + "step": 3206 + }, + { + "epoch": 1.0945392491467576, + "grad_norm": 3.191018581390381, + "learning_rate": 0.0006351535836177474, + "loss": 6.3949, + "step": 3207 + }, + { + "epoch": 1.0948805460750852, + "grad_norm": 3.1959285736083984, + "learning_rate": 0.0006350398179749716, + "loss": 6.2866, + "step": 3208 + }, + { + "epoch": 1.0952218430034129, + "grad_norm": 3.368140459060669, + "learning_rate": 0.0006349260523321957, + "loss": 6.4439, + "step": 3209 + }, + { + "epoch": 1.0955631399317407, + "grad_norm": 3.4131994247436523, + "learning_rate": 0.0006348122866894198, + "loss": 6.5307, + "step": 3210 + }, + { + "epoch": 1.0959044368600683, + "grad_norm": 3.310483455657959, + "learning_rate": 0.0006346985210466439, + "loss": 6.8359, + "step": 3211 + }, + { + "epoch": 1.096245733788396, + "grad_norm": 3.1908204555511475, + "learning_rate": 0.000634584755403868, + "loss": 6.6964, + "step": 3212 + }, + { + "epoch": 1.0965870307167236, + "grad_norm": 3.5243823528289795, + "learning_rate": 0.0006344709897610921, + "loss": 6.4033, + "step": 3213 + }, + { + "epoch": 1.0969283276450512, + "grad_norm": 3.324995756149292, + "learning_rate": 0.0006343572241183164, + "loss": 6.8275, + "step": 3214 + }, + { + "epoch": 1.0972696245733788, + "grad_norm": 3.591324806213379, + "learning_rate": 0.0006342434584755405, + "loss": 6.5428, + "step": 3215 + }, + { + "epoch": 1.0976109215017065, + "grad_norm": 3.3508944511413574, + "learning_rate": 0.0006341296928327646, + "loss": 6.4207, + "step": 3216 + }, + { + "epoch": 1.097952218430034, + "grad_norm": 3.258124589920044, + "learning_rate": 0.0006340159271899887, + "loss": 6.369, + "step": 3217 + }, + { + "epoch": 1.0982935153583617, + "grad_norm": 3.1361072063446045, + "learning_rate": 0.0006339021615472128, + "loss": 6.3561, + "step": 3218 + }, + { + "epoch": 1.0986348122866894, + "grad_norm": 3.4941697120666504, + "learning_rate": 0.0006337883959044368, + "loss": 5.6931, + "step": 3219 + }, + { + "epoch": 1.098976109215017, + "grad_norm": 3.129948854446411, + "learning_rate": 0.000633674630261661, + "loss": 6.4307, + "step": 3220 + }, + { + "epoch": 1.0993174061433446, + "grad_norm": 6.091549396514893, + "learning_rate": 0.0006335608646188851, + "loss": 5.7017, + "step": 3221 + }, + { + "epoch": 1.0996587030716722, + "grad_norm": 3.1629409790039062, + "learning_rate": 0.0006334470989761092, + "loss": 6.0674, + "step": 3222 + }, + { + "epoch": 1.1, + "grad_norm": 6.926721572875977, + "learning_rate": 0.0006333333333333333, + "loss": 5.3602, + "step": 3223 + }, + { + "epoch": 1.1003412969283277, + "grad_norm": 4.425708770751953, + "learning_rate": 0.0006332195676905574, + "loss": 6.9397, + "step": 3224 + }, + { + "epoch": 1.1006825938566553, + "grad_norm": 3.5748658180236816, + "learning_rate": 0.0006331058020477816, + "loss": 7.2775, + "step": 3225 + }, + { + "epoch": 1.101023890784983, + "grad_norm": 3.3313708305358887, + "learning_rate": 0.0006329920364050057, + "loss": 6.7344, + "step": 3226 + }, + { + "epoch": 1.1013651877133106, + "grad_norm": 4.757328987121582, + "learning_rate": 0.0006328782707622298, + "loss": 5.769, + "step": 3227 + }, + { + "epoch": 1.1017064846416382, + "grad_norm": 3.1376912593841553, + "learning_rate": 0.0006327645051194539, + "loss": 6.7527, + "step": 3228 + }, + { + "epoch": 1.1020477815699659, + "grad_norm": 3.682274341583252, + "learning_rate": 0.000632650739476678, + "loss": 6.7461, + "step": 3229 + }, + { + "epoch": 1.1023890784982935, + "grad_norm": 3.245393991470337, + "learning_rate": 0.0006325369738339021, + "loss": 6.5636, + "step": 3230 + }, + { + "epoch": 1.1027303754266211, + "grad_norm": 3.133185625076294, + "learning_rate": 0.0006324232081911264, + "loss": 6.2898, + "step": 3231 + }, + { + "epoch": 1.1030716723549487, + "grad_norm": 3.1336660385131836, + "learning_rate": 0.0006323094425483505, + "loss": 6.4333, + "step": 3232 + }, + { + "epoch": 1.1034129692832764, + "grad_norm": 3.263469934463501, + "learning_rate": 0.0006321956769055746, + "loss": 6.4195, + "step": 3233 + }, + { + "epoch": 1.103754266211604, + "grad_norm": 3.146401882171631, + "learning_rate": 0.0006320819112627987, + "loss": 6.7663, + "step": 3234 + }, + { + "epoch": 1.1040955631399316, + "grad_norm": 3.2967708110809326, + "learning_rate": 0.0006319681456200228, + "loss": 6.8334, + "step": 3235 + }, + { + "epoch": 1.1044368600682595, + "grad_norm": 3.167576551437378, + "learning_rate": 0.0006318543799772469, + "loss": 6.619, + "step": 3236 + }, + { + "epoch": 1.104778156996587, + "grad_norm": 6.352286338806152, + "learning_rate": 0.0006317406143344711, + "loss": 6.0053, + "step": 3237 + }, + { + "epoch": 1.1051194539249147, + "grad_norm": 3.420991897583008, + "learning_rate": 0.0006316268486916951, + "loss": 6.2279, + "step": 3238 + }, + { + "epoch": 1.1054607508532424, + "grad_norm": 3.3597917556762695, + "learning_rate": 0.0006315130830489192, + "loss": 6.4583, + "step": 3239 + }, + { + "epoch": 1.10580204778157, + "grad_norm": 3.377300262451172, + "learning_rate": 0.0006313993174061433, + "loss": 6.5939, + "step": 3240 + }, + { + "epoch": 1.1061433447098976, + "grad_norm": 3.4640071392059326, + "learning_rate": 0.0006312855517633674, + "loss": 6.1728, + "step": 3241 + }, + { + "epoch": 1.1064846416382252, + "grad_norm": 3.410388708114624, + "learning_rate": 0.0006311717861205916, + "loss": 6.1942, + "step": 3242 + }, + { + "epoch": 1.1068259385665529, + "grad_norm": 3.2869067192077637, + "learning_rate": 0.0006310580204778157, + "loss": 6.6288, + "step": 3243 + }, + { + "epoch": 1.1071672354948805, + "grad_norm": 3.073132038116455, + "learning_rate": 0.0006309442548350398, + "loss": 6.4948, + "step": 3244 + }, + { + "epoch": 1.1075085324232081, + "grad_norm": 3.332573175430298, + "learning_rate": 0.0006308304891922639, + "loss": 6.2622, + "step": 3245 + }, + { + "epoch": 1.1078498293515358, + "grad_norm": 3.1241261959075928, + "learning_rate": 0.000630716723549488, + "loss": 6.4462, + "step": 3246 + }, + { + "epoch": 1.1081911262798634, + "grad_norm": 3.215703248977661, + "learning_rate": 0.0006306029579067121, + "loss": 6.9699, + "step": 3247 + }, + { + "epoch": 1.108532423208191, + "grad_norm": 3.1730904579162598, + "learning_rate": 0.0006304891922639364, + "loss": 6.5032, + "step": 3248 + }, + { + "epoch": 1.1088737201365189, + "grad_norm": 3.286179542541504, + "learning_rate": 0.0006303754266211605, + "loss": 6.4786, + "step": 3249 + }, + { + "epoch": 1.1092150170648465, + "grad_norm": 3.2079620361328125, + "learning_rate": 0.0006302616609783846, + "loss": 5.9624, + "step": 3250 + }, + { + "epoch": 1.1095563139931741, + "grad_norm": 3.4078316688537598, + "learning_rate": 0.0006301478953356087, + "loss": 5.7281, + "step": 3251 + }, + { + "epoch": 1.1098976109215017, + "grad_norm": 3.154865264892578, + "learning_rate": 0.0006300341296928328, + "loss": 6.2634, + "step": 3252 + }, + { + "epoch": 1.1102389078498294, + "grad_norm": 4.15884256362915, + "learning_rate": 0.0006299203640500569, + "loss": 5.9743, + "step": 3253 + }, + { + "epoch": 1.110580204778157, + "grad_norm": 3.244605302810669, + "learning_rate": 0.0006298065984072811, + "loss": 6.3037, + "step": 3254 + }, + { + "epoch": 1.1109215017064846, + "grad_norm": 3.354834794998169, + "learning_rate": 0.0006296928327645052, + "loss": 6.2177, + "step": 3255 + }, + { + "epoch": 1.1112627986348123, + "grad_norm": 3.270688056945801, + "learning_rate": 0.0006295790671217293, + "loss": 6.2083, + "step": 3256 + }, + { + "epoch": 1.1116040955631399, + "grad_norm": 3.2896766662597656, + "learning_rate": 0.0006294653014789534, + "loss": 6.3809, + "step": 3257 + }, + { + "epoch": 1.1119453924914675, + "grad_norm": 3.2658944129943848, + "learning_rate": 0.0006293515358361774, + "loss": 5.7977, + "step": 3258 + }, + { + "epoch": 1.1122866894197951, + "grad_norm": 3.716240644454956, + "learning_rate": 0.0006292377701934015, + "loss": 6.0917, + "step": 3259 + }, + { + "epoch": 1.1126279863481228, + "grad_norm": 3.223571538925171, + "learning_rate": 0.0006291240045506257, + "loss": 4.5392, + "step": 3260 + }, + { + "epoch": 1.1129692832764504, + "grad_norm": 3.6493613719940186, + "learning_rate": 0.0006290102389078498, + "loss": 6.3741, + "step": 3261 + }, + { + "epoch": 1.1133105802047782, + "grad_norm": 3.8748128414154053, + "learning_rate": 0.0006288964732650739, + "loss": 5.9105, + "step": 3262 + }, + { + "epoch": 1.1136518771331059, + "grad_norm": 3.320779323577881, + "learning_rate": 0.000628782707622298, + "loss": 6.4, + "step": 3263 + }, + { + "epoch": 1.1139931740614335, + "grad_norm": 3.4547297954559326, + "learning_rate": 0.0006286689419795221, + "loss": 6.1842, + "step": 3264 + }, + { + "epoch": 1.1143344709897611, + "grad_norm": 3.3176543712615967, + "learning_rate": 0.0006285551763367464, + "loss": 6.8925, + "step": 3265 + }, + { + "epoch": 1.1146757679180888, + "grad_norm": 3.095383882522583, + "learning_rate": 0.0006284414106939705, + "loss": 6.7177, + "step": 3266 + }, + { + "epoch": 1.1150170648464164, + "grad_norm": 3.1322736740112305, + "learning_rate": 0.0006283276450511946, + "loss": 7.11, + "step": 3267 + }, + { + "epoch": 1.115358361774744, + "grad_norm": 3.179048538208008, + "learning_rate": 0.0006282138794084187, + "loss": 6.6664, + "step": 3268 + }, + { + "epoch": 1.1156996587030716, + "grad_norm": 3.2446703910827637, + "learning_rate": 0.0006281001137656428, + "loss": 6.2555, + "step": 3269 + }, + { + "epoch": 1.1160409556313993, + "grad_norm": 3.4410719871520996, + "learning_rate": 0.0006279863481228669, + "loss": 6.3639, + "step": 3270 + }, + { + "epoch": 1.116382252559727, + "grad_norm": 3.151151657104492, + "learning_rate": 0.0006278725824800911, + "loss": 6.548, + "step": 3271 + }, + { + "epoch": 1.1167235494880545, + "grad_norm": 3.394761085510254, + "learning_rate": 0.0006277588168373152, + "loss": 6.7941, + "step": 3272 + }, + { + "epoch": 1.1170648464163822, + "grad_norm": 3.4084866046905518, + "learning_rate": 0.0006276450511945393, + "loss": 6.0128, + "step": 3273 + }, + { + "epoch": 1.11740614334471, + "grad_norm": 3.2266435623168945, + "learning_rate": 0.0006275312855517634, + "loss": 6.3047, + "step": 3274 + }, + { + "epoch": 1.1177474402730376, + "grad_norm": 3.2533907890319824, + "learning_rate": 0.0006274175199089875, + "loss": 6.4112, + "step": 3275 + }, + { + "epoch": 1.1180887372013653, + "grad_norm": 3.1983823776245117, + "learning_rate": 0.0006273037542662116, + "loss": 6.8299, + "step": 3276 + }, + { + "epoch": 1.1184300341296929, + "grad_norm": 3.781130075454712, + "learning_rate": 0.0006271899886234357, + "loss": 6.1287, + "step": 3277 + }, + { + "epoch": 1.1187713310580205, + "grad_norm": 3.3088560104370117, + "learning_rate": 0.0006270762229806598, + "loss": 6.1003, + "step": 3278 + }, + { + "epoch": 1.1191126279863481, + "grad_norm": 3.457223653793335, + "learning_rate": 0.0006269624573378839, + "loss": 6.1749, + "step": 3279 + }, + { + "epoch": 1.1194539249146758, + "grad_norm": 3.4108994007110596, + "learning_rate": 0.000626848691695108, + "loss": 6.4882, + "step": 3280 + }, + { + "epoch": 1.1197952218430034, + "grad_norm": 3.191033363342285, + "learning_rate": 0.0006267349260523321, + "loss": 6.386, + "step": 3281 + }, + { + "epoch": 1.120136518771331, + "grad_norm": 3.2378880977630615, + "learning_rate": 0.0006266211604095564, + "loss": 6.5682, + "step": 3282 + }, + { + "epoch": 1.1204778156996587, + "grad_norm": 3.0198142528533936, + "learning_rate": 0.0006265073947667805, + "loss": 6.6885, + "step": 3283 + }, + { + "epoch": 1.1208191126279863, + "grad_norm": 3.2005209922790527, + "learning_rate": 0.0006263936291240046, + "loss": 6.7357, + "step": 3284 + }, + { + "epoch": 1.121160409556314, + "grad_norm": 3.2650158405303955, + "learning_rate": 0.0006262798634812287, + "loss": 6.8819, + "step": 3285 + }, + { + "epoch": 1.1215017064846415, + "grad_norm": 3.4467949867248535, + "learning_rate": 0.0006261660978384528, + "loss": 6.327, + "step": 3286 + }, + { + "epoch": 1.1218430034129694, + "grad_norm": 3.51582932472229, + "learning_rate": 0.0006260523321956769, + "loss": 6.4275, + "step": 3287 + }, + { + "epoch": 1.122184300341297, + "grad_norm": 3.2341697216033936, + "learning_rate": 0.0006259385665529011, + "loss": 6.6526, + "step": 3288 + }, + { + "epoch": 1.1225255972696246, + "grad_norm": 3.0597870349884033, + "learning_rate": 0.0006258248009101252, + "loss": 6.3242, + "step": 3289 + }, + { + "epoch": 1.1228668941979523, + "grad_norm": 3.1777312755584717, + "learning_rate": 0.0006257110352673493, + "loss": 6.7306, + "step": 3290 + }, + { + "epoch": 1.12320819112628, + "grad_norm": 3.1080517768859863, + "learning_rate": 0.0006255972696245734, + "loss": 6.5034, + "step": 3291 + }, + { + "epoch": 1.1235494880546075, + "grad_norm": 3.166882276535034, + "learning_rate": 0.0006254835039817975, + "loss": 6.6549, + "step": 3292 + }, + { + "epoch": 1.1238907849829352, + "grad_norm": 3.2958264350891113, + "learning_rate": 0.0006253697383390216, + "loss": 6.3964, + "step": 3293 + }, + { + "epoch": 1.1242320819112628, + "grad_norm": 3.2784876823425293, + "learning_rate": 0.0006252559726962458, + "loss": 6.7125, + "step": 3294 + }, + { + "epoch": 1.1245733788395904, + "grad_norm": 3.357529878616333, + "learning_rate": 0.0006251422070534699, + "loss": 6.2972, + "step": 3295 + }, + { + "epoch": 1.124914675767918, + "grad_norm": 3.1187024116516113, + "learning_rate": 0.000625028441410694, + "loss": 6.4141, + "step": 3296 + }, + { + "epoch": 1.1252559726962457, + "grad_norm": 3.192850112915039, + "learning_rate": 0.000624914675767918, + "loss": 6.4792, + "step": 3297 + }, + { + "epoch": 1.1255972696245733, + "grad_norm": 3.124603509902954, + "learning_rate": 0.0006248009101251421, + "loss": 6.084, + "step": 3298 + }, + { + "epoch": 1.1259385665529011, + "grad_norm": 3.316884756088257, + "learning_rate": 0.0006246871444823662, + "loss": 6.8159, + "step": 3299 + }, + { + "epoch": 1.1262798634812285, + "grad_norm": 3.2244949340820312, + "learning_rate": 0.0006245733788395905, + "loss": 6.0295, + "step": 3300 + }, + { + "epoch": 1.1266211604095564, + "grad_norm": 3.214632749557495, + "learning_rate": 0.0006244596131968146, + "loss": 6.0611, + "step": 3301 + }, + { + "epoch": 1.126962457337884, + "grad_norm": 3.4261796474456787, + "learning_rate": 0.0006243458475540387, + "loss": 6.371, + "step": 3302 + }, + { + "epoch": 1.1273037542662117, + "grad_norm": 3.2653138637542725, + "learning_rate": 0.0006242320819112628, + "loss": 6.4282, + "step": 3303 + }, + { + "epoch": 1.1276450511945393, + "grad_norm": 3.3320472240448, + "learning_rate": 0.0006241183162684869, + "loss": 6.7758, + "step": 3304 + }, + { + "epoch": 1.127986348122867, + "grad_norm": 3.2385928630828857, + "learning_rate": 0.0006240045506257111, + "loss": 6.7638, + "step": 3305 + }, + { + "epoch": 1.1283276450511945, + "grad_norm": 3.225954055786133, + "learning_rate": 0.0006238907849829352, + "loss": 6.3909, + "step": 3306 + }, + { + "epoch": 1.1286689419795222, + "grad_norm": 3.2931458950042725, + "learning_rate": 0.0006237770193401593, + "loss": 6.7229, + "step": 3307 + }, + { + "epoch": 1.1290102389078498, + "grad_norm": 3.082092761993408, + "learning_rate": 0.0006236632536973834, + "loss": 6.673, + "step": 3308 + }, + { + "epoch": 1.1293515358361774, + "grad_norm": 4.754388809204102, + "learning_rate": 0.0006235494880546075, + "loss": 6.1331, + "step": 3309 + }, + { + "epoch": 1.129692832764505, + "grad_norm": 3.1302919387817383, + "learning_rate": 0.0006234357224118316, + "loss": 6.512, + "step": 3310 + }, + { + "epoch": 1.1300341296928327, + "grad_norm": 5.564751148223877, + "learning_rate": 0.0006233219567690558, + "loss": 6.6001, + "step": 3311 + }, + { + "epoch": 1.1303754266211605, + "grad_norm": 2.1751673221588135, + "learning_rate": 0.0006232081911262799, + "loss": 3.1882, + "step": 3312 + }, + { + "epoch": 1.130716723549488, + "grad_norm": 3.3056774139404297, + "learning_rate": 0.000623094425483504, + "loss": 6.0435, + "step": 3313 + }, + { + "epoch": 1.1310580204778158, + "grad_norm": 3.5110793113708496, + "learning_rate": 0.0006229806598407281, + "loss": 6.7698, + "step": 3314 + }, + { + "epoch": 1.1313993174061434, + "grad_norm": 3.8492422103881836, + "learning_rate": 0.0006228668941979523, + "loss": 5.9606, + "step": 3315 + }, + { + "epoch": 1.131740614334471, + "grad_norm": 3.3324551582336426, + "learning_rate": 0.0006227531285551762, + "loss": 6.1918, + "step": 3316 + }, + { + "epoch": 1.1320819112627987, + "grad_norm": 3.3533132076263428, + "learning_rate": 0.0006226393629124005, + "loss": 6.8835, + "step": 3317 + }, + { + "epoch": 1.1324232081911263, + "grad_norm": 3.216062068939209, + "learning_rate": 0.0006225255972696246, + "loss": 6.1811, + "step": 3318 + }, + { + "epoch": 1.132764505119454, + "grad_norm": 3.2382802963256836, + "learning_rate": 0.0006224118316268487, + "loss": 6.3388, + "step": 3319 + }, + { + "epoch": 1.1331058020477816, + "grad_norm": 3.712456226348877, + "learning_rate": 0.0006222980659840728, + "loss": 6.0058, + "step": 3320 + }, + { + "epoch": 1.1334470989761092, + "grad_norm": 3.3816704750061035, + "learning_rate": 0.0006221843003412969, + "loss": 6.3261, + "step": 3321 + }, + { + "epoch": 1.1337883959044368, + "grad_norm": 3.733504295349121, + "learning_rate": 0.000622070534698521, + "loss": 4.9361, + "step": 3322 + }, + { + "epoch": 1.1341296928327644, + "grad_norm": 3.436828851699829, + "learning_rate": 0.0006219567690557452, + "loss": 6.4064, + "step": 3323 + }, + { + "epoch": 1.134470989761092, + "grad_norm": 3.211754083633423, + "learning_rate": 0.0006218430034129693, + "loss": 6.9981, + "step": 3324 + }, + { + "epoch": 1.13481228668942, + "grad_norm": 3.2073490619659424, + "learning_rate": 0.0006217292377701934, + "loss": 6.2856, + "step": 3325 + }, + { + "epoch": 1.1351535836177473, + "grad_norm": 3.1387381553649902, + "learning_rate": 0.0006216154721274175, + "loss": 6.6068, + "step": 3326 + }, + { + "epoch": 1.1354948805460752, + "grad_norm": 5.683926582336426, + "learning_rate": 0.0006215017064846416, + "loss": 6.1509, + "step": 3327 + }, + { + "epoch": 1.1358361774744028, + "grad_norm": 3.4648234844207764, + "learning_rate": 0.0006213879408418658, + "loss": 6.0164, + "step": 3328 + }, + { + "epoch": 1.1361774744027304, + "grad_norm": 3.3188838958740234, + "learning_rate": 0.0006212741751990899, + "loss": 6.6927, + "step": 3329 + }, + { + "epoch": 1.136518771331058, + "grad_norm": 5.669882297515869, + "learning_rate": 0.000621160409556314, + "loss": 5.7178, + "step": 3330 + }, + { + "epoch": 1.1368600682593857, + "grad_norm": 3.321120023727417, + "learning_rate": 0.0006210466439135381, + "loss": 6.3605, + "step": 3331 + }, + { + "epoch": 1.1372013651877133, + "grad_norm": 3.19608473777771, + "learning_rate": 0.0006209328782707623, + "loss": 7.1759, + "step": 3332 + }, + { + "epoch": 1.137542662116041, + "grad_norm": 3.3638601303100586, + "learning_rate": 0.0006208191126279864, + "loss": 6.4738, + "step": 3333 + }, + { + "epoch": 1.1378839590443686, + "grad_norm": 3.9922034740448, + "learning_rate": 0.0006207053469852106, + "loss": 6.4603, + "step": 3334 + }, + { + "epoch": 1.1382252559726962, + "grad_norm": 3.5877456665039062, + "learning_rate": 0.0006205915813424347, + "loss": 5.6385, + "step": 3335 + }, + { + "epoch": 1.1385665529010238, + "grad_norm": 3.1927621364593506, + "learning_rate": 0.0006204778156996587, + "loss": 5.9584, + "step": 3336 + }, + { + "epoch": 1.1389078498293514, + "grad_norm": 7.028829574584961, + "learning_rate": 0.0006203640500568828, + "loss": 4.9702, + "step": 3337 + }, + { + "epoch": 1.1392491467576793, + "grad_norm": 3.2814626693725586, + "learning_rate": 0.0006202502844141069, + "loss": 6.8503, + "step": 3338 + }, + { + "epoch": 1.1395904436860067, + "grad_norm": 3.5198097229003906, + "learning_rate": 0.000620136518771331, + "loss": 6.2619, + "step": 3339 + }, + { + "epoch": 1.1399317406143346, + "grad_norm": 3.3513224124908447, + "learning_rate": 0.0006200227531285552, + "loss": 6.749, + "step": 3340 + }, + { + "epoch": 1.1402730375426622, + "grad_norm": 3.736387014389038, + "learning_rate": 0.0006199089874857793, + "loss": 5.8229, + "step": 3341 + }, + { + "epoch": 1.1406143344709898, + "grad_norm": 3.531832456588745, + "learning_rate": 0.0006197952218430034, + "loss": 5.5888, + "step": 3342 + }, + { + "epoch": 1.1409556313993174, + "grad_norm": 3.2506160736083984, + "learning_rate": 0.0006196814562002275, + "loss": 6.379, + "step": 3343 + }, + { + "epoch": 1.141296928327645, + "grad_norm": 5.999919414520264, + "learning_rate": 0.0006195676905574516, + "loss": 5.3758, + "step": 3344 + }, + { + "epoch": 1.1416382252559727, + "grad_norm": 3.500176191329956, + "learning_rate": 0.0006194539249146758, + "loss": 6.2346, + "step": 3345 + }, + { + "epoch": 1.1419795221843003, + "grad_norm": 2.100086212158203, + "learning_rate": 0.0006193401592718999, + "loss": 3.3155, + "step": 3346 + }, + { + "epoch": 1.142320819112628, + "grad_norm": 5.4075798988342285, + "learning_rate": 0.000619226393629124, + "loss": 4.6973, + "step": 3347 + }, + { + "epoch": 1.1426621160409556, + "grad_norm": 3.206171989440918, + "learning_rate": 0.0006191126279863481, + "loss": 6.4632, + "step": 3348 + }, + { + "epoch": 1.1430034129692832, + "grad_norm": 3.123758316040039, + "learning_rate": 0.0006189988623435723, + "loss": 6.6153, + "step": 3349 + }, + { + "epoch": 1.1433447098976108, + "grad_norm": 3.109133243560791, + "learning_rate": 0.0006188850967007964, + "loss": 6.9189, + "step": 3350 + }, + { + "epoch": 1.1436860068259387, + "grad_norm": 3.7639737129211426, + "learning_rate": 0.0006187713310580206, + "loss": 5.0232, + "step": 3351 + }, + { + "epoch": 1.144027303754266, + "grad_norm": 3.2191059589385986, + "learning_rate": 0.0006186575654152447, + "loss": 6.3187, + "step": 3352 + }, + { + "epoch": 1.144368600682594, + "grad_norm": 3.2868123054504395, + "learning_rate": 0.0006185437997724688, + "loss": 6.3158, + "step": 3353 + }, + { + "epoch": 1.1447098976109216, + "grad_norm": 3.318932294845581, + "learning_rate": 0.0006184300341296929, + "loss": 6.2764, + "step": 3354 + }, + { + "epoch": 1.1450511945392492, + "grad_norm": 3.06105899810791, + "learning_rate": 0.0006183162684869169, + "loss": 6.4749, + "step": 3355 + }, + { + "epoch": 1.1453924914675768, + "grad_norm": 3.42996883392334, + "learning_rate": 0.000618202502844141, + "loss": 6.438, + "step": 3356 + }, + { + "epoch": 1.1457337883959045, + "grad_norm": 3.3737375736236572, + "learning_rate": 0.0006180887372013652, + "loss": 6.6715, + "step": 3357 + }, + { + "epoch": 1.146075085324232, + "grad_norm": 3.796318531036377, + "learning_rate": 0.0006179749715585893, + "loss": 5.5668, + "step": 3358 + }, + { + "epoch": 1.1464163822525597, + "grad_norm": 3.3401639461517334, + "learning_rate": 0.0006178612059158134, + "loss": 6.3617, + "step": 3359 + }, + { + "epoch": 1.1467576791808873, + "grad_norm": 3.4352896213531494, + "learning_rate": 0.0006177474402730375, + "loss": 6.105, + "step": 3360 + }, + { + "epoch": 1.147098976109215, + "grad_norm": 3.1488349437713623, + "learning_rate": 0.0006176336746302616, + "loss": 6.588, + "step": 3361 + }, + { + "epoch": 1.1474402730375426, + "grad_norm": 3.108013868331909, + "learning_rate": 0.0006175199089874857, + "loss": 6.22, + "step": 3362 + }, + { + "epoch": 1.1477815699658702, + "grad_norm": 3.5639657974243164, + "learning_rate": 0.0006174061433447099, + "loss": 6.7161, + "step": 3363 + }, + { + "epoch": 1.148122866894198, + "grad_norm": 4.529385566711426, + "learning_rate": 0.000617292377701934, + "loss": 5.2737, + "step": 3364 + }, + { + "epoch": 1.1484641638225255, + "grad_norm": 5.763156414031982, + "learning_rate": 0.0006171786120591581, + "loss": 5.3901, + "step": 3365 + }, + { + "epoch": 1.1488054607508533, + "grad_norm": 3.319201707839966, + "learning_rate": 0.0006170648464163823, + "loss": 6.2037, + "step": 3366 + }, + { + "epoch": 1.149146757679181, + "grad_norm": 3.429962635040283, + "learning_rate": 0.0006169510807736064, + "loss": 6.1665, + "step": 3367 + }, + { + "epoch": 1.1494880546075086, + "grad_norm": 3.73044490814209, + "learning_rate": 0.0006168373151308306, + "loss": 6.0798, + "step": 3368 + }, + { + "epoch": 1.1498293515358362, + "grad_norm": 5.758520126342773, + "learning_rate": 0.0006167235494880547, + "loss": 5.322, + "step": 3369 + }, + { + "epoch": 1.1501706484641638, + "grad_norm": 3.261571168899536, + "learning_rate": 0.0006166097838452788, + "loss": 6.9592, + "step": 3370 + }, + { + "epoch": 1.1505119453924915, + "grad_norm": 3.1305222511291504, + "learning_rate": 0.0006164960182025029, + "loss": 6.0327, + "step": 3371 + }, + { + "epoch": 1.150853242320819, + "grad_norm": 3.285921335220337, + "learning_rate": 0.000616382252559727, + "loss": 6.7096, + "step": 3372 + }, + { + "epoch": 1.1511945392491467, + "grad_norm": 3.030154228210449, + "learning_rate": 0.0006162684869169511, + "loss": 6.5408, + "step": 3373 + }, + { + "epoch": 1.1515358361774743, + "grad_norm": 3.1218512058258057, + "learning_rate": 0.0006161547212741753, + "loss": 6.5412, + "step": 3374 + }, + { + "epoch": 1.151877133105802, + "grad_norm": 3.093693733215332, + "learning_rate": 0.0006160409556313993, + "loss": 6.7893, + "step": 3375 + }, + { + "epoch": 1.1522184300341296, + "grad_norm": 8.688549041748047, + "learning_rate": 0.0006159271899886234, + "loss": 5.4964, + "step": 3376 + }, + { + "epoch": 1.1525597269624575, + "grad_norm": 3.335278272628784, + "learning_rate": 0.0006158134243458475, + "loss": 6.1672, + "step": 3377 + }, + { + "epoch": 1.1529010238907849, + "grad_norm": 3.6452267169952393, + "learning_rate": 0.0006156996587030716, + "loss": 5.9682, + "step": 3378 + }, + { + "epoch": 1.1532423208191127, + "grad_norm": 3.369281530380249, + "learning_rate": 0.0006155858930602957, + "loss": 6.0847, + "step": 3379 + }, + { + "epoch": 1.1535836177474403, + "grad_norm": 3.305046319961548, + "learning_rate": 0.0006154721274175199, + "loss": 6.8861, + "step": 3380 + }, + { + "epoch": 1.153924914675768, + "grad_norm": 4.961038112640381, + "learning_rate": 0.000615358361774744, + "loss": 4.7492, + "step": 3381 + }, + { + "epoch": 1.1542662116040956, + "grad_norm": 3.369922161102295, + "learning_rate": 0.0006152445961319682, + "loss": 6.3582, + "step": 3382 + }, + { + "epoch": 1.1546075085324232, + "grad_norm": 3.3537933826446533, + "learning_rate": 0.0006151308304891923, + "loss": 6.2107, + "step": 3383 + }, + { + "epoch": 1.1549488054607508, + "grad_norm": 3.241588830947876, + "learning_rate": 0.0006150170648464164, + "loss": 6.0113, + "step": 3384 + }, + { + "epoch": 1.1552901023890785, + "grad_norm": 3.1636621952056885, + "learning_rate": 0.0006149032992036406, + "loss": 6.7975, + "step": 3385 + }, + { + "epoch": 1.155631399317406, + "grad_norm": 3.1564855575561523, + "learning_rate": 0.0006147895335608647, + "loss": 6.2154, + "step": 3386 + }, + { + "epoch": 1.1559726962457337, + "grad_norm": 3.18015456199646, + "learning_rate": 0.0006146757679180888, + "loss": 6.7107, + "step": 3387 + }, + { + "epoch": 1.1563139931740614, + "grad_norm": 3.214262008666992, + "learning_rate": 0.0006145620022753129, + "loss": 6.2703, + "step": 3388 + }, + { + "epoch": 1.156655290102389, + "grad_norm": 3.171013593673706, + "learning_rate": 0.000614448236632537, + "loss": 6.5638, + "step": 3389 + }, + { + "epoch": 1.1569965870307168, + "grad_norm": 3.184718608856201, + "learning_rate": 0.0006143344709897611, + "loss": 6.9765, + "step": 3390 + }, + { + "epoch": 1.1573378839590442, + "grad_norm": 3.161717414855957, + "learning_rate": 0.0006142207053469853, + "loss": 6.8384, + "step": 3391 + }, + { + "epoch": 1.157679180887372, + "grad_norm": 5.287087440490723, + "learning_rate": 0.0006141069397042094, + "loss": 6.4128, + "step": 3392 + }, + { + "epoch": 1.1580204778156997, + "grad_norm": 3.629075765609741, + "learning_rate": 0.0006139931740614335, + "loss": 6.171, + "step": 3393 + }, + { + "epoch": 1.1583617747440274, + "grad_norm": 3.260854721069336, + "learning_rate": 0.0006138794084186575, + "loss": 6.6395, + "step": 3394 + }, + { + "epoch": 1.158703071672355, + "grad_norm": 3.2226295471191406, + "learning_rate": 0.0006137656427758816, + "loss": 6.533, + "step": 3395 + }, + { + "epoch": 1.1590443686006826, + "grad_norm": 3.1546380519866943, + "learning_rate": 0.0006136518771331057, + "loss": 6.5444, + "step": 3396 + }, + { + "epoch": 1.1593856655290102, + "grad_norm": 3.138427257537842, + "learning_rate": 0.0006135381114903299, + "loss": 6.7728, + "step": 3397 + }, + { + "epoch": 1.1597269624573379, + "grad_norm": 3.0224416255950928, + "learning_rate": 0.000613424345847554, + "loss": 6.6283, + "step": 3398 + }, + { + "epoch": 1.1600682593856655, + "grad_norm": 3.074084997177124, + "learning_rate": 0.0006133105802047782, + "loss": 6.5736, + "step": 3399 + }, + { + "epoch": 1.1604095563139931, + "grad_norm": 4.4774909019470215, + "learning_rate": 0.0006131968145620023, + "loss": 5.5653, + "step": 3400 + }, + { + "epoch": 1.1607508532423207, + "grad_norm": 3.481534004211426, + "learning_rate": 0.0006130830489192264, + "loss": 6.0247, + "step": 3401 + }, + { + "epoch": 1.1610921501706484, + "grad_norm": 3.528672695159912, + "learning_rate": 0.0006129692832764505, + "loss": 6.7388, + "step": 3402 + }, + { + "epoch": 1.1614334470989762, + "grad_norm": 3.561124324798584, + "learning_rate": 0.0006128555176336747, + "loss": 6.303, + "step": 3403 + }, + { + "epoch": 1.1617747440273036, + "grad_norm": 3.158127546310425, + "learning_rate": 0.0006127417519908988, + "loss": 6.2118, + "step": 3404 + }, + { + "epoch": 1.1621160409556315, + "grad_norm": 3.1194143295288086, + "learning_rate": 0.0006126279863481229, + "loss": 6.4867, + "step": 3405 + }, + { + "epoch": 1.162457337883959, + "grad_norm": 3.337048292160034, + "learning_rate": 0.000612514220705347, + "loss": 6.7671, + "step": 3406 + }, + { + "epoch": 1.1627986348122867, + "grad_norm": 3.0700247287750244, + "learning_rate": 0.0006124004550625711, + "loss": 6.1681, + "step": 3407 + }, + { + "epoch": 1.1631399317406144, + "grad_norm": 3.1669013500213623, + "learning_rate": 0.0006122866894197953, + "loss": 6.8375, + "step": 3408 + }, + { + "epoch": 1.163481228668942, + "grad_norm": 3.3716862201690674, + "learning_rate": 0.0006121729237770194, + "loss": 6.3905, + "step": 3409 + }, + { + "epoch": 1.1638225255972696, + "grad_norm": 7.003607273101807, + "learning_rate": 0.0006120591581342435, + "loss": 6.1162, + "step": 3410 + }, + { + "epoch": 1.1641638225255972, + "grad_norm": 3.2531354427337646, + "learning_rate": 0.0006119453924914676, + "loss": 6.4429, + "step": 3411 + }, + { + "epoch": 1.1645051194539249, + "grad_norm": 3.7305870056152344, + "learning_rate": 0.0006118316268486917, + "loss": 6.4708, + "step": 3412 + }, + { + "epoch": 1.1648464163822525, + "grad_norm": 4.199586868286133, + "learning_rate": 0.0006117178612059157, + "loss": 6.2342, + "step": 3413 + }, + { + "epoch": 1.1651877133105801, + "grad_norm": 3.2192304134368896, + "learning_rate": 0.0006116040955631399, + "loss": 6.3102, + "step": 3414 + }, + { + "epoch": 1.1655290102389078, + "grad_norm": 3.2440237998962402, + "learning_rate": 0.000611490329920364, + "loss": 7.0148, + "step": 3415 + }, + { + "epoch": 1.1658703071672356, + "grad_norm": 3.203922748565674, + "learning_rate": 0.0006113765642775882, + "loss": 6.0749, + "step": 3416 + }, + { + "epoch": 1.1662116040955632, + "grad_norm": 3.4446823596954346, + "learning_rate": 0.0006112627986348123, + "loss": 6.4998, + "step": 3417 + }, + { + "epoch": 1.1665529010238909, + "grad_norm": 3.079411506652832, + "learning_rate": 0.0006111490329920364, + "loss": 6.3895, + "step": 3418 + }, + { + "epoch": 1.1668941979522185, + "grad_norm": 3.134840488433838, + "learning_rate": 0.0006110352673492605, + "loss": 6.3418, + "step": 3419 + }, + { + "epoch": 1.1672354948805461, + "grad_norm": 3.2079269886016846, + "learning_rate": 0.0006109215017064847, + "loss": 6.4572, + "step": 3420 + }, + { + "epoch": 1.1675767918088737, + "grad_norm": 4.834338665008545, + "learning_rate": 0.0006108077360637088, + "loss": 5.9253, + "step": 3421 + }, + { + "epoch": 1.1679180887372014, + "grad_norm": 3.2818708419799805, + "learning_rate": 0.0006106939704209329, + "loss": 6.789, + "step": 3422 + }, + { + "epoch": 1.168259385665529, + "grad_norm": 3.2917566299438477, + "learning_rate": 0.000610580204778157, + "loss": 6.8878, + "step": 3423 + }, + { + "epoch": 1.1686006825938566, + "grad_norm": 3.199425458908081, + "learning_rate": 0.0006104664391353811, + "loss": 6.4318, + "step": 3424 + }, + { + "epoch": 1.1689419795221843, + "grad_norm": 4.149702548980713, + "learning_rate": 0.0006103526734926053, + "loss": 5.2789, + "step": 3425 + }, + { + "epoch": 1.1692832764505119, + "grad_norm": 3.1588354110717773, + "learning_rate": 0.0006102389078498294, + "loss": 6.5981, + "step": 3426 + }, + { + "epoch": 1.1696245733788395, + "grad_norm": 3.282426595687866, + "learning_rate": 0.0006101251422070535, + "loss": 6.3488, + "step": 3427 + }, + { + "epoch": 1.1699658703071671, + "grad_norm": 3.071763515472412, + "learning_rate": 0.0006100113765642776, + "loss": 6.6647, + "step": 3428 + }, + { + "epoch": 1.170307167235495, + "grad_norm": 3.4117684364318848, + "learning_rate": 0.0006098976109215017, + "loss": 6.057, + "step": 3429 + }, + { + "epoch": 1.1706484641638226, + "grad_norm": 3.132619857788086, + "learning_rate": 0.0006097838452787258, + "loss": 6.3089, + "step": 3430 + }, + { + "epoch": 1.1709897610921502, + "grad_norm": 3.3119728565216064, + "learning_rate": 0.00060967007963595, + "loss": 6.6789, + "step": 3431 + }, + { + "epoch": 1.1713310580204779, + "grad_norm": 3.4108917713165283, + "learning_rate": 0.0006095563139931742, + "loss": 6.664, + "step": 3432 + }, + { + "epoch": 1.1716723549488055, + "grad_norm": 3.4007740020751953, + "learning_rate": 0.0006094425483503982, + "loss": 6.6983, + "step": 3433 + }, + { + "epoch": 1.1720136518771331, + "grad_norm": 3.2750303745269775, + "learning_rate": 0.0006093287827076223, + "loss": 6.7191, + "step": 3434 + }, + { + "epoch": 1.1723549488054608, + "grad_norm": 3.2517144680023193, + "learning_rate": 0.0006092150170648464, + "loss": 6.6333, + "step": 3435 + }, + { + "epoch": 1.1726962457337884, + "grad_norm": 3.0657079219818115, + "learning_rate": 0.0006091012514220705, + "loss": 7.0866, + "step": 3436 + }, + { + "epoch": 1.173037542662116, + "grad_norm": 3.8574588298797607, + "learning_rate": 0.0006089874857792947, + "loss": 6.2078, + "step": 3437 + }, + { + "epoch": 1.1733788395904436, + "grad_norm": 3.447795867919922, + "learning_rate": 0.0006088737201365188, + "loss": 6.5433, + "step": 3438 + }, + { + "epoch": 1.1737201365187713, + "grad_norm": 3.2688255310058594, + "learning_rate": 0.0006087599544937429, + "loss": 6.0859, + "step": 3439 + }, + { + "epoch": 1.174061433447099, + "grad_norm": 3.0544850826263428, + "learning_rate": 0.000608646188850967, + "loss": 6.6904, + "step": 3440 + }, + { + "epoch": 1.1744027303754265, + "grad_norm": 3.1975157260894775, + "learning_rate": 0.0006085324232081911, + "loss": 6.3464, + "step": 3441 + }, + { + "epoch": 1.1747440273037544, + "grad_norm": 3.1595888137817383, + "learning_rate": 0.0006084186575654152, + "loss": 6.721, + "step": 3442 + }, + { + "epoch": 1.175085324232082, + "grad_norm": 3.2810568809509277, + "learning_rate": 0.0006083048919226394, + "loss": 5.9367, + "step": 3443 + }, + { + "epoch": 1.1754266211604096, + "grad_norm": 3.1587705612182617, + "learning_rate": 0.0006081911262798635, + "loss": 6.2515, + "step": 3444 + }, + { + "epoch": 1.1757679180887373, + "grad_norm": 3.1385536193847656, + "learning_rate": 0.0006080773606370876, + "loss": 6.3264, + "step": 3445 + }, + { + "epoch": 1.176109215017065, + "grad_norm": 3.262084722518921, + "learning_rate": 0.0006079635949943117, + "loss": 6.7922, + "step": 3446 + }, + { + "epoch": 1.1764505119453925, + "grad_norm": 3.3084287643432617, + "learning_rate": 0.0006078498293515358, + "loss": 6.5934, + "step": 3447 + }, + { + "epoch": 1.1767918088737201, + "grad_norm": 7.798718452453613, + "learning_rate": 0.00060773606370876, + "loss": 5.7443, + "step": 3448 + }, + { + "epoch": 1.1771331058020478, + "grad_norm": 3.4016666412353516, + "learning_rate": 0.0006076222980659842, + "loss": 7.0521, + "step": 3449 + }, + { + "epoch": 1.1774744027303754, + "grad_norm": 3.541292905807495, + "learning_rate": 0.0006075085324232083, + "loss": 6.5505, + "step": 3450 + }, + { + "epoch": 1.177815699658703, + "grad_norm": 3.220148801803589, + "learning_rate": 0.0006073947667804324, + "loss": 6.8404, + "step": 3451 + }, + { + "epoch": 1.1781569965870307, + "grad_norm": 8.531793594360352, + "learning_rate": 0.0006072810011376564, + "loss": 5.1038, + "step": 3452 + }, + { + "epoch": 1.1784982935153583, + "grad_norm": 3.7234575748443604, + "learning_rate": 0.0006071672354948805, + "loss": 6.5902, + "step": 3453 + }, + { + "epoch": 1.178839590443686, + "grad_norm": 3.364846706390381, + "learning_rate": 0.0006070534698521047, + "loss": 6.1682, + "step": 3454 + }, + { + "epoch": 1.1791808873720138, + "grad_norm": 3.291454315185547, + "learning_rate": 0.0006069397042093288, + "loss": 6.5433, + "step": 3455 + }, + { + "epoch": 1.1795221843003414, + "grad_norm": 3.3431570529937744, + "learning_rate": 0.0006068259385665529, + "loss": 6.5502, + "step": 3456 + }, + { + "epoch": 1.179863481228669, + "grad_norm": 5.842983722686768, + "learning_rate": 0.000606712172923777, + "loss": 5.9161, + "step": 3457 + }, + { + "epoch": 1.1802047781569966, + "grad_norm": 3.1439208984375, + "learning_rate": 0.0006065984072810011, + "loss": 6.7444, + "step": 3458 + }, + { + "epoch": 1.1805460750853243, + "grad_norm": 3.290973424911499, + "learning_rate": 0.0006064846416382252, + "loss": 6.1831, + "step": 3459 + }, + { + "epoch": 1.180887372013652, + "grad_norm": 3.0965545177459717, + "learning_rate": 0.0006063708759954494, + "loss": 6.2298, + "step": 3460 + }, + { + "epoch": 1.1812286689419795, + "grad_norm": 3.1501073837280273, + "learning_rate": 0.0006062571103526735, + "loss": 5.6735, + "step": 3461 + }, + { + "epoch": 1.1815699658703072, + "grad_norm": 3.1459507942199707, + "learning_rate": 0.0006061433447098976, + "loss": 6.6602, + "step": 3462 + }, + { + "epoch": 1.1819112627986348, + "grad_norm": 6.302970886230469, + "learning_rate": 0.0006060295790671217, + "loss": 6.541, + "step": 3463 + }, + { + "epoch": 1.1822525597269624, + "grad_norm": 3.364511251449585, + "learning_rate": 0.0006059158134243458, + "loss": 6.4438, + "step": 3464 + }, + { + "epoch": 1.18259385665529, + "grad_norm": 3.289085626602173, + "learning_rate": 0.0006058020477815699, + "loss": 6.512, + "step": 3465 + }, + { + "epoch": 1.1829351535836177, + "grad_norm": 3.2350807189941406, + "learning_rate": 0.0006056882821387942, + "loss": 6.6411, + "step": 3466 + }, + { + "epoch": 1.1832764505119453, + "grad_norm": 3.8296823501586914, + "learning_rate": 0.0006055745164960183, + "loss": 6.2768, + "step": 3467 + }, + { + "epoch": 1.1836177474402731, + "grad_norm": 3.2926394939422607, + "learning_rate": 0.0006054607508532424, + "loss": 6.6457, + "step": 3468 + }, + { + "epoch": 1.1839590443686008, + "grad_norm": 3.056105613708496, + "learning_rate": 0.0006053469852104665, + "loss": 6.1748, + "step": 3469 + }, + { + "epoch": 1.1843003412969284, + "grad_norm": 3.2163851261138916, + "learning_rate": 0.0006052332195676906, + "loss": 6.1277, + "step": 3470 + }, + { + "epoch": 1.184641638225256, + "grad_norm": 3.086276054382324, + "learning_rate": 0.0006051194539249148, + "loss": 6.2933, + "step": 3471 + }, + { + "epoch": 1.1849829351535837, + "grad_norm": 5.672454833984375, + "learning_rate": 0.0006050056882821388, + "loss": 5.7082, + "step": 3472 + }, + { + "epoch": 1.1853242320819113, + "grad_norm": 3.3687098026275635, + "learning_rate": 0.0006048919226393629, + "loss": 6.9588, + "step": 3473 + }, + { + "epoch": 1.185665529010239, + "grad_norm": 3.393932580947876, + "learning_rate": 0.000604778156996587, + "loss": 6.7387, + "step": 3474 + }, + { + "epoch": 1.1860068259385665, + "grad_norm": 3.1261706352233887, + "learning_rate": 0.0006046643913538111, + "loss": 6.5774, + "step": 3475 + }, + { + "epoch": 1.1863481228668942, + "grad_norm": 3.0573582649230957, + "learning_rate": 0.0006045506257110352, + "loss": 6.3902, + "step": 3476 + }, + { + "epoch": 1.1866894197952218, + "grad_norm": 3.2194995880126953, + "learning_rate": 0.0006044368600682594, + "loss": 6.3269, + "step": 3477 + }, + { + "epoch": 1.1870307167235494, + "grad_norm": 3.1429195404052734, + "learning_rate": 0.0006043230944254835, + "loss": 6.4637, + "step": 3478 + }, + { + "epoch": 1.187372013651877, + "grad_norm": 3.0701262950897217, + "learning_rate": 0.0006042093287827076, + "loss": 6.6339, + "step": 3479 + }, + { + "epoch": 1.1877133105802047, + "grad_norm": 3.701120138168335, + "learning_rate": 0.0006040955631399317, + "loss": 6.075, + "step": 3480 + }, + { + "epoch": 1.1880546075085325, + "grad_norm": 3.3148467540740967, + "learning_rate": 0.0006039817974971558, + "loss": 6.1615, + "step": 3481 + }, + { + "epoch": 1.1883959044368602, + "grad_norm": 3.282397985458374, + "learning_rate": 0.0006038680318543799, + "loss": 6.6828, + "step": 3482 + }, + { + "epoch": 1.1887372013651878, + "grad_norm": 3.1746835708618164, + "learning_rate": 0.0006037542662116042, + "loss": 6.7092, + "step": 3483 + }, + { + "epoch": 1.1890784982935154, + "grad_norm": 3.044534206390381, + "learning_rate": 0.0006036405005688283, + "loss": 5.9796, + "step": 3484 + }, + { + "epoch": 1.189419795221843, + "grad_norm": 3.210864543914795, + "learning_rate": 0.0006035267349260524, + "loss": 6.522, + "step": 3485 + }, + { + "epoch": 1.1897610921501707, + "grad_norm": 3.328636646270752, + "learning_rate": 0.0006034129692832765, + "loss": 6.472, + "step": 3486 + }, + { + "epoch": 1.1901023890784983, + "grad_norm": 3.153980255126953, + "learning_rate": 0.0006032992036405006, + "loss": 7.1446, + "step": 3487 + }, + { + "epoch": 1.190443686006826, + "grad_norm": 3.198655128479004, + "learning_rate": 0.0006031854379977248, + "loss": 6.6485, + "step": 3488 + }, + { + "epoch": 1.1907849829351536, + "grad_norm": 3.2300875186920166, + "learning_rate": 0.0006030716723549489, + "loss": 6.5407, + "step": 3489 + }, + { + "epoch": 1.1911262798634812, + "grad_norm": 3.580975294113159, + "learning_rate": 0.000602957906712173, + "loss": 6.4401, + "step": 3490 + }, + { + "epoch": 1.1914675767918088, + "grad_norm": 3.525164842605591, + "learning_rate": 0.000602844141069397, + "loss": 6.486, + "step": 3491 + }, + { + "epoch": 1.1918088737201364, + "grad_norm": 3.407437562942505, + "learning_rate": 0.0006027303754266211, + "loss": 6.3749, + "step": 3492 + }, + { + "epoch": 1.192150170648464, + "grad_norm": 3.6547207832336426, + "learning_rate": 0.0006026166097838452, + "loss": 5.8585, + "step": 3493 + }, + { + "epoch": 1.192491467576792, + "grad_norm": 3.2446658611297607, + "learning_rate": 0.0006025028441410694, + "loss": 6.5614, + "step": 3494 + }, + { + "epoch": 1.1928327645051195, + "grad_norm": 3.150815963745117, + "learning_rate": 0.0006023890784982935, + "loss": 6.0707, + "step": 3495 + }, + { + "epoch": 1.1931740614334472, + "grad_norm": 5.976104736328125, + "learning_rate": 0.0006022753128555176, + "loss": 6.0327, + "step": 3496 + }, + { + "epoch": 1.1935153583617748, + "grad_norm": 3.18243408203125, + "learning_rate": 0.0006021615472127417, + "loss": 6.7489, + "step": 3497 + }, + { + "epoch": 1.1938566552901024, + "grad_norm": 3.4427740573883057, + "learning_rate": 0.0006020477815699658, + "loss": 5.9822, + "step": 3498 + }, + { + "epoch": 1.19419795221843, + "grad_norm": 3.2316274642944336, + "learning_rate": 0.00060193401592719, + "loss": 6.4285, + "step": 3499 + }, + { + "epoch": 1.1945392491467577, + "grad_norm": 3.2206499576568604, + "learning_rate": 0.0006018202502844142, + "loss": 6.5806, + "step": 3500 + }, + { + "epoch": 1.1948805460750853, + "grad_norm": 3.1856689453125, + "learning_rate": 0.0006017064846416383, + "loss": 6.5702, + "step": 3501 + }, + { + "epoch": 1.195221843003413, + "grad_norm": 3.3193821907043457, + "learning_rate": 0.0006015927189988624, + "loss": 6.6808, + "step": 3502 + }, + { + "epoch": 1.1955631399317406, + "grad_norm": 4.13479471206665, + "learning_rate": 0.0006014789533560865, + "loss": 5.7823, + "step": 3503 + }, + { + "epoch": 1.1959044368600682, + "grad_norm": 4.107855319976807, + "learning_rate": 0.0006013651877133106, + "loss": 5.8399, + "step": 3504 + }, + { + "epoch": 1.1962457337883958, + "grad_norm": 3.282972574234009, + "learning_rate": 0.0006012514220705347, + "loss": 6.0828, + "step": 3505 + }, + { + "epoch": 1.1965870307167235, + "grad_norm": 3.2156879901885986, + "learning_rate": 0.0006011376564277589, + "loss": 6.3889, + "step": 3506 + }, + { + "epoch": 1.1969283276450513, + "grad_norm": 3.4601330757141113, + "learning_rate": 0.000601023890784983, + "loss": 5.7612, + "step": 3507 + }, + { + "epoch": 1.197269624573379, + "grad_norm": 3.1823012828826904, + "learning_rate": 0.0006009101251422071, + "loss": 6.3664, + "step": 3508 + }, + { + "epoch": 1.1976109215017066, + "grad_norm": 3.098069906234741, + "learning_rate": 0.0006007963594994312, + "loss": 6.0548, + "step": 3509 + }, + { + "epoch": 1.1979522184300342, + "grad_norm": 3.121912956237793, + "learning_rate": 0.0006006825938566553, + "loss": 6.4154, + "step": 3510 + }, + { + "epoch": 1.1982935153583618, + "grad_norm": 3.1986167430877686, + "learning_rate": 0.0006005688282138794, + "loss": 6.5749, + "step": 3511 + }, + { + "epoch": 1.1986348122866894, + "grad_norm": 3.2935080528259277, + "learning_rate": 0.0006004550625711035, + "loss": 6.0803, + "step": 3512 + }, + { + "epoch": 1.198976109215017, + "grad_norm": 4.030786991119385, + "learning_rate": 0.0006003412969283276, + "loss": 6.0646, + "step": 3513 + }, + { + "epoch": 1.1993174061433447, + "grad_norm": 3.3741588592529297, + "learning_rate": 0.0006002275312855517, + "loss": 6.4553, + "step": 3514 + }, + { + "epoch": 1.1996587030716723, + "grad_norm": 3.256930112838745, + "learning_rate": 0.0006001137656427758, + "loss": 6.6533, + "step": 3515 + }, + { + "epoch": 1.2, + "grad_norm": 3.3063952922821045, + "learning_rate": 0.0006, + "loss": 6.369, + "step": 3516 + }, + { + "epoch": 1.2003412969283276, + "grad_norm": 3.2215917110443115, + "learning_rate": 0.0005998862343572242, + "loss": 6.4497, + "step": 3517 + }, + { + "epoch": 1.2006825938566552, + "grad_norm": 4.139420032501221, + "learning_rate": 0.0005997724687144483, + "loss": 6.2639, + "step": 3518 + }, + { + "epoch": 1.2010238907849828, + "grad_norm": 3.161328077316284, + "learning_rate": 0.0005996587030716724, + "loss": 6.4553, + "step": 3519 + }, + { + "epoch": 1.2013651877133107, + "grad_norm": 3.1102607250213623, + "learning_rate": 0.0005995449374288965, + "loss": 6.6922, + "step": 3520 + }, + { + "epoch": 1.2017064846416383, + "grad_norm": 3.2522287368774414, + "learning_rate": 0.0005994311717861206, + "loss": 6.8216, + "step": 3521 + }, + { + "epoch": 1.202047781569966, + "grad_norm": 3.175919771194458, + "learning_rate": 0.0005993174061433447, + "loss": 6.2899, + "step": 3522 + }, + { + "epoch": 1.2023890784982936, + "grad_norm": 3.2448034286499023, + "learning_rate": 0.0005992036405005689, + "loss": 6.7938, + "step": 3523 + }, + { + "epoch": 1.2027303754266212, + "grad_norm": 3.362255811691284, + "learning_rate": 0.000599089874857793, + "loss": 6.4267, + "step": 3524 + }, + { + "epoch": 1.2030716723549488, + "grad_norm": 3.1305243968963623, + "learning_rate": 0.0005989761092150171, + "loss": 6.7481, + "step": 3525 + }, + { + "epoch": 1.2034129692832765, + "grad_norm": 3.1050209999084473, + "learning_rate": 0.0005988623435722412, + "loss": 6.5467, + "step": 3526 + }, + { + "epoch": 1.203754266211604, + "grad_norm": 3.1616594791412354, + "learning_rate": 0.0005987485779294653, + "loss": 6.7965, + "step": 3527 + }, + { + "epoch": 1.2040955631399317, + "grad_norm": 3.2334542274475098, + "learning_rate": 0.0005986348122866895, + "loss": 6.6105, + "step": 3528 + }, + { + "epoch": 1.2044368600682593, + "grad_norm": 4.107026100158691, + "learning_rate": 0.0005985210466439136, + "loss": 6.1539, + "step": 3529 + }, + { + "epoch": 1.204778156996587, + "grad_norm": 3.437368392944336, + "learning_rate": 0.0005984072810011376, + "loss": 6.7131, + "step": 3530 + }, + { + "epoch": 1.2051194539249146, + "grad_norm": 3.2274441719055176, + "learning_rate": 0.0005982935153583617, + "loss": 6.4707, + "step": 3531 + }, + { + "epoch": 1.2054607508532422, + "grad_norm": 3.5051920413970947, + "learning_rate": 0.0005981797497155858, + "loss": 6.4562, + "step": 3532 + }, + { + "epoch": 1.20580204778157, + "grad_norm": 3.2286267280578613, + "learning_rate": 0.00059806598407281, + "loss": 6.3122, + "step": 3533 + }, + { + "epoch": 1.2061433447098977, + "grad_norm": 3.3295505046844482, + "learning_rate": 0.0005979522184300342, + "loss": 6.7408, + "step": 3534 + }, + { + "epoch": 1.2064846416382253, + "grad_norm": 3.2205469608306885, + "learning_rate": 0.0005978384527872583, + "loss": 6.4856, + "step": 3535 + }, + { + "epoch": 1.206825938566553, + "grad_norm": 3.2709555625915527, + "learning_rate": 0.0005977246871444824, + "loss": 6.2622, + "step": 3536 + }, + { + "epoch": 1.2071672354948806, + "grad_norm": 3.1822428703308105, + "learning_rate": 0.0005976109215017065, + "loss": 6.2327, + "step": 3537 + }, + { + "epoch": 1.2075085324232082, + "grad_norm": 3.1136908531188965, + "learning_rate": 0.0005974971558589306, + "loss": 6.6299, + "step": 3538 + }, + { + "epoch": 1.2078498293515358, + "grad_norm": 3.7378649711608887, + "learning_rate": 0.0005973833902161547, + "loss": 5.8328, + "step": 3539 + }, + { + "epoch": 1.2081911262798635, + "grad_norm": 3.2474365234375, + "learning_rate": 0.0005972696245733789, + "loss": 6.47, + "step": 3540 + }, + { + "epoch": 1.208532423208191, + "grad_norm": 3.283991575241089, + "learning_rate": 0.000597155858930603, + "loss": 6.2577, + "step": 3541 + }, + { + "epoch": 1.2088737201365187, + "grad_norm": 3.1167514324188232, + "learning_rate": 0.0005970420932878271, + "loss": 6.4208, + "step": 3542 + }, + { + "epoch": 1.2092150170648464, + "grad_norm": 3.0923006534576416, + "learning_rate": 0.0005969283276450512, + "loss": 6.7528, + "step": 3543 + }, + { + "epoch": 1.209556313993174, + "grad_norm": 3.308056354522705, + "learning_rate": 0.0005968145620022753, + "loss": 7.0382, + "step": 3544 + }, + { + "epoch": 1.2098976109215016, + "grad_norm": 3.772918701171875, + "learning_rate": 0.0005967007963594994, + "loss": 6.3507, + "step": 3545 + }, + { + "epoch": 1.2102389078498295, + "grad_norm": 3.130896806716919, + "learning_rate": 0.0005965870307167236, + "loss": 6.9891, + "step": 3546 + }, + { + "epoch": 1.210580204778157, + "grad_norm": 6.454501152038574, + "learning_rate": 0.0005964732650739477, + "loss": 6.0827, + "step": 3547 + }, + { + "epoch": 1.2109215017064847, + "grad_norm": 3.259948968887329, + "learning_rate": 0.0005963594994311718, + "loss": 5.4018, + "step": 3548 + }, + { + "epoch": 1.2112627986348123, + "grad_norm": 3.4372003078460693, + "learning_rate": 0.000596245733788396, + "loss": 6.5116, + "step": 3549 + }, + { + "epoch": 1.21160409556314, + "grad_norm": 3.244441509246826, + "learning_rate": 0.00059613196814562, + "loss": 6.7611, + "step": 3550 + }, + { + "epoch": 1.2119453924914676, + "grad_norm": 3.4917261600494385, + "learning_rate": 0.0005960182025028442, + "loss": 5.7466, + "step": 3551 + }, + { + "epoch": 1.2122866894197952, + "grad_norm": 3.247199058532715, + "learning_rate": 0.0005959044368600683, + "loss": 6.3, + "step": 3552 + }, + { + "epoch": 1.2126279863481229, + "grad_norm": 3.103398323059082, + "learning_rate": 0.0005957906712172924, + "loss": 6.9337, + "step": 3553 + }, + { + "epoch": 1.2129692832764505, + "grad_norm": 3.120478868484497, + "learning_rate": 0.0005956769055745165, + "loss": 6.3149, + "step": 3554 + }, + { + "epoch": 1.213310580204778, + "grad_norm": 3.190842866897583, + "learning_rate": 0.0005955631399317406, + "loss": 6.078, + "step": 3555 + }, + { + "epoch": 1.2136518771331057, + "grad_norm": 3.1261842250823975, + "learning_rate": 0.0005954493742889647, + "loss": 6.4843, + "step": 3556 + }, + { + "epoch": 1.2139931740614334, + "grad_norm": 3.333153486251831, + "learning_rate": 0.0005953356086461889, + "loss": 6.1083, + "step": 3557 + }, + { + "epoch": 1.214334470989761, + "grad_norm": 3.2019073963165283, + "learning_rate": 0.000595221843003413, + "loss": 6.4916, + "step": 3558 + }, + { + "epoch": 1.2146757679180888, + "grad_norm": 3.232032060623169, + "learning_rate": 0.0005951080773606371, + "loss": 6.5154, + "step": 3559 + }, + { + "epoch": 1.2150170648464165, + "grad_norm": 3.299344301223755, + "learning_rate": 0.0005949943117178612, + "loss": 6.6801, + "step": 3560 + }, + { + "epoch": 1.215358361774744, + "grad_norm": 3.3514556884765625, + "learning_rate": 0.0005948805460750853, + "loss": 6.742, + "step": 3561 + }, + { + "epoch": 1.2156996587030717, + "grad_norm": 3.3210854530334473, + "learning_rate": 0.0005947667804323094, + "loss": 5.9867, + "step": 3562 + }, + { + "epoch": 1.2160409556313994, + "grad_norm": 3.2020204067230225, + "learning_rate": 0.0005946530147895336, + "loss": 6.715, + "step": 3563 + }, + { + "epoch": 1.216382252559727, + "grad_norm": 3.123915433883667, + "learning_rate": 0.0005945392491467577, + "loss": 6.6287, + "step": 3564 + }, + { + "epoch": 1.2167235494880546, + "grad_norm": 3.2689836025238037, + "learning_rate": 0.0005944254835039818, + "loss": 6.6374, + "step": 3565 + }, + { + "epoch": 1.2170648464163822, + "grad_norm": 4.918799877166748, + "learning_rate": 0.000594311717861206, + "loss": 5.6419, + "step": 3566 + }, + { + "epoch": 1.2174061433447099, + "grad_norm": 3.0726449489593506, + "learning_rate": 0.0005941979522184301, + "loss": 6.2995, + "step": 3567 + }, + { + "epoch": 1.2177474402730375, + "grad_norm": 3.210508108139038, + "learning_rate": 0.0005940841865756542, + "loss": 6.058, + "step": 3568 + }, + { + "epoch": 1.2180887372013651, + "grad_norm": 3.5676910877227783, + "learning_rate": 0.0005939704209328783, + "loss": 5.5099, + "step": 3569 + }, + { + "epoch": 1.2184300341296928, + "grad_norm": 3.3140625953674316, + "learning_rate": 0.0005938566552901024, + "loss": 6.7096, + "step": 3570 + }, + { + "epoch": 1.2187713310580204, + "grad_norm": 3.6696548461914062, + "learning_rate": 0.0005937428896473265, + "loss": 4.3158, + "step": 3571 + }, + { + "epoch": 1.2191126279863482, + "grad_norm": 3.3248839378356934, + "learning_rate": 0.0005936291240045506, + "loss": 6.9934, + "step": 3572 + }, + { + "epoch": 1.2194539249146759, + "grad_norm": 3.5815327167510986, + "learning_rate": 0.0005935153583617747, + "loss": 5.5062, + "step": 3573 + }, + { + "epoch": 1.2197952218430035, + "grad_norm": 3.4461398124694824, + "learning_rate": 0.0005934015927189989, + "loss": 6.7242, + "step": 3574 + }, + { + "epoch": 1.2201365187713311, + "grad_norm": 3.182681083679199, + "learning_rate": 0.000593287827076223, + "loss": 6.8025, + "step": 3575 + }, + { + "epoch": 1.2204778156996587, + "grad_norm": 3.6863200664520264, + "learning_rate": 0.0005931740614334471, + "loss": 6.4275, + "step": 3576 + }, + { + "epoch": 1.2208191126279864, + "grad_norm": 3.2513556480407715, + "learning_rate": 0.0005930602957906712, + "loss": 6.3153, + "step": 3577 + }, + { + "epoch": 1.221160409556314, + "grad_norm": 5.446213722229004, + "learning_rate": 0.0005929465301478953, + "loss": 6.1899, + "step": 3578 + }, + { + "epoch": 1.2215017064846416, + "grad_norm": 5.742640972137451, + "learning_rate": 0.0005928327645051194, + "loss": 4.5945, + "step": 3579 + }, + { + "epoch": 1.2218430034129693, + "grad_norm": 3.372920036315918, + "learning_rate": 0.0005927189988623436, + "loss": 6.4181, + "step": 3580 + }, + { + "epoch": 1.2221843003412969, + "grad_norm": 3.323343515396118, + "learning_rate": 0.0005926052332195677, + "loss": 6.5921, + "step": 3581 + }, + { + "epoch": 1.2225255972696245, + "grad_norm": 3.2678415775299072, + "learning_rate": 0.0005924914675767918, + "loss": 6.544, + "step": 3582 + }, + { + "epoch": 1.2228668941979521, + "grad_norm": 3.0852653980255127, + "learning_rate": 0.000592377701934016, + "loss": 6.5581, + "step": 3583 + }, + { + "epoch": 1.2232081911262798, + "grad_norm": 3.8021903038024902, + "learning_rate": 0.0005922639362912401, + "loss": 5.3767, + "step": 3584 + }, + { + "epoch": 1.2235494880546076, + "grad_norm": 3.143950939178467, + "learning_rate": 0.0005921501706484642, + "loss": 6.1757, + "step": 3585 + }, + { + "epoch": 1.2238907849829352, + "grad_norm": 3.0818896293640137, + "learning_rate": 0.0005920364050056884, + "loss": 6.837, + "step": 3586 + }, + { + "epoch": 1.2242320819112629, + "grad_norm": 2.336493730545044, + "learning_rate": 0.0005919226393629125, + "loss": 3.1151, + "step": 3587 + }, + { + "epoch": 1.2245733788395905, + "grad_norm": 3.16595721244812, + "learning_rate": 0.0005918088737201365, + "loss": 6.2877, + "step": 3588 + }, + { + "epoch": 1.2249146757679181, + "grad_norm": 4.046875, + "learning_rate": 0.0005916951080773606, + "loss": 5.9708, + "step": 3589 + }, + { + "epoch": 1.2252559726962458, + "grad_norm": 3.3334009647369385, + "learning_rate": 0.0005915813424345847, + "loss": 6.8327, + "step": 3590 + }, + { + "epoch": 1.2255972696245734, + "grad_norm": 3.452604055404663, + "learning_rate": 0.0005914675767918088, + "loss": 6.5917, + "step": 3591 + }, + { + "epoch": 1.225938566552901, + "grad_norm": 3.072746753692627, + "learning_rate": 0.000591353811149033, + "loss": 6.3403, + "step": 3592 + }, + { + "epoch": 1.2262798634812286, + "grad_norm": 3.248479127883911, + "learning_rate": 0.0005912400455062571, + "loss": 6.3434, + "step": 3593 + }, + { + "epoch": 1.2266211604095563, + "grad_norm": 3.0548229217529297, + "learning_rate": 0.0005911262798634812, + "loss": 6.6718, + "step": 3594 + }, + { + "epoch": 1.226962457337884, + "grad_norm": 3.1804940700531006, + "learning_rate": 0.0005910125142207053, + "loss": 6.3622, + "step": 3595 + }, + { + "epoch": 1.2273037542662115, + "grad_norm": 3.511061668395996, + "learning_rate": 0.0005908987485779294, + "loss": 6.1706, + "step": 3596 + }, + { + "epoch": 1.2276450511945391, + "grad_norm": 3.2657320499420166, + "learning_rate": 0.0005907849829351536, + "loss": 6.4201, + "step": 3597 + }, + { + "epoch": 1.227986348122867, + "grad_norm": 3.191793918609619, + "learning_rate": 0.0005906712172923777, + "loss": 6.2123, + "step": 3598 + }, + { + "epoch": 1.2283276450511946, + "grad_norm": 3.1244120597839355, + "learning_rate": 0.0005905574516496019, + "loss": 6.2795, + "step": 3599 + }, + { + "epoch": 1.2286689419795223, + "grad_norm": 3.2375831604003906, + "learning_rate": 0.000590443686006826, + "loss": 6.175, + "step": 3600 + }, + { + "epoch": 1.2290102389078499, + "grad_norm": 3.0911331176757812, + "learning_rate": 0.0005903299203640501, + "loss": 5.3192, + "step": 3601 + }, + { + "epoch": 1.2293515358361775, + "grad_norm": 3.3348145484924316, + "learning_rate": 0.0005902161547212742, + "loss": 6.614, + "step": 3602 + }, + { + "epoch": 1.2296928327645051, + "grad_norm": 3.407654047012329, + "learning_rate": 0.0005901023890784984, + "loss": 5.4858, + "step": 3603 + }, + { + "epoch": 1.2300341296928328, + "grad_norm": 3.8060050010681152, + "learning_rate": 0.0005899886234357225, + "loss": 6.4772, + "step": 3604 + }, + { + "epoch": 1.2303754266211604, + "grad_norm": 3.2755227088928223, + "learning_rate": 0.0005898748577929466, + "loss": 6.4768, + "step": 3605 + }, + { + "epoch": 1.230716723549488, + "grad_norm": 3.256535530090332, + "learning_rate": 0.0005897610921501707, + "loss": 6.8184, + "step": 3606 + }, + { + "epoch": 1.2310580204778157, + "grad_norm": 3.2337422370910645, + "learning_rate": 0.0005896473265073948, + "loss": 6.2524, + "step": 3607 + }, + { + "epoch": 1.2313993174061433, + "grad_norm": 3.262669801712036, + "learning_rate": 0.0005895335608646188, + "loss": 5.987, + "step": 3608 + }, + { + "epoch": 1.231740614334471, + "grad_norm": 3.1079490184783936, + "learning_rate": 0.000589419795221843, + "loss": 6.9693, + "step": 3609 + }, + { + "epoch": 1.2320819112627985, + "grad_norm": 3.214324951171875, + "learning_rate": 0.0005893060295790671, + "loss": 6.4031, + "step": 3610 + }, + { + "epoch": 1.2324232081911264, + "grad_norm": 3.1826834678649902, + "learning_rate": 0.0005891922639362912, + "loss": 6.5096, + "step": 3611 + }, + { + "epoch": 1.232764505119454, + "grad_norm": 3.2287216186523438, + "learning_rate": 0.0005890784982935153, + "loss": 6.587, + "step": 3612 + }, + { + "epoch": 1.2331058020477816, + "grad_norm": 3.313389778137207, + "learning_rate": 0.0005889647326507394, + "loss": 6.2603, + "step": 3613 + }, + { + "epoch": 1.2334470989761093, + "grad_norm": 3.2733147144317627, + "learning_rate": 0.0005888509670079636, + "loss": 6.7555, + "step": 3614 + }, + { + "epoch": 1.233788395904437, + "grad_norm": 3.265892505645752, + "learning_rate": 0.0005887372013651877, + "loss": 6.1515, + "step": 3615 + }, + { + "epoch": 1.2341296928327645, + "grad_norm": 3.340308904647827, + "learning_rate": 0.0005886234357224119, + "loss": 6.3667, + "step": 3616 + }, + { + "epoch": 1.2344709897610922, + "grad_norm": 3.3036296367645264, + "learning_rate": 0.000588509670079636, + "loss": 7.1784, + "step": 3617 + }, + { + "epoch": 1.2348122866894198, + "grad_norm": 3.338773250579834, + "learning_rate": 0.0005883959044368601, + "loss": 6.5319, + "step": 3618 + }, + { + "epoch": 1.2351535836177474, + "grad_norm": 3.15885329246521, + "learning_rate": 0.0005882821387940842, + "loss": 6.8377, + "step": 3619 + }, + { + "epoch": 1.235494880546075, + "grad_norm": 3.140784978866577, + "learning_rate": 0.0005881683731513084, + "loss": 6.2892, + "step": 3620 + }, + { + "epoch": 1.2358361774744027, + "grad_norm": 3.2073025703430176, + "learning_rate": 0.0005880546075085325, + "loss": 5.9276, + "step": 3621 + }, + { + "epoch": 1.2361774744027303, + "grad_norm": 4.180164813995361, + "learning_rate": 0.0005879408418657566, + "loss": 5.3202, + "step": 3622 + }, + { + "epoch": 1.236518771331058, + "grad_norm": 3.397432327270508, + "learning_rate": 0.0005878270762229807, + "loss": 6.0951, + "step": 3623 + }, + { + "epoch": 1.2368600682593858, + "grad_norm": 3.3566372394561768, + "learning_rate": 0.0005877133105802048, + "loss": 6.4745, + "step": 3624 + }, + { + "epoch": 1.2372013651877134, + "grad_norm": 3.236814260482788, + "learning_rate": 0.0005875995449374289, + "loss": 6.5627, + "step": 3625 + }, + { + "epoch": 1.237542662116041, + "grad_norm": 3.453336238861084, + "learning_rate": 0.0005874857792946531, + "loss": 5.8774, + "step": 3626 + }, + { + "epoch": 1.2378839590443687, + "grad_norm": 6.298210620880127, + "learning_rate": 0.0005873720136518771, + "loss": 5.7175, + "step": 3627 + }, + { + "epoch": 1.2382252559726963, + "grad_norm": 3.396085023880005, + "learning_rate": 0.0005872582480091012, + "loss": 6.5425, + "step": 3628 + }, + { + "epoch": 1.238566552901024, + "grad_norm": 3.876892566680908, + "learning_rate": 0.0005871444823663253, + "loss": 6.0154, + "step": 3629 + }, + { + "epoch": 1.2389078498293515, + "grad_norm": 3.221203088760376, + "learning_rate": 0.0005870307167235494, + "loss": 6.6467, + "step": 3630 + }, + { + "epoch": 1.2392491467576792, + "grad_norm": 4.505368232727051, + "learning_rate": 0.0005869169510807735, + "loss": 5.1206, + "step": 3631 + }, + { + "epoch": 1.2395904436860068, + "grad_norm": 3.1172850131988525, + "learning_rate": 0.0005868031854379977, + "loss": 6.6021, + "step": 3632 + }, + { + "epoch": 1.2399317406143344, + "grad_norm": 3.2955751419067383, + "learning_rate": 0.0005866894197952219, + "loss": 5.3394, + "step": 3633 + }, + { + "epoch": 1.240273037542662, + "grad_norm": 3.096254348754883, + "learning_rate": 0.000586575654152446, + "loss": 6.3385, + "step": 3634 + }, + { + "epoch": 1.2406143344709897, + "grad_norm": 3.1540637016296387, + "learning_rate": 0.0005864618885096701, + "loss": 6.8789, + "step": 3635 + }, + { + "epoch": 1.2409556313993173, + "grad_norm": 3.2090256214141846, + "learning_rate": 0.0005863481228668942, + "loss": 5.8644, + "step": 3636 + }, + { + "epoch": 1.2412969283276452, + "grad_norm": 3.128225564956665, + "learning_rate": 0.0005862343572241184, + "loss": 6.6566, + "step": 3637 + }, + { + "epoch": 1.2416382252559728, + "grad_norm": 4.46166467666626, + "learning_rate": 0.0005861205915813425, + "loss": 5.8012, + "step": 3638 + }, + { + "epoch": 1.2419795221843004, + "grad_norm": 3.2098820209503174, + "learning_rate": 0.0005860068259385666, + "loss": 6.0215, + "step": 3639 + }, + { + "epoch": 1.242320819112628, + "grad_norm": 4.331230163574219, + "learning_rate": 0.0005858930602957907, + "loss": 5.8321, + "step": 3640 + }, + { + "epoch": 1.2426621160409557, + "grad_norm": 3.4035027027130127, + "learning_rate": 0.0005857792946530148, + "loss": 6.608, + "step": 3641 + }, + { + "epoch": 1.2430034129692833, + "grad_norm": 3.085808515548706, + "learning_rate": 0.0005856655290102389, + "loss": 6.7047, + "step": 3642 + }, + { + "epoch": 1.243344709897611, + "grad_norm": 3.15273380279541, + "learning_rate": 0.0005855517633674631, + "loss": 5.9884, + "step": 3643 + }, + { + "epoch": 1.2436860068259386, + "grad_norm": 3.1481151580810547, + "learning_rate": 0.0005854379977246872, + "loss": 6.578, + "step": 3644 + }, + { + "epoch": 1.2440273037542662, + "grad_norm": 3.1449875831604004, + "learning_rate": 0.0005853242320819113, + "loss": 6.3897, + "step": 3645 + }, + { + "epoch": 1.2443686006825938, + "grad_norm": 3.1563587188720703, + "learning_rate": 0.0005852104664391354, + "loss": 6.6151, + "step": 3646 + }, + { + "epoch": 1.2447098976109214, + "grad_norm": 4.4367146492004395, + "learning_rate": 0.0005850967007963594, + "loss": 5.5777, + "step": 3647 + }, + { + "epoch": 1.245051194539249, + "grad_norm": 3.3431198596954346, + "learning_rate": 0.0005849829351535835, + "loss": 6.4409, + "step": 3648 + }, + { + "epoch": 1.2453924914675767, + "grad_norm": 4.078390121459961, + "learning_rate": 0.0005848691695108077, + "loss": 5.2748, + "step": 3649 + }, + { + "epoch": 1.2457337883959045, + "grad_norm": 3.3056936264038086, + "learning_rate": 0.0005847554038680319, + "loss": 6.1259, + "step": 3650 + }, + { + "epoch": 1.2460750853242322, + "grad_norm": 3.204592227935791, + "learning_rate": 0.000584641638225256, + "loss": 6.6076, + "step": 3651 + }, + { + "epoch": 1.2464163822525598, + "grad_norm": 3.014092445373535, + "learning_rate": 0.0005845278725824801, + "loss": 6.6634, + "step": 3652 + }, + { + "epoch": 1.2467576791808874, + "grad_norm": 2.9963581562042236, + "learning_rate": 0.0005844141069397042, + "loss": 6.0894, + "step": 3653 + }, + { + "epoch": 1.247098976109215, + "grad_norm": 3.0778584480285645, + "learning_rate": 0.0005843003412969284, + "loss": 6.6684, + "step": 3654 + }, + { + "epoch": 1.2474402730375427, + "grad_norm": 3.172811985015869, + "learning_rate": 0.0005841865756541525, + "loss": 6.6487, + "step": 3655 + }, + { + "epoch": 1.2477815699658703, + "grad_norm": 3.1092119216918945, + "learning_rate": 0.0005840728100113766, + "loss": 6.0495, + "step": 3656 + }, + { + "epoch": 1.248122866894198, + "grad_norm": 3.1255362033843994, + "learning_rate": 0.0005839590443686007, + "loss": 6.6304, + "step": 3657 + }, + { + "epoch": 1.2484641638225256, + "grad_norm": 3.0759193897247314, + "learning_rate": 0.0005838452787258248, + "loss": 6.9172, + "step": 3658 + }, + { + "epoch": 1.2488054607508532, + "grad_norm": 3.2935664653778076, + "learning_rate": 0.0005837315130830489, + "loss": 5.9662, + "step": 3659 + }, + { + "epoch": 1.2491467576791808, + "grad_norm": 3.370425224304199, + "learning_rate": 0.0005836177474402731, + "loss": 6.481, + "step": 3660 + }, + { + "epoch": 1.2494880546075084, + "grad_norm": 3.1100013256073, + "learning_rate": 0.0005835039817974972, + "loss": 6.6346, + "step": 3661 + }, + { + "epoch": 1.249829351535836, + "grad_norm": 3.2249128818511963, + "learning_rate": 0.0005833902161547213, + "loss": 6.2269, + "step": 3662 + }, + { + "epoch": 1.250170648464164, + "grad_norm": 4.628625869750977, + "learning_rate": 0.0005832764505119454, + "loss": 5.2683, + "step": 3663 + }, + { + "epoch": 1.2505119453924913, + "grad_norm": 3.3335788249969482, + "learning_rate": 0.0005831626848691695, + "loss": 5.914, + "step": 3664 + }, + { + "epoch": 1.2508532423208192, + "grad_norm": 3.5215747356414795, + "learning_rate": 0.0005830489192263936, + "loss": 5.747, + "step": 3665 + }, + { + "epoch": 1.2511945392491468, + "grad_norm": 3.2354788780212402, + "learning_rate": 0.0005829351535836177, + "loss": 5.9926, + "step": 3666 + }, + { + "epoch": 1.2515358361774744, + "grad_norm": 3.289290428161621, + "learning_rate": 0.0005828213879408419, + "loss": 6.9403, + "step": 3667 + }, + { + "epoch": 1.251877133105802, + "grad_norm": 3.2592785358428955, + "learning_rate": 0.000582707622298066, + "loss": 6.102, + "step": 3668 + }, + { + "epoch": 1.2522184300341297, + "grad_norm": 3.279924154281616, + "learning_rate": 0.0005825938566552901, + "loss": 6.5536, + "step": 3669 + }, + { + "epoch": 1.2525597269624573, + "grad_norm": 6.693105697631836, + "learning_rate": 0.0005824800910125142, + "loss": 4.9568, + "step": 3670 + }, + { + "epoch": 1.252901023890785, + "grad_norm": 3.1594643592834473, + "learning_rate": 0.0005823663253697383, + "loss": 6.6421, + "step": 3671 + }, + { + "epoch": 1.2532423208191126, + "grad_norm": 3.6135575771331787, + "learning_rate": 0.0005822525597269625, + "loss": 5.984, + "step": 3672 + }, + { + "epoch": 1.2535836177474402, + "grad_norm": 3.2949254512786865, + "learning_rate": 0.0005821387940841866, + "loss": 6.6492, + "step": 3673 + }, + { + "epoch": 1.253924914675768, + "grad_norm": 3.300008773803711, + "learning_rate": 0.0005820250284414107, + "loss": 5.9884, + "step": 3674 + }, + { + "epoch": 1.2542662116040955, + "grad_norm": 3.126598596572876, + "learning_rate": 0.0005819112627986348, + "loss": 6.3651, + "step": 3675 + }, + { + "epoch": 1.2546075085324233, + "grad_norm": 3.004107713699341, + "learning_rate": 0.0005817974971558589, + "loss": 6.3782, + "step": 3676 + }, + { + "epoch": 1.2549488054607507, + "grad_norm": 3.3004016876220703, + "learning_rate": 0.0005816837315130831, + "loss": 5.9222, + "step": 3677 + }, + { + "epoch": 1.2552901023890786, + "grad_norm": 3.169092893600464, + "learning_rate": 0.0005815699658703072, + "loss": 6.5686, + "step": 3678 + }, + { + "epoch": 1.2556313993174062, + "grad_norm": 4.297203063964844, + "learning_rate": 0.0005814562002275313, + "loss": 5.7196, + "step": 3679 + }, + { + "epoch": 1.2559726962457338, + "grad_norm": 3.2692508697509766, + "learning_rate": 0.0005813424345847554, + "loss": 6.7681, + "step": 3680 + }, + { + "epoch": 1.2563139931740614, + "grad_norm": 3.4113283157348633, + "learning_rate": 0.0005812286689419795, + "loss": 5.9537, + "step": 3681 + }, + { + "epoch": 1.256655290102389, + "grad_norm": 3.1349599361419678, + "learning_rate": 0.0005811149032992036, + "loss": 6.4065, + "step": 3682 + }, + { + "epoch": 1.2569965870307167, + "grad_norm": 3.8762474060058594, + "learning_rate": 0.0005810011376564279, + "loss": 6.0212, + "step": 3683 + }, + { + "epoch": 1.2573378839590443, + "grad_norm": 3.2077221870422363, + "learning_rate": 0.000580887372013652, + "loss": 6.3899, + "step": 3684 + }, + { + "epoch": 1.257679180887372, + "grad_norm": 3.3499104976654053, + "learning_rate": 0.0005807736063708761, + "loss": 6.1954, + "step": 3685 + }, + { + "epoch": 1.2580204778156996, + "grad_norm": 3.0911295413970947, + "learning_rate": 0.0005806598407281001, + "loss": 6.2744, + "step": 3686 + }, + { + "epoch": 1.2583617747440274, + "grad_norm": 3.294782876968384, + "learning_rate": 0.0005805460750853242, + "loss": 6.4801, + "step": 3687 + }, + { + "epoch": 1.2587030716723548, + "grad_norm": 3.7116618156433105, + "learning_rate": 0.0005804323094425483, + "loss": 5.8219, + "step": 3688 + }, + { + "epoch": 1.2590443686006827, + "grad_norm": 3.1158978939056396, + "learning_rate": 0.0005803185437997725, + "loss": 6.1431, + "step": 3689 + }, + { + "epoch": 1.25938566552901, + "grad_norm": 3.4460155963897705, + "learning_rate": 0.0005802047781569966, + "loss": 6.9438, + "step": 3690 + }, + { + "epoch": 1.259726962457338, + "grad_norm": 3.296086311340332, + "learning_rate": 0.0005800910125142207, + "loss": 6.5641, + "step": 3691 + }, + { + "epoch": 1.2600682593856656, + "grad_norm": 3.484635591506958, + "learning_rate": 0.0005799772468714448, + "loss": 6.6866, + "step": 3692 + }, + { + "epoch": 1.2604095563139932, + "grad_norm": 3.001859664916992, + "learning_rate": 0.0005798634812286689, + "loss": 6.845, + "step": 3693 + }, + { + "epoch": 1.2607508532423208, + "grad_norm": 3.264299154281616, + "learning_rate": 0.0005797497155858931, + "loss": 6.5779, + "step": 3694 + }, + { + "epoch": 1.2610921501706485, + "grad_norm": 4.64964485168457, + "learning_rate": 0.0005796359499431172, + "loss": 6.1898, + "step": 3695 + }, + { + "epoch": 1.261433447098976, + "grad_norm": 3.335503339767456, + "learning_rate": 0.0005795221843003413, + "loss": 6.6275, + "step": 3696 + }, + { + "epoch": 1.2617747440273037, + "grad_norm": 3.441882610321045, + "learning_rate": 0.0005794084186575654, + "loss": 6.1924, + "step": 3697 + }, + { + "epoch": 1.2621160409556313, + "grad_norm": 3.124157428741455, + "learning_rate": 0.0005792946530147895, + "loss": 6.4121, + "step": 3698 + }, + { + "epoch": 1.262457337883959, + "grad_norm": 3.6481199264526367, + "learning_rate": 0.0005791808873720136, + "loss": 6.0989, + "step": 3699 + }, + { + "epoch": 1.2627986348122868, + "grad_norm": 3.1325788497924805, + "learning_rate": 0.0005790671217292379, + "loss": 6.6209, + "step": 3700 + }, + { + "epoch": 1.2631399317406142, + "grad_norm": 3.0426456928253174, + "learning_rate": 0.000578953356086462, + "loss": 6.2306, + "step": 3701 + }, + { + "epoch": 1.263481228668942, + "grad_norm": 3.1229195594787598, + "learning_rate": 0.0005788395904436861, + "loss": 6.9213, + "step": 3702 + }, + { + "epoch": 1.2638225255972695, + "grad_norm": 3.100487232208252, + "learning_rate": 0.0005787258248009102, + "loss": 6.5218, + "step": 3703 + }, + { + "epoch": 1.2641638225255973, + "grad_norm": 3.2305166721343994, + "learning_rate": 0.0005786120591581343, + "loss": 6.4707, + "step": 3704 + }, + { + "epoch": 1.264505119453925, + "grad_norm": 3.1007251739501953, + "learning_rate": 0.0005784982935153583, + "loss": 7.0472, + "step": 3705 + }, + { + "epoch": 1.2648464163822526, + "grad_norm": 3.452636480331421, + "learning_rate": 0.0005783845278725825, + "loss": 6.0918, + "step": 3706 + }, + { + "epoch": 1.2651877133105802, + "grad_norm": 3.7483577728271484, + "learning_rate": 0.0005782707622298066, + "loss": 5.3098, + "step": 3707 + }, + { + "epoch": 1.2655290102389078, + "grad_norm": 3.2747714519500732, + "learning_rate": 0.0005781569965870307, + "loss": 6.4773, + "step": 3708 + }, + { + "epoch": 1.2658703071672355, + "grad_norm": 3.334418773651123, + "learning_rate": 0.0005780432309442548, + "loss": 5.9476, + "step": 3709 + }, + { + "epoch": 1.266211604095563, + "grad_norm": 3.388519287109375, + "learning_rate": 0.0005779294653014789, + "loss": 6.7832, + "step": 3710 + }, + { + "epoch": 1.2665529010238907, + "grad_norm": 3.2145025730133057, + "learning_rate": 0.000577815699658703, + "loss": 7.1023, + "step": 3711 + }, + { + "epoch": 1.2668941979522184, + "grad_norm": 3.1737163066864014, + "learning_rate": 0.0005777019340159272, + "loss": 6.2078, + "step": 3712 + }, + { + "epoch": 1.2672354948805462, + "grad_norm": 3.514313220977783, + "learning_rate": 0.0005775881683731513, + "loss": 5.9245, + "step": 3713 + }, + { + "epoch": 1.2675767918088736, + "grad_norm": 3.119048595428467, + "learning_rate": 0.0005774744027303754, + "loss": 6.1917, + "step": 3714 + }, + { + "epoch": 1.2679180887372015, + "grad_norm": 3.2024073600769043, + "learning_rate": 0.0005773606370875995, + "loss": 6.2773, + "step": 3715 + }, + { + "epoch": 1.268259385665529, + "grad_norm": 3.101339817047119, + "learning_rate": 0.0005772468714448236, + "loss": 6.6976, + "step": 3716 + }, + { + "epoch": 1.2686006825938567, + "grad_norm": 4.5886616706848145, + "learning_rate": 0.0005771331058020479, + "loss": 5.4188, + "step": 3717 + }, + { + "epoch": 1.2689419795221843, + "grad_norm": 3.2115495204925537, + "learning_rate": 0.000577019340159272, + "loss": 6.3656, + "step": 3718 + }, + { + "epoch": 1.269283276450512, + "grad_norm": 3.105013608932495, + "learning_rate": 0.0005769055745164961, + "loss": 6.04, + "step": 3719 + }, + { + "epoch": 1.2696245733788396, + "grad_norm": 3.715817451477051, + "learning_rate": 0.0005767918088737202, + "loss": 6.0363, + "step": 3720 + }, + { + "epoch": 1.2699658703071672, + "grad_norm": 3.0130434036254883, + "learning_rate": 0.0005766780432309443, + "loss": 6.3197, + "step": 3721 + }, + { + "epoch": 1.2703071672354949, + "grad_norm": 3.1294775009155273, + "learning_rate": 0.0005765642775881684, + "loss": 6.4423, + "step": 3722 + }, + { + "epoch": 1.2706484641638225, + "grad_norm": 3.1962530612945557, + "learning_rate": 0.0005764505119453926, + "loss": 6.3955, + "step": 3723 + }, + { + "epoch": 1.2709897610921501, + "grad_norm": 3.1761527061462402, + "learning_rate": 0.0005763367463026167, + "loss": 6.3929, + "step": 3724 + }, + { + "epoch": 1.2713310580204777, + "grad_norm": 3.09688138961792, + "learning_rate": 0.0005762229806598407, + "loss": 6.5535, + "step": 3725 + }, + { + "epoch": 1.2716723549488056, + "grad_norm": 3.14316463470459, + "learning_rate": 0.0005761092150170648, + "loss": 6.3527, + "step": 3726 + }, + { + "epoch": 1.272013651877133, + "grad_norm": 2.774230480194092, + "learning_rate": 0.0005759954493742889, + "loss": 3.1619, + "step": 3727 + }, + { + "epoch": 1.2723549488054609, + "grad_norm": 3.723099946975708, + "learning_rate": 0.000575881683731513, + "loss": 6.4016, + "step": 3728 + }, + { + "epoch": 1.2726962457337885, + "grad_norm": 3.691892385482788, + "learning_rate": 0.0005757679180887372, + "loss": 5.958, + "step": 3729 + }, + { + "epoch": 1.273037542662116, + "grad_norm": 3.364720582962036, + "learning_rate": 0.0005756541524459613, + "loss": 6.9903, + "step": 3730 + }, + { + "epoch": 1.2733788395904437, + "grad_norm": 3.500404119491577, + "learning_rate": 0.0005755403868031854, + "loss": 6.1624, + "step": 3731 + }, + { + "epoch": 1.2737201365187714, + "grad_norm": 4.1718597412109375, + "learning_rate": 0.0005754266211604095, + "loss": 6.2387, + "step": 3732 + }, + { + "epoch": 1.274061433447099, + "grad_norm": 3.1191086769104004, + "learning_rate": 0.0005753128555176336, + "loss": 7.2727, + "step": 3733 + }, + { + "epoch": 1.2744027303754266, + "grad_norm": 3.2956604957580566, + "learning_rate": 0.0005751990898748578, + "loss": 6.6461, + "step": 3734 + }, + { + "epoch": 1.2747440273037542, + "grad_norm": 3.2914106845855713, + "learning_rate": 0.000575085324232082, + "loss": 6.8773, + "step": 3735 + }, + { + "epoch": 1.2750853242320819, + "grad_norm": 3.343107223510742, + "learning_rate": 0.0005749715585893061, + "loss": 6.4053, + "step": 3736 + }, + { + "epoch": 1.2754266211604095, + "grad_norm": 3.184845447540283, + "learning_rate": 0.0005748577929465302, + "loss": 5.8288, + "step": 3737 + }, + { + "epoch": 1.2757679180887371, + "grad_norm": 3.4929275512695312, + "learning_rate": 0.0005747440273037543, + "loss": 6.2784, + "step": 3738 + }, + { + "epoch": 1.276109215017065, + "grad_norm": 3.1286535263061523, + "learning_rate": 0.0005746302616609784, + "loss": 6.8593, + "step": 3739 + }, + { + "epoch": 1.2764505119453924, + "grad_norm": 3.4388904571533203, + "learning_rate": 0.0005745164960182026, + "loss": 5.3746, + "step": 3740 + }, + { + "epoch": 1.2767918088737202, + "grad_norm": 3.239522933959961, + "learning_rate": 0.0005744027303754267, + "loss": 5.9408, + "step": 3741 + }, + { + "epoch": 1.2771331058020479, + "grad_norm": 3.316533088684082, + "learning_rate": 0.0005742889647326508, + "loss": 6.9465, + "step": 3742 + }, + { + "epoch": 1.2774744027303755, + "grad_norm": 3.470165967941284, + "learning_rate": 0.0005741751990898749, + "loss": 5.991, + "step": 3743 + }, + { + "epoch": 1.2778156996587031, + "grad_norm": 3.250302314758301, + "learning_rate": 0.0005740614334470989, + "loss": 6.1206, + "step": 3744 + }, + { + "epoch": 1.2781569965870307, + "grad_norm": 3.4087443351745605, + "learning_rate": 0.000573947667804323, + "loss": 6.4992, + "step": 3745 + }, + { + "epoch": 1.2784982935153584, + "grad_norm": 3.2577805519104004, + "learning_rate": 0.0005738339021615472, + "loss": 6.3207, + "step": 3746 + }, + { + "epoch": 1.278839590443686, + "grad_norm": 4.152851104736328, + "learning_rate": 0.0005737201365187713, + "loss": 4.8242, + "step": 3747 + }, + { + "epoch": 1.2791808873720136, + "grad_norm": 3.181995391845703, + "learning_rate": 0.0005736063708759954, + "loss": 6.548, + "step": 3748 + }, + { + "epoch": 1.2795221843003413, + "grad_norm": 3.4868032932281494, + "learning_rate": 0.0005734926052332195, + "loss": 6.2143, + "step": 3749 + }, + { + "epoch": 1.2798634812286689, + "grad_norm": 3.12666392326355, + "learning_rate": 0.0005733788395904436, + "loss": 6.9013, + "step": 3750 + }, + { + "epoch": 1.2802047781569965, + "grad_norm": 3.1087594032287598, + "learning_rate": 0.0005732650739476678, + "loss": 6.8698, + "step": 3751 + }, + { + "epoch": 1.2805460750853244, + "grad_norm": 3.2161049842834473, + "learning_rate": 0.000573151308304892, + "loss": 6.4872, + "step": 3752 + }, + { + "epoch": 1.2808873720136518, + "grad_norm": 3.140566110610962, + "learning_rate": 0.0005730375426621161, + "loss": 6.7521, + "step": 3753 + }, + { + "epoch": 1.2812286689419796, + "grad_norm": 3.375011920928955, + "learning_rate": 0.0005729237770193402, + "loss": 6.361, + "step": 3754 + }, + { + "epoch": 1.2815699658703072, + "grad_norm": 3.8893067836761475, + "learning_rate": 0.0005728100113765643, + "loss": 6.0318, + "step": 3755 + }, + { + "epoch": 1.2819112627986349, + "grad_norm": 3.1433284282684326, + "learning_rate": 0.0005726962457337884, + "loss": 6.0137, + "step": 3756 + }, + { + "epoch": 1.2822525597269625, + "grad_norm": 3.2328553199768066, + "learning_rate": 0.0005725824800910126, + "loss": 6.3901, + "step": 3757 + }, + { + "epoch": 1.2825938566552901, + "grad_norm": 3.0348122119903564, + "learning_rate": 0.0005724687144482367, + "loss": 6.2821, + "step": 3758 + }, + { + "epoch": 1.2829351535836178, + "grad_norm": 3.163795232772827, + "learning_rate": 0.0005723549488054608, + "loss": 6.2773, + "step": 3759 + }, + { + "epoch": 1.2832764505119454, + "grad_norm": 3.1185197830200195, + "learning_rate": 0.0005722411831626849, + "loss": 6.575, + "step": 3760 + }, + { + "epoch": 1.283617747440273, + "grad_norm": 4.635683536529541, + "learning_rate": 0.000572127417519909, + "loss": 5.0669, + "step": 3761 + }, + { + "epoch": 1.2839590443686006, + "grad_norm": 5.593306541442871, + "learning_rate": 0.0005720136518771331, + "loss": 5.0333, + "step": 3762 + }, + { + "epoch": 1.2843003412969283, + "grad_norm": 3.199862241744995, + "learning_rate": 0.0005718998862343572, + "loss": 6.5938, + "step": 3763 + }, + { + "epoch": 1.284641638225256, + "grad_norm": 3.554161787033081, + "learning_rate": 0.0005717861205915813, + "loss": 6.2362, + "step": 3764 + }, + { + "epoch": 1.2849829351535837, + "grad_norm": 3.397326946258545, + "learning_rate": 0.0005716723549488054, + "loss": 6.0153, + "step": 3765 + }, + { + "epoch": 1.2853242320819112, + "grad_norm": 3.148334264755249, + "learning_rate": 0.0005715585893060295, + "loss": 6.2978, + "step": 3766 + }, + { + "epoch": 1.285665529010239, + "grad_norm": 4.568434715270996, + "learning_rate": 0.0005714448236632536, + "loss": 5.762, + "step": 3767 + }, + { + "epoch": 1.2860068259385666, + "grad_norm": 3.284611225128174, + "learning_rate": 0.0005713310580204778, + "loss": 6.6268, + "step": 3768 + }, + { + "epoch": 1.2863481228668943, + "grad_norm": 14.985135078430176, + "learning_rate": 0.000571217292377702, + "loss": 4.1334, + "step": 3769 + }, + { + "epoch": 1.286689419795222, + "grad_norm": 3.237342119216919, + "learning_rate": 0.0005711035267349261, + "loss": 6.3474, + "step": 3770 + }, + { + "epoch": 1.2870307167235495, + "grad_norm": 3.287044048309326, + "learning_rate": 0.0005709897610921502, + "loss": 6.2234, + "step": 3771 + }, + { + "epoch": 1.2873720136518771, + "grad_norm": 3.0932626724243164, + "learning_rate": 0.0005708759954493743, + "loss": 6.8984, + "step": 3772 + }, + { + "epoch": 1.2877133105802048, + "grad_norm": 4.495637893676758, + "learning_rate": 0.0005707622298065984, + "loss": 5.0791, + "step": 3773 + }, + { + "epoch": 1.2880546075085324, + "grad_norm": 3.20120906829834, + "learning_rate": 0.0005706484641638225, + "loss": 6.4873, + "step": 3774 + }, + { + "epoch": 1.28839590443686, + "grad_norm": 3.146425485610962, + "learning_rate": 0.0005705346985210467, + "loss": 6.2309, + "step": 3775 + }, + { + "epoch": 1.2887372013651877, + "grad_norm": 3.152387857437134, + "learning_rate": 0.0005704209328782708, + "loss": 6.3663, + "step": 3776 + }, + { + "epoch": 1.2890784982935153, + "grad_norm": 3.333812952041626, + "learning_rate": 0.0005703071672354949, + "loss": 6.1972, + "step": 3777 + }, + { + "epoch": 1.2894197952218431, + "grad_norm": 4.746973037719727, + "learning_rate": 0.000570193401592719, + "loss": 4.9203, + "step": 3778 + }, + { + "epoch": 1.2897610921501705, + "grad_norm": 5.360435962677002, + "learning_rate": 0.0005700796359499431, + "loss": 5.874, + "step": 3779 + }, + { + "epoch": 1.2901023890784984, + "grad_norm": 3.3603954315185547, + "learning_rate": 0.0005699658703071673, + "loss": 6.2611, + "step": 3780 + }, + { + "epoch": 1.290443686006826, + "grad_norm": 3.251047134399414, + "learning_rate": 0.0005698521046643914, + "loss": 5.6341, + "step": 3781 + }, + { + "epoch": 1.2907849829351536, + "grad_norm": 3.316932201385498, + "learning_rate": 0.0005697383390216155, + "loss": 6.9426, + "step": 3782 + }, + { + "epoch": 1.2911262798634813, + "grad_norm": 5.679823398590088, + "learning_rate": 0.0005696245733788395, + "loss": 5.4432, + "step": 3783 + }, + { + "epoch": 1.291467576791809, + "grad_norm": 3.251844644546509, + "learning_rate": 0.0005695108077360637, + "loss": 6.9916, + "step": 3784 + }, + { + "epoch": 1.2918088737201365, + "grad_norm": 3.1990809440612793, + "learning_rate": 0.0005693970420932878, + "loss": 6.6435, + "step": 3785 + }, + { + "epoch": 1.2921501706484642, + "grad_norm": 3.5617237091064453, + "learning_rate": 0.000569283276450512, + "loss": 5.8256, + "step": 3786 + }, + { + "epoch": 1.2924914675767918, + "grad_norm": 3.2828528881073, + "learning_rate": 0.0005691695108077361, + "loss": 3.0921, + "step": 3787 + }, + { + "epoch": 1.2928327645051194, + "grad_norm": 4.945265769958496, + "learning_rate": 0.0005690557451649602, + "loss": 5.9462, + "step": 3788 + }, + { + "epoch": 1.293174061433447, + "grad_norm": 4.702214241027832, + "learning_rate": 0.0005689419795221843, + "loss": 5.9888, + "step": 3789 + }, + { + "epoch": 1.2935153583617747, + "grad_norm": 3.3534915447235107, + "learning_rate": 0.0005688282138794084, + "loss": 6.2144, + "step": 3790 + }, + { + "epoch": 1.2938566552901025, + "grad_norm": 3.387922525405884, + "learning_rate": 0.0005687144482366325, + "loss": 5.9508, + "step": 3791 + }, + { + "epoch": 1.29419795221843, + "grad_norm": 3.280163049697876, + "learning_rate": 0.0005686006825938567, + "loss": 7.1366, + "step": 3792 + }, + { + "epoch": 1.2945392491467578, + "grad_norm": 3.0818960666656494, + "learning_rate": 0.0005684869169510808, + "loss": 6.249, + "step": 3793 + }, + { + "epoch": 1.2948805460750854, + "grad_norm": 3.1594650745391846, + "learning_rate": 0.0005683731513083049, + "loss": 6.8152, + "step": 3794 + }, + { + "epoch": 1.295221843003413, + "grad_norm": 3.0736000537872314, + "learning_rate": 0.000568259385665529, + "loss": 6.0188, + "step": 3795 + }, + { + "epoch": 1.2955631399317407, + "grad_norm": 3.239778757095337, + "learning_rate": 0.0005681456200227531, + "loss": 6.2566, + "step": 3796 + }, + { + "epoch": 1.2959044368600683, + "grad_norm": 3.039710521697998, + "learning_rate": 0.0005680318543799773, + "loss": 6.3336, + "step": 3797 + }, + { + "epoch": 1.296245733788396, + "grad_norm": 3.1235196590423584, + "learning_rate": 0.0005679180887372014, + "loss": 6.0182, + "step": 3798 + }, + { + "epoch": 1.2965870307167235, + "grad_norm": 3.3898894786834717, + "learning_rate": 0.0005678043230944255, + "loss": 5.9916, + "step": 3799 + }, + { + "epoch": 1.2969283276450512, + "grad_norm": 3.1873528957366943, + "learning_rate": 0.0005676905574516497, + "loss": 6.7079, + "step": 3800 + }, + { + "epoch": 1.2972696245733788, + "grad_norm": 3.133732557296753, + "learning_rate": 0.0005675767918088738, + "loss": 6.0902, + "step": 3801 + }, + { + "epoch": 1.2976109215017064, + "grad_norm": 3.195161819458008, + "learning_rate": 0.0005674630261660978, + "loss": 6.3961, + "step": 3802 + }, + { + "epoch": 1.297952218430034, + "grad_norm": 3.204354763031006, + "learning_rate": 0.000567349260523322, + "loss": 5.9973, + "step": 3803 + }, + { + "epoch": 1.298293515358362, + "grad_norm": 3.1573190689086914, + "learning_rate": 0.0005672354948805461, + "loss": 6.5401, + "step": 3804 + }, + { + "epoch": 1.2986348122866893, + "grad_norm": 3.1424853801727295, + "learning_rate": 0.0005671217292377702, + "loss": 6.5217, + "step": 3805 + }, + { + "epoch": 1.2989761092150172, + "grad_norm": 3.2337796688079834, + "learning_rate": 0.0005670079635949943, + "loss": 6.818, + "step": 3806 + }, + { + "epoch": 1.2993174061433448, + "grad_norm": 3.1818370819091797, + "learning_rate": 0.0005668941979522184, + "loss": 6.4562, + "step": 3807 + }, + { + "epoch": 1.2996587030716724, + "grad_norm": 3.1999707221984863, + "learning_rate": 0.0005667804323094425, + "loss": 6.2445, + "step": 3808 + }, + { + "epoch": 1.3, + "grad_norm": 3.0630509853363037, + "learning_rate": 0.0005666666666666667, + "loss": 6.6849, + "step": 3809 + }, + { + "epoch": 1.3003412969283277, + "grad_norm": 3.165708541870117, + "learning_rate": 0.0005665529010238908, + "loss": 6.9432, + "step": 3810 + }, + { + "epoch": 1.3006825938566553, + "grad_norm": 3.325191020965576, + "learning_rate": 0.0005664391353811149, + "loss": 6.0563, + "step": 3811 + }, + { + "epoch": 1.301023890784983, + "grad_norm": 3.1484081745147705, + "learning_rate": 0.000566325369738339, + "loss": 6.8012, + "step": 3812 + }, + { + "epoch": 1.3013651877133106, + "grad_norm": 3.0226058959960938, + "learning_rate": 0.0005662116040955631, + "loss": 6.4212, + "step": 3813 + }, + { + "epoch": 1.3017064846416382, + "grad_norm": 3.7095937728881836, + "learning_rate": 0.0005660978384527872, + "loss": 5.5472, + "step": 3814 + }, + { + "epoch": 1.3020477815699658, + "grad_norm": 8.578839302062988, + "learning_rate": 0.0005659840728100114, + "loss": 6.6667, + "step": 3815 + }, + { + "epoch": 1.3023890784982934, + "grad_norm": 3.352297067642212, + "learning_rate": 0.0005658703071672356, + "loss": 6.7404, + "step": 3816 + }, + { + "epoch": 1.3027303754266213, + "grad_norm": 3.2766904830932617, + "learning_rate": 0.0005657565415244597, + "loss": 5.9404, + "step": 3817 + }, + { + "epoch": 1.3030716723549487, + "grad_norm": 3.1568057537078857, + "learning_rate": 0.0005656427758816838, + "loss": 6.3997, + "step": 3818 + }, + { + "epoch": 1.3034129692832765, + "grad_norm": 3.079411029815674, + "learning_rate": 0.0005655290102389079, + "loss": 6.5721, + "step": 3819 + }, + { + "epoch": 1.3037542662116042, + "grad_norm": 3.156961679458618, + "learning_rate": 0.0005654152445961321, + "loss": 6.3826, + "step": 3820 + }, + { + "epoch": 1.3040955631399318, + "grad_norm": 3.102477788925171, + "learning_rate": 0.0005653014789533562, + "loss": 6.4791, + "step": 3821 + }, + { + "epoch": 1.3044368600682594, + "grad_norm": 3.097460985183716, + "learning_rate": 0.0005651877133105802, + "loss": 6.6575, + "step": 3822 + }, + { + "epoch": 1.304778156996587, + "grad_norm": 3.1785974502563477, + "learning_rate": 0.0005650739476678043, + "loss": 6.6686, + "step": 3823 + }, + { + "epoch": 1.3051194539249147, + "grad_norm": 3.1115939617156982, + "learning_rate": 0.0005649601820250284, + "loss": 6.6035, + "step": 3824 + }, + { + "epoch": 1.3054607508532423, + "grad_norm": 3.2212889194488525, + "learning_rate": 0.0005648464163822525, + "loss": 6.1973, + "step": 3825 + }, + { + "epoch": 1.30580204778157, + "grad_norm": 3.3271737098693848, + "learning_rate": 0.0005647326507394767, + "loss": 5.9092, + "step": 3826 + }, + { + "epoch": 1.3061433447098976, + "grad_norm": 3.7578651905059814, + "learning_rate": 0.0005646188850967008, + "loss": 5.8015, + "step": 3827 + }, + { + "epoch": 1.3064846416382252, + "grad_norm": 3.2217273712158203, + "learning_rate": 0.0005645051194539249, + "loss": 6.6484, + "step": 3828 + }, + { + "epoch": 1.3068259385665528, + "grad_norm": 3.182534694671631, + "learning_rate": 0.000564391353811149, + "loss": 6.494, + "step": 3829 + }, + { + "epoch": 1.3071672354948807, + "grad_norm": 3.2115378379821777, + "learning_rate": 0.0005642775881683731, + "loss": 6.4137, + "step": 3830 + }, + { + "epoch": 1.307508532423208, + "grad_norm": 3.060603618621826, + "learning_rate": 0.0005641638225255972, + "loss": 6.485, + "step": 3831 + }, + { + "epoch": 1.307849829351536, + "grad_norm": 3.5397446155548096, + "learning_rate": 0.0005640500568828214, + "loss": 6.3132, + "step": 3832 + }, + { + "epoch": 1.3081911262798636, + "grad_norm": 3.190903663635254, + "learning_rate": 0.0005639362912400456, + "loss": 5.9605, + "step": 3833 + }, + { + "epoch": 1.3085324232081912, + "grad_norm": 6.479193210601807, + "learning_rate": 0.0005638225255972697, + "loss": 5.4088, + "step": 3834 + }, + { + "epoch": 1.3088737201365188, + "grad_norm": 3.177168846130371, + "learning_rate": 0.0005637087599544938, + "loss": 6.6025, + "step": 3835 + }, + { + "epoch": 1.3092150170648464, + "grad_norm": 3.4449946880340576, + "learning_rate": 0.0005635949943117179, + "loss": 5.3262, + "step": 3836 + }, + { + "epoch": 1.309556313993174, + "grad_norm": 3.2236359119415283, + "learning_rate": 0.0005634812286689421, + "loss": 6.2613, + "step": 3837 + }, + { + "epoch": 1.3098976109215017, + "grad_norm": 3.60327410697937, + "learning_rate": 0.0005633674630261662, + "loss": 5.2672, + "step": 3838 + }, + { + "epoch": 1.3102389078498293, + "grad_norm": 2.984083890914917, + "learning_rate": 0.0005632536973833903, + "loss": 6.2554, + "step": 3839 + }, + { + "epoch": 1.310580204778157, + "grad_norm": 3.2155168056488037, + "learning_rate": 0.0005631399317406144, + "loss": 6.54, + "step": 3840 + }, + { + "epoch": 1.3109215017064846, + "grad_norm": 3.312378168106079, + "learning_rate": 0.0005630261660978384, + "loss": 6.4537, + "step": 3841 + }, + { + "epoch": 1.3112627986348122, + "grad_norm": 3.047915458679199, + "learning_rate": 0.0005629124004550625, + "loss": 6.4776, + "step": 3842 + }, + { + "epoch": 1.31160409556314, + "grad_norm": 3.2649431228637695, + "learning_rate": 0.0005627986348122867, + "loss": 6.1978, + "step": 3843 + }, + { + "epoch": 1.3119453924914675, + "grad_norm": 3.579108238220215, + "learning_rate": 0.0005626848691695108, + "loss": 6.1248, + "step": 3844 + }, + { + "epoch": 1.3122866894197953, + "grad_norm": 5.931406497955322, + "learning_rate": 0.0005625711035267349, + "loss": 5.7591, + "step": 3845 + }, + { + "epoch": 1.312627986348123, + "grad_norm": 3.2167887687683105, + "learning_rate": 0.000562457337883959, + "loss": 6.8081, + "step": 3846 + }, + { + "epoch": 1.3129692832764506, + "grad_norm": 3.454554557800293, + "learning_rate": 0.0005623435722411831, + "loss": 6.5649, + "step": 3847 + }, + { + "epoch": 1.3133105802047782, + "grad_norm": 7.931427478790283, + "learning_rate": 0.0005622298065984072, + "loss": 4.5475, + "step": 3848 + }, + { + "epoch": 1.3136518771331058, + "grad_norm": 3.3232662677764893, + "learning_rate": 0.0005621160409556314, + "loss": 6.3771, + "step": 3849 + }, + { + "epoch": 1.3139931740614335, + "grad_norm": 3.6749861240386963, + "learning_rate": 0.0005620022753128556, + "loss": 6.2799, + "step": 3850 + }, + { + "epoch": 1.314334470989761, + "grad_norm": 3.1684978008270264, + "learning_rate": 0.0005618885096700797, + "loss": 6.4631, + "step": 3851 + }, + { + "epoch": 1.3146757679180887, + "grad_norm": 3.638622283935547, + "learning_rate": 0.0005617747440273038, + "loss": 5.7704, + "step": 3852 + }, + { + "epoch": 1.3150170648464163, + "grad_norm": 3.2697503566741943, + "learning_rate": 0.0005616609783845279, + "loss": 5.7991, + "step": 3853 + }, + { + "epoch": 1.315358361774744, + "grad_norm": 3.5689799785614014, + "learning_rate": 0.000561547212741752, + "loss": 5.2592, + "step": 3854 + }, + { + "epoch": 1.3156996587030716, + "grad_norm": 3.1521520614624023, + "learning_rate": 0.0005614334470989762, + "loss": 6.3141, + "step": 3855 + }, + { + "epoch": 1.3160409556313994, + "grad_norm": 3.137810468673706, + "learning_rate": 0.0005613196814562003, + "loss": 6.8293, + "step": 3856 + }, + { + "epoch": 1.3163822525597269, + "grad_norm": 3.1368062496185303, + "learning_rate": 0.0005612059158134244, + "loss": 6.7367, + "step": 3857 + }, + { + "epoch": 1.3167235494880547, + "grad_norm": 3.516813278198242, + "learning_rate": 0.0005610921501706485, + "loss": 6.4741, + "step": 3858 + }, + { + "epoch": 1.3170648464163823, + "grad_norm": 5.221506118774414, + "learning_rate": 0.0005609783845278726, + "loss": 5.8297, + "step": 3859 + }, + { + "epoch": 1.31740614334471, + "grad_norm": 3.2400710582733154, + "learning_rate": 0.0005608646188850968, + "loss": 7.1234, + "step": 3860 + }, + { + "epoch": 1.3177474402730376, + "grad_norm": 3.2599074840545654, + "learning_rate": 0.0005607508532423208, + "loss": 6.3569, + "step": 3861 + }, + { + "epoch": 1.3180887372013652, + "grad_norm": 3.250030994415283, + "learning_rate": 0.0005606370875995449, + "loss": 7.1309, + "step": 3862 + }, + { + "epoch": 1.3184300341296928, + "grad_norm": 3.246612071990967, + "learning_rate": 0.000560523321956769, + "loss": 6.5661, + "step": 3863 + }, + { + "epoch": 1.3187713310580205, + "grad_norm": 3.082751989364624, + "learning_rate": 0.0005604095563139931, + "loss": 6.5423, + "step": 3864 + }, + { + "epoch": 1.319112627986348, + "grad_norm": 3.3138341903686523, + "learning_rate": 0.0005602957906712172, + "loss": 6.3662, + "step": 3865 + }, + { + "epoch": 1.3194539249146757, + "grad_norm": 5.791090488433838, + "learning_rate": 0.0005601820250284414, + "loss": 5.5428, + "step": 3866 + }, + { + "epoch": 1.3197952218430034, + "grad_norm": 3.186373710632324, + "learning_rate": 0.0005600682593856656, + "loss": 6.7147, + "step": 3867 + }, + { + "epoch": 1.320136518771331, + "grad_norm": 3.214484930038452, + "learning_rate": 0.0005599544937428897, + "loss": 6.3017, + "step": 3868 + }, + { + "epoch": 1.3204778156996588, + "grad_norm": 3.255305051803589, + "learning_rate": 0.0005598407281001138, + "loss": 6.3534, + "step": 3869 + }, + { + "epoch": 1.3208191126279862, + "grad_norm": 3.318035125732422, + "learning_rate": 0.0005597269624573379, + "loss": 6.9723, + "step": 3870 + }, + { + "epoch": 1.321160409556314, + "grad_norm": 3.18615984916687, + "learning_rate": 0.000559613196814562, + "loss": 6.6132, + "step": 3871 + }, + { + "epoch": 1.3215017064846417, + "grad_norm": 3.249497652053833, + "learning_rate": 0.0005594994311717862, + "loss": 6.7708, + "step": 3872 + }, + { + "epoch": 1.3218430034129693, + "grad_norm": 4.298493385314941, + "learning_rate": 0.0005593856655290103, + "loss": 6.0935, + "step": 3873 + }, + { + "epoch": 1.322184300341297, + "grad_norm": 3.8184046745300293, + "learning_rate": 0.0005592718998862344, + "loss": 5.617, + "step": 3874 + }, + { + "epoch": 1.3225255972696246, + "grad_norm": 3.3545162677764893, + "learning_rate": 0.0005591581342434585, + "loss": 6.2788, + "step": 3875 + }, + { + "epoch": 1.3228668941979522, + "grad_norm": 3.2084295749664307, + "learning_rate": 0.0005590443686006826, + "loss": 6.5599, + "step": 3876 + }, + { + "epoch": 1.3232081911262799, + "grad_norm": 3.0472261905670166, + "learning_rate": 0.0005589306029579067, + "loss": 6.0838, + "step": 3877 + }, + { + "epoch": 1.3235494880546075, + "grad_norm": 3.785743236541748, + "learning_rate": 0.0005588168373151309, + "loss": 5.699, + "step": 3878 + }, + { + "epoch": 1.323890784982935, + "grad_norm": 3.114495038986206, + "learning_rate": 0.000558703071672355, + "loss": 6.5545, + "step": 3879 + }, + { + "epoch": 1.3242320819112627, + "grad_norm": 4.037391662597656, + "learning_rate": 0.000558589306029579, + "loss": 5.8293, + "step": 3880 + }, + { + "epoch": 1.3245733788395904, + "grad_norm": 3.1285529136657715, + "learning_rate": 0.0005584755403868031, + "loss": 6.5584, + "step": 3881 + }, + { + "epoch": 1.3249146757679182, + "grad_norm": 3.292494058609009, + "learning_rate": 0.0005583617747440272, + "loss": 6.8052, + "step": 3882 + }, + { + "epoch": 1.3252559726962456, + "grad_norm": 5.072427272796631, + "learning_rate": 0.0005582480091012514, + "loss": 5.6256, + "step": 3883 + }, + { + "epoch": 1.3255972696245735, + "grad_norm": 3.1769468784332275, + "learning_rate": 0.0005581342434584756, + "loss": 6.5001, + "step": 3884 + }, + { + "epoch": 1.325938566552901, + "grad_norm": 6.59473180770874, + "learning_rate": 0.0005580204778156997, + "loss": 6.2227, + "step": 3885 + }, + { + "epoch": 1.3262798634812287, + "grad_norm": 3.345031261444092, + "learning_rate": 0.0005579067121729238, + "loss": 6.3236, + "step": 3886 + }, + { + "epoch": 1.3266211604095564, + "grad_norm": 3.1531882286071777, + "learning_rate": 0.0005577929465301479, + "loss": 6.8264, + "step": 3887 + }, + { + "epoch": 1.326962457337884, + "grad_norm": 3.17917537689209, + "learning_rate": 0.000557679180887372, + "loss": 6.032, + "step": 3888 + }, + { + "epoch": 1.3273037542662116, + "grad_norm": 3.783651351928711, + "learning_rate": 0.0005575654152445962, + "loss": 5.9765, + "step": 3889 + }, + { + "epoch": 1.3276450511945392, + "grad_norm": 3.515580892562866, + "learning_rate": 0.0005574516496018203, + "loss": 5.7462, + "step": 3890 + }, + { + "epoch": 1.3279863481228669, + "grad_norm": 2.9463393688201904, + "learning_rate": 0.0005573378839590444, + "loss": 6.4177, + "step": 3891 + }, + { + "epoch": 1.3283276450511945, + "grad_norm": 3.109710216522217, + "learning_rate": 0.0005572241183162685, + "loss": 6.3956, + "step": 3892 + }, + { + "epoch": 1.3286689419795221, + "grad_norm": 3.2260215282440186, + "learning_rate": 0.0005571103526734926, + "loss": 6.6147, + "step": 3893 + }, + { + "epoch": 1.3290102389078498, + "grad_norm": 3.0810201168060303, + "learning_rate": 0.0005569965870307167, + "loss": 6.1781, + "step": 3894 + }, + { + "epoch": 1.3293515358361776, + "grad_norm": 3.121894598007202, + "learning_rate": 0.0005568828213879409, + "loss": 6.7156, + "step": 3895 + }, + { + "epoch": 1.329692832764505, + "grad_norm": 3.2574121952056885, + "learning_rate": 0.000556769055745165, + "loss": 6.3006, + "step": 3896 + }, + { + "epoch": 1.3300341296928329, + "grad_norm": 3.250694990158081, + "learning_rate": 0.0005566552901023891, + "loss": 6.9496, + "step": 3897 + }, + { + "epoch": 1.3303754266211605, + "grad_norm": 3.090780258178711, + "learning_rate": 0.0005565415244596132, + "loss": 6.5721, + "step": 3898 + }, + { + "epoch": 1.3307167235494881, + "grad_norm": 3.135728120803833, + "learning_rate": 0.0005564277588168373, + "loss": 7.0426, + "step": 3899 + }, + { + "epoch": 1.3310580204778157, + "grad_norm": 3.099902391433716, + "learning_rate": 0.0005563139931740613, + "loss": 6.4846, + "step": 3900 + }, + { + "epoch": 1.3313993174061434, + "grad_norm": 3.4714698791503906, + "learning_rate": 0.0005562002275312856, + "loss": 6.1539, + "step": 3901 + }, + { + "epoch": 1.331740614334471, + "grad_norm": 3.3556177616119385, + "learning_rate": 0.0005560864618885097, + "loss": 6.3849, + "step": 3902 + }, + { + "epoch": 1.3320819112627986, + "grad_norm": 3.5183370113372803, + "learning_rate": 0.0005559726962457338, + "loss": 6.1542, + "step": 3903 + }, + { + "epoch": 1.3324232081911263, + "grad_norm": 3.0659613609313965, + "learning_rate": 0.0005558589306029579, + "loss": 6.262, + "step": 3904 + }, + { + "epoch": 1.3327645051194539, + "grad_norm": 3.1507010459899902, + "learning_rate": 0.000555745164960182, + "loss": 6.7591, + "step": 3905 + }, + { + "epoch": 1.3331058020477815, + "grad_norm": 3.488461971282959, + "learning_rate": 0.0005556313993174062, + "loss": 6.2227, + "step": 3906 + }, + { + "epoch": 1.3334470989761091, + "grad_norm": 3.122450828552246, + "learning_rate": 0.0005555176336746303, + "loss": 6.2148, + "step": 3907 + }, + { + "epoch": 1.333788395904437, + "grad_norm": 3.227902412414551, + "learning_rate": 0.0005554038680318544, + "loss": 6.7408, + "step": 3908 + }, + { + "epoch": 1.3341296928327644, + "grad_norm": 3.1958107948303223, + "learning_rate": 0.0005552901023890785, + "loss": 6.4788, + "step": 3909 + }, + { + "epoch": 1.3344709897610922, + "grad_norm": 3.2183525562286377, + "learning_rate": 0.0005551763367463026, + "loss": 6.5644, + "step": 3910 + }, + { + "epoch": 1.3348122866894199, + "grad_norm": 3.3385112285614014, + "learning_rate": 0.0005550625711035267, + "loss": 6.0782, + "step": 3911 + }, + { + "epoch": 1.3351535836177475, + "grad_norm": 4.089425086975098, + "learning_rate": 0.0005549488054607509, + "loss": 5.6889, + "step": 3912 + }, + { + "epoch": 1.3354948805460751, + "grad_norm": 3.097321033477783, + "learning_rate": 0.000554835039817975, + "loss": 6.336, + "step": 3913 + }, + { + "epoch": 1.3358361774744028, + "grad_norm": 5.853010654449463, + "learning_rate": 0.0005547212741751991, + "loss": 4.3251, + "step": 3914 + }, + { + "epoch": 1.3361774744027304, + "grad_norm": 3.582655191421509, + "learning_rate": 0.0005546075085324232, + "loss": 5.5901, + "step": 3915 + }, + { + "epoch": 1.336518771331058, + "grad_norm": 3.233471155166626, + "learning_rate": 0.0005544937428896473, + "loss": 6.5578, + "step": 3916 + }, + { + "epoch": 1.3368600682593856, + "grad_norm": 3.3025286197662354, + "learning_rate": 0.0005543799772468715, + "loss": 6.0288, + "step": 3917 + }, + { + "epoch": 1.3372013651877133, + "grad_norm": 3.4599759578704834, + "learning_rate": 0.0005542662116040957, + "loss": 6.4112, + "step": 3918 + }, + { + "epoch": 1.337542662116041, + "grad_norm": 3.0111966133117676, + "learning_rate": 0.0005541524459613197, + "loss": 6.3284, + "step": 3919 + }, + { + "epoch": 1.3378839590443685, + "grad_norm": 4.267691135406494, + "learning_rate": 0.0005540386803185438, + "loss": 6.3674, + "step": 3920 + }, + { + "epoch": 1.3382252559726964, + "grad_norm": 3.2080326080322266, + "learning_rate": 0.0005539249146757679, + "loss": 6.5919, + "step": 3921 + }, + { + "epoch": 1.3385665529010238, + "grad_norm": 3.23378849029541, + "learning_rate": 0.000553811149032992, + "loss": 6.1681, + "step": 3922 + }, + { + "epoch": 1.3389078498293516, + "grad_norm": 3.0529868602752686, + "learning_rate": 0.0005536973833902162, + "loss": 6.6733, + "step": 3923 + }, + { + "epoch": 1.3392491467576793, + "grad_norm": 4.308516502380371, + "learning_rate": 0.0005535836177474403, + "loss": 5.9069, + "step": 3924 + }, + { + "epoch": 1.3395904436860069, + "grad_norm": 6.184211254119873, + "learning_rate": 0.0005534698521046644, + "loss": 5.3009, + "step": 3925 + }, + { + "epoch": 1.3399317406143345, + "grad_norm": 3.4031319618225098, + "learning_rate": 0.0005533560864618885, + "loss": 6.694, + "step": 3926 + }, + { + "epoch": 1.3402730375426621, + "grad_norm": 3.15828275680542, + "learning_rate": 0.0005532423208191126, + "loss": 6.8263, + "step": 3927 + }, + { + "epoch": 1.3406143344709898, + "grad_norm": 3.1163811683654785, + "learning_rate": 0.0005531285551763367, + "loss": 6.3808, + "step": 3928 + }, + { + "epoch": 1.3409556313993174, + "grad_norm": 3.1072165966033936, + "learning_rate": 0.0005530147895335609, + "loss": 6.4251, + "step": 3929 + }, + { + "epoch": 1.341296928327645, + "grad_norm": 3.2711892127990723, + "learning_rate": 0.000552901023890785, + "loss": 6.1466, + "step": 3930 + }, + { + "epoch": 1.3416382252559726, + "grad_norm": 3.1679635047912598, + "learning_rate": 0.0005527872582480091, + "loss": 6.3643, + "step": 3931 + }, + { + "epoch": 1.3419795221843003, + "grad_norm": 3.1142637729644775, + "learning_rate": 0.0005526734926052332, + "loss": 6.8845, + "step": 3932 + }, + { + "epoch": 1.342320819112628, + "grad_norm": 3.082751989364624, + "learning_rate": 0.0005525597269624573, + "loss": 6.1942, + "step": 3933 + }, + { + "epoch": 1.3426621160409558, + "grad_norm": 3.139395236968994, + "learning_rate": 0.0005524459613196815, + "loss": 6.6958, + "step": 3934 + }, + { + "epoch": 1.3430034129692832, + "grad_norm": 3.3839261531829834, + "learning_rate": 0.0005523321956769057, + "loss": 5.6164, + "step": 3935 + }, + { + "epoch": 1.343344709897611, + "grad_norm": 3.2381277084350586, + "learning_rate": 0.0005522184300341298, + "loss": 6.8543, + "step": 3936 + }, + { + "epoch": 1.3436860068259386, + "grad_norm": 3.236882209777832, + "learning_rate": 0.0005521046643913539, + "loss": 6.2873, + "step": 3937 + }, + { + "epoch": 1.3440273037542663, + "grad_norm": 3.085549831390381, + "learning_rate": 0.0005519908987485779, + "loss": 6.6844, + "step": 3938 + }, + { + "epoch": 1.344368600682594, + "grad_norm": 3.2600576877593994, + "learning_rate": 0.000551877133105802, + "loss": 6.4399, + "step": 3939 + }, + { + "epoch": 1.3447098976109215, + "grad_norm": 3.097806215286255, + "learning_rate": 0.0005517633674630261, + "loss": 6.825, + "step": 3940 + }, + { + "epoch": 1.3450511945392492, + "grad_norm": 3.1946775913238525, + "learning_rate": 0.0005516496018202503, + "loss": 6.6171, + "step": 3941 + }, + { + "epoch": 1.3453924914675768, + "grad_norm": 3.3030920028686523, + "learning_rate": 0.0005515358361774744, + "loss": 6.3477, + "step": 3942 + }, + { + "epoch": 1.3457337883959044, + "grad_norm": 3.1355345249176025, + "learning_rate": 0.0005514220705346985, + "loss": 6.8327, + "step": 3943 + }, + { + "epoch": 1.346075085324232, + "grad_norm": 3.1044278144836426, + "learning_rate": 0.0005513083048919226, + "loss": 6.3775, + "step": 3944 + }, + { + "epoch": 1.3464163822525597, + "grad_norm": 3.12194561958313, + "learning_rate": 0.0005511945392491467, + "loss": 6.2563, + "step": 3945 + }, + { + "epoch": 1.3467576791808873, + "grad_norm": 3.1160433292388916, + "learning_rate": 0.0005510807736063709, + "loss": 6.0824, + "step": 3946 + }, + { + "epoch": 1.3470989761092151, + "grad_norm": 3.5935933589935303, + "learning_rate": 0.000550967007963595, + "loss": 3.7834, + "step": 3947 + }, + { + "epoch": 1.3474402730375425, + "grad_norm": 3.4486262798309326, + "learning_rate": 0.0005508532423208191, + "loss": 6.0705, + "step": 3948 + }, + { + "epoch": 1.3477815699658704, + "grad_norm": 3.192941427230835, + "learning_rate": 0.0005507394766780432, + "loss": 5.8078, + "step": 3949 + }, + { + "epoch": 1.348122866894198, + "grad_norm": 8.985308647155762, + "learning_rate": 0.0005506257110352673, + "loss": 5.2075, + "step": 3950 + }, + { + "epoch": 1.3484641638225257, + "grad_norm": 4.667975902557373, + "learning_rate": 0.0005505119453924915, + "loss": 6.0374, + "step": 3951 + }, + { + "epoch": 1.3488054607508533, + "grad_norm": 3.4033608436584473, + "learning_rate": 0.0005503981797497157, + "loss": 6.1368, + "step": 3952 + }, + { + "epoch": 1.349146757679181, + "grad_norm": 3.455477237701416, + "learning_rate": 0.0005502844141069398, + "loss": 6.2675, + "step": 3953 + }, + { + "epoch": 1.3494880546075085, + "grad_norm": 3.2633755207061768, + "learning_rate": 0.0005501706484641639, + "loss": 6.7096, + "step": 3954 + }, + { + "epoch": 1.3498293515358362, + "grad_norm": 4.205435752868652, + "learning_rate": 0.000550056882821388, + "loss": 5.0339, + "step": 3955 + }, + { + "epoch": 1.3501706484641638, + "grad_norm": 3.1582601070404053, + "learning_rate": 0.0005499431171786121, + "loss": 7.1195, + "step": 3956 + }, + { + "epoch": 1.3505119453924914, + "grad_norm": 3.342383623123169, + "learning_rate": 0.0005498293515358362, + "loss": 5.5482, + "step": 3957 + }, + { + "epoch": 1.350853242320819, + "grad_norm": 3.2618634700775146, + "learning_rate": 0.0005497155858930603, + "loss": 6.5156, + "step": 3958 + }, + { + "epoch": 1.3511945392491467, + "grad_norm": 3.0584144592285156, + "learning_rate": 0.0005496018202502844, + "loss": 6.5958, + "step": 3959 + }, + { + "epoch": 1.3515358361774745, + "grad_norm": 3.212893009185791, + "learning_rate": 0.0005494880546075085, + "loss": 6.7449, + "step": 3960 + }, + { + "epoch": 1.351877133105802, + "grad_norm": 3.140817403793335, + "learning_rate": 0.0005493742889647326, + "loss": 6.134, + "step": 3961 + }, + { + "epoch": 1.3522184300341298, + "grad_norm": 3.27764630317688, + "learning_rate": 0.0005492605233219567, + "loss": 5.9015, + "step": 3962 + }, + { + "epoch": 1.3525597269624574, + "grad_norm": 3.1260316371917725, + "learning_rate": 0.0005491467576791809, + "loss": 6.9168, + "step": 3963 + }, + { + "epoch": 1.352901023890785, + "grad_norm": 4.186235427856445, + "learning_rate": 0.000549032992036405, + "loss": 5.7906, + "step": 3964 + }, + { + "epoch": 1.3532423208191127, + "grad_norm": 3.5168609619140625, + "learning_rate": 0.0005489192263936291, + "loss": 6.4136, + "step": 3965 + }, + { + "epoch": 1.3535836177474403, + "grad_norm": 3.287663698196411, + "learning_rate": 0.0005488054607508532, + "loss": 6.072, + "step": 3966 + }, + { + "epoch": 1.353924914675768, + "grad_norm": 3.4218294620513916, + "learning_rate": 0.0005486916951080773, + "loss": 5.65, + "step": 3967 + }, + { + "epoch": 1.3542662116040955, + "grad_norm": 3.3212225437164307, + "learning_rate": 0.0005485779294653015, + "loss": 6.4668, + "step": 3968 + }, + { + "epoch": 1.3546075085324232, + "grad_norm": 3.127525568008423, + "learning_rate": 0.0005484641638225257, + "loss": 6.4388, + "step": 3969 + }, + { + "epoch": 1.3549488054607508, + "grad_norm": 3.166355848312378, + "learning_rate": 0.0005483503981797498, + "loss": 6.3717, + "step": 3970 + }, + { + "epoch": 1.3552901023890784, + "grad_norm": 3.090057134628296, + "learning_rate": 0.0005482366325369739, + "loss": 5.9435, + "step": 3971 + }, + { + "epoch": 1.355631399317406, + "grad_norm": 3.0684850215911865, + "learning_rate": 0.000548122866894198, + "loss": 7.0271, + "step": 3972 + }, + { + "epoch": 1.355972696245734, + "grad_norm": 3.6041576862335205, + "learning_rate": 0.0005480091012514221, + "loss": 6.0391, + "step": 3973 + }, + { + "epoch": 1.3563139931740613, + "grad_norm": 5.369800567626953, + "learning_rate": 0.0005478953356086462, + "loss": 6.1618, + "step": 3974 + }, + { + "epoch": 1.3566552901023892, + "grad_norm": 3.1870033740997314, + "learning_rate": 0.0005477815699658704, + "loss": 6.782, + "step": 3975 + }, + { + "epoch": 1.3569965870307168, + "grad_norm": 3.168116331100464, + "learning_rate": 0.0005476678043230945, + "loss": 6.3682, + "step": 3976 + }, + { + "epoch": 1.3573378839590444, + "grad_norm": 3.2741923332214355, + "learning_rate": 0.0005475540386803185, + "loss": 6.0431, + "step": 3977 + }, + { + "epoch": 1.357679180887372, + "grad_norm": 3.190312147140503, + "learning_rate": 0.0005474402730375426, + "loss": 6.2224, + "step": 3978 + }, + { + "epoch": 1.3580204778156997, + "grad_norm": 3.3220231533050537, + "learning_rate": 0.0005473265073947667, + "loss": 6.61, + "step": 3979 + }, + { + "epoch": 1.3583617747440273, + "grad_norm": 3.0520358085632324, + "learning_rate": 0.0005472127417519908, + "loss": 6.6153, + "step": 3980 + }, + { + "epoch": 1.358703071672355, + "grad_norm": 3.942625045776367, + "learning_rate": 0.000547098976109215, + "loss": 4.871, + "step": 3981 + }, + { + "epoch": 1.3590443686006826, + "grad_norm": 3.148261785507202, + "learning_rate": 0.0005469852104664391, + "loss": 6.5981, + "step": 3982 + }, + { + "epoch": 1.3593856655290102, + "grad_norm": 3.1184489727020264, + "learning_rate": 0.0005468714448236632, + "loss": 6.0445, + "step": 3983 + }, + { + "epoch": 1.3597269624573378, + "grad_norm": 3.2603631019592285, + "learning_rate": 0.0005467576791808873, + "loss": 6.847, + "step": 3984 + }, + { + "epoch": 1.3600682593856654, + "grad_norm": 3.4152493476867676, + "learning_rate": 0.0005466439135381115, + "loss": 5.5912, + "step": 3985 + }, + { + "epoch": 1.3604095563139933, + "grad_norm": 3.3140194416046143, + "learning_rate": 0.0005465301478953357, + "loss": 6.3923, + "step": 3986 + }, + { + "epoch": 1.3607508532423207, + "grad_norm": 3.526538610458374, + "learning_rate": 0.0005464163822525598, + "loss": 5.259, + "step": 3987 + }, + { + "epoch": 1.3610921501706486, + "grad_norm": 3.1631855964660645, + "learning_rate": 0.0005463026166097839, + "loss": 6.5663, + "step": 3988 + }, + { + "epoch": 1.3614334470989762, + "grad_norm": 3.11613392829895, + "learning_rate": 0.000546188850967008, + "loss": 6.3917, + "step": 3989 + }, + { + "epoch": 1.3617747440273038, + "grad_norm": 3.1377980709075928, + "learning_rate": 0.0005460750853242321, + "loss": 6.2215, + "step": 3990 + }, + { + "epoch": 1.3621160409556314, + "grad_norm": 2.9950690269470215, + "learning_rate": 0.0005459613196814562, + "loss": 6.8778, + "step": 3991 + }, + { + "epoch": 1.362457337883959, + "grad_norm": 3.1760122776031494, + "learning_rate": 0.0005458475540386804, + "loss": 6.2066, + "step": 3992 + }, + { + "epoch": 1.3627986348122867, + "grad_norm": 3.045168399810791, + "learning_rate": 0.0005457337883959045, + "loss": 6.3082, + "step": 3993 + }, + { + "epoch": 1.3631399317406143, + "grad_norm": 3.054022789001465, + "learning_rate": 0.0005456200227531286, + "loss": 6.0026, + "step": 3994 + }, + { + "epoch": 1.363481228668942, + "grad_norm": 3.007483959197998, + "learning_rate": 0.0005455062571103527, + "loss": 6.052, + "step": 3995 + }, + { + "epoch": 1.3638225255972696, + "grad_norm": 3.831319570541382, + "learning_rate": 0.0005453924914675768, + "loss": 5.1814, + "step": 3996 + }, + { + "epoch": 1.3641638225255972, + "grad_norm": 3.610045909881592, + "learning_rate": 0.0005452787258248008, + "loss": 6.2811, + "step": 3997 + }, + { + "epoch": 1.3645051194539248, + "grad_norm": 3.2371370792388916, + "learning_rate": 0.000545164960182025, + "loss": 6.8801, + "step": 3998 + }, + { + "epoch": 1.3648464163822527, + "grad_norm": 3.0516843795776367, + "learning_rate": 0.0005450511945392491, + "loss": 6.3365, + "step": 3999 + }, + { + "epoch": 1.36518771331058, + "grad_norm": 3.2358148097991943, + "learning_rate": 0.0005449374288964732, + "loss": 6.708, + "step": 4000 + }, + { + "epoch": 1.365529010238908, + "grad_norm": 3.3484854698181152, + "learning_rate": 0.0005448236632536974, + "loss": 5.9241, + "step": 4001 + }, + { + "epoch": 1.3658703071672356, + "grad_norm": 3.215952157974243, + "learning_rate": 0.0005447098976109215, + "loss": 6.2136, + "step": 4002 + }, + { + "epoch": 1.3662116040955632, + "grad_norm": 3.1921606063842773, + "learning_rate": 0.0005445961319681456, + "loss": 6.4693, + "step": 4003 + }, + { + "epoch": 1.3665529010238908, + "grad_norm": 3.2230448722839355, + "learning_rate": 0.0005444823663253698, + "loss": 6.3591, + "step": 4004 + }, + { + "epoch": 1.3668941979522184, + "grad_norm": 3.30001163482666, + "learning_rate": 0.0005443686006825939, + "loss": 7.2393, + "step": 4005 + }, + { + "epoch": 1.367235494880546, + "grad_norm": 3.1565656661987305, + "learning_rate": 0.000544254835039818, + "loss": 6.7251, + "step": 4006 + }, + { + "epoch": 1.3675767918088737, + "grad_norm": 2.9923689365386963, + "learning_rate": 0.0005441410693970421, + "loss": 6.4636, + "step": 4007 + }, + { + "epoch": 1.3679180887372013, + "grad_norm": 3.189656972885132, + "learning_rate": 0.0005440273037542662, + "loss": 6.586, + "step": 4008 + }, + { + "epoch": 1.368259385665529, + "grad_norm": 4.6809234619140625, + "learning_rate": 0.0005439135381114904, + "loss": 5.4901, + "step": 4009 + }, + { + "epoch": 1.3686006825938566, + "grad_norm": 3.192366600036621, + "learning_rate": 0.0005437997724687145, + "loss": 6.0856, + "step": 4010 + }, + { + "epoch": 1.3689419795221842, + "grad_norm": 3.0873725414276123, + "learning_rate": 0.0005436860068259386, + "loss": 6.8565, + "step": 4011 + }, + { + "epoch": 1.369283276450512, + "grad_norm": 3.1170427799224854, + "learning_rate": 0.0005435722411831627, + "loss": 6.371, + "step": 4012 + }, + { + "epoch": 1.3696245733788395, + "grad_norm": 3.2047770023345947, + "learning_rate": 0.0005434584755403868, + "loss": 6.8304, + "step": 4013 + }, + { + "epoch": 1.3699658703071673, + "grad_norm": 3.132357120513916, + "learning_rate": 0.0005433447098976109, + "loss": 6.4195, + "step": 4014 + }, + { + "epoch": 1.370307167235495, + "grad_norm": 3.173907518386841, + "learning_rate": 0.0005432309442548351, + "loss": 6.4185, + "step": 4015 + }, + { + "epoch": 1.3706484641638226, + "grad_norm": 5.250078201293945, + "learning_rate": 0.0005431171786120591, + "loss": 5.0679, + "step": 4016 + }, + { + "epoch": 1.3709897610921502, + "grad_norm": 3.5316922664642334, + "learning_rate": 0.0005430034129692832, + "loss": 6.0273, + "step": 4017 + }, + { + "epoch": 1.3713310580204778, + "grad_norm": 3.2254114151000977, + "learning_rate": 0.0005428896473265074, + "loss": 6.6384, + "step": 4018 + }, + { + "epoch": 1.3716723549488055, + "grad_norm": 3.242680788040161, + "learning_rate": 0.0005427758816837315, + "loss": 6.5512, + "step": 4019 + }, + { + "epoch": 1.372013651877133, + "grad_norm": 3.2910420894622803, + "learning_rate": 0.0005426621160409556, + "loss": 6.7562, + "step": 4020 + }, + { + "epoch": 1.3723549488054607, + "grad_norm": 3.184915065765381, + "learning_rate": 0.0005425483503981798, + "loss": 6.5688, + "step": 4021 + }, + { + "epoch": 1.3726962457337883, + "grad_norm": 3.098815679550171, + "learning_rate": 0.0005424345847554039, + "loss": 6.6371, + "step": 4022 + }, + { + "epoch": 1.373037542662116, + "grad_norm": 3.283712863922119, + "learning_rate": 0.000542320819112628, + "loss": 6.0917, + "step": 4023 + }, + { + "epoch": 1.3733788395904436, + "grad_norm": 3.190316677093506, + "learning_rate": 0.0005422070534698521, + "loss": 6.1911, + "step": 4024 + }, + { + "epoch": 1.3737201365187715, + "grad_norm": 3.122403621673584, + "learning_rate": 0.0005420932878270762, + "loss": 6.3405, + "step": 4025 + }, + { + "epoch": 1.3740614334470989, + "grad_norm": 3.1362032890319824, + "learning_rate": 0.0005419795221843004, + "loss": 6.6867, + "step": 4026 + }, + { + "epoch": 1.3744027303754267, + "grad_norm": 3.2245562076568604, + "learning_rate": 0.0005418657565415245, + "loss": 6.2402, + "step": 4027 + }, + { + "epoch": 1.3747440273037543, + "grad_norm": 3.087644338607788, + "learning_rate": 0.0005417519908987486, + "loss": 6.16, + "step": 4028 + }, + { + "epoch": 1.375085324232082, + "grad_norm": 3.1691482067108154, + "learning_rate": 0.0005416382252559727, + "loss": 6.5541, + "step": 4029 + }, + { + "epoch": 1.3754266211604096, + "grad_norm": 3.053196668624878, + "learning_rate": 0.0005415244596131968, + "loss": 6.7324, + "step": 4030 + }, + { + "epoch": 1.3757679180887372, + "grad_norm": 4.789836406707764, + "learning_rate": 0.0005414106939704209, + "loss": 5.2476, + "step": 4031 + }, + { + "epoch": 1.3761092150170648, + "grad_norm": 3.1950435638427734, + "learning_rate": 0.0005412969283276451, + "loss": 6.7076, + "step": 4032 + }, + { + "epoch": 1.3764505119453925, + "grad_norm": 3.2419111728668213, + "learning_rate": 0.0005411831626848693, + "loss": 6.1823, + "step": 4033 + }, + { + "epoch": 1.37679180887372, + "grad_norm": 7.173503398895264, + "learning_rate": 0.0005410693970420934, + "loss": 4.7265, + "step": 4034 + }, + { + "epoch": 1.3771331058020477, + "grad_norm": 3.251122236251831, + "learning_rate": 0.0005409556313993175, + "loss": 6.432, + "step": 4035 + }, + { + "epoch": 1.3774744027303754, + "grad_norm": 3.205535888671875, + "learning_rate": 0.0005408418657565415, + "loss": 6.6821, + "step": 4036 + }, + { + "epoch": 1.377815699658703, + "grad_norm": 3.3682541847229004, + "learning_rate": 0.0005407281001137656, + "loss": 6.1631, + "step": 4037 + }, + { + "epoch": 1.3781569965870308, + "grad_norm": 3.0801639556884766, + "learning_rate": 0.0005406143344709898, + "loss": 6.1151, + "step": 4038 + }, + { + "epoch": 1.3784982935153582, + "grad_norm": 3.063805341720581, + "learning_rate": 0.0005405005688282139, + "loss": 6.5842, + "step": 4039 + }, + { + "epoch": 1.378839590443686, + "grad_norm": 2.9392287731170654, + "learning_rate": 0.000540386803185438, + "loss": 6.3864, + "step": 4040 + }, + { + "epoch": 1.3791808873720137, + "grad_norm": 3.018649101257324, + "learning_rate": 0.0005402730375426621, + "loss": 6.542, + "step": 4041 + }, + { + "epoch": 1.3795221843003413, + "grad_norm": 4.661662578582764, + "learning_rate": 0.0005401592718998862, + "loss": 5.7853, + "step": 4042 + }, + { + "epoch": 1.379863481228669, + "grad_norm": 3.264328956604004, + "learning_rate": 0.0005400455062571103, + "loss": 6.3248, + "step": 4043 + }, + { + "epoch": 1.3802047781569966, + "grad_norm": 3.175499200820923, + "learning_rate": 0.0005399317406143345, + "loss": 6.5147, + "step": 4044 + }, + { + "epoch": 1.3805460750853242, + "grad_norm": 3.1350579261779785, + "learning_rate": 0.0005398179749715586, + "loss": 6.9947, + "step": 4045 + }, + { + "epoch": 1.3808873720136519, + "grad_norm": 3.96781063079834, + "learning_rate": 0.0005397042093287827, + "loss": 5.874, + "step": 4046 + }, + { + "epoch": 1.3812286689419795, + "grad_norm": 3.2071373462677, + "learning_rate": 0.0005395904436860068, + "loss": 6.668, + "step": 4047 + }, + { + "epoch": 1.3815699658703071, + "grad_norm": 3.1385679244995117, + "learning_rate": 0.0005394766780432309, + "loss": 6.7825, + "step": 4048 + }, + { + "epoch": 1.3819112627986347, + "grad_norm": 3.573509931564331, + "learning_rate": 0.0005393629124004551, + "loss": 5.5573, + "step": 4049 + }, + { + "epoch": 1.3822525597269624, + "grad_norm": 3.066441059112549, + "learning_rate": 0.0005392491467576793, + "loss": 6.6349, + "step": 4050 + }, + { + "epoch": 1.3825938566552902, + "grad_norm": 2.976698160171509, + "learning_rate": 0.0005391353811149034, + "loss": 6.6275, + "step": 4051 + }, + { + "epoch": 1.3829351535836176, + "grad_norm": 3.050534963607788, + "learning_rate": 0.0005390216154721275, + "loss": 6.8268, + "step": 4052 + }, + { + "epoch": 1.3832764505119455, + "grad_norm": 3.335890769958496, + "learning_rate": 0.0005389078498293516, + "loss": 6.1537, + "step": 4053 + }, + { + "epoch": 1.383617747440273, + "grad_norm": 3.080980062484741, + "learning_rate": 0.0005387940841865757, + "loss": 6.7967, + "step": 4054 + }, + { + "epoch": 1.3839590443686007, + "grad_norm": 3.0135996341705322, + "learning_rate": 0.0005386803185437998, + "loss": 7.1126, + "step": 4055 + }, + { + "epoch": 1.3843003412969284, + "grad_norm": 3.079207420349121, + "learning_rate": 0.0005385665529010239, + "loss": 6.7536, + "step": 4056 + }, + { + "epoch": 1.384641638225256, + "grad_norm": 3.1604158878326416, + "learning_rate": 0.000538452787258248, + "loss": 6.2925, + "step": 4057 + }, + { + "epoch": 1.3849829351535836, + "grad_norm": 3.069831371307373, + "learning_rate": 0.0005383390216154721, + "loss": 6.494, + "step": 4058 + }, + { + "epoch": 1.3853242320819112, + "grad_norm": 3.445303201675415, + "learning_rate": 0.0005382252559726962, + "loss": 6.0185, + "step": 4059 + }, + { + "epoch": 1.3856655290102389, + "grad_norm": 3.107506513595581, + "learning_rate": 0.0005381114903299203, + "loss": 6.3259, + "step": 4060 + }, + { + "epoch": 1.3860068259385665, + "grad_norm": 3.251415729522705, + "learning_rate": 0.0005379977246871445, + "loss": 6.1274, + "step": 4061 + }, + { + "epoch": 1.3863481228668941, + "grad_norm": 3.4389657974243164, + "learning_rate": 0.0005378839590443686, + "loss": 5.9044, + "step": 4062 + }, + { + "epoch": 1.3866894197952218, + "grad_norm": 2.960808515548706, + "learning_rate": 0.0005377701934015927, + "loss": 6.5314, + "step": 4063 + }, + { + "epoch": 1.3870307167235496, + "grad_norm": 3.1747775077819824, + "learning_rate": 0.0005376564277588168, + "loss": 6.3545, + "step": 4064 + }, + { + "epoch": 1.387372013651877, + "grad_norm": 3.289687156677246, + "learning_rate": 0.0005375426621160409, + "loss": 6.4242, + "step": 4065 + }, + { + "epoch": 1.3877133105802049, + "grad_norm": 3.1048786640167236, + "learning_rate": 0.0005374288964732651, + "loss": 6.6646, + "step": 4066 + }, + { + "epoch": 1.3880546075085325, + "grad_norm": 3.1970558166503906, + "learning_rate": 0.0005373151308304893, + "loss": 6.5832, + "step": 4067 + }, + { + "epoch": 1.3883959044368601, + "grad_norm": 4.1045637130737305, + "learning_rate": 0.0005372013651877134, + "loss": 5.232, + "step": 4068 + }, + { + "epoch": 1.3887372013651877, + "grad_norm": 3.392815589904785, + "learning_rate": 0.0005370875995449375, + "loss": 5.7788, + "step": 4069 + }, + { + "epoch": 1.3890784982935154, + "grad_norm": 3.2663235664367676, + "learning_rate": 0.0005369738339021616, + "loss": 6.5488, + "step": 4070 + }, + { + "epoch": 1.389419795221843, + "grad_norm": 4.128077030181885, + "learning_rate": 0.0005368600682593857, + "loss": 5.8966, + "step": 4071 + }, + { + "epoch": 1.3897610921501706, + "grad_norm": 3.2271320819854736, + "learning_rate": 0.0005367463026166099, + "loss": 6.8354, + "step": 4072 + }, + { + "epoch": 1.3901023890784983, + "grad_norm": 3.334517478942871, + "learning_rate": 0.000536632536973834, + "loss": 4.8601, + "step": 4073 + }, + { + "epoch": 1.3904436860068259, + "grad_norm": 3.136608362197876, + "learning_rate": 0.0005365187713310581, + "loss": 6.6863, + "step": 4074 + }, + { + "epoch": 1.3907849829351535, + "grad_norm": 3.104353904724121, + "learning_rate": 0.0005364050056882821, + "loss": 6.5178, + "step": 4075 + }, + { + "epoch": 1.3911262798634811, + "grad_norm": 3.1452524662017822, + "learning_rate": 0.0005362912400455062, + "loss": 6.2725, + "step": 4076 + }, + { + "epoch": 1.391467576791809, + "grad_norm": 3.041658639907837, + "learning_rate": 0.0005361774744027303, + "loss": 6.231, + "step": 4077 + }, + { + "epoch": 1.3918088737201364, + "grad_norm": 3.1051037311553955, + "learning_rate": 0.0005360637087599545, + "loss": 6.0589, + "step": 4078 + }, + { + "epoch": 1.3921501706484642, + "grad_norm": 3.173142433166504, + "learning_rate": 0.0005359499431171786, + "loss": 6.2059, + "step": 4079 + }, + { + "epoch": 1.3924914675767919, + "grad_norm": 3.0535824298858643, + "learning_rate": 0.0005358361774744027, + "loss": 6.2619, + "step": 4080 + }, + { + "epoch": 1.3928327645051195, + "grad_norm": 3.0652310848236084, + "learning_rate": 0.0005357224118316268, + "loss": 6.6454, + "step": 4081 + }, + { + "epoch": 1.3931740614334471, + "grad_norm": 3.147127151489258, + "learning_rate": 0.0005356086461888509, + "loss": 6.4706, + "step": 4082 + }, + { + "epoch": 1.3935153583617748, + "grad_norm": 3.9887466430664062, + "learning_rate": 0.000535494880546075, + "loss": 5.9159, + "step": 4083 + }, + { + "epoch": 1.3938566552901024, + "grad_norm": 3.139221429824829, + "learning_rate": 0.0005353811149032993, + "loss": 6.4576, + "step": 4084 + }, + { + "epoch": 1.39419795221843, + "grad_norm": 3.5257785320281982, + "learning_rate": 0.0005352673492605234, + "loss": 5.7365, + "step": 4085 + }, + { + "epoch": 1.3945392491467576, + "grad_norm": 3.062695026397705, + "learning_rate": 0.0005351535836177475, + "loss": 6.4737, + "step": 4086 + }, + { + "epoch": 1.3948805460750853, + "grad_norm": 3.23443603515625, + "learning_rate": 0.0005350398179749716, + "loss": 5.8095, + "step": 4087 + }, + { + "epoch": 1.395221843003413, + "grad_norm": 3.249667167663574, + "learning_rate": 0.0005349260523321957, + "loss": 6.3928, + "step": 4088 + }, + { + "epoch": 1.3955631399317405, + "grad_norm": 3.0842514038085938, + "learning_rate": 0.0005348122866894199, + "loss": 6.2861, + "step": 4089 + }, + { + "epoch": 1.3959044368600684, + "grad_norm": 3.2530109882354736, + "learning_rate": 0.000534698521046644, + "loss": 5.9651, + "step": 4090 + }, + { + "epoch": 1.3962457337883958, + "grad_norm": 4.654758930206299, + "learning_rate": 0.0005345847554038681, + "loss": 5.5245, + "step": 4091 + }, + { + "epoch": 1.3965870307167236, + "grad_norm": 3.233858108520508, + "learning_rate": 0.0005344709897610922, + "loss": 6.6915, + "step": 4092 + }, + { + "epoch": 1.3969283276450513, + "grad_norm": 3.159165382385254, + "learning_rate": 0.0005343572241183163, + "loss": 6.3194, + "step": 4093 + }, + { + "epoch": 1.3972696245733789, + "grad_norm": 3.0182223320007324, + "learning_rate": 0.0005342434584755403, + "loss": 6.0745, + "step": 4094 + }, + { + "epoch": 1.3976109215017065, + "grad_norm": 3.3083250522613525, + "learning_rate": 0.0005341296928327645, + "loss": 5.7519, + "step": 4095 + }, + { + "epoch": 1.3979522184300341, + "grad_norm": 3.4524002075195312, + "learning_rate": 0.0005340159271899886, + "loss": 5.9577, + "step": 4096 + }, + { + "epoch": 1.3982935153583618, + "grad_norm": 3.095432996749878, + "learning_rate": 0.0005339021615472127, + "loss": 6.4214, + "step": 4097 + }, + { + "epoch": 1.3986348122866894, + "grad_norm": 3.4075958728790283, + "learning_rate": 0.0005337883959044368, + "loss": 5.6942, + "step": 4098 + }, + { + "epoch": 1.398976109215017, + "grad_norm": 4.778470993041992, + "learning_rate": 0.0005336746302616609, + "loss": 5.1441, + "step": 4099 + }, + { + "epoch": 1.3993174061433447, + "grad_norm": 4.083403587341309, + "learning_rate": 0.000533560864618885, + "loss": 5.8321, + "step": 4100 + }, + { + "epoch": 1.3996587030716723, + "grad_norm": 3.242581367492676, + "learning_rate": 0.0005334470989761093, + "loss": 6.1174, + "step": 4101 + }, + { + "epoch": 1.4, + "grad_norm": 3.0662283897399902, + "learning_rate": 0.0005333333333333334, + "loss": 6.7307, + "step": 4102 + }, + { + "epoch": 1.4003412969283278, + "grad_norm": 3.119428873062134, + "learning_rate": 0.0005332195676905575, + "loss": 6.5688, + "step": 4103 + }, + { + "epoch": 1.4006825938566552, + "grad_norm": 3.014010190963745, + "learning_rate": 0.0005331058020477816, + "loss": 6.5248, + "step": 4104 + }, + { + "epoch": 1.401023890784983, + "grad_norm": 3.5993144512176514, + "learning_rate": 0.0005329920364050057, + "loss": 5.533, + "step": 4105 + }, + { + "epoch": 1.4013651877133106, + "grad_norm": 3.0127501487731934, + "learning_rate": 0.0005328782707622299, + "loss": 6.3895, + "step": 4106 + }, + { + "epoch": 1.4017064846416383, + "grad_norm": 3.770829916000366, + "learning_rate": 0.000532764505119454, + "loss": 5.6792, + "step": 4107 + }, + { + "epoch": 1.402047781569966, + "grad_norm": 3.10686993598938, + "learning_rate": 0.0005326507394766781, + "loss": 6.5603, + "step": 4108 + }, + { + "epoch": 1.4023890784982935, + "grad_norm": 3.173125982284546, + "learning_rate": 0.0005325369738339022, + "loss": 5.9965, + "step": 4109 + }, + { + "epoch": 1.4027303754266212, + "grad_norm": 3.075287103652954, + "learning_rate": 0.0005324232081911263, + "loss": 6.0827, + "step": 4110 + }, + { + "epoch": 1.4030716723549488, + "grad_norm": 3.0550026893615723, + "learning_rate": 0.0005323094425483504, + "loss": 6.2533, + "step": 4111 + }, + { + "epoch": 1.4034129692832764, + "grad_norm": 4.698912620544434, + "learning_rate": 0.0005321956769055746, + "loss": 4.8645, + "step": 4112 + }, + { + "epoch": 1.403754266211604, + "grad_norm": 3.12372088432312, + "learning_rate": 0.0005320819112627987, + "loss": 6.4869, + "step": 4113 + }, + { + "epoch": 1.4040955631399317, + "grad_norm": 3.2088253498077393, + "learning_rate": 0.0005319681456200227, + "loss": 6.1782, + "step": 4114 + }, + { + "epoch": 1.4044368600682593, + "grad_norm": 3.946972370147705, + "learning_rate": 0.0005318543799772468, + "loss": 5.4858, + "step": 4115 + }, + { + "epoch": 1.4047781569965871, + "grad_norm": 3.758061408996582, + "learning_rate": 0.0005317406143344709, + "loss": 5.5092, + "step": 4116 + }, + { + "epoch": 1.4051194539249146, + "grad_norm": 3.1338324546813965, + "learning_rate": 0.000531626848691695, + "loss": 6.4477, + "step": 4117 + }, + { + "epoch": 1.4054607508532424, + "grad_norm": 3.124582529067993, + "learning_rate": 0.0005315130830489193, + "loss": 5.2994, + "step": 4118 + }, + { + "epoch": 1.40580204778157, + "grad_norm": 3.2185561656951904, + "learning_rate": 0.0005313993174061434, + "loss": 6.29, + "step": 4119 + }, + { + "epoch": 1.4061433447098977, + "grad_norm": 3.439194440841675, + "learning_rate": 0.0005312855517633675, + "loss": 5.8294, + "step": 4120 + }, + { + "epoch": 1.4064846416382253, + "grad_norm": 3.0680735111236572, + "learning_rate": 0.0005311717861205916, + "loss": 6.3791, + "step": 4121 + }, + { + "epoch": 1.406825938566553, + "grad_norm": 3.082590341567993, + "learning_rate": 0.0005310580204778157, + "loss": 6.5443, + "step": 4122 + }, + { + "epoch": 1.4071672354948805, + "grad_norm": 3.0105934143066406, + "learning_rate": 0.0005309442548350398, + "loss": 6.4141, + "step": 4123 + }, + { + "epoch": 1.4075085324232082, + "grad_norm": 3.0490424633026123, + "learning_rate": 0.000530830489192264, + "loss": 6.5894, + "step": 4124 + }, + { + "epoch": 1.4078498293515358, + "grad_norm": 3.1783745288848877, + "learning_rate": 0.0005307167235494881, + "loss": 6.3531, + "step": 4125 + }, + { + "epoch": 1.4081911262798634, + "grad_norm": 3.387235641479492, + "learning_rate": 0.0005306029579067122, + "loss": 6.4013, + "step": 4126 + }, + { + "epoch": 1.408532423208191, + "grad_norm": 3.4885988235473633, + "learning_rate": 0.0005304891922639363, + "loss": 6.4457, + "step": 4127 + }, + { + "epoch": 1.4088737201365187, + "grad_norm": 3.323430061340332, + "learning_rate": 0.0005303754266211604, + "loss": 6.1437, + "step": 4128 + }, + { + "epoch": 1.4092150170648465, + "grad_norm": 5.294995307922363, + "learning_rate": 0.0005302616609783846, + "loss": 5.1343, + "step": 4129 + }, + { + "epoch": 1.409556313993174, + "grad_norm": 3.099463701248169, + "learning_rate": 0.0005301478953356087, + "loss": 6.3936, + "step": 4130 + }, + { + "epoch": 1.4098976109215018, + "grad_norm": 3.1338493824005127, + "learning_rate": 0.0005300341296928328, + "loss": 6.8859, + "step": 4131 + }, + { + "epoch": 1.4102389078498294, + "grad_norm": 3.034945011138916, + "learning_rate": 0.0005299203640500569, + "loss": 6.1265, + "step": 4132 + }, + { + "epoch": 1.410580204778157, + "grad_norm": 3.033184051513672, + "learning_rate": 0.0005298065984072809, + "loss": 6.3814, + "step": 4133 + }, + { + "epoch": 1.4109215017064847, + "grad_norm": 2.992227554321289, + "learning_rate": 0.000529692832764505, + "loss": 6.4773, + "step": 4134 + }, + { + "epoch": 1.4112627986348123, + "grad_norm": 3.4906859397888184, + "learning_rate": 0.0005295790671217293, + "loss": 5.8243, + "step": 4135 + }, + { + "epoch": 1.41160409556314, + "grad_norm": 3.140052318572998, + "learning_rate": 0.0005294653014789534, + "loss": 6.3378, + "step": 4136 + }, + { + "epoch": 1.4119453924914676, + "grad_norm": 4.089855670928955, + "learning_rate": 0.0005293515358361775, + "loss": 6.1393, + "step": 4137 + }, + { + "epoch": 1.4122866894197952, + "grad_norm": 3.1651060581207275, + "learning_rate": 0.0005292377701934016, + "loss": 6.5695, + "step": 4138 + }, + { + "epoch": 1.4126279863481228, + "grad_norm": 3.1677775382995605, + "learning_rate": 0.0005291240045506257, + "loss": 6.1942, + "step": 4139 + }, + { + "epoch": 1.4129692832764504, + "grad_norm": 3.1922414302825928, + "learning_rate": 0.0005290102389078498, + "loss": 6.1814, + "step": 4140 + }, + { + "epoch": 1.413310580204778, + "grad_norm": 3.341269016265869, + "learning_rate": 0.000528896473265074, + "loss": 6.1189, + "step": 4141 + }, + { + "epoch": 1.413651877133106, + "grad_norm": 3.1737465858459473, + "learning_rate": 0.0005287827076222981, + "loss": 6.324, + "step": 4142 + }, + { + "epoch": 1.4139931740614333, + "grad_norm": 3.0902111530303955, + "learning_rate": 0.0005286689419795222, + "loss": 6.0682, + "step": 4143 + }, + { + "epoch": 1.4143344709897612, + "grad_norm": 3.198076009750366, + "learning_rate": 0.0005285551763367463, + "loss": 6.6576, + "step": 4144 + }, + { + "epoch": 1.4146757679180888, + "grad_norm": 3.349595785140991, + "learning_rate": 0.0005284414106939704, + "loss": 5.8567, + "step": 4145 + }, + { + "epoch": 1.4150170648464164, + "grad_norm": 3.0596208572387695, + "learning_rate": 0.0005283276450511945, + "loss": 6.6378, + "step": 4146 + }, + { + "epoch": 1.415358361774744, + "grad_norm": 3.092496633529663, + "learning_rate": 0.0005282138794084187, + "loss": 6.8541, + "step": 4147 + }, + { + "epoch": 1.4156996587030717, + "grad_norm": 3.092621088027954, + "learning_rate": 0.0005281001137656428, + "loss": 6.2547, + "step": 4148 + }, + { + "epoch": 1.4160409556313993, + "grad_norm": 3.077143907546997, + "learning_rate": 0.0005279863481228669, + "loss": 6.9397, + "step": 4149 + }, + { + "epoch": 1.416382252559727, + "grad_norm": 3.1491472721099854, + "learning_rate": 0.000527872582480091, + "loss": 6.3133, + "step": 4150 + }, + { + "epoch": 1.4167235494880546, + "grad_norm": 3.1234889030456543, + "learning_rate": 0.0005277588168373152, + "loss": 6.5563, + "step": 4151 + }, + { + "epoch": 1.4170648464163822, + "grad_norm": 3.086249828338623, + "learning_rate": 0.0005276450511945393, + "loss": 6.5361, + "step": 4152 + }, + { + "epoch": 1.4174061433447098, + "grad_norm": 3.6380252838134766, + "learning_rate": 0.0005275312855517634, + "loss": 6.0914, + "step": 4153 + }, + { + "epoch": 1.4177474402730375, + "grad_norm": 4.482765197753906, + "learning_rate": 0.0005274175199089875, + "loss": 6.6235, + "step": 4154 + }, + { + "epoch": 1.4180887372013653, + "grad_norm": 3.4601078033447266, + "learning_rate": 0.0005273037542662116, + "loss": 6.413, + "step": 4155 + }, + { + "epoch": 1.4184300341296927, + "grad_norm": 3.319906234741211, + "learning_rate": 0.0005271899886234357, + "loss": 6.8646, + "step": 4156 + }, + { + "epoch": 1.4187713310580206, + "grad_norm": 3.219625473022461, + "learning_rate": 0.0005270762229806598, + "loss": 6.1368, + "step": 4157 + }, + { + "epoch": 1.4191126279863482, + "grad_norm": 3.033139228820801, + "learning_rate": 0.000526962457337884, + "loss": 6.9841, + "step": 4158 + }, + { + "epoch": 1.4194539249146758, + "grad_norm": 3.063420534133911, + "learning_rate": 0.0005268486916951081, + "loss": 6.2634, + "step": 4159 + }, + { + "epoch": 1.4197952218430034, + "grad_norm": 6.298565864562988, + "learning_rate": 0.0005267349260523322, + "loss": 5.7468, + "step": 4160 + }, + { + "epoch": 1.420136518771331, + "grad_norm": 3.0811851024627686, + "learning_rate": 0.0005266211604095563, + "loss": 6.4445, + "step": 4161 + }, + { + "epoch": 1.4204778156996587, + "grad_norm": 3.210052967071533, + "learning_rate": 0.0005265073947667804, + "loss": 6.1846, + "step": 4162 + }, + { + "epoch": 1.4208191126279863, + "grad_norm": 3.0224571228027344, + "learning_rate": 0.0005263936291240045, + "loss": 6.5175, + "step": 4163 + }, + { + "epoch": 1.421160409556314, + "grad_norm": 3.2952568531036377, + "learning_rate": 0.0005262798634812287, + "loss": 6.3716, + "step": 4164 + }, + { + "epoch": 1.4215017064846416, + "grad_norm": 3.3877005577087402, + "learning_rate": 0.0005261660978384528, + "loss": 6.2331, + "step": 4165 + }, + { + "epoch": 1.4218430034129692, + "grad_norm": 3.012554168701172, + "learning_rate": 0.0005260523321956769, + "loss": 6.0445, + "step": 4166 + }, + { + "epoch": 1.4221843003412968, + "grad_norm": 3.1453211307525635, + "learning_rate": 0.000525938566552901, + "loss": 6.6895, + "step": 4167 + }, + { + "epoch": 1.4225255972696247, + "grad_norm": 3.146012783050537, + "learning_rate": 0.0005258248009101252, + "loss": 6.581, + "step": 4168 + }, + { + "epoch": 1.422866894197952, + "grad_norm": 3.0287742614746094, + "learning_rate": 0.0005257110352673494, + "loss": 6.2703, + "step": 4169 + }, + { + "epoch": 1.42320819112628, + "grad_norm": 3.0194754600524902, + "learning_rate": 0.0005255972696245735, + "loss": 6.4683, + "step": 4170 + }, + { + "epoch": 1.4235494880546076, + "grad_norm": 2.9725260734558105, + "learning_rate": 0.0005254835039817976, + "loss": 6.3189, + "step": 4171 + }, + { + "epoch": 1.4238907849829352, + "grad_norm": 3.101661443710327, + "learning_rate": 0.0005253697383390216, + "loss": 6.1797, + "step": 4172 + }, + { + "epoch": 1.4242320819112628, + "grad_norm": 3.5125575065612793, + "learning_rate": 0.0005252559726962457, + "loss": 6.1972, + "step": 4173 + }, + { + "epoch": 1.4245733788395905, + "grad_norm": 3.2034642696380615, + "learning_rate": 0.0005251422070534698, + "loss": 6.4367, + "step": 4174 + }, + { + "epoch": 1.424914675767918, + "grad_norm": 3.173069715499878, + "learning_rate": 0.000525028441410694, + "loss": 6.1895, + "step": 4175 + }, + { + "epoch": 1.4252559726962457, + "grad_norm": 3.3157718181610107, + "learning_rate": 0.0005249146757679181, + "loss": 5.2822, + "step": 4176 + }, + { + "epoch": 1.4255972696245733, + "grad_norm": 3.15852952003479, + "learning_rate": 0.0005248009101251422, + "loss": 6.4959, + "step": 4177 + }, + { + "epoch": 1.425938566552901, + "grad_norm": 2.8751838207244873, + "learning_rate": 0.0005246871444823663, + "loss": 4.8944, + "step": 4178 + }, + { + "epoch": 1.4262798634812286, + "grad_norm": 3.131141185760498, + "learning_rate": 0.0005245733788395904, + "loss": 6.5915, + "step": 4179 + }, + { + "epoch": 1.4266211604095562, + "grad_norm": 3.1316215991973877, + "learning_rate": 0.0005244596131968145, + "loss": 5.7723, + "step": 4180 + }, + { + "epoch": 1.426962457337884, + "grad_norm": 4.314980506896973, + "learning_rate": 0.0005243458475540387, + "loss": 5.9181, + "step": 4181 + }, + { + "epoch": 1.4273037542662115, + "grad_norm": 3.1601321697235107, + "learning_rate": 0.0005242320819112628, + "loss": 6.1492, + "step": 4182 + }, + { + "epoch": 1.4276450511945393, + "grad_norm": 3.1197149753570557, + "learning_rate": 0.0005241183162684869, + "loss": 6.6196, + "step": 4183 + }, + { + "epoch": 1.427986348122867, + "grad_norm": 3.109541893005371, + "learning_rate": 0.000524004550625711, + "loss": 6.5163, + "step": 4184 + }, + { + "epoch": 1.4283276450511946, + "grad_norm": 3.131387233734131, + "learning_rate": 0.0005238907849829352, + "loss": 6.6058, + "step": 4185 + }, + { + "epoch": 1.4286689419795222, + "grad_norm": 4.881186485290527, + "learning_rate": 0.0005237770193401593, + "loss": 6.105, + "step": 4186 + }, + { + "epoch": 1.4290102389078498, + "grad_norm": 3.3822786808013916, + "learning_rate": 0.0005236632536973835, + "loss": 6.0654, + "step": 4187 + }, + { + "epoch": 1.4293515358361775, + "grad_norm": 3.2397496700286865, + "learning_rate": 0.0005235494880546076, + "loss": 6.7056, + "step": 4188 + }, + { + "epoch": 1.429692832764505, + "grad_norm": 3.1808483600616455, + "learning_rate": 0.0005234357224118317, + "loss": 6.3953, + "step": 4189 + }, + { + "epoch": 1.4300341296928327, + "grad_norm": 3.1761183738708496, + "learning_rate": 0.0005233219567690558, + "loss": 6.3699, + "step": 4190 + }, + { + "epoch": 1.4303754266211604, + "grad_norm": 3.019099712371826, + "learning_rate": 0.0005232081911262798, + "loss": 6.266, + "step": 4191 + }, + { + "epoch": 1.430716723549488, + "grad_norm": 3.0634679794311523, + "learning_rate": 0.000523094425483504, + "loss": 6.5473, + "step": 4192 + }, + { + "epoch": 1.4310580204778156, + "grad_norm": 3.0792815685272217, + "learning_rate": 0.0005229806598407281, + "loss": 6.3998, + "step": 4193 + }, + { + "epoch": 1.4313993174061435, + "grad_norm": 3.048142433166504, + "learning_rate": 0.0005228668941979522, + "loss": 6.0785, + "step": 4194 + }, + { + "epoch": 1.4317406143344709, + "grad_norm": 3.5605955123901367, + "learning_rate": 0.0005227531285551763, + "loss": 6.2169, + "step": 4195 + }, + { + "epoch": 1.4320819112627987, + "grad_norm": 3.1062545776367188, + "learning_rate": 0.0005226393629124004, + "loss": 6.2116, + "step": 4196 + }, + { + "epoch": 1.4324232081911263, + "grad_norm": 3.5202982425689697, + "learning_rate": 0.0005225255972696245, + "loss": 6.6248, + "step": 4197 + }, + { + "epoch": 1.432764505119454, + "grad_norm": 4.942377090454102, + "learning_rate": 0.0005224118316268487, + "loss": 4.8293, + "step": 4198 + }, + { + "epoch": 1.4331058020477816, + "grad_norm": 3.237367868423462, + "learning_rate": 0.0005222980659840728, + "loss": 6.9193, + "step": 4199 + }, + { + "epoch": 1.4334470989761092, + "grad_norm": 3.33652400970459, + "learning_rate": 0.0005221843003412969, + "loss": 5.7304, + "step": 4200 + }, + { + "epoch": 1.4337883959044369, + "grad_norm": 4.465225696563721, + "learning_rate": 0.000522070534698521, + "loss": 5.5101, + "step": 4201 + }, + { + "epoch": 1.4341296928327645, + "grad_norm": 3.111236572265625, + "learning_rate": 0.0005219567690557452, + "loss": 6.8656, + "step": 4202 + }, + { + "epoch": 1.434470989761092, + "grad_norm": 3.2016916275024414, + "learning_rate": 0.0005218430034129693, + "loss": 6.7554, + "step": 4203 + }, + { + "epoch": 1.4348122866894197, + "grad_norm": 3.0454368591308594, + "learning_rate": 0.0005217292377701935, + "loss": 6.2635, + "step": 4204 + }, + { + "epoch": 1.4351535836177474, + "grad_norm": 3.07362699508667, + "learning_rate": 0.0005216154721274176, + "loss": 5.867, + "step": 4205 + }, + { + "epoch": 1.435494880546075, + "grad_norm": 3.0416789054870605, + "learning_rate": 0.0005215017064846417, + "loss": 6.4948, + "step": 4206 + }, + { + "epoch": 1.4358361774744028, + "grad_norm": 3.082453966140747, + "learning_rate": 0.0005213879408418658, + "loss": 6.4393, + "step": 4207 + }, + { + "epoch": 1.4361774744027302, + "grad_norm": 4.051252365112305, + "learning_rate": 0.0005212741751990899, + "loss": 5.6161, + "step": 4208 + }, + { + "epoch": 1.436518771331058, + "grad_norm": 3.2135181427001953, + "learning_rate": 0.0005211604095563141, + "loss": 6.2619, + "step": 4209 + }, + { + "epoch": 1.4368600682593857, + "grad_norm": 4.387260437011719, + "learning_rate": 0.0005210466439135382, + "loss": 5.5295, + "step": 4210 + }, + { + "epoch": 1.4372013651877134, + "grad_norm": 3.195601463317871, + "learning_rate": 0.0005209328782707622, + "loss": 6.3014, + "step": 4211 + }, + { + "epoch": 1.437542662116041, + "grad_norm": 3.236250877380371, + "learning_rate": 0.0005208191126279863, + "loss": 6.6634, + "step": 4212 + }, + { + "epoch": 1.4378839590443686, + "grad_norm": 3.2227210998535156, + "learning_rate": 0.0005207053469852104, + "loss": 6.0613, + "step": 4213 + }, + { + "epoch": 1.4382252559726962, + "grad_norm": 4.377707004547119, + "learning_rate": 0.0005205915813424345, + "loss": 4.7205, + "step": 4214 + }, + { + "epoch": 1.4385665529010239, + "grad_norm": 3.1697402000427246, + "learning_rate": 0.0005204778156996587, + "loss": 6.6822, + "step": 4215 + }, + { + "epoch": 1.4389078498293515, + "grad_norm": 3.424391746520996, + "learning_rate": 0.0005203640500568828, + "loss": 6.3304, + "step": 4216 + }, + { + "epoch": 1.4392491467576791, + "grad_norm": 3.166321039199829, + "learning_rate": 0.0005202502844141069, + "loss": 6.5789, + "step": 4217 + }, + { + "epoch": 1.4395904436860067, + "grad_norm": 3.058227062225342, + "learning_rate": 0.000520136518771331, + "loss": 6.3097, + "step": 4218 + }, + { + "epoch": 1.4399317406143344, + "grad_norm": 3.0803213119506836, + "learning_rate": 0.0005200227531285552, + "loss": 6.195, + "step": 4219 + }, + { + "epoch": 1.4402730375426622, + "grad_norm": 3.0819287300109863, + "learning_rate": 0.0005199089874857793, + "loss": 6.3407, + "step": 4220 + }, + { + "epoch": 1.4406143344709896, + "grad_norm": 3.058324098587036, + "learning_rate": 0.0005197952218430035, + "loss": 6.1638, + "step": 4221 + }, + { + "epoch": 1.4409556313993175, + "grad_norm": 3.040989398956299, + "learning_rate": 0.0005196814562002276, + "loss": 6.2791, + "step": 4222 + }, + { + "epoch": 1.441296928327645, + "grad_norm": 3.0801877975463867, + "learning_rate": 0.0005195676905574517, + "loss": 5.9373, + "step": 4223 + }, + { + "epoch": 1.4416382252559727, + "grad_norm": 3.13028883934021, + "learning_rate": 0.0005194539249146758, + "loss": 5.9899, + "step": 4224 + }, + { + "epoch": 1.4419795221843004, + "grad_norm": 3.0730881690979004, + "learning_rate": 0.0005193401592718999, + "loss": 6.0214, + "step": 4225 + }, + { + "epoch": 1.442320819112628, + "grad_norm": 3.320467233657837, + "learning_rate": 0.000519226393629124, + "loss": 6.4929, + "step": 4226 + }, + { + "epoch": 1.4426621160409556, + "grad_norm": 3.381704330444336, + "learning_rate": 0.0005191126279863482, + "loss": 5.9312, + "step": 4227 + }, + { + "epoch": 1.4430034129692833, + "grad_norm": 3.1299855709075928, + "learning_rate": 0.0005189988623435723, + "loss": 6.446, + "step": 4228 + }, + { + "epoch": 1.4433447098976109, + "grad_norm": 3.0571303367614746, + "learning_rate": 0.0005188850967007964, + "loss": 6.3526, + "step": 4229 + }, + { + "epoch": 1.4436860068259385, + "grad_norm": 3.1851227283477783, + "learning_rate": 0.0005187713310580204, + "loss": 6.7403, + "step": 4230 + }, + { + "epoch": 1.4440273037542661, + "grad_norm": 3.1886208057403564, + "learning_rate": 0.0005186575654152445, + "loss": 6.0976, + "step": 4231 + }, + { + "epoch": 1.4443686006825938, + "grad_norm": 3.0047943592071533, + "learning_rate": 0.0005185437997724687, + "loss": 6.605, + "step": 4232 + }, + { + "epoch": 1.4447098976109216, + "grad_norm": 3.09120512008667, + "learning_rate": 0.0005184300341296928, + "loss": 6.6422, + "step": 4233 + }, + { + "epoch": 1.445051194539249, + "grad_norm": 3.228661060333252, + "learning_rate": 0.000518316268486917, + "loss": 6.1845, + "step": 4234 + }, + { + "epoch": 1.4453924914675769, + "grad_norm": 3.080799102783203, + "learning_rate": 0.000518202502844141, + "loss": 6.8722, + "step": 4235 + }, + { + "epoch": 1.4457337883959045, + "grad_norm": 3.116177558898926, + "learning_rate": 0.0005180887372013652, + "loss": 6.7637, + "step": 4236 + }, + { + "epoch": 1.4460750853242321, + "grad_norm": 3.183600664138794, + "learning_rate": 0.0005179749715585893, + "loss": 5.8418, + "step": 4237 + }, + { + "epoch": 1.4464163822525598, + "grad_norm": 3.1986725330352783, + "learning_rate": 0.0005178612059158135, + "loss": 5.7188, + "step": 4238 + }, + { + "epoch": 1.4467576791808874, + "grad_norm": 3.249572277069092, + "learning_rate": 0.0005177474402730376, + "loss": 5.8931, + "step": 4239 + }, + { + "epoch": 1.447098976109215, + "grad_norm": 3.0727765560150146, + "learning_rate": 0.0005176336746302617, + "loss": 6.6941, + "step": 4240 + }, + { + "epoch": 1.4474402730375426, + "grad_norm": 2.9626481533050537, + "learning_rate": 0.0005175199089874858, + "loss": 5.5147, + "step": 4241 + }, + { + "epoch": 1.4477815699658703, + "grad_norm": 2.4201412200927734, + "learning_rate": 0.0005174061433447099, + "loss": 3.1755, + "step": 4242 + }, + { + "epoch": 1.448122866894198, + "grad_norm": 3.06439471244812, + "learning_rate": 0.000517292377701934, + "loss": 6.101, + "step": 4243 + }, + { + "epoch": 1.4484641638225255, + "grad_norm": 3.1331796646118164, + "learning_rate": 0.0005171786120591582, + "loss": 6.3183, + "step": 4244 + }, + { + "epoch": 1.4488054607508531, + "grad_norm": 3.0797486305236816, + "learning_rate": 0.0005170648464163823, + "loss": 6.3455, + "step": 4245 + }, + { + "epoch": 1.449146757679181, + "grad_norm": 3.1885828971862793, + "learning_rate": 0.0005169510807736064, + "loss": 6.6001, + "step": 4246 + }, + { + "epoch": 1.4494880546075084, + "grad_norm": 4.609078884124756, + "learning_rate": 0.0005168373151308305, + "loss": 4.6183, + "step": 4247 + }, + { + "epoch": 1.4498293515358363, + "grad_norm": 3.357383966445923, + "learning_rate": 0.0005167235494880546, + "loss": 6.4766, + "step": 4248 + }, + { + "epoch": 1.4501706484641639, + "grad_norm": 3.172637701034546, + "learning_rate": 0.0005166097838452787, + "loss": 6.3503, + "step": 4249 + }, + { + "epoch": 1.4505119453924915, + "grad_norm": 3.2504348754882812, + "learning_rate": 0.0005164960182025028, + "loss": 6.4823, + "step": 4250 + }, + { + "epoch": 1.4508532423208191, + "grad_norm": 3.318784475326538, + "learning_rate": 0.000516382252559727, + "loss": 5.2902, + "step": 4251 + }, + { + "epoch": 1.4511945392491468, + "grad_norm": 3.166903257369995, + "learning_rate": 0.000516268486916951, + "loss": 7.0076, + "step": 4252 + }, + { + "epoch": 1.4515358361774744, + "grad_norm": 3.29073166847229, + "learning_rate": 0.0005161547212741752, + "loss": 5.809, + "step": 4253 + }, + { + "epoch": 1.451877133105802, + "grad_norm": 3.2139933109283447, + "learning_rate": 0.0005160409556313993, + "loss": 6.6659, + "step": 4254 + }, + { + "epoch": 1.4522184300341296, + "grad_norm": 2.99873685836792, + "learning_rate": 0.0005159271899886235, + "loss": 6.3525, + "step": 4255 + }, + { + "epoch": 1.4525597269624573, + "grad_norm": 3.2787253856658936, + "learning_rate": 0.0005158134243458476, + "loss": 5.645, + "step": 4256 + }, + { + "epoch": 1.452901023890785, + "grad_norm": 3.140230178833008, + "learning_rate": 0.0005156996587030717, + "loss": 6.1587, + "step": 4257 + }, + { + "epoch": 1.4532423208191125, + "grad_norm": 4.204946994781494, + "learning_rate": 0.0005155858930602958, + "loss": 5.4848, + "step": 4258 + }, + { + "epoch": 1.4535836177474404, + "grad_norm": 3.3353021144866943, + "learning_rate": 0.0005154721274175199, + "loss": 6.1801, + "step": 4259 + }, + { + "epoch": 1.4539249146757678, + "grad_norm": 3.392322540283203, + "learning_rate": 0.000515358361774744, + "loss": 6.8188, + "step": 4260 + }, + { + "epoch": 1.4542662116040956, + "grad_norm": 3.1940486431121826, + "learning_rate": 0.0005152445961319682, + "loss": 7.0783, + "step": 4261 + }, + { + "epoch": 1.4546075085324233, + "grad_norm": 3.1675353050231934, + "learning_rate": 0.0005151308304891923, + "loss": 6.6465, + "step": 4262 + }, + { + "epoch": 1.454948805460751, + "grad_norm": 3.268160581588745, + "learning_rate": 0.0005150170648464164, + "loss": 6.48, + "step": 4263 + }, + { + "epoch": 1.4552901023890785, + "grad_norm": 3.4621543884277344, + "learning_rate": 0.0005149032992036405, + "loss": 6.2458, + "step": 4264 + }, + { + "epoch": 1.4556313993174061, + "grad_norm": 3.037792682647705, + "learning_rate": 0.0005147895335608646, + "loss": 6.4756, + "step": 4265 + }, + { + "epoch": 1.4559726962457338, + "grad_norm": 3.1558916568756104, + "learning_rate": 0.0005146757679180887, + "loss": 6.99, + "step": 4266 + }, + { + "epoch": 1.4563139931740614, + "grad_norm": 3.1241016387939453, + "learning_rate": 0.000514562002275313, + "loss": 6.639, + "step": 4267 + }, + { + "epoch": 1.456655290102389, + "grad_norm": 3.117509603500366, + "learning_rate": 0.0005144482366325371, + "loss": 6.2899, + "step": 4268 + }, + { + "epoch": 1.4569965870307167, + "grad_norm": 3.071044683456421, + "learning_rate": 0.000514334470989761, + "loss": 6.1585, + "step": 4269 + }, + { + "epoch": 1.4573378839590443, + "grad_norm": 3.1628692150115967, + "learning_rate": 0.0005142207053469852, + "loss": 6.4385, + "step": 4270 + }, + { + "epoch": 1.457679180887372, + "grad_norm": 5.46381950378418, + "learning_rate": 0.0005141069397042093, + "loss": 4.9057, + "step": 4271 + }, + { + "epoch": 1.4580204778156998, + "grad_norm": 3.0911929607391357, + "learning_rate": 0.0005139931740614334, + "loss": 6.5145, + "step": 4272 + }, + { + "epoch": 1.4583617747440272, + "grad_norm": 3.1896402835845947, + "learning_rate": 0.0005138794084186576, + "loss": 5.9364, + "step": 4273 + }, + { + "epoch": 1.458703071672355, + "grad_norm": 3.1629385948181152, + "learning_rate": 0.0005137656427758817, + "loss": 5.7098, + "step": 4274 + }, + { + "epoch": 1.4590443686006827, + "grad_norm": 3.908473014831543, + "learning_rate": 0.0005136518771331058, + "loss": 6.1301, + "step": 4275 + }, + { + "epoch": 1.4593856655290103, + "grad_norm": 3.4416491985321045, + "learning_rate": 0.0005135381114903299, + "loss": 5.7183, + "step": 4276 + }, + { + "epoch": 1.459726962457338, + "grad_norm": 3.129523992538452, + "learning_rate": 0.000513424345847554, + "loss": 6.4804, + "step": 4277 + }, + { + "epoch": 1.4600682593856655, + "grad_norm": 3.913393020629883, + "learning_rate": 0.0005133105802047782, + "loss": 5.0513, + "step": 4278 + }, + { + "epoch": 1.4604095563139932, + "grad_norm": 3.2042019367218018, + "learning_rate": 0.0005131968145620023, + "loss": 5.7857, + "step": 4279 + }, + { + "epoch": 1.4607508532423208, + "grad_norm": 3.098621368408203, + "learning_rate": 0.0005130830489192264, + "loss": 6.7836, + "step": 4280 + }, + { + "epoch": 1.4610921501706484, + "grad_norm": 3.1765682697296143, + "learning_rate": 0.0005129692832764505, + "loss": 6.9352, + "step": 4281 + }, + { + "epoch": 1.461433447098976, + "grad_norm": 3.0995981693267822, + "learning_rate": 0.0005128555176336746, + "loss": 6.4874, + "step": 4282 + }, + { + "epoch": 1.4617747440273037, + "grad_norm": 3.3691446781158447, + "learning_rate": 0.0005127417519908987, + "loss": 6.0037, + "step": 4283 + }, + { + "epoch": 1.4621160409556313, + "grad_norm": 4.908443450927734, + "learning_rate": 0.000512627986348123, + "loss": 4.7213, + "step": 4284 + }, + { + "epoch": 1.4624573378839592, + "grad_norm": 3.1471049785614014, + "learning_rate": 0.0005125142207053471, + "loss": 6.9427, + "step": 4285 + }, + { + "epoch": 1.4627986348122866, + "grad_norm": 3.119262456893921, + "learning_rate": 0.0005124004550625712, + "loss": 6.4854, + "step": 4286 + }, + { + "epoch": 1.4631399317406144, + "grad_norm": 3.1892127990722656, + "learning_rate": 0.0005122866894197953, + "loss": 6.2065, + "step": 4287 + }, + { + "epoch": 1.463481228668942, + "grad_norm": 3.0533523559570312, + "learning_rate": 0.0005121729237770194, + "loss": 6.6329, + "step": 4288 + }, + { + "epoch": 1.4638225255972697, + "grad_norm": 4.774298191070557, + "learning_rate": 0.0005120591581342434, + "loss": 5.365, + "step": 4289 + }, + { + "epoch": 1.4641638225255973, + "grad_norm": 4.030971050262451, + "learning_rate": 0.0005119453924914676, + "loss": 5.6187, + "step": 4290 + }, + { + "epoch": 1.464505119453925, + "grad_norm": 3.2482876777648926, + "learning_rate": 0.0005118316268486917, + "loss": 6.1619, + "step": 4291 + }, + { + "epoch": 1.4648464163822525, + "grad_norm": 2.9875893592834473, + "learning_rate": 0.0005117178612059158, + "loss": 6.4415, + "step": 4292 + }, + { + "epoch": 1.4651877133105802, + "grad_norm": 4.620938301086426, + "learning_rate": 0.0005116040955631399, + "loss": 1.5726, + "step": 4293 + }, + { + "epoch": 1.4655290102389078, + "grad_norm": 3.08268141746521, + "learning_rate": 0.000511490329920364, + "loss": 6.0083, + "step": 4294 + }, + { + "epoch": 1.4658703071672354, + "grad_norm": 3.3414804935455322, + "learning_rate": 0.0005113765642775882, + "loss": 6.0082, + "step": 4295 + }, + { + "epoch": 1.466211604095563, + "grad_norm": 4.3554792404174805, + "learning_rate": 0.0005112627986348123, + "loss": 4.8411, + "step": 4296 + }, + { + "epoch": 1.4665529010238907, + "grad_norm": 3.1311569213867188, + "learning_rate": 0.0005111490329920364, + "loss": 6.3444, + "step": 4297 + }, + { + "epoch": 1.4668941979522185, + "grad_norm": 3.0714824199676514, + "learning_rate": 0.0005110352673492605, + "loss": 6.2691, + "step": 4298 + }, + { + "epoch": 1.467235494880546, + "grad_norm": 2.927114963531494, + "learning_rate": 0.0005109215017064846, + "loss": 6.2727, + "step": 4299 + }, + { + "epoch": 1.4675767918088738, + "grad_norm": 2.885935068130493, + "learning_rate": 0.0005108077360637087, + "loss": 6.4839, + "step": 4300 + }, + { + "epoch": 1.4679180887372014, + "grad_norm": 2.985340118408203, + "learning_rate": 0.000510693970420933, + "loss": 6.5858, + "step": 4301 + }, + { + "epoch": 1.468259385665529, + "grad_norm": 3.0189003944396973, + "learning_rate": 0.0005105802047781571, + "loss": 6.4866, + "step": 4302 + }, + { + "epoch": 1.4686006825938567, + "grad_norm": 3.324467658996582, + "learning_rate": 0.0005104664391353812, + "loss": 6.1826, + "step": 4303 + }, + { + "epoch": 1.4689419795221843, + "grad_norm": 3.1696572303771973, + "learning_rate": 0.0005103526734926053, + "loss": 6.7885, + "step": 4304 + }, + { + "epoch": 1.469283276450512, + "grad_norm": 3.074852705001831, + "learning_rate": 0.0005102389078498294, + "loss": 6.6288, + "step": 4305 + }, + { + "epoch": 1.4696245733788396, + "grad_norm": 2.911682367324829, + "learning_rate": 0.0005101251422070535, + "loss": 6.4831, + "step": 4306 + }, + { + "epoch": 1.4699658703071672, + "grad_norm": 3.0853192806243896, + "learning_rate": 0.0005100113765642777, + "loss": 6.6247, + "step": 4307 + }, + { + "epoch": 1.4703071672354948, + "grad_norm": 3.0632660388946533, + "learning_rate": 0.0005098976109215017, + "loss": 6.4772, + "step": 4308 + }, + { + "epoch": 1.4706484641638224, + "grad_norm": 3.191326379776001, + "learning_rate": 0.0005097838452787258, + "loss": 6.1043, + "step": 4309 + }, + { + "epoch": 1.47098976109215, + "grad_norm": 2.7078959941864014, + "learning_rate": 0.0005096700796359499, + "loss": 3.3701, + "step": 4310 + }, + { + "epoch": 1.471331058020478, + "grad_norm": 3.1352057456970215, + "learning_rate": 0.000509556313993174, + "loss": 6.237, + "step": 4311 + }, + { + "epoch": 1.4716723549488053, + "grad_norm": 3.192700147628784, + "learning_rate": 0.0005094425483503981, + "loss": 6.6129, + "step": 4312 + }, + { + "epoch": 1.4720136518771332, + "grad_norm": 2.962541103363037, + "learning_rate": 0.0005093287827076223, + "loss": 5.8705, + "step": 4313 + }, + { + "epoch": 1.4723549488054608, + "grad_norm": 3.1734907627105713, + "learning_rate": 0.0005092150170648464, + "loss": 5.8751, + "step": 4314 + }, + { + "epoch": 1.4726962457337884, + "grad_norm": 5.9300456047058105, + "learning_rate": 0.0005091012514220705, + "loss": 5.5791, + "step": 4315 + }, + { + "epoch": 1.473037542662116, + "grad_norm": 2.9836061000823975, + "learning_rate": 0.0005089874857792946, + "loss": 6.6064, + "step": 4316 + }, + { + "epoch": 1.4733788395904437, + "grad_norm": 3.1486117839813232, + "learning_rate": 0.0005088737201365187, + "loss": 6.6439, + "step": 4317 + }, + { + "epoch": 1.4737201365187713, + "grad_norm": 3.243680000305176, + "learning_rate": 0.000508759954493743, + "loss": 6.4473, + "step": 4318 + }, + { + "epoch": 1.474061433447099, + "grad_norm": 3.0161967277526855, + "learning_rate": 0.0005086461888509671, + "loss": 6.6844, + "step": 4319 + }, + { + "epoch": 1.4744027303754266, + "grad_norm": 3.0081169605255127, + "learning_rate": 0.0005085324232081912, + "loss": 6.537, + "step": 4320 + }, + { + "epoch": 1.4747440273037542, + "grad_norm": 2.991122245788574, + "learning_rate": 0.0005084186575654153, + "loss": 6.3377, + "step": 4321 + }, + { + "epoch": 1.4750853242320818, + "grad_norm": 3.0482964515686035, + "learning_rate": 0.0005083048919226394, + "loss": 6.6401, + "step": 4322 + }, + { + "epoch": 1.4754266211604095, + "grad_norm": 4.201125621795654, + "learning_rate": 0.0005081911262798635, + "loss": 6.4634, + "step": 4323 + }, + { + "epoch": 1.4757679180887373, + "grad_norm": 3.031925916671753, + "learning_rate": 0.0005080773606370877, + "loss": 6.3946, + "step": 4324 + }, + { + "epoch": 1.4761092150170647, + "grad_norm": 3.21101975440979, + "learning_rate": 0.0005079635949943118, + "loss": 6.258, + "step": 4325 + }, + { + "epoch": 1.4764505119453926, + "grad_norm": 3.451096534729004, + "learning_rate": 0.0005078498293515359, + "loss": 5.4511, + "step": 4326 + }, + { + "epoch": 1.4767918088737202, + "grad_norm": 3.0925445556640625, + "learning_rate": 0.0005077360637087599, + "loss": 6.7531, + "step": 4327 + }, + { + "epoch": 1.4771331058020478, + "grad_norm": 3.206573963165283, + "learning_rate": 0.000507622298065984, + "loss": 6.8944, + "step": 4328 + }, + { + "epoch": 1.4774744027303754, + "grad_norm": 3.1508631706237793, + "learning_rate": 0.0005075085324232081, + "loss": 6.8273, + "step": 4329 + }, + { + "epoch": 1.477815699658703, + "grad_norm": 3.3948967456817627, + "learning_rate": 0.0005073947667804323, + "loss": 5.824, + "step": 4330 + }, + { + "epoch": 1.4781569965870307, + "grad_norm": 3.199467182159424, + "learning_rate": 0.0005072810011376564, + "loss": 6.1532, + "step": 4331 + }, + { + "epoch": 1.4784982935153583, + "grad_norm": 3.0080020427703857, + "learning_rate": 0.0005071672354948805, + "loss": 6.5633, + "step": 4332 + }, + { + "epoch": 1.478839590443686, + "grad_norm": 3.230149030685425, + "learning_rate": 0.0005070534698521046, + "loss": 6.5214, + "step": 4333 + }, + { + "epoch": 1.4791808873720136, + "grad_norm": 3.4948723316192627, + "learning_rate": 0.0005069397042093287, + "loss": 5.9058, + "step": 4334 + }, + { + "epoch": 1.4795221843003412, + "grad_norm": 3.171333074569702, + "learning_rate": 0.000506825938566553, + "loss": 6.203, + "step": 4335 + }, + { + "epoch": 1.4798634812286688, + "grad_norm": 3.0840699672698975, + "learning_rate": 0.0005067121729237771, + "loss": 6.1753, + "step": 4336 + }, + { + "epoch": 1.4802047781569967, + "grad_norm": 3.062875747680664, + "learning_rate": 0.0005065984072810012, + "loss": 6.574, + "step": 4337 + }, + { + "epoch": 1.480546075085324, + "grad_norm": 5.447630882263184, + "learning_rate": 0.0005064846416382253, + "loss": 4.2755, + "step": 4338 + }, + { + "epoch": 1.480887372013652, + "grad_norm": 3.1661298274993896, + "learning_rate": 0.0005063708759954494, + "loss": 6.4025, + "step": 4339 + }, + { + "epoch": 1.4812286689419796, + "grad_norm": 3.3090450763702393, + "learning_rate": 0.0005062571103526735, + "loss": 6.0994, + "step": 4340 + }, + { + "epoch": 1.4815699658703072, + "grad_norm": 3.69329571723938, + "learning_rate": 0.0005061433447098977, + "loss": 4.7961, + "step": 4341 + }, + { + "epoch": 1.4819112627986348, + "grad_norm": 3.3056480884552, + "learning_rate": 0.0005060295790671218, + "loss": 6.4119, + "step": 4342 + }, + { + "epoch": 1.4822525597269625, + "grad_norm": 10.146489143371582, + "learning_rate": 0.0005059158134243459, + "loss": 6.9681, + "step": 4343 + }, + { + "epoch": 1.48259385665529, + "grad_norm": 3.1964523792266846, + "learning_rate": 0.00050580204778157, + "loss": 6.9197, + "step": 4344 + }, + { + "epoch": 1.4829351535836177, + "grad_norm": 3.3806843757629395, + "learning_rate": 0.0005056882821387941, + "loss": 6.3696, + "step": 4345 + }, + { + "epoch": 1.4832764505119453, + "grad_norm": 3.172544002532959, + "learning_rate": 0.0005055745164960182, + "loss": 5.6425, + "step": 4346 + }, + { + "epoch": 1.483617747440273, + "grad_norm": 3.494082450866699, + "learning_rate": 0.0005054607508532423, + "loss": 6.5567, + "step": 4347 + }, + { + "epoch": 1.4839590443686006, + "grad_norm": 3.1376161575317383, + "learning_rate": 0.0005053469852104664, + "loss": 6.1425, + "step": 4348 + }, + { + "epoch": 1.4843003412969282, + "grad_norm": 4.658734321594238, + "learning_rate": 0.0005052332195676905, + "loss": 4.829, + "step": 4349 + }, + { + "epoch": 1.484641638225256, + "grad_norm": 3.0457332134246826, + "learning_rate": 0.0005051194539249146, + "loss": 6.6027, + "step": 4350 + }, + { + "epoch": 1.4849829351535835, + "grad_norm": 3.252115249633789, + "learning_rate": 0.0005050056882821387, + "loss": 6.4487, + "step": 4351 + }, + { + "epoch": 1.4853242320819113, + "grad_norm": 3.3878161907196045, + "learning_rate": 0.0005048919226393628, + "loss": 5.874, + "step": 4352 + }, + { + "epoch": 1.485665529010239, + "grad_norm": 3.082244873046875, + "learning_rate": 0.0005047781569965871, + "loss": 6.3979, + "step": 4353 + }, + { + "epoch": 1.4860068259385666, + "grad_norm": 3.0858993530273438, + "learning_rate": 0.0005046643913538112, + "loss": 6.2685, + "step": 4354 + }, + { + "epoch": 1.4863481228668942, + "grad_norm": 3.532917022705078, + "learning_rate": 0.0005045506257110353, + "loss": 6.3543, + "step": 4355 + }, + { + "epoch": 1.4866894197952218, + "grad_norm": 3.1342530250549316, + "learning_rate": 0.0005044368600682594, + "loss": 5.8409, + "step": 4356 + }, + { + "epoch": 1.4870307167235495, + "grad_norm": 3.64797306060791, + "learning_rate": 0.0005043230944254835, + "loss": 6.3942, + "step": 4357 + }, + { + "epoch": 1.487372013651877, + "grad_norm": 4.409688472747803, + "learning_rate": 0.0005042093287827077, + "loss": 5.6323, + "step": 4358 + }, + { + "epoch": 1.4877133105802047, + "grad_norm": 3.2418742179870605, + "learning_rate": 0.0005040955631399318, + "loss": 6.3286, + "step": 4359 + }, + { + "epoch": 1.4880546075085324, + "grad_norm": 3.112736225128174, + "learning_rate": 0.0005039817974971559, + "loss": 6.2084, + "step": 4360 + }, + { + "epoch": 1.4883959044368602, + "grad_norm": 3.1486265659332275, + "learning_rate": 0.00050386803185438, + "loss": 6.2417, + "step": 4361 + }, + { + "epoch": 1.4887372013651876, + "grad_norm": 5.536955833435059, + "learning_rate": 0.0005037542662116041, + "loss": 5.6501, + "step": 4362 + }, + { + "epoch": 1.4890784982935155, + "grad_norm": 3.1049556732177734, + "learning_rate": 0.0005036405005688282, + "loss": 6.3247, + "step": 4363 + }, + { + "epoch": 1.4894197952218429, + "grad_norm": 3.001415252685547, + "learning_rate": 0.0005035267349260524, + "loss": 6.7637, + "step": 4364 + }, + { + "epoch": 1.4897610921501707, + "grad_norm": 2.8671603202819824, + "learning_rate": 0.0005034129692832765, + "loss": 6.3853, + "step": 4365 + }, + { + "epoch": 1.4901023890784983, + "grad_norm": 4.22214937210083, + "learning_rate": 0.0005032992036405005, + "loss": 6.064, + "step": 4366 + }, + { + "epoch": 1.490443686006826, + "grad_norm": 3.110260009765625, + "learning_rate": 0.0005031854379977246, + "loss": 6.2449, + "step": 4367 + }, + { + "epoch": 1.4907849829351536, + "grad_norm": 3.0448741912841797, + "learning_rate": 0.0005030716723549487, + "loss": 6.6455, + "step": 4368 + }, + { + "epoch": 1.4911262798634812, + "grad_norm": 3.040241003036499, + "learning_rate": 0.0005029579067121728, + "loss": 6.2379, + "step": 4369 + }, + { + "epoch": 1.4914675767918089, + "grad_norm": 3.210784912109375, + "learning_rate": 0.0005028441410693971, + "loss": 5.9151, + "step": 4370 + }, + { + "epoch": 1.4918088737201365, + "grad_norm": 4.361735820770264, + "learning_rate": 0.0005027303754266212, + "loss": 5.6326, + "step": 4371 + }, + { + "epoch": 1.4921501706484641, + "grad_norm": 3.1003541946411133, + "learning_rate": 0.0005026166097838453, + "loss": 6.7568, + "step": 4372 + }, + { + "epoch": 1.4924914675767917, + "grad_norm": 4.7033281326293945, + "learning_rate": 0.0005025028441410694, + "loss": 5.7138, + "step": 4373 + }, + { + "epoch": 1.4928327645051196, + "grad_norm": 3.1924164295196533, + "learning_rate": 0.0005023890784982935, + "loss": 6.2701, + "step": 4374 + }, + { + "epoch": 1.493174061433447, + "grad_norm": 3.1967172622680664, + "learning_rate": 0.0005022753128555177, + "loss": 6.2449, + "step": 4375 + }, + { + "epoch": 1.4935153583617748, + "grad_norm": 3.1058359146118164, + "learning_rate": 0.0005021615472127418, + "loss": 6.567, + "step": 4376 + }, + { + "epoch": 1.4938566552901023, + "grad_norm": 3.0760414600372314, + "learning_rate": 0.0005020477815699659, + "loss": 6.221, + "step": 4377 + }, + { + "epoch": 1.49419795221843, + "grad_norm": 3.031233549118042, + "learning_rate": 0.00050193401592719, + "loss": 6.3645, + "step": 4378 + }, + { + "epoch": 1.4945392491467577, + "grad_norm": 2.9455747604370117, + "learning_rate": 0.0005018202502844141, + "loss": 6.3219, + "step": 4379 + }, + { + "epoch": 1.4948805460750854, + "grad_norm": 3.113830804824829, + "learning_rate": 0.0005017064846416382, + "loss": 6.1877, + "step": 4380 + }, + { + "epoch": 1.495221843003413, + "grad_norm": 2.8928475379943848, + "learning_rate": 0.0005015927189988624, + "loss": 6.0893, + "step": 4381 + }, + { + "epoch": 1.4955631399317406, + "grad_norm": 5.041830539703369, + "learning_rate": 0.0005014789533560865, + "loss": 5.6682, + "step": 4382 + }, + { + "epoch": 1.4959044368600682, + "grad_norm": 3.120551109313965, + "learning_rate": 0.0005013651877133106, + "loss": 6.5289, + "step": 4383 + }, + { + "epoch": 1.4962457337883959, + "grad_norm": 3.156862735748291, + "learning_rate": 0.0005012514220705347, + "loss": 6.1597, + "step": 4384 + }, + { + "epoch": 1.4965870307167235, + "grad_norm": 2.876394510269165, + "learning_rate": 0.0005011376564277589, + "loss": 3.1941, + "step": 4385 + }, + { + "epoch": 1.4969283276450511, + "grad_norm": 3.063058376312256, + "learning_rate": 0.0005010238907849828, + "loss": 6.0953, + "step": 4386 + }, + { + "epoch": 1.497269624573379, + "grad_norm": 3.1170380115509033, + "learning_rate": 0.0005009101251422071, + "loss": 6.3125, + "step": 4387 + }, + { + "epoch": 1.4976109215017064, + "grad_norm": 5.3596062660217285, + "learning_rate": 0.0005007963594994312, + "loss": 4.3308, + "step": 4388 + }, + { + "epoch": 1.4979522184300342, + "grad_norm": 3.159137487411499, + "learning_rate": 0.0005006825938566553, + "loss": 6.5011, + "step": 4389 + }, + { + "epoch": 1.4982935153583616, + "grad_norm": 3.204058885574341, + "learning_rate": 0.0005005688282138794, + "loss": 6.6059, + "step": 4390 + }, + { + "epoch": 1.4986348122866895, + "grad_norm": 3.0355639457702637, + "learning_rate": 0.0005004550625711035, + "loss": 6.5694, + "step": 4391 + }, + { + "epoch": 1.4989761092150171, + "grad_norm": 3.0069313049316406, + "learning_rate": 0.0005003412969283276, + "loss": 6.1367, + "step": 4392 + }, + { + "epoch": 1.4993174061433447, + "grad_norm": 3.0549659729003906, + "learning_rate": 0.0005002275312855518, + "loss": 6.1733, + "step": 4393 + }, + { + "epoch": 1.4996587030716724, + "grad_norm": 3.0711121559143066, + "learning_rate": 0.0005001137656427759, + "loss": 6.0055, + "step": 4394 + }, + { + "epoch": 1.5, + "grad_norm": 3.120171546936035, + "learning_rate": 0.0005, + "loss": 6.6731, + "step": 4395 + }, + { + "epoch": 1.5003412969283276, + "grad_norm": 3.0403029918670654, + "learning_rate": 0.0004998862343572241, + "loss": 6.1493, + "step": 4396 + }, + { + "epoch": 1.5006825938566553, + "grad_norm": 3.202802896499634, + "learning_rate": 0.0004997724687144482, + "loss": 6.1068, + "step": 4397 + }, + { + "epoch": 1.5010238907849829, + "grad_norm": 3.0703487396240234, + "learning_rate": 0.0004996587030716724, + "loss": 6.4405, + "step": 4398 + }, + { + "epoch": 1.5013651877133105, + "grad_norm": 3.1371710300445557, + "learning_rate": 0.0004995449374288965, + "loss": 6.5393, + "step": 4399 + }, + { + "epoch": 1.5017064846416384, + "grad_norm": 3.1856324672698975, + "learning_rate": 0.0004994311717861205, + "loss": 6.4713, + "step": 4400 + }, + { + "epoch": 1.5020477815699658, + "grad_norm": 3.43066143989563, + "learning_rate": 0.0004993174061433447, + "loss": 5.8309, + "step": 4401 + }, + { + "epoch": 1.5023890784982936, + "grad_norm": 3.037707567214966, + "learning_rate": 0.0004992036405005689, + "loss": 6.4918, + "step": 4402 + }, + { + "epoch": 1.502730375426621, + "grad_norm": 3.1267921924591064, + "learning_rate": 0.000499089874857793, + "loss": 6.2709, + "step": 4403 + }, + { + "epoch": 1.5030716723549489, + "grad_norm": 3.2483456134796143, + "learning_rate": 0.0004989761092150171, + "loss": 6.1242, + "step": 4404 + }, + { + "epoch": 1.5034129692832765, + "grad_norm": 3.104841947555542, + "learning_rate": 0.0004988623435722412, + "loss": 6.3647, + "step": 4405 + }, + { + "epoch": 1.5037542662116041, + "grad_norm": 3.147634267807007, + "learning_rate": 0.0004987485779294653, + "loss": 6.5279, + "step": 4406 + }, + { + "epoch": 1.5040955631399318, + "grad_norm": 3.1218855381011963, + "learning_rate": 0.0004986348122866895, + "loss": 6.1442, + "step": 4407 + }, + { + "epoch": 1.5044368600682594, + "grad_norm": 3.1170077323913574, + "learning_rate": 0.0004985210466439136, + "loss": 6.1911, + "step": 4408 + }, + { + "epoch": 1.504778156996587, + "grad_norm": 4.791984558105469, + "learning_rate": 0.0004984072810011377, + "loss": 5.5688, + "step": 4409 + }, + { + "epoch": 1.5051194539249146, + "grad_norm": 2.986837863922119, + "learning_rate": 0.0004982935153583618, + "loss": 6.2765, + "step": 4410 + }, + { + "epoch": 1.5054607508532423, + "grad_norm": 3.148416757583618, + "learning_rate": 0.0004981797497155859, + "loss": 6.2501, + "step": 4411 + }, + { + "epoch": 1.50580204778157, + "grad_norm": 3.0591113567352295, + "learning_rate": 0.00049806598407281, + "loss": 6.6508, + "step": 4412 + }, + { + "epoch": 1.5061433447098977, + "grad_norm": 3.104835271835327, + "learning_rate": 0.0004979522184300341, + "loss": 6.0843, + "step": 4413 + }, + { + "epoch": 1.5064846416382252, + "grad_norm": 3.147568464279175, + "learning_rate": 0.0004978384527872582, + "loss": 5.7625, + "step": 4414 + }, + { + "epoch": 1.506825938566553, + "grad_norm": 3.17414927482605, + "learning_rate": 0.0004977246871444823, + "loss": 5.7501, + "step": 4415 + }, + { + "epoch": 1.5071672354948804, + "grad_norm": 10.699484825134277, + "learning_rate": 0.0004976109215017065, + "loss": 5.8049, + "step": 4416 + }, + { + "epoch": 1.5075085324232083, + "grad_norm": 3.1057991981506348, + "learning_rate": 0.0004974971558589306, + "loss": 6.3801, + "step": 4417 + }, + { + "epoch": 1.5078498293515359, + "grad_norm": 3.1610498428344727, + "learning_rate": 0.0004973833902161547, + "loss": 6.239, + "step": 4418 + }, + { + "epoch": 1.5081911262798635, + "grad_norm": 3.110896348953247, + "learning_rate": 0.0004972696245733789, + "loss": 6.927, + "step": 4419 + }, + { + "epoch": 1.5085324232081911, + "grad_norm": 3.309873580932617, + "learning_rate": 0.000497155858930603, + "loss": 6.2068, + "step": 4420 + }, + { + "epoch": 1.5088737201365188, + "grad_norm": 2.9430341720581055, + "learning_rate": 0.0004970420932878271, + "loss": 6.6838, + "step": 4421 + }, + { + "epoch": 1.5092150170648464, + "grad_norm": 3.1343164443969727, + "learning_rate": 0.0004969283276450512, + "loss": 6.2543, + "step": 4422 + }, + { + "epoch": 1.509556313993174, + "grad_norm": 3.106687068939209, + "learning_rate": 0.0004968145620022753, + "loss": 6.4328, + "step": 4423 + }, + { + "epoch": 1.5098976109215017, + "grad_norm": 3.152050018310547, + "learning_rate": 0.0004967007963594995, + "loss": 5.92, + "step": 4424 + }, + { + "epoch": 1.5102389078498293, + "grad_norm": 3.016350507736206, + "learning_rate": 0.0004965870307167236, + "loss": 6.5876, + "step": 4425 + }, + { + "epoch": 1.5105802047781571, + "grad_norm": 3.782809257507324, + "learning_rate": 0.0004964732650739477, + "loss": 5.1762, + "step": 4426 + }, + { + "epoch": 1.5109215017064845, + "grad_norm": 3.22169828414917, + "learning_rate": 0.0004963594994311718, + "loss": 5.9286, + "step": 4427 + }, + { + "epoch": 1.5112627986348124, + "grad_norm": 3.2454440593719482, + "learning_rate": 0.0004962457337883959, + "loss": 6.5536, + "step": 4428 + }, + { + "epoch": 1.5116040955631398, + "grad_norm": 3.292848587036133, + "learning_rate": 0.00049613196814562, + "loss": 5.9237, + "step": 4429 + }, + { + "epoch": 1.5119453924914676, + "grad_norm": 3.0619893074035645, + "learning_rate": 0.0004960182025028441, + "loss": 6.0451, + "step": 4430 + }, + { + "epoch": 1.5122866894197953, + "grad_norm": 3.2105019092559814, + "learning_rate": 0.0004959044368600682, + "loss": 6.5059, + "step": 4431 + }, + { + "epoch": 1.512627986348123, + "grad_norm": 3.131542921066284, + "learning_rate": 0.0004957906712172923, + "loss": 6.6094, + "step": 4432 + }, + { + "epoch": 1.5129692832764505, + "grad_norm": 3.2492835521698, + "learning_rate": 0.0004956769055745165, + "loss": 6.0207, + "step": 4433 + }, + { + "epoch": 1.5133105802047782, + "grad_norm": 3.1075937747955322, + "learning_rate": 0.0004955631399317406, + "loss": 6.6052, + "step": 4434 + }, + { + "epoch": 1.5136518771331058, + "grad_norm": 3.096921443939209, + "learning_rate": 0.0004954493742889648, + "loss": 6.0225, + "step": 4435 + }, + { + "epoch": 1.5139931740614334, + "grad_norm": 3.1452181339263916, + "learning_rate": 0.0004953356086461889, + "loss": 5.4943, + "step": 4436 + }, + { + "epoch": 1.514334470989761, + "grad_norm": 3.2415482997894287, + "learning_rate": 0.000495221843003413, + "loss": 6.7057, + "step": 4437 + }, + { + "epoch": 1.5146757679180887, + "grad_norm": 3.020129442214966, + "learning_rate": 0.0004951080773606372, + "loss": 6.2966, + "step": 4438 + }, + { + "epoch": 1.5150170648464165, + "grad_norm": 3.014113426208496, + "learning_rate": 0.0004949943117178612, + "loss": 5.4332, + "step": 4439 + }, + { + "epoch": 1.515358361774744, + "grad_norm": 3.206845760345459, + "learning_rate": 0.0004948805460750853, + "loss": 5.9552, + "step": 4440 + }, + { + "epoch": 1.5156996587030718, + "grad_norm": 3.4275362491607666, + "learning_rate": 0.0004947667804323095, + "loss": 5.7084, + "step": 4441 + }, + { + "epoch": 1.5160409556313992, + "grad_norm": 3.051011085510254, + "learning_rate": 0.0004946530147895336, + "loss": 6.4459, + "step": 4442 + }, + { + "epoch": 1.516382252559727, + "grad_norm": 4.869009017944336, + "learning_rate": 0.0004945392491467577, + "loss": 5.4903, + "step": 4443 + }, + { + "epoch": 1.5167235494880547, + "grad_norm": 3.0409438610076904, + "learning_rate": 0.0004944254835039818, + "loss": 6.5647, + "step": 4444 + }, + { + "epoch": 1.5170648464163823, + "grad_norm": 3.4340035915374756, + "learning_rate": 0.0004943117178612059, + "loss": 6.1847, + "step": 4445 + }, + { + "epoch": 1.51740614334471, + "grad_norm": 3.105072021484375, + "learning_rate": 0.00049419795221843, + "loss": 6.2396, + "step": 4446 + }, + { + "epoch": 1.5177474402730375, + "grad_norm": 7.7960638999938965, + "learning_rate": 0.0004940841865756542, + "loss": 5.791, + "step": 4447 + }, + { + "epoch": 1.5180887372013652, + "grad_norm": 3.257929801940918, + "learning_rate": 0.0004939704209328783, + "loss": 6.3392, + "step": 4448 + }, + { + "epoch": 1.5184300341296928, + "grad_norm": 3.071336269378662, + "learning_rate": 0.0004938566552901023, + "loss": 6.1078, + "step": 4449 + }, + { + "epoch": 1.5187713310580204, + "grad_norm": 3.080676317214966, + "learning_rate": 0.0004937428896473265, + "loss": 6.3788, + "step": 4450 + }, + { + "epoch": 1.519112627986348, + "grad_norm": 3.079050064086914, + "learning_rate": 0.0004936291240045506, + "loss": 6.1288, + "step": 4451 + }, + { + "epoch": 1.519453924914676, + "grad_norm": 3.3807246685028076, + "learning_rate": 0.0004935153583617748, + "loss": 6.4449, + "step": 4452 + }, + { + "epoch": 1.5197952218430033, + "grad_norm": 4.4379730224609375, + "learning_rate": 0.0004934015927189989, + "loss": 6.0084, + "step": 4453 + }, + { + "epoch": 1.5201365187713312, + "grad_norm": 5.487371444702148, + "learning_rate": 0.000493287827076223, + "loss": 6.276, + "step": 4454 + }, + { + "epoch": 1.5204778156996586, + "grad_norm": 3.2696280479431152, + "learning_rate": 0.0004931740614334471, + "loss": 6.928, + "step": 4455 + }, + { + "epoch": 1.5208191126279864, + "grad_norm": 3.299424648284912, + "learning_rate": 0.0004930602957906713, + "loss": 5.6853, + "step": 4456 + }, + { + "epoch": 1.521160409556314, + "grad_norm": 3.520862340927124, + "learning_rate": 0.0004929465301478954, + "loss": 6.0821, + "step": 4457 + }, + { + "epoch": 1.5215017064846417, + "grad_norm": 3.142890691757202, + "learning_rate": 0.0004928327645051195, + "loss": 6.0354, + "step": 4458 + }, + { + "epoch": 1.5218430034129693, + "grad_norm": 2.9746880531311035, + "learning_rate": 0.0004927189988623436, + "loss": 6.4439, + "step": 4459 + }, + { + "epoch": 1.522184300341297, + "grad_norm": 4.289761066436768, + "learning_rate": 0.0004926052332195677, + "loss": 5.2062, + "step": 4460 + }, + { + "epoch": 1.5225255972696246, + "grad_norm": 3.730030059814453, + "learning_rate": 0.0004924914675767918, + "loss": 4.8441, + "step": 4461 + }, + { + "epoch": 1.5228668941979522, + "grad_norm": 3.0463922023773193, + "learning_rate": 0.0004923777019340159, + "loss": 6.1014, + "step": 4462 + }, + { + "epoch": 1.5232081911262798, + "grad_norm": 3.1336638927459717, + "learning_rate": 0.00049226393629124, + "loss": 6.3418, + "step": 4463 + }, + { + "epoch": 1.5235494880546074, + "grad_norm": 3.0351781845092773, + "learning_rate": 0.0004921501706484642, + "loss": 6.3558, + "step": 4464 + }, + { + "epoch": 1.5238907849829353, + "grad_norm": 3.130140542984009, + "learning_rate": 0.0004920364050056883, + "loss": 6.5153, + "step": 4465 + }, + { + "epoch": 1.5242320819112627, + "grad_norm": 3.104055404663086, + "learning_rate": 0.0004919226393629124, + "loss": 6.4679, + "step": 4466 + }, + { + "epoch": 1.5245733788395905, + "grad_norm": 4.129725933074951, + "learning_rate": 0.0004918088737201365, + "loss": 5.9675, + "step": 4467 + }, + { + "epoch": 1.524914675767918, + "grad_norm": 3.040032386779785, + "learning_rate": 0.0004916951080773606, + "loss": 6.476, + "step": 4468 + }, + { + "epoch": 1.5252559726962458, + "grad_norm": 2.9849636554718018, + "learning_rate": 0.0004915813424345848, + "loss": 6.3094, + "step": 4469 + }, + { + "epoch": 1.5255972696245734, + "grad_norm": 3.130922794342041, + "learning_rate": 0.0004914675767918089, + "loss": 6.2869, + "step": 4470 + }, + { + "epoch": 1.525938566552901, + "grad_norm": 3.0397064685821533, + "learning_rate": 0.000491353811149033, + "loss": 6.6056, + "step": 4471 + }, + { + "epoch": 1.5262798634812287, + "grad_norm": 3.0539159774780273, + "learning_rate": 0.0004912400455062571, + "loss": 6.4248, + "step": 4472 + }, + { + "epoch": 1.5266211604095563, + "grad_norm": 2.9415955543518066, + "learning_rate": 0.0004911262798634813, + "loss": 6.1774, + "step": 4473 + }, + { + "epoch": 1.526962457337884, + "grad_norm": 3.0912396907806396, + "learning_rate": 0.0004910125142207054, + "loss": 6.3592, + "step": 4474 + }, + { + "epoch": 1.5273037542662116, + "grad_norm": 3.2052664756774902, + "learning_rate": 0.0004908987485779295, + "loss": 5.9968, + "step": 4475 + }, + { + "epoch": 1.5276450511945392, + "grad_norm": 3.073906421661377, + "learning_rate": 0.0004907849829351536, + "loss": 6.3548, + "step": 4476 + }, + { + "epoch": 1.5279863481228668, + "grad_norm": 3.0415360927581787, + "learning_rate": 0.0004906712172923777, + "loss": 5.5661, + "step": 4477 + }, + { + "epoch": 1.5283276450511947, + "grad_norm": 2.9644320011138916, + "learning_rate": 0.0004905574516496018, + "loss": 5.781, + "step": 4478 + }, + { + "epoch": 1.528668941979522, + "grad_norm": 3.074263095855713, + "learning_rate": 0.0004904436860068259, + "loss": 6.4673, + "step": 4479 + }, + { + "epoch": 1.52901023890785, + "grad_norm": 3.1107773780822754, + "learning_rate": 0.00049032992036405, + "loss": 6.2733, + "step": 4480 + }, + { + "epoch": 1.5293515358361773, + "grad_norm": 3.262098550796509, + "learning_rate": 0.0004902161547212742, + "loss": 5.7654, + "step": 4481 + }, + { + "epoch": 1.5296928327645052, + "grad_norm": 7.093457221984863, + "learning_rate": 0.0004901023890784983, + "loss": 5.9941, + "step": 4482 + }, + { + "epoch": 1.5300341296928328, + "grad_norm": 3.19046950340271, + "learning_rate": 0.0004899886234357224, + "loss": 6.786, + "step": 4483 + }, + { + "epoch": 1.5303754266211604, + "grad_norm": 3.732973337173462, + "learning_rate": 0.0004898748577929465, + "loss": 5.1743, + "step": 4484 + }, + { + "epoch": 1.530716723549488, + "grad_norm": 3.1745400428771973, + "learning_rate": 0.0004897610921501706, + "loss": 5.9867, + "step": 4485 + }, + { + "epoch": 1.5310580204778157, + "grad_norm": 3.155008554458618, + "learning_rate": 0.0004896473265073948, + "loss": 6.6222, + "step": 4486 + }, + { + "epoch": 1.5313993174061433, + "grad_norm": 4.09003210067749, + "learning_rate": 0.000489533560864619, + "loss": 5.6898, + "step": 4487 + }, + { + "epoch": 1.531740614334471, + "grad_norm": 4.12489128112793, + "learning_rate": 0.000489419795221843, + "loss": 4.9424, + "step": 4488 + }, + { + "epoch": 1.5320819112627988, + "grad_norm": 3.1861050128936768, + "learning_rate": 0.0004893060295790671, + "loss": 6.032, + "step": 4489 + }, + { + "epoch": 1.5324232081911262, + "grad_norm": 3.163435935974121, + "learning_rate": 0.0004891922639362913, + "loss": 6.3353, + "step": 4490 + }, + { + "epoch": 1.532764505119454, + "grad_norm": 2.9932167530059814, + "learning_rate": 0.0004890784982935154, + "loss": 6.5483, + "step": 4491 + }, + { + "epoch": 1.5331058020477815, + "grad_norm": 3.199826955795288, + "learning_rate": 0.0004889647326507395, + "loss": 6.1149, + "step": 4492 + }, + { + "epoch": 1.5334470989761093, + "grad_norm": 3.0402212142944336, + "learning_rate": 0.0004888509670079636, + "loss": 6.484, + "step": 4493 + }, + { + "epoch": 1.5337883959044367, + "grad_norm": 3.0578010082244873, + "learning_rate": 0.0004887372013651877, + "loss": 6.7166, + "step": 4494 + }, + { + "epoch": 1.5341296928327646, + "grad_norm": 3.0253403186798096, + "learning_rate": 0.0004886234357224118, + "loss": 6.8642, + "step": 4495 + }, + { + "epoch": 1.5344709897610922, + "grad_norm": 3.225531578063965, + "learning_rate": 0.000488509670079636, + "loss": 5.9513, + "step": 4496 + }, + { + "epoch": 1.5348122866894198, + "grad_norm": 3.0968496799468994, + "learning_rate": 0.0004883959044368601, + "loss": 6.327, + "step": 4497 + }, + { + "epoch": 1.5351535836177475, + "grad_norm": 3.0139734745025635, + "learning_rate": 0.0004882821387940841, + "loss": 6.8722, + "step": 4498 + }, + { + "epoch": 1.535494880546075, + "grad_norm": 3.061629056930542, + "learning_rate": 0.0004881683731513083, + "loss": 6.5616, + "step": 4499 + }, + { + "epoch": 1.5358361774744027, + "grad_norm": 3.062117338180542, + "learning_rate": 0.00048805460750853244, + "loss": 6.8498, + "step": 4500 + }, + { + "epoch": 1.5361774744027303, + "grad_norm": 3.0551111698150635, + "learning_rate": 0.00048794084186575654, + "loss": 6.2062, + "step": 4501 + }, + { + "epoch": 1.5365187713310582, + "grad_norm": 3.084839105606079, + "learning_rate": 0.00048782707622298065, + "loss": 6.1916, + "step": 4502 + }, + { + "epoch": 1.5368600682593856, + "grad_norm": 2.976888418197632, + "learning_rate": 0.0004877133105802048, + "loss": 6.6639, + "step": 4503 + }, + { + "epoch": 1.5372013651877134, + "grad_norm": 2.9720981121063232, + "learning_rate": 0.0004875995449374289, + "loss": 6.4217, + "step": 4504 + }, + { + "epoch": 1.5375426621160408, + "grad_norm": 3.941185712814331, + "learning_rate": 0.000487485779294653, + "loss": 5.9576, + "step": 4505 + }, + { + "epoch": 1.5378839590443687, + "grad_norm": 3.0946285724639893, + "learning_rate": 0.0004873720136518772, + "loss": 6.3165, + "step": 4506 + }, + { + "epoch": 1.538225255972696, + "grad_norm": 3.0190694332122803, + "learning_rate": 0.00048725824800910123, + "loss": 6.4502, + "step": 4507 + }, + { + "epoch": 1.538566552901024, + "grad_norm": 3.0796146392822266, + "learning_rate": 0.00048714448236632533, + "loss": 6.1499, + "step": 4508 + }, + { + "epoch": 1.5389078498293516, + "grad_norm": 4.050348281860352, + "learning_rate": 0.0004870307167235495, + "loss": 6.0962, + "step": 4509 + }, + { + "epoch": 1.5392491467576792, + "grad_norm": 3.1888248920440674, + "learning_rate": 0.0004869169510807736, + "loss": 6.0087, + "step": 4510 + }, + { + "epoch": 1.5395904436860068, + "grad_norm": 2.9527971744537354, + "learning_rate": 0.00048680318543799776, + "loss": 6.3688, + "step": 4511 + }, + { + "epoch": 1.5399317406143345, + "grad_norm": 3.0263445377349854, + "learning_rate": 0.00048668941979522186, + "loss": 5.7253, + "step": 4512 + }, + { + "epoch": 1.540273037542662, + "grad_norm": 3.0024619102478027, + "learning_rate": 0.00048657565415244597, + "loss": 6.0979, + "step": 4513 + }, + { + "epoch": 1.5406143344709897, + "grad_norm": 3.0450499057769775, + "learning_rate": 0.0004864618885096701, + "loss": 6.0442, + "step": 4514 + }, + { + "epoch": 1.5409556313993176, + "grad_norm": 3.137784242630005, + "learning_rate": 0.00048634812286689423, + "loss": 6.4424, + "step": 4515 + }, + { + "epoch": 1.541296928327645, + "grad_norm": 3.5526490211486816, + "learning_rate": 0.00048623435722411833, + "loss": 6.2877, + "step": 4516 + }, + { + "epoch": 1.5416382252559728, + "grad_norm": 3.030540704727173, + "learning_rate": 0.00048612059158134244, + "loss": 6.2586, + "step": 4517 + }, + { + "epoch": 1.5419795221843002, + "grad_norm": 3.2171597480773926, + "learning_rate": 0.00048600682593856654, + "loss": 6.0998, + "step": 4518 + }, + { + "epoch": 1.542320819112628, + "grad_norm": 2.999274730682373, + "learning_rate": 0.00048589306029579065, + "loss": 6.0791, + "step": 4519 + }, + { + "epoch": 1.5426621160409555, + "grad_norm": 3.330794334411621, + "learning_rate": 0.0004857792946530148, + "loss": 5.9229, + "step": 4520 + }, + { + "epoch": 1.5430034129692833, + "grad_norm": 3.1574463844299316, + "learning_rate": 0.0004856655290102389, + "loss": 5.7773, + "step": 4521 + }, + { + "epoch": 1.543344709897611, + "grad_norm": 3.089190721511841, + "learning_rate": 0.000485551763367463, + "loss": 6.2909, + "step": 4522 + }, + { + "epoch": 1.5436860068259386, + "grad_norm": 2.9726176261901855, + "learning_rate": 0.0004854379977246872, + "loss": 5.816, + "step": 4523 + }, + { + "epoch": 1.5440273037542662, + "grad_norm": 3.124121904373169, + "learning_rate": 0.0004853242320819113, + "loss": 6.2375, + "step": 4524 + }, + { + "epoch": 1.5443686006825939, + "grad_norm": 3.3395678997039795, + "learning_rate": 0.0004852104664391354, + "loss": 6.0664, + "step": 4525 + }, + { + "epoch": 1.5447098976109215, + "grad_norm": 3.0564520359039307, + "learning_rate": 0.00048509670079635955, + "loss": 6.4603, + "step": 4526 + }, + { + "epoch": 1.545051194539249, + "grad_norm": 3.5565192699432373, + "learning_rate": 0.0004849829351535836, + "loss": 6.0142, + "step": 4527 + }, + { + "epoch": 1.545392491467577, + "grad_norm": 3.0865042209625244, + "learning_rate": 0.0004848691695108077, + "loss": 6.3314, + "step": 4528 + }, + { + "epoch": 1.5457337883959044, + "grad_norm": 4.123645782470703, + "learning_rate": 0.00048475540386803186, + "loss": 5.3, + "step": 4529 + }, + { + "epoch": 1.5460750853242322, + "grad_norm": 3.4055612087249756, + "learning_rate": 0.00048464163822525597, + "loss": 5.816, + "step": 4530 + }, + { + "epoch": 1.5464163822525596, + "grad_norm": 3.080965042114258, + "learning_rate": 0.0004845278725824801, + "loss": 6.9478, + "step": 4531 + }, + { + "epoch": 1.5467576791808875, + "grad_norm": 3.1426844596862793, + "learning_rate": 0.00048441410693970423, + "loss": 5.8243, + "step": 4532 + }, + { + "epoch": 1.5470989761092149, + "grad_norm": 3.0503451824188232, + "learning_rate": 0.00048430034129692834, + "loss": 6.595, + "step": 4533 + }, + { + "epoch": 1.5474402730375427, + "grad_norm": 2.9815714359283447, + "learning_rate": 0.0004841865756541525, + "loss": 6.7113, + "step": 4534 + }, + { + "epoch": 1.5477815699658704, + "grad_norm": 2.955928325653076, + "learning_rate": 0.0004840728100113766, + "loss": 6.4205, + "step": 4535 + }, + { + "epoch": 1.548122866894198, + "grad_norm": 3.0034239292144775, + "learning_rate": 0.00048395904436860065, + "loss": 6.4585, + "step": 4536 + }, + { + "epoch": 1.5484641638225256, + "grad_norm": 3.0914156436920166, + "learning_rate": 0.0004838452787258248, + "loss": 6.3329, + "step": 4537 + }, + { + "epoch": 1.5488054607508532, + "grad_norm": 3.089557409286499, + "learning_rate": 0.0004837315130830489, + "loss": 6.2133, + "step": 4538 + }, + { + "epoch": 1.5491467576791809, + "grad_norm": 3.246049642562866, + "learning_rate": 0.000483617747440273, + "loss": 6.5331, + "step": 4539 + }, + { + "epoch": 1.5494880546075085, + "grad_norm": 3.0475661754608154, + "learning_rate": 0.0004835039817974972, + "loss": 6.365, + "step": 4540 + }, + { + "epoch": 1.5498293515358363, + "grad_norm": 3.186208486557007, + "learning_rate": 0.0004833902161547213, + "loss": 6.1348, + "step": 4541 + }, + { + "epoch": 1.5501706484641637, + "grad_norm": 3.0234334468841553, + "learning_rate": 0.0004832764505119454, + "loss": 6.4846, + "step": 4542 + }, + { + "epoch": 1.5505119453924916, + "grad_norm": 7.129222869873047, + "learning_rate": 0.00048316268486916955, + "loss": 5.4173, + "step": 4543 + }, + { + "epoch": 1.550853242320819, + "grad_norm": 3.194537401199341, + "learning_rate": 0.00048304891922639365, + "loss": 6.3047, + "step": 4544 + }, + { + "epoch": 1.5511945392491469, + "grad_norm": 3.1084678173065186, + "learning_rate": 0.00048293515358361776, + "loss": 6.288, + "step": 4545 + }, + { + "epoch": 1.5515358361774743, + "grad_norm": 3.1638569831848145, + "learning_rate": 0.00048282138794084186, + "loss": 6.6077, + "step": 4546 + }, + { + "epoch": 1.551877133105802, + "grad_norm": 3.195992946624756, + "learning_rate": 0.00048270762229806597, + "loss": 6.6632, + "step": 4547 + }, + { + "epoch": 1.5522184300341297, + "grad_norm": 3.146247148513794, + "learning_rate": 0.00048259385665529007, + "loss": 6.5658, + "step": 4548 + }, + { + "epoch": 1.5525597269624574, + "grad_norm": 3.0369887351989746, + "learning_rate": 0.00048248009101251423, + "loss": 6.1692, + "step": 4549 + }, + { + "epoch": 1.552901023890785, + "grad_norm": 3.0904510021209717, + "learning_rate": 0.00048236632536973834, + "loss": 6.1782, + "step": 4550 + }, + { + "epoch": 1.5532423208191126, + "grad_norm": 3.021652936935425, + "learning_rate": 0.0004822525597269625, + "loss": 6.3469, + "step": 4551 + }, + { + "epoch": 1.5535836177474402, + "grad_norm": 3.161728858947754, + "learning_rate": 0.0004821387940841866, + "loss": 6.258, + "step": 4552 + }, + { + "epoch": 1.5539249146757679, + "grad_norm": 3.3773646354675293, + "learning_rate": 0.0004820250284414107, + "loss": 5.8451, + "step": 4553 + }, + { + "epoch": 1.5542662116040957, + "grad_norm": 3.128512144088745, + "learning_rate": 0.00048191126279863486, + "loss": 6.5491, + "step": 4554 + }, + { + "epoch": 1.5546075085324231, + "grad_norm": 3.0670344829559326, + "learning_rate": 0.00048179749715585897, + "loss": 7.086, + "step": 4555 + }, + { + "epoch": 1.554948805460751, + "grad_norm": 3.074653387069702, + "learning_rate": 0.000481683731513083, + "loss": 6.4795, + "step": 4556 + }, + { + "epoch": 1.5552901023890784, + "grad_norm": 3.1203551292419434, + "learning_rate": 0.0004815699658703072, + "loss": 6.3373, + "step": 4557 + }, + { + "epoch": 1.5556313993174062, + "grad_norm": 3.1555371284484863, + "learning_rate": 0.0004814562002275313, + "loss": 6.8241, + "step": 4558 + }, + { + "epoch": 1.5559726962457336, + "grad_norm": 3.1487646102905273, + "learning_rate": 0.0004813424345847554, + "loss": 6.4579, + "step": 4559 + }, + { + "epoch": 1.5563139931740615, + "grad_norm": 3.830792188644409, + "learning_rate": 0.00048122866894197955, + "loss": 6.1349, + "step": 4560 + }, + { + "epoch": 1.5566552901023891, + "grad_norm": 3.0813772678375244, + "learning_rate": 0.00048111490329920365, + "loss": 6.9713, + "step": 4561 + }, + { + "epoch": 1.5569965870307167, + "grad_norm": 3.344524621963501, + "learning_rate": 0.00048100113765642776, + "loss": 5.4983, + "step": 4562 + }, + { + "epoch": 1.5573378839590444, + "grad_norm": 3.2821524143218994, + "learning_rate": 0.0004808873720136519, + "loss": 6.4667, + "step": 4563 + }, + { + "epoch": 1.557679180887372, + "grad_norm": 3.1274075508117676, + "learning_rate": 0.000480773606370876, + "loss": 6.7131, + "step": 4564 + }, + { + "epoch": 1.5580204778156996, + "grad_norm": 2.94832444190979, + "learning_rate": 0.00048065984072810013, + "loss": 6.3848, + "step": 4565 + }, + { + "epoch": 1.5583617747440273, + "grad_norm": 3.0074613094329834, + "learning_rate": 0.00048054607508532423, + "loss": 6.2491, + "step": 4566 + }, + { + "epoch": 1.5587030716723551, + "grad_norm": 3.0959746837615967, + "learning_rate": 0.00048043230944254834, + "loss": 5.8382, + "step": 4567 + }, + { + "epoch": 1.5590443686006825, + "grad_norm": 3.2001736164093018, + "learning_rate": 0.00048031854379977244, + "loss": 6.5361, + "step": 4568 + }, + { + "epoch": 1.5593856655290104, + "grad_norm": 3.8832218647003174, + "learning_rate": 0.0004802047781569966, + "loss": 5.3809, + "step": 4569 + }, + { + "epoch": 1.5597269624573378, + "grad_norm": 3.133195638656616, + "learning_rate": 0.0004800910125142207, + "loss": 6.6976, + "step": 4570 + }, + { + "epoch": 1.5600682593856656, + "grad_norm": 3.2037956714630127, + "learning_rate": 0.00047997724687144487, + "loss": 5.5344, + "step": 4571 + }, + { + "epoch": 1.560409556313993, + "grad_norm": 3.157449722290039, + "learning_rate": 0.00047986348122866897, + "loss": 6.2814, + "step": 4572 + }, + { + "epoch": 1.5607508532423209, + "grad_norm": 3.062404155731201, + "learning_rate": 0.0004797497155858931, + "loss": 6.316, + "step": 4573 + }, + { + "epoch": 1.5610921501706485, + "grad_norm": 3.134361982345581, + "learning_rate": 0.00047963594994311723, + "loss": 6.5262, + "step": 4574 + }, + { + "epoch": 1.5614334470989761, + "grad_norm": 3.2360646724700928, + "learning_rate": 0.0004795221843003413, + "loss": 6.2296, + "step": 4575 + }, + { + "epoch": 1.5617747440273038, + "grad_norm": 3.012258529663086, + "learning_rate": 0.0004794084186575654, + "loss": 6.7743, + "step": 4576 + }, + { + "epoch": 1.5621160409556314, + "grad_norm": 3.5233166217803955, + "learning_rate": 0.00047929465301478955, + "loss": 5.9291, + "step": 4577 + }, + { + "epoch": 1.562457337883959, + "grad_norm": 4.878261566162109, + "learning_rate": 0.00047918088737201365, + "loss": 5.7263, + "step": 4578 + }, + { + "epoch": 1.5627986348122866, + "grad_norm": 3.228557825088501, + "learning_rate": 0.00047906712172923776, + "loss": 6.0776, + "step": 4579 + }, + { + "epoch": 1.5631399317406145, + "grad_norm": 3.1920437812805176, + "learning_rate": 0.0004789533560864619, + "loss": 6.7448, + "step": 4580 + }, + { + "epoch": 1.563481228668942, + "grad_norm": 3.1803622245788574, + "learning_rate": 0.000478839590443686, + "loss": 6.0921, + "step": 4581 + }, + { + "epoch": 1.5638225255972698, + "grad_norm": 5.843010902404785, + "learning_rate": 0.00047872582480091013, + "loss": 5.7007, + "step": 4582 + }, + { + "epoch": 1.5641638225255972, + "grad_norm": 3.6518936157226562, + "learning_rate": 0.0004786120591581343, + "loss": 6.3155, + "step": 4583 + }, + { + "epoch": 1.564505119453925, + "grad_norm": 3.08497953414917, + "learning_rate": 0.0004784982935153584, + "loss": 6.524, + "step": 4584 + }, + { + "epoch": 1.5648464163822524, + "grad_norm": 2.1860828399658203, + "learning_rate": 0.00047838452787258244, + "loss": 3.6702, + "step": 4585 + }, + { + "epoch": 1.5651877133105803, + "grad_norm": 3.152881145477295, + "learning_rate": 0.0004782707622298066, + "loss": 5.9726, + "step": 4586 + }, + { + "epoch": 1.565529010238908, + "grad_norm": 3.119457721710205, + "learning_rate": 0.0004781569965870307, + "loss": 5.8977, + "step": 4587 + }, + { + "epoch": 1.5658703071672355, + "grad_norm": 3.2837250232696533, + "learning_rate": 0.0004780432309442548, + "loss": 6.7914, + "step": 4588 + }, + { + "epoch": 1.5662116040955631, + "grad_norm": 3.0338499546051025, + "learning_rate": 0.00047792946530147897, + "loss": 6.6875, + "step": 4589 + }, + { + "epoch": 1.5665529010238908, + "grad_norm": 3.1285126209259033, + "learning_rate": 0.0004778156996587031, + "loss": 6.115, + "step": 4590 + }, + { + "epoch": 1.5668941979522184, + "grad_norm": 3.540517568588257, + "learning_rate": 0.00047770193401592724, + "loss": 5.9633, + "step": 4591 + }, + { + "epoch": 1.567235494880546, + "grad_norm": 3.079838991165161, + "learning_rate": 0.00047758816837315134, + "loss": 5.9609, + "step": 4592 + }, + { + "epoch": 1.5675767918088739, + "grad_norm": 3.100437879562378, + "learning_rate": 0.00047747440273037545, + "loss": 5.9144, + "step": 4593 + }, + { + "epoch": 1.5679180887372013, + "grad_norm": 3.044776439666748, + "learning_rate": 0.0004773606370875996, + "loss": 6.1149, + "step": 4594 + }, + { + "epoch": 1.5682593856655291, + "grad_norm": 3.157855749130249, + "learning_rate": 0.00047724687144482366, + "loss": 6.7326, + "step": 4595 + }, + { + "epoch": 1.5686006825938565, + "grad_norm": 6.666663646697998, + "learning_rate": 0.00047713310580204776, + "loss": 5.6815, + "step": 4596 + }, + { + "epoch": 1.5689419795221844, + "grad_norm": 3.1114113330841064, + "learning_rate": 0.0004770193401592719, + "loss": 5.6961, + "step": 4597 + }, + { + "epoch": 1.5692832764505118, + "grad_norm": 3.6068615913391113, + "learning_rate": 0.000476905574516496, + "loss": 4.9314, + "step": 4598 + }, + { + "epoch": 1.5696245733788396, + "grad_norm": 3.157696008682251, + "learning_rate": 0.00047679180887372013, + "loss": 6.3051, + "step": 4599 + }, + { + "epoch": 1.5699658703071673, + "grad_norm": 3.262202262878418, + "learning_rate": 0.0004766780432309443, + "loss": 6.2528, + "step": 4600 + }, + { + "epoch": 1.570307167235495, + "grad_norm": 3.1908304691314697, + "learning_rate": 0.0004765642775881684, + "loss": 6.1413, + "step": 4601 + }, + { + "epoch": 1.5706484641638225, + "grad_norm": 3.041872978210449, + "learning_rate": 0.0004764505119453925, + "loss": 6.4687, + "step": 4602 + }, + { + "epoch": 1.5709897610921502, + "grad_norm": 3.907120943069458, + "learning_rate": 0.00047633674630261666, + "loss": 5.4988, + "step": 4603 + }, + { + "epoch": 1.5713310580204778, + "grad_norm": 3.080669403076172, + "learning_rate": 0.00047622298065984076, + "loss": 6.2327, + "step": 4604 + }, + { + "epoch": 1.5716723549488054, + "grad_norm": 3.0005602836608887, + "learning_rate": 0.0004761092150170648, + "loss": 6.2388, + "step": 4605 + }, + { + "epoch": 1.5720136518771333, + "grad_norm": 3.291757345199585, + "learning_rate": 0.00047599544937428897, + "loss": 6.102, + "step": 4606 + }, + { + "epoch": 1.5723549488054607, + "grad_norm": 3.188426971435547, + "learning_rate": 0.0004758816837315131, + "loss": 6.7411, + "step": 4607 + }, + { + "epoch": 1.5726962457337885, + "grad_norm": 3.0288374423980713, + "learning_rate": 0.0004757679180887372, + "loss": 5.8925, + "step": 4608 + }, + { + "epoch": 1.573037542662116, + "grad_norm": 3.1033778190612793, + "learning_rate": 0.00047565415244596134, + "loss": 6.0647, + "step": 4609 + }, + { + "epoch": 1.5733788395904438, + "grad_norm": 3.0476388931274414, + "learning_rate": 0.00047554038680318545, + "loss": 6.4688, + "step": 4610 + }, + { + "epoch": 1.5737201365187712, + "grad_norm": 4.029483795166016, + "learning_rate": 0.00047542662116040955, + "loss": 5.5514, + "step": 4611 + }, + { + "epoch": 1.574061433447099, + "grad_norm": 3.1188085079193115, + "learning_rate": 0.0004753128555176337, + "loss": 6.6279, + "step": 4612 + }, + { + "epoch": 1.5744027303754267, + "grad_norm": 3.5747570991516113, + "learning_rate": 0.0004751990898748578, + "loss": 6.019, + "step": 4613 + }, + { + "epoch": 1.5747440273037543, + "grad_norm": 3.1913697719573975, + "learning_rate": 0.0004750853242320819, + "loss": 5.9978, + "step": 4614 + }, + { + "epoch": 1.575085324232082, + "grad_norm": 3.071608543395996, + "learning_rate": 0.000474971558589306, + "loss": 6.2824, + "step": 4615 + }, + { + "epoch": 1.5754266211604095, + "grad_norm": 3.110900402069092, + "learning_rate": 0.00047485779294653013, + "loss": 6.441, + "step": 4616 + }, + { + "epoch": 1.5757679180887372, + "grad_norm": 3.037877321243286, + "learning_rate": 0.0004747440273037543, + "loss": 6.5409, + "step": 4617 + }, + { + "epoch": 1.5761092150170648, + "grad_norm": 3.036573886871338, + "learning_rate": 0.0004746302616609784, + "loss": 6.5253, + "step": 4618 + }, + { + "epoch": 1.5764505119453927, + "grad_norm": 3.087519645690918, + "learning_rate": 0.0004745164960182025, + "loss": 6.2424, + "step": 4619 + }, + { + "epoch": 1.57679180887372, + "grad_norm": 2.925489902496338, + "learning_rate": 0.00047440273037542666, + "loss": 6.7424, + "step": 4620 + }, + { + "epoch": 1.577133105802048, + "grad_norm": 2.996990442276001, + "learning_rate": 0.00047428896473265076, + "loss": 6.6232, + "step": 4621 + }, + { + "epoch": 1.5774744027303753, + "grad_norm": 3.041271924972534, + "learning_rate": 0.00047417519908987487, + "loss": 6.5346, + "step": 4622 + }, + { + "epoch": 1.5778156996587032, + "grad_norm": 3.0396969318389893, + "learning_rate": 0.00047406143344709903, + "loss": 6.3334, + "step": 4623 + }, + { + "epoch": 1.5781569965870306, + "grad_norm": 2.9244205951690674, + "learning_rate": 0.0004739476678043231, + "loss": 6.4611, + "step": 4624 + }, + { + "epoch": 1.5784982935153584, + "grad_norm": 3.119096517562866, + "learning_rate": 0.0004738339021615472, + "loss": 6.2721, + "step": 4625 + }, + { + "epoch": 1.578839590443686, + "grad_norm": 3.0966577529907227, + "learning_rate": 0.00047372013651877134, + "loss": 6.0143, + "step": 4626 + }, + { + "epoch": 1.5791808873720137, + "grad_norm": 2.9753429889678955, + "learning_rate": 0.00047360637087599545, + "loss": 6.1939, + "step": 4627 + }, + { + "epoch": 1.5795221843003413, + "grad_norm": 3.2640163898468018, + "learning_rate": 0.00047349260523321955, + "loss": 6.7567, + "step": 4628 + }, + { + "epoch": 1.579863481228669, + "grad_norm": 3.492380380630493, + "learning_rate": 0.0004733788395904437, + "loss": 5.9374, + "step": 4629 + }, + { + "epoch": 1.5802047781569966, + "grad_norm": 3.118227481842041, + "learning_rate": 0.0004732650739476678, + "loss": 6.7009, + "step": 4630 + }, + { + "epoch": 1.5805460750853242, + "grad_norm": 3.0515055656433105, + "learning_rate": 0.0004731513083048919, + "loss": 6.1687, + "step": 4631 + }, + { + "epoch": 1.580887372013652, + "grad_norm": 2.957733154296875, + "learning_rate": 0.0004730375426621161, + "loss": 6.4641, + "step": 4632 + }, + { + "epoch": 1.5812286689419794, + "grad_norm": 3.1707279682159424, + "learning_rate": 0.0004729237770193402, + "loss": 5.8587, + "step": 4633 + }, + { + "epoch": 1.5815699658703073, + "grad_norm": 3.1099092960357666, + "learning_rate": 0.0004728100113765643, + "loss": 6.0575, + "step": 4634 + }, + { + "epoch": 1.5819112627986347, + "grad_norm": 2.9342041015625, + "learning_rate": 0.0004726962457337884, + "loss": 5.9659, + "step": 4635 + }, + { + "epoch": 1.5822525597269625, + "grad_norm": 3.138521909713745, + "learning_rate": 0.0004725824800910125, + "loss": 5.3131, + "step": 4636 + }, + { + "epoch": 1.58259385665529, + "grad_norm": 3.0649149417877197, + "learning_rate": 0.00047246871444823666, + "loss": 6.2565, + "step": 4637 + }, + { + "epoch": 1.5829351535836178, + "grad_norm": 5.445117950439453, + "learning_rate": 0.00047235494880546076, + "loss": 5.9185, + "step": 4638 + }, + { + "epoch": 1.5832764505119454, + "grad_norm": 3.1243233680725098, + "learning_rate": 0.00047224118316268487, + "loss": 6.2452, + "step": 4639 + }, + { + "epoch": 1.583617747440273, + "grad_norm": 3.1046507358551025, + "learning_rate": 0.00047212741751990903, + "loss": 6.2636, + "step": 4640 + }, + { + "epoch": 1.5839590443686007, + "grad_norm": 3.108381509780884, + "learning_rate": 0.00047201365187713313, + "loss": 6.7736, + "step": 4641 + }, + { + "epoch": 1.5843003412969283, + "grad_norm": 3.1656174659729004, + "learning_rate": 0.00047189988623435724, + "loss": 5.8736, + "step": 4642 + }, + { + "epoch": 1.584641638225256, + "grad_norm": 2.9965546131134033, + "learning_rate": 0.00047178612059158134, + "loss": 5.8275, + "step": 4643 + }, + { + "epoch": 1.5849829351535836, + "grad_norm": 2.997084617614746, + "learning_rate": 0.00047167235494880545, + "loss": 6.993, + "step": 4644 + }, + { + "epoch": 1.5853242320819114, + "grad_norm": 2.2224056720733643, + "learning_rate": 0.00047155858930602955, + "loss": 2.9852, + "step": 4645 + }, + { + "epoch": 1.5856655290102388, + "grad_norm": 3.1612727642059326, + "learning_rate": 0.0004714448236632537, + "loss": 6.3212, + "step": 4646 + }, + { + "epoch": 1.5860068259385667, + "grad_norm": 3.2868475914001465, + "learning_rate": 0.0004713310580204778, + "loss": 6.2314, + "step": 4647 + }, + { + "epoch": 1.586348122866894, + "grad_norm": 3.0663857460021973, + "learning_rate": 0.0004712172923777019, + "loss": 6.4269, + "step": 4648 + }, + { + "epoch": 1.586689419795222, + "grad_norm": 3.672306537628174, + "learning_rate": 0.0004711035267349261, + "loss": 5.4267, + "step": 4649 + }, + { + "epoch": 1.5870307167235493, + "grad_norm": 3.1437530517578125, + "learning_rate": 0.0004709897610921502, + "loss": 6.4256, + "step": 4650 + }, + { + "epoch": 1.5873720136518772, + "grad_norm": 3.077855348587036, + "learning_rate": 0.0004708759954493743, + "loss": 6.1129, + "step": 4651 + }, + { + "epoch": 1.5877133105802048, + "grad_norm": 3.188419818878174, + "learning_rate": 0.00047076222980659845, + "loss": 6.0257, + "step": 4652 + }, + { + "epoch": 1.5880546075085324, + "grad_norm": 2.986910581588745, + "learning_rate": 0.0004706484641638225, + "loss": 6.3658, + "step": 4653 + }, + { + "epoch": 1.58839590443686, + "grad_norm": 3.301227569580078, + "learning_rate": 0.00047053469852104666, + "loss": 5.64, + "step": 4654 + }, + { + "epoch": 1.5887372013651877, + "grad_norm": 4.034798622131348, + "learning_rate": 0.00047042093287827077, + "loss": 5.0341, + "step": 4655 + }, + { + "epoch": 1.5890784982935153, + "grad_norm": 3.077500104904175, + "learning_rate": 0.00047030716723549487, + "loss": 5.9843, + "step": 4656 + }, + { + "epoch": 1.589419795221843, + "grad_norm": 2.99000883102417, + "learning_rate": 0.00047019340159271903, + "loss": 6.4999, + "step": 4657 + }, + { + "epoch": 1.5897610921501708, + "grad_norm": 3.0318803787231445, + "learning_rate": 0.00047007963594994313, + "loss": 6.5713, + "step": 4658 + }, + { + "epoch": 1.5901023890784982, + "grad_norm": 3.398522138595581, + "learning_rate": 0.00046996587030716724, + "loss": 5.365, + "step": 4659 + }, + { + "epoch": 1.590443686006826, + "grad_norm": 2.9953789710998535, + "learning_rate": 0.0004698521046643914, + "loss": 6.1898, + "step": 4660 + }, + { + "epoch": 1.5907849829351535, + "grad_norm": 3.6404740810394287, + "learning_rate": 0.0004697383390216155, + "loss": 6.2839, + "step": 4661 + }, + { + "epoch": 1.5911262798634813, + "grad_norm": 3.0940699577331543, + "learning_rate": 0.0004696245733788396, + "loss": 6.2371, + "step": 4662 + }, + { + "epoch": 1.5914675767918087, + "grad_norm": 2.9704160690307617, + "learning_rate": 0.0004695108077360637, + "loss": 6.0635, + "step": 4663 + }, + { + "epoch": 1.5918088737201366, + "grad_norm": 3.2446553707122803, + "learning_rate": 0.0004693970420932878, + "loss": 6.2583, + "step": 4664 + }, + { + "epoch": 1.5921501706484642, + "grad_norm": 2.948814868927002, + "learning_rate": 0.0004692832764505119, + "loss": 6.6138, + "step": 4665 + }, + { + "epoch": 1.5924914675767918, + "grad_norm": 2.9930472373962402, + "learning_rate": 0.0004691695108077361, + "loss": 6.747, + "step": 4666 + }, + { + "epoch": 1.5928327645051195, + "grad_norm": 3.368851661682129, + "learning_rate": 0.0004690557451649602, + "loss": 5.8773, + "step": 4667 + }, + { + "epoch": 1.593174061433447, + "grad_norm": 2.977428674697876, + "learning_rate": 0.0004689419795221843, + "loss": 6.556, + "step": 4668 + }, + { + "epoch": 1.5935153583617747, + "grad_norm": 3.041341543197632, + "learning_rate": 0.00046882821387940845, + "loss": 6.6892, + "step": 4669 + }, + { + "epoch": 1.5938566552901023, + "grad_norm": 2.9362432956695557, + "learning_rate": 0.00046871444823663256, + "loss": 6.8953, + "step": 4670 + }, + { + "epoch": 1.5941979522184302, + "grad_norm": 3.3086955547332764, + "learning_rate": 0.00046860068259385666, + "loss": 6.2347, + "step": 4671 + }, + { + "epoch": 1.5945392491467576, + "grad_norm": 3.477236270904541, + "learning_rate": 0.0004684869169510808, + "loss": 2.9186, + "step": 4672 + }, + { + "epoch": 1.5948805460750854, + "grad_norm": 3.360569477081299, + "learning_rate": 0.00046837315130830487, + "loss": 5.8384, + "step": 4673 + }, + { + "epoch": 1.5952218430034129, + "grad_norm": 3.140916585922241, + "learning_rate": 0.000468259385665529, + "loss": 6.0714, + "step": 4674 + }, + { + "epoch": 1.5955631399317407, + "grad_norm": 3.2916316986083984, + "learning_rate": 0.00046814562002275314, + "loss": 5.5684, + "step": 4675 + }, + { + "epoch": 1.595904436860068, + "grad_norm": 3.8546245098114014, + "learning_rate": 0.00046803185437997724, + "loss": 6.0605, + "step": 4676 + }, + { + "epoch": 1.596245733788396, + "grad_norm": 3.0560851097106934, + "learning_rate": 0.0004679180887372014, + "loss": 6.554, + "step": 4677 + }, + { + "epoch": 1.5965870307167236, + "grad_norm": 2.926163911819458, + "learning_rate": 0.0004678043230944255, + "loss": 6.2362, + "step": 4678 + }, + { + "epoch": 1.5969283276450512, + "grad_norm": 3.3139231204986572, + "learning_rate": 0.0004676905574516496, + "loss": 6.3501, + "step": 4679 + }, + { + "epoch": 1.5972696245733788, + "grad_norm": 3.2012598514556885, + "learning_rate": 0.00046757679180887377, + "loss": 5.9128, + "step": 4680 + }, + { + "epoch": 1.5976109215017065, + "grad_norm": 3.0795986652374268, + "learning_rate": 0.0004674630261660979, + "loss": 5.7596, + "step": 4681 + }, + { + "epoch": 1.597952218430034, + "grad_norm": 3.0973923206329346, + "learning_rate": 0.0004673492605233219, + "loss": 6.176, + "step": 4682 + }, + { + "epoch": 1.5982935153583617, + "grad_norm": 2.9738688468933105, + "learning_rate": 0.0004672354948805461, + "loss": 6.2366, + "step": 4683 + }, + { + "epoch": 1.5986348122866896, + "grad_norm": 3.048534870147705, + "learning_rate": 0.0004671217292377702, + "loss": 6.2017, + "step": 4684 + }, + { + "epoch": 1.598976109215017, + "grad_norm": 3.043328285217285, + "learning_rate": 0.0004670079635949943, + "loss": 6.1733, + "step": 4685 + }, + { + "epoch": 1.5993174061433448, + "grad_norm": 3.086865186691284, + "learning_rate": 0.00046689419795221845, + "loss": 6.1099, + "step": 4686 + }, + { + "epoch": 1.5996587030716722, + "grad_norm": 3.1289331912994385, + "learning_rate": 0.00046678043230944256, + "loss": 5.693, + "step": 4687 + }, + { + "epoch": 1.6, + "grad_norm": 3.2024731636047363, + "learning_rate": 0.00046666666666666666, + "loss": 6.616, + "step": 4688 + }, + { + "epoch": 1.6003412969283275, + "grad_norm": 3.0767719745635986, + "learning_rate": 0.0004665529010238908, + "loss": 5.9802, + "step": 4689 + }, + { + "epoch": 1.6006825938566553, + "grad_norm": 3.016394853591919, + "learning_rate": 0.0004664391353811149, + "loss": 6.0161, + "step": 4690 + }, + { + "epoch": 1.601023890784983, + "grad_norm": 2.9882636070251465, + "learning_rate": 0.00046632536973833903, + "loss": 6.7943, + "step": 4691 + }, + { + "epoch": 1.6013651877133106, + "grad_norm": 3.6328351497650146, + "learning_rate": 0.00046621160409556314, + "loss": 5.8045, + "step": 4692 + }, + { + "epoch": 1.6017064846416382, + "grad_norm": 3.0648207664489746, + "learning_rate": 0.00046609783845278724, + "loss": 6.569, + "step": 4693 + }, + { + "epoch": 1.6020477815699659, + "grad_norm": 3.3391435146331787, + "learning_rate": 0.00046598407281001135, + "loss": 6.1539, + "step": 4694 + }, + { + "epoch": 1.6023890784982935, + "grad_norm": 2.938784122467041, + "learning_rate": 0.0004658703071672355, + "loss": 6.4672, + "step": 4695 + }, + { + "epoch": 1.6027303754266211, + "grad_norm": 3.095799684524536, + "learning_rate": 0.0004657565415244596, + "loss": 6.4475, + "step": 4696 + }, + { + "epoch": 1.603071672354949, + "grad_norm": 3.8702211380004883, + "learning_rate": 0.00046564277588168377, + "loss": 5.1767, + "step": 4697 + }, + { + "epoch": 1.6034129692832764, + "grad_norm": 3.0789310932159424, + "learning_rate": 0.0004655290102389079, + "loss": 5.2598, + "step": 4698 + }, + { + "epoch": 1.6037542662116042, + "grad_norm": 3.129061698913574, + "learning_rate": 0.000465415244596132, + "loss": 6.486, + "step": 4699 + }, + { + "epoch": 1.6040955631399316, + "grad_norm": 3.2048168182373047, + "learning_rate": 0.00046530147895335614, + "loss": 6.495, + "step": 4700 + }, + { + "epoch": 1.6044368600682595, + "grad_norm": 3.1725878715515137, + "learning_rate": 0.00046518771331058024, + "loss": 6.2983, + "step": 4701 + }, + { + "epoch": 1.6047781569965869, + "grad_norm": 3.1969106197357178, + "learning_rate": 0.0004650739476678043, + "loss": 5.2289, + "step": 4702 + }, + { + "epoch": 1.6051194539249147, + "grad_norm": 3.340092658996582, + "learning_rate": 0.00046496018202502845, + "loss": 6.19, + "step": 4703 + }, + { + "epoch": 1.6054607508532424, + "grad_norm": 3.10861873626709, + "learning_rate": 0.00046484641638225256, + "loss": 6.4409, + "step": 4704 + }, + { + "epoch": 1.60580204778157, + "grad_norm": 2.947946310043335, + "learning_rate": 0.00046473265073947666, + "loss": 5.9036, + "step": 4705 + }, + { + "epoch": 1.6061433447098976, + "grad_norm": 3.063727855682373, + "learning_rate": 0.0004646188850967008, + "loss": 6.3689, + "step": 4706 + }, + { + "epoch": 1.6064846416382252, + "grad_norm": 3.1022796630859375, + "learning_rate": 0.0004645051194539249, + "loss": 6.3524, + "step": 4707 + }, + { + "epoch": 1.6068259385665529, + "grad_norm": 3.0189907550811768, + "learning_rate": 0.00046439135381114903, + "loss": 6.7521, + "step": 4708 + }, + { + "epoch": 1.6071672354948805, + "grad_norm": 3.067929744720459, + "learning_rate": 0.0004642775881683732, + "loss": 6.0322, + "step": 4709 + }, + { + "epoch": 1.6075085324232083, + "grad_norm": 3.403313159942627, + "learning_rate": 0.0004641638225255973, + "loss": 5.9158, + "step": 4710 + }, + { + "epoch": 1.6078498293515358, + "grad_norm": 3.100106954574585, + "learning_rate": 0.0004640500568828214, + "loss": 6.0732, + "step": 4711 + }, + { + "epoch": 1.6081911262798636, + "grad_norm": 3.0627291202545166, + "learning_rate": 0.0004639362912400455, + "loss": 5.8361, + "step": 4712 + }, + { + "epoch": 1.608532423208191, + "grad_norm": 3.6144957542419434, + "learning_rate": 0.0004638225255972696, + "loss": 6.1397, + "step": 4713 + }, + { + "epoch": 1.6088737201365189, + "grad_norm": 3.0293452739715576, + "learning_rate": 0.0004637087599544937, + "loss": 6.5087, + "step": 4714 + }, + { + "epoch": 1.6092150170648463, + "grad_norm": 3.0504560470581055, + "learning_rate": 0.0004635949943117179, + "loss": 6.5581, + "step": 4715 + }, + { + "epoch": 1.6095563139931741, + "grad_norm": 3.043027877807617, + "learning_rate": 0.000463481228668942, + "loss": 6.8183, + "step": 4716 + }, + { + "epoch": 1.6098976109215017, + "grad_norm": 3.135340452194214, + "learning_rate": 0.00046336746302616614, + "loss": 6.3905, + "step": 4717 + }, + { + "epoch": 1.6102389078498294, + "grad_norm": 2.9933533668518066, + "learning_rate": 0.00046325369738339024, + "loss": 6.2646, + "step": 4718 + }, + { + "epoch": 1.610580204778157, + "grad_norm": 3.0164849758148193, + "learning_rate": 0.00046313993174061435, + "loss": 6.5204, + "step": 4719 + }, + { + "epoch": 1.6109215017064846, + "grad_norm": 2.9858627319335938, + "learning_rate": 0.0004630261660978385, + "loss": 6.2363, + "step": 4720 + }, + { + "epoch": 1.6112627986348123, + "grad_norm": 2.998555898666382, + "learning_rate": 0.00046291240045506256, + "loss": 6.4151, + "step": 4721 + }, + { + "epoch": 1.6116040955631399, + "grad_norm": 3.497677803039551, + "learning_rate": 0.00046279863481228666, + "loss": 5.135, + "step": 4722 + }, + { + "epoch": 1.6119453924914677, + "grad_norm": 3.07749080657959, + "learning_rate": 0.0004626848691695108, + "loss": 6.402, + "step": 4723 + }, + { + "epoch": 1.6122866894197951, + "grad_norm": 3.127281665802002, + "learning_rate": 0.00046257110352673493, + "loss": 6.8232, + "step": 4724 + }, + { + "epoch": 1.612627986348123, + "grad_norm": 3.1097822189331055, + "learning_rate": 0.00046245733788395903, + "loss": 6.0242, + "step": 4725 + }, + { + "epoch": 1.6129692832764504, + "grad_norm": 3.2166764736175537, + "learning_rate": 0.0004623435722411832, + "loss": 5.9335, + "step": 4726 + }, + { + "epoch": 1.6133105802047782, + "grad_norm": 3.05297589302063, + "learning_rate": 0.0004622298065984073, + "loss": 6.4292, + "step": 4727 + }, + { + "epoch": 1.6136518771331056, + "grad_norm": 2.0636985301971436, + "learning_rate": 0.0004621160409556314, + "loss": 3.1146, + "step": 4728 + }, + { + "epoch": 1.6139931740614335, + "grad_norm": 3.0197324752807617, + "learning_rate": 0.00046200227531285556, + "loss": 6.0745, + "step": 4729 + }, + { + "epoch": 1.6143344709897611, + "grad_norm": 3.0842418670654297, + "learning_rate": 0.00046188850967007967, + "loss": 6.6684, + "step": 4730 + }, + { + "epoch": 1.6146757679180888, + "grad_norm": 3.0386857986450195, + "learning_rate": 0.0004617747440273037, + "loss": 6.3415, + "step": 4731 + }, + { + "epoch": 1.6150170648464164, + "grad_norm": 3.1578781604766846, + "learning_rate": 0.0004616609783845279, + "loss": 5.7987, + "step": 4732 + }, + { + "epoch": 1.615358361774744, + "grad_norm": 3.1602065563201904, + "learning_rate": 0.000461547212741752, + "loss": 6.5547, + "step": 4733 + }, + { + "epoch": 1.6156996587030716, + "grad_norm": 3.2017452716827393, + "learning_rate": 0.0004614334470989761, + "loss": 6.0994, + "step": 4734 + }, + { + "epoch": 1.6160409556313993, + "grad_norm": 3.114725112915039, + "learning_rate": 0.00046131968145620024, + "loss": 6.5211, + "step": 4735 + }, + { + "epoch": 1.6163822525597271, + "grad_norm": 3.1060032844543457, + "learning_rate": 0.00046120591581342435, + "loss": 6.3797, + "step": 4736 + }, + { + "epoch": 1.6167235494880545, + "grad_norm": 3.1103594303131104, + "learning_rate": 0.0004610921501706485, + "loss": 6.0067, + "step": 4737 + }, + { + "epoch": 1.6170648464163824, + "grad_norm": 3.124356985092163, + "learning_rate": 0.0004609783845278726, + "loss": 6.7605, + "step": 4738 + }, + { + "epoch": 1.6174061433447098, + "grad_norm": 3.092719793319702, + "learning_rate": 0.0004608646188850967, + "loss": 5.9352, + "step": 4739 + }, + { + "epoch": 1.6177474402730376, + "grad_norm": 2.993748903274536, + "learning_rate": 0.0004607508532423209, + "loss": 6.4874, + "step": 4740 + }, + { + "epoch": 1.618088737201365, + "grad_norm": 3.0820672512054443, + "learning_rate": 0.00046063708759954493, + "loss": 5.4731, + "step": 4741 + }, + { + "epoch": 1.6184300341296929, + "grad_norm": 3.193246841430664, + "learning_rate": 0.00046052332195676903, + "loss": 6.4882, + "step": 4742 + }, + { + "epoch": 1.6187713310580205, + "grad_norm": 3.0461337566375732, + "learning_rate": 0.0004604095563139932, + "loss": 6.7046, + "step": 4743 + }, + { + "epoch": 1.6191126279863481, + "grad_norm": 3.1627285480499268, + "learning_rate": 0.0004602957906712173, + "loss": 5.9241, + "step": 4744 + }, + { + "epoch": 1.6194539249146758, + "grad_norm": 3.004056215286255, + "learning_rate": 0.0004601820250284414, + "loss": 6.182, + "step": 4745 + }, + { + "epoch": 1.6197952218430034, + "grad_norm": 3.3949027061462402, + "learning_rate": 0.00046006825938566556, + "loss": 6.1612, + "step": 4746 + }, + { + "epoch": 1.620136518771331, + "grad_norm": 3.4996936321258545, + "learning_rate": 0.00045995449374288967, + "loss": 5.6995, + "step": 4747 + }, + { + "epoch": 1.6204778156996587, + "grad_norm": 3.5690553188323975, + "learning_rate": 0.00045984072810011377, + "loss": 6.0195, + "step": 4748 + }, + { + "epoch": 1.6208191126279865, + "grad_norm": 3.138127326965332, + "learning_rate": 0.00045972696245733793, + "loss": 6.9745, + "step": 4749 + }, + { + "epoch": 1.621160409556314, + "grad_norm": 3.223238468170166, + "learning_rate": 0.000459613196814562, + "loss": 6.5723, + "step": 4750 + }, + { + "epoch": 1.6215017064846418, + "grad_norm": 3.028900146484375, + "learning_rate": 0.0004594994311717861, + "loss": 6.8207, + "step": 4751 + }, + { + "epoch": 1.6218430034129692, + "grad_norm": 3.1101443767547607, + "learning_rate": 0.00045938566552901025, + "loss": 6.2446, + "step": 4752 + }, + { + "epoch": 1.622184300341297, + "grad_norm": 3.129108190536499, + "learning_rate": 0.00045927189988623435, + "loss": 6.6041, + "step": 4753 + }, + { + "epoch": 1.6225255972696244, + "grad_norm": 2.9134464263916016, + "learning_rate": 0.00045915813424345846, + "loss": 6.674, + "step": 4754 + }, + { + "epoch": 1.6228668941979523, + "grad_norm": 2.890514373779297, + "learning_rate": 0.0004590443686006826, + "loss": 6.4995, + "step": 4755 + }, + { + "epoch": 1.62320819112628, + "grad_norm": 3.06087589263916, + "learning_rate": 0.0004589306029579067, + "loss": 5.8313, + "step": 4756 + }, + { + "epoch": 1.6235494880546075, + "grad_norm": 2.98832631111145, + "learning_rate": 0.0004588168373151309, + "loss": 6.4396, + "step": 4757 + }, + { + "epoch": 1.6238907849829352, + "grad_norm": 2.935410976409912, + "learning_rate": 0.000458703071672355, + "loss": 6.4128, + "step": 4758 + }, + { + "epoch": 1.6242320819112628, + "grad_norm": 3.138000965118408, + "learning_rate": 0.0004585893060295791, + "loss": 6.956, + "step": 4759 + }, + { + "epoch": 1.6245733788395904, + "grad_norm": 2.011180877685547, + "learning_rate": 0.0004584755403868032, + "loss": 3.2119, + "step": 4760 + }, + { + "epoch": 1.624914675767918, + "grad_norm": 3.1553590297698975, + "learning_rate": 0.0004583617747440273, + "loss": 6.419, + "step": 4761 + }, + { + "epoch": 1.6252559726962459, + "grad_norm": 3.162964344024658, + "learning_rate": 0.0004582480091012514, + "loss": 6.3694, + "step": 4762 + }, + { + "epoch": 1.6255972696245733, + "grad_norm": 3.1152918338775635, + "learning_rate": 0.00045813424345847556, + "loss": 6.0361, + "step": 4763 + }, + { + "epoch": 1.6259385665529011, + "grad_norm": 3.273135185241699, + "learning_rate": 0.00045802047781569967, + "loss": 6.0278, + "step": 4764 + }, + { + "epoch": 1.6262798634812285, + "grad_norm": 4.751234531402588, + "learning_rate": 0.00045790671217292377, + "loss": 5.5492, + "step": 4765 + }, + { + "epoch": 1.6266211604095564, + "grad_norm": 3.12723445892334, + "learning_rate": 0.00045779294653014793, + "loss": 6.8186, + "step": 4766 + }, + { + "epoch": 1.6269624573378838, + "grad_norm": 3.2533812522888184, + "learning_rate": 0.00045767918088737204, + "loss": 6.105, + "step": 4767 + }, + { + "epoch": 1.6273037542662117, + "grad_norm": 3.344517946243286, + "learning_rate": 0.00045756541524459614, + "loss": 5.5507, + "step": 4768 + }, + { + "epoch": 1.6276450511945393, + "grad_norm": 3.1536638736724854, + "learning_rate": 0.0004574516496018203, + "loss": 5.9071, + "step": 4769 + }, + { + "epoch": 1.627986348122867, + "grad_norm": 3.0342910289764404, + "learning_rate": 0.00045733788395904435, + "loss": 6.3558, + "step": 4770 + }, + { + "epoch": 1.6283276450511945, + "grad_norm": 2.96907377243042, + "learning_rate": 0.00045722411831626846, + "loss": 6.4653, + "step": 4771 + }, + { + "epoch": 1.6286689419795222, + "grad_norm": 3.0146501064300537, + "learning_rate": 0.0004571103526734926, + "loss": 6.3785, + "step": 4772 + }, + { + "epoch": 1.6290102389078498, + "grad_norm": 3.0192205905914307, + "learning_rate": 0.0004569965870307167, + "loss": 6.1357, + "step": 4773 + }, + { + "epoch": 1.6293515358361774, + "grad_norm": 2.924898862838745, + "learning_rate": 0.0004568828213879408, + "loss": 6.2558, + "step": 4774 + }, + { + "epoch": 1.6296928327645053, + "grad_norm": 2.9523444175720215, + "learning_rate": 0.000456769055745165, + "loss": 6.6745, + "step": 4775 + }, + { + "epoch": 1.6300341296928327, + "grad_norm": 3.029269218444824, + "learning_rate": 0.0004566552901023891, + "loss": 6.6144, + "step": 4776 + }, + { + "epoch": 1.6303754266211605, + "grad_norm": 3.0683581829071045, + "learning_rate": 0.00045654152445961325, + "loss": 6.2914, + "step": 4777 + }, + { + "epoch": 1.630716723549488, + "grad_norm": 3.100022554397583, + "learning_rate": 0.00045642775881683735, + "loss": 6.7963, + "step": 4778 + }, + { + "epoch": 1.6310580204778158, + "grad_norm": 2.997330904006958, + "learning_rate": 0.00045631399317406146, + "loss": 6.5692, + "step": 4779 + }, + { + "epoch": 1.6313993174061432, + "grad_norm": 3.10273814201355, + "learning_rate": 0.00045620022753128556, + "loss": 6.137, + "step": 4780 + }, + { + "epoch": 1.631740614334471, + "grad_norm": 2.9892499446868896, + "learning_rate": 0.00045608646188850967, + "loss": 6.4949, + "step": 4781 + }, + { + "epoch": 1.6320819112627987, + "grad_norm": 3.031235933303833, + "learning_rate": 0.0004559726962457338, + "loss": 6.3293, + "step": 4782 + }, + { + "epoch": 1.6324232081911263, + "grad_norm": 3.0643863677978516, + "learning_rate": 0.00045585893060295793, + "loss": 6.279, + "step": 4783 + }, + { + "epoch": 1.632764505119454, + "grad_norm": 3.4579365253448486, + "learning_rate": 0.00045574516496018204, + "loss": 6.1474, + "step": 4784 + }, + { + "epoch": 1.6331058020477816, + "grad_norm": 3.1474268436431885, + "learning_rate": 0.00045563139931740614, + "loss": 6.3421, + "step": 4785 + }, + { + "epoch": 1.6334470989761092, + "grad_norm": 3.405183792114258, + "learning_rate": 0.0004555176336746303, + "loss": 6.1607, + "step": 4786 + }, + { + "epoch": 1.6337883959044368, + "grad_norm": 3.0848171710968018, + "learning_rate": 0.0004554038680318544, + "loss": 6.4013, + "step": 4787 + }, + { + "epoch": 1.6341296928327647, + "grad_norm": 3.092343807220459, + "learning_rate": 0.0004552901023890785, + "loss": 6.4724, + "step": 4788 + }, + { + "epoch": 1.634470989761092, + "grad_norm": 3.5067667961120605, + "learning_rate": 0.0004551763367463026, + "loss": 5.3636, + "step": 4789 + }, + { + "epoch": 1.63481228668942, + "grad_norm": 3.0137641429901123, + "learning_rate": 0.0004550625711035267, + "loss": 6.6243, + "step": 4790 + }, + { + "epoch": 1.6351535836177473, + "grad_norm": 3.554887533187866, + "learning_rate": 0.0004549488054607508, + "loss": 5.7127, + "step": 4791 + }, + { + "epoch": 1.6354948805460752, + "grad_norm": 3.113088607788086, + "learning_rate": 0.000454835039817975, + "loss": 6.3377, + "step": 4792 + }, + { + "epoch": 1.6358361774744026, + "grad_norm": 3.3043792247772217, + "learning_rate": 0.0004547212741751991, + "loss": 6.3576, + "step": 4793 + }, + { + "epoch": 1.6361774744027304, + "grad_norm": 3.1431515216827393, + "learning_rate": 0.0004546075085324232, + "loss": 6.6704, + "step": 4794 + }, + { + "epoch": 1.636518771331058, + "grad_norm": 2.899198293685913, + "learning_rate": 0.00045449374288964735, + "loss": 6.5668, + "step": 4795 + }, + { + "epoch": 1.6368600682593857, + "grad_norm": 2.860257625579834, + "learning_rate": 0.00045437997724687146, + "loss": 6.3462, + "step": 4796 + }, + { + "epoch": 1.6372013651877133, + "grad_norm": 2.903857946395874, + "learning_rate": 0.0004542662116040956, + "loss": 6.5885, + "step": 4797 + }, + { + "epoch": 1.637542662116041, + "grad_norm": 3.1243457794189453, + "learning_rate": 0.0004541524459613197, + "loss": 6.5971, + "step": 4798 + }, + { + "epoch": 1.6378839590443686, + "grad_norm": 2.9572298526763916, + "learning_rate": 0.0004540386803185438, + "loss": 6.5093, + "step": 4799 + }, + { + "epoch": 1.6382252559726962, + "grad_norm": 2.8882224559783936, + "learning_rate": 0.00045392491467576793, + "loss": 6.5098, + "step": 4800 + }, + { + "epoch": 1.638566552901024, + "grad_norm": 4.230573654174805, + "learning_rate": 0.00045381114903299204, + "loss": 5.5162, + "step": 4801 + }, + { + "epoch": 1.6389078498293514, + "grad_norm": 3.288973093032837, + "learning_rate": 0.00045369738339021614, + "loss": 6.3791, + "step": 4802 + }, + { + "epoch": 1.6392491467576793, + "grad_norm": 3.1963343620300293, + "learning_rate": 0.0004535836177474403, + "loss": 6.6689, + "step": 4803 + }, + { + "epoch": 1.6395904436860067, + "grad_norm": 3.1104302406311035, + "learning_rate": 0.0004534698521046644, + "loss": 6.6137, + "step": 4804 + }, + { + "epoch": 1.6399317406143346, + "grad_norm": 3.651779890060425, + "learning_rate": 0.0004533560864618885, + "loss": 5.9103, + "step": 4805 + }, + { + "epoch": 1.640273037542662, + "grad_norm": 5.380152702331543, + "learning_rate": 0.00045324232081911267, + "loss": 5.7795, + "step": 4806 + }, + { + "epoch": 1.6406143344709898, + "grad_norm": 3.1054184436798096, + "learning_rate": 0.0004531285551763368, + "loss": 6.1345, + "step": 4807 + }, + { + "epoch": 1.6409556313993174, + "grad_norm": 3.150416851043701, + "learning_rate": 0.0004530147895335609, + "loss": 6.2737, + "step": 4808 + }, + { + "epoch": 1.641296928327645, + "grad_norm": 6.0928053855896, + "learning_rate": 0.000452901023890785, + "loss": 5.2429, + "step": 4809 + }, + { + "epoch": 1.6416382252559727, + "grad_norm": 3.073267698287964, + "learning_rate": 0.0004527872582480091, + "loss": 6.2179, + "step": 4810 + }, + { + "epoch": 1.6419795221843003, + "grad_norm": 3.0769455432891846, + "learning_rate": 0.0004526734926052332, + "loss": 6.1769, + "step": 4811 + }, + { + "epoch": 1.642320819112628, + "grad_norm": 4.3502888679504395, + "learning_rate": 0.00045255972696245736, + "loss": 5.1454, + "step": 4812 + }, + { + "epoch": 1.6426621160409556, + "grad_norm": 3.0830323696136475, + "learning_rate": 0.00045244596131968146, + "loss": 6.0589, + "step": 4813 + }, + { + "epoch": 1.6430034129692834, + "grad_norm": 3.1683781147003174, + "learning_rate": 0.00045233219567690557, + "loss": 6.5236, + "step": 4814 + }, + { + "epoch": 1.6433447098976108, + "grad_norm": 3.1316497325897217, + "learning_rate": 0.0004522184300341297, + "loss": 6.4113, + "step": 4815 + }, + { + "epoch": 1.6436860068259387, + "grad_norm": 3.0804522037506104, + "learning_rate": 0.00045210466439135383, + "loss": 5.9039, + "step": 4816 + }, + { + "epoch": 1.644027303754266, + "grad_norm": 3.0023155212402344, + "learning_rate": 0.00045199089874857793, + "loss": 6.1391, + "step": 4817 + }, + { + "epoch": 1.644368600682594, + "grad_norm": 2.9633994102478027, + "learning_rate": 0.00045187713310580204, + "loss": 6.148, + "step": 4818 + }, + { + "epoch": 1.6447098976109213, + "grad_norm": 2.8795106410980225, + "learning_rate": 0.00045176336746302614, + "loss": 6.4872, + "step": 4819 + }, + { + "epoch": 1.6450511945392492, + "grad_norm": 6.828871250152588, + "learning_rate": 0.0004516496018202503, + "loss": 5.7615, + "step": 4820 + }, + { + "epoch": 1.6453924914675768, + "grad_norm": 3.061558485031128, + "learning_rate": 0.0004515358361774744, + "loss": 6.7772, + "step": 4821 + }, + { + "epoch": 1.6457337883959045, + "grad_norm": 3.115145444869995, + "learning_rate": 0.0004514220705346985, + "loss": 6.3194, + "step": 4822 + }, + { + "epoch": 1.646075085324232, + "grad_norm": 2.98173451423645, + "learning_rate": 0.00045130830489192267, + "loss": 6.3607, + "step": 4823 + }, + { + "epoch": 1.6464163822525597, + "grad_norm": 3.000434637069702, + "learning_rate": 0.0004511945392491468, + "loss": 6.7843, + "step": 4824 + }, + { + "epoch": 1.6467576791808873, + "grad_norm": 2.882544755935669, + "learning_rate": 0.0004510807736063709, + "loss": 6.2495, + "step": 4825 + }, + { + "epoch": 1.647098976109215, + "grad_norm": 3.010756492614746, + "learning_rate": 0.00045096700796359504, + "loss": 6.589, + "step": 4826 + }, + { + "epoch": 1.6474402730375428, + "grad_norm": 2.991612434387207, + "learning_rate": 0.00045085324232081915, + "loss": 5.9436, + "step": 4827 + }, + { + "epoch": 1.6477815699658702, + "grad_norm": 3.1836135387420654, + "learning_rate": 0.0004507394766780432, + "loss": 6.5523, + "step": 4828 + }, + { + "epoch": 1.648122866894198, + "grad_norm": 3.123823404312134, + "learning_rate": 0.00045062571103526736, + "loss": 6.2423, + "step": 4829 + }, + { + "epoch": 1.6484641638225255, + "grad_norm": 3.108682632446289, + "learning_rate": 0.00045051194539249146, + "loss": 5.803, + "step": 4830 + }, + { + "epoch": 1.6488054607508533, + "grad_norm": 2.941983938217163, + "learning_rate": 0.00045039817974971557, + "loss": 6.7919, + "step": 4831 + }, + { + "epoch": 1.6491467576791807, + "grad_norm": 3.0571041107177734, + "learning_rate": 0.0004502844141069397, + "loss": 6.3158, + "step": 4832 + }, + { + "epoch": 1.6494880546075086, + "grad_norm": 3.1755189895629883, + "learning_rate": 0.00045017064846416383, + "loss": 6.4594, + "step": 4833 + }, + { + "epoch": 1.6498293515358362, + "grad_norm": 3.0600199699401855, + "learning_rate": 0.00045005688282138794, + "loss": 5.9204, + "step": 4834 + }, + { + "epoch": 1.6501706484641638, + "grad_norm": 3.043494939804077, + "learning_rate": 0.0004499431171786121, + "loss": 6.3821, + "step": 4835 + }, + { + "epoch": 1.6505119453924915, + "grad_norm": 3.1068077087402344, + "learning_rate": 0.0004498293515358362, + "loss": 6.4803, + "step": 4836 + }, + { + "epoch": 1.650853242320819, + "grad_norm": 2.9238088130950928, + "learning_rate": 0.0004497155858930603, + "loss": 5.7569, + "step": 4837 + }, + { + "epoch": 1.6511945392491467, + "grad_norm": 3.37758207321167, + "learning_rate": 0.0004496018202502844, + "loss": 6.4423, + "step": 4838 + }, + { + "epoch": 1.6515358361774743, + "grad_norm": 2.9938437938690186, + "learning_rate": 0.0004494880546075085, + "loss": 6.492, + "step": 4839 + }, + { + "epoch": 1.6518771331058022, + "grad_norm": 2.9494314193725586, + "learning_rate": 0.0004493742889647327, + "loss": 5.878, + "step": 4840 + }, + { + "epoch": 1.6522184300341296, + "grad_norm": 3.7142817974090576, + "learning_rate": 0.0004492605233219568, + "loss": 5.7933, + "step": 4841 + }, + { + "epoch": 1.6525597269624575, + "grad_norm": 3.0025503635406494, + "learning_rate": 0.0004491467576791809, + "loss": 6.2064, + "step": 4842 + }, + { + "epoch": 1.6529010238907849, + "grad_norm": 3.326956272125244, + "learning_rate": 0.00044903299203640504, + "loss": 5.7267, + "step": 4843 + }, + { + "epoch": 1.6532423208191127, + "grad_norm": 2.990328311920166, + "learning_rate": 0.00044891922639362915, + "loss": 5.7901, + "step": 4844 + }, + { + "epoch": 1.6535836177474401, + "grad_norm": 3.0785820484161377, + "learning_rate": 0.00044880546075085325, + "loss": 5.9214, + "step": 4845 + }, + { + "epoch": 1.653924914675768, + "grad_norm": 3.112694025039673, + "learning_rate": 0.0004486916951080774, + "loss": 6.3741, + "step": 4846 + }, + { + "epoch": 1.6542662116040956, + "grad_norm": 3.4003303050994873, + "learning_rate": 0.0004485779294653015, + "loss": 5.736, + "step": 4847 + }, + { + "epoch": 1.6546075085324232, + "grad_norm": 2.963510274887085, + "learning_rate": 0.00044846416382252557, + "loss": 6.0387, + "step": 4848 + }, + { + "epoch": 1.6549488054607508, + "grad_norm": 3.8840463161468506, + "learning_rate": 0.0004483503981797497, + "loss": 5.371, + "step": 4849 + }, + { + "epoch": 1.6552901023890785, + "grad_norm": 3.072937488555908, + "learning_rate": 0.00044823663253697383, + "loss": 6.6929, + "step": 4850 + }, + { + "epoch": 1.655631399317406, + "grad_norm": 3.033498764038086, + "learning_rate": 0.00044812286689419794, + "loss": 5.939, + "step": 4851 + }, + { + "epoch": 1.6559726962457337, + "grad_norm": 3.0038321018218994, + "learning_rate": 0.0004480091012514221, + "loss": 6.4066, + "step": 4852 + }, + { + "epoch": 1.6563139931740616, + "grad_norm": 3.0644583702087402, + "learning_rate": 0.0004478953356086462, + "loss": 6.2456, + "step": 4853 + }, + { + "epoch": 1.656655290102389, + "grad_norm": 3.0209004878997803, + "learning_rate": 0.0004477815699658703, + "loss": 6.2575, + "step": 4854 + }, + { + "epoch": 1.6569965870307168, + "grad_norm": 2.9037437438964844, + "learning_rate": 0.00044766780432309446, + "loss": 6.1034, + "step": 4855 + }, + { + "epoch": 1.6573378839590442, + "grad_norm": 2.919391393661499, + "learning_rate": 0.00044755403868031857, + "loss": 6.4839, + "step": 4856 + }, + { + "epoch": 1.657679180887372, + "grad_norm": 2.8986079692840576, + "learning_rate": 0.0004474402730375426, + "loss": 6.5156, + "step": 4857 + }, + { + "epoch": 1.6580204778156995, + "grad_norm": 3.268554210662842, + "learning_rate": 0.0004473265073947668, + "loss": 6.1534, + "step": 4858 + }, + { + "epoch": 1.6583617747440274, + "grad_norm": 3.4434092044830322, + "learning_rate": 0.0004472127417519909, + "loss": 5.8377, + "step": 4859 + }, + { + "epoch": 1.658703071672355, + "grad_norm": 2.988135814666748, + "learning_rate": 0.00044709897610921504, + "loss": 6.4622, + "step": 4860 + }, + { + "epoch": 1.6590443686006826, + "grad_norm": 3.0992002487182617, + "learning_rate": 0.00044698521046643915, + "loss": 6.4363, + "step": 4861 + }, + { + "epoch": 1.6593856655290102, + "grad_norm": 3.1471357345581055, + "learning_rate": 0.00044687144482366325, + "loss": 6.0289, + "step": 4862 + }, + { + "epoch": 1.6597269624573379, + "grad_norm": 3.0062060356140137, + "learning_rate": 0.0004467576791808874, + "loss": 6.4248, + "step": 4863 + }, + { + "epoch": 1.6600682593856655, + "grad_norm": 3.003093957901001, + "learning_rate": 0.0004466439135381115, + "loss": 6.4863, + "step": 4864 + }, + { + "epoch": 1.6604095563139931, + "grad_norm": 3.077218770980835, + "learning_rate": 0.0004465301478953356, + "loss": 6.3079, + "step": 4865 + }, + { + "epoch": 1.660750853242321, + "grad_norm": 3.4413108825683594, + "learning_rate": 0.0004464163822525598, + "loss": 5.9743, + "step": 4866 + }, + { + "epoch": 1.6610921501706484, + "grad_norm": 3.0126073360443115, + "learning_rate": 0.00044630261660978383, + "loss": 6.3671, + "step": 4867 + }, + { + "epoch": 1.6614334470989762, + "grad_norm": 3.0863595008850098, + "learning_rate": 0.00044618885096700794, + "loss": 6.6083, + "step": 4868 + }, + { + "epoch": 1.6617747440273036, + "grad_norm": 3.1746888160705566, + "learning_rate": 0.0004460750853242321, + "loss": 6.5011, + "step": 4869 + }, + { + "epoch": 1.6621160409556315, + "grad_norm": 3.1492929458618164, + "learning_rate": 0.0004459613196814562, + "loss": 6.4877, + "step": 4870 + }, + { + "epoch": 1.6624573378839589, + "grad_norm": 3.422447443008423, + "learning_rate": 0.0004458475540386803, + "loss": 5.9579, + "step": 4871 + }, + { + "epoch": 1.6627986348122867, + "grad_norm": 3.057371139526367, + "learning_rate": 0.00044573378839590447, + "loss": 6.0468, + "step": 4872 + }, + { + "epoch": 1.6631399317406144, + "grad_norm": 4.449178695678711, + "learning_rate": 0.00044562002275312857, + "loss": 5.8584, + "step": 4873 + }, + { + "epoch": 1.663481228668942, + "grad_norm": 3.1592650413513184, + "learning_rate": 0.0004455062571103527, + "loss": 6.3337, + "step": 4874 + }, + { + "epoch": 1.6638225255972696, + "grad_norm": 3.210031509399414, + "learning_rate": 0.00044539249146757683, + "loss": 6.4076, + "step": 4875 + }, + { + "epoch": 1.6641638225255972, + "grad_norm": 3.0083580017089844, + "learning_rate": 0.00044527872582480094, + "loss": 6.2168, + "step": 4876 + }, + { + "epoch": 1.6645051194539249, + "grad_norm": 2.0590007305145264, + "learning_rate": 0.000445164960182025, + "loss": 3.1449, + "step": 4877 + }, + { + "epoch": 1.6648464163822525, + "grad_norm": 3.020071506500244, + "learning_rate": 0.00044505119453924915, + "loss": 6.0866, + "step": 4878 + }, + { + "epoch": 1.6651877133105804, + "grad_norm": 4.307253360748291, + "learning_rate": 0.00044493742889647325, + "loss": 6.2752, + "step": 4879 + }, + { + "epoch": 1.6655290102389078, + "grad_norm": 3.5636332035064697, + "learning_rate": 0.00044482366325369736, + "loss": 4.8149, + "step": 4880 + }, + { + "epoch": 1.6658703071672356, + "grad_norm": 3.1198694705963135, + "learning_rate": 0.0004447098976109215, + "loss": 6.368, + "step": 4881 + }, + { + "epoch": 1.666211604095563, + "grad_norm": 3.0112080574035645, + "learning_rate": 0.0004445961319681456, + "loss": 6.3972, + "step": 4882 + }, + { + "epoch": 1.6665529010238909, + "grad_norm": 3.0651512145996094, + "learning_rate": 0.0004444823663253698, + "loss": 4.845, + "step": 4883 + }, + { + "epoch": 1.6668941979522183, + "grad_norm": 3.187875270843506, + "learning_rate": 0.0004443686006825939, + "loss": 6.1866, + "step": 4884 + }, + { + "epoch": 1.6672354948805461, + "grad_norm": 3.1536598205566406, + "learning_rate": 0.000444254835039818, + "loss": 6.3055, + "step": 4885 + }, + { + "epoch": 1.6675767918088737, + "grad_norm": 3.3544106483459473, + "learning_rate": 0.00044414106939704215, + "loss": 6.1421, + "step": 4886 + }, + { + "epoch": 1.6679180887372014, + "grad_norm": 3.0838112831115723, + "learning_rate": 0.0004440273037542662, + "loss": 6.1572, + "step": 4887 + }, + { + "epoch": 1.668259385665529, + "grad_norm": 2.9401392936706543, + "learning_rate": 0.0004439135381114903, + "loss": 6.5586, + "step": 4888 + }, + { + "epoch": 1.6686006825938566, + "grad_norm": 3.06618595123291, + "learning_rate": 0.00044379977246871447, + "loss": 6.0765, + "step": 4889 + }, + { + "epoch": 1.6689419795221843, + "grad_norm": 2.9306881427764893, + "learning_rate": 0.00044368600682593857, + "loss": 6.2415, + "step": 4890 + }, + { + "epoch": 1.6692832764505119, + "grad_norm": 3.1481986045837402, + "learning_rate": 0.0004435722411831627, + "loss": 5.8938, + "step": 4891 + }, + { + "epoch": 1.6696245733788397, + "grad_norm": 3.0431253910064697, + "learning_rate": 0.00044345847554038684, + "loss": 6.1017, + "step": 4892 + }, + { + "epoch": 1.6699658703071671, + "grad_norm": 2.9972074031829834, + "learning_rate": 0.00044334470989761094, + "loss": 6.3889, + "step": 4893 + }, + { + "epoch": 1.670307167235495, + "grad_norm": 3.1371490955352783, + "learning_rate": 0.00044323094425483504, + "loss": 6.6684, + "step": 4894 + }, + { + "epoch": 1.6706484641638224, + "grad_norm": 3.0418312549591064, + "learning_rate": 0.0004431171786120592, + "loss": 6.478, + "step": 4895 + }, + { + "epoch": 1.6709897610921502, + "grad_norm": 3.074183464050293, + "learning_rate": 0.00044300341296928325, + "loss": 6.388, + "step": 4896 + }, + { + "epoch": 1.6713310580204777, + "grad_norm": 3.0589749813079834, + "learning_rate": 0.00044288964732650736, + "loss": 6.2, + "step": 4897 + }, + { + "epoch": 1.6716723549488055, + "grad_norm": 3.161790609359741, + "learning_rate": 0.0004427758816837315, + "loss": 5.6493, + "step": 4898 + }, + { + "epoch": 1.6720136518771331, + "grad_norm": 2.9835970401763916, + "learning_rate": 0.0004426621160409556, + "loss": 6.1388, + "step": 4899 + }, + { + "epoch": 1.6723549488054608, + "grad_norm": 3.0649311542510986, + "learning_rate": 0.00044254835039817973, + "loss": 5.9025, + "step": 4900 + }, + { + "epoch": 1.6726962457337884, + "grad_norm": 3.1138432025909424, + "learning_rate": 0.0004424345847554039, + "loss": 6.4551, + "step": 4901 + }, + { + "epoch": 1.673037542662116, + "grad_norm": 3.0755131244659424, + "learning_rate": 0.000442320819112628, + "loss": 6.5686, + "step": 4902 + }, + { + "epoch": 1.6733788395904436, + "grad_norm": 3.0821003913879395, + "learning_rate": 0.00044220705346985215, + "loss": 6.092, + "step": 4903 + }, + { + "epoch": 1.6737201365187713, + "grad_norm": 2.937201499938965, + "learning_rate": 0.00044209328782707626, + "loss": 6.2059, + "step": 4904 + }, + { + "epoch": 1.6740614334470991, + "grad_norm": 2.9311842918395996, + "learning_rate": 0.00044197952218430036, + "loss": 6.4499, + "step": 4905 + }, + { + "epoch": 1.6744027303754265, + "grad_norm": 3.036937713623047, + "learning_rate": 0.00044186575654152447, + "loss": 6.1801, + "step": 4906 + }, + { + "epoch": 1.6747440273037544, + "grad_norm": 3.5501644611358643, + "learning_rate": 0.00044175199089874857, + "loss": 5.8053, + "step": 4907 + }, + { + "epoch": 1.6750853242320818, + "grad_norm": 3.023075580596924, + "learning_rate": 0.0004416382252559727, + "loss": 6.1232, + "step": 4908 + }, + { + "epoch": 1.6754266211604096, + "grad_norm": 3.7344937324523926, + "learning_rate": 0.00044152445961319684, + "loss": 4.8391, + "step": 4909 + }, + { + "epoch": 1.675767918088737, + "grad_norm": 4.388406276702881, + "learning_rate": 0.00044141069397042094, + "loss": 6.2587, + "step": 4910 + }, + { + "epoch": 1.676109215017065, + "grad_norm": 3.0161640644073486, + "learning_rate": 0.00044129692832764505, + "loss": 6.2966, + "step": 4911 + }, + { + "epoch": 1.6764505119453925, + "grad_norm": 3.0810742378234863, + "learning_rate": 0.0004411831626848692, + "loss": 6.915, + "step": 4912 + }, + { + "epoch": 1.6767918088737201, + "grad_norm": 3.1341419219970703, + "learning_rate": 0.0004410693970420933, + "loss": 7.0646, + "step": 4913 + }, + { + "epoch": 1.6771331058020478, + "grad_norm": 3.8616952896118164, + "learning_rate": 0.0004409556313993174, + "loss": 5.5981, + "step": 4914 + }, + { + "epoch": 1.6774744027303754, + "grad_norm": 3.4659619331359863, + "learning_rate": 0.0004408418657565416, + "loss": 5.83, + "step": 4915 + }, + { + "epoch": 1.677815699658703, + "grad_norm": 3.06744647026062, + "learning_rate": 0.0004407281001137656, + "loss": 6.2469, + "step": 4916 + }, + { + "epoch": 1.6781569965870307, + "grad_norm": 3.057793140411377, + "learning_rate": 0.00044061433447098973, + "loss": 6.788, + "step": 4917 + }, + { + "epoch": 1.6784982935153585, + "grad_norm": 2.971437931060791, + "learning_rate": 0.0004405005688282139, + "loss": 6.2344, + "step": 4918 + }, + { + "epoch": 1.678839590443686, + "grad_norm": 3.025042772293091, + "learning_rate": 0.000440386803185438, + "loss": 6.3374, + "step": 4919 + }, + { + "epoch": 1.6791808873720138, + "grad_norm": 3.0731565952301025, + "learning_rate": 0.0004402730375426621, + "loss": 5.7545, + "step": 4920 + }, + { + "epoch": 1.6795221843003412, + "grad_norm": 2.9983479976654053, + "learning_rate": 0.00044015927189988626, + "loss": 6.1115, + "step": 4921 + }, + { + "epoch": 1.679863481228669, + "grad_norm": 2.97145414352417, + "learning_rate": 0.00044004550625711036, + "loss": 6.2915, + "step": 4922 + }, + { + "epoch": 1.6802047781569964, + "grad_norm": 3.1883833408355713, + "learning_rate": 0.0004399317406143345, + "loss": 5.9769, + "step": 4923 + }, + { + "epoch": 1.6805460750853243, + "grad_norm": 2.9268434047698975, + "learning_rate": 0.0004398179749715586, + "loss": 6.5054, + "step": 4924 + }, + { + "epoch": 1.680887372013652, + "grad_norm": 3.26137375831604, + "learning_rate": 0.0004397042093287827, + "loss": 6.3084, + "step": 4925 + }, + { + "epoch": 1.6812286689419795, + "grad_norm": 3.009206533432007, + "learning_rate": 0.00043959044368600684, + "loss": 6.0177, + "step": 4926 + }, + { + "epoch": 1.6815699658703072, + "grad_norm": 3.147836685180664, + "learning_rate": 0.00043947667804323094, + "loss": 5.1989, + "step": 4927 + }, + { + "epoch": 1.6819112627986348, + "grad_norm": 3.239095687866211, + "learning_rate": 0.00043936291240045505, + "loss": 6.1885, + "step": 4928 + }, + { + "epoch": 1.6822525597269624, + "grad_norm": 3.0361344814300537, + "learning_rate": 0.0004392491467576792, + "loss": 6.1484, + "step": 4929 + }, + { + "epoch": 1.68259385665529, + "grad_norm": 2.998265504837036, + "learning_rate": 0.0004391353811149033, + "loss": 6.7742, + "step": 4930 + }, + { + "epoch": 1.682935153583618, + "grad_norm": 3.0824317932128906, + "learning_rate": 0.0004390216154721274, + "loss": 6.5443, + "step": 4931 + }, + { + "epoch": 1.6832764505119453, + "grad_norm": 5.664287090301514, + "learning_rate": 0.0004389078498293516, + "loss": 5.2706, + "step": 4932 + }, + { + "epoch": 1.6836177474402731, + "grad_norm": 2.9945015907287598, + "learning_rate": 0.0004387940841865757, + "loss": 6.2064, + "step": 4933 + }, + { + "epoch": 1.6839590443686006, + "grad_norm": 3.081329584121704, + "learning_rate": 0.0004386803185437998, + "loss": 6.5694, + "step": 4934 + }, + { + "epoch": 1.6843003412969284, + "grad_norm": 3.1098530292510986, + "learning_rate": 0.0004385665529010239, + "loss": 5.8361, + "step": 4935 + }, + { + "epoch": 1.6846416382252558, + "grad_norm": 3.166102170944214, + "learning_rate": 0.000438452787258248, + "loss": 6.0032, + "step": 4936 + }, + { + "epoch": 1.6849829351535837, + "grad_norm": 3.0542194843292236, + "learning_rate": 0.0004383390216154721, + "loss": 5.7641, + "step": 4937 + }, + { + "epoch": 1.6853242320819113, + "grad_norm": 2.8499386310577393, + "learning_rate": 0.00043822525597269626, + "loss": 6.0653, + "step": 4938 + }, + { + "epoch": 1.685665529010239, + "grad_norm": 3.0446012020111084, + "learning_rate": 0.00043811149032992036, + "loss": 6.3366, + "step": 4939 + }, + { + "epoch": 1.6860068259385665, + "grad_norm": 3.0523667335510254, + "learning_rate": 0.00043799772468714447, + "loss": 6.0754, + "step": 4940 + }, + { + "epoch": 1.6863481228668942, + "grad_norm": 3.245394229888916, + "learning_rate": 0.00043788395904436863, + "loss": 6.1414, + "step": 4941 + }, + { + "epoch": 1.6866894197952218, + "grad_norm": 3.0902605056762695, + "learning_rate": 0.00043777019340159273, + "loss": 6.5985, + "step": 4942 + }, + { + "epoch": 1.6870307167235494, + "grad_norm": 2.9489684104919434, + "learning_rate": 0.0004376564277588169, + "loss": 6.3939, + "step": 4943 + }, + { + "epoch": 1.6873720136518773, + "grad_norm": 3.0497970581054688, + "learning_rate": 0.000437542662116041, + "loss": 6.8189, + "step": 4944 + }, + { + "epoch": 1.6877133105802047, + "grad_norm": 3.0870232582092285, + "learning_rate": 0.00043742889647326505, + "loss": 5.96, + "step": 4945 + }, + { + "epoch": 1.6880546075085325, + "grad_norm": 3.0094494819641113, + "learning_rate": 0.0004373151308304892, + "loss": 6.2432, + "step": 4946 + }, + { + "epoch": 1.68839590443686, + "grad_norm": 3.130962371826172, + "learning_rate": 0.0004372013651877133, + "loss": 6.3456, + "step": 4947 + }, + { + "epoch": 1.6887372013651878, + "grad_norm": 2.9619531631469727, + "learning_rate": 0.0004370875995449374, + "loss": 6.666, + "step": 4948 + }, + { + "epoch": 1.6890784982935152, + "grad_norm": 3.149620532989502, + "learning_rate": 0.0004369738339021616, + "loss": 6.0721, + "step": 4949 + }, + { + "epoch": 1.689419795221843, + "grad_norm": 2.937761068344116, + "learning_rate": 0.0004368600682593857, + "loss": 6.7564, + "step": 4950 + }, + { + "epoch": 1.6897610921501707, + "grad_norm": 3.5034937858581543, + "learning_rate": 0.0004367463026166098, + "loss": 5.4092, + "step": 4951 + }, + { + "epoch": 1.6901023890784983, + "grad_norm": 3.117332935333252, + "learning_rate": 0.00043663253697383394, + "loss": 6.3505, + "step": 4952 + }, + { + "epoch": 1.690443686006826, + "grad_norm": 5.035308361053467, + "learning_rate": 0.00043651877133105805, + "loss": 5.6475, + "step": 4953 + }, + { + "epoch": 1.6907849829351536, + "grad_norm": 3.0669705867767334, + "learning_rate": 0.00043640500568828215, + "loss": 5.8391, + "step": 4954 + }, + { + "epoch": 1.6911262798634812, + "grad_norm": 3.2862114906311035, + "learning_rate": 0.00043629124004550626, + "loss": 5.9049, + "step": 4955 + }, + { + "epoch": 1.6914675767918088, + "grad_norm": 3.088000774383545, + "learning_rate": 0.00043617747440273036, + "loss": 6.5768, + "step": 4956 + }, + { + "epoch": 1.6918088737201367, + "grad_norm": 3.001810073852539, + "learning_rate": 0.00043606370875995447, + "loss": 6.2097, + "step": 4957 + }, + { + "epoch": 1.692150170648464, + "grad_norm": 3.081796407699585, + "learning_rate": 0.00043594994311717863, + "loss": 6.8779, + "step": 4958 + }, + { + "epoch": 1.692491467576792, + "grad_norm": 3.0735950469970703, + "learning_rate": 0.00043583617747440273, + "loss": 5.9983, + "step": 4959 + }, + { + "epoch": 1.6928327645051193, + "grad_norm": 3.0525989532470703, + "learning_rate": 0.00043572241183162684, + "loss": 6.4789, + "step": 4960 + }, + { + "epoch": 1.6931740614334472, + "grad_norm": 2.9428293704986572, + "learning_rate": 0.000435608646188851, + "loss": 6.3712, + "step": 4961 + }, + { + "epoch": 1.6935153583617746, + "grad_norm": 3.002599000930786, + "learning_rate": 0.0004354948805460751, + "loss": 6.2377, + "step": 4962 + }, + { + "epoch": 1.6938566552901024, + "grad_norm": 3.1271250247955322, + "learning_rate": 0.00043538111490329926, + "loss": 6.357, + "step": 4963 + }, + { + "epoch": 1.69419795221843, + "grad_norm": 2.927199125289917, + "learning_rate": 0.0004352673492605233, + "loss": 6.3556, + "step": 4964 + }, + { + "epoch": 1.6945392491467577, + "grad_norm": 3.364306926727295, + "learning_rate": 0.0004351535836177474, + "loss": 6.1131, + "step": 4965 + }, + { + "epoch": 1.6948805460750853, + "grad_norm": 3.0801026821136475, + "learning_rate": 0.0004350398179749716, + "loss": 5.7936, + "step": 4966 + }, + { + "epoch": 1.695221843003413, + "grad_norm": 4.118381500244141, + "learning_rate": 0.0004349260523321957, + "loss": 4.5499, + "step": 4967 + }, + { + "epoch": 1.6955631399317406, + "grad_norm": 3.0644776821136475, + "learning_rate": 0.0004348122866894198, + "loss": 6.8683, + "step": 4968 + }, + { + "epoch": 1.6959044368600682, + "grad_norm": 3.154467821121216, + "learning_rate": 0.00043469852104664395, + "loss": 6.1359, + "step": 4969 + }, + { + "epoch": 1.696245733788396, + "grad_norm": 2.9244272708892822, + "learning_rate": 0.00043458475540386805, + "loss": 5.8604, + "step": 4970 + }, + { + "epoch": 1.6965870307167235, + "grad_norm": 3.170546054840088, + "learning_rate": 0.00043447098976109216, + "loss": 6.1517, + "step": 4971 + }, + { + "epoch": 1.6969283276450513, + "grad_norm": 3.1615684032440186, + "learning_rate": 0.0004343572241183163, + "loss": 6.2409, + "step": 4972 + }, + { + "epoch": 1.6972696245733787, + "grad_norm": 2.9471771717071533, + "learning_rate": 0.0004342434584755404, + "loss": 5.7621, + "step": 4973 + }, + { + "epoch": 1.6976109215017066, + "grad_norm": 3.0422165393829346, + "learning_rate": 0.00043412969283276447, + "loss": 6.4401, + "step": 4974 + }, + { + "epoch": 1.697952218430034, + "grad_norm": 3.3858096599578857, + "learning_rate": 0.00043401592718998863, + "loss": 5.916, + "step": 4975 + }, + { + "epoch": 1.6982935153583618, + "grad_norm": 3.054570198059082, + "learning_rate": 0.00043390216154721273, + "loss": 6.6209, + "step": 4976 + }, + { + "epoch": 1.6986348122866894, + "grad_norm": 3.7693064212799072, + "learning_rate": 0.00043378839590443684, + "loss": 5.5803, + "step": 4977 + }, + { + "epoch": 1.698976109215017, + "grad_norm": 3.033956289291382, + "learning_rate": 0.000433674630261661, + "loss": 6.0595, + "step": 4978 + }, + { + "epoch": 1.6993174061433447, + "grad_norm": 6.4478912353515625, + "learning_rate": 0.0004335608646188851, + "loss": 5.3277, + "step": 4979 + }, + { + "epoch": 1.6996587030716723, + "grad_norm": 3.0002944469451904, + "learning_rate": 0.0004334470989761092, + "loss": 6.3766, + "step": 4980 + }, + { + "epoch": 1.7, + "grad_norm": 3.1205992698669434, + "learning_rate": 0.00043333333333333337, + "loss": 6.3214, + "step": 4981 + }, + { + "epoch": 1.7003412969283276, + "grad_norm": 3.3028411865234375, + "learning_rate": 0.00043321956769055747, + "loss": 6.2618, + "step": 4982 + }, + { + "epoch": 1.7006825938566554, + "grad_norm": 3.0650041103363037, + "learning_rate": 0.00043310580204778163, + "loss": 6.48, + "step": 4983 + }, + { + "epoch": 1.7010238907849828, + "grad_norm": 2.9815592765808105, + "learning_rate": 0.0004329920364050057, + "loss": 5.3141, + "step": 4984 + }, + { + "epoch": 1.7013651877133107, + "grad_norm": 3.0242958068847656, + "learning_rate": 0.0004328782707622298, + "loss": 6.1838, + "step": 4985 + }, + { + "epoch": 1.701706484641638, + "grad_norm": 4.361161708831787, + "learning_rate": 0.00043276450511945395, + "loss": 4.7251, + "step": 4986 + }, + { + "epoch": 1.702047781569966, + "grad_norm": 2.9456946849823, + "learning_rate": 0.00043265073947667805, + "loss": 6.3513, + "step": 4987 + }, + { + "epoch": 1.7023890784982934, + "grad_norm": 3.1762263774871826, + "learning_rate": 0.00043253697383390216, + "loss": 6.6752, + "step": 4988 + }, + { + "epoch": 1.7027303754266212, + "grad_norm": 3.128908634185791, + "learning_rate": 0.0004324232081911263, + "loss": 6.6874, + "step": 4989 + }, + { + "epoch": 1.7030716723549488, + "grad_norm": 2.998319625854492, + "learning_rate": 0.0004323094425483504, + "loss": 5.9913, + "step": 4990 + }, + { + "epoch": 1.7034129692832765, + "grad_norm": 3.161729097366333, + "learning_rate": 0.0004321956769055745, + "loss": 6.0972, + "step": 4991 + }, + { + "epoch": 1.703754266211604, + "grad_norm": 3.145906686782837, + "learning_rate": 0.0004320819112627987, + "loss": 6.0995, + "step": 4992 + }, + { + "epoch": 1.7040955631399317, + "grad_norm": 2.9762144088745117, + "learning_rate": 0.00043196814562002274, + "loss": 6.3909, + "step": 4993 + }, + { + "epoch": 1.7044368600682593, + "grad_norm": 3.2298426628112793, + "learning_rate": 0.00043185437997724684, + "loss": 5.4184, + "step": 4994 + }, + { + "epoch": 1.704778156996587, + "grad_norm": 3.0982184410095215, + "learning_rate": 0.000431740614334471, + "loss": 5.8238, + "step": 4995 + }, + { + "epoch": 1.7051194539249148, + "grad_norm": 2.9920384883880615, + "learning_rate": 0.0004316268486916951, + "loss": 5.8196, + "step": 4996 + }, + { + "epoch": 1.7054607508532422, + "grad_norm": 4.23321533203125, + "learning_rate": 0.0004315130830489192, + "loss": 5.5014, + "step": 4997 + }, + { + "epoch": 1.70580204778157, + "grad_norm": 3.3732895851135254, + "learning_rate": 0.00043139931740614337, + "loss": 5.9258, + "step": 4998 + }, + { + "epoch": 1.7061433447098975, + "grad_norm": 3.0920844078063965, + "learning_rate": 0.0004312855517633675, + "loss": 6.3851, + "step": 4999 + }, + { + "epoch": 1.7064846416382253, + "grad_norm": 3.1090736389160156, + "learning_rate": 0.0004311717861205916, + "loss": 6.5109, + "step": 5000 + }, + { + "epoch": 1.7068259385665527, + "grad_norm": 2.9566802978515625, + "learning_rate": 0.00043105802047781574, + "loss": 6.5596, + "step": 5001 + }, + { + "epoch": 1.7071672354948806, + "grad_norm": 2.936339855194092, + "learning_rate": 0.00043094425483503984, + "loss": 6.7089, + "step": 5002 + }, + { + "epoch": 1.7075085324232082, + "grad_norm": 3.0246903896331787, + "learning_rate": 0.0004308304891922639, + "loss": 6.5555, + "step": 5003 + }, + { + "epoch": 1.7078498293515358, + "grad_norm": 3.0471384525299072, + "learning_rate": 0.00043071672354948805, + "loss": 6.3993, + "step": 5004 + }, + { + "epoch": 1.7081911262798635, + "grad_norm": 3.12800931930542, + "learning_rate": 0.00043060295790671216, + "loss": 6.0564, + "step": 5005 + }, + { + "epoch": 1.708532423208191, + "grad_norm": 6.632503032684326, + "learning_rate": 0.0004304891922639363, + "loss": 5.0925, + "step": 5006 + }, + { + "epoch": 1.7088737201365187, + "grad_norm": 3.128831624984741, + "learning_rate": 0.0004303754266211604, + "loss": 5.274, + "step": 5007 + }, + { + "epoch": 1.7092150170648464, + "grad_norm": 3.365473508834839, + "learning_rate": 0.0004302616609783845, + "loss": 6.5734, + "step": 5008 + }, + { + "epoch": 1.7095563139931742, + "grad_norm": 3.8749191761016846, + "learning_rate": 0.0004301478953356087, + "loss": 6.8053, + "step": 5009 + }, + { + "epoch": 1.7098976109215016, + "grad_norm": 3.1307623386383057, + "learning_rate": 0.0004300341296928328, + "loss": 6.4477, + "step": 5010 + }, + { + "epoch": 1.7102389078498295, + "grad_norm": 2.960883140563965, + "learning_rate": 0.0004299203640500569, + "loss": 6.4975, + "step": 5011 + }, + { + "epoch": 1.7105802047781569, + "grad_norm": 2.9841647148132324, + "learning_rate": 0.00042980659840728105, + "loss": 6.5142, + "step": 5012 + }, + { + "epoch": 1.7109215017064847, + "grad_norm": 3.0024847984313965, + "learning_rate": 0.0004296928327645051, + "loss": 6.8747, + "step": 5013 + }, + { + "epoch": 1.7112627986348121, + "grad_norm": 3.0802016258239746, + "learning_rate": 0.0004295790671217292, + "loss": 5.6291, + "step": 5014 + }, + { + "epoch": 1.71160409556314, + "grad_norm": 3.068802833557129, + "learning_rate": 0.00042946530147895337, + "loss": 5.808, + "step": 5015 + }, + { + "epoch": 1.7119453924914676, + "grad_norm": 2.9802236557006836, + "learning_rate": 0.0004293515358361775, + "loss": 6.4699, + "step": 5016 + }, + { + "epoch": 1.7122866894197952, + "grad_norm": 2.9703474044799805, + "learning_rate": 0.0004292377701934016, + "loss": 5.8941, + "step": 5017 + }, + { + "epoch": 1.7126279863481229, + "grad_norm": 2.9278199672698975, + "learning_rate": 0.00042912400455062574, + "loss": 6.2988, + "step": 5018 + }, + { + "epoch": 1.7129692832764505, + "grad_norm": 3.066082239151001, + "learning_rate": 0.00042901023890784984, + "loss": 6.5867, + "step": 5019 + }, + { + "epoch": 1.713310580204778, + "grad_norm": 3.058166980743408, + "learning_rate": 0.00042889647326507395, + "loss": 6.0648, + "step": 5020 + }, + { + "epoch": 1.7136518771331057, + "grad_norm": 3.1347908973693848, + "learning_rate": 0.0004287827076222981, + "loss": 5.8548, + "step": 5021 + }, + { + "epoch": 1.7139931740614336, + "grad_norm": 3.1643781661987305, + "learning_rate": 0.0004286689419795222, + "loss": 5.9265, + "step": 5022 + }, + { + "epoch": 1.714334470989761, + "grad_norm": 3.030947685241699, + "learning_rate": 0.00042855517633674626, + "loss": 5.989, + "step": 5023 + }, + { + "epoch": 1.7146757679180888, + "grad_norm": 3.111829996109009, + "learning_rate": 0.0004284414106939704, + "loss": 6.5268, + "step": 5024 + }, + { + "epoch": 1.7150170648464163, + "grad_norm": 2.8761911392211914, + "learning_rate": 0.00042832764505119453, + "loss": 6.5559, + "step": 5025 + }, + { + "epoch": 1.715358361774744, + "grad_norm": 5.675846099853516, + "learning_rate": 0.0004282138794084187, + "loss": 5.3305, + "step": 5026 + }, + { + "epoch": 1.7156996587030715, + "grad_norm": 3.131277322769165, + "learning_rate": 0.0004281001137656428, + "loss": 6.8141, + "step": 5027 + }, + { + "epoch": 1.7160409556313994, + "grad_norm": 3.332944631576538, + "learning_rate": 0.0004279863481228669, + "loss": 6.1245, + "step": 5028 + }, + { + "epoch": 1.716382252559727, + "grad_norm": 3.1765148639678955, + "learning_rate": 0.00042787258248009106, + "loss": 5.8214, + "step": 5029 + }, + { + "epoch": 1.7167235494880546, + "grad_norm": 3.1236507892608643, + "learning_rate": 0.00042775881683731516, + "loss": 5.903, + "step": 5030 + }, + { + "epoch": 1.7170648464163822, + "grad_norm": 3.129188060760498, + "learning_rate": 0.00042764505119453927, + "loss": 6.1272, + "step": 5031 + }, + { + "epoch": 1.7174061433447099, + "grad_norm": 3.017763614654541, + "learning_rate": 0.00042753128555176337, + "loss": 6.2087, + "step": 5032 + }, + { + "epoch": 1.7177474402730375, + "grad_norm": 3.383964776992798, + "learning_rate": 0.0004274175199089875, + "loss": 5.143, + "step": 5033 + }, + { + "epoch": 1.7180887372013651, + "grad_norm": 3.011268377304077, + "learning_rate": 0.0004273037542662116, + "loss": 6.7169, + "step": 5034 + }, + { + "epoch": 1.718430034129693, + "grad_norm": 3.053954601287842, + "learning_rate": 0.00042718998862343574, + "loss": 5.9654, + "step": 5035 + }, + { + "epoch": 1.7187713310580204, + "grad_norm": 3.0908567905426025, + "learning_rate": 0.00042707622298065984, + "loss": 6.1534, + "step": 5036 + }, + { + "epoch": 1.7191126279863482, + "grad_norm": 3.233099937438965, + "learning_rate": 0.00042696245733788395, + "loss": 5.985, + "step": 5037 + }, + { + "epoch": 1.7194539249146756, + "grad_norm": 2.9370503425598145, + "learning_rate": 0.0004268486916951081, + "loss": 6.4747, + "step": 5038 + }, + { + "epoch": 1.7197952218430035, + "grad_norm": 3.1536028385162354, + "learning_rate": 0.0004267349260523322, + "loss": 5.9485, + "step": 5039 + }, + { + "epoch": 1.7201365187713311, + "grad_norm": 3.166968584060669, + "learning_rate": 0.0004266211604095563, + "loss": 6.7526, + "step": 5040 + }, + { + "epoch": 1.7204778156996587, + "grad_norm": 3.0940346717834473, + "learning_rate": 0.0004265073947667805, + "loss": 6.4837, + "step": 5041 + }, + { + "epoch": 1.7208191126279864, + "grad_norm": 2.989959955215454, + "learning_rate": 0.00042639362912400453, + "loss": 6.5663, + "step": 5042 + }, + { + "epoch": 1.721160409556314, + "grad_norm": 2.976271152496338, + "learning_rate": 0.00042627986348122863, + "loss": 6.0443, + "step": 5043 + }, + { + "epoch": 1.7215017064846416, + "grad_norm": 3.0593347549438477, + "learning_rate": 0.0004261660978384528, + "loss": 5.6778, + "step": 5044 + }, + { + "epoch": 1.7218430034129693, + "grad_norm": 3.020432710647583, + "learning_rate": 0.0004260523321956769, + "loss": 6.2861, + "step": 5045 + }, + { + "epoch": 1.7221843003412969, + "grad_norm": 3.6655890941619873, + "learning_rate": 0.00042593856655290106, + "loss": 5.8077, + "step": 5046 + }, + { + "epoch": 1.7225255972696245, + "grad_norm": 2.9208641052246094, + "learning_rate": 0.00042582480091012516, + "loss": 6.405, + "step": 5047 + }, + { + "epoch": 1.7228668941979524, + "grad_norm": 2.9895660877227783, + "learning_rate": 0.00042571103526734927, + "loss": 6.1637, + "step": 5048 + }, + { + "epoch": 1.7232081911262798, + "grad_norm": 3.0086591243743896, + "learning_rate": 0.0004255972696245734, + "loss": 6.4718, + "step": 5049 + }, + { + "epoch": 1.7235494880546076, + "grad_norm": 3.0350987911224365, + "learning_rate": 0.00042548350398179753, + "loss": 6.4154, + "step": 5050 + }, + { + "epoch": 1.723890784982935, + "grad_norm": 3.0023648738861084, + "learning_rate": 0.00042536973833902164, + "loss": 6.7531, + "step": 5051 + }, + { + "epoch": 1.7242320819112629, + "grad_norm": 2.9815728664398193, + "learning_rate": 0.00042525597269624574, + "loss": 5.6956, + "step": 5052 + }, + { + "epoch": 1.7245733788395905, + "grad_norm": 3.007316827774048, + "learning_rate": 0.00042514220705346985, + "loss": 6.5796, + "step": 5053 + }, + { + "epoch": 1.7249146757679181, + "grad_norm": 3.032801866531372, + "learning_rate": 0.00042502844141069395, + "loss": 6.1046, + "step": 5054 + }, + { + "epoch": 1.7252559726962458, + "grad_norm": 2.990694284439087, + "learning_rate": 0.0004249146757679181, + "loss": 6.188, + "step": 5055 + }, + { + "epoch": 1.7255972696245734, + "grad_norm": 3.120084285736084, + "learning_rate": 0.0004248009101251422, + "loss": 6.7109, + "step": 5056 + }, + { + "epoch": 1.725938566552901, + "grad_norm": 3.544955253601074, + "learning_rate": 0.0004246871444823663, + "loss": 5.6435, + "step": 5057 + }, + { + "epoch": 1.7262798634812286, + "grad_norm": 2.977841377258301, + "learning_rate": 0.0004245733788395905, + "loss": 5.9886, + "step": 5058 + }, + { + "epoch": 1.7266211604095563, + "grad_norm": 3.0229032039642334, + "learning_rate": 0.0004244596131968146, + "loss": 6.4989, + "step": 5059 + }, + { + "epoch": 1.726962457337884, + "grad_norm": 2.9560835361480713, + "learning_rate": 0.0004243458475540387, + "loss": 6.0739, + "step": 5060 + }, + { + "epoch": 1.7273037542662117, + "grad_norm": 2.8804562091827393, + "learning_rate": 0.00042423208191126285, + "loss": 6.0115, + "step": 5061 + }, + { + "epoch": 1.7276450511945391, + "grad_norm": 2.9489169120788574, + "learning_rate": 0.0004241183162684869, + "loss": 6.7899, + "step": 5062 + }, + { + "epoch": 1.727986348122867, + "grad_norm": 3.0079617500305176, + "learning_rate": 0.000424004550625711, + "loss": 6.6014, + "step": 5063 + }, + { + "epoch": 1.7283276450511944, + "grad_norm": 3.1168062686920166, + "learning_rate": 0.00042389078498293516, + "loss": 6.5893, + "step": 5064 + }, + { + "epoch": 1.7286689419795223, + "grad_norm": 2.953608512878418, + "learning_rate": 0.00042377701934015927, + "loss": 5.8904, + "step": 5065 + }, + { + "epoch": 1.7290102389078499, + "grad_norm": 3.050062417984009, + "learning_rate": 0.0004236632536973834, + "loss": 6.3134, + "step": 5066 + }, + { + "epoch": 1.7293515358361775, + "grad_norm": 2.984161138534546, + "learning_rate": 0.00042354948805460753, + "loss": 6.3411, + "step": 5067 + }, + { + "epoch": 1.7296928327645051, + "grad_norm": 3.0105361938476562, + "learning_rate": 0.00042343572241183164, + "loss": 6.1606, + "step": 5068 + }, + { + "epoch": 1.7300341296928328, + "grad_norm": 2.9933488368988037, + "learning_rate": 0.0004233219567690558, + "loss": 6.2474, + "step": 5069 + }, + { + "epoch": 1.7303754266211604, + "grad_norm": 3.0552947521209717, + "learning_rate": 0.0004232081911262799, + "loss": 5.894, + "step": 5070 + }, + { + "epoch": 1.730716723549488, + "grad_norm": 3.3107733726501465, + "learning_rate": 0.00042309442548350395, + "loss": 6.2136, + "step": 5071 + }, + { + "epoch": 1.7310580204778157, + "grad_norm": 3.0709095001220703, + "learning_rate": 0.0004229806598407281, + "loss": 5.9675, + "step": 5072 + }, + { + "epoch": 1.7313993174061433, + "grad_norm": 3.0665359497070312, + "learning_rate": 0.0004228668941979522, + "loss": 6.2627, + "step": 5073 + }, + { + "epoch": 1.7317406143344711, + "grad_norm": 3.0029146671295166, + "learning_rate": 0.0004227531285551763, + "loss": 6.1561, + "step": 5074 + }, + { + "epoch": 1.7320819112627985, + "grad_norm": 3.0567500591278076, + "learning_rate": 0.0004226393629124005, + "loss": 6.0348, + "step": 5075 + }, + { + "epoch": 1.7324232081911264, + "grad_norm": 3.08443546295166, + "learning_rate": 0.0004225255972696246, + "loss": 5.8998, + "step": 5076 + }, + { + "epoch": 1.7327645051194538, + "grad_norm": 2.951045274734497, + "learning_rate": 0.0004224118316268487, + "loss": 6.1719, + "step": 5077 + }, + { + "epoch": 1.7331058020477816, + "grad_norm": 3.0656325817108154, + "learning_rate": 0.00042229806598407285, + "loss": 6.3616, + "step": 5078 + }, + { + "epoch": 1.7334470989761093, + "grad_norm": 2.9396920204162598, + "learning_rate": 0.00042218430034129695, + "loss": 6.2838, + "step": 5079 + }, + { + "epoch": 1.733788395904437, + "grad_norm": 2.96988844871521, + "learning_rate": 0.00042207053469852106, + "loss": 6.6509, + "step": 5080 + }, + { + "epoch": 1.7341296928327645, + "grad_norm": 2.8381896018981934, + "learning_rate": 0.00042195676905574516, + "loss": 6.1965, + "step": 5081 + }, + { + "epoch": 1.7344709897610922, + "grad_norm": 6.883829116821289, + "learning_rate": 0.00042184300341296927, + "loss": 5.7458, + "step": 5082 + }, + { + "epoch": 1.7348122866894198, + "grad_norm": 3.1585566997528076, + "learning_rate": 0.0004217292377701934, + "loss": 6.4391, + "step": 5083 + }, + { + "epoch": 1.7351535836177474, + "grad_norm": 3.090667486190796, + "learning_rate": 0.00042161547212741753, + "loss": 6.7133, + "step": 5084 + }, + { + "epoch": 1.735494880546075, + "grad_norm": 3.033228874206543, + "learning_rate": 0.00042150170648464164, + "loss": 6.2394, + "step": 5085 + }, + { + "epoch": 1.7358361774744027, + "grad_norm": 4.285531520843506, + "learning_rate": 0.00042138794084186574, + "loss": 4.8546, + "step": 5086 + }, + { + "epoch": 1.7361774744027305, + "grad_norm": 3.2763266563415527, + "learning_rate": 0.0004212741751990899, + "loss": 6.2258, + "step": 5087 + }, + { + "epoch": 1.736518771331058, + "grad_norm": 3.614485025405884, + "learning_rate": 0.000421160409556314, + "loss": 5.6639, + "step": 5088 + }, + { + "epoch": 1.7368600682593858, + "grad_norm": 3.088728666305542, + "learning_rate": 0.00042104664391353817, + "loss": 6.7367, + "step": 5089 + }, + { + "epoch": 1.7372013651877132, + "grad_norm": 3.1327497959136963, + "learning_rate": 0.00042093287827076227, + "loss": 5.9986, + "step": 5090 + }, + { + "epoch": 1.737542662116041, + "grad_norm": 2.9615418910980225, + "learning_rate": 0.0004208191126279863, + "loss": 6.1185, + "step": 5091 + }, + { + "epoch": 1.7378839590443687, + "grad_norm": 3.1406466960906982, + "learning_rate": 0.0004207053469852105, + "loss": 6.3329, + "step": 5092 + }, + { + "epoch": 1.7382252559726963, + "grad_norm": 2.9437146186828613, + "learning_rate": 0.0004205915813424346, + "loss": 6.3977, + "step": 5093 + }, + { + "epoch": 1.738566552901024, + "grad_norm": 2.894519090652466, + "learning_rate": 0.0004204778156996587, + "loss": 6.7851, + "step": 5094 + }, + { + "epoch": 1.7389078498293515, + "grad_norm": 3.03589129447937, + "learning_rate": 0.00042036405005688285, + "loss": 5.6625, + "step": 5095 + }, + { + "epoch": 1.7392491467576792, + "grad_norm": 3.0257980823516846, + "learning_rate": 0.00042025028441410695, + "loss": 5.6283, + "step": 5096 + }, + { + "epoch": 1.7395904436860068, + "grad_norm": 2.9698126316070557, + "learning_rate": 0.00042013651877133106, + "loss": 5.9327, + "step": 5097 + }, + { + "epoch": 1.7399317406143344, + "grad_norm": 3.094011068344116, + "learning_rate": 0.0004200227531285552, + "loss": 6.1025, + "step": 5098 + }, + { + "epoch": 1.740273037542662, + "grad_norm": 2.933558702468872, + "learning_rate": 0.0004199089874857793, + "loss": 6.3226, + "step": 5099 + }, + { + "epoch": 1.74061433447099, + "grad_norm": 3.014883518218994, + "learning_rate": 0.0004197952218430034, + "loss": 6.1689, + "step": 5100 + }, + { + "epoch": 1.7409556313993173, + "grad_norm": 2.9969851970672607, + "learning_rate": 0.00041968145620022753, + "loss": 6.4516, + "step": 5101 + }, + { + "epoch": 1.7412969283276452, + "grad_norm": 2.958338737487793, + "learning_rate": 0.00041956769055745164, + "loss": 6.3338, + "step": 5102 + }, + { + "epoch": 1.7416382252559726, + "grad_norm": 2.871307373046875, + "learning_rate": 0.00041945392491467574, + "loss": 6.3378, + "step": 5103 + }, + { + "epoch": 1.7419795221843004, + "grad_norm": 3.485704183578491, + "learning_rate": 0.0004193401592718999, + "loss": 5.556, + "step": 5104 + }, + { + "epoch": 1.742320819112628, + "grad_norm": 3.0032105445861816, + "learning_rate": 0.000419226393629124, + "loss": 5.7468, + "step": 5105 + }, + { + "epoch": 1.7426621160409557, + "grad_norm": 3.2163028717041016, + "learning_rate": 0.0004191126279863481, + "loss": 6.1866, + "step": 5106 + }, + { + "epoch": 1.7430034129692833, + "grad_norm": 3.722259044647217, + "learning_rate": 0.00041899886234357227, + "loss": 5.2801, + "step": 5107 + }, + { + "epoch": 1.743344709897611, + "grad_norm": 3.0470032691955566, + "learning_rate": 0.0004188850967007964, + "loss": 6.1884, + "step": 5108 + }, + { + "epoch": 1.7436860068259386, + "grad_norm": 3.0684938430786133, + "learning_rate": 0.00041877133105802054, + "loss": 6.9454, + "step": 5109 + }, + { + "epoch": 1.7440273037542662, + "grad_norm": 2.902766227722168, + "learning_rate": 0.0004186575654152446, + "loss": 5.6917, + "step": 5110 + }, + { + "epoch": 1.7443686006825938, + "grad_norm": 2.978318929672241, + "learning_rate": 0.0004185437997724687, + "loss": 5.6608, + "step": 5111 + }, + { + "epoch": 1.7447098976109214, + "grad_norm": 2.9816012382507324, + "learning_rate": 0.00041843003412969285, + "loss": 6.0245, + "step": 5112 + }, + { + "epoch": 1.7450511945392493, + "grad_norm": 3.0680909156799316, + "learning_rate": 0.00041831626848691695, + "loss": 5.9496, + "step": 5113 + }, + { + "epoch": 1.7453924914675767, + "grad_norm": 2.8896589279174805, + "learning_rate": 0.00041820250284414106, + "loss": 6.1912, + "step": 5114 + }, + { + "epoch": 1.7457337883959045, + "grad_norm": 2.889788866043091, + "learning_rate": 0.0004180887372013652, + "loss": 6.7443, + "step": 5115 + }, + { + "epoch": 1.746075085324232, + "grad_norm": 3.0151519775390625, + "learning_rate": 0.0004179749715585893, + "loss": 6.5864, + "step": 5116 + }, + { + "epoch": 1.7464163822525598, + "grad_norm": 2.864457368850708, + "learning_rate": 0.00041786120591581343, + "loss": 6.5104, + "step": 5117 + }, + { + "epoch": 1.7467576791808874, + "grad_norm": 3.0471596717834473, + "learning_rate": 0.0004177474402730376, + "loss": 6.4869, + "step": 5118 + }, + { + "epoch": 1.747098976109215, + "grad_norm": 3.1271932125091553, + "learning_rate": 0.0004176336746302617, + "loss": 6.1027, + "step": 5119 + }, + { + "epoch": 1.7474402730375427, + "grad_norm": 3.2092442512512207, + "learning_rate": 0.00041751990898748574, + "loss": 6.081, + "step": 5120 + }, + { + "epoch": 1.7477815699658703, + "grad_norm": 3.0421433448791504, + "learning_rate": 0.0004174061433447099, + "loss": 6.3332, + "step": 5121 + }, + { + "epoch": 1.748122866894198, + "grad_norm": 2.9517619609832764, + "learning_rate": 0.000417292377701934, + "loss": 6.3718, + "step": 5122 + }, + { + "epoch": 1.7484641638225256, + "grad_norm": 2.967085838317871, + "learning_rate": 0.0004171786120591581, + "loss": 6.2602, + "step": 5123 + }, + { + "epoch": 1.7488054607508532, + "grad_norm": 5.240909576416016, + "learning_rate": 0.00041706484641638227, + "loss": 5.1849, + "step": 5124 + }, + { + "epoch": 1.7491467576791808, + "grad_norm": 3.141845464706421, + "learning_rate": 0.0004169510807736064, + "loss": 5.9711, + "step": 5125 + }, + { + "epoch": 1.7494880546075087, + "grad_norm": 3.0010454654693604, + "learning_rate": 0.0004168373151308305, + "loss": 6.1771, + "step": 5126 + }, + { + "epoch": 1.749829351535836, + "grad_norm": 2.5118589401245117, + "learning_rate": 0.00041672354948805464, + "loss": 3.2816, + "step": 5127 + }, + { + "epoch": 1.750170648464164, + "grad_norm": 3.0626370906829834, + "learning_rate": 0.00041660978384527875, + "loss": 6.3235, + "step": 5128 + }, + { + "epoch": 1.7505119453924913, + "grad_norm": 3.541957378387451, + "learning_rate": 0.0004164960182025029, + "loss": 5.6468, + "step": 5129 + }, + { + "epoch": 1.7508532423208192, + "grad_norm": 3.089536428451538, + "learning_rate": 0.00041638225255972696, + "loss": 6.5741, + "step": 5130 + }, + { + "epoch": 1.7511945392491468, + "grad_norm": 3.0071637630462646, + "learning_rate": 0.00041626848691695106, + "loss": 6.1721, + "step": 5131 + }, + { + "epoch": 1.7515358361774744, + "grad_norm": 3.3689115047454834, + "learning_rate": 0.0004161547212741752, + "loss": 6.32, + "step": 5132 + }, + { + "epoch": 1.751877133105802, + "grad_norm": 3.6956567764282227, + "learning_rate": 0.0004160409556313993, + "loss": 5.7046, + "step": 5133 + }, + { + "epoch": 1.7522184300341297, + "grad_norm": 4.705052375793457, + "learning_rate": 0.00041592718998862343, + "loss": 5.2505, + "step": 5134 + }, + { + "epoch": 1.7525597269624573, + "grad_norm": 2.986630916595459, + "learning_rate": 0.0004158134243458476, + "loss": 6.1452, + "step": 5135 + }, + { + "epoch": 1.752901023890785, + "grad_norm": 3.066112995147705, + "learning_rate": 0.0004156996587030717, + "loss": 6.1217, + "step": 5136 + }, + { + "epoch": 1.7532423208191126, + "grad_norm": 3.2455801963806152, + "learning_rate": 0.0004155858930602958, + "loss": 6.1903, + "step": 5137 + }, + { + "epoch": 1.7535836177474402, + "grad_norm": 4.036913871765137, + "learning_rate": 0.00041547212741751996, + "loss": 5.0287, + "step": 5138 + }, + { + "epoch": 1.753924914675768, + "grad_norm": 2.882148265838623, + "learning_rate": 0.000415358361774744, + "loss": 6.9066, + "step": 5139 + }, + { + "epoch": 1.7542662116040955, + "grad_norm": 3.251460552215576, + "learning_rate": 0.0004152445961319681, + "loss": 5.8373, + "step": 5140 + }, + { + "epoch": 1.7546075085324233, + "grad_norm": 3.242882013320923, + "learning_rate": 0.00041513083048919227, + "loss": 5.5887, + "step": 5141 + }, + { + "epoch": 1.7549488054607507, + "grad_norm": 2.918837070465088, + "learning_rate": 0.0004150170648464164, + "loss": 6.3827, + "step": 5142 + }, + { + "epoch": 1.7552901023890786, + "grad_norm": 5.232894420623779, + "learning_rate": 0.0004149032992036405, + "loss": 4.7531, + "step": 5143 + }, + { + "epoch": 1.7556313993174062, + "grad_norm": 2.9766147136688232, + "learning_rate": 0.00041478953356086464, + "loss": 6.5025, + "step": 5144 + }, + { + "epoch": 1.7559726962457338, + "grad_norm": 3.3592119216918945, + "learning_rate": 0.00041467576791808875, + "loss": 5.1757, + "step": 5145 + }, + { + "epoch": 1.7563139931740614, + "grad_norm": 3.114332437515259, + "learning_rate": 0.00041456200227531285, + "loss": 5.9303, + "step": 5146 + }, + { + "epoch": 1.756655290102389, + "grad_norm": 3.0244557857513428, + "learning_rate": 0.000414448236632537, + "loss": 5.9333, + "step": 5147 + }, + { + "epoch": 1.7569965870307167, + "grad_norm": 3.0684051513671875, + "learning_rate": 0.0004143344709897611, + "loss": 5.7864, + "step": 5148 + }, + { + "epoch": 1.7573378839590443, + "grad_norm": 2.931272029876709, + "learning_rate": 0.0004142207053469852, + "loss": 6.2102, + "step": 5149 + }, + { + "epoch": 1.757679180887372, + "grad_norm": 3.0313282012939453, + "learning_rate": 0.0004141069397042093, + "loss": 6.2017, + "step": 5150 + }, + { + "epoch": 1.7580204778156996, + "grad_norm": 3.021868944168091, + "learning_rate": 0.00041399317406143343, + "loss": 6.8173, + "step": 5151 + }, + { + "epoch": 1.7583617747440274, + "grad_norm": 2.8583309650421143, + "learning_rate": 0.0004138794084186576, + "loss": 6.4639, + "step": 5152 + }, + { + "epoch": 1.7587030716723548, + "grad_norm": 5.182238578796387, + "learning_rate": 0.0004137656427758817, + "loss": 5.7794, + "step": 5153 + }, + { + "epoch": 1.7590443686006827, + "grad_norm": 2.988149881362915, + "learning_rate": 0.0004136518771331058, + "loss": 6.5781, + "step": 5154 + }, + { + "epoch": 1.75938566552901, + "grad_norm": 3.7683768272399902, + "learning_rate": 0.00041353811149032996, + "loss": 6.1431, + "step": 5155 + }, + { + "epoch": 1.759726962457338, + "grad_norm": 3.9146881103515625, + "learning_rate": 0.00041342434584755406, + "loss": 5.6584, + "step": 5156 + }, + { + "epoch": 1.7600682593856656, + "grad_norm": 2.989680290222168, + "learning_rate": 0.00041331058020477817, + "loss": 6.8671, + "step": 5157 + }, + { + "epoch": 1.7604095563139932, + "grad_norm": 3.8746161460876465, + "learning_rate": 0.00041319681456200233, + "loss": 5.0076, + "step": 5158 + }, + { + "epoch": 1.7607508532423208, + "grad_norm": 3.014831066131592, + "learning_rate": 0.0004130830489192264, + "loss": 6.3627, + "step": 5159 + }, + { + "epoch": 1.7610921501706485, + "grad_norm": 2.946559429168701, + "learning_rate": 0.0004129692832764505, + "loss": 5.7226, + "step": 5160 + }, + { + "epoch": 1.761433447098976, + "grad_norm": 2.9957525730133057, + "learning_rate": 0.00041285551763367464, + "loss": 6.5074, + "step": 5161 + }, + { + "epoch": 1.7617747440273037, + "grad_norm": 2.896390438079834, + "learning_rate": 0.00041274175199089875, + "loss": 6.9841, + "step": 5162 + }, + { + "epoch": 1.7621160409556313, + "grad_norm": 3.291266918182373, + "learning_rate": 0.00041262798634812285, + "loss": 6.0102, + "step": 5163 + }, + { + "epoch": 1.762457337883959, + "grad_norm": 3.1446962356567383, + "learning_rate": 0.000412514220705347, + "loss": 5.8353, + "step": 5164 + }, + { + "epoch": 1.7627986348122868, + "grad_norm": 2.8148372173309326, + "learning_rate": 0.0004124004550625711, + "loss": 6.1815, + "step": 5165 + }, + { + "epoch": 1.7631399317406142, + "grad_norm": 3.0298731327056885, + "learning_rate": 0.0004122866894197952, + "loss": 6.2174, + "step": 5166 + }, + { + "epoch": 1.763481228668942, + "grad_norm": 3.0104305744171143, + "learning_rate": 0.0004121729237770194, + "loss": 6.3785, + "step": 5167 + }, + { + "epoch": 1.7638225255972695, + "grad_norm": 2.9257543087005615, + "learning_rate": 0.0004120591581342435, + "loss": 6.0503, + "step": 5168 + }, + { + "epoch": 1.7641638225255973, + "grad_norm": 3.0766642093658447, + "learning_rate": 0.00041194539249146754, + "loss": 5.7167, + "step": 5169 + }, + { + "epoch": 1.764505119453925, + "grad_norm": 2.8836543560028076, + "learning_rate": 0.0004118316268486917, + "loss": 6.0921, + "step": 5170 + }, + { + "epoch": 1.7648464163822526, + "grad_norm": 3.0110116004943848, + "learning_rate": 0.0004117178612059158, + "loss": 6.3009, + "step": 5171 + }, + { + "epoch": 1.7651877133105802, + "grad_norm": 2.9918863773345947, + "learning_rate": 0.00041160409556313996, + "loss": 6.0492, + "step": 5172 + }, + { + "epoch": 1.7655290102389078, + "grad_norm": 4.458995342254639, + "learning_rate": 0.00041149032992036406, + "loss": 5.4651, + "step": 5173 + }, + { + "epoch": 1.7658703071672355, + "grad_norm": 3.1331934928894043, + "learning_rate": 0.00041137656427758817, + "loss": 5.8811, + "step": 5174 + }, + { + "epoch": 1.766211604095563, + "grad_norm": 2.99288010597229, + "learning_rate": 0.00041126279863481233, + "loss": 6.1899, + "step": 5175 + }, + { + "epoch": 1.7665529010238907, + "grad_norm": 6.400101661682129, + "learning_rate": 0.00041114903299203643, + "loss": 4.2093, + "step": 5176 + }, + { + "epoch": 1.7668941979522184, + "grad_norm": 3.309239387512207, + "learning_rate": 0.00041103526734926054, + "loss": 5.9248, + "step": 5177 + }, + { + "epoch": 1.7672354948805462, + "grad_norm": 2.9934983253479004, + "learning_rate": 0.00041092150170648464, + "loss": 6.4463, + "step": 5178 + }, + { + "epoch": 1.7675767918088736, + "grad_norm": 3.0152664184570312, + "learning_rate": 0.00041080773606370875, + "loss": 6.4454, + "step": 5179 + }, + { + "epoch": 1.7679180887372015, + "grad_norm": 3.141155958175659, + "learning_rate": 0.00041069397042093285, + "loss": 5.738, + "step": 5180 + }, + { + "epoch": 1.7682593856655289, + "grad_norm": 3.011068344116211, + "learning_rate": 0.000410580204778157, + "loss": 6.8537, + "step": 5181 + }, + { + "epoch": 1.7686006825938567, + "grad_norm": 7.368420600891113, + "learning_rate": 0.0004104664391353811, + "loss": 5.6683, + "step": 5182 + }, + { + "epoch": 1.7689419795221843, + "grad_norm": 3.043335199356079, + "learning_rate": 0.0004103526734926052, + "loss": 6.6035, + "step": 5183 + }, + { + "epoch": 1.769283276450512, + "grad_norm": 3.1109790802001953, + "learning_rate": 0.0004102389078498294, + "loss": 6.0835, + "step": 5184 + }, + { + "epoch": 1.7696245733788396, + "grad_norm": 3.0300333499908447, + "learning_rate": 0.0004101251422070535, + "loss": 6.4015, + "step": 5185 + }, + { + "epoch": 1.7699658703071672, + "grad_norm": 2.9411182403564453, + "learning_rate": 0.0004100113765642776, + "loss": 6.0647, + "step": 5186 + }, + { + "epoch": 1.7703071672354949, + "grad_norm": 3.1921133995056152, + "learning_rate": 0.00040989761092150175, + "loss": 5.8967, + "step": 5187 + }, + { + "epoch": 1.7706484641638225, + "grad_norm": 2.9762604236602783, + "learning_rate": 0.0004097838452787258, + "loss": 6.4729, + "step": 5188 + }, + { + "epoch": 1.7709897610921501, + "grad_norm": 2.857520341873169, + "learning_rate": 0.0004096700796359499, + "loss": 6.0116, + "step": 5189 + }, + { + "epoch": 1.7713310580204777, + "grad_norm": 2.9309916496276855, + "learning_rate": 0.00040955631399317407, + "loss": 6.5562, + "step": 5190 + }, + { + "epoch": 1.7716723549488056, + "grad_norm": 3.0163867473602295, + "learning_rate": 0.00040944254835039817, + "loss": 6.6054, + "step": 5191 + }, + { + "epoch": 1.772013651877133, + "grad_norm": 2.942237377166748, + "learning_rate": 0.00040932878270762233, + "loss": 6.6192, + "step": 5192 + }, + { + "epoch": 1.7723549488054609, + "grad_norm": 2.938898801803589, + "learning_rate": 0.00040921501706484643, + "loss": 6.3657, + "step": 5193 + }, + { + "epoch": 1.7726962457337883, + "grad_norm": 3.5897812843322754, + "learning_rate": 0.00040910125142207054, + "loss": 5.9367, + "step": 5194 + }, + { + "epoch": 1.773037542662116, + "grad_norm": 2.926736831665039, + "learning_rate": 0.0004089874857792947, + "loss": 6.6937, + "step": 5195 + }, + { + "epoch": 1.7733788395904437, + "grad_norm": 3.1813864707946777, + "learning_rate": 0.0004088737201365188, + "loss": 5.4131, + "step": 5196 + }, + { + "epoch": 1.7737201365187714, + "grad_norm": 3.0285255908966064, + "learning_rate": 0.0004087599544937429, + "loss": 6.0017, + "step": 5197 + }, + { + "epoch": 1.774061433447099, + "grad_norm": 3.036341428756714, + "learning_rate": 0.000408646188850967, + "loss": 6.6581, + "step": 5198 + }, + { + "epoch": 1.7744027303754266, + "grad_norm": 3.0285723209381104, + "learning_rate": 0.0004085324232081911, + "loss": 6.2925, + "step": 5199 + }, + { + "epoch": 1.7747440273037542, + "grad_norm": 2.9925947189331055, + "learning_rate": 0.0004084186575654152, + "loss": 6.1965, + "step": 5200 + }, + { + "epoch": 1.7750853242320819, + "grad_norm": 3.050577163696289, + "learning_rate": 0.0004083048919226394, + "loss": 6.3542, + "step": 5201 + }, + { + "epoch": 1.7754266211604095, + "grad_norm": 2.938793897628784, + "learning_rate": 0.0004081911262798635, + "loss": 6.4952, + "step": 5202 + }, + { + "epoch": 1.7757679180887371, + "grad_norm": 3.007162570953369, + "learning_rate": 0.0004080773606370876, + "loss": 5.9792, + "step": 5203 + }, + { + "epoch": 1.776109215017065, + "grad_norm": 2.8875231742858887, + "learning_rate": 0.00040796359499431175, + "loss": 6.2729, + "step": 5204 + }, + { + "epoch": 1.7764505119453924, + "grad_norm": 4.119450092315674, + "learning_rate": 0.00040784982935153586, + "loss": 4.7705, + "step": 5205 + }, + { + "epoch": 1.7767918088737202, + "grad_norm": 3.139040231704712, + "learning_rate": 0.00040773606370875996, + "loss": 5.8157, + "step": 5206 + }, + { + "epoch": 1.7771331058020476, + "grad_norm": 2.9930758476257324, + "learning_rate": 0.00040762229806598407, + "loss": 6.2695, + "step": 5207 + }, + { + "epoch": 1.7774744027303755, + "grad_norm": 2.9738657474517822, + "learning_rate": 0.00040750853242320817, + "loss": 6.24, + "step": 5208 + }, + { + "epoch": 1.7778156996587031, + "grad_norm": 4.560755252838135, + "learning_rate": 0.0004073947667804323, + "loss": 4.1842, + "step": 5209 + }, + { + "epoch": 1.7781569965870307, + "grad_norm": 3.2815661430358887, + "learning_rate": 0.00040728100113765644, + "loss": 5.3795, + "step": 5210 + }, + { + "epoch": 1.7784982935153584, + "grad_norm": 3.145738124847412, + "learning_rate": 0.00040716723549488054, + "loss": 5.5073, + "step": 5211 + }, + { + "epoch": 1.778839590443686, + "grad_norm": 3.058608293533325, + "learning_rate": 0.0004070534698521047, + "loss": 5.9812, + "step": 5212 + }, + { + "epoch": 1.7791808873720136, + "grad_norm": 2.994455337524414, + "learning_rate": 0.0004069397042093288, + "loss": 6.39, + "step": 5213 + }, + { + "epoch": 1.7795221843003413, + "grad_norm": 2.96401309967041, + "learning_rate": 0.0004068259385665529, + "loss": 6.4591, + "step": 5214 + }, + { + "epoch": 1.7798634812286689, + "grad_norm": 3.074064016342163, + "learning_rate": 0.00040671217292377707, + "loss": 5.9995, + "step": 5215 + }, + { + "epoch": 1.7802047781569965, + "grad_norm": 2.9683525562286377, + "learning_rate": 0.0004065984072810012, + "loss": 6.4996, + "step": 5216 + }, + { + "epoch": 1.7805460750853244, + "grad_norm": 2.9950125217437744, + "learning_rate": 0.0004064846416382252, + "loss": 6.6542, + "step": 5217 + }, + { + "epoch": 1.7808873720136518, + "grad_norm": 2.9647035598754883, + "learning_rate": 0.0004063708759954494, + "loss": 5.9651, + "step": 5218 + }, + { + "epoch": 1.7812286689419796, + "grad_norm": 9.434081077575684, + "learning_rate": 0.0004062571103526735, + "loss": 5.0819, + "step": 5219 + }, + { + "epoch": 1.781569965870307, + "grad_norm": 3.1312179565429688, + "learning_rate": 0.0004061433447098976, + "loss": 6.6445, + "step": 5220 + }, + { + "epoch": 1.7819112627986349, + "grad_norm": 3.0493083000183105, + "learning_rate": 0.00040602957906712175, + "loss": 6.2367, + "step": 5221 + }, + { + "epoch": 1.7822525597269625, + "grad_norm": 3.070904493331909, + "learning_rate": 0.00040591581342434586, + "loss": 6.6448, + "step": 5222 + }, + { + "epoch": 1.7825938566552901, + "grad_norm": 3.064748525619507, + "learning_rate": 0.00040580204778156996, + "loss": 6.255, + "step": 5223 + }, + { + "epoch": 1.7829351535836178, + "grad_norm": 2.90639328956604, + "learning_rate": 0.0004056882821387941, + "loss": 6.2975, + "step": 5224 + }, + { + "epoch": 1.7832764505119454, + "grad_norm": 2.9274797439575195, + "learning_rate": 0.0004055745164960182, + "loss": 6.3858, + "step": 5225 + }, + { + "epoch": 1.783617747440273, + "grad_norm": 3.3938796520233154, + "learning_rate": 0.00040546075085324233, + "loss": 5.6207, + "step": 5226 + }, + { + "epoch": 1.7839590443686006, + "grad_norm": 2.930269956588745, + "learning_rate": 0.00040534698521046644, + "loss": 6.4902, + "step": 5227 + }, + { + "epoch": 1.7843003412969285, + "grad_norm": 2.949573040008545, + "learning_rate": 0.00040523321956769054, + "loss": 6.3274, + "step": 5228 + }, + { + "epoch": 1.784641638225256, + "grad_norm": 3.765118360519409, + "learning_rate": 0.00040511945392491465, + "loss": 5.895, + "step": 5229 + }, + { + "epoch": 1.7849829351535837, + "grad_norm": 3.8920938968658447, + "learning_rate": 0.0004050056882821388, + "loss": 5.751, + "step": 5230 + }, + { + "epoch": 1.7853242320819112, + "grad_norm": 3.290802240371704, + "learning_rate": 0.0004048919226393629, + "loss": 5.801, + "step": 5231 + }, + { + "epoch": 1.785665529010239, + "grad_norm": 3.1709654331207275, + "learning_rate": 0.00040477815699658707, + "loss": 5.6403, + "step": 5232 + }, + { + "epoch": 1.7860068259385664, + "grad_norm": 3.0955452919006348, + "learning_rate": 0.0004046643913538112, + "loss": 6.5735, + "step": 5233 + }, + { + "epoch": 1.7863481228668943, + "grad_norm": 2.9965522289276123, + "learning_rate": 0.0004045506257110353, + "loss": 6.1418, + "step": 5234 + }, + { + "epoch": 1.786689419795222, + "grad_norm": 2.980431079864502, + "learning_rate": 0.00040443686006825944, + "loss": 5.5827, + "step": 5235 + }, + { + "epoch": 1.7870307167235495, + "grad_norm": 2.8499867916107178, + "learning_rate": 0.00040432309442548354, + "loss": 5.8907, + "step": 5236 + }, + { + "epoch": 1.7873720136518771, + "grad_norm": 2.8207998275756836, + "learning_rate": 0.0004042093287827076, + "loss": 6.0233, + "step": 5237 + }, + { + "epoch": 1.7877133105802048, + "grad_norm": 3.0672411918640137, + "learning_rate": 0.00040409556313993175, + "loss": 5.8684, + "step": 5238 + }, + { + "epoch": 1.7880546075085324, + "grad_norm": 3.0053303241729736, + "learning_rate": 0.00040398179749715586, + "loss": 6.1898, + "step": 5239 + }, + { + "epoch": 1.78839590443686, + "grad_norm": 8.463170051574707, + "learning_rate": 0.00040386803185437996, + "loss": 5.5515, + "step": 5240 + }, + { + "epoch": 1.7887372013651879, + "grad_norm": 9.868966102600098, + "learning_rate": 0.0004037542662116041, + "loss": 5.336, + "step": 5241 + }, + { + "epoch": 1.7890784982935153, + "grad_norm": 3.046468496322632, + "learning_rate": 0.00040364050056882823, + "loss": 6.4945, + "step": 5242 + }, + { + "epoch": 1.7894197952218431, + "grad_norm": 3.114753007888794, + "learning_rate": 0.00040352673492605233, + "loss": 6.645, + "step": 5243 + }, + { + "epoch": 1.7897610921501705, + "grad_norm": 4.43574857711792, + "learning_rate": 0.0004034129692832765, + "loss": 6.1538, + "step": 5244 + }, + { + "epoch": 1.7901023890784984, + "grad_norm": 3.092228889465332, + "learning_rate": 0.0004032992036405006, + "loss": 6.236, + "step": 5245 + }, + { + "epoch": 1.7904436860068258, + "grad_norm": 3.4146082401275635, + "learning_rate": 0.00040318543799772465, + "loss": 6.3327, + "step": 5246 + }, + { + "epoch": 1.7907849829351536, + "grad_norm": 3.112100124359131, + "learning_rate": 0.0004030716723549488, + "loss": 6.1249, + "step": 5247 + }, + { + "epoch": 1.7911262798634813, + "grad_norm": 3.0014235973358154, + "learning_rate": 0.0004029579067121729, + "loss": 6.4472, + "step": 5248 + }, + { + "epoch": 1.791467576791809, + "grad_norm": 3.3161520957946777, + "learning_rate": 0.000402844141069397, + "loss": 5.6153, + "step": 5249 + }, + { + "epoch": 1.7918088737201365, + "grad_norm": 2.991649866104126, + "learning_rate": 0.0004027303754266212, + "loss": 6.4251, + "step": 5250 + }, + { + "epoch": 1.7921501706484642, + "grad_norm": 2.9196534156799316, + "learning_rate": 0.0004026166097838453, + "loss": 6.4241, + "step": 5251 + }, + { + "epoch": 1.7924914675767918, + "grad_norm": 4.535854339599609, + "learning_rate": 0.00040250284414106944, + "loss": 4.7127, + "step": 5252 + }, + { + "epoch": 1.7928327645051194, + "grad_norm": 2.958326578140259, + "learning_rate": 0.00040238907849829354, + "loss": 6.3131, + "step": 5253 + }, + { + "epoch": 1.7931740614334473, + "grad_norm": 3.1165990829467773, + "learning_rate": 0.00040227531285551765, + "loss": 5.3994, + "step": 5254 + }, + { + "epoch": 1.7935153583617747, + "grad_norm": 3.234656572341919, + "learning_rate": 0.0004021615472127418, + "loss": 6.8472, + "step": 5255 + }, + { + "epoch": 1.7938566552901025, + "grad_norm": 3.0498690605163574, + "learning_rate": 0.00040204778156996586, + "loss": 6.2153, + "step": 5256 + }, + { + "epoch": 1.79419795221843, + "grad_norm": 2.997565507888794, + "learning_rate": 0.00040193401592718996, + "loss": 6.3802, + "step": 5257 + }, + { + "epoch": 1.7945392491467578, + "grad_norm": 2.9837565422058105, + "learning_rate": 0.0004018202502844141, + "loss": 5.8155, + "step": 5258 + }, + { + "epoch": 1.7948805460750852, + "grad_norm": 2.9351367950439453, + "learning_rate": 0.00040170648464163823, + "loss": 6.6041, + "step": 5259 + }, + { + "epoch": 1.795221843003413, + "grad_norm": 2.891256332397461, + "learning_rate": 0.00040159271899886233, + "loss": 5.978, + "step": 5260 + }, + { + "epoch": 1.7955631399317407, + "grad_norm": 2.9434101581573486, + "learning_rate": 0.0004014789533560865, + "loss": 6.2294, + "step": 5261 + }, + { + "epoch": 1.7959044368600683, + "grad_norm": 3.0695149898529053, + "learning_rate": 0.0004013651877133106, + "loss": 6.538, + "step": 5262 + }, + { + "epoch": 1.796245733788396, + "grad_norm": 3.029841661453247, + "learning_rate": 0.0004012514220705347, + "loss": 5.6806, + "step": 5263 + }, + { + "epoch": 1.7965870307167235, + "grad_norm": 2.961660623550415, + "learning_rate": 0.00040113765642775886, + "loss": 6.4616, + "step": 5264 + }, + { + "epoch": 1.7969283276450512, + "grad_norm": 4.678473472595215, + "learning_rate": 0.00040102389078498297, + "loss": 5.3174, + "step": 5265 + }, + { + "epoch": 1.7972696245733788, + "grad_norm": 3.08498215675354, + "learning_rate": 0.000400910125142207, + "loss": 5.7922, + "step": 5266 + }, + { + "epoch": 1.7976109215017066, + "grad_norm": 2.980827808380127, + "learning_rate": 0.0004007963594994312, + "loss": 6.1678, + "step": 5267 + }, + { + "epoch": 1.797952218430034, + "grad_norm": 2.927297592163086, + "learning_rate": 0.0004006825938566553, + "loss": 6.192, + "step": 5268 + }, + { + "epoch": 1.798293515358362, + "grad_norm": 3.01383638381958, + "learning_rate": 0.0004005688282138794, + "loss": 6.2782, + "step": 5269 + }, + { + "epoch": 1.7986348122866893, + "grad_norm": 3.114114999771118, + "learning_rate": 0.00040045506257110355, + "loss": 5.8744, + "step": 5270 + }, + { + "epoch": 1.7989761092150172, + "grad_norm": 3.1586034297943115, + "learning_rate": 0.00040034129692832765, + "loss": 6.3056, + "step": 5271 + }, + { + "epoch": 1.7993174061433446, + "grad_norm": 2.895045518875122, + "learning_rate": 0.0004002275312855518, + "loss": 5.8402, + "step": 5272 + }, + { + "epoch": 1.7996587030716724, + "grad_norm": 3.156750202178955, + "learning_rate": 0.0004001137656427759, + "loss": 5.9734, + "step": 5273 + }, + { + "epoch": 1.8, + "grad_norm": 3.116347312927246, + "learning_rate": 0.0004, + "loss": 5.6309, + "step": 5274 + }, + { + "epoch": 1.8003412969283277, + "grad_norm": 3.3794455528259277, + "learning_rate": 0.0003998862343572241, + "loss": 6.1649, + "step": 5275 + }, + { + "epoch": 1.8006825938566553, + "grad_norm": 3.00872540473938, + "learning_rate": 0.00039977246871444823, + "loss": 6.449, + "step": 5276 + }, + { + "epoch": 1.801023890784983, + "grad_norm": 3.6723685264587402, + "learning_rate": 0.00039965870307167233, + "loss": 6.0114, + "step": 5277 + }, + { + "epoch": 1.8013651877133106, + "grad_norm": 2.9524502754211426, + "learning_rate": 0.0003995449374288965, + "loss": 6.1426, + "step": 5278 + }, + { + "epoch": 1.8017064846416382, + "grad_norm": 2.987380266189575, + "learning_rate": 0.0003994311717861206, + "loss": 5.964, + "step": 5279 + }, + { + "epoch": 1.802047781569966, + "grad_norm": 3.3028364181518555, + "learning_rate": 0.0003993174061433447, + "loss": 5.3033, + "step": 5280 + }, + { + "epoch": 1.8023890784982934, + "grad_norm": 2.9730100631713867, + "learning_rate": 0.00039920364050056886, + "loss": 5.9817, + "step": 5281 + }, + { + "epoch": 1.8027303754266213, + "grad_norm": 3.1205883026123047, + "learning_rate": 0.00039908987485779297, + "loss": 5.8908, + "step": 5282 + }, + { + "epoch": 1.8030716723549487, + "grad_norm": 2.9819066524505615, + "learning_rate": 0.00039897610921501707, + "loss": 5.9874, + "step": 5283 + }, + { + "epoch": 1.8034129692832765, + "grad_norm": 3.0491702556610107, + "learning_rate": 0.00039886234357224123, + "loss": 6.1682, + "step": 5284 + }, + { + "epoch": 1.803754266211604, + "grad_norm": 3.0884945392608643, + "learning_rate": 0.0003987485779294653, + "loss": 6.6682, + "step": 5285 + }, + { + "epoch": 1.8040955631399318, + "grad_norm": 3.4136314392089844, + "learning_rate": 0.0003986348122866894, + "loss": 5.6285, + "step": 5286 + }, + { + "epoch": 1.8044368600682594, + "grad_norm": 3.038418769836426, + "learning_rate": 0.00039852104664391355, + "loss": 6.8089, + "step": 5287 + }, + { + "epoch": 1.804778156996587, + "grad_norm": 2.9346542358398438, + "learning_rate": 0.00039840728100113765, + "loss": 5.9561, + "step": 5288 + }, + { + "epoch": 1.8051194539249147, + "grad_norm": 4.106077671051025, + "learning_rate": 0.00039829351535836176, + "loss": 5.2572, + "step": 5289 + }, + { + "epoch": 1.8054607508532423, + "grad_norm": 3.0919151306152344, + "learning_rate": 0.0003981797497155859, + "loss": 6.5056, + "step": 5290 + }, + { + "epoch": 1.80580204778157, + "grad_norm": 3.0660130977630615, + "learning_rate": 0.00039806598407281, + "loss": 6.0935, + "step": 5291 + }, + { + "epoch": 1.8061433447098976, + "grad_norm": 3.0275156497955322, + "learning_rate": 0.0003979522184300341, + "loss": 4.5541, + "step": 5292 + }, + { + "epoch": 1.8064846416382254, + "grad_norm": 3.370018243789673, + "learning_rate": 0.0003978384527872583, + "loss": 5.0421, + "step": 5293 + }, + { + "epoch": 1.8068259385665528, + "grad_norm": 3.184339761734009, + "learning_rate": 0.0003977246871444824, + "loss": 5.1131, + "step": 5294 + }, + { + "epoch": 1.8071672354948807, + "grad_norm": 3.00337553024292, + "learning_rate": 0.0003976109215017065, + "loss": 6.6479, + "step": 5295 + }, + { + "epoch": 1.807508532423208, + "grad_norm": 2.92510724067688, + "learning_rate": 0.0003974971558589306, + "loss": 6.097, + "step": 5296 + }, + { + "epoch": 1.807849829351536, + "grad_norm": 3.030064105987549, + "learning_rate": 0.0003973833902161547, + "loss": 6.3918, + "step": 5297 + }, + { + "epoch": 1.8081911262798633, + "grad_norm": 3.019908905029297, + "learning_rate": 0.00039726962457337886, + "loss": 5.8597, + "step": 5298 + }, + { + "epoch": 1.8085324232081912, + "grad_norm": 2.938002109527588, + "learning_rate": 0.00039715585893060297, + "loss": 6.1059, + "step": 5299 + }, + { + "epoch": 1.8088737201365188, + "grad_norm": 3.3193299770355225, + "learning_rate": 0.0003970420932878271, + "loss": 5.3933, + "step": 5300 + }, + { + "epoch": 1.8092150170648464, + "grad_norm": 3.112321376800537, + "learning_rate": 0.00039692832764505123, + "loss": 6.1252, + "step": 5301 + }, + { + "epoch": 1.809556313993174, + "grad_norm": 2.9885871410369873, + "learning_rate": 0.00039681456200227534, + "loss": 6.2435, + "step": 5302 + }, + { + "epoch": 1.8098976109215017, + "grad_norm": 3.8620083332061768, + "learning_rate": 0.00039670079635949944, + "loss": 5.2043, + "step": 5303 + }, + { + "epoch": 1.8102389078498293, + "grad_norm": 3.0883119106292725, + "learning_rate": 0.0003965870307167236, + "loss": 6.0623, + "step": 5304 + }, + { + "epoch": 1.810580204778157, + "grad_norm": 2.965674877166748, + "learning_rate": 0.00039647326507394765, + "loss": 6.138, + "step": 5305 + }, + { + "epoch": 1.8109215017064848, + "grad_norm": 2.917496681213379, + "learning_rate": 0.00039635949943117176, + "loss": 6.5031, + "step": 5306 + }, + { + "epoch": 1.8112627986348122, + "grad_norm": 2.951918363571167, + "learning_rate": 0.0003962457337883959, + "loss": 6.0293, + "step": 5307 + }, + { + "epoch": 1.81160409556314, + "grad_norm": 4.126885890960693, + "learning_rate": 0.00039613196814562, + "loss": 5.7647, + "step": 5308 + }, + { + "epoch": 1.8119453924914675, + "grad_norm": 2.963454484939575, + "learning_rate": 0.0003960182025028441, + "loss": 6.4464, + "step": 5309 + }, + { + "epoch": 1.8122866894197953, + "grad_norm": 3.006791591644287, + "learning_rate": 0.0003959044368600683, + "loss": 6.4746, + "step": 5310 + }, + { + "epoch": 1.8126279863481227, + "grad_norm": 2.443580150604248, + "learning_rate": 0.0003957906712172924, + "loss": 3.2005, + "step": 5311 + }, + { + "epoch": 1.8129692832764506, + "grad_norm": 3.0745327472686768, + "learning_rate": 0.0003956769055745165, + "loss": 6.3638, + "step": 5312 + }, + { + "epoch": 1.8133105802047782, + "grad_norm": 3.1432762145996094, + "learning_rate": 0.00039556313993174065, + "loss": 5.9445, + "step": 5313 + }, + { + "epoch": 1.8136518771331058, + "grad_norm": 3.154367685317993, + "learning_rate": 0.0003954493742889647, + "loss": 6.4095, + "step": 5314 + }, + { + "epoch": 1.8139931740614335, + "grad_norm": 3.050010919570923, + "learning_rate": 0.00039533560864618886, + "loss": 6.4283, + "step": 5315 + }, + { + "epoch": 1.814334470989761, + "grad_norm": 2.967775344848633, + "learning_rate": 0.00039522184300341297, + "loss": 6.1403, + "step": 5316 + }, + { + "epoch": 1.8146757679180887, + "grad_norm": 3.014551877975464, + "learning_rate": 0.0003951080773606371, + "loss": 6.4165, + "step": 5317 + }, + { + "epoch": 1.8150170648464163, + "grad_norm": 3.006186008453369, + "learning_rate": 0.00039499431171786123, + "loss": 7.0178, + "step": 5318 + }, + { + "epoch": 1.8153583617747442, + "grad_norm": 2.8938446044921875, + "learning_rate": 0.00039488054607508534, + "loss": 6.7276, + "step": 5319 + }, + { + "epoch": 1.8156996587030716, + "grad_norm": 2.9930148124694824, + "learning_rate": 0.00039476678043230944, + "loss": 6.5492, + "step": 5320 + }, + { + "epoch": 1.8160409556313994, + "grad_norm": 3.3085694313049316, + "learning_rate": 0.0003946530147895336, + "loss": 6.1508, + "step": 5321 + }, + { + "epoch": 1.8163822525597269, + "grad_norm": 3.0276403427124023, + "learning_rate": 0.0003945392491467577, + "loss": 5.6613, + "step": 5322 + }, + { + "epoch": 1.8167235494880547, + "grad_norm": 2.933673620223999, + "learning_rate": 0.0003944254835039818, + "loss": 6.7415, + "step": 5323 + }, + { + "epoch": 1.817064846416382, + "grad_norm": 2.8729095458984375, + "learning_rate": 0.0003943117178612059, + "loss": 5.8731, + "step": 5324 + }, + { + "epoch": 1.81740614334471, + "grad_norm": 2.9451277256011963, + "learning_rate": 0.00039419795221843, + "loss": 5.8406, + "step": 5325 + }, + { + "epoch": 1.8177474402730376, + "grad_norm": 2.9216501712799072, + "learning_rate": 0.0003940841865756541, + "loss": 5.6633, + "step": 5326 + }, + { + "epoch": 1.8180887372013652, + "grad_norm": 3.156517744064331, + "learning_rate": 0.0003939704209328783, + "loss": 6.3426, + "step": 5327 + }, + { + "epoch": 1.8184300341296928, + "grad_norm": 3.5125882625579834, + "learning_rate": 0.0003938566552901024, + "loss": 5.8847, + "step": 5328 + }, + { + "epoch": 1.8187713310580205, + "grad_norm": 3.0242371559143066, + "learning_rate": 0.0003937428896473265, + "loss": 6.1378, + "step": 5329 + }, + { + "epoch": 1.819112627986348, + "grad_norm": 3.081592559814453, + "learning_rate": 0.00039362912400455065, + "loss": 6.0888, + "step": 5330 + }, + { + "epoch": 1.8194539249146757, + "grad_norm": 2.9294071197509766, + "learning_rate": 0.00039351535836177476, + "loss": 6.4891, + "step": 5331 + }, + { + "epoch": 1.8197952218430036, + "grad_norm": 2.9522759914398193, + "learning_rate": 0.00039340159271899886, + "loss": 6.6007, + "step": 5332 + }, + { + "epoch": 1.820136518771331, + "grad_norm": 2.9654457569122314, + "learning_rate": 0.000393287827076223, + "loss": 6.6926, + "step": 5333 + }, + { + "epoch": 1.8204778156996588, + "grad_norm": 3.4901723861694336, + "learning_rate": 0.0003931740614334471, + "loss": 4.3765, + "step": 5334 + }, + { + "epoch": 1.8208191126279862, + "grad_norm": 4.748300552368164, + "learning_rate": 0.00039306029579067123, + "loss": 5.744, + "step": 5335 + }, + { + "epoch": 1.821160409556314, + "grad_norm": 2.887054204940796, + "learning_rate": 0.00039294653014789534, + "loss": 6.1389, + "step": 5336 + }, + { + "epoch": 1.8215017064846415, + "grad_norm": 2.9811062812805176, + "learning_rate": 0.00039283276450511944, + "loss": 6.3022, + "step": 5337 + }, + { + "epoch": 1.8218430034129693, + "grad_norm": 3.0545904636383057, + "learning_rate": 0.0003927189988623436, + "loss": 6.1094, + "step": 5338 + }, + { + "epoch": 1.822184300341297, + "grad_norm": 3.0074126720428467, + "learning_rate": 0.0003926052332195677, + "loss": 6.274, + "step": 5339 + }, + { + "epoch": 1.8225255972696246, + "grad_norm": 2.941384792327881, + "learning_rate": 0.0003924914675767918, + "loss": 6.2696, + "step": 5340 + }, + { + "epoch": 1.8228668941979522, + "grad_norm": 2.8732473850250244, + "learning_rate": 0.00039237770193401597, + "loss": 6.7092, + "step": 5341 + }, + { + "epoch": 1.8232081911262799, + "grad_norm": 2.9521396160125732, + "learning_rate": 0.0003922639362912401, + "loss": 6.506, + "step": 5342 + }, + { + "epoch": 1.8235494880546075, + "grad_norm": 2.961531162261963, + "learning_rate": 0.0003921501706484642, + "loss": 6.3482, + "step": 5343 + }, + { + "epoch": 1.823890784982935, + "grad_norm": 3.0271778106689453, + "learning_rate": 0.0003920364050056883, + "loss": 5.76, + "step": 5344 + }, + { + "epoch": 1.824232081911263, + "grad_norm": 2.9559972286224365, + "learning_rate": 0.0003919226393629124, + "loss": 6.5723, + "step": 5345 + }, + { + "epoch": 1.8245733788395904, + "grad_norm": 2.9276723861694336, + "learning_rate": 0.0003918088737201365, + "loss": 6.3815, + "step": 5346 + }, + { + "epoch": 1.8249146757679182, + "grad_norm": 2.9702205657958984, + "learning_rate": 0.00039169510807736066, + "loss": 6.6194, + "step": 5347 + }, + { + "epoch": 1.8252559726962456, + "grad_norm": 3.0158731937408447, + "learning_rate": 0.00039158134243458476, + "loss": 6.287, + "step": 5348 + }, + { + "epoch": 1.8255972696245735, + "grad_norm": 3.1500322818756104, + "learning_rate": 0.00039146757679180887, + "loss": 6.433, + "step": 5349 + }, + { + "epoch": 1.8259385665529009, + "grad_norm": 3.0445759296417236, + "learning_rate": 0.000391353811149033, + "loss": 6.2129, + "step": 5350 + }, + { + "epoch": 1.8262798634812287, + "grad_norm": 3.0558390617370605, + "learning_rate": 0.00039124004550625713, + "loss": 6.182, + "step": 5351 + }, + { + "epoch": 1.8266211604095564, + "grad_norm": 3.010730504989624, + "learning_rate": 0.00039112627986348123, + "loss": 5.7881, + "step": 5352 + }, + { + "epoch": 1.826962457337884, + "grad_norm": 2.9294490814208984, + "learning_rate": 0.00039101251422070534, + "loss": 6.0147, + "step": 5353 + }, + { + "epoch": 1.8273037542662116, + "grad_norm": 2.883665084838867, + "learning_rate": 0.00039089874857792944, + "loss": 6.3523, + "step": 5354 + }, + { + "epoch": 1.8276450511945392, + "grad_norm": 3.021610975265503, + "learning_rate": 0.0003907849829351536, + "loss": 6.6719, + "step": 5355 + }, + { + "epoch": 1.8279863481228669, + "grad_norm": 3.135248899459839, + "learning_rate": 0.0003906712172923777, + "loss": 6.1267, + "step": 5356 + }, + { + "epoch": 1.8283276450511945, + "grad_norm": 3.0037925243377686, + "learning_rate": 0.0003905574516496018, + "loss": 5.9688, + "step": 5357 + }, + { + "epoch": 1.8286689419795223, + "grad_norm": 3.0769035816192627, + "learning_rate": 0.00039044368600682597, + "loss": 5.994, + "step": 5358 + }, + { + "epoch": 1.8290102389078498, + "grad_norm": 2.995614767074585, + "learning_rate": 0.0003903299203640501, + "loss": 6.2174, + "step": 5359 + }, + { + "epoch": 1.8293515358361776, + "grad_norm": 3.0436437129974365, + "learning_rate": 0.0003902161547212742, + "loss": 5.6113, + "step": 5360 + }, + { + "epoch": 1.829692832764505, + "grad_norm": 2.8753719329833984, + "learning_rate": 0.00039010238907849834, + "loss": 5.9463, + "step": 5361 + }, + { + "epoch": 1.8300341296928329, + "grad_norm": 2.950981378555298, + "learning_rate": 0.00038998862343572245, + "loss": 5.9652, + "step": 5362 + }, + { + "epoch": 1.8303754266211603, + "grad_norm": 3.0115854740142822, + "learning_rate": 0.0003898748577929465, + "loss": 6.1722, + "step": 5363 + }, + { + "epoch": 1.8307167235494881, + "grad_norm": 3.0039737224578857, + "learning_rate": 0.00038976109215017066, + "loss": 6.8049, + "step": 5364 + }, + { + "epoch": 1.8310580204778157, + "grad_norm": 2.8962740898132324, + "learning_rate": 0.00038964732650739476, + "loss": 6.1094, + "step": 5365 + }, + { + "epoch": 1.8313993174061434, + "grad_norm": 3.1070163249969482, + "learning_rate": 0.00038953356086461887, + "loss": 5.8425, + "step": 5366 + }, + { + "epoch": 1.831740614334471, + "grad_norm": 4.08699893951416, + "learning_rate": 0.000389419795221843, + "loss": 4.5633, + "step": 5367 + }, + { + "epoch": 1.8320819112627986, + "grad_norm": 3.1629393100738525, + "learning_rate": 0.00038930602957906713, + "loss": 6.2469, + "step": 5368 + }, + { + "epoch": 1.8324232081911263, + "grad_norm": 3.210838556289673, + "learning_rate": 0.00038919226393629124, + "loss": 6.6676, + "step": 5369 + }, + { + "epoch": 1.8327645051194539, + "grad_norm": 3.3137311935424805, + "learning_rate": 0.0003890784982935154, + "loss": 5.3108, + "step": 5370 + }, + { + "epoch": 1.8331058020477817, + "grad_norm": 2.9676313400268555, + "learning_rate": 0.0003889647326507395, + "loss": 6.4932, + "step": 5371 + }, + { + "epoch": 1.8334470989761091, + "grad_norm": 2.9483461380004883, + "learning_rate": 0.0003888509670079636, + "loss": 6.2493, + "step": 5372 + }, + { + "epoch": 1.833788395904437, + "grad_norm": 3.206331491470337, + "learning_rate": 0.0003887372013651877, + "loss": 5.3646, + "step": 5373 + }, + { + "epoch": 1.8341296928327644, + "grad_norm": 3.0024166107177734, + "learning_rate": 0.0003886234357224118, + "loss": 6.3351, + "step": 5374 + }, + { + "epoch": 1.8344709897610922, + "grad_norm": 3.38862943649292, + "learning_rate": 0.0003885096700796359, + "loss": 6.2358, + "step": 5375 + }, + { + "epoch": 1.8348122866894196, + "grad_norm": 2.8507230281829834, + "learning_rate": 0.0003883959044368601, + "loss": 5.6975, + "step": 5376 + }, + { + "epoch": 1.8351535836177475, + "grad_norm": 2.947876214981079, + "learning_rate": 0.0003882821387940842, + "loss": 6.2397, + "step": 5377 + }, + { + "epoch": 1.8354948805460751, + "grad_norm": 3.339482307434082, + "learning_rate": 0.00038816837315130834, + "loss": 5.5517, + "step": 5378 + }, + { + "epoch": 1.8358361774744028, + "grad_norm": 2.9442296028137207, + "learning_rate": 0.00038805460750853245, + "loss": 6.5012, + "step": 5379 + }, + { + "epoch": 1.8361774744027304, + "grad_norm": 3.01493239402771, + "learning_rate": 0.00038794084186575655, + "loss": 6.1444, + "step": 5380 + }, + { + "epoch": 1.836518771331058, + "grad_norm": 3.0220446586608887, + "learning_rate": 0.0003878270762229807, + "loss": 6.8928, + "step": 5381 + }, + { + "epoch": 1.8368600682593856, + "grad_norm": 2.9548652172088623, + "learning_rate": 0.00038771331058020476, + "loss": 6.0756, + "step": 5382 + }, + { + "epoch": 1.8372013651877133, + "grad_norm": 3.209747314453125, + "learning_rate": 0.00038759954493742887, + "loss": 5.5828, + "step": 5383 + }, + { + "epoch": 1.8375426621160411, + "grad_norm": 3.493947744369507, + "learning_rate": 0.000387485779294653, + "loss": 6.0699, + "step": 5384 + }, + { + "epoch": 1.8378839590443685, + "grad_norm": 3.8855106830596924, + "learning_rate": 0.00038737201365187713, + "loss": 4.7189, + "step": 5385 + }, + { + "epoch": 1.8382252559726964, + "grad_norm": 3.2830464839935303, + "learning_rate": 0.00038725824800910124, + "loss": 5.7651, + "step": 5386 + }, + { + "epoch": 1.8385665529010238, + "grad_norm": 3.040435314178467, + "learning_rate": 0.0003871444823663254, + "loss": 5.9345, + "step": 5387 + }, + { + "epoch": 1.8389078498293516, + "grad_norm": 3.311544179916382, + "learning_rate": 0.0003870307167235495, + "loss": 5.9783, + "step": 5388 + }, + { + "epoch": 1.839249146757679, + "grad_norm": 2.9888603687286377, + "learning_rate": 0.0003869169510807736, + "loss": 5.666, + "step": 5389 + }, + { + "epoch": 1.8395904436860069, + "grad_norm": 3.0224196910858154, + "learning_rate": 0.00038680318543799776, + "loss": 6.6377, + "step": 5390 + }, + { + "epoch": 1.8399317406143345, + "grad_norm": 3.0179591178894043, + "learning_rate": 0.00038668941979522187, + "loss": 6.0367, + "step": 5391 + }, + { + "epoch": 1.8402730375426621, + "grad_norm": 2.922994375228882, + "learning_rate": 0.0003865756541524459, + "loss": 6.0782, + "step": 5392 + }, + { + "epoch": 1.8406143344709898, + "grad_norm": 2.8612077236175537, + "learning_rate": 0.0003864618885096701, + "loss": 6.0352, + "step": 5393 + }, + { + "epoch": 1.8409556313993174, + "grad_norm": 2.957252025604248, + "learning_rate": 0.0003863481228668942, + "loss": 6.5599, + "step": 5394 + }, + { + "epoch": 1.841296928327645, + "grad_norm": 3.019355535507202, + "learning_rate": 0.0003862343572241183, + "loss": 5.697, + "step": 5395 + }, + { + "epoch": 1.8416382252559726, + "grad_norm": 2.8055245876312256, + "learning_rate": 0.00038612059158134245, + "loss": 6.1868, + "step": 5396 + }, + { + "epoch": 1.8419795221843005, + "grad_norm": 3.0640079975128174, + "learning_rate": 0.00038600682593856655, + "loss": 6.252, + "step": 5397 + }, + { + "epoch": 1.842320819112628, + "grad_norm": 2.855372428894043, + "learning_rate": 0.0003858930602957907, + "loss": 6.6329, + "step": 5398 + }, + { + "epoch": 1.8426621160409558, + "grad_norm": 2.9066994190216064, + "learning_rate": 0.0003857792946530148, + "loss": 6.5626, + "step": 5399 + }, + { + "epoch": 1.8430034129692832, + "grad_norm": 2.871976852416992, + "learning_rate": 0.0003856655290102389, + "loss": 6.0375, + "step": 5400 + }, + { + "epoch": 1.843344709897611, + "grad_norm": 3.041769504547119, + "learning_rate": 0.0003855517633674631, + "loss": 6.551, + "step": 5401 + }, + { + "epoch": 1.8436860068259384, + "grad_norm": 3.0764667987823486, + "learning_rate": 0.00038543799772468713, + "loss": 6.187, + "step": 5402 + }, + { + "epoch": 1.8440273037542663, + "grad_norm": 3.1484851837158203, + "learning_rate": 0.00038532423208191124, + "loss": 6.0588, + "step": 5403 + }, + { + "epoch": 1.844368600682594, + "grad_norm": 3.08003306388855, + "learning_rate": 0.0003852104664391354, + "loss": 6.6638, + "step": 5404 + }, + { + "epoch": 1.8447098976109215, + "grad_norm": 2.9924423694610596, + "learning_rate": 0.0003850967007963595, + "loss": 6.4312, + "step": 5405 + }, + { + "epoch": 1.8450511945392492, + "grad_norm": 3.137577533721924, + "learning_rate": 0.0003849829351535836, + "loss": 6.4421, + "step": 5406 + }, + { + "epoch": 1.8453924914675768, + "grad_norm": 2.974426507949829, + "learning_rate": 0.00038486916951080777, + "loss": 6.4856, + "step": 5407 + }, + { + "epoch": 1.8457337883959044, + "grad_norm": 3.020400047302246, + "learning_rate": 0.00038475540386803187, + "loss": 6.534, + "step": 5408 + }, + { + "epoch": 1.846075085324232, + "grad_norm": 2.938917636871338, + "learning_rate": 0.000384641638225256, + "loss": 6.6811, + "step": 5409 + }, + { + "epoch": 1.8464163822525599, + "grad_norm": 3.0424704551696777, + "learning_rate": 0.00038452787258248013, + "loss": 6.4537, + "step": 5410 + }, + { + "epoch": 1.8467576791808873, + "grad_norm": 3.186312198638916, + "learning_rate": 0.00038441410693970424, + "loss": 6.5699, + "step": 5411 + }, + { + "epoch": 1.8470989761092151, + "grad_norm": 2.9400906562805176, + "learning_rate": 0.0003843003412969283, + "loss": 6.4694, + "step": 5412 + }, + { + "epoch": 1.8474402730375425, + "grad_norm": 3.567128896713257, + "learning_rate": 0.00038418657565415245, + "loss": 4.6422, + "step": 5413 + }, + { + "epoch": 1.8477815699658704, + "grad_norm": 5.326208591461182, + "learning_rate": 0.00038407281001137655, + "loss": 5.3053, + "step": 5414 + }, + { + "epoch": 1.8481228668941978, + "grad_norm": 3.0531771183013916, + "learning_rate": 0.00038395904436860066, + "loss": 5.9597, + "step": 5415 + }, + { + "epoch": 1.8484641638225257, + "grad_norm": 3.1060829162597656, + "learning_rate": 0.0003838452787258248, + "loss": 6.5521, + "step": 5416 + }, + { + "epoch": 1.8488054607508533, + "grad_norm": 3.0353734493255615, + "learning_rate": 0.0003837315130830489, + "loss": 6.4911, + "step": 5417 + }, + { + "epoch": 1.849146757679181, + "grad_norm": 2.9575839042663574, + "learning_rate": 0.0003836177474402731, + "loss": 6.3016, + "step": 5418 + }, + { + "epoch": 1.8494880546075085, + "grad_norm": 2.9795007705688477, + "learning_rate": 0.0003835039817974972, + "loss": 6.3147, + "step": 5419 + }, + { + "epoch": 1.8498293515358362, + "grad_norm": 2.8920490741729736, + "learning_rate": 0.0003833902161547213, + "loss": 6.7194, + "step": 5420 + }, + { + "epoch": 1.8501706484641638, + "grad_norm": 3.0831782817840576, + "learning_rate": 0.0003832764505119454, + "loss": 6.2686, + "step": 5421 + }, + { + "epoch": 1.8505119453924914, + "grad_norm": 2.8378100395202637, + "learning_rate": 0.0003831626848691695, + "loss": 6.5928, + "step": 5422 + }, + { + "epoch": 1.8508532423208193, + "grad_norm": 2.8427324295043945, + "learning_rate": 0.0003830489192263936, + "loss": 6.1053, + "step": 5423 + }, + { + "epoch": 1.8511945392491467, + "grad_norm": 3.0172784328460693, + "learning_rate": 0.00038293515358361777, + "loss": 6.1388, + "step": 5424 + }, + { + "epoch": 1.8515358361774745, + "grad_norm": 2.933791160583496, + "learning_rate": 0.00038282138794084187, + "loss": 6.4189, + "step": 5425 + }, + { + "epoch": 1.851877133105802, + "grad_norm": 2.942861557006836, + "learning_rate": 0.000382707622298066, + "loss": 6.5474, + "step": 5426 + }, + { + "epoch": 1.8522184300341298, + "grad_norm": 6.984040260314941, + "learning_rate": 0.00038259385665529014, + "loss": 6.0704, + "step": 5427 + }, + { + "epoch": 1.8525597269624572, + "grad_norm": 3.091132640838623, + "learning_rate": 0.00038248009101251424, + "loss": 6.6446, + "step": 5428 + }, + { + "epoch": 1.852901023890785, + "grad_norm": 3.019805908203125, + "learning_rate": 0.00038236632536973835, + "loss": 5.9389, + "step": 5429 + }, + { + "epoch": 1.8532423208191127, + "grad_norm": 3.0177457332611084, + "learning_rate": 0.0003822525597269625, + "loss": 6.3112, + "step": 5430 + }, + { + "epoch": 1.8535836177474403, + "grad_norm": 2.8653550148010254, + "learning_rate": 0.00038213879408418656, + "loss": 6.0127, + "step": 5431 + }, + { + "epoch": 1.853924914675768, + "grad_norm": 2.9369118213653564, + "learning_rate": 0.00038202502844141066, + "loss": 5.6815, + "step": 5432 + }, + { + "epoch": 1.8542662116040955, + "grad_norm": 2.954627513885498, + "learning_rate": 0.0003819112627986348, + "loss": 6.5244, + "step": 5433 + }, + { + "epoch": 1.8546075085324232, + "grad_norm": 2.8261237144470215, + "learning_rate": 0.0003817974971558589, + "loss": 6.4313, + "step": 5434 + }, + { + "epoch": 1.8549488054607508, + "grad_norm": 3.0321621894836426, + "learning_rate": 0.00038168373151308303, + "loss": 6.5262, + "step": 5435 + }, + { + "epoch": 1.8552901023890787, + "grad_norm": 3.0571656227111816, + "learning_rate": 0.0003815699658703072, + "loss": 6.4915, + "step": 5436 + }, + { + "epoch": 1.855631399317406, + "grad_norm": 2.8418610095977783, + "learning_rate": 0.0003814562002275313, + "loss": 6.0628, + "step": 5437 + }, + { + "epoch": 1.855972696245734, + "grad_norm": 3.0574285984039307, + "learning_rate": 0.00038134243458475545, + "loss": 6.2824, + "step": 5438 + }, + { + "epoch": 1.8563139931740613, + "grad_norm": 2.951812267303467, + "learning_rate": 0.00038122866894197956, + "loss": 6.4227, + "step": 5439 + }, + { + "epoch": 1.8566552901023892, + "grad_norm": 2.9646008014678955, + "learning_rate": 0.00038111490329920366, + "loss": 6.4111, + "step": 5440 + }, + { + "epoch": 1.8569965870307166, + "grad_norm": 2.9364631175994873, + "learning_rate": 0.00038100113765642777, + "loss": 6.3866, + "step": 5441 + }, + { + "epoch": 1.8573378839590444, + "grad_norm": 4.254305839538574, + "learning_rate": 0.00038088737201365187, + "loss": 5.721, + "step": 5442 + }, + { + "epoch": 1.857679180887372, + "grad_norm": 2.9221060276031494, + "learning_rate": 0.000380773606370876, + "loss": 5.6808, + "step": 5443 + }, + { + "epoch": 1.8580204778156997, + "grad_norm": 2.9992268085479736, + "learning_rate": 0.00038065984072810014, + "loss": 5.8649, + "step": 5444 + }, + { + "epoch": 1.8583617747440273, + "grad_norm": 2.9220197200775146, + "learning_rate": 0.00038054607508532424, + "loss": 5.8822, + "step": 5445 + }, + { + "epoch": 1.858703071672355, + "grad_norm": 2.9059829711914062, + "learning_rate": 0.00038043230944254835, + "loss": 6.4213, + "step": 5446 + }, + { + "epoch": 1.8590443686006826, + "grad_norm": 3.104856252670288, + "learning_rate": 0.0003803185437997725, + "loss": 5.7524, + "step": 5447 + }, + { + "epoch": 1.8593856655290102, + "grad_norm": 2.9057981967926025, + "learning_rate": 0.0003802047781569966, + "loss": 6.5828, + "step": 5448 + }, + { + "epoch": 1.859726962457338, + "grad_norm": 2.952005624771118, + "learning_rate": 0.0003800910125142207, + "loss": 6.5241, + "step": 5449 + }, + { + "epoch": 1.8600682593856654, + "grad_norm": 2.8778185844421387, + "learning_rate": 0.0003799772468714448, + "loss": 6.7302, + "step": 5450 + }, + { + "epoch": 1.8604095563139933, + "grad_norm": 2.990785837173462, + "learning_rate": 0.0003798634812286689, + "loss": 5.8828, + "step": 5451 + }, + { + "epoch": 1.8607508532423207, + "grad_norm": 2.9240405559539795, + "learning_rate": 0.00037974971558589303, + "loss": 6.4997, + "step": 5452 + }, + { + "epoch": 1.8610921501706486, + "grad_norm": 3.0843167304992676, + "learning_rate": 0.0003796359499431172, + "loss": 5.7164, + "step": 5453 + }, + { + "epoch": 1.861433447098976, + "grad_norm": 2.9468958377838135, + "learning_rate": 0.0003795221843003413, + "loss": 6.3823, + "step": 5454 + }, + { + "epoch": 1.8617747440273038, + "grad_norm": 2.890228271484375, + "learning_rate": 0.0003794084186575654, + "loss": 6.1479, + "step": 5455 + }, + { + "epoch": 1.8621160409556314, + "grad_norm": 7.07792329788208, + "learning_rate": 0.00037929465301478956, + "loss": 4.19, + "step": 5456 + }, + { + "epoch": 1.862457337883959, + "grad_norm": 2.9772236347198486, + "learning_rate": 0.00037918088737201366, + "loss": 6.5797, + "step": 5457 + }, + { + "epoch": 1.8627986348122867, + "grad_norm": 3.0900766849517822, + "learning_rate": 0.0003790671217292378, + "loss": 5.6404, + "step": 5458 + }, + { + "epoch": 1.8631399317406143, + "grad_norm": 3.135882616043091, + "learning_rate": 0.00037895335608646193, + "loss": 6.2669, + "step": 5459 + }, + { + "epoch": 1.863481228668942, + "grad_norm": 3.0472939014434814, + "learning_rate": 0.000378839590443686, + "loss": 5.6298, + "step": 5460 + }, + { + "epoch": 1.8638225255972696, + "grad_norm": 3.0007853507995605, + "learning_rate": 0.00037872582480091014, + "loss": 5.4573, + "step": 5461 + }, + { + "epoch": 1.8641638225255974, + "grad_norm": 2.992143154144287, + "learning_rate": 0.00037861205915813424, + "loss": 6.2351, + "step": 5462 + }, + { + "epoch": 1.8645051194539248, + "grad_norm": 2.8720524311065674, + "learning_rate": 0.00037849829351535835, + "loss": 5.7451, + "step": 5463 + }, + { + "epoch": 1.8648464163822527, + "grad_norm": 3.13032603263855, + "learning_rate": 0.0003783845278725825, + "loss": 5.7272, + "step": 5464 + }, + { + "epoch": 1.86518771331058, + "grad_norm": 3.009556293487549, + "learning_rate": 0.0003782707622298066, + "loss": 5.9469, + "step": 5465 + }, + { + "epoch": 1.865529010238908, + "grad_norm": 2.857898712158203, + "learning_rate": 0.0003781569965870307, + "loss": 5.6815, + "step": 5466 + }, + { + "epoch": 1.8658703071672353, + "grad_norm": 2.863910675048828, + "learning_rate": 0.0003780432309442549, + "loss": 6.0975, + "step": 5467 + }, + { + "epoch": 1.8662116040955632, + "grad_norm": 2.9575035572052, + "learning_rate": 0.000377929465301479, + "loss": 6.5311, + "step": 5468 + }, + { + "epoch": 1.8665529010238908, + "grad_norm": 3.4162585735321045, + "learning_rate": 0.0003778156996587031, + "loss": 5.9707, + "step": 5469 + }, + { + "epoch": 1.8668941979522184, + "grad_norm": 3.0944206714630127, + "learning_rate": 0.0003777019340159272, + "loss": 6.2304, + "step": 5470 + }, + { + "epoch": 1.867235494880546, + "grad_norm": 3.3516929149627686, + "learning_rate": 0.0003775881683731513, + "loss": 5.7523, + "step": 5471 + }, + { + "epoch": 1.8675767918088737, + "grad_norm": 2.970231294631958, + "learning_rate": 0.0003774744027303754, + "loss": 5.5873, + "step": 5472 + }, + { + "epoch": 1.8679180887372013, + "grad_norm": 3.073521137237549, + "learning_rate": 0.00037736063708759956, + "loss": 6.0575, + "step": 5473 + }, + { + "epoch": 1.868259385665529, + "grad_norm": 2.949779748916626, + "learning_rate": 0.00037724687144482366, + "loss": 6.1106, + "step": 5474 + }, + { + "epoch": 1.8686006825938568, + "grad_norm": 2.9631776809692383, + "learning_rate": 0.00037713310580204777, + "loss": 5.8668, + "step": 5475 + }, + { + "epoch": 1.8689419795221842, + "grad_norm": 2.9993419647216797, + "learning_rate": 0.00037701934015927193, + "loss": 6.5943, + "step": 5476 + }, + { + "epoch": 1.869283276450512, + "grad_norm": 2.911229133605957, + "learning_rate": 0.00037690557451649603, + "loss": 6.1717, + "step": 5477 + }, + { + "epoch": 1.8696245733788395, + "grad_norm": 3.7616472244262695, + "learning_rate": 0.0003767918088737202, + "loss": 4.3073, + "step": 5478 + }, + { + "epoch": 1.8699658703071673, + "grad_norm": 3.202282190322876, + "learning_rate": 0.0003766780432309443, + "loss": 5.8356, + "step": 5479 + }, + { + "epoch": 1.8703071672354947, + "grad_norm": 3.0891096591949463, + "learning_rate": 0.00037656427758816835, + "loss": 4.9737, + "step": 5480 + }, + { + "epoch": 1.8706484641638226, + "grad_norm": 1.8888578414916992, + "learning_rate": 0.0003764505119453925, + "loss": 2.898, + "step": 5481 + }, + { + "epoch": 1.8709897610921502, + "grad_norm": 3.4496963024139404, + "learning_rate": 0.0003763367463026166, + "loss": 5.7924, + "step": 5482 + }, + { + "epoch": 1.8713310580204778, + "grad_norm": 3.0166397094726562, + "learning_rate": 0.0003762229806598407, + "loss": 6.2951, + "step": 5483 + }, + { + "epoch": 1.8716723549488055, + "grad_norm": 3.0307724475860596, + "learning_rate": 0.0003761092150170649, + "loss": 6.5138, + "step": 5484 + }, + { + "epoch": 1.872013651877133, + "grad_norm": 2.994663953781128, + "learning_rate": 0.000375995449374289, + "loss": 6.3472, + "step": 5485 + }, + { + "epoch": 1.8723549488054607, + "grad_norm": 2.9022762775421143, + "learning_rate": 0.0003758816837315131, + "loss": 5.8626, + "step": 5486 + }, + { + "epoch": 1.8726962457337883, + "grad_norm": 2.9295828342437744, + "learning_rate": 0.00037576791808873725, + "loss": 5.9134, + "step": 5487 + }, + { + "epoch": 1.8730375426621162, + "grad_norm": 2.866713047027588, + "learning_rate": 0.00037565415244596135, + "loss": 6.403, + "step": 5488 + }, + { + "epoch": 1.8733788395904436, + "grad_norm": 2.9316976070404053, + "learning_rate": 0.0003755403868031854, + "loss": 5.9694, + "step": 5489 + }, + { + "epoch": 1.8737201365187715, + "grad_norm": 3.494288682937622, + "learning_rate": 0.00037542662116040956, + "loss": 6.0734, + "step": 5490 + }, + { + "epoch": 1.8740614334470989, + "grad_norm": 7.175894260406494, + "learning_rate": 0.00037531285551763367, + "loss": 5.6944, + "step": 5491 + }, + { + "epoch": 1.8744027303754267, + "grad_norm": 3.8673219680786133, + "learning_rate": 0.00037519908987485777, + "loss": 6.0276, + "step": 5492 + }, + { + "epoch": 1.8747440273037541, + "grad_norm": 3.381589412689209, + "learning_rate": 0.00037508532423208193, + "loss": 5.8487, + "step": 5493 + }, + { + "epoch": 1.875085324232082, + "grad_norm": 3.1894888877868652, + "learning_rate": 0.00037497155858930603, + "loss": 6.3183, + "step": 5494 + }, + { + "epoch": 1.8754266211604096, + "grad_norm": 3.0183558464050293, + "learning_rate": 0.00037485779294653014, + "loss": 6.1961, + "step": 5495 + }, + { + "epoch": 1.8757679180887372, + "grad_norm": 2.9099154472351074, + "learning_rate": 0.0003747440273037543, + "loss": 6.3843, + "step": 5496 + }, + { + "epoch": 1.8761092150170648, + "grad_norm": 3.4501168727874756, + "learning_rate": 0.0003746302616609784, + "loss": 5.7031, + "step": 5497 + }, + { + "epoch": 1.8764505119453925, + "grad_norm": 4.108460426330566, + "learning_rate": 0.0003745164960182025, + "loss": 5.5165, + "step": 5498 + }, + { + "epoch": 1.87679180887372, + "grad_norm": 2.9657464027404785, + "learning_rate": 0.0003744027303754266, + "loss": 6.6402, + "step": 5499 + }, + { + "epoch": 1.8771331058020477, + "grad_norm": 2.9928600788116455, + "learning_rate": 0.0003742889647326507, + "loss": 5.5103, + "step": 5500 + }, + { + "epoch": 1.8774744027303756, + "grad_norm": 2.88197922706604, + "learning_rate": 0.0003741751990898749, + "loss": 6.0788, + "step": 5501 + }, + { + "epoch": 1.877815699658703, + "grad_norm": 2.8990108966827393, + "learning_rate": 0.000374061433447099, + "loss": 5.9397, + "step": 5502 + }, + { + "epoch": 1.8781569965870308, + "grad_norm": 6.239475250244141, + "learning_rate": 0.0003739476678043231, + "loss": 4.31, + "step": 5503 + }, + { + "epoch": 1.8784982935153582, + "grad_norm": 2.9518520832061768, + "learning_rate": 0.00037383390216154725, + "loss": 6.4256, + "step": 5504 + }, + { + "epoch": 1.878839590443686, + "grad_norm": 3.101850748062134, + "learning_rate": 0.00037372013651877135, + "loss": 6.2821, + "step": 5505 + }, + { + "epoch": 1.8791808873720135, + "grad_norm": 2.954547643661499, + "learning_rate": 0.00037360637087599546, + "loss": 5.9507, + "step": 5506 + }, + { + "epoch": 1.8795221843003413, + "grad_norm": 3.1224567890167236, + "learning_rate": 0.0003734926052332196, + "loss": 6.0534, + "step": 5507 + }, + { + "epoch": 1.879863481228669, + "grad_norm": 2.9866816997528076, + "learning_rate": 0.0003733788395904437, + "loss": 6.8862, + "step": 5508 + }, + { + "epoch": 1.8802047781569966, + "grad_norm": 2.98016095161438, + "learning_rate": 0.00037326507394766777, + "loss": 6.7283, + "step": 5509 + }, + { + "epoch": 1.8805460750853242, + "grad_norm": 2.978584051132202, + "learning_rate": 0.00037315130830489193, + "loss": 5.4551, + "step": 5510 + }, + { + "epoch": 1.8808873720136519, + "grad_norm": 3.2457802295684814, + "learning_rate": 0.00037303754266211603, + "loss": 6.5083, + "step": 5511 + }, + { + "epoch": 1.8812286689419795, + "grad_norm": 2.834044933319092, + "learning_rate": 0.00037292377701934014, + "loss": 6.3825, + "step": 5512 + }, + { + "epoch": 1.8815699658703071, + "grad_norm": 3.000852346420288, + "learning_rate": 0.0003728100113765643, + "loss": 5.9047, + "step": 5513 + }, + { + "epoch": 1.881911262798635, + "grad_norm": 3.954674482345581, + "learning_rate": 0.0003726962457337884, + "loss": 5.7054, + "step": 5514 + }, + { + "epoch": 1.8822525597269624, + "grad_norm": 3.103377103805542, + "learning_rate": 0.0003725824800910125, + "loss": 6.532, + "step": 5515 + }, + { + "epoch": 1.8825938566552902, + "grad_norm": 3.011568546295166, + "learning_rate": 0.00037246871444823667, + "loss": 6.4145, + "step": 5516 + }, + { + "epoch": 1.8829351535836176, + "grad_norm": 2.94783091545105, + "learning_rate": 0.0003723549488054608, + "loss": 5.7669, + "step": 5517 + }, + { + "epoch": 1.8832764505119455, + "grad_norm": 3.0503170490264893, + "learning_rate": 0.0003722411831626849, + "loss": 6.1408, + "step": 5518 + }, + { + "epoch": 1.8836177474402729, + "grad_norm": 2.829469919204712, + "learning_rate": 0.000372127417519909, + "loss": 6.3711, + "step": 5519 + }, + { + "epoch": 1.8839590443686007, + "grad_norm": 2.903998374938965, + "learning_rate": 0.0003720136518771331, + "loss": 6.3651, + "step": 5520 + }, + { + "epoch": 1.8843003412969284, + "grad_norm": 3.0705928802490234, + "learning_rate": 0.00037189988623435725, + "loss": 5.9707, + "step": 5521 + }, + { + "epoch": 1.884641638225256, + "grad_norm": 3.0179994106292725, + "learning_rate": 0.00037178612059158135, + "loss": 5.813, + "step": 5522 + }, + { + "epoch": 1.8849829351535836, + "grad_norm": 3.0558056831359863, + "learning_rate": 0.00037167235494880546, + "loss": 5.9914, + "step": 5523 + }, + { + "epoch": 1.8853242320819112, + "grad_norm": 2.9075541496276855, + "learning_rate": 0.0003715585893060296, + "loss": 6.2369, + "step": 5524 + }, + { + "epoch": 1.8856655290102389, + "grad_norm": 2.8797600269317627, + "learning_rate": 0.0003714448236632537, + "loss": 6.2321, + "step": 5525 + }, + { + "epoch": 1.8860068259385665, + "grad_norm": 2.88400936126709, + "learning_rate": 0.0003713310580204778, + "loss": 5.2605, + "step": 5526 + }, + { + "epoch": 1.8863481228668944, + "grad_norm": 3.1179773807525635, + "learning_rate": 0.000371217292377702, + "loss": 6.1447, + "step": 5527 + }, + { + "epoch": 1.8866894197952218, + "grad_norm": 2.8629674911499023, + "learning_rate": 0.00037110352673492604, + "loss": 6.4715, + "step": 5528 + }, + { + "epoch": 1.8870307167235496, + "grad_norm": 2.896420478820801, + "learning_rate": 0.00037098976109215014, + "loss": 6.3566, + "step": 5529 + }, + { + "epoch": 1.887372013651877, + "grad_norm": 2.9856109619140625, + "learning_rate": 0.0003708759954493743, + "loss": 6.3204, + "step": 5530 + }, + { + "epoch": 1.8877133105802049, + "grad_norm": 3.28257417678833, + "learning_rate": 0.0003707622298065984, + "loss": 5.3285, + "step": 5531 + }, + { + "epoch": 1.8880546075085323, + "grad_norm": 2.9387850761413574, + "learning_rate": 0.0003706484641638225, + "loss": 6.0632, + "step": 5532 + }, + { + "epoch": 1.8883959044368601, + "grad_norm": 2.9324870109558105, + "learning_rate": 0.00037053469852104667, + "loss": 5.8282, + "step": 5533 + }, + { + "epoch": 1.8887372013651877, + "grad_norm": 2.979710340499878, + "learning_rate": 0.0003704209328782708, + "loss": 6.2292, + "step": 5534 + }, + { + "epoch": 1.8890784982935154, + "grad_norm": 2.9082748889923096, + "learning_rate": 0.0003703071672354949, + "loss": 6.0311, + "step": 5535 + }, + { + "epoch": 1.889419795221843, + "grad_norm": 2.8582496643066406, + "learning_rate": 0.00037019340159271904, + "loss": 6.5768, + "step": 5536 + }, + { + "epoch": 1.8897610921501706, + "grad_norm": 3.1130855083465576, + "learning_rate": 0.00037007963594994314, + "loss": 5.4812, + "step": 5537 + }, + { + "epoch": 1.8901023890784983, + "grad_norm": 3.0239624977111816, + "learning_rate": 0.0003699658703071672, + "loss": 6.754, + "step": 5538 + }, + { + "epoch": 1.8904436860068259, + "grad_norm": 2.8626439571380615, + "learning_rate": 0.00036985210466439135, + "loss": 6.0451, + "step": 5539 + }, + { + "epoch": 1.8907849829351537, + "grad_norm": 3.52713942527771, + "learning_rate": 0.00036973833902161546, + "loss": 5.5374, + "step": 5540 + }, + { + "epoch": 1.8911262798634811, + "grad_norm": 3.0411159992218018, + "learning_rate": 0.0003696245733788396, + "loss": 6.253, + "step": 5541 + }, + { + "epoch": 1.891467576791809, + "grad_norm": 3.1052937507629395, + "learning_rate": 0.0003695108077360637, + "loss": 5.3534, + "step": 5542 + }, + { + "epoch": 1.8918088737201364, + "grad_norm": 3.007514715194702, + "learning_rate": 0.0003693970420932878, + "loss": 5.9601, + "step": 5543 + }, + { + "epoch": 1.8921501706484642, + "grad_norm": 3.2602505683898926, + "learning_rate": 0.000369283276450512, + "loss": 5.6877, + "step": 5544 + }, + { + "epoch": 1.8924914675767917, + "grad_norm": 3.1978507041931152, + "learning_rate": 0.0003691695108077361, + "loss": 5.6045, + "step": 5545 + }, + { + "epoch": 1.8928327645051195, + "grad_norm": 2.927302360534668, + "learning_rate": 0.0003690557451649602, + "loss": 5.7842, + "step": 5546 + }, + { + "epoch": 1.8931740614334471, + "grad_norm": 3.1250858306884766, + "learning_rate": 0.00036894197952218435, + "loss": 5.8389, + "step": 5547 + }, + { + "epoch": 1.8935153583617748, + "grad_norm": 3.004615545272827, + "learning_rate": 0.0003688282138794084, + "loss": 6.1601, + "step": 5548 + }, + { + "epoch": 1.8938566552901024, + "grad_norm": 3.0711987018585205, + "learning_rate": 0.0003687144482366325, + "loss": 6.316, + "step": 5549 + }, + { + "epoch": 1.89419795221843, + "grad_norm": 3.4832606315612793, + "learning_rate": 0.00036860068259385667, + "loss": 5.1936, + "step": 5550 + }, + { + "epoch": 1.8945392491467576, + "grad_norm": 3.176182508468628, + "learning_rate": 0.0003684869169510808, + "loss": 5.5715, + "step": 5551 + }, + { + "epoch": 1.8948805460750853, + "grad_norm": 2.930396318435669, + "learning_rate": 0.0003683731513083049, + "loss": 6.3699, + "step": 5552 + }, + { + "epoch": 1.8952218430034131, + "grad_norm": 2.8957557678222656, + "learning_rate": 0.00036825938566552904, + "loss": 5.121, + "step": 5553 + }, + { + "epoch": 1.8955631399317405, + "grad_norm": 2.885211229324341, + "learning_rate": 0.00036814562002275314, + "loss": 6.0966, + "step": 5554 + }, + { + "epoch": 1.8959044368600684, + "grad_norm": 3.3072848320007324, + "learning_rate": 0.00036803185437997725, + "loss": 5.6025, + "step": 5555 + }, + { + "epoch": 1.8962457337883958, + "grad_norm": 3.156798839569092, + "learning_rate": 0.0003679180887372014, + "loss": 6.0641, + "step": 5556 + }, + { + "epoch": 1.8965870307167236, + "grad_norm": 11.321859359741211, + "learning_rate": 0.00036780432309442546, + "loss": 5.4411, + "step": 5557 + }, + { + "epoch": 1.896928327645051, + "grad_norm": 2.977379322052002, + "learning_rate": 0.00036769055745164956, + "loss": 6.6542, + "step": 5558 + }, + { + "epoch": 1.8972696245733789, + "grad_norm": 3.0141522884368896, + "learning_rate": 0.0003675767918088737, + "loss": 6.0875, + "step": 5559 + }, + { + "epoch": 1.8976109215017065, + "grad_norm": 2.9410688877105713, + "learning_rate": 0.00036746302616609783, + "loss": 6.4585, + "step": 5560 + }, + { + "epoch": 1.8979522184300341, + "grad_norm": 3.0146484375, + "learning_rate": 0.000367349260523322, + "loss": 6.1579, + "step": 5561 + }, + { + "epoch": 1.8982935153583618, + "grad_norm": 2.873905897140503, + "learning_rate": 0.0003672354948805461, + "loss": 5.932, + "step": 5562 + }, + { + "epoch": 1.8986348122866894, + "grad_norm": 2.8637454509735107, + "learning_rate": 0.0003671217292377702, + "loss": 6.2974, + "step": 5563 + }, + { + "epoch": 1.898976109215017, + "grad_norm": 3.0655555725097656, + "learning_rate": 0.00036700796359499436, + "loss": 5.8897, + "step": 5564 + }, + { + "epoch": 1.8993174061433447, + "grad_norm": 2.8714025020599365, + "learning_rate": 0.00036689419795221846, + "loss": 6.1978, + "step": 5565 + }, + { + "epoch": 1.8996587030716725, + "grad_norm": 2.971374034881592, + "learning_rate": 0.00036678043230944257, + "loss": 5.2991, + "step": 5566 + }, + { + "epoch": 1.9, + "grad_norm": 2.9192891120910645, + "learning_rate": 0.00036666666666666667, + "loss": 5.9472, + "step": 5567 + }, + { + "epoch": 1.9003412969283278, + "grad_norm": 3.0181756019592285, + "learning_rate": 0.0003665529010238908, + "loss": 6.4597, + "step": 5568 + }, + { + "epoch": 1.9006825938566552, + "grad_norm": 2.9252567291259766, + "learning_rate": 0.0003664391353811149, + "loss": 6.3444, + "step": 5569 + }, + { + "epoch": 1.901023890784983, + "grad_norm": 3.165065050125122, + "learning_rate": 0.00036632536973833904, + "loss": 5.9354, + "step": 5570 + }, + { + "epoch": 1.9013651877133104, + "grad_norm": 2.935777425765991, + "learning_rate": 0.00036621160409556314, + "loss": 6.4579, + "step": 5571 + }, + { + "epoch": 1.9017064846416383, + "grad_norm": 2.785609722137451, + "learning_rate": 0.00036609783845278725, + "loss": 5.7246, + "step": 5572 + }, + { + "epoch": 1.902047781569966, + "grad_norm": 2.8585057258605957, + "learning_rate": 0.0003659840728100114, + "loss": 6.8029, + "step": 5573 + }, + { + "epoch": 1.9023890784982935, + "grad_norm": 2.8563308715820312, + "learning_rate": 0.0003658703071672355, + "loss": 6.3342, + "step": 5574 + }, + { + "epoch": 1.9027303754266212, + "grad_norm": 9.50164794921875, + "learning_rate": 0.0003657565415244596, + "loss": 5.5615, + "step": 5575 + }, + { + "epoch": 1.9030716723549488, + "grad_norm": 2.983273983001709, + "learning_rate": 0.0003656427758816838, + "loss": 6.0595, + "step": 5576 + }, + { + "epoch": 1.9034129692832764, + "grad_norm": 2.895350217819214, + "learning_rate": 0.00036552901023890783, + "loss": 6.2199, + "step": 5577 + }, + { + "epoch": 1.903754266211604, + "grad_norm": 3.4141218662261963, + "learning_rate": 0.00036541524459613193, + "loss": 6.1079, + "step": 5578 + }, + { + "epoch": 1.904095563139932, + "grad_norm": 3.1629059314727783, + "learning_rate": 0.0003653014789533561, + "loss": 5.895, + "step": 5579 + }, + { + "epoch": 1.9044368600682593, + "grad_norm": 2.89243483543396, + "learning_rate": 0.0003651877133105802, + "loss": 5.857, + "step": 5580 + }, + { + "epoch": 1.9047781569965871, + "grad_norm": 2.929072618484497, + "learning_rate": 0.0003650739476678043, + "loss": 6.2954, + "step": 5581 + }, + { + "epoch": 1.9051194539249146, + "grad_norm": 3.005039691925049, + "learning_rate": 0.00036496018202502846, + "loss": 6.2573, + "step": 5582 + }, + { + "epoch": 1.9054607508532424, + "grad_norm": 3.169196367263794, + "learning_rate": 0.00036484641638225257, + "loss": 5.6108, + "step": 5583 + }, + { + "epoch": 1.9058020477815698, + "grad_norm": 2.895752429962158, + "learning_rate": 0.0003647326507394767, + "loss": 6.3867, + "step": 5584 + }, + { + "epoch": 1.9061433447098977, + "grad_norm": 3.001361131668091, + "learning_rate": 0.00036461888509670083, + "loss": 6.3686, + "step": 5585 + }, + { + "epoch": 1.9064846416382253, + "grad_norm": 2.938120126724243, + "learning_rate": 0.00036450511945392494, + "loss": 6.4083, + "step": 5586 + }, + { + "epoch": 1.906825938566553, + "grad_norm": 2.791254758834839, + "learning_rate": 0.00036439135381114904, + "loss": 6.2613, + "step": 5587 + }, + { + "epoch": 1.9071672354948805, + "grad_norm": 3.121710777282715, + "learning_rate": 0.00036427758816837315, + "loss": 5.9979, + "step": 5588 + }, + { + "epoch": 1.9075085324232082, + "grad_norm": 3.0421764850616455, + "learning_rate": 0.00036416382252559725, + "loss": 5.7314, + "step": 5589 + }, + { + "epoch": 1.9078498293515358, + "grad_norm": 3.0027949810028076, + "learning_rate": 0.0003640500568828214, + "loss": 6.3889, + "step": 5590 + }, + { + "epoch": 1.9081911262798634, + "grad_norm": 2.965477466583252, + "learning_rate": 0.0003639362912400455, + "loss": 6.4909, + "step": 5591 + }, + { + "epoch": 1.9085324232081913, + "grad_norm": 3.089905023574829, + "learning_rate": 0.0003638225255972696, + "loss": 6.4991, + "step": 5592 + }, + { + "epoch": 1.9088737201365187, + "grad_norm": 2.832148551940918, + "learning_rate": 0.0003637087599544938, + "loss": 6.2179, + "step": 5593 + }, + { + "epoch": 1.9092150170648465, + "grad_norm": 2.9781718254089355, + "learning_rate": 0.0003635949943117179, + "loss": 6.366, + "step": 5594 + }, + { + "epoch": 1.909556313993174, + "grad_norm": 2.9966466426849365, + "learning_rate": 0.000363481228668942, + "loss": 6.1874, + "step": 5595 + }, + { + "epoch": 1.9098976109215018, + "grad_norm": 2.9594128131866455, + "learning_rate": 0.0003633674630261661, + "loss": 6.2093, + "step": 5596 + }, + { + "epoch": 1.9102389078498292, + "grad_norm": 2.8735949993133545, + "learning_rate": 0.0003632536973833902, + "loss": 6.6883, + "step": 5597 + }, + { + "epoch": 1.910580204778157, + "grad_norm": 3.143655300140381, + "learning_rate": 0.0003631399317406143, + "loss": 5.5111, + "step": 5598 + }, + { + "epoch": 1.9109215017064847, + "grad_norm": 2.9463629722595215, + "learning_rate": 0.00036302616609783846, + "loss": 6.3481, + "step": 5599 + }, + { + "epoch": 1.9112627986348123, + "grad_norm": 3.0451953411102295, + "learning_rate": 0.00036291240045506257, + "loss": 6.3788, + "step": 5600 + }, + { + "epoch": 1.91160409556314, + "grad_norm": 2.9045491218566895, + "learning_rate": 0.00036279863481228667, + "loss": 6.6246, + "step": 5601 + }, + { + "epoch": 1.9119453924914676, + "grad_norm": 2.970602035522461, + "learning_rate": 0.00036268486916951083, + "loss": 6.1013, + "step": 5602 + }, + { + "epoch": 1.9122866894197952, + "grad_norm": 2.88826847076416, + "learning_rate": 0.00036257110352673494, + "loss": 6.2451, + "step": 5603 + }, + { + "epoch": 1.9126279863481228, + "grad_norm": 2.8405919075012207, + "learning_rate": 0.0003624573378839591, + "loss": 6.4149, + "step": 5604 + }, + { + "epoch": 1.9129692832764507, + "grad_norm": 2.8814759254455566, + "learning_rate": 0.0003623435722411832, + "loss": 5.613, + "step": 5605 + }, + { + "epoch": 1.913310580204778, + "grad_norm": 2.8033180236816406, + "learning_rate": 0.00036222980659840725, + "loss": 5.9609, + "step": 5606 + }, + { + "epoch": 1.913651877133106, + "grad_norm": 2.9600167274475098, + "learning_rate": 0.0003621160409556314, + "loss": 6.6377, + "step": 5607 + }, + { + "epoch": 1.9139931740614333, + "grad_norm": 2.917205572128296, + "learning_rate": 0.0003620022753128555, + "loss": 5.7594, + "step": 5608 + }, + { + "epoch": 1.9143344709897612, + "grad_norm": 2.8245956897735596, + "learning_rate": 0.0003618885096700796, + "loss": 6.1144, + "step": 5609 + }, + { + "epoch": 1.9146757679180886, + "grad_norm": 6.755703449249268, + "learning_rate": 0.0003617747440273038, + "loss": 5.8463, + "step": 5610 + }, + { + "epoch": 1.9150170648464164, + "grad_norm": 2.968301773071289, + "learning_rate": 0.0003616609783845279, + "loss": 6.0244, + "step": 5611 + }, + { + "epoch": 1.915358361774744, + "grad_norm": 3.3348543643951416, + "learning_rate": 0.000361547212741752, + "loss": 4.6684, + "step": 5612 + }, + { + "epoch": 1.9156996587030717, + "grad_norm": 3.070195436477661, + "learning_rate": 0.00036143344709897615, + "loss": 5.955, + "step": 5613 + }, + { + "epoch": 1.9160409556313993, + "grad_norm": 3.008603572845459, + "learning_rate": 0.00036131968145620025, + "loss": 6.47, + "step": 5614 + }, + { + "epoch": 1.916382252559727, + "grad_norm": 3.069282293319702, + "learning_rate": 0.00036120591581342436, + "loss": 5.9866, + "step": 5615 + }, + { + "epoch": 1.9167235494880546, + "grad_norm": 2.977428913116455, + "learning_rate": 0.00036109215017064846, + "loss": 6.5193, + "step": 5616 + }, + { + "epoch": 1.9170648464163822, + "grad_norm": 2.9662117958068848, + "learning_rate": 0.00036097838452787257, + "loss": 5.8879, + "step": 5617 + }, + { + "epoch": 1.91740614334471, + "grad_norm": 3.1188912391662598, + "learning_rate": 0.0003608646188850967, + "loss": 5.7213, + "step": 5618 + }, + { + "epoch": 1.9177474402730375, + "grad_norm": 2.914436101913452, + "learning_rate": 0.00036075085324232083, + "loss": 6.7126, + "step": 5619 + }, + { + "epoch": 1.9180887372013653, + "grad_norm": 2.956573247909546, + "learning_rate": 0.00036063708759954494, + "loss": 6.3765, + "step": 5620 + }, + { + "epoch": 1.9184300341296927, + "grad_norm": 2.9372453689575195, + "learning_rate": 0.00036052332195676904, + "loss": 6.046, + "step": 5621 + }, + { + "epoch": 1.9187713310580206, + "grad_norm": 2.8803412914276123, + "learning_rate": 0.0003604095563139932, + "loss": 7.05, + "step": 5622 + }, + { + "epoch": 1.919112627986348, + "grad_norm": 3.022796392440796, + "learning_rate": 0.0003602957906712173, + "loss": 6.3435, + "step": 5623 + }, + { + "epoch": 1.9194539249146758, + "grad_norm": 2.9055187702178955, + "learning_rate": 0.00036018202502844147, + "loss": 6.021, + "step": 5624 + }, + { + "epoch": 1.9197952218430034, + "grad_norm": 3.274238348007202, + "learning_rate": 0.00036006825938566557, + "loss": 5.819, + "step": 5625 + }, + { + "epoch": 1.920136518771331, + "grad_norm": 3.3062515258789062, + "learning_rate": 0.0003599544937428896, + "loss": 6.3418, + "step": 5626 + }, + { + "epoch": 1.9204778156996587, + "grad_norm": 3.065157890319824, + "learning_rate": 0.0003598407281001138, + "loss": 6.9273, + "step": 5627 + }, + { + "epoch": 1.9208191126279863, + "grad_norm": 3.0245931148529053, + "learning_rate": 0.0003597269624573379, + "loss": 6.6343, + "step": 5628 + }, + { + "epoch": 1.921160409556314, + "grad_norm": 3.0174522399902344, + "learning_rate": 0.000359613196814562, + "loss": 5.9226, + "step": 5629 + }, + { + "epoch": 1.9215017064846416, + "grad_norm": 2.8802099227905273, + "learning_rate": 0.00035949943117178615, + "loss": 6.4793, + "step": 5630 + }, + { + "epoch": 1.9218430034129694, + "grad_norm": 3.4380247592926025, + "learning_rate": 0.00035938566552901025, + "loss": 5.8155, + "step": 5631 + }, + { + "epoch": 1.9221843003412968, + "grad_norm": 2.8854217529296875, + "learning_rate": 0.00035927189988623436, + "loss": 6.4598, + "step": 5632 + }, + { + "epoch": 1.9225255972696247, + "grad_norm": 3.8663971424102783, + "learning_rate": 0.0003591581342434585, + "loss": 4.1946, + "step": 5633 + }, + { + "epoch": 1.922866894197952, + "grad_norm": 2.801706314086914, + "learning_rate": 0.0003590443686006826, + "loss": 4.3066, + "step": 5634 + }, + { + "epoch": 1.92320819112628, + "grad_norm": 3.0731019973754883, + "learning_rate": 0.0003589306029579067, + "loss": 6.1641, + "step": 5635 + }, + { + "epoch": 1.9235494880546073, + "grad_norm": 3.997176170349121, + "learning_rate": 0.00035881683731513083, + "loss": 5.2667, + "step": 5636 + }, + { + "epoch": 1.9238907849829352, + "grad_norm": 3.5250935554504395, + "learning_rate": 0.00035870307167235494, + "loss": 3.3671, + "step": 5637 + }, + { + "epoch": 1.9242320819112628, + "grad_norm": 3.1355345249176025, + "learning_rate": 0.00035858930602957904, + "loss": 5.3227, + "step": 5638 + }, + { + "epoch": 1.9245733788395905, + "grad_norm": 3.095639228820801, + "learning_rate": 0.0003584755403868032, + "loss": 6.4683, + "step": 5639 + }, + { + "epoch": 1.924914675767918, + "grad_norm": 3.36194109916687, + "learning_rate": 0.0003583617747440273, + "loss": 6.0146, + "step": 5640 + }, + { + "epoch": 1.9252559726962457, + "grad_norm": 3.3805904388427734, + "learning_rate": 0.0003582480091012514, + "loss": 5.8779, + "step": 5641 + }, + { + "epoch": 1.9255972696245733, + "grad_norm": 3.0124683380126953, + "learning_rate": 0.00035813424345847557, + "loss": 6.5908, + "step": 5642 + }, + { + "epoch": 1.925938566552901, + "grad_norm": 3.2803475856781006, + "learning_rate": 0.0003580204778156997, + "loss": 5.7909, + "step": 5643 + }, + { + "epoch": 1.9262798634812288, + "grad_norm": 4.804317951202393, + "learning_rate": 0.00035790671217292384, + "loss": 5.5523, + "step": 5644 + }, + { + "epoch": 1.9266211604095562, + "grad_norm": 2.983696460723877, + "learning_rate": 0.0003577929465301479, + "loss": 6.0915, + "step": 5645 + }, + { + "epoch": 1.926962457337884, + "grad_norm": 2.929771900177002, + "learning_rate": 0.000357679180887372, + "loss": 5.4666, + "step": 5646 + }, + { + "epoch": 1.9273037542662115, + "grad_norm": 4.5054731369018555, + "learning_rate": 0.00035756541524459615, + "loss": 5.3908, + "step": 5647 + }, + { + "epoch": 1.9276450511945393, + "grad_norm": 2.8603098392486572, + "learning_rate": 0.00035745164960182026, + "loss": 6.3809, + "step": 5648 + }, + { + "epoch": 1.9279863481228667, + "grad_norm": 2.934943675994873, + "learning_rate": 0.00035733788395904436, + "loss": 5.8931, + "step": 5649 + }, + { + "epoch": 1.9283276450511946, + "grad_norm": 3.016292095184326, + "learning_rate": 0.0003572241183162685, + "loss": 5.6018, + "step": 5650 + }, + { + "epoch": 1.9286689419795222, + "grad_norm": 3.0841245651245117, + "learning_rate": 0.0003571103526734926, + "loss": 5.5162, + "step": 5651 + }, + { + "epoch": 1.9290102389078498, + "grad_norm": 2.870736837387085, + "learning_rate": 0.00035699658703071673, + "loss": 6.1772, + "step": 5652 + }, + { + "epoch": 1.9293515358361775, + "grad_norm": 6.509230136871338, + "learning_rate": 0.0003568828213879409, + "loss": 5.6765, + "step": 5653 + }, + { + "epoch": 1.929692832764505, + "grad_norm": 2.972872257232666, + "learning_rate": 0.000356769055745165, + "loss": 6.6489, + "step": 5654 + }, + { + "epoch": 1.9300341296928327, + "grad_norm": 3.467580556869507, + "learning_rate": 0.00035665529010238904, + "loss": 5.2504, + "step": 5655 + }, + { + "epoch": 1.9303754266211604, + "grad_norm": 3.250774621963501, + "learning_rate": 0.0003565415244596132, + "loss": 5.4942, + "step": 5656 + }, + { + "epoch": 1.9307167235494882, + "grad_norm": 2.9641740322113037, + "learning_rate": 0.0003564277588168373, + "loss": 6.0985, + "step": 5657 + }, + { + "epoch": 1.9310580204778156, + "grad_norm": 2.939337730407715, + "learning_rate": 0.0003563139931740614, + "loss": 5.0818, + "step": 5658 + }, + { + "epoch": 1.9313993174061435, + "grad_norm": 3.126194715499878, + "learning_rate": 0.00035620022753128557, + "loss": 6.1819, + "step": 5659 + }, + { + "epoch": 1.9317406143344709, + "grad_norm": 5.255848407745361, + "learning_rate": 0.0003560864618885097, + "loss": 5.0863, + "step": 5660 + }, + { + "epoch": 1.9320819112627987, + "grad_norm": 3.117389678955078, + "learning_rate": 0.0003559726962457338, + "loss": 6.127, + "step": 5661 + }, + { + "epoch": 1.9324232081911261, + "grad_norm": 2.9117963314056396, + "learning_rate": 0.00035585893060295794, + "loss": 6.263, + "step": 5662 + }, + { + "epoch": 1.932764505119454, + "grad_norm": 3.053349733352661, + "learning_rate": 0.00035574516496018205, + "loss": 6.0983, + "step": 5663 + }, + { + "epoch": 1.9331058020477816, + "grad_norm": 3.072718381881714, + "learning_rate": 0.0003556313993174061, + "loss": 5.3886, + "step": 5664 + }, + { + "epoch": 1.9334470989761092, + "grad_norm": 10.232709884643555, + "learning_rate": 0.00035551763367463026, + "loss": 6.7737, + "step": 5665 + }, + { + "epoch": 1.9337883959044369, + "grad_norm": 2.86478328704834, + "learning_rate": 0.00035540386803185436, + "loss": 5.8319, + "step": 5666 + }, + { + "epoch": 1.9341296928327645, + "grad_norm": 2.977330446243286, + "learning_rate": 0.0003552901023890785, + "loss": 6.3019, + "step": 5667 + }, + { + "epoch": 1.934470989761092, + "grad_norm": 3.6822073459625244, + "learning_rate": 0.0003551763367463026, + "loss": 3.6634, + "step": 5668 + }, + { + "epoch": 1.9348122866894197, + "grad_norm": 2.952073097229004, + "learning_rate": 0.00035506257110352673, + "loss": 6.0564, + "step": 5669 + }, + { + "epoch": 1.9351535836177476, + "grad_norm": 3.094076156616211, + "learning_rate": 0.0003549488054607509, + "loss": 6.3278, + "step": 5670 + }, + { + "epoch": 1.935494880546075, + "grad_norm": 3.062957763671875, + "learning_rate": 0.000354835039817975, + "loss": 6.9028, + "step": 5671 + }, + { + "epoch": 1.9358361774744028, + "grad_norm": 3.019155502319336, + "learning_rate": 0.0003547212741751991, + "loss": 5.549, + "step": 5672 + }, + { + "epoch": 1.9361774744027302, + "grad_norm": 2.855597734451294, + "learning_rate": 0.00035460750853242326, + "loss": 6.4601, + "step": 5673 + }, + { + "epoch": 1.936518771331058, + "grad_norm": 2.83564829826355, + "learning_rate": 0.0003544937428896473, + "loss": 6.4995, + "step": 5674 + }, + { + "epoch": 1.9368600682593855, + "grad_norm": 2.9659066200256348, + "learning_rate": 0.0003543799772468714, + "loss": 6.5664, + "step": 5675 + }, + { + "epoch": 1.9372013651877134, + "grad_norm": 2.9743475914001465, + "learning_rate": 0.0003542662116040956, + "loss": 7.1904, + "step": 5676 + }, + { + "epoch": 1.937542662116041, + "grad_norm": 8.094204902648926, + "learning_rate": 0.0003541524459613197, + "loss": 5.402, + "step": 5677 + }, + { + "epoch": 1.9378839590443686, + "grad_norm": 3.036323070526123, + "learning_rate": 0.0003540386803185438, + "loss": 6.4258, + "step": 5678 + }, + { + "epoch": 1.9382252559726962, + "grad_norm": 3.9837539196014404, + "learning_rate": 0.00035392491467576794, + "loss": 4.7752, + "step": 5679 + }, + { + "epoch": 1.9385665529010239, + "grad_norm": 3.116345167160034, + "learning_rate": 0.00035381114903299205, + "loss": 6.3365, + "step": 5680 + }, + { + "epoch": 1.9389078498293515, + "grad_norm": 2.885256290435791, + "learning_rate": 0.00035369738339021615, + "loss": 6.3768, + "step": 5681 + }, + { + "epoch": 1.9392491467576791, + "grad_norm": 2.9523653984069824, + "learning_rate": 0.0003535836177474403, + "loss": 6.459, + "step": 5682 + }, + { + "epoch": 1.939590443686007, + "grad_norm": 3.4799153804779053, + "learning_rate": 0.0003534698521046644, + "loss": 5.524, + "step": 5683 + }, + { + "epoch": 1.9399317406143344, + "grad_norm": 2.8675577640533447, + "learning_rate": 0.00035335608646188847, + "loss": 6.0246, + "step": 5684 + }, + { + "epoch": 1.9402730375426622, + "grad_norm": 2.8474411964416504, + "learning_rate": 0.0003532423208191126, + "loss": 6.6248, + "step": 5685 + }, + { + "epoch": 1.9406143344709896, + "grad_norm": 2.8228769302368164, + "learning_rate": 0.00035312855517633673, + "loss": 5.826, + "step": 5686 + }, + { + "epoch": 1.9409556313993175, + "grad_norm": 2.9075095653533936, + "learning_rate": 0.0003530147895335609, + "loss": 6.5578, + "step": 5687 + }, + { + "epoch": 1.9412969283276449, + "grad_norm": 2.9296059608459473, + "learning_rate": 0.000352901023890785, + "loss": 6.1996, + "step": 5688 + }, + { + "epoch": 1.9416382252559727, + "grad_norm": 3.073141098022461, + "learning_rate": 0.0003527872582480091, + "loss": 5.9504, + "step": 5689 + }, + { + "epoch": 1.9419795221843004, + "grad_norm": 2.9059715270996094, + "learning_rate": 0.00035267349260523326, + "loss": 6.5117, + "step": 5690 + }, + { + "epoch": 1.942320819112628, + "grad_norm": 2.9183788299560547, + "learning_rate": 0.00035255972696245736, + "loss": 6.7984, + "step": 5691 + }, + { + "epoch": 1.9426621160409556, + "grad_norm": 4.353701591491699, + "learning_rate": 0.00035244596131968147, + "loss": 4.3046, + "step": 5692 + }, + { + "epoch": 1.9430034129692833, + "grad_norm": 2.997781753540039, + "learning_rate": 0.00035233219567690563, + "loss": 6.7823, + "step": 5693 + }, + { + "epoch": 1.9433447098976109, + "grad_norm": 3.029731512069702, + "learning_rate": 0.0003522184300341297, + "loss": 6.3343, + "step": 5694 + }, + { + "epoch": 1.9436860068259385, + "grad_norm": 3.035346746444702, + "learning_rate": 0.0003521046643913538, + "loss": 6.6839, + "step": 5695 + }, + { + "epoch": 1.9440273037542664, + "grad_norm": 2.9383420944213867, + "learning_rate": 0.00035199089874857794, + "loss": 5.8788, + "step": 5696 + }, + { + "epoch": 1.9443686006825938, + "grad_norm": 2.8888626098632812, + "learning_rate": 0.00035187713310580205, + "loss": 6.6803, + "step": 5697 + }, + { + "epoch": 1.9447098976109216, + "grad_norm": 2.8905954360961914, + "learning_rate": 0.00035176336746302615, + "loss": 6.7251, + "step": 5698 + }, + { + "epoch": 1.945051194539249, + "grad_norm": 2.848459482192993, + "learning_rate": 0.0003516496018202503, + "loss": 6.3719, + "step": 5699 + }, + { + "epoch": 1.9453924914675769, + "grad_norm": 6.845625877380371, + "learning_rate": 0.0003515358361774744, + "loss": 5.3028, + "step": 5700 + }, + { + "epoch": 1.9457337883959043, + "grad_norm": 2.9232285022735596, + "learning_rate": 0.0003514220705346985, + "loss": 6.7182, + "step": 5701 + }, + { + "epoch": 1.9460750853242321, + "grad_norm": 2.9307868480682373, + "learning_rate": 0.0003513083048919227, + "loss": 5.891, + "step": 5702 + }, + { + "epoch": 1.9464163822525598, + "grad_norm": 4.067267894744873, + "learning_rate": 0.00035119453924914673, + "loss": 5.0473, + "step": 5703 + }, + { + "epoch": 1.9467576791808874, + "grad_norm": 3.438600778579712, + "learning_rate": 0.00035108077360637084, + "loss": 4.1299, + "step": 5704 + }, + { + "epoch": 1.947098976109215, + "grad_norm": 2.9375200271606445, + "learning_rate": 0.000350967007963595, + "loss": 6.3161, + "step": 5705 + }, + { + "epoch": 1.9474402730375426, + "grad_norm": 3.0774381160736084, + "learning_rate": 0.0003508532423208191, + "loss": 6.826, + "step": 5706 + }, + { + "epoch": 1.9477815699658703, + "grad_norm": 2.943984270095825, + "learning_rate": 0.00035073947667804326, + "loss": 6.4722, + "step": 5707 + }, + { + "epoch": 1.948122866894198, + "grad_norm": 2.850332021713257, + "learning_rate": 0.00035062571103526737, + "loss": 6.3759, + "step": 5708 + }, + { + "epoch": 1.9484641638225257, + "grad_norm": 2.926992177963257, + "learning_rate": 0.00035051194539249147, + "loss": 5.968, + "step": 5709 + }, + { + "epoch": 1.9488054607508531, + "grad_norm": 2.8328535556793213, + "learning_rate": 0.00035039817974971563, + "loss": 6.1558, + "step": 5710 + }, + { + "epoch": 1.949146757679181, + "grad_norm": 2.906438112258911, + "learning_rate": 0.00035028441410693973, + "loss": 6.7331, + "step": 5711 + }, + { + "epoch": 1.9494880546075084, + "grad_norm": 2.8392958641052246, + "learning_rate": 0.00035017064846416384, + "loss": 6.4798, + "step": 5712 + }, + { + "epoch": 1.9498293515358363, + "grad_norm": 3.147331476211548, + "learning_rate": 0.00035005688282138794, + "loss": 5.3819, + "step": 5713 + }, + { + "epoch": 1.9501706484641637, + "grad_norm": 2.8738808631896973, + "learning_rate": 0.00034994311717861205, + "loss": 6.0735, + "step": 5714 + }, + { + "epoch": 1.9505119453924915, + "grad_norm": 2.8813462257385254, + "learning_rate": 0.00034982935153583615, + "loss": 6.228, + "step": 5715 + }, + { + "epoch": 1.9508532423208191, + "grad_norm": 2.9783530235290527, + "learning_rate": 0.0003497155858930603, + "loss": 6.3557, + "step": 5716 + }, + { + "epoch": 1.9511945392491468, + "grad_norm": 2.8971831798553467, + "learning_rate": 0.0003496018202502844, + "loss": 6.5061, + "step": 5717 + }, + { + "epoch": 1.9515358361774744, + "grad_norm": 7.339874267578125, + "learning_rate": 0.0003494880546075085, + "loss": 5.3234, + "step": 5718 + }, + { + "epoch": 1.951877133105802, + "grad_norm": 2.992372751235962, + "learning_rate": 0.0003493742889647327, + "loss": 5.4487, + "step": 5719 + }, + { + "epoch": 1.9522184300341296, + "grad_norm": 2.892254590988159, + "learning_rate": 0.0003492605233219568, + "loss": 6.1741, + "step": 5720 + }, + { + "epoch": 1.9525597269624573, + "grad_norm": 2.97224497795105, + "learning_rate": 0.0003491467576791809, + "loss": 6.3184, + "step": 5721 + }, + { + "epoch": 1.9529010238907851, + "grad_norm": 2.979079008102417, + "learning_rate": 0.00034903299203640505, + "loss": 6.1467, + "step": 5722 + }, + { + "epoch": 1.9532423208191125, + "grad_norm": 3.279611825942993, + "learning_rate": 0.0003489192263936291, + "loss": 5.636, + "step": 5723 + }, + { + "epoch": 1.9535836177474404, + "grad_norm": 2.8635432720184326, + "learning_rate": 0.0003488054607508532, + "loss": 6.1327, + "step": 5724 + }, + { + "epoch": 1.9539249146757678, + "grad_norm": 2.876227855682373, + "learning_rate": 0.00034869169510807737, + "loss": 5.9195, + "step": 5725 + }, + { + "epoch": 1.9542662116040956, + "grad_norm": 2.923100233078003, + "learning_rate": 0.00034857792946530147, + "loss": 5.9977, + "step": 5726 + }, + { + "epoch": 1.954607508532423, + "grad_norm": 2.884539842605591, + "learning_rate": 0.00034846416382252563, + "loss": 6.6596, + "step": 5727 + }, + { + "epoch": 1.954948805460751, + "grad_norm": 2.869551420211792, + "learning_rate": 0.00034835039817974973, + "loss": 6.7263, + "step": 5728 + }, + { + "epoch": 1.9552901023890785, + "grad_norm": 3.0249316692352295, + "learning_rate": 0.00034823663253697384, + "loss": 5.9006, + "step": 5729 + }, + { + "epoch": 1.9556313993174061, + "grad_norm": 3.008315086364746, + "learning_rate": 0.000348122866894198, + "loss": 6.1326, + "step": 5730 + }, + { + "epoch": 1.9559726962457338, + "grad_norm": 2.924003839492798, + "learning_rate": 0.0003480091012514221, + "loss": 5.8715, + "step": 5731 + }, + { + "epoch": 1.9563139931740614, + "grad_norm": 2.9903616905212402, + "learning_rate": 0.00034789533560864615, + "loss": 6.8732, + "step": 5732 + }, + { + "epoch": 1.956655290102389, + "grad_norm": 3.197800636291504, + "learning_rate": 0.0003477815699658703, + "loss": 5.9274, + "step": 5733 + }, + { + "epoch": 1.9569965870307167, + "grad_norm": 2.8411238193511963, + "learning_rate": 0.0003476678043230944, + "loss": 6.1661, + "step": 5734 + }, + { + "epoch": 1.9573378839590445, + "grad_norm": 2.9080448150634766, + "learning_rate": 0.0003475540386803185, + "loss": 6.2348, + "step": 5735 + }, + { + "epoch": 1.957679180887372, + "grad_norm": 2.778029441833496, + "learning_rate": 0.0003474402730375427, + "loss": 6.2891, + "step": 5736 + }, + { + "epoch": 1.9580204778156998, + "grad_norm": 2.954080581665039, + "learning_rate": 0.0003473265073947668, + "loss": 6.1235, + "step": 5737 + }, + { + "epoch": 1.9583617747440272, + "grad_norm": 2.9157159328460693, + "learning_rate": 0.0003472127417519909, + "loss": 6.2918, + "step": 5738 + }, + { + "epoch": 1.958703071672355, + "grad_norm": 2.926003932952881, + "learning_rate": 0.00034709897610921505, + "loss": 6.6191, + "step": 5739 + }, + { + "epoch": 1.9590443686006824, + "grad_norm": 3.026291847229004, + "learning_rate": 0.00034698521046643916, + "loss": 6.0096, + "step": 5740 + }, + { + "epoch": 1.9593856655290103, + "grad_norm": 2.8232579231262207, + "learning_rate": 0.00034687144482366326, + "loss": 6.4494, + "step": 5741 + }, + { + "epoch": 1.959726962457338, + "grad_norm": 3.837078332901001, + "learning_rate": 0.00034675767918088737, + "loss": 5.45, + "step": 5742 + }, + { + "epoch": 1.9600682593856655, + "grad_norm": 2.8543741703033447, + "learning_rate": 0.00034664391353811147, + "loss": 6.1748, + "step": 5743 + }, + { + "epoch": 1.9604095563139932, + "grad_norm": 3.0786311626434326, + "learning_rate": 0.0003465301478953356, + "loss": 5.7486, + "step": 5744 + }, + { + "epoch": 1.9607508532423208, + "grad_norm": 3.161501407623291, + "learning_rate": 0.00034641638225255974, + "loss": 5.8587, + "step": 5745 + }, + { + "epoch": 1.9610921501706484, + "grad_norm": 2.9017693996429443, + "learning_rate": 0.00034630261660978384, + "loss": 6.2358, + "step": 5746 + }, + { + "epoch": 1.961433447098976, + "grad_norm": 2.9906439781188965, + "learning_rate": 0.000346188850967008, + "loss": 6.5953, + "step": 5747 + }, + { + "epoch": 1.961774744027304, + "grad_norm": 2.825509786605835, + "learning_rate": 0.0003460750853242321, + "loss": 6.4981, + "step": 5748 + }, + { + "epoch": 1.9621160409556313, + "grad_norm": 4.668010711669922, + "learning_rate": 0.0003459613196814562, + "loss": 4.1262, + "step": 5749 + }, + { + "epoch": 1.9624573378839592, + "grad_norm": 2.996762990951538, + "learning_rate": 0.00034584755403868037, + "loss": 5.5406, + "step": 5750 + }, + { + "epoch": 1.9627986348122866, + "grad_norm": 3.0203421115875244, + "learning_rate": 0.0003457337883959045, + "loss": 5.8989, + "step": 5751 + }, + { + "epoch": 1.9631399317406144, + "grad_norm": 2.8792803287506104, + "learning_rate": 0.0003456200227531285, + "loss": 6.4116, + "step": 5752 + }, + { + "epoch": 1.9634812286689418, + "grad_norm": 2.9204788208007812, + "learning_rate": 0.0003455062571103527, + "loss": 5.7728, + "step": 5753 + }, + { + "epoch": 1.9638225255972697, + "grad_norm": 2.9870834350585938, + "learning_rate": 0.0003453924914675768, + "loss": 6.179, + "step": 5754 + }, + { + "epoch": 1.9641638225255973, + "grad_norm": 3.1838748455047607, + "learning_rate": 0.0003452787258248009, + "loss": 5.5057, + "step": 5755 + }, + { + "epoch": 1.964505119453925, + "grad_norm": 2.916292667388916, + "learning_rate": 0.00034516496018202505, + "loss": 6.7139, + "step": 5756 + }, + { + "epoch": 1.9648464163822525, + "grad_norm": 2.948239326477051, + "learning_rate": 0.00034505119453924916, + "loss": 6.5786, + "step": 5757 + }, + { + "epoch": 1.9651877133105802, + "grad_norm": 2.867830276489258, + "learning_rate": 0.00034493742889647326, + "loss": 6.4021, + "step": 5758 + }, + { + "epoch": 1.9655290102389078, + "grad_norm": 3.0150563716888428, + "learning_rate": 0.0003448236632536974, + "loss": 5.8162, + "step": 5759 + }, + { + "epoch": 1.9658703071672354, + "grad_norm": 3.1503114700317383, + "learning_rate": 0.0003447098976109215, + "loss": 5.9391, + "step": 5760 + }, + { + "epoch": 1.9662116040955633, + "grad_norm": 2.8918957710266113, + "learning_rate": 0.00034459613196814563, + "loss": 6.6574, + "step": 5761 + }, + { + "epoch": 1.9665529010238907, + "grad_norm": 3.020156145095825, + "learning_rate": 0.00034448236632536974, + "loss": 5.4734, + "step": 5762 + }, + { + "epoch": 1.9668941979522185, + "grad_norm": 3.0351099967956543, + "learning_rate": 0.00034436860068259384, + "loss": 5.4351, + "step": 5763 + }, + { + "epoch": 1.967235494880546, + "grad_norm": 2.8954880237579346, + "learning_rate": 0.00034425483503981795, + "loss": 6.2349, + "step": 5764 + }, + { + "epoch": 1.9675767918088738, + "grad_norm": 2.7660810947418213, + "learning_rate": 0.0003441410693970421, + "loss": 6.1523, + "step": 5765 + }, + { + "epoch": 1.9679180887372012, + "grad_norm": 11.300498962402344, + "learning_rate": 0.0003440273037542662, + "loss": 3.3175, + "step": 5766 + }, + { + "epoch": 1.968259385665529, + "grad_norm": 9.648717880249023, + "learning_rate": 0.00034391353811149037, + "loss": 4.9295, + "step": 5767 + }, + { + "epoch": 1.9686006825938567, + "grad_norm": 3.16329288482666, + "learning_rate": 0.0003437997724687145, + "loss": 5.9512, + "step": 5768 + }, + { + "epoch": 1.9689419795221843, + "grad_norm": 3.4901375770568848, + "learning_rate": 0.0003436860068259386, + "loss": 5.3588, + "step": 5769 + }, + { + "epoch": 1.969283276450512, + "grad_norm": 3.6523478031158447, + "learning_rate": 0.00034357224118316274, + "loss": 3.9288, + "step": 5770 + }, + { + "epoch": 1.9696245733788396, + "grad_norm": 3.0849642753601074, + "learning_rate": 0.0003434584755403868, + "loss": 6.282, + "step": 5771 + }, + { + "epoch": 1.9699658703071672, + "grad_norm": 3.0182032585144043, + "learning_rate": 0.0003433447098976109, + "loss": 6.6789, + "step": 5772 + }, + { + "epoch": 1.9703071672354948, + "grad_norm": 2.9407904148101807, + "learning_rate": 0.00034323094425483505, + "loss": 6.3049, + "step": 5773 + }, + { + "epoch": 1.9706484641638227, + "grad_norm": 2.94191837310791, + "learning_rate": 0.00034311717861205916, + "loss": 6.1876, + "step": 5774 + }, + { + "epoch": 1.97098976109215, + "grad_norm": 2.8799357414245605, + "learning_rate": 0.00034300341296928326, + "loss": 6.0039, + "step": 5775 + }, + { + "epoch": 1.971331058020478, + "grad_norm": 3.216027021408081, + "learning_rate": 0.0003428896473265074, + "loss": 5.3073, + "step": 5776 + }, + { + "epoch": 1.9716723549488053, + "grad_norm": 2.9986681938171387, + "learning_rate": 0.00034277588168373153, + "loss": 5.4347, + "step": 5777 + }, + { + "epoch": 1.9720136518771332, + "grad_norm": 2.878371477127075, + "learning_rate": 0.00034266211604095563, + "loss": 6.3747, + "step": 5778 + }, + { + "epoch": 1.9723549488054608, + "grad_norm": 2.826388359069824, + "learning_rate": 0.0003425483503981798, + "loss": 6.0383, + "step": 5779 + }, + { + "epoch": 1.9726962457337884, + "grad_norm": 2.887346029281616, + "learning_rate": 0.0003424345847554039, + "loss": 5.9541, + "step": 5780 + }, + { + "epoch": 1.973037542662116, + "grad_norm": 3.0003409385681152, + "learning_rate": 0.00034232081911262795, + "loss": 5.4532, + "step": 5781 + }, + { + "epoch": 1.9733788395904437, + "grad_norm": 2.9644722938537598, + "learning_rate": 0.0003422070534698521, + "loss": 6.1134, + "step": 5782 + }, + { + "epoch": 1.9737201365187713, + "grad_norm": 3.0157651901245117, + "learning_rate": 0.0003420932878270762, + "loss": 6.7906, + "step": 5783 + }, + { + "epoch": 1.974061433447099, + "grad_norm": 3.0431642532348633, + "learning_rate": 0.0003419795221843003, + "loss": 6.5403, + "step": 5784 + }, + { + "epoch": 1.9744027303754266, + "grad_norm": 4.550543785095215, + "learning_rate": 0.0003418657565415245, + "loss": 5.8783, + "step": 5785 + }, + { + "epoch": 1.9747440273037542, + "grad_norm": 3.095123052597046, + "learning_rate": 0.0003417519908987486, + "loss": 5.6745, + "step": 5786 + }, + { + "epoch": 1.975085324232082, + "grad_norm": 2.822967290878296, + "learning_rate": 0.0003416382252559727, + "loss": 6.32, + "step": 5787 + }, + { + "epoch": 1.9754266211604095, + "grad_norm": 3.229649066925049, + "learning_rate": 0.00034152445961319684, + "loss": 6.0626, + "step": 5788 + }, + { + "epoch": 1.9757679180887373, + "grad_norm": 2.855510950088501, + "learning_rate": 0.00034141069397042095, + "loss": 5.828, + "step": 5789 + }, + { + "epoch": 1.9761092150170647, + "grad_norm": 2.838651657104492, + "learning_rate": 0.0003412969283276451, + "loss": 6.1334, + "step": 5790 + }, + { + "epoch": 1.9764505119453926, + "grad_norm": 2.9031753540039062, + "learning_rate": 0.00034118316268486916, + "loss": 6.1161, + "step": 5791 + }, + { + "epoch": 1.9767918088737202, + "grad_norm": 3.6981256008148193, + "learning_rate": 0.00034106939704209326, + "loss": 5.6065, + "step": 5792 + }, + { + "epoch": 1.9771331058020478, + "grad_norm": 2.8721656799316406, + "learning_rate": 0.0003409556313993174, + "loss": 6.2977, + "step": 5793 + }, + { + "epoch": 1.9774744027303754, + "grad_norm": 2.9534177780151367, + "learning_rate": 0.00034084186575654153, + "loss": 5.5651, + "step": 5794 + }, + { + "epoch": 1.977815699658703, + "grad_norm": 3.0801563262939453, + "learning_rate": 0.00034072810011376563, + "loss": 5.7325, + "step": 5795 + }, + { + "epoch": 1.9781569965870307, + "grad_norm": 3.880596399307251, + "learning_rate": 0.0003406143344709898, + "loss": 4.5394, + "step": 5796 + }, + { + "epoch": 1.9784982935153583, + "grad_norm": 2.8708319664001465, + "learning_rate": 0.0003405005688282139, + "loss": 6.3677, + "step": 5797 + }, + { + "epoch": 1.978839590443686, + "grad_norm": 2.896517753601074, + "learning_rate": 0.000340386803185438, + "loss": 6.0279, + "step": 5798 + }, + { + "epoch": 1.9791808873720136, + "grad_norm": 2.8943135738372803, + "learning_rate": 0.00034027303754266216, + "loss": 6.1411, + "step": 5799 + }, + { + "epoch": 1.9795221843003414, + "grad_norm": 2.9464333057403564, + "learning_rate": 0.00034015927189988627, + "loss": 6.6108, + "step": 5800 + }, + { + "epoch": 1.9798634812286688, + "grad_norm": 2.76023268699646, + "learning_rate": 0.0003400455062571103, + "loss": 6.2494, + "step": 5801 + }, + { + "epoch": 1.9802047781569967, + "grad_norm": 2.8224380016326904, + "learning_rate": 0.0003399317406143345, + "loss": 6.3798, + "step": 5802 + }, + { + "epoch": 1.980546075085324, + "grad_norm": 6.953861713409424, + "learning_rate": 0.0003398179749715586, + "loss": 6.2183, + "step": 5803 + }, + { + "epoch": 1.980887372013652, + "grad_norm": 3.372380256652832, + "learning_rate": 0.0003397042093287827, + "loss": 5.2131, + "step": 5804 + }, + { + "epoch": 1.9812286689419796, + "grad_norm": 2.964792490005493, + "learning_rate": 0.00033959044368600685, + "loss": 6.8906, + "step": 5805 + }, + { + "epoch": 1.9815699658703072, + "grad_norm": 9.055648803710938, + "learning_rate": 0.00033947667804323095, + "loss": 7.0398, + "step": 5806 + }, + { + "epoch": 1.9819112627986348, + "grad_norm": 3.136854648590088, + "learning_rate": 0.00033936291240045506, + "loss": 5.7341, + "step": 5807 + }, + { + "epoch": 1.9822525597269625, + "grad_norm": 3.117483139038086, + "learning_rate": 0.0003392491467576792, + "loss": 6.1711, + "step": 5808 + }, + { + "epoch": 1.98259385665529, + "grad_norm": 3.032315492630005, + "learning_rate": 0.0003391353811149033, + "loss": 5.4116, + "step": 5809 + }, + { + "epoch": 1.9829351535836177, + "grad_norm": 2.858267068862915, + "learning_rate": 0.0003390216154721274, + "loss": 6.2049, + "step": 5810 + }, + { + "epoch": 1.9832764505119453, + "grad_norm": 2.9900565147399902, + "learning_rate": 0.00033890784982935153, + "loss": 5.5319, + "step": 5811 + }, + { + "epoch": 1.983617747440273, + "grad_norm": 4.313586711883545, + "learning_rate": 0.00033879408418657563, + "loss": 4.6686, + "step": 5812 + }, + { + "epoch": 1.9839590443686008, + "grad_norm": 2.9994773864746094, + "learning_rate": 0.0003386803185437998, + "loss": 5.9989, + "step": 5813 + }, + { + "epoch": 1.9843003412969282, + "grad_norm": 2.9691319465637207, + "learning_rate": 0.0003385665529010239, + "loss": 6.2986, + "step": 5814 + }, + { + "epoch": 1.984641638225256, + "grad_norm": 2.8935835361480713, + "learning_rate": 0.000338452787258248, + "loss": 5.6747, + "step": 5815 + }, + { + "epoch": 1.9849829351535835, + "grad_norm": 3.180413007736206, + "learning_rate": 0.00033833902161547216, + "loss": 5.2211, + "step": 5816 + }, + { + "epoch": 1.9853242320819113, + "grad_norm": 3.1071324348449707, + "learning_rate": 0.00033822525597269627, + "loss": 6.0189, + "step": 5817 + }, + { + "epoch": 1.985665529010239, + "grad_norm": 3.498764753341675, + "learning_rate": 0.00033811149032992037, + "loss": 5.6862, + "step": 5818 + }, + { + "epoch": 1.9860068259385666, + "grad_norm": 2.8438425064086914, + "learning_rate": 0.00033799772468714453, + "loss": 6.1487, + "step": 5819 + }, + { + "epoch": 1.9863481228668942, + "grad_norm": 2.9216058254241943, + "learning_rate": 0.0003378839590443686, + "loss": 6.1612, + "step": 5820 + }, + { + "epoch": 1.9866894197952218, + "grad_norm": 2.917191743850708, + "learning_rate": 0.0003377701934015927, + "loss": 6.3926, + "step": 5821 + }, + { + "epoch": 1.9870307167235495, + "grad_norm": 2.862936496734619, + "learning_rate": 0.00033765642775881685, + "loss": 6.1182, + "step": 5822 + }, + { + "epoch": 1.987372013651877, + "grad_norm": 3.223823308944702, + "learning_rate": 0.00033754266211604095, + "loss": 5.9686, + "step": 5823 + }, + { + "epoch": 1.9877133105802047, + "grad_norm": 3.052494764328003, + "learning_rate": 0.00033742889647326506, + "loss": 5.3687, + "step": 5824 + }, + { + "epoch": 1.9880546075085324, + "grad_norm": 3.0097758769989014, + "learning_rate": 0.0003373151308304892, + "loss": 6.1109, + "step": 5825 + }, + { + "epoch": 1.9883959044368602, + "grad_norm": 2.950054168701172, + "learning_rate": 0.0003372013651877133, + "loss": 6.4818, + "step": 5826 + }, + { + "epoch": 1.9887372013651876, + "grad_norm": 2.8829801082611084, + "learning_rate": 0.0003370875995449374, + "loss": 6.2982, + "step": 5827 + }, + { + "epoch": 1.9890784982935155, + "grad_norm": 2.908803701400757, + "learning_rate": 0.0003369738339021616, + "loss": 6.0992, + "step": 5828 + }, + { + "epoch": 1.9894197952218429, + "grad_norm": 3.0301074981689453, + "learning_rate": 0.0003368600682593857, + "loss": 6.7445, + "step": 5829 + }, + { + "epoch": 1.9897610921501707, + "grad_norm": 2.9564156532287598, + "learning_rate": 0.0003367463026166098, + "loss": 6.5264, + "step": 5830 + }, + { + "epoch": 1.9901023890784983, + "grad_norm": 2.951019763946533, + "learning_rate": 0.0003366325369738339, + "loss": 5.8757, + "step": 5831 + }, + { + "epoch": 1.990443686006826, + "grad_norm": 4.174768924713135, + "learning_rate": 0.000336518771331058, + "loss": 4.409, + "step": 5832 + }, + { + "epoch": 1.9907849829351536, + "grad_norm": 6.287966251373291, + "learning_rate": 0.00033640500568828216, + "loss": 5.3929, + "step": 5833 + }, + { + "epoch": 1.9911262798634812, + "grad_norm": 2.996116876602173, + "learning_rate": 0.00033629124004550627, + "loss": 6.4399, + "step": 5834 + }, + { + "epoch": 1.9914675767918089, + "grad_norm": 3.3284964561462402, + "learning_rate": 0.0003361774744027304, + "loss": 5.0079, + "step": 5835 + }, + { + "epoch": 1.9918088737201365, + "grad_norm": 2.8886935710906982, + "learning_rate": 0.00033606370875995453, + "loss": 6.3832, + "step": 5836 + }, + { + "epoch": 1.9921501706484641, + "grad_norm": 2.967557907104492, + "learning_rate": 0.00033594994311717864, + "loss": 6.3409, + "step": 5837 + }, + { + "epoch": 1.9924914675767917, + "grad_norm": 2.8980183601379395, + "learning_rate": 0.00033583617747440274, + "loss": 6.3379, + "step": 5838 + }, + { + "epoch": 1.9928327645051196, + "grad_norm": 3.0234856605529785, + "learning_rate": 0.00033572241183162685, + "loss": 6.155, + "step": 5839 + }, + { + "epoch": 1.993174061433447, + "grad_norm": 2.7687742710113525, + "learning_rate": 0.00033560864618885095, + "loss": 6.3193, + "step": 5840 + }, + { + "epoch": 1.9935153583617748, + "grad_norm": 2.851742744445801, + "learning_rate": 0.00033549488054607506, + "loss": 6.1613, + "step": 5841 + }, + { + "epoch": 1.9938566552901023, + "grad_norm": 2.8196914196014404, + "learning_rate": 0.0003353811149032992, + "loss": 6.0988, + "step": 5842 + }, + { + "epoch": 1.99419795221843, + "grad_norm": 2.847468614578247, + "learning_rate": 0.0003352673492605233, + "loss": 6.8367, + "step": 5843 + }, + { + "epoch": 1.9945392491467577, + "grad_norm": 2.88217830657959, + "learning_rate": 0.0003351535836177474, + "loss": 6.3181, + "step": 5844 + }, + { + "epoch": 1.9948805460750854, + "grad_norm": 2.8297886848449707, + "learning_rate": 0.0003350398179749716, + "loss": 5.9361, + "step": 5845 + }, + { + "epoch": 1.995221843003413, + "grad_norm": 2.9879589080810547, + "learning_rate": 0.0003349260523321957, + "loss": 6.2692, + "step": 5846 + }, + { + "epoch": 1.9955631399317406, + "grad_norm": 2.923250198364258, + "learning_rate": 0.0003348122866894198, + "loss": 6.4919, + "step": 5847 + }, + { + "epoch": 1.9959044368600682, + "grad_norm": 2.852890968322754, + "learning_rate": 0.00033469852104664395, + "loss": 5.8794, + "step": 5848 + }, + { + "epoch": 1.9962457337883959, + "grad_norm": 4.5131001472473145, + "learning_rate": 0.000334584755403868, + "loss": 4.9364, + "step": 5849 + }, + { + "epoch": 1.9965870307167235, + "grad_norm": 4.635778903961182, + "learning_rate": 0.0003344709897610921, + "loss": 5.363, + "step": 5850 + }, + { + "epoch": 1.9969283276450511, + "grad_norm": 2.963787317276001, + "learning_rate": 0.00033435722411831627, + "loss": 6.3649, + "step": 5851 + }, + { + "epoch": 1.997269624573379, + "grad_norm": 2.9836738109588623, + "learning_rate": 0.0003342434584755404, + "loss": 6.69, + "step": 5852 + }, + { + "epoch": 1.9976109215017064, + "grad_norm": 2.863739252090454, + "learning_rate": 0.00033412969283276453, + "loss": 6.4131, + "step": 5853 + }, + { + "epoch": 1.9979522184300342, + "grad_norm": 2.899862289428711, + "learning_rate": 0.00033401592718998864, + "loss": 6.0362, + "step": 5854 + }, + { + "epoch": 1.9982935153583616, + "grad_norm": 2.9389188289642334, + "learning_rate": 0.00033390216154721274, + "loss": 6.5771, + "step": 5855 + }, + { + "epoch": 1.9986348122866895, + "grad_norm": 5.9408183097839355, + "learning_rate": 0.0003337883959044369, + "loss": 5.5061, + "step": 5856 + }, + { + "epoch": 1.9989761092150171, + "grad_norm": 2.952606201171875, + "learning_rate": 0.000333674630261661, + "loss": 6.0148, + "step": 5857 + }, + { + "epoch": 1.9993174061433447, + "grad_norm": 2.8606393337249756, + "learning_rate": 0.0003335608646188851, + "loss": 6.0906, + "step": 5858 + }, + { + "epoch": 1.9996587030716724, + "grad_norm": 2.852064371109009, + "learning_rate": 0.0003334470989761092, + "loss": 6.4203, + "step": 5859 + }, + { + "epoch": 2.0, + "grad_norm": 2.8471901416778564, + "learning_rate": 0.0003333333333333333, + "loss": 6.2409, + "step": 5860 + }, + { + "epoch": 2.000341296928328, + "grad_norm": 2.941194772720337, + "learning_rate": 0.00033321956769055743, + "loss": 6.3285, + "step": 5861 + }, + { + "epoch": 2.0006825938566553, + "grad_norm": 3.083709955215454, + "learning_rate": 0.0003331058020477816, + "loss": 6.2642, + "step": 5862 + }, + { + "epoch": 2.001023890784983, + "grad_norm": 3.8229711055755615, + "learning_rate": 0.0003329920364050057, + "loss": 3.7616, + "step": 5863 + }, + { + "epoch": 2.0013651877133105, + "grad_norm": 2.923698902130127, + "learning_rate": 0.0003328782707622298, + "loss": 5.66, + "step": 5864 + }, + { + "epoch": 2.0017064846416384, + "grad_norm": 3.1256601810455322, + "learning_rate": 0.00033276450511945396, + "loss": 5.7579, + "step": 5865 + }, + { + "epoch": 2.0020477815699658, + "grad_norm": 3.4216883182525635, + "learning_rate": 0.00033265073947667806, + "loss": 5.9409, + "step": 5866 + }, + { + "epoch": 2.0023890784982936, + "grad_norm": 3.03593373298645, + "learning_rate": 0.00033253697383390217, + "loss": 5.8932, + "step": 5867 + }, + { + "epoch": 2.002730375426621, + "grad_norm": 2.919698715209961, + "learning_rate": 0.0003324232081911263, + "loss": 5.8919, + "step": 5868 + }, + { + "epoch": 2.003071672354949, + "grad_norm": 2.8888025283813477, + "learning_rate": 0.0003323094425483504, + "loss": 5.7327, + "step": 5869 + }, + { + "epoch": 2.0034129692832763, + "grad_norm": 2.8589115142822266, + "learning_rate": 0.0003321956769055745, + "loss": 5.6514, + "step": 5870 + }, + { + "epoch": 2.003754266211604, + "grad_norm": 2.8871238231658936, + "learning_rate": 0.00033208191126279864, + "loss": 5.9583, + "step": 5871 + }, + { + "epoch": 2.0040955631399315, + "grad_norm": 2.953996181488037, + "learning_rate": 0.00033196814562002274, + "loss": 5.6646, + "step": 5872 + }, + { + "epoch": 2.0044368600682594, + "grad_norm": 2.8244423866271973, + "learning_rate": 0.0003318543799772469, + "loss": 6.174, + "step": 5873 + }, + { + "epoch": 2.0047781569965872, + "grad_norm": 2.8501133918762207, + "learning_rate": 0.000331740614334471, + "loss": 5.4038, + "step": 5874 + }, + { + "epoch": 2.0051194539249146, + "grad_norm": 2.8104753494262695, + "learning_rate": 0.0003316268486916951, + "loss": 6.5321, + "step": 5875 + }, + { + "epoch": 2.0054607508532425, + "grad_norm": 2.9065990447998047, + "learning_rate": 0.00033151308304891927, + "loss": 6.6124, + "step": 5876 + }, + { + "epoch": 2.00580204778157, + "grad_norm": 2.9225258827209473, + "learning_rate": 0.0003313993174061434, + "loss": 6.2582, + "step": 5877 + }, + { + "epoch": 2.0061433447098977, + "grad_norm": 2.842495918273926, + "learning_rate": 0.00033128555176336743, + "loss": 6.2283, + "step": 5878 + }, + { + "epoch": 2.006484641638225, + "grad_norm": 2.993636131286621, + "learning_rate": 0.0003311717861205916, + "loss": 6.1056, + "step": 5879 + }, + { + "epoch": 2.006825938566553, + "grad_norm": 2.954638957977295, + "learning_rate": 0.0003310580204778157, + "loss": 6.1397, + "step": 5880 + }, + { + "epoch": 2.0071672354948804, + "grad_norm": 2.83888840675354, + "learning_rate": 0.0003309442548350398, + "loss": 6.3731, + "step": 5881 + }, + { + "epoch": 2.0075085324232083, + "grad_norm": 2.950007438659668, + "learning_rate": 0.00033083048919226396, + "loss": 5.9267, + "step": 5882 + }, + { + "epoch": 2.0078498293515357, + "grad_norm": 2.8402795791625977, + "learning_rate": 0.00033071672354948806, + "loss": 6.5702, + "step": 5883 + }, + { + "epoch": 2.0081911262798635, + "grad_norm": 2.827852964401245, + "learning_rate": 0.00033060295790671217, + "loss": 6.7476, + "step": 5884 + }, + { + "epoch": 2.008532423208191, + "grad_norm": 2.843194007873535, + "learning_rate": 0.0003304891922639363, + "loss": 5.988, + "step": 5885 + }, + { + "epoch": 2.0088737201365188, + "grad_norm": 2.8419055938720703, + "learning_rate": 0.00033037542662116043, + "loss": 5.5696, + "step": 5886 + }, + { + "epoch": 2.0092150170648466, + "grad_norm": 2.968064308166504, + "learning_rate": 0.00033026166097838454, + "loss": 6.4374, + "step": 5887 + }, + { + "epoch": 2.009556313993174, + "grad_norm": 3.4870381355285645, + "learning_rate": 0.00033014789533560864, + "loss": 4.7893, + "step": 5888 + }, + { + "epoch": 2.009897610921502, + "grad_norm": 6.496885299682617, + "learning_rate": 0.00033003412969283275, + "loss": 4.7551, + "step": 5889 + }, + { + "epoch": 2.0102389078498293, + "grad_norm": 2.9047958850860596, + "learning_rate": 0.00032992036405005685, + "loss": 5.9317, + "step": 5890 + }, + { + "epoch": 2.010580204778157, + "grad_norm": 3.021362781524658, + "learning_rate": 0.000329806598407281, + "loss": 5.6877, + "step": 5891 + }, + { + "epoch": 2.0109215017064845, + "grad_norm": 3.03979754447937, + "learning_rate": 0.0003296928327645051, + "loss": 5.5397, + "step": 5892 + }, + { + "epoch": 2.0112627986348124, + "grad_norm": 3.0143215656280518, + "learning_rate": 0.0003295790671217293, + "loss": 6.1574, + "step": 5893 + }, + { + "epoch": 2.01160409556314, + "grad_norm": 2.9029927253723145, + "learning_rate": 0.0003294653014789534, + "loss": 6.3871, + "step": 5894 + }, + { + "epoch": 2.0119453924914676, + "grad_norm": 2.924941062927246, + "learning_rate": 0.0003293515358361775, + "loss": 5.6172, + "step": 5895 + }, + { + "epoch": 2.012286689419795, + "grad_norm": 2.9214353561401367, + "learning_rate": 0.00032923777019340164, + "loss": 6.5576, + "step": 5896 + }, + { + "epoch": 2.012627986348123, + "grad_norm": 2.9179751873016357, + "learning_rate": 0.00032912400455062575, + "loss": 6.3363, + "step": 5897 + }, + { + "epoch": 2.0129692832764503, + "grad_norm": 2.8579938411712646, + "learning_rate": 0.0003290102389078498, + "loss": 6.3428, + "step": 5898 + }, + { + "epoch": 2.013310580204778, + "grad_norm": 3.0229973793029785, + "learning_rate": 0.00032889647326507396, + "loss": 6.2619, + "step": 5899 + }, + { + "epoch": 2.013651877133106, + "grad_norm": 4.09330940246582, + "learning_rate": 0.00032878270762229806, + "loss": 4.4237, + "step": 5900 + }, + { + "epoch": 2.0139931740614334, + "grad_norm": 3.2335963249206543, + "learning_rate": 0.00032866894197952217, + "loss": 5.7218, + "step": 5901 + }, + { + "epoch": 2.0143344709897613, + "grad_norm": 3.3706767559051514, + "learning_rate": 0.0003285551763367463, + "loss": 5.6349, + "step": 5902 + }, + { + "epoch": 2.0146757679180887, + "grad_norm": 3.0019240379333496, + "learning_rate": 0.00032844141069397043, + "loss": 5.9354, + "step": 5903 + }, + { + "epoch": 2.0150170648464165, + "grad_norm": 2.8847649097442627, + "learning_rate": 0.00032832764505119454, + "loss": 6.571, + "step": 5904 + }, + { + "epoch": 2.015358361774744, + "grad_norm": 2.9488613605499268, + "learning_rate": 0.0003282138794084187, + "loss": 6.7779, + "step": 5905 + }, + { + "epoch": 2.0156996587030718, + "grad_norm": 2.824702739715576, + "learning_rate": 0.0003281001137656428, + "loss": 6.7856, + "step": 5906 + }, + { + "epoch": 2.016040955631399, + "grad_norm": 2.872587203979492, + "learning_rate": 0.00032798634812286685, + "loss": 6.1544, + "step": 5907 + }, + { + "epoch": 2.016382252559727, + "grad_norm": 3.073383092880249, + "learning_rate": 0.000327872582480091, + "loss": 6.0585, + "step": 5908 + }, + { + "epoch": 2.0167235494880544, + "grad_norm": 3.2500085830688477, + "learning_rate": 0.0003277588168373151, + "loss": 4.9903, + "step": 5909 + }, + { + "epoch": 2.0170648464163823, + "grad_norm": 3.061755895614624, + "learning_rate": 0.0003276450511945392, + "loss": 5.9263, + "step": 5910 + }, + { + "epoch": 2.0174061433447097, + "grad_norm": 3.07401442527771, + "learning_rate": 0.0003275312855517634, + "loss": 6.5287, + "step": 5911 + }, + { + "epoch": 2.0177474402730375, + "grad_norm": 2.897965908050537, + "learning_rate": 0.0003274175199089875, + "loss": 6.4134, + "step": 5912 + }, + { + "epoch": 2.0180887372013654, + "grad_norm": 3.0315048694610596, + "learning_rate": 0.00032730375426621164, + "loss": 5.2327, + "step": 5913 + }, + { + "epoch": 2.018430034129693, + "grad_norm": 2.9312515258789062, + "learning_rate": 0.00032718998862343575, + "loss": 5.4876, + "step": 5914 + }, + { + "epoch": 2.0187713310580206, + "grad_norm": 2.948835849761963, + "learning_rate": 0.00032707622298065985, + "loss": 6.1955, + "step": 5915 + }, + { + "epoch": 2.019112627986348, + "grad_norm": 2.9583792686462402, + "learning_rate": 0.000326962457337884, + "loss": 4.9185, + "step": 5916 + }, + { + "epoch": 2.019453924914676, + "grad_norm": 2.9608967304229736, + "learning_rate": 0.00032684869169510806, + "loss": 6.5537, + "step": 5917 + }, + { + "epoch": 2.0197952218430033, + "grad_norm": 2.8537795543670654, + "learning_rate": 0.00032673492605233217, + "loss": 6.518, + "step": 5918 + }, + { + "epoch": 2.020136518771331, + "grad_norm": 3.467559576034546, + "learning_rate": 0.0003266211604095563, + "loss": 5.4909, + "step": 5919 + }, + { + "epoch": 2.0204778156996586, + "grad_norm": 2.9610671997070312, + "learning_rate": 0.00032650739476678043, + "loss": 5.8352, + "step": 5920 + }, + { + "epoch": 2.0208191126279864, + "grad_norm": 2.9346728324890137, + "learning_rate": 0.00032639362912400454, + "loss": 5.7961, + "step": 5921 + }, + { + "epoch": 2.021160409556314, + "grad_norm": 3.0085277557373047, + "learning_rate": 0.0003262798634812287, + "loss": 5.5517, + "step": 5922 + }, + { + "epoch": 2.0215017064846417, + "grad_norm": 2.949371337890625, + "learning_rate": 0.0003261660978384528, + "loss": 6.364, + "step": 5923 + }, + { + "epoch": 2.021843003412969, + "grad_norm": 4.0263752937316895, + "learning_rate": 0.0003260523321956769, + "loss": 3.3965, + "step": 5924 + }, + { + "epoch": 2.022184300341297, + "grad_norm": 3.15136456489563, + "learning_rate": 0.00032593856655290107, + "loss": 5.744, + "step": 5925 + }, + { + "epoch": 2.0225255972696248, + "grad_norm": 2.9317257404327393, + "learning_rate": 0.00032582480091012517, + "loss": 6.2774, + "step": 5926 + }, + { + "epoch": 2.022866894197952, + "grad_norm": 2.901693344116211, + "learning_rate": 0.0003257110352673492, + "loss": 5.9736, + "step": 5927 + }, + { + "epoch": 2.02320819112628, + "grad_norm": 2.804474353790283, + "learning_rate": 0.0003255972696245734, + "loss": 6.2335, + "step": 5928 + }, + { + "epoch": 2.0235494880546074, + "grad_norm": 2.836078405380249, + "learning_rate": 0.0003254835039817975, + "loss": 6.5145, + "step": 5929 + }, + { + "epoch": 2.0238907849829353, + "grad_norm": 2.8914759159088135, + "learning_rate": 0.0003253697383390216, + "loss": 5.9104, + "step": 5930 + }, + { + "epoch": 2.0242320819112627, + "grad_norm": 3.001803398132324, + "learning_rate": 0.00032525597269624575, + "loss": 5.733, + "step": 5931 + }, + { + "epoch": 2.0245733788395905, + "grad_norm": 3.05202054977417, + "learning_rate": 0.00032514220705346985, + "loss": 5.7444, + "step": 5932 + }, + { + "epoch": 2.024914675767918, + "grad_norm": 2.8296566009521484, + "learning_rate": 0.000325028441410694, + "loss": 6.3294, + "step": 5933 + }, + { + "epoch": 2.025255972696246, + "grad_norm": 2.734096050262451, + "learning_rate": 0.0003249146757679181, + "loss": 6.2183, + "step": 5934 + }, + { + "epoch": 2.025597269624573, + "grad_norm": 2.8434300422668457, + "learning_rate": 0.0003248009101251422, + "loss": 5.6148, + "step": 5935 + }, + { + "epoch": 2.025938566552901, + "grad_norm": 3.7026021480560303, + "learning_rate": 0.0003246871444823664, + "loss": 5.4888, + "step": 5936 + }, + { + "epoch": 2.0262798634812285, + "grad_norm": 2.7599542140960693, + "learning_rate": 0.00032457337883959043, + "loss": 5.6935, + "step": 5937 + }, + { + "epoch": 2.0266211604095563, + "grad_norm": 4.715770244598389, + "learning_rate": 0.00032445961319681454, + "loss": 5.3177, + "step": 5938 + }, + { + "epoch": 2.026962457337884, + "grad_norm": 2.897782802581787, + "learning_rate": 0.0003243458475540387, + "loss": 5.7596, + "step": 5939 + }, + { + "epoch": 2.0273037542662116, + "grad_norm": 2.934124708175659, + "learning_rate": 0.0003242320819112628, + "loss": 6.1473, + "step": 5940 + }, + { + "epoch": 2.0276450511945394, + "grad_norm": 2.9181840419769287, + "learning_rate": 0.0003241183162684869, + "loss": 5.8151, + "step": 5941 + }, + { + "epoch": 2.027986348122867, + "grad_norm": 2.937746286392212, + "learning_rate": 0.00032400455062571107, + "loss": 5.9434, + "step": 5942 + }, + { + "epoch": 2.0283276450511947, + "grad_norm": 3.7255730628967285, + "learning_rate": 0.00032389078498293517, + "loss": 5.0394, + "step": 5943 + }, + { + "epoch": 2.028668941979522, + "grad_norm": 2.898202896118164, + "learning_rate": 0.0003237770193401593, + "loss": 5.7591, + "step": 5944 + }, + { + "epoch": 2.02901023890785, + "grad_norm": 3.671320915222168, + "learning_rate": 0.00032366325369738344, + "loss": 6.2918, + "step": 5945 + }, + { + "epoch": 2.0293515358361773, + "grad_norm": 2.9321327209472656, + "learning_rate": 0.0003235494880546075, + "loss": 6.0174, + "step": 5946 + }, + { + "epoch": 2.029692832764505, + "grad_norm": 2.786679744720459, + "learning_rate": 0.0003234357224118316, + "loss": 6.4303, + "step": 5947 + }, + { + "epoch": 2.0300341296928326, + "grad_norm": 2.862600088119507, + "learning_rate": 0.00032332195676905575, + "loss": 6.0229, + "step": 5948 + }, + { + "epoch": 2.0303754266211604, + "grad_norm": 2.9102251529693604, + "learning_rate": 0.00032320819112627985, + "loss": 5.6793, + "step": 5949 + }, + { + "epoch": 2.030716723549488, + "grad_norm": 2.8339219093322754, + "learning_rate": 0.00032309442548350396, + "loss": 6.1311, + "step": 5950 + }, + { + "epoch": 2.0310580204778157, + "grad_norm": 2.7801735401153564, + "learning_rate": 0.0003229806598407281, + "loss": 6.1714, + "step": 5951 + }, + { + "epoch": 2.0313993174061435, + "grad_norm": 2.792600631713867, + "learning_rate": 0.0003228668941979522, + "loss": 6.4515, + "step": 5952 + }, + { + "epoch": 2.031740614334471, + "grad_norm": 2.876671075820923, + "learning_rate": 0.0003227531285551764, + "loss": 6.3113, + "step": 5953 + }, + { + "epoch": 2.032081911262799, + "grad_norm": 2.9454519748687744, + "learning_rate": 0.0003226393629124005, + "loss": 6.5142, + "step": 5954 + }, + { + "epoch": 2.032423208191126, + "grad_norm": 2.833214044570923, + "learning_rate": 0.0003225255972696246, + "loss": 6.0907, + "step": 5955 + }, + { + "epoch": 2.032764505119454, + "grad_norm": 4.851685047149658, + "learning_rate": 0.0003224118316268487, + "loss": 5.22, + "step": 5956 + }, + { + "epoch": 2.0331058020477815, + "grad_norm": 3.4849393367767334, + "learning_rate": 0.0003222980659840728, + "loss": 5.0735, + "step": 5957 + }, + { + "epoch": 2.0334470989761093, + "grad_norm": 3.021411180496216, + "learning_rate": 0.0003221843003412969, + "loss": 6.2273, + "step": 5958 + }, + { + "epoch": 2.0337883959044367, + "grad_norm": 2.9004580974578857, + "learning_rate": 0.00032207053469852107, + "loss": 6.6881, + "step": 5959 + }, + { + "epoch": 2.0341296928327646, + "grad_norm": 2.9186768531799316, + "learning_rate": 0.00032195676905574517, + "loss": 5.6578, + "step": 5960 + }, + { + "epoch": 2.034470989761092, + "grad_norm": 2.9466514587402344, + "learning_rate": 0.0003218430034129693, + "loss": 6.1247, + "step": 5961 + }, + { + "epoch": 2.03481228668942, + "grad_norm": 2.9205827713012695, + "learning_rate": 0.00032172923777019344, + "loss": 5.8445, + "step": 5962 + }, + { + "epoch": 2.0351535836177472, + "grad_norm": 2.9399664402008057, + "learning_rate": 0.00032161547212741754, + "loss": 6.3061, + "step": 5963 + }, + { + "epoch": 2.035494880546075, + "grad_norm": 2.7923221588134766, + "learning_rate": 0.00032150170648464165, + "loss": 6.9318, + "step": 5964 + }, + { + "epoch": 2.035836177474403, + "grad_norm": 2.841721773147583, + "learning_rate": 0.0003213879408418658, + "loss": 6.2328, + "step": 5965 + }, + { + "epoch": 2.0361774744027303, + "grad_norm": 2.805899143218994, + "learning_rate": 0.00032127417519908986, + "loss": 5.0354, + "step": 5966 + }, + { + "epoch": 2.036518771331058, + "grad_norm": 2.8029048442840576, + "learning_rate": 0.00032116040955631396, + "loss": 6.3708, + "step": 5967 + }, + { + "epoch": 2.0368600682593856, + "grad_norm": 2.7926340103149414, + "learning_rate": 0.0003210466439135381, + "loss": 5.9584, + "step": 5968 + }, + { + "epoch": 2.0372013651877134, + "grad_norm": 2.774442672729492, + "learning_rate": 0.0003209328782707622, + "loss": 6.4848, + "step": 5969 + }, + { + "epoch": 2.037542662116041, + "grad_norm": 4.830069065093994, + "learning_rate": 0.00032081911262798633, + "loss": 5.8823, + "step": 5970 + }, + { + "epoch": 2.0378839590443687, + "grad_norm": 2.8296518325805664, + "learning_rate": 0.0003207053469852105, + "loss": 5.8528, + "step": 5971 + }, + { + "epoch": 2.038225255972696, + "grad_norm": 2.9755330085754395, + "learning_rate": 0.0003205915813424346, + "loss": 6.5889, + "step": 5972 + }, + { + "epoch": 2.038566552901024, + "grad_norm": 3.333840847015381, + "learning_rate": 0.0003204778156996587, + "loss": 5.3275, + "step": 5973 + }, + { + "epoch": 2.0389078498293514, + "grad_norm": 2.9385743141174316, + "learning_rate": 0.00032036405005688286, + "loss": 6.0905, + "step": 5974 + }, + { + "epoch": 2.039249146757679, + "grad_norm": 4.615128040313721, + "learning_rate": 0.00032025028441410696, + "loss": 5.379, + "step": 5975 + }, + { + "epoch": 2.0395904436860066, + "grad_norm": 2.938904047012329, + "learning_rate": 0.00032013651877133107, + "loss": 6.5808, + "step": 5976 + }, + { + "epoch": 2.0399317406143345, + "grad_norm": 2.9147865772247314, + "learning_rate": 0.00032002275312855517, + "loss": 6.7389, + "step": 5977 + }, + { + "epoch": 2.0402730375426623, + "grad_norm": 2.8320443630218506, + "learning_rate": 0.0003199089874857793, + "loss": 5.7683, + "step": 5978 + }, + { + "epoch": 2.0406143344709897, + "grad_norm": 3.8267099857330322, + "learning_rate": 0.00031979522184300344, + "loss": 4.3207, + "step": 5979 + }, + { + "epoch": 2.0409556313993176, + "grad_norm": 3.4486052989959717, + "learning_rate": 0.00031968145620022754, + "loss": 6.0473, + "step": 5980 + }, + { + "epoch": 2.041296928327645, + "grad_norm": 3.4830849170684814, + "learning_rate": 0.00031956769055745165, + "loss": 5.3791, + "step": 5981 + }, + { + "epoch": 2.041638225255973, + "grad_norm": 2.9314374923706055, + "learning_rate": 0.0003194539249146758, + "loss": 6.1228, + "step": 5982 + }, + { + "epoch": 2.0419795221843002, + "grad_norm": 3.42529034614563, + "learning_rate": 0.0003193401592718999, + "loss": 5.5453, + "step": 5983 + }, + { + "epoch": 2.042320819112628, + "grad_norm": 2.903790235519409, + "learning_rate": 0.000319226393629124, + "loss": 5.9758, + "step": 5984 + }, + { + "epoch": 2.0426621160409555, + "grad_norm": 3.009942054748535, + "learning_rate": 0.0003191126279863481, + "loss": 6.5988, + "step": 5985 + }, + { + "epoch": 2.0430034129692833, + "grad_norm": 2.9778008460998535, + "learning_rate": 0.0003189988623435722, + "loss": 6.0532, + "step": 5986 + }, + { + "epoch": 2.0433447098976107, + "grad_norm": 2.8316783905029297, + "learning_rate": 0.00031888509670079633, + "loss": 6.3487, + "step": 5987 + }, + { + "epoch": 2.0436860068259386, + "grad_norm": 2.837663173675537, + "learning_rate": 0.0003187713310580205, + "loss": 6.4316, + "step": 5988 + }, + { + "epoch": 2.044027303754266, + "grad_norm": 2.863551616668701, + "learning_rate": 0.0003186575654152446, + "loss": 5.7195, + "step": 5989 + }, + { + "epoch": 2.044368600682594, + "grad_norm": 2.814051389694214, + "learning_rate": 0.0003185437997724687, + "loss": 6.0491, + "step": 5990 + }, + { + "epoch": 2.0447098976109217, + "grad_norm": 2.8193931579589844, + "learning_rate": 0.00031843003412969286, + "loss": 6.4645, + "step": 5991 + }, + { + "epoch": 2.045051194539249, + "grad_norm": 2.9294307231903076, + "learning_rate": 0.00031831626848691696, + "loss": 5.8573, + "step": 5992 + }, + { + "epoch": 2.045392491467577, + "grad_norm": 2.906787395477295, + "learning_rate": 0.00031820250284414107, + "loss": 6.3565, + "step": 5993 + }, + { + "epoch": 2.0457337883959044, + "grad_norm": 3.021745443344116, + "learning_rate": 0.00031808873720136523, + "loss": 5.4691, + "step": 5994 + }, + { + "epoch": 2.046075085324232, + "grad_norm": 2.877673387527466, + "learning_rate": 0.0003179749715585893, + "loss": 5.5119, + "step": 5995 + }, + { + "epoch": 2.0464163822525596, + "grad_norm": 3.2106802463531494, + "learning_rate": 0.00031786120591581344, + "loss": 5.6146, + "step": 5996 + }, + { + "epoch": 2.0467576791808875, + "grad_norm": 2.90977144241333, + "learning_rate": 0.00031774744027303754, + "loss": 6.2368, + "step": 5997 + }, + { + "epoch": 2.047098976109215, + "grad_norm": 2.888395309448242, + "learning_rate": 0.00031763367463026165, + "loss": 6.3792, + "step": 5998 + }, + { + "epoch": 2.0474402730375427, + "grad_norm": 2.8118643760681152, + "learning_rate": 0.0003175199089874858, + "loss": 6.0576, + "step": 5999 + }, + { + "epoch": 2.04778156996587, + "grad_norm": 2.768461227416992, + "learning_rate": 0.0003174061433447099, + "loss": 6.3125, + "step": 6000 + }, + { + "epoch": 2.048122866894198, + "grad_norm": 2.7696850299835205, + "learning_rate": 0.000317292377701934, + "loss": 6.3775, + "step": 6001 + }, + { + "epoch": 2.0484641638225254, + "grad_norm": 2.7902398109436035, + "learning_rate": 0.0003171786120591582, + "loss": 6.0764, + "step": 6002 + }, + { + "epoch": 2.0488054607508532, + "grad_norm": 2.9154062271118164, + "learning_rate": 0.0003170648464163823, + "loss": 6.4892, + "step": 6003 + }, + { + "epoch": 2.049146757679181, + "grad_norm": 2.908606767654419, + "learning_rate": 0.0003169510807736064, + "loss": 6.5462, + "step": 6004 + }, + { + "epoch": 2.0494880546075085, + "grad_norm": 2.8876357078552246, + "learning_rate": 0.0003168373151308305, + "loss": 6.7508, + "step": 6005 + }, + { + "epoch": 2.0498293515358363, + "grad_norm": 2.836838960647583, + "learning_rate": 0.0003167235494880546, + "loss": 6.5657, + "step": 6006 + }, + { + "epoch": 2.0501706484641637, + "grad_norm": 2.8034183979034424, + "learning_rate": 0.0003166097838452787, + "loss": 6.3752, + "step": 6007 + }, + { + "epoch": 2.0505119453924916, + "grad_norm": 2.8830206394195557, + "learning_rate": 0.00031649601820250286, + "loss": 6.294, + "step": 6008 + }, + { + "epoch": 2.050853242320819, + "grad_norm": 2.811297655105591, + "learning_rate": 0.00031638225255972696, + "loss": 6.1428, + "step": 6009 + }, + { + "epoch": 2.051194539249147, + "grad_norm": 2.8806638717651367, + "learning_rate": 0.00031626848691695107, + "loss": 6.192, + "step": 6010 + }, + { + "epoch": 2.0515358361774743, + "grad_norm": 2.9297285079956055, + "learning_rate": 0.00031615472127417523, + "loss": 6.2655, + "step": 6011 + }, + { + "epoch": 2.051877133105802, + "grad_norm": 3.0114855766296387, + "learning_rate": 0.00031604095563139933, + "loss": 5.8824, + "step": 6012 + }, + { + "epoch": 2.0522184300341295, + "grad_norm": 3.1099727153778076, + "learning_rate": 0.00031592718998862344, + "loss": 5.416, + "step": 6013 + }, + { + "epoch": 2.0525597269624574, + "grad_norm": 2.8879382610321045, + "learning_rate": 0.00031581342434584754, + "loss": 5.822, + "step": 6014 + }, + { + "epoch": 2.0529010238907848, + "grad_norm": 2.9049463272094727, + "learning_rate": 0.00031569965870307165, + "loss": 5.4447, + "step": 6015 + }, + { + "epoch": 2.0532423208191126, + "grad_norm": 2.8603172302246094, + "learning_rate": 0.0003155858930602958, + "loss": 6.007, + "step": 6016 + }, + { + "epoch": 2.0535836177474405, + "grad_norm": 3.1179702281951904, + "learning_rate": 0.0003154721274175199, + "loss": 6.4848, + "step": 6017 + }, + { + "epoch": 2.053924914675768, + "grad_norm": 3.008000612258911, + "learning_rate": 0.000315358361774744, + "loss": 6.0294, + "step": 6018 + }, + { + "epoch": 2.0542662116040957, + "grad_norm": 2.8747920989990234, + "learning_rate": 0.0003152445961319682, + "loss": 6.0033, + "step": 6019 + }, + { + "epoch": 2.054607508532423, + "grad_norm": 2.8617520332336426, + "learning_rate": 0.0003151308304891923, + "loss": 6.4272, + "step": 6020 + }, + { + "epoch": 2.054948805460751, + "grad_norm": 2.861337900161743, + "learning_rate": 0.0003150170648464164, + "loss": 6.1443, + "step": 6021 + }, + { + "epoch": 2.0552901023890784, + "grad_norm": 2.852907180786133, + "learning_rate": 0.00031490329920364055, + "loss": 5.7435, + "step": 6022 + }, + { + "epoch": 2.0556313993174062, + "grad_norm": 3.9914369583129883, + "learning_rate": 0.00031478953356086465, + "loss": 5.5631, + "step": 6023 + }, + { + "epoch": 2.0559726962457336, + "grad_norm": 3.1209657192230225, + "learning_rate": 0.0003146757679180887, + "loss": 5.8574, + "step": 6024 + }, + { + "epoch": 2.0563139931740615, + "grad_norm": 2.9207725524902344, + "learning_rate": 0.00031456200227531286, + "loss": 6.2952, + "step": 6025 + }, + { + "epoch": 2.056655290102389, + "grad_norm": 3.0054728984832764, + "learning_rate": 0.00031444823663253697, + "loss": 5.3445, + "step": 6026 + }, + { + "epoch": 2.0569965870307167, + "grad_norm": 3.291370391845703, + "learning_rate": 0.00031433447098976107, + "loss": 4.9664, + "step": 6027 + }, + { + "epoch": 2.057337883959044, + "grad_norm": 2.8859901428222656, + "learning_rate": 0.00031422070534698523, + "loss": 5.9449, + "step": 6028 + }, + { + "epoch": 2.057679180887372, + "grad_norm": 3.0395874977111816, + "learning_rate": 0.00031410693970420933, + "loss": 6.0376, + "step": 6029 + }, + { + "epoch": 2.0580204778157, + "grad_norm": 2.9008989334106445, + "learning_rate": 0.00031399317406143344, + "loss": 6.4105, + "step": 6030 + }, + { + "epoch": 2.0583617747440273, + "grad_norm": 2.9411637783050537, + "learning_rate": 0.0003138794084186576, + "loss": 5.6366, + "step": 6031 + }, + { + "epoch": 2.058703071672355, + "grad_norm": 2.858750104904175, + "learning_rate": 0.0003137656427758817, + "loss": 6.5325, + "step": 6032 + }, + { + "epoch": 2.0590443686006825, + "grad_norm": 3.046865701675415, + "learning_rate": 0.0003136518771331058, + "loss": 6.317, + "step": 6033 + }, + { + "epoch": 2.0593856655290104, + "grad_norm": 2.996433734893799, + "learning_rate": 0.0003135381114903299, + "loss": 6.144, + "step": 6034 + }, + { + "epoch": 2.0597269624573378, + "grad_norm": 2.8240692615509033, + "learning_rate": 0.000313424345847554, + "loss": 6.3732, + "step": 6035 + }, + { + "epoch": 2.0600682593856656, + "grad_norm": 2.915379047393799, + "learning_rate": 0.0003133105802047782, + "loss": 6.1081, + "step": 6036 + }, + { + "epoch": 2.060409556313993, + "grad_norm": 6.734116077423096, + "learning_rate": 0.0003131968145620023, + "loss": 4.9619, + "step": 6037 + }, + { + "epoch": 2.060750853242321, + "grad_norm": 2.909330129623413, + "learning_rate": 0.0003130830489192264, + "loss": 6.0949, + "step": 6038 + }, + { + "epoch": 2.0610921501706483, + "grad_norm": 2.775883197784424, + "learning_rate": 0.00031296928327645055, + "loss": 6.0901, + "step": 6039 + }, + { + "epoch": 2.061433447098976, + "grad_norm": 3.041524648666382, + "learning_rate": 0.00031285551763367465, + "loss": 6.8417, + "step": 6040 + }, + { + "epoch": 2.0617747440273035, + "grad_norm": 2.883164882659912, + "learning_rate": 0.00031274175199089876, + "loss": 6.3803, + "step": 6041 + }, + { + "epoch": 2.0621160409556314, + "grad_norm": 3.1183626651763916, + "learning_rate": 0.0003126279863481229, + "loss": 6.1928, + "step": 6042 + }, + { + "epoch": 2.0624573378839592, + "grad_norm": 2.8333499431610107, + "learning_rate": 0.000312514220705347, + "loss": 6.2385, + "step": 6043 + }, + { + "epoch": 2.0627986348122866, + "grad_norm": 2.8665261268615723, + "learning_rate": 0.00031240045506257107, + "loss": 6.8755, + "step": 6044 + }, + { + "epoch": 2.0631399317406145, + "grad_norm": 4.331704139709473, + "learning_rate": 0.00031228668941979523, + "loss": 4.8018, + "step": 6045 + }, + { + "epoch": 2.063481228668942, + "grad_norm": 2.9394123554229736, + "learning_rate": 0.00031217292377701934, + "loss": 6.2594, + "step": 6046 + }, + { + "epoch": 2.0638225255972698, + "grad_norm": 2.87528395652771, + "learning_rate": 0.00031205915813424344, + "loss": 6.0032, + "step": 6047 + }, + { + "epoch": 2.064163822525597, + "grad_norm": 2.8144853115081787, + "learning_rate": 0.0003119453924914676, + "loss": 6.0189, + "step": 6048 + }, + { + "epoch": 2.064505119453925, + "grad_norm": 4.7918243408203125, + "learning_rate": 0.0003118316268486917, + "loss": 4.9166, + "step": 6049 + }, + { + "epoch": 2.0648464163822524, + "grad_norm": 2.0117385387420654, + "learning_rate": 0.0003117178612059158, + "loss": 3.1662, + "step": 6050 + }, + { + "epoch": 2.0651877133105803, + "grad_norm": 2.930635690689087, + "learning_rate": 0.00031160409556313997, + "loss": 6.7214, + "step": 6051 + }, + { + "epoch": 2.0655290102389077, + "grad_norm": 2.864907741546631, + "learning_rate": 0.0003114903299203641, + "loss": 6.3194, + "step": 6052 + }, + { + "epoch": 2.0658703071672355, + "grad_norm": 2.9267427921295166, + "learning_rate": 0.0003113765642775881, + "loss": 6.393, + "step": 6053 + }, + { + "epoch": 2.066211604095563, + "grad_norm": 2.9477977752685547, + "learning_rate": 0.0003112627986348123, + "loss": 6.4259, + "step": 6054 + }, + { + "epoch": 2.0665529010238908, + "grad_norm": 5.001644611358643, + "learning_rate": 0.0003111490329920364, + "loss": 4.6699, + "step": 6055 + }, + { + "epoch": 2.0668941979522186, + "grad_norm": 2.8814961910247803, + "learning_rate": 0.0003110352673492605, + "loss": 6.1531, + "step": 6056 + }, + { + "epoch": 2.067235494880546, + "grad_norm": 2.9188053607940674, + "learning_rate": 0.00031092150170648465, + "loss": 5.8409, + "step": 6057 + }, + { + "epoch": 2.067576791808874, + "grad_norm": 2.8715567588806152, + "learning_rate": 0.00031080773606370876, + "loss": 5.4793, + "step": 6058 + }, + { + "epoch": 2.0679180887372013, + "grad_norm": 9.393294334411621, + "learning_rate": 0.0003106939704209329, + "loss": 4.9354, + "step": 6059 + }, + { + "epoch": 2.068259385665529, + "grad_norm": 2.8406503200531006, + "learning_rate": 0.000310580204778157, + "loss": 5.92, + "step": 6060 + }, + { + "epoch": 2.0686006825938565, + "grad_norm": 3.1945927143096924, + "learning_rate": 0.0003104664391353811, + "loss": 5.5444, + "step": 6061 + }, + { + "epoch": 2.0689419795221844, + "grad_norm": 2.9627017974853516, + "learning_rate": 0.0003103526734926053, + "loss": 6.2811, + "step": 6062 + }, + { + "epoch": 2.069283276450512, + "grad_norm": 2.85520076751709, + "learning_rate": 0.00031023890784982934, + "loss": 6.3404, + "step": 6063 + }, + { + "epoch": 2.0696245733788396, + "grad_norm": 3.261565685272217, + "learning_rate": 0.00031012514220705344, + "loss": 5.8875, + "step": 6064 + }, + { + "epoch": 2.069965870307167, + "grad_norm": 2.8483152389526367, + "learning_rate": 0.0003100113765642776, + "loss": 6.4443, + "step": 6065 + }, + { + "epoch": 2.070307167235495, + "grad_norm": 2.9961583614349365, + "learning_rate": 0.0003098976109215017, + "loss": 5.5248, + "step": 6066 + }, + { + "epoch": 2.0706484641638223, + "grad_norm": 2.947376251220703, + "learning_rate": 0.0003097838452787258, + "loss": 5.3489, + "step": 6067 + }, + { + "epoch": 2.07098976109215, + "grad_norm": 2.8761508464813232, + "learning_rate": 0.00030967007963594997, + "loss": 5.4662, + "step": 6068 + }, + { + "epoch": 2.071331058020478, + "grad_norm": 2.8893637657165527, + "learning_rate": 0.0003095563139931741, + "loss": 5.9248, + "step": 6069 + }, + { + "epoch": 2.0716723549488054, + "grad_norm": 2.9129505157470703, + "learning_rate": 0.0003094425483503982, + "loss": 6.376, + "step": 6070 + }, + { + "epoch": 2.0720136518771333, + "grad_norm": 4.613974094390869, + "learning_rate": 0.00030932878270762234, + "loss": 5.5792, + "step": 6071 + }, + { + "epoch": 2.0723549488054607, + "grad_norm": 2.938722610473633, + "learning_rate": 0.00030921501706484644, + "loss": 6.8475, + "step": 6072 + }, + { + "epoch": 2.0726962457337885, + "grad_norm": 2.8759310245513916, + "learning_rate": 0.0003091012514220705, + "loss": 6.688, + "step": 6073 + }, + { + "epoch": 2.073037542662116, + "grad_norm": 2.9209375381469727, + "learning_rate": 0.00030898748577929465, + "loss": 6.3908, + "step": 6074 + }, + { + "epoch": 2.073378839590444, + "grad_norm": 2.9436604976654053, + "learning_rate": 0.00030887372013651876, + "loss": 6.0059, + "step": 6075 + }, + { + "epoch": 2.073720136518771, + "grad_norm": 3.093762159347534, + "learning_rate": 0.00030875995449374286, + "loss": 5.5346, + "step": 6076 + }, + { + "epoch": 2.074061433447099, + "grad_norm": 2.913107395172119, + "learning_rate": 0.000308646188850967, + "loss": 6.3079, + "step": 6077 + }, + { + "epoch": 2.0744027303754264, + "grad_norm": 2.8740787506103516, + "learning_rate": 0.00030853242320819113, + "loss": 5.9121, + "step": 6078 + }, + { + "epoch": 2.0747440273037543, + "grad_norm": 3.232553243637085, + "learning_rate": 0.0003084186575654153, + "loss": 5.6589, + "step": 6079 + }, + { + "epoch": 2.0750853242320817, + "grad_norm": 2.818389654159546, + "learning_rate": 0.0003083048919226394, + "loss": 6.1773, + "step": 6080 + }, + { + "epoch": 2.0754266211604095, + "grad_norm": 2.880063533782959, + "learning_rate": 0.0003081911262798635, + "loss": 6.005, + "step": 6081 + }, + { + "epoch": 2.0757679180887374, + "grad_norm": 3.2499842643737793, + "learning_rate": 0.00030807736063708766, + "loss": 6.1325, + "step": 6082 + }, + { + "epoch": 2.076109215017065, + "grad_norm": 2.889770746231079, + "learning_rate": 0.0003079635949943117, + "loss": 5.9898, + "step": 6083 + }, + { + "epoch": 2.0764505119453927, + "grad_norm": 2.825451612472534, + "learning_rate": 0.0003078498293515358, + "loss": 6.3039, + "step": 6084 + }, + { + "epoch": 2.07679180887372, + "grad_norm": 3.2816312313079834, + "learning_rate": 0.00030773606370875997, + "loss": 4.5501, + "step": 6085 + }, + { + "epoch": 2.077133105802048, + "grad_norm": 2.836883068084717, + "learning_rate": 0.0003076222980659841, + "loss": 6.3832, + "step": 6086 + }, + { + "epoch": 2.0774744027303753, + "grad_norm": 2.905423879623413, + "learning_rate": 0.0003075085324232082, + "loss": 6.6088, + "step": 6087 + }, + { + "epoch": 2.077815699658703, + "grad_norm": 3.0025651454925537, + "learning_rate": 0.00030739476678043234, + "loss": 6.6776, + "step": 6088 + }, + { + "epoch": 2.0781569965870306, + "grad_norm": 2.935211181640625, + "learning_rate": 0.00030728100113765644, + "loss": 5.9203, + "step": 6089 + }, + { + "epoch": 2.0784982935153584, + "grad_norm": 3.124129295349121, + "learning_rate": 0.00030716723549488055, + "loss": 5.9244, + "step": 6090 + }, + { + "epoch": 2.078839590443686, + "grad_norm": 2.805335283279419, + "learning_rate": 0.0003070534698521047, + "loss": 6.3963, + "step": 6091 + }, + { + "epoch": 2.0791808873720137, + "grad_norm": 2.806159496307373, + "learning_rate": 0.00030693970420932876, + "loss": 6.1115, + "step": 6092 + }, + { + "epoch": 2.079522184300341, + "grad_norm": 2.8614501953125, + "learning_rate": 0.00030682593856655286, + "loss": 6.4517, + "step": 6093 + }, + { + "epoch": 2.079863481228669, + "grad_norm": 2.736070394515991, + "learning_rate": 0.000306712172923777, + "loss": 6.4008, + "step": 6094 + }, + { + "epoch": 2.080204778156997, + "grad_norm": 2.861682176589966, + "learning_rate": 0.00030659840728100113, + "loss": 6.0243, + "step": 6095 + }, + { + "epoch": 2.080546075085324, + "grad_norm": 2.79305100440979, + "learning_rate": 0.00030648464163822523, + "loss": 5.9656, + "step": 6096 + }, + { + "epoch": 2.080887372013652, + "grad_norm": 2.781764030456543, + "learning_rate": 0.0003063708759954494, + "loss": 5.8456, + "step": 6097 + }, + { + "epoch": 2.0812286689419794, + "grad_norm": 3.013824224472046, + "learning_rate": 0.0003062571103526735, + "loss": 6.0931, + "step": 6098 + }, + { + "epoch": 2.0815699658703073, + "grad_norm": 2.9550583362579346, + "learning_rate": 0.00030614334470989766, + "loss": 6.5948, + "step": 6099 + }, + { + "epoch": 2.0819112627986347, + "grad_norm": 2.903958320617676, + "learning_rate": 0.00030602957906712176, + "loss": 6.3639, + "step": 6100 + }, + { + "epoch": 2.0822525597269625, + "grad_norm": 2.8269143104553223, + "learning_rate": 0.00030591581342434587, + "loss": 6.0978, + "step": 6101 + }, + { + "epoch": 2.08259385665529, + "grad_norm": 2.9017698764801025, + "learning_rate": 0.00030580204778156997, + "loss": 6.2451, + "step": 6102 + }, + { + "epoch": 2.082935153583618, + "grad_norm": 2.8492634296417236, + "learning_rate": 0.0003056882821387941, + "loss": 6.0594, + "step": 6103 + }, + { + "epoch": 2.083276450511945, + "grad_norm": 2.8176167011260986, + "learning_rate": 0.0003055745164960182, + "loss": 6.4956, + "step": 6104 + }, + { + "epoch": 2.083617747440273, + "grad_norm": 2.8858425617218018, + "learning_rate": 0.00030546075085324234, + "loss": 6.1588, + "step": 6105 + }, + { + "epoch": 2.0839590443686005, + "grad_norm": 2.932605266571045, + "learning_rate": 0.00030534698521046645, + "loss": 5.9801, + "step": 6106 + }, + { + "epoch": 2.0843003412969283, + "grad_norm": 2.9948959350585938, + "learning_rate": 0.00030523321956769055, + "loss": 6.1619, + "step": 6107 + }, + { + "epoch": 2.084641638225256, + "grad_norm": 2.8777294158935547, + "learning_rate": 0.0003051194539249147, + "loss": 5.9433, + "step": 6108 + }, + { + "epoch": 2.0849829351535836, + "grad_norm": 4.565830707550049, + "learning_rate": 0.0003050056882821388, + "loss": 5.8684, + "step": 6109 + }, + { + "epoch": 2.0853242320819114, + "grad_norm": 2.942716360092163, + "learning_rate": 0.0003048919226393629, + "loss": 6.3863, + "step": 6110 + }, + { + "epoch": 2.085665529010239, + "grad_norm": 3.1933271884918213, + "learning_rate": 0.0003047781569965871, + "loss": 4.8312, + "step": 6111 + }, + { + "epoch": 2.0860068259385667, + "grad_norm": 2.8330047130584717, + "learning_rate": 0.00030466439135381113, + "loss": 6.6125, + "step": 6112 + }, + { + "epoch": 2.086348122866894, + "grad_norm": 2.899695873260498, + "learning_rate": 0.00030455062571103523, + "loss": 5.9607, + "step": 6113 + }, + { + "epoch": 2.086689419795222, + "grad_norm": 2.945712089538574, + "learning_rate": 0.0003044368600682594, + "loss": 6.2117, + "step": 6114 + }, + { + "epoch": 2.0870307167235493, + "grad_norm": 3.072864532470703, + "learning_rate": 0.0003043230944254835, + "loss": 5.743, + "step": 6115 + }, + { + "epoch": 2.087372013651877, + "grad_norm": 2.916473865509033, + "learning_rate": 0.0003042093287827076, + "loss": 6.4488, + "step": 6116 + }, + { + "epoch": 2.0877133105802046, + "grad_norm": 2.8467938899993896, + "learning_rate": 0.00030409556313993176, + "loss": 6.1196, + "step": 6117 + }, + { + "epoch": 2.0880546075085324, + "grad_norm": 2.8041112422943115, + "learning_rate": 0.00030398179749715587, + "loss": 6.3867, + "step": 6118 + }, + { + "epoch": 2.08839590443686, + "grad_norm": 2.6718533039093018, + "learning_rate": 0.00030386803185438, + "loss": 6.1606, + "step": 6119 + }, + { + "epoch": 2.0887372013651877, + "grad_norm": 3.134387254714966, + "learning_rate": 0.00030375426621160413, + "loss": 5.3715, + "step": 6120 + }, + { + "epoch": 2.0890784982935156, + "grad_norm": 2.8396830558776855, + "learning_rate": 0.0003036405005688282, + "loss": 6.6257, + "step": 6121 + }, + { + "epoch": 2.089419795221843, + "grad_norm": 2.89739990234375, + "learning_rate": 0.00030352673492605234, + "loss": 6.4547, + "step": 6122 + }, + { + "epoch": 2.089761092150171, + "grad_norm": 2.870203971862793, + "learning_rate": 0.00030341296928327645, + "loss": 5.6783, + "step": 6123 + }, + { + "epoch": 2.090102389078498, + "grad_norm": 6.795978546142578, + "learning_rate": 0.00030329920364050055, + "loss": 4.8163, + "step": 6124 + }, + { + "epoch": 2.090443686006826, + "grad_norm": 2.864149570465088, + "learning_rate": 0.0003031854379977247, + "loss": 6.3274, + "step": 6125 + }, + { + "epoch": 2.0907849829351535, + "grad_norm": 2.8654708862304688, + "learning_rate": 0.0003030716723549488, + "loss": 6.3438, + "step": 6126 + }, + { + "epoch": 2.0911262798634813, + "grad_norm": 2.834479331970215, + "learning_rate": 0.0003029579067121729, + "loss": 6.1642, + "step": 6127 + }, + { + "epoch": 2.0914675767918087, + "grad_norm": 2.9898204803466797, + "learning_rate": 0.0003028441410693971, + "loss": 6.4757, + "step": 6128 + }, + { + "epoch": 2.0918088737201366, + "grad_norm": 2.8628549575805664, + "learning_rate": 0.0003027303754266212, + "loss": 6.5145, + "step": 6129 + }, + { + "epoch": 2.092150170648464, + "grad_norm": 2.945585012435913, + "learning_rate": 0.0003026166097838453, + "loss": 6.4056, + "step": 6130 + }, + { + "epoch": 2.092491467576792, + "grad_norm": 2.85774564743042, + "learning_rate": 0.0003025028441410694, + "loss": 5.6863, + "step": 6131 + }, + { + "epoch": 2.0928327645051192, + "grad_norm": 3.0979244709014893, + "learning_rate": 0.0003023890784982935, + "loss": 6.1059, + "step": 6132 + }, + { + "epoch": 2.093174061433447, + "grad_norm": 2.8489980697631836, + "learning_rate": 0.0003022753128555176, + "loss": 5.884, + "step": 6133 + }, + { + "epoch": 2.093515358361775, + "grad_norm": 2.7917068004608154, + "learning_rate": 0.00030216154721274176, + "loss": 5.8435, + "step": 6134 + }, + { + "epoch": 2.0938566552901023, + "grad_norm": 3.5278565883636475, + "learning_rate": 0.00030204778156996587, + "loss": 3.6562, + "step": 6135 + }, + { + "epoch": 2.09419795221843, + "grad_norm": 2.9529824256896973, + "learning_rate": 0.00030193401592718997, + "loss": 5.6781, + "step": 6136 + }, + { + "epoch": 2.0945392491467576, + "grad_norm": 7.044441223144531, + "learning_rate": 0.00030182025028441413, + "loss": 6.3167, + "step": 6137 + }, + { + "epoch": 2.0948805460750854, + "grad_norm": 3.1856296062469482, + "learning_rate": 0.00030170648464163824, + "loss": 4.6697, + "step": 6138 + }, + { + "epoch": 2.095221843003413, + "grad_norm": 2.9588096141815186, + "learning_rate": 0.0003015927189988624, + "loss": 6.2748, + "step": 6139 + }, + { + "epoch": 2.0955631399317407, + "grad_norm": 3.0708155632019043, + "learning_rate": 0.0003014789533560865, + "loss": 6.2729, + "step": 6140 + }, + { + "epoch": 2.095904436860068, + "grad_norm": 2.921039342880249, + "learning_rate": 0.00030136518771331055, + "loss": 5.9834, + "step": 6141 + }, + { + "epoch": 2.096245733788396, + "grad_norm": 2.7934975624084473, + "learning_rate": 0.0003012514220705347, + "loss": 5.9395, + "step": 6142 + }, + { + "epoch": 2.0965870307167234, + "grad_norm": 2.899808645248413, + "learning_rate": 0.0003011376564277588, + "loss": 6.1141, + "step": 6143 + }, + { + "epoch": 2.096928327645051, + "grad_norm": 2.8703773021698, + "learning_rate": 0.0003010238907849829, + "loss": 6.3106, + "step": 6144 + }, + { + "epoch": 2.0972696245733786, + "grad_norm": 2.8744289875030518, + "learning_rate": 0.0003009101251422071, + "loss": 6.4198, + "step": 6145 + }, + { + "epoch": 2.0976109215017065, + "grad_norm": 2.801576852798462, + "learning_rate": 0.0003007963594994312, + "loss": 6.0057, + "step": 6146 + }, + { + "epoch": 2.0979522184300343, + "grad_norm": 2.772566080093384, + "learning_rate": 0.0003006825938566553, + "loss": 5.799, + "step": 6147 + }, + { + "epoch": 2.0982935153583617, + "grad_norm": 2.8693549633026123, + "learning_rate": 0.00030056882821387945, + "loss": 6.134, + "step": 6148 + }, + { + "epoch": 2.0986348122866896, + "grad_norm": 1.9791274070739746, + "learning_rate": 0.00030045506257110355, + "loss": 3.122, + "step": 6149 + }, + { + "epoch": 2.098976109215017, + "grad_norm": 2.8072032928466797, + "learning_rate": 0.00030034129692832766, + "loss": 5.8741, + "step": 6150 + }, + { + "epoch": 2.099317406143345, + "grad_norm": 2.8439536094665527, + "learning_rate": 0.00030022753128555176, + "loss": 6.4318, + "step": 6151 + }, + { + "epoch": 2.0996587030716722, + "grad_norm": 2.8671748638153076, + "learning_rate": 0.00030011376564277587, + "loss": 6.1086, + "step": 6152 + }, + { + "epoch": 2.1, + "grad_norm": 2.7963056564331055, + "learning_rate": 0.0003, + "loss": 6.0888, + "step": 6153 + }, + { + "epoch": 2.1003412969283275, + "grad_norm": 2.9660708904266357, + "learning_rate": 0.00029988623435722413, + "loss": 6.0207, + "step": 6154 + }, + { + "epoch": 2.1006825938566553, + "grad_norm": 2.9538464546203613, + "learning_rate": 0.00029977246871444824, + "loss": 6.5961, + "step": 6155 + }, + { + "epoch": 2.1010238907849828, + "grad_norm": 2.916219472885132, + "learning_rate": 0.00029965870307167234, + "loss": 6.4961, + "step": 6156 + }, + { + "epoch": 2.1013651877133106, + "grad_norm": 3.0762720108032227, + "learning_rate": 0.0002995449374288965, + "loss": 5.8819, + "step": 6157 + }, + { + "epoch": 2.101706484641638, + "grad_norm": 2.9753077030181885, + "learning_rate": 0.0002994311717861206, + "loss": 6.0732, + "step": 6158 + }, + { + "epoch": 2.102047781569966, + "grad_norm": 3.163635492324829, + "learning_rate": 0.00029931740614334477, + "loss": 5.752, + "step": 6159 + }, + { + "epoch": 2.1023890784982937, + "grad_norm": 2.8499581813812256, + "learning_rate": 0.0002992036405005688, + "loss": 6.1047, + "step": 6160 + }, + { + "epoch": 2.102730375426621, + "grad_norm": 2.870997190475464, + "learning_rate": 0.0002990898748577929, + "loss": 6.2264, + "step": 6161 + }, + { + "epoch": 2.103071672354949, + "grad_norm": 2.834073305130005, + "learning_rate": 0.0002989761092150171, + "loss": 6.223, + "step": 6162 + }, + { + "epoch": 2.1034129692832764, + "grad_norm": 2.8211171627044678, + "learning_rate": 0.0002988623435722412, + "loss": 6.8218, + "step": 6163 + }, + { + "epoch": 2.103754266211604, + "grad_norm": 2.9839437007904053, + "learning_rate": 0.0002987485779294653, + "loss": 6.177, + "step": 6164 + }, + { + "epoch": 2.1040955631399316, + "grad_norm": 2.881976366043091, + "learning_rate": 0.00029863481228668945, + "loss": 6.3719, + "step": 6165 + }, + { + "epoch": 2.1044368600682595, + "grad_norm": 2.873582601547241, + "learning_rate": 0.00029852104664391355, + "loss": 6.6528, + "step": 6166 + }, + { + "epoch": 2.104778156996587, + "grad_norm": 2.877270460128784, + "learning_rate": 0.00029840728100113766, + "loss": 6.2307, + "step": 6167 + }, + { + "epoch": 2.1051194539249147, + "grad_norm": 2.885765314102173, + "learning_rate": 0.0002982935153583618, + "loss": 6.1185, + "step": 6168 + }, + { + "epoch": 2.105460750853242, + "grad_norm": 2.835249662399292, + "learning_rate": 0.0002981797497155859, + "loss": 6.2276, + "step": 6169 + }, + { + "epoch": 2.10580204778157, + "grad_norm": 2.9260549545288086, + "learning_rate": 0.00029806598407281, + "loss": 6.3071, + "step": 6170 + }, + { + "epoch": 2.1061433447098974, + "grad_norm": 2.8237810134887695, + "learning_rate": 0.00029795221843003413, + "loss": 6.1465, + "step": 6171 + }, + { + "epoch": 2.1064846416382252, + "grad_norm": 2.822054862976074, + "learning_rate": 0.00029783845278725824, + "loss": 5.8652, + "step": 6172 + }, + { + "epoch": 2.106825938566553, + "grad_norm": 2.801043748855591, + "learning_rate": 0.00029772468714448234, + "loss": 6.0659, + "step": 6173 + }, + { + "epoch": 2.1071672354948805, + "grad_norm": 2.7532236576080322, + "learning_rate": 0.0002976109215017065, + "loss": 6.1017, + "step": 6174 + }, + { + "epoch": 2.1075085324232083, + "grad_norm": 2.8876118659973145, + "learning_rate": 0.0002974971558589306, + "loss": 5.7315, + "step": 6175 + }, + { + "epoch": 2.1078498293515358, + "grad_norm": 2.9598159790039062, + "learning_rate": 0.0002973833902161547, + "loss": 6.4794, + "step": 6176 + }, + { + "epoch": 2.1081911262798636, + "grad_norm": 2.756242513656616, + "learning_rate": 0.00029726962457337887, + "loss": 6.6465, + "step": 6177 + }, + { + "epoch": 2.108532423208191, + "grad_norm": 3.125593662261963, + "learning_rate": 0.000297155858930603, + "loss": 6.2238, + "step": 6178 + }, + { + "epoch": 2.108873720136519, + "grad_norm": 2.907468557357788, + "learning_rate": 0.0002970420932878271, + "loss": 6.4868, + "step": 6179 + }, + { + "epoch": 2.1092150170648463, + "grad_norm": 4.4180755615234375, + "learning_rate": 0.0002969283276450512, + "loss": 5.2227, + "step": 6180 + }, + { + "epoch": 2.109556313993174, + "grad_norm": 2.873162269592285, + "learning_rate": 0.0002968145620022753, + "loss": 6.7799, + "step": 6181 + }, + { + "epoch": 2.1098976109215015, + "grad_norm": 2.8836138248443604, + "learning_rate": 0.00029670079635949945, + "loss": 5.8275, + "step": 6182 + }, + { + "epoch": 2.1102389078498294, + "grad_norm": 2.93564510345459, + "learning_rate": 0.00029658703071672356, + "loss": 5.3932, + "step": 6183 + }, + { + "epoch": 2.1105802047781568, + "grad_norm": 2.858703374862671, + "learning_rate": 0.00029647326507394766, + "loss": 6.2049, + "step": 6184 + }, + { + "epoch": 2.1109215017064846, + "grad_norm": 2.9091365337371826, + "learning_rate": 0.0002963594994311718, + "loss": 6.3892, + "step": 6185 + }, + { + "epoch": 2.1112627986348125, + "grad_norm": 2.96952486038208, + "learning_rate": 0.0002962457337883959, + "loss": 5.8229, + "step": 6186 + }, + { + "epoch": 2.11160409556314, + "grad_norm": 2.865198850631714, + "learning_rate": 0.00029613196814562003, + "loss": 6.4365, + "step": 6187 + }, + { + "epoch": 2.1119453924914677, + "grad_norm": 2.7699646949768066, + "learning_rate": 0.0002960182025028442, + "loss": 6.4912, + "step": 6188 + }, + { + "epoch": 2.112286689419795, + "grad_norm": 2.8436672687530518, + "learning_rate": 0.00029590443686006824, + "loss": 5.2912, + "step": 6189 + }, + { + "epoch": 2.112627986348123, + "grad_norm": 2.8417892456054688, + "learning_rate": 0.00029579067121729234, + "loss": 6.3945, + "step": 6190 + }, + { + "epoch": 2.1129692832764504, + "grad_norm": 6.457067966461182, + "learning_rate": 0.0002956769055745165, + "loss": 3.8569, + "step": 6191 + }, + { + "epoch": 2.1133105802047782, + "grad_norm": 4.360135555267334, + "learning_rate": 0.0002955631399317406, + "loss": 5.5622, + "step": 6192 + }, + { + "epoch": 2.1136518771331056, + "grad_norm": 3.0096099376678467, + "learning_rate": 0.0002954493742889647, + "loss": 5.8473, + "step": 6193 + }, + { + "epoch": 2.1139931740614335, + "grad_norm": 3.391237497329712, + "learning_rate": 0.00029533560864618887, + "loss": 5.4066, + "step": 6194 + }, + { + "epoch": 2.114334470989761, + "grad_norm": 2.9052274227142334, + "learning_rate": 0.000295221843003413, + "loss": 5.9522, + "step": 6195 + }, + { + "epoch": 2.1146757679180888, + "grad_norm": 2.8499536514282227, + "learning_rate": 0.0002951080773606371, + "loss": 5.3388, + "step": 6196 + }, + { + "epoch": 2.115017064846416, + "grad_norm": 2.8930346965789795, + "learning_rate": 0.00029499431171786124, + "loss": 6.1808, + "step": 6197 + }, + { + "epoch": 2.115358361774744, + "grad_norm": 8.893372535705566, + "learning_rate": 0.00029488054607508535, + "loss": 5.8041, + "step": 6198 + }, + { + "epoch": 2.115699658703072, + "grad_norm": 2.8673787117004395, + "learning_rate": 0.0002947667804323094, + "loss": 6.1321, + "step": 6199 + }, + { + "epoch": 2.1160409556313993, + "grad_norm": 2.806966543197632, + "learning_rate": 0.00029465301478953356, + "loss": 6.6198, + "step": 6200 + }, + { + "epoch": 2.116382252559727, + "grad_norm": 2.9241514205932617, + "learning_rate": 0.00029453924914675766, + "loss": 6.4543, + "step": 6201 + }, + { + "epoch": 2.1167235494880545, + "grad_norm": 2.939549684524536, + "learning_rate": 0.0002944254835039818, + "loss": 5.8664, + "step": 6202 + }, + { + "epoch": 2.1170648464163824, + "grad_norm": 2.817613124847412, + "learning_rate": 0.0002943117178612059, + "loss": 6.4852, + "step": 6203 + }, + { + "epoch": 2.11740614334471, + "grad_norm": 2.9055352210998535, + "learning_rate": 0.00029419795221843003, + "loss": 6.1691, + "step": 6204 + }, + { + "epoch": 2.1177474402730376, + "grad_norm": 2.756702184677124, + "learning_rate": 0.0002940841865756542, + "loss": 6.3137, + "step": 6205 + }, + { + "epoch": 2.118088737201365, + "grad_norm": 2.957275152206421, + "learning_rate": 0.0002939704209328783, + "loss": 6.5439, + "step": 6206 + }, + { + "epoch": 2.118430034129693, + "grad_norm": 2.8155031204223633, + "learning_rate": 0.0002938566552901024, + "loss": 6.0326, + "step": 6207 + }, + { + "epoch": 2.1187713310580203, + "grad_norm": 12.575434684753418, + "learning_rate": 0.00029374288964732656, + "loss": 4.9286, + "step": 6208 + }, + { + "epoch": 2.119112627986348, + "grad_norm": 5.587793350219727, + "learning_rate": 0.0002936291240045506, + "loss": 4.9998, + "step": 6209 + }, + { + "epoch": 2.1194539249146755, + "grad_norm": 3.0160374641418457, + "learning_rate": 0.0002935153583617747, + "loss": 5.9745, + "step": 6210 + }, + { + "epoch": 2.1197952218430034, + "grad_norm": 2.93703293800354, + "learning_rate": 0.0002934015927189989, + "loss": 6.4411, + "step": 6211 + }, + { + "epoch": 2.1201365187713312, + "grad_norm": 2.9282970428466797, + "learning_rate": 0.000293287827076223, + "loss": 6.1064, + "step": 6212 + }, + { + "epoch": 2.1204778156996587, + "grad_norm": 2.965182304382324, + "learning_rate": 0.0002931740614334471, + "loss": 6.5645, + "step": 6213 + }, + { + "epoch": 2.1208191126279865, + "grad_norm": 2.8125925064086914, + "learning_rate": 0.00029306029579067124, + "loss": 6.1774, + "step": 6214 + }, + { + "epoch": 2.121160409556314, + "grad_norm": 2.8314590454101562, + "learning_rate": 0.00029294653014789535, + "loss": 5.887, + "step": 6215 + }, + { + "epoch": 2.1215017064846418, + "grad_norm": 2.745131492614746, + "learning_rate": 0.00029283276450511945, + "loss": 6.5378, + "step": 6216 + }, + { + "epoch": 2.121843003412969, + "grad_norm": 2.8095803260803223, + "learning_rate": 0.0002927189988623436, + "loss": 6.3276, + "step": 6217 + }, + { + "epoch": 2.122184300341297, + "grad_norm": 2.9379096031188965, + "learning_rate": 0.0002926052332195677, + "loss": 6.4891, + "step": 6218 + }, + { + "epoch": 2.1225255972696244, + "grad_norm": 2.851707696914673, + "learning_rate": 0.00029249146757679177, + "loss": 6.2868, + "step": 6219 + }, + { + "epoch": 2.1228668941979523, + "grad_norm": 3.480109214782715, + "learning_rate": 0.0002923777019340159, + "loss": 5.4423, + "step": 6220 + }, + { + "epoch": 2.1232081911262797, + "grad_norm": 3.0105674266815186, + "learning_rate": 0.00029226393629124003, + "loss": 5.9214, + "step": 6221 + }, + { + "epoch": 2.1235494880546075, + "grad_norm": 2.8100273609161377, + "learning_rate": 0.0002921501706484642, + "loss": 6.3837, + "step": 6222 + }, + { + "epoch": 2.123890784982935, + "grad_norm": 2.886622190475464, + "learning_rate": 0.0002920364050056883, + "loss": 5.8483, + "step": 6223 + }, + { + "epoch": 2.124232081911263, + "grad_norm": 2.9184138774871826, + "learning_rate": 0.0002919226393629124, + "loss": 6.2556, + "step": 6224 + }, + { + "epoch": 2.1245733788395906, + "grad_norm": 3.3458776473999023, + "learning_rate": 0.00029180887372013656, + "loss": 4.7381, + "step": 6225 + }, + { + "epoch": 2.124914675767918, + "grad_norm": 2.833589553833008, + "learning_rate": 0.00029169510807736066, + "loss": 6.5108, + "step": 6226 + }, + { + "epoch": 2.125255972696246, + "grad_norm": 2.8403658866882324, + "learning_rate": 0.00029158134243458477, + "loss": 6.0265, + "step": 6227 + }, + { + "epoch": 2.1255972696245733, + "grad_norm": 2.9905974864959717, + "learning_rate": 0.0002914675767918089, + "loss": 5.9997, + "step": 6228 + }, + { + "epoch": 2.125938566552901, + "grad_norm": 3.724816083908081, + "learning_rate": 0.000291353811149033, + "loss": 5.0568, + "step": 6229 + }, + { + "epoch": 2.1262798634812285, + "grad_norm": 2.8078196048736572, + "learning_rate": 0.0002912400455062571, + "loss": 6.1329, + "step": 6230 + }, + { + "epoch": 2.1266211604095564, + "grad_norm": 2.8691749572753906, + "learning_rate": 0.00029112627986348124, + "loss": 6.2066, + "step": 6231 + }, + { + "epoch": 2.126962457337884, + "grad_norm": 2.830688953399658, + "learning_rate": 0.00029101251422070535, + "loss": 6.0778, + "step": 6232 + }, + { + "epoch": 2.1273037542662117, + "grad_norm": 2.865365505218506, + "learning_rate": 0.00029089874857792945, + "loss": 6.1605, + "step": 6233 + }, + { + "epoch": 2.127645051194539, + "grad_norm": 2.6853408813476562, + "learning_rate": 0.0002907849829351536, + "loss": 5.9814, + "step": 6234 + }, + { + "epoch": 2.127986348122867, + "grad_norm": 2.7538394927978516, + "learning_rate": 0.0002906712172923777, + "loss": 5.2563, + "step": 6235 + }, + { + "epoch": 2.1283276450511943, + "grad_norm": 3.2029075622558594, + "learning_rate": 0.0002905574516496018, + "loss": 5.537, + "step": 6236 + }, + { + "epoch": 2.128668941979522, + "grad_norm": 2.8655052185058594, + "learning_rate": 0.000290443686006826, + "loss": 6.3069, + "step": 6237 + }, + { + "epoch": 2.12901023890785, + "grad_norm": 2.836099624633789, + "learning_rate": 0.00029032992036405003, + "loss": 6.1336, + "step": 6238 + }, + { + "epoch": 2.1293515358361774, + "grad_norm": 2.8297688961029053, + "learning_rate": 0.00029021615472127414, + "loss": 5.9636, + "step": 6239 + }, + { + "epoch": 2.1296928327645053, + "grad_norm": 2.8413031101226807, + "learning_rate": 0.0002901023890784983, + "loss": 6.4108, + "step": 6240 + }, + { + "epoch": 2.1300341296928327, + "grad_norm": 2.8722383975982666, + "learning_rate": 0.0002899886234357224, + "loss": 6.5657, + "step": 6241 + }, + { + "epoch": 2.1303754266211605, + "grad_norm": 2.8211677074432373, + "learning_rate": 0.00028987485779294656, + "loss": 6.3421, + "step": 6242 + }, + { + "epoch": 2.130716723549488, + "grad_norm": 3.3281304836273193, + "learning_rate": 0.00028976109215017067, + "loss": 5.4648, + "step": 6243 + }, + { + "epoch": 2.131058020477816, + "grad_norm": 2.8957037925720215, + "learning_rate": 0.00028964732650739477, + "loss": 6.5093, + "step": 6244 + }, + { + "epoch": 2.131399317406143, + "grad_norm": 2.93902587890625, + "learning_rate": 0.00028953356086461893, + "loss": 6.1025, + "step": 6245 + }, + { + "epoch": 2.131740614334471, + "grad_norm": 2.877804756164551, + "learning_rate": 0.00028941979522184303, + "loss": 5.8682, + "step": 6246 + }, + { + "epoch": 2.1320819112627984, + "grad_norm": 2.8391692638397217, + "learning_rate": 0.00028930602957906714, + "loss": 6.1784, + "step": 6247 + }, + { + "epoch": 2.1324232081911263, + "grad_norm": 2.83154559135437, + "learning_rate": 0.00028919226393629124, + "loss": 6.0821, + "step": 6248 + }, + { + "epoch": 2.1327645051194537, + "grad_norm": 2.8575069904327393, + "learning_rate": 0.00028907849829351535, + "loss": 6.1532, + "step": 6249 + }, + { + "epoch": 2.1331058020477816, + "grad_norm": 2.8776323795318604, + "learning_rate": 0.00028896473265073945, + "loss": 6.1705, + "step": 6250 + }, + { + "epoch": 2.1334470989761094, + "grad_norm": 2.809744119644165, + "learning_rate": 0.0002888509670079636, + "loss": 5.6912, + "step": 6251 + }, + { + "epoch": 2.133788395904437, + "grad_norm": 2.8350303173065186, + "learning_rate": 0.0002887372013651877, + "loss": 6.6106, + "step": 6252 + }, + { + "epoch": 2.1341296928327647, + "grad_norm": 2.8744351863861084, + "learning_rate": 0.0002886234357224118, + "loss": 5.9893, + "step": 6253 + }, + { + "epoch": 2.134470989761092, + "grad_norm": 2.87178897857666, + "learning_rate": 0.000288509670079636, + "loss": 5.9063, + "step": 6254 + }, + { + "epoch": 2.13481228668942, + "grad_norm": 2.9190359115600586, + "learning_rate": 0.0002883959044368601, + "loss": 5.8424, + "step": 6255 + }, + { + "epoch": 2.1351535836177473, + "grad_norm": 2.8117921352386475, + "learning_rate": 0.0002882821387940842, + "loss": 6.3904, + "step": 6256 + }, + { + "epoch": 2.135494880546075, + "grad_norm": 2.7705960273742676, + "learning_rate": 0.00028816837315130835, + "loss": 6.1901, + "step": 6257 + }, + { + "epoch": 2.1358361774744026, + "grad_norm": 2.7972941398620605, + "learning_rate": 0.0002880546075085324, + "loss": 6.4878, + "step": 6258 + }, + { + "epoch": 2.1361774744027304, + "grad_norm": 2.915985345840454, + "learning_rate": 0.0002879408418657565, + "loss": 6.319, + "step": 6259 + }, + { + "epoch": 2.136518771331058, + "grad_norm": 3.0406363010406494, + "learning_rate": 0.00028782707622298067, + "loss": 5.3564, + "step": 6260 + }, + { + "epoch": 2.1368600682593857, + "grad_norm": 2.915806531906128, + "learning_rate": 0.00028771331058020477, + "loss": 6.8073, + "step": 6261 + }, + { + "epoch": 2.137201365187713, + "grad_norm": 2.840094804763794, + "learning_rate": 0.0002875995449374289, + "loss": 6.0616, + "step": 6262 + }, + { + "epoch": 2.137542662116041, + "grad_norm": 3.044546365737915, + "learning_rate": 0.00028748577929465304, + "loss": 5.9823, + "step": 6263 + }, + { + "epoch": 2.137883959044369, + "grad_norm": 2.8069345951080322, + "learning_rate": 0.00028737201365187714, + "loss": 5.8899, + "step": 6264 + }, + { + "epoch": 2.138225255972696, + "grad_norm": 2.8434646129608154, + "learning_rate": 0.0002872582480091013, + "loss": 6.471, + "step": 6265 + }, + { + "epoch": 2.138566552901024, + "grad_norm": 2.9248239994049072, + "learning_rate": 0.0002871444823663254, + "loss": 5.8759, + "step": 6266 + }, + { + "epoch": 2.1389078498293514, + "grad_norm": 2.838510036468506, + "learning_rate": 0.00028703071672354946, + "loss": 6.3492, + "step": 6267 + }, + { + "epoch": 2.1392491467576793, + "grad_norm": 2.9336483478546143, + "learning_rate": 0.0002869169510807736, + "loss": 6.0126, + "step": 6268 + }, + { + "epoch": 2.1395904436860067, + "grad_norm": 2.8124420642852783, + "learning_rate": 0.0002868031854379977, + "loss": 5.8763, + "step": 6269 + }, + { + "epoch": 2.1399317406143346, + "grad_norm": 2.821249485015869, + "learning_rate": 0.0002866894197952218, + "loss": 6.0869, + "step": 6270 + }, + { + "epoch": 2.140273037542662, + "grad_norm": 2.825366258621216, + "learning_rate": 0.000286575654152446, + "loss": 6.1345, + "step": 6271 + }, + { + "epoch": 2.14061433447099, + "grad_norm": 2.799079418182373, + "learning_rate": 0.0002864618885096701, + "loss": 5.919, + "step": 6272 + }, + { + "epoch": 2.140955631399317, + "grad_norm": 2.9337072372436523, + "learning_rate": 0.0002863481228668942, + "loss": 5.927, + "step": 6273 + }, + { + "epoch": 2.141296928327645, + "grad_norm": 2.8965418338775635, + "learning_rate": 0.00028623435722411835, + "loss": 6.1504, + "step": 6274 + }, + { + "epoch": 2.1416382252559725, + "grad_norm": 3.0523440837860107, + "learning_rate": 0.00028612059158134246, + "loss": 6.0062, + "step": 6275 + }, + { + "epoch": 2.1419795221843003, + "grad_norm": 2.986406087875366, + "learning_rate": 0.00028600682593856656, + "loss": 5.8823, + "step": 6276 + }, + { + "epoch": 2.142320819112628, + "grad_norm": 2.9808123111724854, + "learning_rate": 0.00028589306029579067, + "loss": 6.4025, + "step": 6277 + }, + { + "epoch": 2.1426621160409556, + "grad_norm": 2.6804819107055664, + "learning_rate": 0.00028577929465301477, + "loss": 5.7797, + "step": 6278 + }, + { + "epoch": 2.1430034129692834, + "grad_norm": 3.432440996170044, + "learning_rate": 0.0002856655290102389, + "loss": 4.9929, + "step": 6279 + }, + { + "epoch": 2.143344709897611, + "grad_norm": 3.000901222229004, + "learning_rate": 0.00028555176336746304, + "loss": 6.2861, + "step": 6280 + }, + { + "epoch": 2.1436860068259387, + "grad_norm": 2.8772642612457275, + "learning_rate": 0.00028543799772468714, + "loss": 6.4075, + "step": 6281 + }, + { + "epoch": 2.144027303754266, + "grad_norm": 2.7788705825805664, + "learning_rate": 0.00028532423208191125, + "loss": 4.92, + "step": 6282 + }, + { + "epoch": 2.144368600682594, + "grad_norm": 3.084559917449951, + "learning_rate": 0.0002852104664391354, + "loss": 5.9546, + "step": 6283 + }, + { + "epoch": 2.1447098976109213, + "grad_norm": 3.004912853240967, + "learning_rate": 0.0002850967007963595, + "loss": 6.2623, + "step": 6284 + }, + { + "epoch": 2.145051194539249, + "grad_norm": 4.325676441192627, + "learning_rate": 0.00028498293515358367, + "loss": 4.1026, + "step": 6285 + }, + { + "epoch": 2.1453924914675766, + "grad_norm": 2.822035551071167, + "learning_rate": 0.0002848691695108078, + "loss": 6.1718, + "step": 6286 + }, + { + "epoch": 2.1457337883959045, + "grad_norm": 3.587118148803711, + "learning_rate": 0.0002847554038680318, + "loss": 4.5978, + "step": 6287 + }, + { + "epoch": 2.146075085324232, + "grad_norm": 3.637235403060913, + "learning_rate": 0.000284641638225256, + "loss": 5.1437, + "step": 6288 + }, + { + "epoch": 2.1464163822525597, + "grad_norm": 3.1141090393066406, + "learning_rate": 0.0002845278725824801, + "loss": 4.2409, + "step": 6289 + }, + { + "epoch": 2.1467576791808876, + "grad_norm": 2.9125001430511475, + "learning_rate": 0.0002844141069397042, + "loss": 6.4584, + "step": 6290 + }, + { + "epoch": 2.147098976109215, + "grad_norm": 2.8799238204956055, + "learning_rate": 0.00028430034129692835, + "loss": 6.1453, + "step": 6291 + }, + { + "epoch": 2.147440273037543, + "grad_norm": 2.945450782775879, + "learning_rate": 0.00028418657565415246, + "loss": 5.9845, + "step": 6292 + }, + { + "epoch": 2.14778156996587, + "grad_norm": 2.812429428100586, + "learning_rate": 0.00028407281001137656, + "loss": 5.7118, + "step": 6293 + }, + { + "epoch": 2.148122866894198, + "grad_norm": 2.8083250522613525, + "learning_rate": 0.0002839590443686007, + "loss": 6.1992, + "step": 6294 + }, + { + "epoch": 2.1484641638225255, + "grad_norm": 2.8323144912719727, + "learning_rate": 0.00028384527872582483, + "loss": 6.9573, + "step": 6295 + }, + { + "epoch": 2.1488054607508533, + "grad_norm": 2.777940273284912, + "learning_rate": 0.0002837315130830489, + "loss": 5.7647, + "step": 6296 + }, + { + "epoch": 2.1491467576791807, + "grad_norm": 2.9180915355682373, + "learning_rate": 0.00028361774744027304, + "loss": 6.115, + "step": 6297 + }, + { + "epoch": 2.1494880546075086, + "grad_norm": 2.861795663833618, + "learning_rate": 0.00028350398179749714, + "loss": 5.9888, + "step": 6298 + }, + { + "epoch": 2.149829351535836, + "grad_norm": 2.804222822189331, + "learning_rate": 0.00028339021615472125, + "loss": 6.1902, + "step": 6299 + }, + { + "epoch": 2.150170648464164, + "grad_norm": 2.753507375717163, + "learning_rate": 0.0002832764505119454, + "loss": 6.6358, + "step": 6300 + }, + { + "epoch": 2.1505119453924912, + "grad_norm": 2.8499131202697754, + "learning_rate": 0.0002831626848691695, + "loss": 6.6611, + "step": 6301 + }, + { + "epoch": 2.150853242320819, + "grad_norm": 2.9446632862091064, + "learning_rate": 0.0002830489192263936, + "loss": 6.9951, + "step": 6302 + }, + { + "epoch": 2.151194539249147, + "grad_norm": 2.8921797275543213, + "learning_rate": 0.0002829351535836178, + "loss": 6.4307, + "step": 6303 + }, + { + "epoch": 2.1515358361774743, + "grad_norm": 2.776242971420288, + "learning_rate": 0.0002828213879408419, + "loss": 6.5389, + "step": 6304 + }, + { + "epoch": 2.151877133105802, + "grad_norm": 3.0470118522644043, + "learning_rate": 0.00028270762229806604, + "loss": 5.9021, + "step": 6305 + }, + { + "epoch": 2.1522184300341296, + "grad_norm": 2.870765209197998, + "learning_rate": 0.0002825938566552901, + "loss": 5.767, + "step": 6306 + }, + { + "epoch": 2.1525597269624575, + "grad_norm": 2.9112579822540283, + "learning_rate": 0.0002824800910125142, + "loss": 5.9856, + "step": 6307 + }, + { + "epoch": 2.152901023890785, + "grad_norm": 2.8619847297668457, + "learning_rate": 0.00028236632536973835, + "loss": 6.447, + "step": 6308 + }, + { + "epoch": 2.1532423208191127, + "grad_norm": 2.7564167976379395, + "learning_rate": 0.00028225255972696246, + "loss": 6.2936, + "step": 6309 + }, + { + "epoch": 2.15358361774744, + "grad_norm": 2.8190064430236816, + "learning_rate": 0.00028213879408418656, + "loss": 6.035, + "step": 6310 + }, + { + "epoch": 2.153924914675768, + "grad_norm": 2.4524943828582764, + "learning_rate": 0.0002820250284414107, + "loss": 3.2047, + "step": 6311 + }, + { + "epoch": 2.1542662116040954, + "grad_norm": 2.8662970066070557, + "learning_rate": 0.00028191126279863483, + "loss": 4.9013, + "step": 6312 + }, + { + "epoch": 2.154607508532423, + "grad_norm": 2.9255385398864746, + "learning_rate": 0.00028179749715585893, + "loss": 6.6828, + "step": 6313 + }, + { + "epoch": 2.1549488054607506, + "grad_norm": 2.917696475982666, + "learning_rate": 0.0002816837315130831, + "loss": 4.9657, + "step": 6314 + }, + { + "epoch": 2.1552901023890785, + "grad_norm": 2.835268259048462, + "learning_rate": 0.0002815699658703072, + "loss": 6.6023, + "step": 6315 + }, + { + "epoch": 2.1556313993174063, + "grad_norm": 2.958221435546875, + "learning_rate": 0.00028145620022753125, + "loss": 5.9026, + "step": 6316 + }, + { + "epoch": 2.1559726962457337, + "grad_norm": 2.8288214206695557, + "learning_rate": 0.0002813424345847554, + "loss": 5.8854, + "step": 6317 + }, + { + "epoch": 2.1563139931740616, + "grad_norm": 3.5853185653686523, + "learning_rate": 0.0002812286689419795, + "loss": 5.0054, + "step": 6318 + }, + { + "epoch": 2.156655290102389, + "grad_norm": 2.9272825717926025, + "learning_rate": 0.0002811149032992036, + "loss": 5.7341, + "step": 6319 + }, + { + "epoch": 2.156996587030717, + "grad_norm": 2.8925740718841553, + "learning_rate": 0.0002810011376564278, + "loss": 6.3701, + "step": 6320 + }, + { + "epoch": 2.1573378839590442, + "grad_norm": 2.798673152923584, + "learning_rate": 0.0002808873720136519, + "loss": 5.85, + "step": 6321 + }, + { + "epoch": 2.157679180887372, + "grad_norm": 2.7680532932281494, + "learning_rate": 0.000280773606370876, + "loss": 6.048, + "step": 6322 + }, + { + "epoch": 2.1580204778156995, + "grad_norm": 2.8404247760772705, + "learning_rate": 0.00028065984072810015, + "loss": 5.9817, + "step": 6323 + }, + { + "epoch": 2.1583617747440274, + "grad_norm": 3.323213815689087, + "learning_rate": 0.00028054607508532425, + "loss": 4.1193, + "step": 6324 + }, + { + "epoch": 2.1587030716723548, + "grad_norm": 3.4093360900878906, + "learning_rate": 0.0002804323094425484, + "loss": 4.3087, + "step": 6325 + }, + { + "epoch": 2.1590443686006826, + "grad_norm": 2.8730340003967285, + "learning_rate": 0.00028031854379977246, + "loss": 6.0826, + "step": 6326 + }, + { + "epoch": 2.15938566552901, + "grad_norm": 2.8777995109558105, + "learning_rate": 0.00028020477815699656, + "loss": 6.4141, + "step": 6327 + }, + { + "epoch": 2.159726962457338, + "grad_norm": 2.9867589473724365, + "learning_rate": 0.0002800910125142207, + "loss": 6.3958, + "step": 6328 + }, + { + "epoch": 2.1600682593856657, + "grad_norm": 2.889984369277954, + "learning_rate": 0.00027997724687144483, + "loss": 5.8593, + "step": 6329 + }, + { + "epoch": 2.160409556313993, + "grad_norm": 2.9526164531707764, + "learning_rate": 0.00027986348122866893, + "loss": 5.9088, + "step": 6330 + }, + { + "epoch": 2.160750853242321, + "grad_norm": 4.757111072540283, + "learning_rate": 0.0002797497155858931, + "loss": 5.8323, + "step": 6331 + }, + { + "epoch": 2.1610921501706484, + "grad_norm": 2.8701725006103516, + "learning_rate": 0.0002796359499431172, + "loss": 6.536, + "step": 6332 + }, + { + "epoch": 2.1614334470989762, + "grad_norm": 2.827101230621338, + "learning_rate": 0.0002795221843003413, + "loss": 5.6708, + "step": 6333 + }, + { + "epoch": 2.1617747440273036, + "grad_norm": 3.0027883052825928, + "learning_rate": 0.00027940841865756546, + "loss": 5.5514, + "step": 6334 + }, + { + "epoch": 2.1621160409556315, + "grad_norm": 3.244901418685913, + "learning_rate": 0.0002792946530147895, + "loss": 5.0491, + "step": 6335 + }, + { + "epoch": 2.162457337883959, + "grad_norm": 2.8494021892547607, + "learning_rate": 0.0002791808873720136, + "loss": 6.3514, + "step": 6336 + }, + { + "epoch": 2.1627986348122867, + "grad_norm": 2.9061434268951416, + "learning_rate": 0.0002790671217292378, + "loss": 6.184, + "step": 6337 + }, + { + "epoch": 2.163139931740614, + "grad_norm": 2.848724126815796, + "learning_rate": 0.0002789533560864619, + "loss": 6.5265, + "step": 6338 + }, + { + "epoch": 2.163481228668942, + "grad_norm": 3.203709602355957, + "learning_rate": 0.000278839590443686, + "loss": 6.0744, + "step": 6339 + }, + { + "epoch": 2.1638225255972694, + "grad_norm": 2.862330198287964, + "learning_rate": 0.00027872582480091015, + "loss": 6.3895, + "step": 6340 + }, + { + "epoch": 2.1641638225255972, + "grad_norm": 4.710589408874512, + "learning_rate": 0.00027861205915813425, + "loss": 4.9895, + "step": 6341 + }, + { + "epoch": 2.164505119453925, + "grad_norm": 2.8820667266845703, + "learning_rate": 0.00027849829351535836, + "loss": 5.7645, + "step": 6342 + }, + { + "epoch": 2.1648464163822525, + "grad_norm": 3.1783809661865234, + "learning_rate": 0.0002783845278725825, + "loss": 5.7725, + "step": 6343 + }, + { + "epoch": 2.1651877133105804, + "grad_norm": 2.8277227878570557, + "learning_rate": 0.0002782707622298066, + "loss": 6.6817, + "step": 6344 + }, + { + "epoch": 2.1655290102389078, + "grad_norm": 2.84521222114563, + "learning_rate": 0.00027815699658703067, + "loss": 5.903, + "step": 6345 + }, + { + "epoch": 2.1658703071672356, + "grad_norm": 2.908289909362793, + "learning_rate": 0.00027804323094425483, + "loss": 5.6247, + "step": 6346 + }, + { + "epoch": 2.166211604095563, + "grad_norm": 3.011632204055786, + "learning_rate": 0.00027792946530147893, + "loss": 6.108, + "step": 6347 + }, + { + "epoch": 2.166552901023891, + "grad_norm": 2.8673269748687744, + "learning_rate": 0.0002778156996587031, + "loss": 6.2876, + "step": 6348 + }, + { + "epoch": 2.1668941979522183, + "grad_norm": 2.8036434650421143, + "learning_rate": 0.0002777019340159272, + "loss": 6.3993, + "step": 6349 + }, + { + "epoch": 2.167235494880546, + "grad_norm": 2.8587241172790527, + "learning_rate": 0.0002775881683731513, + "loss": 6.0677, + "step": 6350 + }, + { + "epoch": 2.1675767918088735, + "grad_norm": 3.036465883255005, + "learning_rate": 0.00027747440273037546, + "loss": 5.7799, + "step": 6351 + }, + { + "epoch": 2.1679180887372014, + "grad_norm": 2.8174219131469727, + "learning_rate": 0.00027736063708759957, + "loss": 5.7702, + "step": 6352 + }, + { + "epoch": 2.168259385665529, + "grad_norm": 2.7940962314605713, + "learning_rate": 0.00027724687144482367, + "loss": 6.4102, + "step": 6353 + }, + { + "epoch": 2.1686006825938566, + "grad_norm": 4.701676368713379, + "learning_rate": 0.00027713310580204783, + "loss": 4.1398, + "step": 6354 + }, + { + "epoch": 2.1689419795221845, + "grad_norm": 2.993300676345825, + "learning_rate": 0.0002770193401592719, + "loss": 6.4199, + "step": 6355 + }, + { + "epoch": 2.169283276450512, + "grad_norm": 2.90317440032959, + "learning_rate": 0.000276905574516496, + "loss": 6.436, + "step": 6356 + }, + { + "epoch": 2.1696245733788397, + "grad_norm": 2.9535815715789795, + "learning_rate": 0.00027679180887372015, + "loss": 6.5237, + "step": 6357 + }, + { + "epoch": 2.169965870307167, + "grad_norm": 2.869474172592163, + "learning_rate": 0.00027667804323094425, + "loss": 6.0359, + "step": 6358 + }, + { + "epoch": 2.170307167235495, + "grad_norm": 2.888397693634033, + "learning_rate": 0.00027656427758816836, + "loss": 6.4275, + "step": 6359 + }, + { + "epoch": 2.1706484641638224, + "grad_norm": 2.8712522983551025, + "learning_rate": 0.0002764505119453925, + "loss": 6.1997, + "step": 6360 + }, + { + "epoch": 2.1709897610921502, + "grad_norm": 2.934269905090332, + "learning_rate": 0.0002763367463026166, + "loss": 6.4548, + "step": 6361 + }, + { + "epoch": 2.1713310580204777, + "grad_norm": 2.752089262008667, + "learning_rate": 0.0002762229806598407, + "loss": 6.0606, + "step": 6362 + }, + { + "epoch": 2.1716723549488055, + "grad_norm": 3.460238456726074, + "learning_rate": 0.0002761092150170649, + "loss": 4.7184, + "step": 6363 + }, + { + "epoch": 2.172013651877133, + "grad_norm": 2.8735015392303467, + "learning_rate": 0.00027599544937428894, + "loss": 5.8198, + "step": 6364 + }, + { + "epoch": 2.1723549488054608, + "grad_norm": 2.812506675720215, + "learning_rate": 0.00027588168373151304, + "loss": 5.6997, + "step": 6365 + }, + { + "epoch": 2.172696245733788, + "grad_norm": 2.769024133682251, + "learning_rate": 0.0002757679180887372, + "loss": 6.1244, + "step": 6366 + }, + { + "epoch": 2.173037542662116, + "grad_norm": 2.8082642555236816, + "learning_rate": 0.0002756541524459613, + "loss": 6.0761, + "step": 6367 + }, + { + "epoch": 2.173378839590444, + "grad_norm": 2.7583067417144775, + "learning_rate": 0.00027554038680318546, + "loss": 5.7482, + "step": 6368 + }, + { + "epoch": 2.1737201365187713, + "grad_norm": 2.982508659362793, + "learning_rate": 0.00027542662116040957, + "loss": 5.0801, + "step": 6369 + }, + { + "epoch": 2.174061433447099, + "grad_norm": 2.7821574211120605, + "learning_rate": 0.0002753128555176337, + "loss": 6.4252, + "step": 6370 + }, + { + "epoch": 2.1744027303754265, + "grad_norm": 2.783104419708252, + "learning_rate": 0.00027519908987485783, + "loss": 5.8358, + "step": 6371 + }, + { + "epoch": 2.1747440273037544, + "grad_norm": 2.8212077617645264, + "learning_rate": 0.00027508532423208194, + "loss": 6.0909, + "step": 6372 + }, + { + "epoch": 2.175085324232082, + "grad_norm": 2.8396553993225098, + "learning_rate": 0.00027497155858930604, + "loss": 6.2717, + "step": 6373 + }, + { + "epoch": 2.1754266211604096, + "grad_norm": 4.277924060821533, + "learning_rate": 0.00027485779294653015, + "loss": 5.4965, + "step": 6374 + }, + { + "epoch": 2.175767918088737, + "grad_norm": 2.8166744709014893, + "learning_rate": 0.00027474402730375425, + "loss": 5.7125, + "step": 6375 + }, + { + "epoch": 2.176109215017065, + "grad_norm": 2.8981945514678955, + "learning_rate": 0.00027463026166097836, + "loss": 5.2261, + "step": 6376 + }, + { + "epoch": 2.1764505119453923, + "grad_norm": 2.9114558696746826, + "learning_rate": 0.0002745164960182025, + "loss": 6.0895, + "step": 6377 + }, + { + "epoch": 2.17679180887372, + "grad_norm": 3.33648681640625, + "learning_rate": 0.0002744027303754266, + "loss": 5.1431, + "step": 6378 + }, + { + "epoch": 2.1771331058020476, + "grad_norm": 2.8457000255584717, + "learning_rate": 0.0002742889647326507, + "loss": 6.706, + "step": 6379 + }, + { + "epoch": 2.1774744027303754, + "grad_norm": 2.829103946685791, + "learning_rate": 0.0002741751990898749, + "loss": 6.4637, + "step": 6380 + }, + { + "epoch": 2.1778156996587033, + "grad_norm": 2.912069797515869, + "learning_rate": 0.000274061433447099, + "loss": 6.7109, + "step": 6381 + }, + { + "epoch": 2.1781569965870307, + "grad_norm": 2.960214853286743, + "learning_rate": 0.0002739476678043231, + "loss": 5.9655, + "step": 6382 + }, + { + "epoch": 2.1784982935153585, + "grad_norm": 3.0913033485412598, + "learning_rate": 0.00027383390216154725, + "loss": 5.843, + "step": 6383 + }, + { + "epoch": 2.178839590443686, + "grad_norm": 2.850743293762207, + "learning_rate": 0.0002737201365187713, + "loss": 4.9693, + "step": 6384 + }, + { + "epoch": 2.1791808873720138, + "grad_norm": 2.808077335357666, + "learning_rate": 0.0002736063708759954, + "loss": 6.0217, + "step": 6385 + }, + { + "epoch": 2.179522184300341, + "grad_norm": 2.9761393070220947, + "learning_rate": 0.00027349260523321957, + "loss": 4.5135, + "step": 6386 + }, + { + "epoch": 2.179863481228669, + "grad_norm": 2.9906089305877686, + "learning_rate": 0.0002733788395904437, + "loss": 6.1202, + "step": 6387 + }, + { + "epoch": 2.1802047781569964, + "grad_norm": 3.4929113388061523, + "learning_rate": 0.00027326507394766783, + "loss": 5.8859, + "step": 6388 + }, + { + "epoch": 2.1805460750853243, + "grad_norm": 2.846465826034546, + "learning_rate": 0.00027315130830489194, + "loss": 6.0745, + "step": 6389 + }, + { + "epoch": 2.1808873720136517, + "grad_norm": 2.907808780670166, + "learning_rate": 0.00027303754266211604, + "loss": 5.983, + "step": 6390 + }, + { + "epoch": 2.1812286689419795, + "grad_norm": 2.858454942703247, + "learning_rate": 0.0002729237770193402, + "loss": 6.0968, + "step": 6391 + }, + { + "epoch": 2.181569965870307, + "grad_norm": 2.9495015144348145, + "learning_rate": 0.0002728100113765643, + "loss": 6.0371, + "step": 6392 + }, + { + "epoch": 2.181911262798635, + "grad_norm": 3.2573885917663574, + "learning_rate": 0.0002726962457337884, + "loss": 5.8426, + "step": 6393 + }, + { + "epoch": 2.1822525597269626, + "grad_norm": 4.468471527099609, + "learning_rate": 0.0002725824800910125, + "loss": 5.1957, + "step": 6394 + }, + { + "epoch": 2.18259385665529, + "grad_norm": 2.860956907272339, + "learning_rate": 0.0002724687144482366, + "loss": 5.7918, + "step": 6395 + }, + { + "epoch": 2.182935153583618, + "grad_norm": 2.8932464122772217, + "learning_rate": 0.00027235494880546073, + "loss": 6.7351, + "step": 6396 + }, + { + "epoch": 2.1832764505119453, + "grad_norm": 3.92891001701355, + "learning_rate": 0.0002722411831626849, + "loss": 4.5889, + "step": 6397 + }, + { + "epoch": 2.183617747440273, + "grad_norm": 1.9834781885147095, + "learning_rate": 0.000272127417519909, + "loss": 3.1828, + "step": 6398 + }, + { + "epoch": 2.1839590443686006, + "grad_norm": 2.90181827545166, + "learning_rate": 0.0002720136518771331, + "loss": 6.1316, + "step": 6399 + }, + { + "epoch": 2.1843003412969284, + "grad_norm": 3.912259817123413, + "learning_rate": 0.00027189988623435726, + "loss": 5.1916, + "step": 6400 + }, + { + "epoch": 2.184641638225256, + "grad_norm": 2.8160667419433594, + "learning_rate": 0.00027178612059158136, + "loss": 5.7501, + "step": 6401 + }, + { + "epoch": 2.1849829351535837, + "grad_norm": 2.872741222381592, + "learning_rate": 0.00027167235494880547, + "loss": 6.3647, + "step": 6402 + }, + { + "epoch": 2.185324232081911, + "grad_norm": 3.7568233013153076, + "learning_rate": 0.00027155858930602957, + "loss": 5.5898, + "step": 6403 + }, + { + "epoch": 2.185665529010239, + "grad_norm": 2.84272837638855, + "learning_rate": 0.0002714448236632537, + "loss": 5.3566, + "step": 6404 + }, + { + "epoch": 2.1860068259385663, + "grad_norm": 2.7461118698120117, + "learning_rate": 0.0002713310580204778, + "loss": 5.7219, + "step": 6405 + }, + { + "epoch": 2.186348122866894, + "grad_norm": 2.8305881023406982, + "learning_rate": 0.00027121729237770194, + "loss": 6.4276, + "step": 6406 + }, + { + "epoch": 2.186689419795222, + "grad_norm": 2.7954049110412598, + "learning_rate": 0.00027110352673492604, + "loss": 6.167, + "step": 6407 + }, + { + "epoch": 2.1870307167235494, + "grad_norm": 2.9570295810699463, + "learning_rate": 0.0002709897610921502, + "loss": 5.7226, + "step": 6408 + }, + { + "epoch": 2.1873720136518773, + "grad_norm": 2.7967288494110107, + "learning_rate": 0.0002708759954493743, + "loss": 6.2319, + "step": 6409 + }, + { + "epoch": 2.1877133105802047, + "grad_norm": 6.396007061004639, + "learning_rate": 0.0002707622298065984, + "loss": 5.3139, + "step": 6410 + }, + { + "epoch": 2.1880546075085325, + "grad_norm": 2.839141607284546, + "learning_rate": 0.00027064846416382257, + "loss": 6.323, + "step": 6411 + }, + { + "epoch": 2.18839590443686, + "grad_norm": 2.803288459777832, + "learning_rate": 0.0002705346985210467, + "loss": 6.2097, + "step": 6412 + }, + { + "epoch": 2.188737201365188, + "grad_norm": 2.8960049152374268, + "learning_rate": 0.00027042093287827073, + "loss": 5.969, + "step": 6413 + }, + { + "epoch": 2.189078498293515, + "grad_norm": 2.7831318378448486, + "learning_rate": 0.0002703071672354949, + "loss": 6.3809, + "step": 6414 + }, + { + "epoch": 2.189419795221843, + "grad_norm": 2.8870432376861572, + "learning_rate": 0.000270193401592719, + "loss": 6.6029, + "step": 6415 + }, + { + "epoch": 2.1897610921501705, + "grad_norm": 2.961520195007324, + "learning_rate": 0.0002700796359499431, + "loss": 6.0527, + "step": 6416 + }, + { + "epoch": 2.1901023890784983, + "grad_norm": 3.088958978652954, + "learning_rate": 0.00026996587030716726, + "loss": 5.7707, + "step": 6417 + }, + { + "epoch": 2.1904436860068257, + "grad_norm": 2.74513840675354, + "learning_rate": 0.00026985210466439136, + "loss": 5.923, + "step": 6418 + }, + { + "epoch": 2.1907849829351536, + "grad_norm": 3.138380765914917, + "learning_rate": 0.00026973833902161547, + "loss": 5.148, + "step": 6419 + }, + { + "epoch": 2.1911262798634814, + "grad_norm": 3.0224714279174805, + "learning_rate": 0.0002696245733788396, + "loss": 6.7062, + "step": 6420 + }, + { + "epoch": 2.191467576791809, + "grad_norm": 2.8007893562316895, + "learning_rate": 0.00026951080773606373, + "loss": 6.122, + "step": 6421 + }, + { + "epoch": 2.1918088737201367, + "grad_norm": 2.802536725997925, + "learning_rate": 0.00026939704209328784, + "loss": 6.1242, + "step": 6422 + }, + { + "epoch": 2.192150170648464, + "grad_norm": 2.7604641914367676, + "learning_rate": 0.00026928327645051194, + "loss": 5.7296, + "step": 6423 + }, + { + "epoch": 2.192491467576792, + "grad_norm": 3.111950159072876, + "learning_rate": 0.00026916951080773605, + "loss": 5.6642, + "step": 6424 + }, + { + "epoch": 2.1928327645051193, + "grad_norm": 2.8447318077087402, + "learning_rate": 0.00026905574516496015, + "loss": 5.7298, + "step": 6425 + }, + { + "epoch": 2.193174061433447, + "grad_norm": 2.7198376655578613, + "learning_rate": 0.0002689419795221843, + "loss": 5.9149, + "step": 6426 + }, + { + "epoch": 2.1935153583617746, + "grad_norm": 2.7918670177459717, + "learning_rate": 0.0002688282138794084, + "loss": 6.2658, + "step": 6427 + }, + { + "epoch": 2.1938566552901024, + "grad_norm": 3.1611740589141846, + "learning_rate": 0.0002687144482366326, + "loss": 4.734, + "step": 6428 + }, + { + "epoch": 2.19419795221843, + "grad_norm": 2.8715784549713135, + "learning_rate": 0.0002686006825938567, + "loss": 6.3364, + "step": 6429 + }, + { + "epoch": 2.1945392491467577, + "grad_norm": 2.958388328552246, + "learning_rate": 0.0002684869169510808, + "loss": 6.582, + "step": 6430 + }, + { + "epoch": 2.194880546075085, + "grad_norm": 3.085158109664917, + "learning_rate": 0.00026837315130830494, + "loss": 5.759, + "step": 6431 + }, + { + "epoch": 2.195221843003413, + "grad_norm": 2.9389808177948, + "learning_rate": 0.00026825938566552905, + "loss": 5.6247, + "step": 6432 + }, + { + "epoch": 2.195563139931741, + "grad_norm": 4.009513854980469, + "learning_rate": 0.0002681456200227531, + "loss": 4.9773, + "step": 6433 + }, + { + "epoch": 2.195904436860068, + "grad_norm": 2.732940196990967, + "learning_rate": 0.00026803185437997726, + "loss": 5.8776, + "step": 6434 + }, + { + "epoch": 2.196245733788396, + "grad_norm": 2.8661234378814697, + "learning_rate": 0.00026791808873720136, + "loss": 6.5482, + "step": 6435 + }, + { + "epoch": 2.1965870307167235, + "grad_norm": 2.915010690689087, + "learning_rate": 0.00026780432309442547, + "loss": 6.4809, + "step": 6436 + }, + { + "epoch": 2.1969283276450513, + "grad_norm": 2.878875255584717, + "learning_rate": 0.0002676905574516496, + "loss": 6.1371, + "step": 6437 + }, + { + "epoch": 2.1972696245733787, + "grad_norm": 3.3660991191864014, + "learning_rate": 0.00026757679180887373, + "loss": 6.0642, + "step": 6438 + }, + { + "epoch": 2.1976109215017066, + "grad_norm": 2.9850966930389404, + "learning_rate": 0.00026746302616609784, + "loss": 5.561, + "step": 6439 + }, + { + "epoch": 2.197952218430034, + "grad_norm": 2.8198862075805664, + "learning_rate": 0.000267349260523322, + "loss": 6.2445, + "step": 6440 + }, + { + "epoch": 2.198293515358362, + "grad_norm": 3.0752429962158203, + "learning_rate": 0.0002672354948805461, + "loss": 5.486, + "step": 6441 + }, + { + "epoch": 2.198634812286689, + "grad_norm": 2.9619510173797607, + "learning_rate": 0.00026712172923777015, + "loss": 5.3807, + "step": 6442 + }, + { + "epoch": 2.198976109215017, + "grad_norm": 2.8163340091705322, + "learning_rate": 0.0002670079635949943, + "loss": 6.5359, + "step": 6443 + }, + { + "epoch": 2.1993174061433445, + "grad_norm": 2.8125815391540527, + "learning_rate": 0.0002668941979522184, + "loss": 6.039, + "step": 6444 + }, + { + "epoch": 2.1996587030716723, + "grad_norm": 2.7999801635742188, + "learning_rate": 0.0002667804323094425, + "loss": 6.1154, + "step": 6445 + }, + { + "epoch": 2.2, + "grad_norm": 2.8857717514038086, + "learning_rate": 0.0002666666666666667, + "loss": 5.9834, + "step": 6446 + }, + { + "epoch": 2.2003412969283276, + "grad_norm": 2.783299446105957, + "learning_rate": 0.0002665529010238908, + "loss": 6.3258, + "step": 6447 + }, + { + "epoch": 2.2006825938566554, + "grad_norm": 2.7714779376983643, + "learning_rate": 0.00026643913538111494, + "loss": 6.0787, + "step": 6448 + }, + { + "epoch": 2.201023890784983, + "grad_norm": 2.6970479488372803, + "learning_rate": 0.00026632536973833905, + "loss": 6.2198, + "step": 6449 + }, + { + "epoch": 2.2013651877133107, + "grad_norm": 2.753514051437378, + "learning_rate": 0.00026621160409556315, + "loss": 5.8685, + "step": 6450 + }, + { + "epoch": 2.201706484641638, + "grad_norm": 2.866901159286499, + "learning_rate": 0.0002660978384527873, + "loss": 6.4346, + "step": 6451 + }, + { + "epoch": 2.202047781569966, + "grad_norm": 2.863551139831543, + "learning_rate": 0.00026598407281001136, + "loss": 5.7247, + "step": 6452 + }, + { + "epoch": 2.2023890784982934, + "grad_norm": 2.88744854927063, + "learning_rate": 0.00026587030716723547, + "loss": 5.9772, + "step": 6453 + }, + { + "epoch": 2.202730375426621, + "grad_norm": 3.1774702072143555, + "learning_rate": 0.00026575654152445963, + "loss": 5.8586, + "step": 6454 + }, + { + "epoch": 2.2030716723549486, + "grad_norm": 2.86907958984375, + "learning_rate": 0.00026564277588168373, + "loss": 6.8104, + "step": 6455 + }, + { + "epoch": 2.2034129692832765, + "grad_norm": 3.6283483505249023, + "learning_rate": 0.00026552901023890784, + "loss": 4.1912, + "step": 6456 + }, + { + "epoch": 2.203754266211604, + "grad_norm": 2.87131404876709, + "learning_rate": 0.000265415244596132, + "loss": 6.1613, + "step": 6457 + }, + { + "epoch": 2.2040955631399317, + "grad_norm": 2.80851149559021, + "learning_rate": 0.0002653014789533561, + "loss": 5.8281, + "step": 6458 + }, + { + "epoch": 2.2044368600682596, + "grad_norm": 2.9386231899261475, + "learning_rate": 0.0002651877133105802, + "loss": 6.7991, + "step": 6459 + }, + { + "epoch": 2.204778156996587, + "grad_norm": 2.9392945766448975, + "learning_rate": 0.00026507394766780437, + "loss": 6.1992, + "step": 6460 + }, + { + "epoch": 2.205119453924915, + "grad_norm": 2.8204219341278076, + "learning_rate": 0.00026496018202502847, + "loss": 6.0136, + "step": 6461 + }, + { + "epoch": 2.2054607508532422, + "grad_norm": 2.826657295227051, + "learning_rate": 0.0002648464163822525, + "loss": 5.8623, + "step": 6462 + }, + { + "epoch": 2.20580204778157, + "grad_norm": 2.8905653953552246, + "learning_rate": 0.0002647326507394767, + "loss": 6.2452, + "step": 6463 + }, + { + "epoch": 2.2061433447098975, + "grad_norm": 2.7385172843933105, + "learning_rate": 0.0002646188850967008, + "loss": 5.9435, + "step": 6464 + }, + { + "epoch": 2.2064846416382253, + "grad_norm": 2.80544114112854, + "learning_rate": 0.0002645051194539249, + "loss": 5.9384, + "step": 6465 + }, + { + "epoch": 2.2068259385665527, + "grad_norm": 2.877896785736084, + "learning_rate": 0.00026439135381114905, + "loss": 5.8879, + "step": 6466 + }, + { + "epoch": 2.2071672354948806, + "grad_norm": 2.7671401500701904, + "learning_rate": 0.00026427758816837315, + "loss": 6.2006, + "step": 6467 + }, + { + "epoch": 2.207508532423208, + "grad_norm": 2.8723201751708984, + "learning_rate": 0.00026416382252559726, + "loss": 6.475, + "step": 6468 + }, + { + "epoch": 2.207849829351536, + "grad_norm": 2.728503704071045, + "learning_rate": 0.0002640500568828214, + "loss": 5.9604, + "step": 6469 + }, + { + "epoch": 2.2081911262798632, + "grad_norm": 2.96976900100708, + "learning_rate": 0.0002639362912400455, + "loss": 5.7367, + "step": 6470 + }, + { + "epoch": 2.208532423208191, + "grad_norm": 5.522859573364258, + "learning_rate": 0.00026382252559726963, + "loss": 4.7263, + "step": 6471 + }, + { + "epoch": 2.208873720136519, + "grad_norm": 3.003938674926758, + "learning_rate": 0.00026370875995449373, + "loss": 5.9086, + "step": 6472 + }, + { + "epoch": 2.2092150170648464, + "grad_norm": 2.7639968395233154, + "learning_rate": 0.00026359499431171784, + "loss": 5.8062, + "step": 6473 + }, + { + "epoch": 2.209556313993174, + "grad_norm": 2.8196215629577637, + "learning_rate": 0.000263481228668942, + "loss": 6.236, + "step": 6474 + }, + { + "epoch": 2.2098976109215016, + "grad_norm": 6.847387790679932, + "learning_rate": 0.0002633674630261661, + "loss": 5.3268, + "step": 6475 + }, + { + "epoch": 2.2102389078498295, + "grad_norm": 2.8416600227355957, + "learning_rate": 0.0002632536973833902, + "loss": 5.8468, + "step": 6476 + }, + { + "epoch": 2.210580204778157, + "grad_norm": 7.3218278884887695, + "learning_rate": 0.00026313993174061437, + "loss": 4.5238, + "step": 6477 + }, + { + "epoch": 2.2109215017064847, + "grad_norm": 2.832921266555786, + "learning_rate": 0.00026302616609783847, + "loss": 5.7101, + "step": 6478 + }, + { + "epoch": 2.211262798634812, + "grad_norm": 2.810177803039551, + "learning_rate": 0.0002629124004550626, + "loss": 5.9935, + "step": 6479 + }, + { + "epoch": 2.21160409556314, + "grad_norm": 2.747629404067993, + "learning_rate": 0.00026279863481228674, + "loss": 6.4255, + "step": 6480 + }, + { + "epoch": 2.2119453924914674, + "grad_norm": 2.820760726928711, + "learning_rate": 0.0002626848691695108, + "loss": 5.8886, + "step": 6481 + }, + { + "epoch": 2.2122866894197952, + "grad_norm": 3.057654857635498, + "learning_rate": 0.0002625711035267349, + "loss": 5.4938, + "step": 6482 + }, + { + "epoch": 2.2126279863481226, + "grad_norm": 3.943042755126953, + "learning_rate": 0.00026245733788395905, + "loss": 5.5298, + "step": 6483 + }, + { + "epoch": 2.2129692832764505, + "grad_norm": 2.72080135345459, + "learning_rate": 0.00026234357224118316, + "loss": 6.1378, + "step": 6484 + }, + { + "epoch": 2.2133105802047783, + "grad_norm": 2.7657735347747803, + "learning_rate": 0.00026222980659840726, + "loss": 6.1694, + "step": 6485 + }, + { + "epoch": 2.2136518771331057, + "grad_norm": 2.795632839202881, + "learning_rate": 0.0002621160409556314, + "loss": 6.1866, + "step": 6486 + }, + { + "epoch": 2.2139931740614336, + "grad_norm": 2.779759407043457, + "learning_rate": 0.0002620022753128555, + "loss": 5.7196, + "step": 6487 + }, + { + "epoch": 2.214334470989761, + "grad_norm": 2.8835349082946777, + "learning_rate": 0.00026188850967007963, + "loss": 6.2627, + "step": 6488 + }, + { + "epoch": 2.214675767918089, + "grad_norm": 5.428569316864014, + "learning_rate": 0.0002617747440273038, + "loss": 5.1553, + "step": 6489 + }, + { + "epoch": 2.2150170648464163, + "grad_norm": 2.7858505249023438, + "learning_rate": 0.0002616609783845279, + "loss": 6.2648, + "step": 6490 + }, + { + "epoch": 2.215358361774744, + "grad_norm": 2.827878475189209, + "learning_rate": 0.000261547212741752, + "loss": 6.1736, + "step": 6491 + }, + { + "epoch": 2.2156996587030715, + "grad_norm": 2.7949113845825195, + "learning_rate": 0.0002614334470989761, + "loss": 6.4478, + "step": 6492 + }, + { + "epoch": 2.2160409556313994, + "grad_norm": 3.1486802101135254, + "learning_rate": 0.0002613196814562002, + "loss": 5.8937, + "step": 6493 + }, + { + "epoch": 2.2163822525597268, + "grad_norm": 2.8336243629455566, + "learning_rate": 0.00026120591581342437, + "loss": 6.8206, + "step": 6494 + }, + { + "epoch": 2.2167235494880546, + "grad_norm": 2.8043153285980225, + "learning_rate": 0.00026109215017064847, + "loss": 6.1785, + "step": 6495 + }, + { + "epoch": 2.217064846416382, + "grad_norm": 2.822309732437134, + "learning_rate": 0.0002609783845278726, + "loss": 5.5858, + "step": 6496 + }, + { + "epoch": 2.21740614334471, + "grad_norm": 3.1378068923950195, + "learning_rate": 0.00026086461888509674, + "loss": 5.7572, + "step": 6497 + }, + { + "epoch": 2.2177474402730377, + "grad_norm": 2.852703332901001, + "learning_rate": 0.00026075085324232084, + "loss": 6.8021, + "step": 6498 + }, + { + "epoch": 2.218088737201365, + "grad_norm": 2.8655879497528076, + "learning_rate": 0.00026063708759954495, + "loss": 5.816, + "step": 6499 + }, + { + "epoch": 2.218430034129693, + "grad_norm": 2.730862855911255, + "learning_rate": 0.0002605233219567691, + "loss": 6.2995, + "step": 6500 + }, + { + "epoch": 2.2187713310580204, + "grad_norm": 2.8955562114715576, + "learning_rate": 0.00026040955631399316, + "loss": 6.3926, + "step": 6501 + }, + { + "epoch": 2.2191126279863482, + "grad_norm": 2.799837350845337, + "learning_rate": 0.00026029579067121726, + "loss": 6.1799, + "step": 6502 + }, + { + "epoch": 2.2194539249146756, + "grad_norm": 2.7385647296905518, + "learning_rate": 0.0002601820250284414, + "loss": 6.1926, + "step": 6503 + }, + { + "epoch": 2.2197952218430035, + "grad_norm": 2.808077096939087, + "learning_rate": 0.0002600682593856655, + "loss": 6.1108, + "step": 6504 + }, + { + "epoch": 2.220136518771331, + "grad_norm": 2.8860621452331543, + "learning_rate": 0.00025995449374288963, + "loss": 6.0537, + "step": 6505 + }, + { + "epoch": 2.2204778156996587, + "grad_norm": 2.850010871887207, + "learning_rate": 0.0002598407281001138, + "loss": 6.5343, + "step": 6506 + }, + { + "epoch": 2.220819112627986, + "grad_norm": 2.805081367492676, + "learning_rate": 0.0002597269624573379, + "loss": 6.2704, + "step": 6507 + }, + { + "epoch": 2.221160409556314, + "grad_norm": 3.074279308319092, + "learning_rate": 0.000259613196814562, + "loss": 5.9484, + "step": 6508 + }, + { + "epoch": 2.2215017064846414, + "grad_norm": 2.8078973293304443, + "learning_rate": 0.00025949943117178616, + "loss": 5.8503, + "step": 6509 + }, + { + "epoch": 2.2218430034129693, + "grad_norm": 2.807574987411499, + "learning_rate": 0.0002593856655290102, + "loss": 5.7379, + "step": 6510 + }, + { + "epoch": 2.222184300341297, + "grad_norm": 2.841683864593506, + "learning_rate": 0.00025927189988623437, + "loss": 6.2235, + "step": 6511 + }, + { + "epoch": 2.2225255972696245, + "grad_norm": 2.873955726623535, + "learning_rate": 0.0002591581342434585, + "loss": 5.6057, + "step": 6512 + }, + { + "epoch": 2.2228668941979524, + "grad_norm": 2.908351421356201, + "learning_rate": 0.0002590443686006826, + "loss": 5.904, + "step": 6513 + }, + { + "epoch": 2.2232081911262798, + "grad_norm": 7.525314807891846, + "learning_rate": 0.00025893060295790674, + "loss": 5.3918, + "step": 6514 + }, + { + "epoch": 2.2235494880546076, + "grad_norm": 2.931612253189087, + "learning_rate": 0.00025881683731513084, + "loss": 5.8895, + "step": 6515 + }, + { + "epoch": 2.223890784982935, + "grad_norm": 2.975277900695801, + "learning_rate": 0.00025870307167235495, + "loss": 6.1299, + "step": 6516 + }, + { + "epoch": 2.224232081911263, + "grad_norm": 2.8416590690612793, + "learning_rate": 0.0002585893060295791, + "loss": 6.2545, + "step": 6517 + }, + { + "epoch": 2.2245733788395903, + "grad_norm": 2.888201951980591, + "learning_rate": 0.0002584755403868032, + "loss": 6.193, + "step": 6518 + }, + { + "epoch": 2.224914675767918, + "grad_norm": 2.7630908489227295, + "learning_rate": 0.0002583617747440273, + "loss": 5.9981, + "step": 6519 + }, + { + "epoch": 2.2252559726962455, + "grad_norm": 2.9234695434570312, + "learning_rate": 0.0002582480091012514, + "loss": 6.2954, + "step": 6520 + }, + { + "epoch": 2.2255972696245734, + "grad_norm": 3.6878268718719482, + "learning_rate": 0.0002581342434584755, + "loss": 5.1989, + "step": 6521 + }, + { + "epoch": 2.225938566552901, + "grad_norm": 2.8937203884124756, + "learning_rate": 0.00025802047781569963, + "loss": 6.15, + "step": 6522 + }, + { + "epoch": 2.2262798634812286, + "grad_norm": 2.8810667991638184, + "learning_rate": 0.0002579067121729238, + "loss": 6.2176, + "step": 6523 + }, + { + "epoch": 2.2266211604095565, + "grad_norm": 3.0311129093170166, + "learning_rate": 0.0002577929465301479, + "loss": 5.6466, + "step": 6524 + }, + { + "epoch": 2.226962457337884, + "grad_norm": 3.263695001602173, + "learning_rate": 0.000257679180887372, + "loss": 4.1231, + "step": 6525 + }, + { + "epoch": 2.2273037542662117, + "grad_norm": 2.8339765071868896, + "learning_rate": 0.00025756541524459616, + "loss": 6.2549, + "step": 6526 + }, + { + "epoch": 2.227645051194539, + "grad_norm": 2.7960495948791504, + "learning_rate": 0.00025745164960182026, + "loss": 6.3462, + "step": 6527 + }, + { + "epoch": 2.227986348122867, + "grad_norm": 2.9964165687561035, + "learning_rate": 0.00025733788395904437, + "loss": 5.2995, + "step": 6528 + }, + { + "epoch": 2.2283276450511944, + "grad_norm": 2.8184449672698975, + "learning_rate": 0.00025722411831626853, + "loss": 6.5219, + "step": 6529 + }, + { + "epoch": 2.2286689419795223, + "grad_norm": 2.7724874019622803, + "learning_rate": 0.0002571103526734926, + "loss": 6.0435, + "step": 6530 + }, + { + "epoch": 2.2290102389078497, + "grad_norm": 3.961784601211548, + "learning_rate": 0.0002569965870307167, + "loss": 5.7937, + "step": 6531 + }, + { + "epoch": 2.2293515358361775, + "grad_norm": 2.875619888305664, + "learning_rate": 0.00025688282138794084, + "loss": 6.6192, + "step": 6532 + }, + { + "epoch": 2.229692832764505, + "grad_norm": 2.870389699935913, + "learning_rate": 0.00025676905574516495, + "loss": 6.4559, + "step": 6533 + }, + { + "epoch": 2.2300341296928328, + "grad_norm": 3.843813896179199, + "learning_rate": 0.0002566552901023891, + "loss": 4.74, + "step": 6534 + }, + { + "epoch": 2.2303754266211606, + "grad_norm": 3.107849597930908, + "learning_rate": 0.0002565415244596132, + "loss": 5.6221, + "step": 6535 + }, + { + "epoch": 2.230716723549488, + "grad_norm": 2.7651352882385254, + "learning_rate": 0.0002564277588168373, + "loss": 6.2125, + "step": 6536 + }, + { + "epoch": 2.231058020477816, + "grad_norm": 2.9481201171875, + "learning_rate": 0.0002563139931740615, + "loss": 5.7552, + "step": 6537 + }, + { + "epoch": 2.2313993174061433, + "grad_norm": 2.9435575008392334, + "learning_rate": 0.0002562002275312856, + "loss": 6.1869, + "step": 6538 + }, + { + "epoch": 2.231740614334471, + "grad_norm": 2.826810359954834, + "learning_rate": 0.0002560864618885097, + "loss": 6.1379, + "step": 6539 + }, + { + "epoch": 2.2320819112627985, + "grad_norm": 2.7598679065704346, + "learning_rate": 0.0002559726962457338, + "loss": 6.0253, + "step": 6540 + }, + { + "epoch": 2.2324232081911264, + "grad_norm": 2.83957576751709, + "learning_rate": 0.0002558589306029579, + "loss": 5.9333, + "step": 6541 + }, + { + "epoch": 2.232764505119454, + "grad_norm": 3.761160135269165, + "learning_rate": 0.000255745164960182, + "loss": 5.4008, + "step": 6542 + }, + { + "epoch": 2.2331058020477816, + "grad_norm": 2.7840142250061035, + "learning_rate": 0.00025563139931740616, + "loss": 6.4198, + "step": 6543 + }, + { + "epoch": 2.233447098976109, + "grad_norm": 2.7931652069091797, + "learning_rate": 0.00025551763367463026, + "loss": 6.5151, + "step": 6544 + }, + { + "epoch": 2.233788395904437, + "grad_norm": 2.733041286468506, + "learning_rate": 0.00025540386803185437, + "loss": 6.168, + "step": 6545 + }, + { + "epoch": 2.2341296928327643, + "grad_norm": 3.0885894298553467, + "learning_rate": 0.00025529010238907853, + "loss": 5.4425, + "step": 6546 + }, + { + "epoch": 2.234470989761092, + "grad_norm": 2.77445912361145, + "learning_rate": 0.00025517633674630263, + "loss": 6.0673, + "step": 6547 + }, + { + "epoch": 2.23481228668942, + "grad_norm": 2.836225986480713, + "learning_rate": 0.00025506257110352674, + "loss": 6.3828, + "step": 6548 + }, + { + "epoch": 2.2351535836177474, + "grad_norm": 2.814570903778076, + "learning_rate": 0.00025494880546075084, + "loss": 6.2877, + "step": 6549 + }, + { + "epoch": 2.2354948805460753, + "grad_norm": 2.966297149658203, + "learning_rate": 0.00025483503981797495, + "loss": 6.2828, + "step": 6550 + }, + { + "epoch": 2.2358361774744027, + "grad_norm": 2.8446590900421143, + "learning_rate": 0.00025472127417519905, + "loss": 5.9261, + "step": 6551 + }, + { + "epoch": 2.2361774744027305, + "grad_norm": 2.9468929767608643, + "learning_rate": 0.0002546075085324232, + "loss": 6.1299, + "step": 6552 + }, + { + "epoch": 2.236518771331058, + "grad_norm": 2.959243059158325, + "learning_rate": 0.0002544937428896473, + "loss": 5.9142, + "step": 6553 + }, + { + "epoch": 2.2368600682593858, + "grad_norm": 2.908616781234741, + "learning_rate": 0.0002543799772468715, + "loss": 5.9464, + "step": 6554 + }, + { + "epoch": 2.237201365187713, + "grad_norm": 2.857999563217163, + "learning_rate": 0.0002542662116040956, + "loss": 6.5652, + "step": 6555 + }, + { + "epoch": 2.237542662116041, + "grad_norm": 2.829493522644043, + "learning_rate": 0.0002541524459613197, + "loss": 5.7772, + "step": 6556 + }, + { + "epoch": 2.2378839590443684, + "grad_norm": 2.8293933868408203, + "learning_rate": 0.00025403868031854385, + "loss": 6.0328, + "step": 6557 + }, + { + "epoch": 2.2382252559726963, + "grad_norm": 2.949575662612915, + "learning_rate": 0.00025392491467576795, + "loss": 6.5024, + "step": 6558 + }, + { + "epoch": 2.2385665529010237, + "grad_norm": 3.105191946029663, + "learning_rate": 0.000253811149032992, + "loss": 6.3816, + "step": 6559 + }, + { + "epoch": 2.2389078498293515, + "grad_norm": 2.836264133453369, + "learning_rate": 0.00025369738339021616, + "loss": 6.4266, + "step": 6560 + }, + { + "epoch": 2.2392491467576794, + "grad_norm": 2.8078081607818604, + "learning_rate": 0.00025358361774744027, + "loss": 6.5147, + "step": 6561 + }, + { + "epoch": 2.239590443686007, + "grad_norm": 2.8312695026397705, + "learning_rate": 0.00025346985210466437, + "loss": 6.6833, + "step": 6562 + }, + { + "epoch": 2.2399317406143346, + "grad_norm": 3.007437229156494, + "learning_rate": 0.00025335608646188853, + "loss": 5.3381, + "step": 6563 + }, + { + "epoch": 2.240273037542662, + "grad_norm": 3.071810722351074, + "learning_rate": 0.00025324232081911263, + "loss": 5.3215, + "step": 6564 + }, + { + "epoch": 2.24061433447099, + "grad_norm": 2.7744386196136475, + "learning_rate": 0.00025312855517633674, + "loss": 6.1504, + "step": 6565 + }, + { + "epoch": 2.2409556313993173, + "grad_norm": 2.710651159286499, + "learning_rate": 0.0002530147895335609, + "loss": 6.2226, + "step": 6566 + }, + { + "epoch": 2.241296928327645, + "grad_norm": 2.8886449337005615, + "learning_rate": 0.000252901023890785, + "loss": 6.6402, + "step": 6567 + }, + { + "epoch": 2.2416382252559726, + "grad_norm": 2.7804300785064697, + "learning_rate": 0.0002527872582480091, + "loss": 5.756, + "step": 6568 + }, + { + "epoch": 2.2419795221843004, + "grad_norm": 2.8473620414733887, + "learning_rate": 0.0002526734926052332, + "loss": 6.4581, + "step": 6569 + }, + { + "epoch": 2.242320819112628, + "grad_norm": 2.7306015491485596, + "learning_rate": 0.0002525597269624573, + "loss": 5.8941, + "step": 6570 + }, + { + "epoch": 2.2426621160409557, + "grad_norm": 4.8221516609191895, + "learning_rate": 0.0002524459613196814, + "loss": 4.8643, + "step": 6571 + }, + { + "epoch": 2.243003412969283, + "grad_norm": 2.7513792514801025, + "learning_rate": 0.0002523321956769056, + "loss": 5.9781, + "step": 6572 + }, + { + "epoch": 2.243344709897611, + "grad_norm": 2.6935529708862305, + "learning_rate": 0.0002522184300341297, + "loss": 6.2471, + "step": 6573 + }, + { + "epoch": 2.2436860068259388, + "grad_norm": 2.7685117721557617, + "learning_rate": 0.00025210466439135385, + "loss": 6.2081, + "step": 6574 + }, + { + "epoch": 2.244027303754266, + "grad_norm": 2.770594596862793, + "learning_rate": 0.00025199089874857795, + "loss": 5.9571, + "step": 6575 + }, + { + "epoch": 2.244368600682594, + "grad_norm": 2.826627731323242, + "learning_rate": 0.00025187713310580206, + "loss": 5.9076, + "step": 6576 + }, + { + "epoch": 2.2447098976109214, + "grad_norm": 2.8154921531677246, + "learning_rate": 0.0002517633674630262, + "loss": 5.9773, + "step": 6577 + }, + { + "epoch": 2.2450511945392493, + "grad_norm": 2.681960105895996, + "learning_rate": 0.00025164960182025027, + "loss": 5.4657, + "step": 6578 + }, + { + "epoch": 2.2453924914675767, + "grad_norm": 2.780457019805908, + "learning_rate": 0.00025153583617747437, + "loss": 6.2352, + "step": 6579 + }, + { + "epoch": 2.2457337883959045, + "grad_norm": 2.8633296489715576, + "learning_rate": 0.00025142207053469853, + "loss": 6.102, + "step": 6580 + }, + { + "epoch": 2.246075085324232, + "grad_norm": 3.040717124938965, + "learning_rate": 0.00025130830489192264, + "loss": 5.2462, + "step": 6581 + }, + { + "epoch": 2.24641638225256, + "grad_norm": 2.7666218280792236, + "learning_rate": 0.00025119453924914674, + "loss": 5.4919, + "step": 6582 + }, + { + "epoch": 2.246757679180887, + "grad_norm": 2.838775157928467, + "learning_rate": 0.0002510807736063709, + "loss": 6.2391, + "step": 6583 + }, + { + "epoch": 2.247098976109215, + "grad_norm": 2.8280463218688965, + "learning_rate": 0.000250967007963595, + "loss": 6.3613, + "step": 6584 + }, + { + "epoch": 2.2474402730375425, + "grad_norm": 2.8792102336883545, + "learning_rate": 0.0002508532423208191, + "loss": 5.8355, + "step": 6585 + }, + { + "epoch": 2.2477815699658703, + "grad_norm": 2.7618908882141113, + "learning_rate": 0.00025073947667804327, + "loss": 6.0076, + "step": 6586 + }, + { + "epoch": 2.248122866894198, + "grad_norm": 2.7827494144439697, + "learning_rate": 0.0002506257110352674, + "loss": 4.3666, + "step": 6587 + }, + { + "epoch": 2.2484641638225256, + "grad_norm": 2.926374673843384, + "learning_rate": 0.0002505119453924914, + "loss": 6.048, + "step": 6588 + }, + { + "epoch": 2.2488054607508534, + "grad_norm": 2.982754945755005, + "learning_rate": 0.0002503981797497156, + "loss": 5.8397, + "step": 6589 + }, + { + "epoch": 2.249146757679181, + "grad_norm": 2.973355770111084, + "learning_rate": 0.0002502844141069397, + "loss": 5.4961, + "step": 6590 + }, + { + "epoch": 2.2494880546075087, + "grad_norm": 2.7500498294830322, + "learning_rate": 0.0002501706484641638, + "loss": 5.631, + "step": 6591 + }, + { + "epoch": 2.249829351535836, + "grad_norm": 2.882366180419922, + "learning_rate": 0.00025005688282138795, + "loss": 5.8569, + "step": 6592 + }, + { + "epoch": 2.250170648464164, + "grad_norm": 2.73225736618042, + "learning_rate": 0.00024994311717861206, + "loss": 6.4002, + "step": 6593 + }, + { + "epoch": 2.2505119453924913, + "grad_norm": 2.800004720687866, + "learning_rate": 0.0002498293515358362, + "loss": 6.4107, + "step": 6594 + }, + { + "epoch": 2.250853242320819, + "grad_norm": 2.6945483684539795, + "learning_rate": 0.00024971558589306027, + "loss": 6.414, + "step": 6595 + }, + { + "epoch": 2.2511945392491466, + "grad_norm": 2.850060224533081, + "learning_rate": 0.0002496018202502844, + "loss": 5.8978, + "step": 6596 + }, + { + "epoch": 2.2515358361774744, + "grad_norm": 2.810472249984741, + "learning_rate": 0.00024948805460750853, + "loss": 5.7288, + "step": 6597 + }, + { + "epoch": 2.2518771331058023, + "grad_norm": 4.052561283111572, + "learning_rate": 0.00024937428896473264, + "loss": 5.1013, + "step": 6598 + }, + { + "epoch": 2.2522184300341297, + "grad_norm": 3.0097126960754395, + "learning_rate": 0.0002492605233219568, + "loss": 5.8071, + "step": 6599 + }, + { + "epoch": 2.252559726962457, + "grad_norm": 2.8566362857818604, + "learning_rate": 0.0002491467576791809, + "loss": 6.1921, + "step": 6600 + }, + { + "epoch": 2.252901023890785, + "grad_norm": 3.514784336090088, + "learning_rate": 0.000249032992036405, + "loss": 4.8535, + "step": 6601 + }, + { + "epoch": 2.253242320819113, + "grad_norm": 3.010627031326294, + "learning_rate": 0.0002489192263936291, + "loss": 6.0636, + "step": 6602 + }, + { + "epoch": 2.25358361774744, + "grad_norm": 2.843026876449585, + "learning_rate": 0.00024880546075085327, + "loss": 5.8824, + "step": 6603 + }, + { + "epoch": 2.253924914675768, + "grad_norm": 2.940535306930542, + "learning_rate": 0.0002486916951080774, + "loss": 5.8778, + "step": 6604 + }, + { + "epoch": 2.2542662116040955, + "grad_norm": 2.917555570602417, + "learning_rate": 0.0002485779294653015, + "loss": 6.1297, + "step": 6605 + }, + { + "epoch": 2.2546075085324233, + "grad_norm": 3.041822910308838, + "learning_rate": 0.0002484641638225256, + "loss": 5.5592, + "step": 6606 + }, + { + "epoch": 2.2549488054607507, + "grad_norm": 2.8139913082122803, + "learning_rate": 0.00024835039817974974, + "loss": 6.2674, + "step": 6607 + }, + { + "epoch": 2.2552901023890786, + "grad_norm": 2.7989606857299805, + "learning_rate": 0.00024823663253697385, + "loss": 5.9394, + "step": 6608 + }, + { + "epoch": 2.255631399317406, + "grad_norm": 2.64013671875, + "learning_rate": 0.00024812286689419795, + "loss": 6.1189, + "step": 6609 + }, + { + "epoch": 2.255972696245734, + "grad_norm": 2.7237131595611572, + "learning_rate": 0.00024800910125142206, + "loss": 5.7884, + "step": 6610 + }, + { + "epoch": 2.2563139931740617, + "grad_norm": 4.8718719482421875, + "learning_rate": 0.00024789533560864616, + "loss": 6.2122, + "step": 6611 + }, + { + "epoch": 2.256655290102389, + "grad_norm": 3.137288808822632, + "learning_rate": 0.0002477815699658703, + "loss": 6.0289, + "step": 6612 + }, + { + "epoch": 2.2569965870307165, + "grad_norm": 2.8663458824157715, + "learning_rate": 0.00024766780432309443, + "loss": 5.903, + "step": 6613 + }, + { + "epoch": 2.2573378839590443, + "grad_norm": 2.8547210693359375, + "learning_rate": 0.0002475540386803186, + "loss": 6.4314, + "step": 6614 + }, + { + "epoch": 2.257679180887372, + "grad_norm": 2.7361135482788086, + "learning_rate": 0.00024744027303754264, + "loss": 5.6617, + "step": 6615 + }, + { + "epoch": 2.2580204778156996, + "grad_norm": 3.2815380096435547, + "learning_rate": 0.0002473265073947668, + "loss": 5.6537, + "step": 6616 + }, + { + "epoch": 2.2583617747440274, + "grad_norm": 2.7974159717559814, + "learning_rate": 0.0002472127417519909, + "loss": 6.072, + "step": 6617 + }, + { + "epoch": 2.258703071672355, + "grad_norm": 2.8251793384552, + "learning_rate": 0.000247098976109215, + "loss": 6.4367, + "step": 6618 + }, + { + "epoch": 2.2590443686006827, + "grad_norm": 2.9200313091278076, + "learning_rate": 0.00024698521046643917, + "loss": 5.9348, + "step": 6619 + }, + { + "epoch": 2.25938566552901, + "grad_norm": 3.057755708694458, + "learning_rate": 0.00024687144482366327, + "loss": 5.5646, + "step": 6620 + }, + { + "epoch": 2.259726962457338, + "grad_norm": 2.739042043685913, + "learning_rate": 0.0002467576791808874, + "loss": 6.5243, + "step": 6621 + }, + { + "epoch": 2.2600682593856654, + "grad_norm": 2.8239619731903076, + "learning_rate": 0.0002466439135381115, + "loss": 5.7505, + "step": 6622 + }, + { + "epoch": 2.260409556313993, + "grad_norm": 2.8290717601776123, + "learning_rate": 0.00024653014789533564, + "loss": 5.9059, + "step": 6623 + }, + { + "epoch": 2.260750853242321, + "grad_norm": 3.1619184017181396, + "learning_rate": 0.00024641638225255974, + "loss": 5.6338, + "step": 6624 + }, + { + "epoch": 2.2610921501706485, + "grad_norm": 2.940603256225586, + "learning_rate": 0.00024630261660978385, + "loss": 5.4515, + "step": 6625 + }, + { + "epoch": 2.261433447098976, + "grad_norm": 3.02312970161438, + "learning_rate": 0.00024618885096700795, + "loss": 5.0054, + "step": 6626 + }, + { + "epoch": 2.2617747440273037, + "grad_norm": 2.7911484241485596, + "learning_rate": 0.0002460750853242321, + "loss": 6.7703, + "step": 6627 + }, + { + "epoch": 2.2621160409556316, + "grad_norm": 2.8118064403533936, + "learning_rate": 0.0002459613196814562, + "loss": 5.8083, + "step": 6628 + }, + { + "epoch": 2.262457337883959, + "grad_norm": 2.7824995517730713, + "learning_rate": 0.0002458475540386803, + "loss": 5.9023, + "step": 6629 + }, + { + "epoch": 2.262798634812287, + "grad_norm": 2.8783822059631348, + "learning_rate": 0.00024573378839590443, + "loss": 6.2359, + "step": 6630 + }, + { + "epoch": 2.2631399317406142, + "grad_norm": 2.9949822425842285, + "learning_rate": 0.00024562002275312853, + "loss": 5.0531, + "step": 6631 + }, + { + "epoch": 2.263481228668942, + "grad_norm": 2.8121888637542725, + "learning_rate": 0.0002455062571103527, + "loss": 5.86, + "step": 6632 + }, + { + "epoch": 2.2638225255972695, + "grad_norm": 2.776623249053955, + "learning_rate": 0.0002453924914675768, + "loss": 6.3007, + "step": 6633 + }, + { + "epoch": 2.2641638225255973, + "grad_norm": 2.7938060760498047, + "learning_rate": 0.0002452787258248009, + "loss": 6.3209, + "step": 6634 + }, + { + "epoch": 2.2645051194539247, + "grad_norm": 2.750595808029175, + "learning_rate": 0.000245164960182025, + "loss": 6.1808, + "step": 6635 + }, + { + "epoch": 2.2648464163822526, + "grad_norm": 2.7990636825561523, + "learning_rate": 0.00024505119453924917, + "loss": 6.4628, + "step": 6636 + }, + { + "epoch": 2.2651877133105804, + "grad_norm": 2.674833059310913, + "learning_rate": 0.00024493742889647327, + "loss": 6.1304, + "step": 6637 + }, + { + "epoch": 2.265529010238908, + "grad_norm": 2.6972098350524902, + "learning_rate": 0.0002448236632536974, + "loss": 6.4638, + "step": 6638 + }, + { + "epoch": 2.2658703071672353, + "grad_norm": 2.8341851234436035, + "learning_rate": 0.0002447098976109215, + "loss": 6.2938, + "step": 6639 + }, + { + "epoch": 2.266211604095563, + "grad_norm": 2.8352243900299072, + "learning_rate": 0.00024459613196814564, + "loss": 5.7276, + "step": 6640 + }, + { + "epoch": 2.266552901023891, + "grad_norm": 2.8336145877838135, + "learning_rate": 0.00024448236632536975, + "loss": 5.9019, + "step": 6641 + }, + { + "epoch": 2.2668941979522184, + "grad_norm": 2.872852087020874, + "learning_rate": 0.00024436860068259385, + "loss": 6.4402, + "step": 6642 + }, + { + "epoch": 2.267235494880546, + "grad_norm": 2.7706029415130615, + "learning_rate": 0.000244254835039818, + "loss": 5.992, + "step": 6643 + }, + { + "epoch": 2.2675767918088736, + "grad_norm": 2.805307149887085, + "learning_rate": 0.00024414106939704206, + "loss": 5.5684, + "step": 6644 + }, + { + "epoch": 2.2679180887372015, + "grad_norm": 2.671431303024292, + "learning_rate": 0.00024402730375426622, + "loss": 5.8068, + "step": 6645 + }, + { + "epoch": 2.268259385665529, + "grad_norm": 2.8474628925323486, + "learning_rate": 0.00024391353811149032, + "loss": 6.6582, + "step": 6646 + }, + { + "epoch": 2.2686006825938567, + "grad_norm": 2.825212240219116, + "learning_rate": 0.00024379977246871446, + "loss": 6.1744, + "step": 6647 + }, + { + "epoch": 2.268941979522184, + "grad_norm": 2.828723669052124, + "learning_rate": 0.0002436860068259386, + "loss": 5.794, + "step": 6648 + }, + { + "epoch": 2.269283276450512, + "grad_norm": 2.701645612716675, + "learning_rate": 0.00024357224118316267, + "loss": 6.222, + "step": 6649 + }, + { + "epoch": 2.26962457337884, + "grad_norm": 2.8495328426361084, + "learning_rate": 0.0002434584755403868, + "loss": 6.0397, + "step": 6650 + }, + { + "epoch": 2.2699658703071672, + "grad_norm": 2.8856194019317627, + "learning_rate": 0.00024334470989761093, + "loss": 5.6817, + "step": 6651 + }, + { + "epoch": 2.2703071672354946, + "grad_norm": 2.8644917011260986, + "learning_rate": 0.00024323094425483506, + "loss": 5.5872, + "step": 6652 + }, + { + "epoch": 2.2706484641638225, + "grad_norm": 2.808652877807617, + "learning_rate": 0.00024311717861205917, + "loss": 5.4768, + "step": 6653 + }, + { + "epoch": 2.2709897610921503, + "grad_norm": 3.034461259841919, + "learning_rate": 0.00024300341296928327, + "loss": 6.1287, + "step": 6654 + }, + { + "epoch": 2.2713310580204777, + "grad_norm": 2.723137140274048, + "learning_rate": 0.0002428896473265074, + "loss": 6.1159, + "step": 6655 + }, + { + "epoch": 2.2716723549488056, + "grad_norm": 2.77011775970459, + "learning_rate": 0.0002427758816837315, + "loss": 6.3846, + "step": 6656 + }, + { + "epoch": 2.272013651877133, + "grad_norm": 2.8131182193756104, + "learning_rate": 0.00024266211604095564, + "loss": 6.6823, + "step": 6657 + }, + { + "epoch": 2.272354948805461, + "grad_norm": 2.815347194671631, + "learning_rate": 0.00024254835039817977, + "loss": 6.3332, + "step": 6658 + }, + { + "epoch": 2.2726962457337883, + "grad_norm": 2.8586032390594482, + "learning_rate": 0.00024243458475540385, + "loss": 6.2077, + "step": 6659 + }, + { + "epoch": 2.273037542662116, + "grad_norm": 2.8561007976531982, + "learning_rate": 0.00024232081911262798, + "loss": 5.5016, + "step": 6660 + }, + { + "epoch": 2.2733788395904435, + "grad_norm": 2.8218114376068115, + "learning_rate": 0.00024220705346985212, + "loss": 6.3183, + "step": 6661 + }, + { + "epoch": 2.2737201365187714, + "grad_norm": 2.9033164978027344, + "learning_rate": 0.00024209328782707625, + "loss": 5.2998, + "step": 6662 + }, + { + "epoch": 2.274061433447099, + "grad_norm": 2.784006357192993, + "learning_rate": 0.00024197952218430033, + "loss": 6.1072, + "step": 6663 + }, + { + "epoch": 2.2744027303754266, + "grad_norm": 2.8394157886505127, + "learning_rate": 0.00024186575654152446, + "loss": 5.9447, + "step": 6664 + }, + { + "epoch": 2.274744027303754, + "grad_norm": 2.8039710521698, + "learning_rate": 0.0002417519908987486, + "loss": 6.2788, + "step": 6665 + }, + { + "epoch": 2.275085324232082, + "grad_norm": 2.781151533126831, + "learning_rate": 0.0002416382252559727, + "loss": 6.0061, + "step": 6666 + }, + { + "epoch": 2.2754266211604097, + "grad_norm": 2.732513427734375, + "learning_rate": 0.00024152445961319683, + "loss": 6.1009, + "step": 6667 + }, + { + "epoch": 2.275767918088737, + "grad_norm": 2.8295493125915527, + "learning_rate": 0.00024141069397042093, + "loss": 5.4378, + "step": 6668 + }, + { + "epoch": 2.276109215017065, + "grad_norm": 2.9463069438934326, + "learning_rate": 0.00024129692832764504, + "loss": 5.8396, + "step": 6669 + }, + { + "epoch": 2.2764505119453924, + "grad_norm": 2.698521375656128, + "learning_rate": 0.00024118316268486917, + "loss": 5.6336, + "step": 6670 + }, + { + "epoch": 2.2767918088737202, + "grad_norm": 4.356855869293213, + "learning_rate": 0.0002410693970420933, + "loss": 4.6489, + "step": 6671 + }, + { + "epoch": 2.2771331058020476, + "grad_norm": 2.87615704536438, + "learning_rate": 0.00024095563139931743, + "loss": 6.4171, + "step": 6672 + }, + { + "epoch": 2.2774744027303755, + "grad_norm": 2.763654947280884, + "learning_rate": 0.0002408418657565415, + "loss": 6.1001, + "step": 6673 + }, + { + "epoch": 2.277815699658703, + "grad_norm": 2.8707218170166016, + "learning_rate": 0.00024072810011376564, + "loss": 5.8888, + "step": 6674 + }, + { + "epoch": 2.2781569965870307, + "grad_norm": 2.8277971744537354, + "learning_rate": 0.00024061433447098977, + "loss": 5.6618, + "step": 6675 + }, + { + "epoch": 2.2784982935153586, + "grad_norm": 2.787092447280884, + "learning_rate": 0.00024050056882821388, + "loss": 6.5541, + "step": 6676 + }, + { + "epoch": 2.278839590443686, + "grad_norm": 3.1452341079711914, + "learning_rate": 0.000240386803185438, + "loss": 4.8793, + "step": 6677 + }, + { + "epoch": 2.2791808873720134, + "grad_norm": 2.8025355339050293, + "learning_rate": 0.00024027303754266212, + "loss": 5.8608, + "step": 6678 + }, + { + "epoch": 2.2795221843003413, + "grad_norm": 2.7934341430664062, + "learning_rate": 0.00024015927189988622, + "loss": 6.2955, + "step": 6679 + }, + { + "epoch": 2.279863481228669, + "grad_norm": 2.744135618209839, + "learning_rate": 0.00024004550625711035, + "loss": 6.4455, + "step": 6680 + }, + { + "epoch": 2.2802047781569965, + "grad_norm": 2.76705002784729, + "learning_rate": 0.00023993174061433449, + "loss": 5.3531, + "step": 6681 + }, + { + "epoch": 2.2805460750853244, + "grad_norm": 2.9204037189483643, + "learning_rate": 0.00023981797497155862, + "loss": 6.5813, + "step": 6682 + }, + { + "epoch": 2.2808873720136518, + "grad_norm": 2.7960617542266846, + "learning_rate": 0.0002397042093287827, + "loss": 6.678, + "step": 6683 + }, + { + "epoch": 2.2812286689419796, + "grad_norm": 2.736600399017334, + "learning_rate": 0.00023959044368600683, + "loss": 6.0339, + "step": 6684 + }, + { + "epoch": 2.281569965870307, + "grad_norm": 3.3651063442230225, + "learning_rate": 0.00023947667804323096, + "loss": 5.7733, + "step": 6685 + }, + { + "epoch": 2.281911262798635, + "grad_norm": 2.7875654697418213, + "learning_rate": 0.00023936291240045506, + "loss": 5.9666, + "step": 6686 + }, + { + "epoch": 2.2822525597269623, + "grad_norm": 2.8029191493988037, + "learning_rate": 0.0002392491467576792, + "loss": 6.4682, + "step": 6687 + }, + { + "epoch": 2.28259385665529, + "grad_norm": 2.731361150741577, + "learning_rate": 0.0002391353811149033, + "loss": 6.54, + "step": 6688 + }, + { + "epoch": 2.282935153583618, + "grad_norm": 2.759530782699585, + "learning_rate": 0.0002390216154721274, + "loss": 6.4203, + "step": 6689 + }, + { + "epoch": 2.2832764505119454, + "grad_norm": 2.8398633003234863, + "learning_rate": 0.00023890784982935154, + "loss": 6.7545, + "step": 6690 + }, + { + "epoch": 2.283617747440273, + "grad_norm": 4.068755626678467, + "learning_rate": 0.00023879408418657567, + "loss": 5.7944, + "step": 6691 + }, + { + "epoch": 2.2839590443686006, + "grad_norm": 2.7552263736724854, + "learning_rate": 0.0002386803185437998, + "loss": 6.4369, + "step": 6692 + }, + { + "epoch": 2.2843003412969285, + "grad_norm": 2.8534114360809326, + "learning_rate": 0.00023856655290102388, + "loss": 5.6889, + "step": 6693 + }, + { + "epoch": 2.284641638225256, + "grad_norm": 2.8475186824798584, + "learning_rate": 0.000238452787258248, + "loss": 5.7206, + "step": 6694 + }, + { + "epoch": 2.2849829351535837, + "grad_norm": 2.8369054794311523, + "learning_rate": 0.00023833902161547214, + "loss": 5.6994, + "step": 6695 + }, + { + "epoch": 2.285324232081911, + "grad_norm": 2.86474609375, + "learning_rate": 0.00023822525597269625, + "loss": 6.6088, + "step": 6696 + }, + { + "epoch": 2.285665529010239, + "grad_norm": 2.8578531742095947, + "learning_rate": 0.00023811149032992038, + "loss": 6.2553, + "step": 6697 + }, + { + "epoch": 2.2860068259385664, + "grad_norm": 2.844667673110962, + "learning_rate": 0.00023799772468714449, + "loss": 5.9709, + "step": 6698 + }, + { + "epoch": 2.2863481228668943, + "grad_norm": 2.900826930999756, + "learning_rate": 0.0002378839590443686, + "loss": 6.7451, + "step": 6699 + }, + { + "epoch": 2.2866894197952217, + "grad_norm": 2.819692373275757, + "learning_rate": 0.00023777019340159272, + "loss": 5.666, + "step": 6700 + }, + { + "epoch": 2.2870307167235495, + "grad_norm": 2.7792670726776123, + "learning_rate": 0.00023765642775881686, + "loss": 6.7826, + "step": 6701 + }, + { + "epoch": 2.2873720136518774, + "grad_norm": 2.857227087020874, + "learning_rate": 0.00023754266211604096, + "loss": 5.8068, + "step": 6702 + }, + { + "epoch": 2.2877133105802048, + "grad_norm": 2.8718831539154053, + "learning_rate": 0.00023742889647326507, + "loss": 6.0465, + "step": 6703 + }, + { + "epoch": 2.288054607508532, + "grad_norm": 2.8006720542907715, + "learning_rate": 0.0002373151308304892, + "loss": 6.0748, + "step": 6704 + }, + { + "epoch": 2.28839590443686, + "grad_norm": 2.717714548110962, + "learning_rate": 0.00023720136518771333, + "loss": 6.2189, + "step": 6705 + }, + { + "epoch": 2.288737201365188, + "grad_norm": 2.80198073387146, + "learning_rate": 0.00023708759954493743, + "loss": 6.7073, + "step": 6706 + }, + { + "epoch": 2.2890784982935153, + "grad_norm": 2.960460901260376, + "learning_rate": 0.00023697383390216154, + "loss": 5.2394, + "step": 6707 + }, + { + "epoch": 2.289419795221843, + "grad_norm": 2.737926483154297, + "learning_rate": 0.00023686006825938567, + "loss": 6.5359, + "step": 6708 + }, + { + "epoch": 2.2897610921501705, + "grad_norm": 2.870438814163208, + "learning_rate": 0.00023674630261660978, + "loss": 6.5294, + "step": 6709 + }, + { + "epoch": 2.2901023890784984, + "grad_norm": 2.9853978157043457, + "learning_rate": 0.0002366325369738339, + "loss": 6.5247, + "step": 6710 + }, + { + "epoch": 2.290443686006826, + "grad_norm": 2.814044713973999, + "learning_rate": 0.00023651877133105804, + "loss": 6.4487, + "step": 6711 + }, + { + "epoch": 2.2907849829351536, + "grad_norm": 2.9334306716918945, + "learning_rate": 0.00023640500568828215, + "loss": 5.1492, + "step": 6712 + }, + { + "epoch": 2.291126279863481, + "grad_norm": 2.8538413047790527, + "learning_rate": 0.00023629124004550625, + "loss": 6.6194, + "step": 6713 + }, + { + "epoch": 2.291467576791809, + "grad_norm": 2.9160053730010986, + "learning_rate": 0.00023617747440273038, + "loss": 5.5613, + "step": 6714 + }, + { + "epoch": 2.2918088737201368, + "grad_norm": 2.795891046524048, + "learning_rate": 0.00023606370875995451, + "loss": 6.1532, + "step": 6715 + }, + { + "epoch": 2.292150170648464, + "grad_norm": 2.7880423069000244, + "learning_rate": 0.00023594994311717862, + "loss": 5.6862, + "step": 6716 + }, + { + "epoch": 2.2924914675767916, + "grad_norm": 2.863717794418335, + "learning_rate": 0.00023583617747440272, + "loss": 5.6751, + "step": 6717 + }, + { + "epoch": 2.2928327645051194, + "grad_norm": 2.9119949340820312, + "learning_rate": 0.00023572241183162686, + "loss": 5.5249, + "step": 6718 + }, + { + "epoch": 2.2931740614334473, + "grad_norm": 2.8489530086517334, + "learning_rate": 0.00023560864618885096, + "loss": 6.3403, + "step": 6719 + }, + { + "epoch": 2.2935153583617747, + "grad_norm": 3.275228261947632, + "learning_rate": 0.0002354948805460751, + "loss": 5.1688, + "step": 6720 + }, + { + "epoch": 2.2938566552901025, + "grad_norm": 2.758864164352417, + "learning_rate": 0.00023538111490329923, + "loss": 6.241, + "step": 6721 + }, + { + "epoch": 2.29419795221843, + "grad_norm": 2.7460415363311768, + "learning_rate": 0.00023526734926052333, + "loss": 5.9875, + "step": 6722 + }, + { + "epoch": 2.2945392491467578, + "grad_norm": 2.8659422397613525, + "learning_rate": 0.00023515358361774744, + "loss": 6.1308, + "step": 6723 + }, + { + "epoch": 2.294880546075085, + "grad_norm": 8.827744483947754, + "learning_rate": 0.00023503981797497157, + "loss": 6.2413, + "step": 6724 + }, + { + "epoch": 2.295221843003413, + "grad_norm": 2.7797117233276367, + "learning_rate": 0.0002349260523321957, + "loss": 6.1305, + "step": 6725 + }, + { + "epoch": 2.2955631399317404, + "grad_norm": 2.6930477619171143, + "learning_rate": 0.0002348122866894198, + "loss": 6.202, + "step": 6726 + }, + { + "epoch": 2.2959044368600683, + "grad_norm": 2.801419258117676, + "learning_rate": 0.0002346985210466439, + "loss": 6.2353, + "step": 6727 + }, + { + "epoch": 2.296245733788396, + "grad_norm": 2.8180465698242188, + "learning_rate": 0.00023458475540386804, + "loss": 6.1544, + "step": 6728 + }, + { + "epoch": 2.2965870307167235, + "grad_norm": 2.789066791534424, + "learning_rate": 0.00023447098976109215, + "loss": 5.9909, + "step": 6729 + }, + { + "epoch": 2.296928327645051, + "grad_norm": 2.7527191638946533, + "learning_rate": 0.00023435722411831628, + "loss": 6.1873, + "step": 6730 + }, + { + "epoch": 2.297269624573379, + "grad_norm": 2.798922061920166, + "learning_rate": 0.0002342434584755404, + "loss": 6.2598, + "step": 6731 + }, + { + "epoch": 2.2976109215017066, + "grad_norm": 2.9935081005096436, + "learning_rate": 0.0002341296928327645, + "loss": 5.3781, + "step": 6732 + }, + { + "epoch": 2.297952218430034, + "grad_norm": 2.8289272785186768, + "learning_rate": 0.00023401592718998862, + "loss": 6.3926, + "step": 6733 + }, + { + "epoch": 2.298293515358362, + "grad_norm": 2.873661756515503, + "learning_rate": 0.00023390216154721275, + "loss": 5.2289, + "step": 6734 + }, + { + "epoch": 2.2986348122866893, + "grad_norm": 2.7585761547088623, + "learning_rate": 0.00023378839590443688, + "loss": 6.4229, + "step": 6735 + }, + { + "epoch": 2.298976109215017, + "grad_norm": 2.7429072856903076, + "learning_rate": 0.00023367463026166096, + "loss": 6.1628, + "step": 6736 + }, + { + "epoch": 2.2993174061433446, + "grad_norm": 2.8091378211975098, + "learning_rate": 0.0002335608646188851, + "loss": 6.2183, + "step": 6737 + }, + { + "epoch": 2.2996587030716724, + "grad_norm": 2.830545425415039, + "learning_rate": 0.00023344709897610923, + "loss": 6.1877, + "step": 6738 + }, + { + "epoch": 2.3, + "grad_norm": 3.6861021518707275, + "learning_rate": 0.00023333333333333333, + "loss": 4.8636, + "step": 6739 + }, + { + "epoch": 2.3003412969283277, + "grad_norm": 2.8198957443237305, + "learning_rate": 0.00023321956769055746, + "loss": 6.044, + "step": 6740 + }, + { + "epoch": 2.3006825938566555, + "grad_norm": 2.8199527263641357, + "learning_rate": 0.00023310580204778157, + "loss": 6.2105, + "step": 6741 + }, + { + "epoch": 2.301023890784983, + "grad_norm": 2.9623944759368896, + "learning_rate": 0.00023299203640500567, + "loss": 5.464, + "step": 6742 + }, + { + "epoch": 2.3013651877133103, + "grad_norm": 2.8883421421051025, + "learning_rate": 0.0002328782707622298, + "loss": 5.7796, + "step": 6743 + }, + { + "epoch": 2.301706484641638, + "grad_norm": 2.6974728107452393, + "learning_rate": 0.00023276450511945394, + "loss": 6.3523, + "step": 6744 + }, + { + "epoch": 2.302047781569966, + "grad_norm": 2.7741858959198, + "learning_rate": 0.00023265073947667807, + "loss": 6.0041, + "step": 6745 + }, + { + "epoch": 2.3023890784982934, + "grad_norm": 2.771564245223999, + "learning_rate": 0.00023253697383390215, + "loss": 6.3403, + "step": 6746 + }, + { + "epoch": 2.3027303754266213, + "grad_norm": 2.801663875579834, + "learning_rate": 0.00023242320819112628, + "loss": 5.9708, + "step": 6747 + }, + { + "epoch": 2.3030716723549487, + "grad_norm": 2.906155586242676, + "learning_rate": 0.0002323094425483504, + "loss": 5.9501, + "step": 6748 + }, + { + "epoch": 2.3034129692832765, + "grad_norm": 2.7590839862823486, + "learning_rate": 0.00023219567690557452, + "loss": 6.2378, + "step": 6749 + }, + { + "epoch": 2.303754266211604, + "grad_norm": 2.7589144706726074, + "learning_rate": 0.00023208191126279865, + "loss": 6.4537, + "step": 6750 + }, + { + "epoch": 2.304095563139932, + "grad_norm": 2.7965896129608154, + "learning_rate": 0.00023196814562002275, + "loss": 5.9146, + "step": 6751 + }, + { + "epoch": 2.304436860068259, + "grad_norm": 2.827988862991333, + "learning_rate": 0.00023185437997724686, + "loss": 6.109, + "step": 6752 + }, + { + "epoch": 2.304778156996587, + "grad_norm": 2.7847177982330322, + "learning_rate": 0.000231740614334471, + "loss": 6.2527, + "step": 6753 + }, + { + "epoch": 2.305119453924915, + "grad_norm": 2.7947700023651123, + "learning_rate": 0.00023162684869169512, + "loss": 5.7674, + "step": 6754 + }, + { + "epoch": 2.3054607508532423, + "grad_norm": 2.828664779663086, + "learning_rate": 0.00023151308304891925, + "loss": 6.086, + "step": 6755 + }, + { + "epoch": 2.3058020477815697, + "grad_norm": 2.7907068729400635, + "learning_rate": 0.00023139931740614333, + "loss": 6.1609, + "step": 6756 + }, + { + "epoch": 2.3061433447098976, + "grad_norm": 2.880263566970825, + "learning_rate": 0.00023128555176336746, + "loss": 5.7513, + "step": 6757 + }, + { + "epoch": 2.3064846416382254, + "grad_norm": 2.764763832092285, + "learning_rate": 0.0002311717861205916, + "loss": 6.418, + "step": 6758 + }, + { + "epoch": 2.306825938566553, + "grad_norm": 5.498814105987549, + "learning_rate": 0.0002310580204778157, + "loss": 4.4087, + "step": 6759 + }, + { + "epoch": 2.3071672354948807, + "grad_norm": 2.8534750938415527, + "learning_rate": 0.00023094425483503983, + "loss": 6.3137, + "step": 6760 + }, + { + "epoch": 2.307508532423208, + "grad_norm": 2.6376616954803467, + "learning_rate": 0.00023083048919226394, + "loss": 6.0889, + "step": 6761 + }, + { + "epoch": 2.307849829351536, + "grad_norm": 3.473785877227783, + "learning_rate": 0.00023071672354948804, + "loss": 5.482, + "step": 6762 + }, + { + "epoch": 2.3081911262798633, + "grad_norm": 3.221046209335327, + "learning_rate": 0.00023060295790671217, + "loss": 5.6875, + "step": 6763 + }, + { + "epoch": 2.308532423208191, + "grad_norm": 2.871093273162842, + "learning_rate": 0.0002304891922639363, + "loss": 6.1357, + "step": 6764 + }, + { + "epoch": 2.3088737201365186, + "grad_norm": 4.9006667137146, + "learning_rate": 0.00023037542662116044, + "loss": 5.354, + "step": 6765 + }, + { + "epoch": 2.3092150170648464, + "grad_norm": 5.541250228881836, + "learning_rate": 0.00023026166097838452, + "loss": 5.1683, + "step": 6766 + }, + { + "epoch": 2.3095563139931743, + "grad_norm": 2.8012752532958984, + "learning_rate": 0.00023014789533560865, + "loss": 6.1139, + "step": 6767 + }, + { + "epoch": 2.3098976109215017, + "grad_norm": 2.7383131980895996, + "learning_rate": 0.00023003412969283278, + "loss": 5.1907, + "step": 6768 + }, + { + "epoch": 2.310238907849829, + "grad_norm": 2.7988059520721436, + "learning_rate": 0.00022992036405005689, + "loss": 5.9688, + "step": 6769 + }, + { + "epoch": 2.310580204778157, + "grad_norm": 2.8070616722106934, + "learning_rate": 0.000229806598407281, + "loss": 6.0292, + "step": 6770 + }, + { + "epoch": 2.310921501706485, + "grad_norm": 2.855241537094116, + "learning_rate": 0.00022969283276450512, + "loss": 5.8921, + "step": 6771 + }, + { + "epoch": 2.311262798634812, + "grad_norm": 6.3953142166137695, + "learning_rate": 0.00022957906712172923, + "loss": 5.0532, + "step": 6772 + }, + { + "epoch": 2.31160409556314, + "grad_norm": 6.097601890563965, + "learning_rate": 0.00022946530147895336, + "loss": 4.7717, + "step": 6773 + }, + { + "epoch": 2.3119453924914675, + "grad_norm": 4.208084583282471, + "learning_rate": 0.0002293515358361775, + "loss": 3.4272, + "step": 6774 + }, + { + "epoch": 2.3122866894197953, + "grad_norm": 2.8885231018066406, + "learning_rate": 0.0002292377701934016, + "loss": 6.0002, + "step": 6775 + }, + { + "epoch": 2.3126279863481227, + "grad_norm": 2.9425899982452393, + "learning_rate": 0.0002291240045506257, + "loss": 5.8151, + "step": 6776 + }, + { + "epoch": 2.3129692832764506, + "grad_norm": 2.7507827281951904, + "learning_rate": 0.00022901023890784983, + "loss": 6.2093, + "step": 6777 + }, + { + "epoch": 2.313310580204778, + "grad_norm": 2.732609272003174, + "learning_rate": 0.00022889647326507397, + "loss": 5.5137, + "step": 6778 + }, + { + "epoch": 2.313651877133106, + "grad_norm": 2.8268463611602783, + "learning_rate": 0.00022878270762229807, + "loss": 5.6401, + "step": 6779 + }, + { + "epoch": 2.3139931740614337, + "grad_norm": 2.7519233226776123, + "learning_rate": 0.00022866894197952218, + "loss": 6.2487, + "step": 6780 + }, + { + "epoch": 2.314334470989761, + "grad_norm": 2.7072274684906006, + "learning_rate": 0.0002285551763367463, + "loss": 5.4718, + "step": 6781 + }, + { + "epoch": 2.3146757679180885, + "grad_norm": 2.7394325733184814, + "learning_rate": 0.0002284414106939704, + "loss": 6.1958, + "step": 6782 + }, + { + "epoch": 2.3150170648464163, + "grad_norm": 2.812884569168091, + "learning_rate": 0.00022832764505119454, + "loss": 6.1972, + "step": 6783 + }, + { + "epoch": 2.315358361774744, + "grad_norm": 2.7412703037261963, + "learning_rate": 0.00022821387940841868, + "loss": 6.1518, + "step": 6784 + }, + { + "epoch": 2.3156996587030716, + "grad_norm": 2.686539888381958, + "learning_rate": 0.00022810011376564278, + "loss": 6.5791, + "step": 6785 + }, + { + "epoch": 2.3160409556313994, + "grad_norm": 2.780839443206787, + "learning_rate": 0.0002279863481228669, + "loss": 5.8385, + "step": 6786 + }, + { + "epoch": 2.316382252559727, + "grad_norm": 2.8812310695648193, + "learning_rate": 0.00022787258248009102, + "loss": 4.8785, + "step": 6787 + }, + { + "epoch": 2.3167235494880547, + "grad_norm": 2.8832690715789795, + "learning_rate": 0.00022775881683731515, + "loss": 6.5055, + "step": 6788 + }, + { + "epoch": 2.317064846416382, + "grad_norm": 2.7269794940948486, + "learning_rate": 0.00022764505119453926, + "loss": 6.2457, + "step": 6789 + }, + { + "epoch": 2.31740614334471, + "grad_norm": 2.879974126815796, + "learning_rate": 0.00022753128555176336, + "loss": 5.7098, + "step": 6790 + }, + { + "epoch": 2.3177474402730374, + "grad_norm": 2.8237853050231934, + "learning_rate": 0.0002274175199089875, + "loss": 5.9351, + "step": 6791 + }, + { + "epoch": 2.318088737201365, + "grad_norm": 2.740983247756958, + "learning_rate": 0.0002273037542662116, + "loss": 6.1166, + "step": 6792 + }, + { + "epoch": 2.318430034129693, + "grad_norm": 2.9319350719451904, + "learning_rate": 0.00022718998862343573, + "loss": 6.4873, + "step": 6793 + }, + { + "epoch": 2.3187713310580205, + "grad_norm": 2.7880241870880127, + "learning_rate": 0.00022707622298065986, + "loss": 5.8139, + "step": 6794 + }, + { + "epoch": 2.319112627986348, + "grad_norm": 2.693197727203369, + "learning_rate": 0.00022696245733788397, + "loss": 6.0517, + "step": 6795 + }, + { + "epoch": 2.3194539249146757, + "grad_norm": 2.8118865489959717, + "learning_rate": 0.00022684869169510807, + "loss": 6.0578, + "step": 6796 + }, + { + "epoch": 2.3197952218430036, + "grad_norm": 3.474181890487671, + "learning_rate": 0.0002267349260523322, + "loss": 4.9987, + "step": 6797 + }, + { + "epoch": 2.320136518771331, + "grad_norm": 2.758629083633423, + "learning_rate": 0.00022662116040955634, + "loss": 6.036, + "step": 6798 + }, + { + "epoch": 2.320477815699659, + "grad_norm": 2.87188720703125, + "learning_rate": 0.00022650739476678044, + "loss": 6.5413, + "step": 6799 + }, + { + "epoch": 2.3208191126279862, + "grad_norm": 2.650592803955078, + "learning_rate": 0.00022639362912400455, + "loss": 6.0238, + "step": 6800 + }, + { + "epoch": 2.321160409556314, + "grad_norm": 2.7691047191619873, + "learning_rate": 0.00022627986348122868, + "loss": 6.3138, + "step": 6801 + }, + { + "epoch": 2.3215017064846415, + "grad_norm": 2.800083637237549, + "learning_rate": 0.00022616609783845278, + "loss": 6.0257, + "step": 6802 + }, + { + "epoch": 2.3218430034129693, + "grad_norm": 2.9052236080169678, + "learning_rate": 0.00022605233219567691, + "loss": 5.8308, + "step": 6803 + }, + { + "epoch": 2.3221843003412967, + "grad_norm": 3.039931058883667, + "learning_rate": 0.00022593856655290102, + "loss": 5.0397, + "step": 6804 + }, + { + "epoch": 2.3225255972696246, + "grad_norm": 2.8269150257110596, + "learning_rate": 0.00022582480091012515, + "loss": 5.7589, + "step": 6805 + }, + { + "epoch": 2.3228668941979524, + "grad_norm": 2.7746031284332275, + "learning_rate": 0.00022571103526734926, + "loss": 5.6901, + "step": 6806 + }, + { + "epoch": 2.32320819112628, + "grad_norm": 1.9601713418960571, + "learning_rate": 0.0002255972696245734, + "loss": 3.2623, + "step": 6807 + }, + { + "epoch": 2.3235494880546073, + "grad_norm": 2.8860151767730713, + "learning_rate": 0.00022548350398179752, + "loss": 5.5029, + "step": 6808 + }, + { + "epoch": 2.323890784982935, + "grad_norm": 2.7598114013671875, + "learning_rate": 0.0002253697383390216, + "loss": 5.9686, + "step": 6809 + }, + { + "epoch": 2.324232081911263, + "grad_norm": 2.8497135639190674, + "learning_rate": 0.00022525597269624573, + "loss": 6.0748, + "step": 6810 + }, + { + "epoch": 2.3245733788395904, + "grad_norm": 2.8299918174743652, + "learning_rate": 0.00022514220705346986, + "loss": 6.3305, + "step": 6811 + }, + { + "epoch": 2.324914675767918, + "grad_norm": 2.794346332550049, + "learning_rate": 0.00022502844141069397, + "loss": 5.8338, + "step": 6812 + }, + { + "epoch": 2.3252559726962456, + "grad_norm": 2.809251546859741, + "learning_rate": 0.0002249146757679181, + "loss": 6.1905, + "step": 6813 + }, + { + "epoch": 2.3255972696245735, + "grad_norm": 3.0918211936950684, + "learning_rate": 0.0002248009101251422, + "loss": 6.2205, + "step": 6814 + }, + { + "epoch": 2.325938566552901, + "grad_norm": 6.421435832977295, + "learning_rate": 0.00022468714448236634, + "loss": 4.1736, + "step": 6815 + }, + { + "epoch": 2.3262798634812287, + "grad_norm": 3.02647066116333, + "learning_rate": 0.00022457337883959044, + "loss": 5.8284, + "step": 6816 + }, + { + "epoch": 2.326621160409556, + "grad_norm": 2.943453550338745, + "learning_rate": 0.00022445961319681457, + "loss": 5.329, + "step": 6817 + }, + { + "epoch": 2.326962457337884, + "grad_norm": 2.718707323074341, + "learning_rate": 0.0002243458475540387, + "loss": 6.4537, + "step": 6818 + }, + { + "epoch": 2.327303754266212, + "grad_norm": 2.7228763103485107, + "learning_rate": 0.00022423208191126278, + "loss": 5.5298, + "step": 6819 + }, + { + "epoch": 2.3276450511945392, + "grad_norm": 2.7870371341705322, + "learning_rate": 0.00022411831626848692, + "loss": 5.9528, + "step": 6820 + }, + { + "epoch": 2.3279863481228666, + "grad_norm": 2.7378652095794678, + "learning_rate": 0.00022400455062571105, + "loss": 5.9546, + "step": 6821 + }, + { + "epoch": 2.3283276450511945, + "grad_norm": 2.816159725189209, + "learning_rate": 0.00022389078498293515, + "loss": 6.0227, + "step": 6822 + }, + { + "epoch": 2.3286689419795223, + "grad_norm": 3.0372061729431152, + "learning_rate": 0.00022377701934015928, + "loss": 5.1104, + "step": 6823 + }, + { + "epoch": 2.3290102389078498, + "grad_norm": 3.7163870334625244, + "learning_rate": 0.0002236632536973834, + "loss": 5.6614, + "step": 6824 + }, + { + "epoch": 2.3293515358361776, + "grad_norm": 2.7196543216705322, + "learning_rate": 0.00022354948805460752, + "loss": 6.4318, + "step": 6825 + }, + { + "epoch": 2.329692832764505, + "grad_norm": 2.8511674404144287, + "learning_rate": 0.00022343572241183163, + "loss": 5.7164, + "step": 6826 + }, + { + "epoch": 2.330034129692833, + "grad_norm": 2.838578462600708, + "learning_rate": 0.00022332195676905576, + "loss": 5.8845, + "step": 6827 + }, + { + "epoch": 2.3303754266211603, + "grad_norm": 3.3302481174468994, + "learning_rate": 0.0002232081911262799, + "loss": 5.2908, + "step": 6828 + }, + { + "epoch": 2.330716723549488, + "grad_norm": 2.730325222015381, + "learning_rate": 0.00022309442548350397, + "loss": 6.1103, + "step": 6829 + }, + { + "epoch": 2.3310580204778155, + "grad_norm": 2.734823226928711, + "learning_rate": 0.0002229806598407281, + "loss": 6.5294, + "step": 6830 + }, + { + "epoch": 2.3313993174061434, + "grad_norm": 2.777035713195801, + "learning_rate": 0.00022286689419795223, + "loss": 6.0642, + "step": 6831 + }, + { + "epoch": 2.331740614334471, + "grad_norm": 2.896202802658081, + "learning_rate": 0.00022275312855517634, + "loss": 5.5833, + "step": 6832 + }, + { + "epoch": 2.3320819112627986, + "grad_norm": 2.8327341079711914, + "learning_rate": 0.00022263936291240047, + "loss": 5.7914, + "step": 6833 + }, + { + "epoch": 2.3324232081911265, + "grad_norm": 7.732255935668945, + "learning_rate": 0.00022252559726962457, + "loss": 3.887, + "step": 6834 + }, + { + "epoch": 2.332764505119454, + "grad_norm": 2.896789312362671, + "learning_rate": 0.00022241183162684868, + "loss": 6.0555, + "step": 6835 + }, + { + "epoch": 2.3331058020477817, + "grad_norm": 2.7806904315948486, + "learning_rate": 0.0002222980659840728, + "loss": 6.4753, + "step": 6836 + }, + { + "epoch": 2.333447098976109, + "grad_norm": 2.836212635040283, + "learning_rate": 0.00022218430034129694, + "loss": 6.3409, + "step": 6837 + }, + { + "epoch": 2.333788395904437, + "grad_norm": 2.7770421504974365, + "learning_rate": 0.00022207053469852108, + "loss": 6.5263, + "step": 6838 + }, + { + "epoch": 2.3341296928327644, + "grad_norm": 2.7537899017333984, + "learning_rate": 0.00022195676905574515, + "loss": 6.3918, + "step": 6839 + }, + { + "epoch": 2.3344709897610922, + "grad_norm": 2.9366977214813232, + "learning_rate": 0.00022184300341296929, + "loss": 5.7525, + "step": 6840 + }, + { + "epoch": 2.3348122866894196, + "grad_norm": 3.4186301231384277, + "learning_rate": 0.00022172923777019342, + "loss": 5.2059, + "step": 6841 + }, + { + "epoch": 2.3351535836177475, + "grad_norm": 2.832503080368042, + "learning_rate": 0.00022161547212741752, + "loss": 5.9087, + "step": 6842 + }, + { + "epoch": 2.335494880546075, + "grad_norm": 2.741806983947754, + "learning_rate": 0.00022150170648464163, + "loss": 6.3415, + "step": 6843 + }, + { + "epoch": 2.3358361774744028, + "grad_norm": 2.8064048290252686, + "learning_rate": 0.00022138794084186576, + "loss": 6.0018, + "step": 6844 + }, + { + "epoch": 2.3361774744027306, + "grad_norm": 2.7633867263793945, + "learning_rate": 0.00022127417519908986, + "loss": 6.0155, + "step": 6845 + }, + { + "epoch": 2.336518771331058, + "grad_norm": 2.8978452682495117, + "learning_rate": 0.000221160409556314, + "loss": 5.8894, + "step": 6846 + }, + { + "epoch": 2.336860068259386, + "grad_norm": 2.9551198482513428, + "learning_rate": 0.00022104664391353813, + "loss": 4.3913, + "step": 6847 + }, + { + "epoch": 2.3372013651877133, + "grad_norm": 2.845735549926758, + "learning_rate": 0.00022093287827076223, + "loss": 5.5712, + "step": 6848 + }, + { + "epoch": 2.337542662116041, + "grad_norm": 2.8292698860168457, + "learning_rate": 0.00022081911262798634, + "loss": 6.0293, + "step": 6849 + }, + { + "epoch": 2.3378839590443685, + "grad_norm": 2.7965333461761475, + "learning_rate": 0.00022070534698521047, + "loss": 5.9881, + "step": 6850 + }, + { + "epoch": 2.3382252559726964, + "grad_norm": 2.763213872909546, + "learning_rate": 0.0002205915813424346, + "loss": 6.1471, + "step": 6851 + }, + { + "epoch": 2.3385665529010238, + "grad_norm": 3.1655666828155518, + "learning_rate": 0.0002204778156996587, + "loss": 5.4231, + "step": 6852 + }, + { + "epoch": 2.3389078498293516, + "grad_norm": 2.8350963592529297, + "learning_rate": 0.0002203640500568828, + "loss": 5.9476, + "step": 6853 + }, + { + "epoch": 2.339249146757679, + "grad_norm": 2.731945514678955, + "learning_rate": 0.00022025028441410694, + "loss": 5.8654, + "step": 6854 + }, + { + "epoch": 2.339590443686007, + "grad_norm": 2.7220661640167236, + "learning_rate": 0.00022013651877133105, + "loss": 6.4109, + "step": 6855 + }, + { + "epoch": 2.3399317406143343, + "grad_norm": 2.7597591876983643, + "learning_rate": 0.00022002275312855518, + "loss": 6.377, + "step": 6856 + }, + { + "epoch": 2.340273037542662, + "grad_norm": 2.7260942459106445, + "learning_rate": 0.0002199089874857793, + "loss": 5.9268, + "step": 6857 + }, + { + "epoch": 2.34061433447099, + "grad_norm": 2.8388872146606445, + "learning_rate": 0.00021979522184300342, + "loss": 5.8152, + "step": 6858 + }, + { + "epoch": 2.3409556313993174, + "grad_norm": 2.754322052001953, + "learning_rate": 0.00021968145620022752, + "loss": 6.2267, + "step": 6859 + }, + { + "epoch": 2.3412969283276452, + "grad_norm": 2.73994779586792, + "learning_rate": 0.00021956769055745166, + "loss": 6.0755, + "step": 6860 + }, + { + "epoch": 2.3416382252559726, + "grad_norm": 2.6968204975128174, + "learning_rate": 0.0002194539249146758, + "loss": 6.0136, + "step": 6861 + }, + { + "epoch": 2.3419795221843005, + "grad_norm": 2.7095813751220703, + "learning_rate": 0.0002193401592718999, + "loss": 5.9502, + "step": 6862 + }, + { + "epoch": 2.342320819112628, + "grad_norm": 4.785019397735596, + "learning_rate": 0.000219226393629124, + "loss": 4.9616, + "step": 6863 + }, + { + "epoch": 2.3426621160409558, + "grad_norm": 2.7408225536346436, + "learning_rate": 0.00021911262798634813, + "loss": 6.1393, + "step": 6864 + }, + { + "epoch": 2.343003412969283, + "grad_norm": 2.8245584964752197, + "learning_rate": 0.00021899886234357223, + "loss": 6.7514, + "step": 6865 + }, + { + "epoch": 2.343344709897611, + "grad_norm": 2.7768466472625732, + "learning_rate": 0.00021888509670079637, + "loss": 6.0515, + "step": 6866 + }, + { + "epoch": 2.3436860068259384, + "grad_norm": 2.7474124431610107, + "learning_rate": 0.0002187713310580205, + "loss": 6.5632, + "step": 6867 + }, + { + "epoch": 2.3440273037542663, + "grad_norm": 2.730879545211792, + "learning_rate": 0.0002186575654152446, + "loss": 6.0169, + "step": 6868 + }, + { + "epoch": 2.3443686006825937, + "grad_norm": 2.721271514892578, + "learning_rate": 0.0002185437997724687, + "loss": 6.3651, + "step": 6869 + }, + { + "epoch": 2.3447098976109215, + "grad_norm": 2.727299690246582, + "learning_rate": 0.00021843003412969284, + "loss": 6.0875, + "step": 6870 + }, + { + "epoch": 2.3450511945392494, + "grad_norm": 2.7979040145874023, + "learning_rate": 0.00021831626848691697, + "loss": 5.3848, + "step": 6871 + }, + { + "epoch": 2.345392491467577, + "grad_norm": 2.7405858039855957, + "learning_rate": 0.00021820250284414108, + "loss": 5.9585, + "step": 6872 + }, + { + "epoch": 2.3457337883959046, + "grad_norm": 2.7488925457000732, + "learning_rate": 0.00021808873720136518, + "loss": 6.4375, + "step": 6873 + }, + { + "epoch": 2.346075085324232, + "grad_norm": 2.80232310295105, + "learning_rate": 0.00021797497155858931, + "loss": 6.0547, + "step": 6874 + }, + { + "epoch": 2.34641638225256, + "grad_norm": 2.803708791732788, + "learning_rate": 0.00021786120591581342, + "loss": 5.5016, + "step": 6875 + }, + { + "epoch": 2.3467576791808873, + "grad_norm": 2.724703550338745, + "learning_rate": 0.00021774744027303755, + "loss": 6.1233, + "step": 6876 + }, + { + "epoch": 2.347098976109215, + "grad_norm": 2.8196821212768555, + "learning_rate": 0.00021763367463026166, + "loss": 5.8877, + "step": 6877 + }, + { + "epoch": 2.3474402730375425, + "grad_norm": 3.931154251098633, + "learning_rate": 0.0002175199089874858, + "loss": 5.5875, + "step": 6878 + }, + { + "epoch": 2.3477815699658704, + "grad_norm": 3.5176618099212646, + "learning_rate": 0.0002174061433447099, + "loss": 4.2908, + "step": 6879 + }, + { + "epoch": 2.348122866894198, + "grad_norm": 2.752065896987915, + "learning_rate": 0.00021729237770193403, + "loss": 5.6804, + "step": 6880 + }, + { + "epoch": 2.3484641638225257, + "grad_norm": 2.8360445499420166, + "learning_rate": 0.00021717861205915816, + "loss": 6.1461, + "step": 6881 + }, + { + "epoch": 2.348805460750853, + "grad_norm": 2.739427328109741, + "learning_rate": 0.00021706484641638224, + "loss": 6.4064, + "step": 6882 + }, + { + "epoch": 2.349146757679181, + "grad_norm": 3.3373124599456787, + "learning_rate": 0.00021695108077360637, + "loss": 6.0023, + "step": 6883 + }, + { + "epoch": 2.3494880546075088, + "grad_norm": 2.7712957859039307, + "learning_rate": 0.0002168373151308305, + "loss": 6.3348, + "step": 6884 + }, + { + "epoch": 2.349829351535836, + "grad_norm": 3.1796998977661133, + "learning_rate": 0.0002167235494880546, + "loss": 5.1784, + "step": 6885 + }, + { + "epoch": 2.350170648464164, + "grad_norm": 2.7970149517059326, + "learning_rate": 0.00021660978384527874, + "loss": 6.1688, + "step": 6886 + }, + { + "epoch": 2.3505119453924914, + "grad_norm": 2.6617581844329834, + "learning_rate": 0.00021649601820250284, + "loss": 6.3794, + "step": 6887 + }, + { + "epoch": 2.3508532423208193, + "grad_norm": 2.752316474914551, + "learning_rate": 0.00021638225255972697, + "loss": 5.9957, + "step": 6888 + }, + { + "epoch": 2.3511945392491467, + "grad_norm": 2.7273192405700684, + "learning_rate": 0.00021626848691695108, + "loss": 6.0486, + "step": 6889 + }, + { + "epoch": 2.3515358361774745, + "grad_norm": 2.755157947540283, + "learning_rate": 0.0002161547212741752, + "loss": 6.2515, + "step": 6890 + }, + { + "epoch": 2.351877133105802, + "grad_norm": 2.757171154022217, + "learning_rate": 0.00021604095563139934, + "loss": 6.3392, + "step": 6891 + }, + { + "epoch": 2.35221843003413, + "grad_norm": 2.830286979675293, + "learning_rate": 0.00021592718998862342, + "loss": 6.4345, + "step": 6892 + }, + { + "epoch": 2.352559726962457, + "grad_norm": 2.98397159576416, + "learning_rate": 0.00021581342434584755, + "loss": 5.7659, + "step": 6893 + }, + { + "epoch": 2.352901023890785, + "grad_norm": 2.784120798110962, + "learning_rate": 0.00021569965870307168, + "loss": 5.3936, + "step": 6894 + }, + { + "epoch": 2.3532423208191124, + "grad_norm": 2.79896879196167, + "learning_rate": 0.0002155858930602958, + "loss": 6.6646, + "step": 6895 + }, + { + "epoch": 2.3535836177474403, + "grad_norm": 2.7373616695404053, + "learning_rate": 0.00021547212741751992, + "loss": 6.4598, + "step": 6896 + }, + { + "epoch": 2.353924914675768, + "grad_norm": 2.7969889640808105, + "learning_rate": 0.00021535836177474403, + "loss": 5.9267, + "step": 6897 + }, + { + "epoch": 2.3542662116040955, + "grad_norm": 2.9162516593933105, + "learning_rate": 0.00021524459613196816, + "loss": 5.6167, + "step": 6898 + }, + { + "epoch": 2.3546075085324234, + "grad_norm": 2.745880365371704, + "learning_rate": 0.00021513083048919226, + "loss": 5.9538, + "step": 6899 + }, + { + "epoch": 2.354948805460751, + "grad_norm": 2.799854278564453, + "learning_rate": 0.0002150170648464164, + "loss": 4.8416, + "step": 6900 + }, + { + "epoch": 2.3552901023890787, + "grad_norm": 2.84338116645813, + "learning_rate": 0.00021490329920364053, + "loss": 6.2332, + "step": 6901 + }, + { + "epoch": 2.355631399317406, + "grad_norm": 2.713643789291382, + "learning_rate": 0.0002147895335608646, + "loss": 6.3292, + "step": 6902 + }, + { + "epoch": 2.355972696245734, + "grad_norm": 2.7593953609466553, + "learning_rate": 0.00021467576791808874, + "loss": 6.1654, + "step": 6903 + }, + { + "epoch": 2.3563139931740613, + "grad_norm": 2.7301793098449707, + "learning_rate": 0.00021456200227531287, + "loss": 5.8449, + "step": 6904 + }, + { + "epoch": 2.356655290102389, + "grad_norm": 2.736713409423828, + "learning_rate": 0.00021444823663253697, + "loss": 6.0168, + "step": 6905 + }, + { + "epoch": 2.3569965870307166, + "grad_norm": 2.8782119750976562, + "learning_rate": 0.0002143344709897611, + "loss": 6.1835, + "step": 6906 + }, + { + "epoch": 2.3573378839590444, + "grad_norm": 2.7333459854125977, + "learning_rate": 0.0002142207053469852, + "loss": 6.9639, + "step": 6907 + }, + { + "epoch": 2.357679180887372, + "grad_norm": 2.8486573696136475, + "learning_rate": 0.00021410693970420934, + "loss": 5.9174, + "step": 6908 + }, + { + "epoch": 2.3580204778156997, + "grad_norm": 2.6959550380706787, + "learning_rate": 0.00021399317406143345, + "loss": 6.3947, + "step": 6909 + }, + { + "epoch": 2.3583617747440275, + "grad_norm": 2.771045207977295, + "learning_rate": 0.00021387940841865758, + "loss": 5.9439, + "step": 6910 + }, + { + "epoch": 2.358703071672355, + "grad_norm": 2.7466630935668945, + "learning_rate": 0.00021376564277588169, + "loss": 5.7553, + "step": 6911 + }, + { + "epoch": 2.359044368600683, + "grad_norm": 2.725827217102051, + "learning_rate": 0.0002136518771331058, + "loss": 5.7062, + "step": 6912 + }, + { + "epoch": 2.35938566552901, + "grad_norm": 2.763960123062134, + "learning_rate": 0.00021353811149032992, + "loss": 6.1409, + "step": 6913 + }, + { + "epoch": 2.359726962457338, + "grad_norm": 2.7459399700164795, + "learning_rate": 0.00021342434584755405, + "loss": 6.2805, + "step": 6914 + }, + { + "epoch": 2.3600682593856654, + "grad_norm": 2.6126558780670166, + "learning_rate": 0.00021331058020477816, + "loss": 5.6644, + "step": 6915 + }, + { + "epoch": 2.3604095563139933, + "grad_norm": 2.730069160461426, + "learning_rate": 0.00021319681456200226, + "loss": 5.9614, + "step": 6916 + }, + { + "epoch": 2.3607508532423207, + "grad_norm": 2.6941754817962646, + "learning_rate": 0.0002130830489192264, + "loss": 6.082, + "step": 6917 + }, + { + "epoch": 2.3610921501706486, + "grad_norm": 2.685556173324585, + "learning_rate": 0.00021296928327645053, + "loss": 5.7493, + "step": 6918 + }, + { + "epoch": 2.361433447098976, + "grad_norm": 2.748690366744995, + "learning_rate": 0.00021285551763367463, + "loss": 6.2632, + "step": 6919 + }, + { + "epoch": 2.361774744027304, + "grad_norm": 2.6726412773132324, + "learning_rate": 0.00021274175199089877, + "loss": 5.5741, + "step": 6920 + }, + { + "epoch": 2.362116040955631, + "grad_norm": 2.778284788131714, + "learning_rate": 0.00021262798634812287, + "loss": 5.3961, + "step": 6921 + }, + { + "epoch": 2.362457337883959, + "grad_norm": 2.7935075759887695, + "learning_rate": 0.00021251422070534698, + "loss": 6.1253, + "step": 6922 + }, + { + "epoch": 2.362798634812287, + "grad_norm": 2.765655279159546, + "learning_rate": 0.0002124004550625711, + "loss": 6.6871, + "step": 6923 + }, + { + "epoch": 2.3631399317406143, + "grad_norm": 2.659959554672241, + "learning_rate": 0.00021228668941979524, + "loss": 6.0147, + "step": 6924 + }, + { + "epoch": 2.363481228668942, + "grad_norm": 2.7546181678771973, + "learning_rate": 0.00021217292377701934, + "loss": 6.5698, + "step": 6925 + }, + { + "epoch": 2.3638225255972696, + "grad_norm": 2.8323190212249756, + "learning_rate": 0.00021205915813424345, + "loss": 5.7636, + "step": 6926 + }, + { + "epoch": 2.3641638225255974, + "grad_norm": 2.7773520946502686, + "learning_rate": 0.00021194539249146758, + "loss": 6.3198, + "step": 6927 + }, + { + "epoch": 2.364505119453925, + "grad_norm": 2.6490156650543213, + "learning_rate": 0.0002118316268486917, + "loss": 5.7899, + "step": 6928 + }, + { + "epoch": 2.3648464163822527, + "grad_norm": 2.878728151321411, + "learning_rate": 0.00021171786120591582, + "loss": 5.5224, + "step": 6929 + }, + { + "epoch": 2.36518771331058, + "grad_norm": 2.7923343181610107, + "learning_rate": 0.00021160409556313995, + "loss": 6.871, + "step": 6930 + }, + { + "epoch": 2.365529010238908, + "grad_norm": 4.340980052947998, + "learning_rate": 0.00021149032992036406, + "loss": 5.0883, + "step": 6931 + }, + { + "epoch": 2.3658703071672353, + "grad_norm": 2.780867338180542, + "learning_rate": 0.00021137656427758816, + "loss": 5.614, + "step": 6932 + }, + { + "epoch": 2.366211604095563, + "grad_norm": 2.7651314735412598, + "learning_rate": 0.0002112627986348123, + "loss": 6.4194, + "step": 6933 + }, + { + "epoch": 2.3665529010238906, + "grad_norm": 2.7465429306030273, + "learning_rate": 0.00021114903299203642, + "loss": 5.8274, + "step": 6934 + }, + { + "epoch": 2.3668941979522184, + "grad_norm": 2.8030600547790527, + "learning_rate": 0.00021103526734926053, + "loss": 5.9884, + "step": 6935 + }, + { + "epoch": 2.3672354948805463, + "grad_norm": 2.78486704826355, + "learning_rate": 0.00021092150170648463, + "loss": 6.4359, + "step": 6936 + }, + { + "epoch": 2.3675767918088737, + "grad_norm": 2.7584781646728516, + "learning_rate": 0.00021080773606370877, + "loss": 6.4127, + "step": 6937 + }, + { + "epoch": 2.3679180887372016, + "grad_norm": 2.7212891578674316, + "learning_rate": 0.00021069397042093287, + "loss": 6.8108, + "step": 6938 + }, + { + "epoch": 2.368259385665529, + "grad_norm": 2.8116931915283203, + "learning_rate": 0.000210580204778157, + "loss": 6.0632, + "step": 6939 + }, + { + "epoch": 2.368600682593857, + "grad_norm": 2.847207546234131, + "learning_rate": 0.00021046643913538114, + "loss": 5.8893, + "step": 6940 + }, + { + "epoch": 2.368941979522184, + "grad_norm": 2.7255427837371826, + "learning_rate": 0.00021035267349260524, + "loss": 6.3627, + "step": 6941 + }, + { + "epoch": 2.369283276450512, + "grad_norm": 3.3348500728607178, + "learning_rate": 0.00021023890784982934, + "loss": 6.4244, + "step": 6942 + }, + { + "epoch": 2.3696245733788395, + "grad_norm": 2.6929471492767334, + "learning_rate": 0.00021012514220705348, + "loss": 6.4524, + "step": 6943 + }, + { + "epoch": 2.3699658703071673, + "grad_norm": 2.598615884780884, + "learning_rate": 0.0002100113765642776, + "loss": 5.8728, + "step": 6944 + }, + { + "epoch": 2.3703071672354947, + "grad_norm": 2.8167927265167236, + "learning_rate": 0.0002098976109215017, + "loss": 5.5652, + "step": 6945 + }, + { + "epoch": 2.3706484641638226, + "grad_norm": 3.9340548515319824, + "learning_rate": 0.00020978384527872582, + "loss": 5.4056, + "step": 6946 + }, + { + "epoch": 2.37098976109215, + "grad_norm": 2.9168150424957275, + "learning_rate": 0.00020967007963594995, + "loss": 5.4847, + "step": 6947 + }, + { + "epoch": 2.371331058020478, + "grad_norm": 2.80441951751709, + "learning_rate": 0.00020955631399317406, + "loss": 6.661, + "step": 6948 + }, + { + "epoch": 2.3716723549488057, + "grad_norm": 2.7382166385650635, + "learning_rate": 0.0002094425483503982, + "loss": 5.718, + "step": 6949 + }, + { + "epoch": 2.372013651877133, + "grad_norm": 2.878997802734375, + "learning_rate": 0.0002093287827076223, + "loss": 6.9364, + "step": 6950 + }, + { + "epoch": 2.372354948805461, + "grad_norm": 2.7554514408111572, + "learning_rate": 0.00020921501706484642, + "loss": 6.7696, + "step": 6951 + }, + { + "epoch": 2.3726962457337883, + "grad_norm": 2.6601312160491943, + "learning_rate": 0.00020910125142207053, + "loss": 6.2882, + "step": 6952 + }, + { + "epoch": 2.373037542662116, + "grad_norm": 2.9576683044433594, + "learning_rate": 0.00020898748577929466, + "loss": 5.6812, + "step": 6953 + }, + { + "epoch": 2.3733788395904436, + "grad_norm": 2.700512170791626, + "learning_rate": 0.0002088737201365188, + "loss": 6.0231, + "step": 6954 + }, + { + "epoch": 2.3737201365187715, + "grad_norm": 2.6124114990234375, + "learning_rate": 0.00020875995449374287, + "loss": 5.9432, + "step": 6955 + }, + { + "epoch": 2.374061433447099, + "grad_norm": 4.633599281311035, + "learning_rate": 0.000208646188850967, + "loss": 5.6296, + "step": 6956 + }, + { + "epoch": 2.3744027303754267, + "grad_norm": 2.7569868564605713, + "learning_rate": 0.00020853242320819114, + "loss": 6.6165, + "step": 6957 + }, + { + "epoch": 2.374744027303754, + "grad_norm": 2.804227590560913, + "learning_rate": 0.00020841865756541524, + "loss": 5.4564, + "step": 6958 + }, + { + "epoch": 2.375085324232082, + "grad_norm": 2.7688915729522705, + "learning_rate": 0.00020830489192263937, + "loss": 6.1656, + "step": 6959 + }, + { + "epoch": 2.3754266211604094, + "grad_norm": 2.7455203533172607, + "learning_rate": 0.00020819112627986348, + "loss": 6.0186, + "step": 6960 + }, + { + "epoch": 2.375767918088737, + "grad_norm": 2.7759294509887695, + "learning_rate": 0.0002080773606370876, + "loss": 5.8901, + "step": 6961 + }, + { + "epoch": 2.376109215017065, + "grad_norm": 2.660414218902588, + "learning_rate": 0.00020796359499431171, + "loss": 6.0713, + "step": 6962 + }, + { + "epoch": 2.3764505119453925, + "grad_norm": 2.7604238986968994, + "learning_rate": 0.00020784982935153585, + "loss": 6.2344, + "step": 6963 + }, + { + "epoch": 2.3767918088737203, + "grad_norm": 2.65087890625, + "learning_rate": 0.00020773606370875998, + "loss": 6.0383, + "step": 6964 + }, + { + "epoch": 2.3771331058020477, + "grad_norm": 2.73121976852417, + "learning_rate": 0.00020762229806598406, + "loss": 5.8064, + "step": 6965 + }, + { + "epoch": 2.3774744027303756, + "grad_norm": 11.710968017578125, + "learning_rate": 0.0002075085324232082, + "loss": 5.0296, + "step": 6966 + }, + { + "epoch": 2.377815699658703, + "grad_norm": 2.5780153274536133, + "learning_rate": 0.00020739476678043232, + "loss": 6.0447, + "step": 6967 + }, + { + "epoch": 2.378156996587031, + "grad_norm": 2.7825236320495605, + "learning_rate": 0.00020728100113765643, + "loss": 6.6443, + "step": 6968 + }, + { + "epoch": 2.3784982935153582, + "grad_norm": 2.7635421752929688, + "learning_rate": 0.00020716723549488056, + "loss": 5.4036, + "step": 6969 + }, + { + "epoch": 2.378839590443686, + "grad_norm": 2.749699592590332, + "learning_rate": 0.00020705346985210466, + "loss": 5.763, + "step": 6970 + }, + { + "epoch": 2.3791808873720135, + "grad_norm": 2.686196804046631, + "learning_rate": 0.0002069397042093288, + "loss": 5.9363, + "step": 6971 + }, + { + "epoch": 2.3795221843003413, + "grad_norm": 2.801814317703247, + "learning_rate": 0.0002068259385665529, + "loss": 6.4167, + "step": 6972 + }, + { + "epoch": 2.3798634812286688, + "grad_norm": 2.927759885787964, + "learning_rate": 0.00020671217292377703, + "loss": 5.5384, + "step": 6973 + }, + { + "epoch": 2.3802047781569966, + "grad_norm": 5.355523109436035, + "learning_rate": 0.00020659840728100116, + "loss": 4.3588, + "step": 6974 + }, + { + "epoch": 2.3805460750853245, + "grad_norm": 6.205738544464111, + "learning_rate": 0.00020648464163822524, + "loss": 5.5929, + "step": 6975 + }, + { + "epoch": 2.380887372013652, + "grad_norm": 2.8085031509399414, + "learning_rate": 0.00020637087599544937, + "loss": 6.4081, + "step": 6976 + }, + { + "epoch": 2.3812286689419797, + "grad_norm": 2.7193679809570312, + "learning_rate": 0.0002062571103526735, + "loss": 6.0632, + "step": 6977 + }, + { + "epoch": 2.381569965870307, + "grad_norm": 2.8042826652526855, + "learning_rate": 0.0002061433447098976, + "loss": 5.7394, + "step": 6978 + }, + { + "epoch": 2.381911262798635, + "grad_norm": 2.7161612510681152, + "learning_rate": 0.00020602957906712174, + "loss": 6.55, + "step": 6979 + }, + { + "epoch": 2.3822525597269624, + "grad_norm": 2.6074378490448, + "learning_rate": 0.00020591581342434585, + "loss": 5.7456, + "step": 6980 + }, + { + "epoch": 2.38259385665529, + "grad_norm": 2.804840087890625, + "learning_rate": 0.00020580204778156998, + "loss": 4.9974, + "step": 6981 + }, + { + "epoch": 2.3829351535836176, + "grad_norm": 2.746267795562744, + "learning_rate": 0.00020568828213879408, + "loss": 6.2011, + "step": 6982 + }, + { + "epoch": 2.3832764505119455, + "grad_norm": 2.72701096534729, + "learning_rate": 0.00020557451649601822, + "loss": 6.2791, + "step": 6983 + }, + { + "epoch": 2.383617747440273, + "grad_norm": 2.6555674076080322, + "learning_rate": 0.00020546075085324232, + "loss": 5.5011, + "step": 6984 + }, + { + "epoch": 2.3839590443686007, + "grad_norm": 2.7322537899017334, + "learning_rate": 0.00020534698521046643, + "loss": 5.9383, + "step": 6985 + }, + { + "epoch": 2.384300341296928, + "grad_norm": 2.779167652130127, + "learning_rate": 0.00020523321956769056, + "loss": 6.5133, + "step": 6986 + }, + { + "epoch": 2.384641638225256, + "grad_norm": 2.8352394104003906, + "learning_rate": 0.0002051194539249147, + "loss": 6.4203, + "step": 6987 + }, + { + "epoch": 2.384982935153584, + "grad_norm": 2.740598440170288, + "learning_rate": 0.0002050056882821388, + "loss": 5.7477, + "step": 6988 + }, + { + "epoch": 2.3853242320819112, + "grad_norm": 2.779649496078491, + "learning_rate": 0.0002048919226393629, + "loss": 6.0869, + "step": 6989 + }, + { + "epoch": 2.385665529010239, + "grad_norm": 2.779036521911621, + "learning_rate": 0.00020477815699658703, + "loss": 5.5761, + "step": 6990 + }, + { + "epoch": 2.3860068259385665, + "grad_norm": 2.7772841453552246, + "learning_rate": 0.00020466439135381116, + "loss": 6.7393, + "step": 6991 + }, + { + "epoch": 2.3863481228668944, + "grad_norm": 2.7716453075408936, + "learning_rate": 0.00020455062571103527, + "loss": 6.1316, + "step": 6992 + }, + { + "epoch": 2.3866894197952218, + "grad_norm": 3.0707526206970215, + "learning_rate": 0.0002044368600682594, + "loss": 5.781, + "step": 6993 + }, + { + "epoch": 2.3870307167235496, + "grad_norm": 2.663928270339966, + "learning_rate": 0.0002043230944254835, + "loss": 6.2102, + "step": 6994 + }, + { + "epoch": 2.387372013651877, + "grad_norm": 2.7111122608184814, + "learning_rate": 0.0002042093287827076, + "loss": 6.2127, + "step": 6995 + }, + { + "epoch": 2.387713310580205, + "grad_norm": 2.7795569896698, + "learning_rate": 0.00020409556313993174, + "loss": 5.7084, + "step": 6996 + }, + { + "epoch": 2.3880546075085323, + "grad_norm": 2.881326198577881, + "learning_rate": 0.00020398179749715588, + "loss": 5.929, + "step": 6997 + }, + { + "epoch": 2.38839590443686, + "grad_norm": 2.725304365158081, + "learning_rate": 0.00020386803185437998, + "loss": 6.2795, + "step": 6998 + }, + { + "epoch": 2.3887372013651875, + "grad_norm": 2.7434685230255127, + "learning_rate": 0.00020375426621160409, + "loss": 6.0435, + "step": 6999 + }, + { + "epoch": 2.3890784982935154, + "grad_norm": 2.676453113555908, + "learning_rate": 0.00020364050056882822, + "loss": 6.1874, + "step": 7000 + }, + { + "epoch": 2.3894197952218432, + "grad_norm": 2.7269630432128906, + "learning_rate": 0.00020352673492605235, + "loss": 5.7898, + "step": 7001 + }, + { + "epoch": 2.3897610921501706, + "grad_norm": 2.72902774810791, + "learning_rate": 0.00020341296928327645, + "loss": 5.5644, + "step": 7002 + }, + { + "epoch": 2.3901023890784985, + "grad_norm": 2.750995397567749, + "learning_rate": 0.0002032992036405006, + "loss": 6.2748, + "step": 7003 + }, + { + "epoch": 2.390443686006826, + "grad_norm": 2.7395596504211426, + "learning_rate": 0.0002031854379977247, + "loss": 6.0858, + "step": 7004 + }, + { + "epoch": 2.3907849829351537, + "grad_norm": 2.73018217086792, + "learning_rate": 0.0002030716723549488, + "loss": 5.9797, + "step": 7005 + }, + { + "epoch": 2.391126279863481, + "grad_norm": 2.739351511001587, + "learning_rate": 0.00020295790671217293, + "loss": 6.2691, + "step": 7006 + }, + { + "epoch": 2.391467576791809, + "grad_norm": 2.804325580596924, + "learning_rate": 0.00020284414106939706, + "loss": 6.5061, + "step": 7007 + }, + { + "epoch": 2.3918088737201364, + "grad_norm": 2.734001636505127, + "learning_rate": 0.00020273037542662117, + "loss": 6.3185, + "step": 7008 + }, + { + "epoch": 2.3921501706484642, + "grad_norm": 2.743891954421997, + "learning_rate": 0.00020261660978384527, + "loss": 6.3436, + "step": 7009 + }, + { + "epoch": 2.3924914675767917, + "grad_norm": 3.208683729171753, + "learning_rate": 0.0002025028441410694, + "loss": 4.3648, + "step": 7010 + }, + { + "epoch": 2.3928327645051195, + "grad_norm": 2.7099032402038574, + "learning_rate": 0.00020238907849829353, + "loss": 5.1361, + "step": 7011 + }, + { + "epoch": 2.393174061433447, + "grad_norm": 2.8060643672943115, + "learning_rate": 0.00020227531285551764, + "loss": 6.1124, + "step": 7012 + }, + { + "epoch": 2.3935153583617748, + "grad_norm": 2.7645676136016846, + "learning_rate": 0.00020216154721274177, + "loss": 5.3761, + "step": 7013 + }, + { + "epoch": 2.3938566552901026, + "grad_norm": 2.6478114128112793, + "learning_rate": 0.00020204778156996588, + "loss": 5.6484, + "step": 7014 + }, + { + "epoch": 2.39419795221843, + "grad_norm": 2.8686115741729736, + "learning_rate": 0.00020193401592718998, + "loss": 5.5769, + "step": 7015 + }, + { + "epoch": 2.394539249146758, + "grad_norm": 2.700092315673828, + "learning_rate": 0.00020182025028441411, + "loss": 6.5028, + "step": 7016 + }, + { + "epoch": 2.3948805460750853, + "grad_norm": 2.810314655303955, + "learning_rate": 0.00020170648464163825, + "loss": 6.574, + "step": 7017 + }, + { + "epoch": 2.395221843003413, + "grad_norm": 2.6746256351470947, + "learning_rate": 0.00020159271899886232, + "loss": 5.7884, + "step": 7018 + }, + { + "epoch": 2.3955631399317405, + "grad_norm": 2.735712766647339, + "learning_rate": 0.00020147895335608646, + "loss": 6.5509, + "step": 7019 + }, + { + "epoch": 2.3959044368600684, + "grad_norm": 2.7703819274902344, + "learning_rate": 0.0002013651877133106, + "loss": 6.1054, + "step": 7020 + }, + { + "epoch": 2.396245733788396, + "grad_norm": 2.840043306350708, + "learning_rate": 0.00020125142207053472, + "loss": 6.2688, + "step": 7021 + }, + { + "epoch": 2.3965870307167236, + "grad_norm": 2.789001226425171, + "learning_rate": 0.00020113765642775882, + "loss": 6.1622, + "step": 7022 + }, + { + "epoch": 2.396928327645051, + "grad_norm": 2.6123547554016113, + "learning_rate": 0.00020102389078498293, + "loss": 5.9342, + "step": 7023 + }, + { + "epoch": 2.397269624573379, + "grad_norm": 2.793732166290283, + "learning_rate": 0.00020091012514220706, + "loss": 5.487, + "step": 7024 + }, + { + "epoch": 2.3976109215017063, + "grad_norm": 2.722079277038574, + "learning_rate": 0.00020079635949943117, + "loss": 6.1717, + "step": 7025 + }, + { + "epoch": 2.397952218430034, + "grad_norm": 2.798908233642578, + "learning_rate": 0.0002006825938566553, + "loss": 6.3019, + "step": 7026 + }, + { + "epoch": 2.398293515358362, + "grad_norm": 2.8467278480529785, + "learning_rate": 0.00020056882821387943, + "loss": 5.7511, + "step": 7027 + }, + { + "epoch": 2.3986348122866894, + "grad_norm": 2.7143468856811523, + "learning_rate": 0.0002004550625711035, + "loss": 5.4081, + "step": 7028 + }, + { + "epoch": 2.3989761092150172, + "grad_norm": 2.916228771209717, + "learning_rate": 0.00020034129692832764, + "loss": 5.913, + "step": 7029 + }, + { + "epoch": 2.3993174061433447, + "grad_norm": 2.752556562423706, + "learning_rate": 0.00020022753128555177, + "loss": 6.2246, + "step": 7030 + }, + { + "epoch": 2.3996587030716725, + "grad_norm": 2.748680591583252, + "learning_rate": 0.0002001137656427759, + "loss": 6.3106, + "step": 7031 + }, + { + "epoch": 2.4, + "grad_norm": 2.7810096740722656, + "learning_rate": 0.0002, + "loss": 6.5981, + "step": 7032 + }, + { + "epoch": 2.4003412969283278, + "grad_norm": 2.9772250652313232, + "learning_rate": 0.00019988623435722411, + "loss": 4.0665, + "step": 7033 + }, + { + "epoch": 2.400682593856655, + "grad_norm": 2.8036811351776123, + "learning_rate": 0.00019977246871444825, + "loss": 6.0835, + "step": 7034 + }, + { + "epoch": 2.401023890784983, + "grad_norm": 2.714700698852539, + "learning_rate": 0.00019965870307167235, + "loss": 6.4391, + "step": 7035 + }, + { + "epoch": 2.4013651877133104, + "grad_norm": 2.8477370738983154, + "learning_rate": 0.00019954493742889648, + "loss": 5.4793, + "step": 7036 + }, + { + "epoch": 2.4017064846416383, + "grad_norm": 2.8073480129241943, + "learning_rate": 0.00019943117178612062, + "loss": 5.7709, + "step": 7037 + }, + { + "epoch": 2.4020477815699657, + "grad_norm": 2.8629024028778076, + "learning_rate": 0.0001993174061433447, + "loss": 5.7823, + "step": 7038 + }, + { + "epoch": 2.4023890784982935, + "grad_norm": 2.7898664474487305, + "learning_rate": 0.00019920364050056883, + "loss": 6.2151, + "step": 7039 + }, + { + "epoch": 2.4027303754266214, + "grad_norm": 3.052391290664673, + "learning_rate": 0.00019908987485779296, + "loss": 4.0887, + "step": 7040 + }, + { + "epoch": 2.403071672354949, + "grad_norm": 2.876765489578247, + "learning_rate": 0.00019897610921501706, + "loss": 5.6112, + "step": 7041 + }, + { + "epoch": 2.4034129692832766, + "grad_norm": 2.745173931121826, + "learning_rate": 0.0001988623435722412, + "loss": 6.1771, + "step": 7042 + }, + { + "epoch": 2.403754266211604, + "grad_norm": 2.908230781555176, + "learning_rate": 0.0001987485779294653, + "loss": 5.1346, + "step": 7043 + }, + { + "epoch": 2.404095563139932, + "grad_norm": 2.827298402786255, + "learning_rate": 0.00019863481228668943, + "loss": 6.1908, + "step": 7044 + }, + { + "epoch": 2.4044368600682593, + "grad_norm": 2.6762778759002686, + "learning_rate": 0.00019852104664391354, + "loss": 6.1944, + "step": 7045 + }, + { + "epoch": 2.404778156996587, + "grad_norm": 2.9926464557647705, + "learning_rate": 0.00019840728100113767, + "loss": 4.689, + "step": 7046 + }, + { + "epoch": 2.4051194539249146, + "grad_norm": 2.697849750518799, + "learning_rate": 0.0001982935153583618, + "loss": 6.0584, + "step": 7047 + }, + { + "epoch": 2.4054607508532424, + "grad_norm": 3.2848122119903564, + "learning_rate": 0.00019817974971558588, + "loss": 5.2301, + "step": 7048 + }, + { + "epoch": 2.40580204778157, + "grad_norm": 3.0285191535949707, + "learning_rate": 0.00019806598407281, + "loss": 4.5672, + "step": 7049 + }, + { + "epoch": 2.4061433447098977, + "grad_norm": 2.768855094909668, + "learning_rate": 0.00019795221843003414, + "loss": 6.4452, + "step": 7050 + }, + { + "epoch": 2.406484641638225, + "grad_norm": 2.6719319820404053, + "learning_rate": 0.00019783845278725825, + "loss": 6.2495, + "step": 7051 + }, + { + "epoch": 2.406825938566553, + "grad_norm": 2.812721014022827, + "learning_rate": 0.00019772468714448235, + "loss": 5.9707, + "step": 7052 + }, + { + "epoch": 2.4071672354948808, + "grad_norm": 3.698657751083374, + "learning_rate": 0.00019761092150170648, + "loss": 5.3361, + "step": 7053 + }, + { + "epoch": 2.407508532423208, + "grad_norm": 2.74291729927063, + "learning_rate": 0.00019749715585893062, + "loss": 6.0255, + "step": 7054 + }, + { + "epoch": 2.407849829351536, + "grad_norm": 2.770951986312866, + "learning_rate": 0.00019738339021615472, + "loss": 6.6123, + "step": 7055 + }, + { + "epoch": 2.4081911262798634, + "grad_norm": 2.8375635147094727, + "learning_rate": 0.00019726962457337885, + "loss": 6.4971, + "step": 7056 + }, + { + "epoch": 2.4085324232081913, + "grad_norm": 2.986063003540039, + "learning_rate": 0.00019715585893060296, + "loss": 5.8929, + "step": 7057 + }, + { + "epoch": 2.4088737201365187, + "grad_norm": 4.404201030731201, + "learning_rate": 0.00019704209328782706, + "loss": 4.4985, + "step": 7058 + }, + { + "epoch": 2.4092150170648465, + "grad_norm": 2.6988513469696045, + "learning_rate": 0.0001969283276450512, + "loss": 5.791, + "step": 7059 + }, + { + "epoch": 2.409556313993174, + "grad_norm": 2.7154700756073, + "learning_rate": 0.00019681456200227533, + "loss": 6.4737, + "step": 7060 + }, + { + "epoch": 2.409897610921502, + "grad_norm": 2.815117597579956, + "learning_rate": 0.00019670079635949943, + "loss": 6.0718, + "step": 7061 + }, + { + "epoch": 2.410238907849829, + "grad_norm": 2.831260919570923, + "learning_rate": 0.00019658703071672354, + "loss": 5.3611, + "step": 7062 + }, + { + "epoch": 2.410580204778157, + "grad_norm": 1.9146329164505005, + "learning_rate": 0.00019647326507394767, + "loss": 2.7539, + "step": 7063 + }, + { + "epoch": 2.4109215017064844, + "grad_norm": 2.497642755508423, + "learning_rate": 0.0001963594994311718, + "loss": 5.5512, + "step": 7064 + }, + { + "epoch": 2.4112627986348123, + "grad_norm": 2.6266098022460938, + "learning_rate": 0.0001962457337883959, + "loss": 5.7939, + "step": 7065 + }, + { + "epoch": 2.41160409556314, + "grad_norm": 2.7678215503692627, + "learning_rate": 0.00019613196814562004, + "loss": 6.1091, + "step": 7066 + }, + { + "epoch": 2.4119453924914676, + "grad_norm": 3.060725212097168, + "learning_rate": 0.00019601820250284414, + "loss": 6.1383, + "step": 7067 + }, + { + "epoch": 2.4122866894197954, + "grad_norm": 2.825890302658081, + "learning_rate": 0.00019590443686006825, + "loss": 6.1728, + "step": 7068 + }, + { + "epoch": 2.412627986348123, + "grad_norm": 2.731437921524048, + "learning_rate": 0.00019579067121729238, + "loss": 6.2054, + "step": 7069 + }, + { + "epoch": 2.4129692832764507, + "grad_norm": 2.756383180618286, + "learning_rate": 0.0001956769055745165, + "loss": 5.4519, + "step": 7070 + }, + { + "epoch": 2.413310580204778, + "grad_norm": 2.7772936820983887, + "learning_rate": 0.00019556313993174062, + "loss": 5.739, + "step": 7071 + }, + { + "epoch": 2.413651877133106, + "grad_norm": 2.736973285675049, + "learning_rate": 0.00019544937428896472, + "loss": 5.8474, + "step": 7072 + }, + { + "epoch": 2.4139931740614333, + "grad_norm": 2.7424445152282715, + "learning_rate": 0.00019533560864618885, + "loss": 6.2036, + "step": 7073 + }, + { + "epoch": 2.414334470989761, + "grad_norm": 2.801053285598755, + "learning_rate": 0.00019522184300341299, + "loss": 6.0991, + "step": 7074 + }, + { + "epoch": 2.4146757679180886, + "grad_norm": 3.4473876953125, + "learning_rate": 0.0001951080773606371, + "loss": 5.6659, + "step": 7075 + }, + { + "epoch": 2.4150170648464164, + "grad_norm": 2.782585620880127, + "learning_rate": 0.00019499431171786122, + "loss": 5.653, + "step": 7076 + }, + { + "epoch": 2.415358361774744, + "grad_norm": 2.6526317596435547, + "learning_rate": 0.00019488054607508533, + "loss": 5.88, + "step": 7077 + }, + { + "epoch": 2.4156996587030717, + "grad_norm": 2.770357608795166, + "learning_rate": 0.00019476678043230943, + "loss": 5.7505, + "step": 7078 + }, + { + "epoch": 2.4160409556313995, + "grad_norm": 4.893315315246582, + "learning_rate": 0.00019465301478953357, + "loss": 4.9911, + "step": 7079 + }, + { + "epoch": 2.416382252559727, + "grad_norm": 2.6807408332824707, + "learning_rate": 0.0001945392491467577, + "loss": 6.1131, + "step": 7080 + }, + { + "epoch": 2.416723549488055, + "grad_norm": 2.761153221130371, + "learning_rate": 0.0001944254835039818, + "loss": 5.9863, + "step": 7081 + }, + { + "epoch": 2.417064846416382, + "grad_norm": 2.669250965118408, + "learning_rate": 0.0001943117178612059, + "loss": 5.7527, + "step": 7082 + }, + { + "epoch": 2.41740614334471, + "grad_norm": 2.7944650650024414, + "learning_rate": 0.00019419795221843004, + "loss": 6.0305, + "step": 7083 + }, + { + "epoch": 2.4177474402730375, + "grad_norm": 2.851085662841797, + "learning_rate": 0.00019408418657565417, + "loss": 5.8564, + "step": 7084 + }, + { + "epoch": 2.4180887372013653, + "grad_norm": 2.7912724018096924, + "learning_rate": 0.00019397042093287828, + "loss": 6.6317, + "step": 7085 + }, + { + "epoch": 2.4184300341296927, + "grad_norm": 2.802370309829712, + "learning_rate": 0.00019385665529010238, + "loss": 6.2561, + "step": 7086 + }, + { + "epoch": 2.4187713310580206, + "grad_norm": 2.675826072692871, + "learning_rate": 0.0001937428896473265, + "loss": 6.4923, + "step": 7087 + }, + { + "epoch": 2.419112627986348, + "grad_norm": 2.732436418533325, + "learning_rate": 0.00019362912400455062, + "loss": 5.8842, + "step": 7088 + }, + { + "epoch": 2.419453924914676, + "grad_norm": 2.6516783237457275, + "learning_rate": 0.00019351535836177475, + "loss": 6.1319, + "step": 7089 + }, + { + "epoch": 2.419795221843003, + "grad_norm": 2.7802767753601074, + "learning_rate": 0.00019340159271899888, + "loss": 6.2859, + "step": 7090 + }, + { + "epoch": 2.420136518771331, + "grad_norm": 2.645561695098877, + "learning_rate": 0.00019328782707622296, + "loss": 6.2186, + "step": 7091 + }, + { + "epoch": 2.420477815699659, + "grad_norm": 2.762645721435547, + "learning_rate": 0.0001931740614334471, + "loss": 5.679, + "step": 7092 + }, + { + "epoch": 2.4208191126279863, + "grad_norm": 2.7711853981018066, + "learning_rate": 0.00019306029579067122, + "loss": 6.269, + "step": 7093 + }, + { + "epoch": 2.421160409556314, + "grad_norm": 2.817148208618164, + "learning_rate": 0.00019294653014789536, + "loss": 5.9867, + "step": 7094 + }, + { + "epoch": 2.4215017064846416, + "grad_norm": 3.189115285873413, + "learning_rate": 0.00019283276450511946, + "loss": 5.4579, + "step": 7095 + }, + { + "epoch": 2.4218430034129694, + "grad_norm": 2.7254624366760254, + "learning_rate": 0.00019271899886234357, + "loss": 6.1964, + "step": 7096 + }, + { + "epoch": 2.422184300341297, + "grad_norm": 2.8384273052215576, + "learning_rate": 0.0001926052332195677, + "loss": 5.0824, + "step": 7097 + }, + { + "epoch": 2.4225255972696247, + "grad_norm": 2.7569327354431152, + "learning_rate": 0.0001924914675767918, + "loss": 5.8933, + "step": 7098 + }, + { + "epoch": 2.422866894197952, + "grad_norm": 2.6978225708007812, + "learning_rate": 0.00019237770193401594, + "loss": 4.7832, + "step": 7099 + }, + { + "epoch": 2.42320819112628, + "grad_norm": 2.7177062034606934, + "learning_rate": 0.00019226393629124007, + "loss": 5.4131, + "step": 7100 + }, + { + "epoch": 2.4235494880546073, + "grad_norm": 2.7541306018829346, + "learning_rate": 0.00019215017064846415, + "loss": 6.3098, + "step": 7101 + }, + { + "epoch": 2.423890784982935, + "grad_norm": 2.7490100860595703, + "learning_rate": 0.00019203640500568828, + "loss": 6.5531, + "step": 7102 + }, + { + "epoch": 2.4242320819112626, + "grad_norm": 2.74153208732605, + "learning_rate": 0.0001919226393629124, + "loss": 6.1171, + "step": 7103 + }, + { + "epoch": 2.4245733788395905, + "grad_norm": 2.7653117179870605, + "learning_rate": 0.00019180887372013654, + "loss": 5.8081, + "step": 7104 + }, + { + "epoch": 2.4249146757679183, + "grad_norm": 2.7591192722320557, + "learning_rate": 0.00019169510807736065, + "loss": 5.5008, + "step": 7105 + }, + { + "epoch": 2.4252559726962457, + "grad_norm": 2.5488815307617188, + "learning_rate": 0.00019158134243458475, + "loss": 6.4281, + "step": 7106 + }, + { + "epoch": 2.4255972696245736, + "grad_norm": 2.22991943359375, + "learning_rate": 0.00019146757679180888, + "loss": 2.1561, + "step": 7107 + }, + { + "epoch": 2.425938566552901, + "grad_norm": 3.231452465057373, + "learning_rate": 0.000191353811149033, + "loss": 5.4485, + "step": 7108 + }, + { + "epoch": 2.426279863481229, + "grad_norm": 3.1210076808929443, + "learning_rate": 0.00019124004550625712, + "loss": 4.5784, + "step": 7109 + }, + { + "epoch": 2.426621160409556, + "grad_norm": 2.622694253921509, + "learning_rate": 0.00019112627986348125, + "loss": 5.6532, + "step": 7110 + }, + { + "epoch": 2.426962457337884, + "grad_norm": 2.8294200897216797, + "learning_rate": 0.00019101251422070533, + "loss": 6.0965, + "step": 7111 + }, + { + "epoch": 2.4273037542662115, + "grad_norm": 2.737205982208252, + "learning_rate": 0.00019089874857792946, + "loss": 6.3284, + "step": 7112 + }, + { + "epoch": 2.4276450511945393, + "grad_norm": 2.6656839847564697, + "learning_rate": 0.0001907849829351536, + "loss": 6.3298, + "step": 7113 + }, + { + "epoch": 2.4279863481228667, + "grad_norm": 2.7031219005584717, + "learning_rate": 0.00019067121729237773, + "loss": 6.1422, + "step": 7114 + }, + { + "epoch": 2.4283276450511946, + "grad_norm": 2.792492389678955, + "learning_rate": 0.00019055745164960183, + "loss": 6.1246, + "step": 7115 + }, + { + "epoch": 2.428668941979522, + "grad_norm": 2.8730626106262207, + "learning_rate": 0.00019044368600682594, + "loss": 5.2022, + "step": 7116 + }, + { + "epoch": 2.42901023890785, + "grad_norm": 2.832547903060913, + "learning_rate": 0.00019032992036405007, + "loss": 5.8241, + "step": 7117 + }, + { + "epoch": 2.4293515358361777, + "grad_norm": 2.7533719539642334, + "learning_rate": 0.00019021615472127417, + "loss": 6.1243, + "step": 7118 + }, + { + "epoch": 2.429692832764505, + "grad_norm": 2.7654356956481934, + "learning_rate": 0.0001901023890784983, + "loss": 6.5303, + "step": 7119 + }, + { + "epoch": 2.430034129692833, + "grad_norm": 2.7612063884735107, + "learning_rate": 0.0001899886234357224, + "loss": 6.8023, + "step": 7120 + }, + { + "epoch": 2.4303754266211604, + "grad_norm": 2.8074557781219482, + "learning_rate": 0.00018987485779294652, + "loss": 5.4001, + "step": 7121 + }, + { + "epoch": 2.430716723549488, + "grad_norm": 2.6893937587738037, + "learning_rate": 0.00018976109215017065, + "loss": 5.6546, + "step": 7122 + }, + { + "epoch": 2.4310580204778156, + "grad_norm": 2.6178319454193115, + "learning_rate": 0.00018964732650739478, + "loss": 6.3508, + "step": 7123 + }, + { + "epoch": 2.4313993174061435, + "grad_norm": 2.972472906112671, + "learning_rate": 0.0001895335608646189, + "loss": 5.156, + "step": 7124 + }, + { + "epoch": 2.431740614334471, + "grad_norm": 2.7686290740966797, + "learning_rate": 0.000189419795221843, + "loss": 6.5945, + "step": 7125 + }, + { + "epoch": 2.4320819112627987, + "grad_norm": 2.8814847469329834, + "learning_rate": 0.00018930602957906712, + "loss": 5.6923, + "step": 7126 + }, + { + "epoch": 2.432423208191126, + "grad_norm": 1.9587899446487427, + "learning_rate": 0.00018919226393629125, + "loss": 3.0517, + "step": 7127 + }, + { + "epoch": 2.432764505119454, + "grad_norm": 2.7158172130584717, + "learning_rate": 0.00018907849829351536, + "loss": 5.734, + "step": 7128 + }, + { + "epoch": 2.4331058020477814, + "grad_norm": 2.719918727874756, + "learning_rate": 0.0001889647326507395, + "loss": 6.0725, + "step": 7129 + }, + { + "epoch": 2.4334470989761092, + "grad_norm": 3.0201733112335205, + "learning_rate": 0.0001888509670079636, + "loss": 5.2013, + "step": 7130 + }, + { + "epoch": 2.433788395904437, + "grad_norm": 2.701390266418457, + "learning_rate": 0.0001887372013651877, + "loss": 5.9561, + "step": 7131 + }, + { + "epoch": 2.4341296928327645, + "grad_norm": 2.7162129878997803, + "learning_rate": 0.00018862343572241183, + "loss": 5.5845, + "step": 7132 + }, + { + "epoch": 2.4344709897610923, + "grad_norm": 2.755629777908325, + "learning_rate": 0.00018850967007963596, + "loss": 5.9108, + "step": 7133 + }, + { + "epoch": 2.4348122866894197, + "grad_norm": 2.744990110397339, + "learning_rate": 0.0001883959044368601, + "loss": 6.0868, + "step": 7134 + }, + { + "epoch": 2.4351535836177476, + "grad_norm": 2.6877083778381348, + "learning_rate": 0.00018828213879408417, + "loss": 6.2003, + "step": 7135 + }, + { + "epoch": 2.435494880546075, + "grad_norm": 2.8432061672210693, + "learning_rate": 0.0001881683731513083, + "loss": 4.8051, + "step": 7136 + }, + { + "epoch": 2.435836177474403, + "grad_norm": 2.6555399894714355, + "learning_rate": 0.00018805460750853244, + "loss": 6.0686, + "step": 7137 + }, + { + "epoch": 2.4361774744027302, + "grad_norm": 2.7505736351013184, + "learning_rate": 0.00018794084186575654, + "loss": 6.1821, + "step": 7138 + }, + { + "epoch": 2.436518771331058, + "grad_norm": 2.7092573642730713, + "learning_rate": 0.00018782707622298068, + "loss": 6.1421, + "step": 7139 + }, + { + "epoch": 2.4368600682593855, + "grad_norm": 2.7411575317382812, + "learning_rate": 0.00018771331058020478, + "loss": 5.4851, + "step": 7140 + }, + { + "epoch": 2.4372013651877134, + "grad_norm": 2.853494882583618, + "learning_rate": 0.00018759954493742888, + "loss": 5.2069, + "step": 7141 + }, + { + "epoch": 2.4375426621160408, + "grad_norm": 2.8510758876800537, + "learning_rate": 0.00018748577929465302, + "loss": 5.1078, + "step": 7142 + }, + { + "epoch": 2.4378839590443686, + "grad_norm": 2.6476118564605713, + "learning_rate": 0.00018737201365187715, + "loss": 5.4156, + "step": 7143 + }, + { + "epoch": 2.4382252559726965, + "grad_norm": 2.692403793334961, + "learning_rate": 0.00018725824800910125, + "loss": 5.7651, + "step": 7144 + }, + { + "epoch": 2.438566552901024, + "grad_norm": 3.758758783340454, + "learning_rate": 0.00018714448236632536, + "loss": 5.061, + "step": 7145 + }, + { + "epoch": 2.4389078498293517, + "grad_norm": 2.8809521198272705, + "learning_rate": 0.0001870307167235495, + "loss": 5.8695, + "step": 7146 + }, + { + "epoch": 2.439249146757679, + "grad_norm": 3.381040573120117, + "learning_rate": 0.00018691695108077362, + "loss": 3.9914, + "step": 7147 + }, + { + "epoch": 2.439590443686007, + "grad_norm": 2.7137439250946045, + "learning_rate": 0.00018680318543799773, + "loss": 6.4293, + "step": 7148 + }, + { + "epoch": 2.4399317406143344, + "grad_norm": 2.7312674522399902, + "learning_rate": 0.00018668941979522186, + "loss": 5.4181, + "step": 7149 + }, + { + "epoch": 2.4402730375426622, + "grad_norm": 2.8323845863342285, + "learning_rate": 0.00018657565415244596, + "loss": 6.078, + "step": 7150 + }, + { + "epoch": 2.4406143344709896, + "grad_norm": 2.6376965045928955, + "learning_rate": 0.00018646188850967007, + "loss": 5.5007, + "step": 7151 + }, + { + "epoch": 2.4409556313993175, + "grad_norm": 2.6789960861206055, + "learning_rate": 0.0001863481228668942, + "loss": 5.7487, + "step": 7152 + }, + { + "epoch": 2.441296928327645, + "grad_norm": 2.624680995941162, + "learning_rate": 0.00018623435722411833, + "loss": 6.5413, + "step": 7153 + }, + { + "epoch": 2.4416382252559727, + "grad_norm": 2.9074723720550537, + "learning_rate": 0.00018612059158134244, + "loss": 5.3683, + "step": 7154 + }, + { + "epoch": 2.4419795221843, + "grad_norm": 2.71537446975708, + "learning_rate": 0.00018600682593856654, + "loss": 5.7032, + "step": 7155 + }, + { + "epoch": 2.442320819112628, + "grad_norm": 3.114678144454956, + "learning_rate": 0.00018589306029579068, + "loss": 5.9269, + "step": 7156 + }, + { + "epoch": 2.442662116040956, + "grad_norm": 2.7557051181793213, + "learning_rate": 0.0001857792946530148, + "loss": 6.3845, + "step": 7157 + }, + { + "epoch": 2.4430034129692833, + "grad_norm": 2.7388687133789062, + "learning_rate": 0.0001856655290102389, + "loss": 6.0723, + "step": 7158 + }, + { + "epoch": 2.443344709897611, + "grad_norm": 2.7414541244506836, + "learning_rate": 0.00018555176336746302, + "loss": 6.4165, + "step": 7159 + }, + { + "epoch": 2.4436860068259385, + "grad_norm": 2.7356631755828857, + "learning_rate": 0.00018543799772468715, + "loss": 6.1855, + "step": 7160 + }, + { + "epoch": 2.4440273037542664, + "grad_norm": 2.845184564590454, + "learning_rate": 0.00018532423208191125, + "loss": 5.8645, + "step": 7161 + }, + { + "epoch": 2.4443686006825938, + "grad_norm": 2.6517839431762695, + "learning_rate": 0.0001852104664391354, + "loss": 6.2252, + "step": 7162 + }, + { + "epoch": 2.4447098976109216, + "grad_norm": 2.7228269577026367, + "learning_rate": 0.00018509670079635952, + "loss": 6.3114, + "step": 7163 + }, + { + "epoch": 2.445051194539249, + "grad_norm": 2.680921792984009, + "learning_rate": 0.0001849829351535836, + "loss": 6.3173, + "step": 7164 + }, + { + "epoch": 2.445392491467577, + "grad_norm": 2.817765235900879, + "learning_rate": 0.00018486916951080773, + "loss": 6.0299, + "step": 7165 + }, + { + "epoch": 2.4457337883959043, + "grad_norm": 4.926273822784424, + "learning_rate": 0.00018475540386803186, + "loss": 5.256, + "step": 7166 + }, + { + "epoch": 2.446075085324232, + "grad_norm": 2.7189366817474365, + "learning_rate": 0.000184641638225256, + "loss": 5.6313, + "step": 7167 + }, + { + "epoch": 2.4464163822525595, + "grad_norm": 6.03790807723999, + "learning_rate": 0.0001845278725824801, + "loss": 4.8653, + "step": 7168 + }, + { + "epoch": 2.4467576791808874, + "grad_norm": 2.8209898471832275, + "learning_rate": 0.0001844141069397042, + "loss": 6.1202, + "step": 7169 + }, + { + "epoch": 2.4470989761092152, + "grad_norm": 2.895895004272461, + "learning_rate": 0.00018430034129692833, + "loss": 5.8068, + "step": 7170 + }, + { + "epoch": 2.4474402730375426, + "grad_norm": 2.669280529022217, + "learning_rate": 0.00018418657565415244, + "loss": 5.6466, + "step": 7171 + }, + { + "epoch": 2.4477815699658705, + "grad_norm": 2.70436429977417, + "learning_rate": 0.00018407281001137657, + "loss": 5.8402, + "step": 7172 + }, + { + "epoch": 2.448122866894198, + "grad_norm": 2.793987989425659, + "learning_rate": 0.0001839590443686007, + "loss": 6.507, + "step": 7173 + }, + { + "epoch": 2.4484641638225257, + "grad_norm": 2.6550915241241455, + "learning_rate": 0.00018384527872582478, + "loss": 5.8959, + "step": 7174 + }, + { + "epoch": 2.448805460750853, + "grad_norm": 2.6808414459228516, + "learning_rate": 0.00018373151308304891, + "loss": 6.6288, + "step": 7175 + }, + { + "epoch": 2.449146757679181, + "grad_norm": 2.6597981452941895, + "learning_rate": 0.00018361774744027305, + "loss": 5.4804, + "step": 7176 + }, + { + "epoch": 2.4494880546075084, + "grad_norm": 2.7056760787963867, + "learning_rate": 0.00018350398179749718, + "loss": 6.5242, + "step": 7177 + }, + { + "epoch": 2.4498293515358363, + "grad_norm": 2.821537733078003, + "learning_rate": 0.00018339021615472128, + "loss": 5.9898, + "step": 7178 + }, + { + "epoch": 2.4501706484641637, + "grad_norm": 4.618656635284424, + "learning_rate": 0.0001832764505119454, + "loss": 5.2683, + "step": 7179 + }, + { + "epoch": 2.4505119453924915, + "grad_norm": 2.7883718013763428, + "learning_rate": 0.00018316268486916952, + "loss": 6.4302, + "step": 7180 + }, + { + "epoch": 2.450853242320819, + "grad_norm": 2.782613515853882, + "learning_rate": 0.00018304891922639362, + "loss": 6.0204, + "step": 7181 + }, + { + "epoch": 2.4511945392491468, + "grad_norm": 3.4366047382354736, + "learning_rate": 0.00018293515358361776, + "loss": 4.9788, + "step": 7182 + }, + { + "epoch": 2.4515358361774746, + "grad_norm": 2.758683681488037, + "learning_rate": 0.0001828213879408419, + "loss": 6.1169, + "step": 7183 + }, + { + "epoch": 2.451877133105802, + "grad_norm": 2.7406606674194336, + "learning_rate": 0.00018270762229806597, + "loss": 6.1129, + "step": 7184 + }, + { + "epoch": 2.45221843003413, + "grad_norm": 2.7950448989868164, + "learning_rate": 0.0001825938566552901, + "loss": 6.2123, + "step": 7185 + }, + { + "epoch": 2.4525597269624573, + "grad_norm": 2.6972243785858154, + "learning_rate": 0.00018248009101251423, + "loss": 5.9016, + "step": 7186 + }, + { + "epoch": 2.452901023890785, + "grad_norm": 2.7879507541656494, + "learning_rate": 0.00018236632536973836, + "loss": 6.3663, + "step": 7187 + }, + { + "epoch": 2.4532423208191125, + "grad_norm": 2.7534303665161133, + "learning_rate": 0.00018225255972696247, + "loss": 6.1968, + "step": 7188 + }, + { + "epoch": 2.4535836177474404, + "grad_norm": 2.755431652069092, + "learning_rate": 0.00018213879408418657, + "loss": 5.782, + "step": 7189 + }, + { + "epoch": 2.453924914675768, + "grad_norm": 2.6141881942749023, + "learning_rate": 0.0001820250284414107, + "loss": 5.532, + "step": 7190 + }, + { + "epoch": 2.4542662116040956, + "grad_norm": 3.2307145595550537, + "learning_rate": 0.0001819112627986348, + "loss": 5.0953, + "step": 7191 + }, + { + "epoch": 2.454607508532423, + "grad_norm": 2.6708171367645264, + "learning_rate": 0.00018179749715585894, + "loss": 6.0323, + "step": 7192 + }, + { + "epoch": 2.454948805460751, + "grad_norm": 2.6066534519195557, + "learning_rate": 0.00018168373151308305, + "loss": 6.0203, + "step": 7193 + }, + { + "epoch": 2.4552901023890783, + "grad_norm": 2.7354557514190674, + "learning_rate": 0.00018156996587030715, + "loss": 6.1075, + "step": 7194 + }, + { + "epoch": 2.455631399317406, + "grad_norm": 2.2138519287109375, + "learning_rate": 0.00018145620022753128, + "loss": 4.2661, + "step": 7195 + }, + { + "epoch": 2.455972696245734, + "grad_norm": 2.87853741645813, + "learning_rate": 0.00018134243458475542, + "loss": 6.0552, + "step": 7196 + }, + { + "epoch": 2.4563139931740614, + "grad_norm": 2.7033181190490723, + "learning_rate": 0.00018122866894197955, + "loss": 6.5583, + "step": 7197 + }, + { + "epoch": 2.4566552901023893, + "grad_norm": 2.6118319034576416, + "learning_rate": 0.00018111490329920363, + "loss": 6.1334, + "step": 7198 + }, + { + "epoch": 2.4569965870307167, + "grad_norm": 2.641845941543579, + "learning_rate": 0.00018100113765642776, + "loss": 6.1401, + "step": 7199 + }, + { + "epoch": 2.4573378839590445, + "grad_norm": 2.8019094467163086, + "learning_rate": 0.0001808873720136519, + "loss": 5.518, + "step": 7200 + }, + { + "epoch": 2.457679180887372, + "grad_norm": 4.355568885803223, + "learning_rate": 0.000180773606370876, + "loss": 4.7298, + "step": 7201 + }, + { + "epoch": 2.4580204778156998, + "grad_norm": 3.0501973628997803, + "learning_rate": 0.00018065984072810013, + "loss": 5.1193, + "step": 7202 + }, + { + "epoch": 2.458361774744027, + "grad_norm": 2.716735363006592, + "learning_rate": 0.00018054607508532423, + "loss": 5.868, + "step": 7203 + }, + { + "epoch": 2.458703071672355, + "grad_norm": 2.653223991394043, + "learning_rate": 0.00018043230944254834, + "loss": 5.8835, + "step": 7204 + }, + { + "epoch": 2.4590443686006824, + "grad_norm": 2.7722156047821045, + "learning_rate": 0.00018031854379977247, + "loss": 6.1643, + "step": 7205 + }, + { + "epoch": 2.4593856655290103, + "grad_norm": 4.1703619956970215, + "learning_rate": 0.0001802047781569966, + "loss": 5.0742, + "step": 7206 + }, + { + "epoch": 2.4597269624573377, + "grad_norm": 2.799798011779785, + "learning_rate": 0.00018009101251422073, + "loss": 5.3693, + "step": 7207 + }, + { + "epoch": 2.4600682593856655, + "grad_norm": 2.693559169769287, + "learning_rate": 0.0001799772468714448, + "loss": 6.1607, + "step": 7208 + }, + { + "epoch": 2.4604095563139934, + "grad_norm": 2.786590576171875, + "learning_rate": 0.00017986348122866894, + "loss": 6.3078, + "step": 7209 + }, + { + "epoch": 2.460750853242321, + "grad_norm": 6.34663200378418, + "learning_rate": 0.00017974971558589307, + "loss": 3.7846, + "step": 7210 + }, + { + "epoch": 2.4610921501706486, + "grad_norm": 2.779407024383545, + "learning_rate": 0.00017963594994311718, + "loss": 6.0981, + "step": 7211 + }, + { + "epoch": 2.461433447098976, + "grad_norm": 2.8530948162078857, + "learning_rate": 0.0001795221843003413, + "loss": 6.5747, + "step": 7212 + }, + { + "epoch": 2.461774744027304, + "grad_norm": 2.747483491897583, + "learning_rate": 0.00017940841865756542, + "loss": 6.037, + "step": 7213 + }, + { + "epoch": 2.4621160409556313, + "grad_norm": 2.689389228820801, + "learning_rate": 0.00017929465301478952, + "loss": 5.8297, + "step": 7214 + }, + { + "epoch": 2.462457337883959, + "grad_norm": 2.734438419342041, + "learning_rate": 0.00017918088737201365, + "loss": 6.2479, + "step": 7215 + }, + { + "epoch": 2.4627986348122866, + "grad_norm": 5.3711466789245605, + "learning_rate": 0.00017906712172923779, + "loss": 4.8514, + "step": 7216 + }, + { + "epoch": 2.4631399317406144, + "grad_norm": 2.606839418411255, + "learning_rate": 0.00017895335608646192, + "loss": 6.2493, + "step": 7217 + }, + { + "epoch": 2.463481228668942, + "grad_norm": 2.6620965003967285, + "learning_rate": 0.000178839590443686, + "loss": 6.1907, + "step": 7218 + }, + { + "epoch": 2.4638225255972697, + "grad_norm": 2.8028299808502197, + "learning_rate": 0.00017872582480091013, + "loss": 6.3521, + "step": 7219 + }, + { + "epoch": 2.464163822525597, + "grad_norm": 2.6073973178863525, + "learning_rate": 0.00017861205915813426, + "loss": 5.6279, + "step": 7220 + }, + { + "epoch": 2.464505119453925, + "grad_norm": 3.001814603805542, + "learning_rate": 0.00017849829351535836, + "loss": 4.9883, + "step": 7221 + }, + { + "epoch": 2.4648464163822528, + "grad_norm": 2.9250714778900146, + "learning_rate": 0.0001783845278725825, + "loss": 5.5733, + "step": 7222 + }, + { + "epoch": 2.46518771331058, + "grad_norm": 2.743499994277954, + "learning_rate": 0.0001782707622298066, + "loss": 6.2804, + "step": 7223 + }, + { + "epoch": 2.465529010238908, + "grad_norm": 2.7742860317230225, + "learning_rate": 0.0001781569965870307, + "loss": 6.0203, + "step": 7224 + }, + { + "epoch": 2.4658703071672354, + "grad_norm": 2.818325996398926, + "learning_rate": 0.00017804323094425484, + "loss": 5.6781, + "step": 7225 + }, + { + "epoch": 2.4662116040955633, + "grad_norm": 2.795591115951538, + "learning_rate": 0.00017792946530147897, + "loss": 5.602, + "step": 7226 + }, + { + "epoch": 2.4665529010238907, + "grad_norm": 2.807055950164795, + "learning_rate": 0.00017781569965870305, + "loss": 5.1593, + "step": 7227 + }, + { + "epoch": 2.4668941979522185, + "grad_norm": 2.7123217582702637, + "learning_rate": 0.00017770193401592718, + "loss": 6.4722, + "step": 7228 + }, + { + "epoch": 2.467235494880546, + "grad_norm": 2.759861469268799, + "learning_rate": 0.0001775881683731513, + "loss": 5.4875, + "step": 7229 + }, + { + "epoch": 2.467576791808874, + "grad_norm": 2.78224515914917, + "learning_rate": 0.00017747440273037544, + "loss": 5.79, + "step": 7230 + }, + { + "epoch": 2.467918088737201, + "grad_norm": 2.7341177463531494, + "learning_rate": 0.00017736063708759955, + "loss": 5.9504, + "step": 7231 + }, + { + "epoch": 2.468259385665529, + "grad_norm": 2.6890923976898193, + "learning_rate": 0.00017724687144482365, + "loss": 6.5827, + "step": 7232 + }, + { + "epoch": 2.4686006825938565, + "grad_norm": 2.8176991939544678, + "learning_rate": 0.0001771331058020478, + "loss": 5.8202, + "step": 7233 + }, + { + "epoch": 2.4689419795221843, + "grad_norm": 2.746105670928955, + "learning_rate": 0.0001770193401592719, + "loss": 6.5549, + "step": 7234 + }, + { + "epoch": 2.469283276450512, + "grad_norm": 2.7601068019866943, + "learning_rate": 0.00017690557451649602, + "loss": 5.8594, + "step": 7235 + }, + { + "epoch": 2.4696245733788396, + "grad_norm": 2.550632953643799, + "learning_rate": 0.00017679180887372016, + "loss": 4.9317, + "step": 7236 + }, + { + "epoch": 2.4699658703071674, + "grad_norm": 2.6904959678649902, + "learning_rate": 0.00017667804323094423, + "loss": 6.5549, + "step": 7237 + }, + { + "epoch": 2.470307167235495, + "grad_norm": 2.70029616355896, + "learning_rate": 0.00017656427758816837, + "loss": 5.7531, + "step": 7238 + }, + { + "epoch": 2.4706484641638227, + "grad_norm": 2.799412250518799, + "learning_rate": 0.0001764505119453925, + "loss": 5.6873, + "step": 7239 + }, + { + "epoch": 2.47098976109215, + "grad_norm": 2.8735549449920654, + "learning_rate": 0.00017633674630261663, + "loss": 4.2249, + "step": 7240 + }, + { + "epoch": 2.471331058020478, + "grad_norm": 2.010611057281494, + "learning_rate": 0.00017622298065984073, + "loss": 2.7766, + "step": 7241 + }, + { + "epoch": 2.4716723549488053, + "grad_norm": 2.699479579925537, + "learning_rate": 0.00017610921501706484, + "loss": 5.8529, + "step": 7242 + }, + { + "epoch": 2.472013651877133, + "grad_norm": 2.7273783683776855, + "learning_rate": 0.00017599544937428897, + "loss": 6.4994, + "step": 7243 + }, + { + "epoch": 2.4723549488054606, + "grad_norm": 2.768795967102051, + "learning_rate": 0.00017588168373151308, + "loss": 6.4167, + "step": 7244 + }, + { + "epoch": 2.4726962457337884, + "grad_norm": 2.7882094383239746, + "learning_rate": 0.0001757679180887372, + "loss": 5.8934, + "step": 7245 + }, + { + "epoch": 2.473037542662116, + "grad_norm": 2.8088643550872803, + "learning_rate": 0.00017565415244596134, + "loss": 6.0832, + "step": 7246 + }, + { + "epoch": 2.4733788395904437, + "grad_norm": 2.7583882808685303, + "learning_rate": 0.00017554038680318542, + "loss": 6.4278, + "step": 7247 + }, + { + "epoch": 2.4737201365187715, + "grad_norm": 2.710397720336914, + "learning_rate": 0.00017542662116040955, + "loss": 5.9064, + "step": 7248 + }, + { + "epoch": 2.474061433447099, + "grad_norm": 3.8351919651031494, + "learning_rate": 0.00017531285551763368, + "loss": 5.1969, + "step": 7249 + }, + { + "epoch": 2.474402730375427, + "grad_norm": 2.7258529663085938, + "learning_rate": 0.00017519908987485781, + "loss": 5.6392, + "step": 7250 + }, + { + "epoch": 2.474744027303754, + "grad_norm": 2.758955717086792, + "learning_rate": 0.00017508532423208192, + "loss": 6.3959, + "step": 7251 + }, + { + "epoch": 2.475085324232082, + "grad_norm": 2.727679967880249, + "learning_rate": 0.00017497155858930602, + "loss": 5.689, + "step": 7252 + }, + { + "epoch": 2.4754266211604095, + "grad_norm": 2.7123825550079346, + "learning_rate": 0.00017485779294653016, + "loss": 6.4214, + "step": 7253 + }, + { + "epoch": 2.4757679180887373, + "grad_norm": 2.7021186351776123, + "learning_rate": 0.00017474402730375426, + "loss": 5.8807, + "step": 7254 + }, + { + "epoch": 2.4761092150170647, + "grad_norm": 2.6904120445251465, + "learning_rate": 0.0001746302616609784, + "loss": 6.3117, + "step": 7255 + }, + { + "epoch": 2.4764505119453926, + "grad_norm": 2.68574595451355, + "learning_rate": 0.00017451649601820253, + "loss": 6.2527, + "step": 7256 + }, + { + "epoch": 2.47679180887372, + "grad_norm": 2.733830213546753, + "learning_rate": 0.0001744027303754266, + "loss": 6.2249, + "step": 7257 + }, + { + "epoch": 2.477133105802048, + "grad_norm": 2.7638471126556396, + "learning_rate": 0.00017428896473265074, + "loss": 6.5467, + "step": 7258 + }, + { + "epoch": 2.4774744027303752, + "grad_norm": 2.627199172973633, + "learning_rate": 0.00017417519908987487, + "loss": 6.2666, + "step": 7259 + }, + { + "epoch": 2.477815699658703, + "grad_norm": 2.6954307556152344, + "learning_rate": 0.000174061433447099, + "loss": 5.407, + "step": 7260 + }, + { + "epoch": 2.478156996587031, + "grad_norm": 2.6798486709594727, + "learning_rate": 0.00017394766780432308, + "loss": 6.3605, + "step": 7261 + }, + { + "epoch": 2.4784982935153583, + "grad_norm": 2.6780683994293213, + "learning_rate": 0.0001738339021615472, + "loss": 5.8869, + "step": 7262 + }, + { + "epoch": 2.478839590443686, + "grad_norm": 2.620546340942383, + "learning_rate": 0.00017372013651877134, + "loss": 6.0429, + "step": 7263 + }, + { + "epoch": 2.4791808873720136, + "grad_norm": 3.052701711654663, + "learning_rate": 0.00017360637087599545, + "loss": 5.4183, + "step": 7264 + }, + { + "epoch": 2.4795221843003414, + "grad_norm": 2.6496641635894775, + "learning_rate": 0.00017349260523321958, + "loss": 5.8464, + "step": 7265 + }, + { + "epoch": 2.479863481228669, + "grad_norm": 2.715848684310913, + "learning_rate": 0.00017337883959044368, + "loss": 6.6531, + "step": 7266 + }, + { + "epoch": 2.4802047781569967, + "grad_norm": 2.6739721298217773, + "learning_rate": 0.0001732650739476678, + "loss": 6.264, + "step": 7267 + }, + { + "epoch": 2.480546075085324, + "grad_norm": 2.7070438861846924, + "learning_rate": 0.00017315130830489192, + "loss": 6.047, + "step": 7268 + }, + { + "epoch": 2.480887372013652, + "grad_norm": 2.752788782119751, + "learning_rate": 0.00017303754266211605, + "loss": 6.0782, + "step": 7269 + }, + { + "epoch": 2.4812286689419794, + "grad_norm": 2.7496471405029297, + "learning_rate": 0.00017292377701934018, + "loss": 6.4748, + "step": 7270 + }, + { + "epoch": 2.481569965870307, + "grad_norm": 2.7307982444763184, + "learning_rate": 0.00017281001137656426, + "loss": 6.6512, + "step": 7271 + }, + { + "epoch": 2.4819112627986346, + "grad_norm": 2.725822925567627, + "learning_rate": 0.0001726962457337884, + "loss": 6.5072, + "step": 7272 + }, + { + "epoch": 2.4822525597269625, + "grad_norm": 2.7123241424560547, + "learning_rate": 0.00017258248009101253, + "loss": 5.8875, + "step": 7273 + }, + { + "epoch": 2.4825938566552903, + "grad_norm": 2.633855104446411, + "learning_rate": 0.00017246871444823663, + "loss": 6.0088, + "step": 7274 + }, + { + "epoch": 2.4829351535836177, + "grad_norm": 4.553799629211426, + "learning_rate": 0.00017235494880546076, + "loss": 4.3472, + "step": 7275 + }, + { + "epoch": 2.4832764505119456, + "grad_norm": 2.7347793579101562, + "learning_rate": 0.00017224118316268487, + "loss": 5.2237, + "step": 7276 + }, + { + "epoch": 2.483617747440273, + "grad_norm": 2.723522424697876, + "learning_rate": 0.00017212741751990897, + "loss": 6.5071, + "step": 7277 + }, + { + "epoch": 2.483959044368601, + "grad_norm": 2.6944844722747803, + "learning_rate": 0.0001720136518771331, + "loss": 6.1664, + "step": 7278 + }, + { + "epoch": 2.4843003412969282, + "grad_norm": 2.7033700942993164, + "learning_rate": 0.00017189988623435724, + "loss": 6.2606, + "step": 7279 + }, + { + "epoch": 2.484641638225256, + "grad_norm": 3.5506386756896973, + "learning_rate": 0.00017178612059158137, + "loss": 4.5449, + "step": 7280 + }, + { + "epoch": 2.4849829351535835, + "grad_norm": 2.705688714981079, + "learning_rate": 0.00017167235494880545, + "loss": 6.3225, + "step": 7281 + }, + { + "epoch": 2.4853242320819113, + "grad_norm": 2.7204155921936035, + "learning_rate": 0.00017155858930602958, + "loss": 6.047, + "step": 7282 + }, + { + "epoch": 2.4856655290102387, + "grad_norm": 2.656323194503784, + "learning_rate": 0.0001714448236632537, + "loss": 6.1217, + "step": 7283 + }, + { + "epoch": 2.4860068259385666, + "grad_norm": 3.497453451156616, + "learning_rate": 0.00017133105802047782, + "loss": 5.0882, + "step": 7284 + }, + { + "epoch": 2.486348122866894, + "grad_norm": 2.69559383392334, + "learning_rate": 0.00017121729237770195, + "loss": 5.7036, + "step": 7285 + }, + { + "epoch": 2.486689419795222, + "grad_norm": 2.734285354614258, + "learning_rate": 0.00017110352673492605, + "loss": 5.8757, + "step": 7286 + }, + { + "epoch": 2.4870307167235497, + "grad_norm": 2.6888885498046875, + "learning_rate": 0.00017098976109215016, + "loss": 5.7562, + "step": 7287 + }, + { + "epoch": 2.487372013651877, + "grad_norm": 2.7632334232330322, + "learning_rate": 0.0001708759954493743, + "loss": 6.3917, + "step": 7288 + }, + { + "epoch": 2.487713310580205, + "grad_norm": 2.7298285961151123, + "learning_rate": 0.00017076222980659842, + "loss": 5.9822, + "step": 7289 + }, + { + "epoch": 2.4880546075085324, + "grad_norm": 2.7448837757110596, + "learning_rate": 0.00017064846416382255, + "loss": 6.1619, + "step": 7290 + }, + { + "epoch": 2.48839590443686, + "grad_norm": 2.6427669525146484, + "learning_rate": 0.00017053469852104663, + "loss": 5.762, + "step": 7291 + }, + { + "epoch": 2.4887372013651876, + "grad_norm": 2.733144998550415, + "learning_rate": 0.00017042093287827076, + "loss": 6.4147, + "step": 7292 + }, + { + "epoch": 2.4890784982935155, + "grad_norm": 2.6906228065490723, + "learning_rate": 0.0001703071672354949, + "loss": 6.2987, + "step": 7293 + }, + { + "epoch": 2.489419795221843, + "grad_norm": 2.6950948238372803, + "learning_rate": 0.000170193401592719, + "loss": 5.6162, + "step": 7294 + }, + { + "epoch": 2.4897610921501707, + "grad_norm": 2.7234177589416504, + "learning_rate": 0.00017007963594994313, + "loss": 5.8311, + "step": 7295 + }, + { + "epoch": 2.490102389078498, + "grad_norm": 2.7544310092926025, + "learning_rate": 0.00016996587030716724, + "loss": 6.3541, + "step": 7296 + }, + { + "epoch": 2.490443686006826, + "grad_norm": 2.595151424407959, + "learning_rate": 0.00016985210466439134, + "loss": 6.0401, + "step": 7297 + }, + { + "epoch": 2.4907849829351534, + "grad_norm": 3.826329469680786, + "learning_rate": 0.00016973833902161548, + "loss": 4.7459, + "step": 7298 + }, + { + "epoch": 2.4911262798634812, + "grad_norm": 2.8267500400543213, + "learning_rate": 0.0001696245733788396, + "loss": 5.6925, + "step": 7299 + }, + { + "epoch": 2.491467576791809, + "grad_norm": 2.7589223384857178, + "learning_rate": 0.0001695108077360637, + "loss": 5.8866, + "step": 7300 + }, + { + "epoch": 2.4918088737201365, + "grad_norm": 2.703211545944214, + "learning_rate": 0.00016939704209328782, + "loss": 6.3634, + "step": 7301 + }, + { + "epoch": 2.4921501706484643, + "grad_norm": 2.917485237121582, + "learning_rate": 0.00016928327645051195, + "loss": 5.7224, + "step": 7302 + }, + { + "epoch": 2.4924914675767917, + "grad_norm": 2.6582095623016357, + "learning_rate": 0.00016916951080773608, + "loss": 5.8915, + "step": 7303 + }, + { + "epoch": 2.4928327645051196, + "grad_norm": 2.8652775287628174, + "learning_rate": 0.00016905574516496019, + "loss": 3.9218, + "step": 7304 + }, + { + "epoch": 2.493174061433447, + "grad_norm": 2.739563226699829, + "learning_rate": 0.0001689419795221843, + "loss": 6.6156, + "step": 7305 + }, + { + "epoch": 2.493515358361775, + "grad_norm": 2.928471803665161, + "learning_rate": 0.00016882821387940842, + "loss": 5.1992, + "step": 7306 + }, + { + "epoch": 2.4938566552901023, + "grad_norm": 2.6852288246154785, + "learning_rate": 0.00016871444823663253, + "loss": 5.9716, + "step": 7307 + }, + { + "epoch": 2.49419795221843, + "grad_norm": 2.7359983921051025, + "learning_rate": 0.00016860068259385666, + "loss": 5.5148, + "step": 7308 + }, + { + "epoch": 2.4945392491467575, + "grad_norm": 2.796595335006714, + "learning_rate": 0.0001684869169510808, + "loss": 6.9963, + "step": 7309 + }, + { + "epoch": 2.4948805460750854, + "grad_norm": 2.682931900024414, + "learning_rate": 0.0001683731513083049, + "loss": 5.4007, + "step": 7310 + }, + { + "epoch": 2.4952218430034128, + "grad_norm": 1.8263013362884521, + "learning_rate": 0.000168259385665529, + "loss": 3.1275, + "step": 7311 + }, + { + "epoch": 2.4955631399317406, + "grad_norm": 2.828277587890625, + "learning_rate": 0.00016814562002275313, + "loss": 6.0928, + "step": 7312 + }, + { + "epoch": 2.4959044368600685, + "grad_norm": 2.7260549068450928, + "learning_rate": 0.00016803185437997727, + "loss": 5.7029, + "step": 7313 + }, + { + "epoch": 2.496245733788396, + "grad_norm": 2.6961669921875, + "learning_rate": 0.00016791808873720137, + "loss": 5.9228, + "step": 7314 + }, + { + "epoch": 2.4965870307167237, + "grad_norm": 2.813055992126465, + "learning_rate": 0.00016780432309442548, + "loss": 6.1379, + "step": 7315 + }, + { + "epoch": 2.496928327645051, + "grad_norm": 2.6785166263580322, + "learning_rate": 0.0001676905574516496, + "loss": 5.7098, + "step": 7316 + }, + { + "epoch": 2.497269624573379, + "grad_norm": 2.698068857192993, + "learning_rate": 0.0001675767918088737, + "loss": 6.0382, + "step": 7317 + }, + { + "epoch": 2.4976109215017064, + "grad_norm": 2.680922508239746, + "learning_rate": 0.00016746302616609785, + "loss": 6.1616, + "step": 7318 + }, + { + "epoch": 2.4979522184300342, + "grad_norm": 2.7005722522735596, + "learning_rate": 0.00016734926052332198, + "loss": 5.649, + "step": 7319 + }, + { + "epoch": 2.4982935153583616, + "grad_norm": 2.744039535522461, + "learning_rate": 0.00016723549488054606, + "loss": 6.2951, + "step": 7320 + }, + { + "epoch": 2.4986348122866895, + "grad_norm": 2.7696053981781006, + "learning_rate": 0.0001671217292377702, + "loss": 6.2824, + "step": 7321 + }, + { + "epoch": 2.498976109215017, + "grad_norm": 2.9358487129211426, + "learning_rate": 0.00016700796359499432, + "loss": 5.1869, + "step": 7322 + }, + { + "epoch": 2.4993174061433447, + "grad_norm": 2.740104913711548, + "learning_rate": 0.00016689419795221845, + "loss": 6.0786, + "step": 7323 + }, + { + "epoch": 2.499658703071672, + "grad_norm": 2.702230215072632, + "learning_rate": 0.00016678043230944256, + "loss": 6.4051, + "step": 7324 + }, + { + "epoch": 2.5, + "grad_norm": 4.933879375457764, + "learning_rate": 0.00016666666666666666, + "loss": 4.6992, + "step": 7325 + }, + { + "epoch": 2.500341296928328, + "grad_norm": 2.6900596618652344, + "learning_rate": 0.0001665529010238908, + "loss": 5.8764, + "step": 7326 + }, + { + "epoch": 2.5006825938566553, + "grad_norm": 2.7726457118988037, + "learning_rate": 0.0001664391353811149, + "loss": 5.6187, + "step": 7327 + }, + { + "epoch": 2.5010238907849827, + "grad_norm": 1.9055275917053223, + "learning_rate": 0.00016632536973833903, + "loss": 3.4067, + "step": 7328 + }, + { + "epoch": 2.5013651877133105, + "grad_norm": 2.7264151573181152, + "learning_rate": 0.00016621160409556316, + "loss": 6.0756, + "step": 7329 + }, + { + "epoch": 2.5017064846416384, + "grad_norm": 2.759608030319214, + "learning_rate": 0.00016609783845278724, + "loss": 6.1258, + "step": 7330 + }, + { + "epoch": 2.5020477815699658, + "grad_norm": 2.6936237812042236, + "learning_rate": 0.00016598407281001137, + "loss": 6.2095, + "step": 7331 + }, + { + "epoch": 2.5023890784982936, + "grad_norm": 2.6739792823791504, + "learning_rate": 0.0001658703071672355, + "loss": 6.5549, + "step": 7332 + }, + { + "epoch": 2.502730375426621, + "grad_norm": 2.725817918777466, + "learning_rate": 0.00016575654152445964, + "loss": 6.1217, + "step": 7333 + }, + { + "epoch": 2.503071672354949, + "grad_norm": 2.785630702972412, + "learning_rate": 0.00016564277588168371, + "loss": 6.0126, + "step": 7334 + }, + { + "epoch": 2.5034129692832767, + "grad_norm": 2.7225522994995117, + "learning_rate": 0.00016552901023890785, + "loss": 5.9088, + "step": 7335 + }, + { + "epoch": 2.503754266211604, + "grad_norm": 2.7138113975524902, + "learning_rate": 0.00016541524459613198, + "loss": 6.5287, + "step": 7336 + }, + { + "epoch": 2.5040955631399315, + "grad_norm": 2.659165143966675, + "learning_rate": 0.00016530147895335608, + "loss": 6.1559, + "step": 7337 + }, + { + "epoch": 2.5044368600682594, + "grad_norm": 2.6596744060516357, + "learning_rate": 0.00016518771331058022, + "loss": 5.9957, + "step": 7338 + }, + { + "epoch": 2.5047781569965872, + "grad_norm": 2.6613821983337402, + "learning_rate": 0.00016507394766780432, + "loss": 5.6435, + "step": 7339 + }, + { + "epoch": 2.5051194539249146, + "grad_norm": 2.7147886753082275, + "learning_rate": 0.00016496018202502842, + "loss": 6.0681, + "step": 7340 + }, + { + "epoch": 2.505460750853242, + "grad_norm": 2.6107845306396484, + "learning_rate": 0.00016484641638225256, + "loss": 5.9106, + "step": 7341 + }, + { + "epoch": 2.50580204778157, + "grad_norm": 2.7579963207244873, + "learning_rate": 0.0001647326507394767, + "loss": 6.6005, + "step": 7342 + }, + { + "epoch": 2.5061433447098977, + "grad_norm": 2.604501962661743, + "learning_rate": 0.00016461888509670082, + "loss": 5.8982, + "step": 7343 + }, + { + "epoch": 2.506484641638225, + "grad_norm": 2.7372636795043945, + "learning_rate": 0.0001645051194539249, + "loss": 5.3334, + "step": 7344 + }, + { + "epoch": 2.506825938566553, + "grad_norm": 2.69775390625, + "learning_rate": 0.00016439135381114903, + "loss": 6.0477, + "step": 7345 + }, + { + "epoch": 2.5071672354948804, + "grad_norm": 2.698512077331543, + "learning_rate": 0.00016427758816837316, + "loss": 5.5654, + "step": 7346 + }, + { + "epoch": 2.5075085324232083, + "grad_norm": 2.6294450759887695, + "learning_rate": 0.00016416382252559727, + "loss": 6.1314, + "step": 7347 + }, + { + "epoch": 2.507849829351536, + "grad_norm": 2.6949124336242676, + "learning_rate": 0.0001640500568828214, + "loss": 6.6314, + "step": 7348 + }, + { + "epoch": 2.5081911262798635, + "grad_norm": 2.7231664657592773, + "learning_rate": 0.0001639362912400455, + "loss": 6.1183, + "step": 7349 + }, + { + "epoch": 2.508532423208191, + "grad_norm": 2.9584968090057373, + "learning_rate": 0.0001638225255972696, + "loss": 5.1231, + "step": 7350 + }, + { + "epoch": 2.5088737201365188, + "grad_norm": 2.7133231163024902, + "learning_rate": 0.00016370875995449374, + "loss": 5.5295, + "step": 7351 + }, + { + "epoch": 2.5092150170648466, + "grad_norm": 2.7822914123535156, + "learning_rate": 0.00016359499431171787, + "loss": 5.4579, + "step": 7352 + }, + { + "epoch": 2.509556313993174, + "grad_norm": 2.6988959312438965, + "learning_rate": 0.000163481228668942, + "loss": 6.0726, + "step": 7353 + }, + { + "epoch": 2.5098976109215014, + "grad_norm": 2.623605489730835, + "learning_rate": 0.00016336746302616608, + "loss": 6.1312, + "step": 7354 + }, + { + "epoch": 2.5102389078498293, + "grad_norm": 2.7255163192749023, + "learning_rate": 0.00016325369738339022, + "loss": 5.9188, + "step": 7355 + }, + { + "epoch": 2.510580204778157, + "grad_norm": 2.6522679328918457, + "learning_rate": 0.00016313993174061435, + "loss": 6.2107, + "step": 7356 + }, + { + "epoch": 2.5109215017064845, + "grad_norm": 2.733840227127075, + "learning_rate": 0.00016302616609783845, + "loss": 6.0067, + "step": 7357 + }, + { + "epoch": 2.5112627986348124, + "grad_norm": 2.6680688858032227, + "learning_rate": 0.00016291240045506259, + "loss": 6.2216, + "step": 7358 + }, + { + "epoch": 2.51160409556314, + "grad_norm": 3.2532639503479004, + "learning_rate": 0.0001627986348122867, + "loss": 5.0936, + "step": 7359 + }, + { + "epoch": 2.5119453924914676, + "grad_norm": 2.6566052436828613, + "learning_rate": 0.0001626848691695108, + "loss": 6.2296, + "step": 7360 + }, + { + "epoch": 2.5122866894197955, + "grad_norm": 3.307908296585083, + "learning_rate": 0.00016257110352673493, + "loss": 4.3073, + "step": 7361 + }, + { + "epoch": 2.512627986348123, + "grad_norm": 2.7235147953033447, + "learning_rate": 0.00016245733788395906, + "loss": 5.967, + "step": 7362 + }, + { + "epoch": 2.5129692832764503, + "grad_norm": 2.7577085494995117, + "learning_rate": 0.0001623435722411832, + "loss": 4.6936, + "step": 7363 + }, + { + "epoch": 2.513310580204778, + "grad_norm": 2.6942477226257324, + "learning_rate": 0.00016222980659840727, + "loss": 6.1181, + "step": 7364 + }, + { + "epoch": 2.513651877133106, + "grad_norm": 2.6965997219085693, + "learning_rate": 0.0001621160409556314, + "loss": 4.4271, + "step": 7365 + }, + { + "epoch": 2.5139931740614334, + "grad_norm": 2.5623295307159424, + "learning_rate": 0.00016200227531285553, + "loss": 5.8977, + "step": 7366 + }, + { + "epoch": 2.514334470989761, + "grad_norm": 2.7341978549957275, + "learning_rate": 0.00016188850967007964, + "loss": 5.7355, + "step": 7367 + }, + { + "epoch": 2.5146757679180887, + "grad_norm": 2.686908721923828, + "learning_rate": 0.00016177474402730374, + "loss": 5.5919, + "step": 7368 + }, + { + "epoch": 2.5150170648464165, + "grad_norm": 2.768826961517334, + "learning_rate": 0.00016166097838452787, + "loss": 6.317, + "step": 7369 + }, + { + "epoch": 2.515358361774744, + "grad_norm": 2.7448062896728516, + "learning_rate": 0.00016154721274175198, + "loss": 5.9812, + "step": 7370 + }, + { + "epoch": 2.5156996587030718, + "grad_norm": 2.6533572673797607, + "learning_rate": 0.0001614334470989761, + "loss": 5.7661, + "step": 7371 + }, + { + "epoch": 2.516040955631399, + "grad_norm": 2.7210354804992676, + "learning_rate": 0.00016131968145620024, + "loss": 5.9004, + "step": 7372 + }, + { + "epoch": 2.516382252559727, + "grad_norm": 2.7211415767669678, + "learning_rate": 0.00016120591581342435, + "loss": 6.3239, + "step": 7373 + }, + { + "epoch": 2.516723549488055, + "grad_norm": 2.718151092529297, + "learning_rate": 0.00016109215017064845, + "loss": 6.2132, + "step": 7374 + }, + { + "epoch": 2.5170648464163823, + "grad_norm": 2.657224416732788, + "learning_rate": 0.00016097838452787259, + "loss": 5.9553, + "step": 7375 + }, + { + "epoch": 2.5174061433447097, + "grad_norm": 2.738079071044922, + "learning_rate": 0.00016086461888509672, + "loss": 5.824, + "step": 7376 + }, + { + "epoch": 2.5177474402730375, + "grad_norm": 2.7597906589508057, + "learning_rate": 0.00016075085324232082, + "loss": 6.2877, + "step": 7377 + }, + { + "epoch": 2.5180887372013654, + "grad_norm": 2.7214534282684326, + "learning_rate": 0.00016063708759954493, + "loss": 6.262, + "step": 7378 + }, + { + "epoch": 2.518430034129693, + "grad_norm": 2.705698013305664, + "learning_rate": 0.00016052332195676906, + "loss": 6.2512, + "step": 7379 + }, + { + "epoch": 2.51877133105802, + "grad_norm": 2.636011838912964, + "learning_rate": 0.00016040955631399316, + "loss": 4.9735, + "step": 7380 + }, + { + "epoch": 2.519112627986348, + "grad_norm": 3.66536021232605, + "learning_rate": 0.0001602957906712173, + "loss": 3.9323, + "step": 7381 + }, + { + "epoch": 2.519453924914676, + "grad_norm": 2.7328991889953613, + "learning_rate": 0.00016018202502844143, + "loss": 4.5109, + "step": 7382 + }, + { + "epoch": 2.5197952218430033, + "grad_norm": 2.689422369003296, + "learning_rate": 0.00016006825938566553, + "loss": 5.9128, + "step": 7383 + }, + { + "epoch": 2.520136518771331, + "grad_norm": 3.5838780403137207, + "learning_rate": 0.00015995449374288964, + "loss": 4.2017, + "step": 7384 + }, + { + "epoch": 2.5204778156996586, + "grad_norm": 2.720184087753296, + "learning_rate": 0.00015984072810011377, + "loss": 5.6297, + "step": 7385 + }, + { + "epoch": 2.5208191126279864, + "grad_norm": 2.6871819496154785, + "learning_rate": 0.0001597269624573379, + "loss": 5.9124, + "step": 7386 + }, + { + "epoch": 2.5211604095563143, + "grad_norm": 2.7035536766052246, + "learning_rate": 0.000159613196814562, + "loss": 6.2627, + "step": 7387 + }, + { + "epoch": 2.5215017064846417, + "grad_norm": 2.7028918266296387, + "learning_rate": 0.0001594994311717861, + "loss": 5.8171, + "step": 7388 + }, + { + "epoch": 2.521843003412969, + "grad_norm": 2.6863887310028076, + "learning_rate": 0.00015938566552901024, + "loss": 6.7491, + "step": 7389 + }, + { + "epoch": 2.522184300341297, + "grad_norm": 2.64327073097229, + "learning_rate": 0.00015927189988623435, + "loss": 6.236, + "step": 7390 + }, + { + "epoch": 2.5225255972696248, + "grad_norm": 2.628070592880249, + "learning_rate": 0.00015915813424345848, + "loss": 5.51, + "step": 7391 + }, + { + "epoch": 2.522866894197952, + "grad_norm": 2.6996893882751465, + "learning_rate": 0.00015904436860068261, + "loss": 6.1538, + "step": 7392 + }, + { + "epoch": 2.5232081911262796, + "grad_norm": 4.886688709259033, + "learning_rate": 0.00015893060295790672, + "loss": 4.6864, + "step": 7393 + }, + { + "epoch": 2.5235494880546074, + "grad_norm": 2.6805944442749023, + "learning_rate": 0.00015881683731513082, + "loss": 6.105, + "step": 7394 + }, + { + "epoch": 2.5238907849829353, + "grad_norm": 2.6910324096679688, + "learning_rate": 0.00015870307167235496, + "loss": 6.266, + "step": 7395 + }, + { + "epoch": 2.5242320819112627, + "grad_norm": 2.6817996501922607, + "learning_rate": 0.0001585893060295791, + "loss": 6.4578, + "step": 7396 + }, + { + "epoch": 2.5245733788395905, + "grad_norm": 3.1869544982910156, + "learning_rate": 0.0001584755403868032, + "loss": 5.4998, + "step": 7397 + }, + { + "epoch": 2.524914675767918, + "grad_norm": 2.6462249755859375, + "learning_rate": 0.0001583617747440273, + "loss": 6.3286, + "step": 7398 + }, + { + "epoch": 2.525255972696246, + "grad_norm": 2.711543321609497, + "learning_rate": 0.00015824800910125143, + "loss": 6.6815, + "step": 7399 + }, + { + "epoch": 2.5255972696245736, + "grad_norm": 2.6534812450408936, + "learning_rate": 0.00015813424345847553, + "loss": 6.3155, + "step": 7400 + }, + { + "epoch": 2.525938566552901, + "grad_norm": 2.68021559715271, + "learning_rate": 0.00015802047781569967, + "loss": 6.126, + "step": 7401 + }, + { + "epoch": 2.5262798634812285, + "grad_norm": 2.7050492763519287, + "learning_rate": 0.00015790671217292377, + "loss": 5.7176, + "step": 7402 + }, + { + "epoch": 2.5266211604095563, + "grad_norm": 2.6450881958007812, + "learning_rate": 0.0001577929465301479, + "loss": 6.5606, + "step": 7403 + }, + { + "epoch": 2.526962457337884, + "grad_norm": 2.6857900619506836, + "learning_rate": 0.000157679180887372, + "loss": 6.2598, + "step": 7404 + }, + { + "epoch": 2.5273037542662116, + "grad_norm": 2.6581039428710938, + "learning_rate": 0.00015756541524459614, + "loss": 6.5876, + "step": 7405 + }, + { + "epoch": 2.527645051194539, + "grad_norm": 2.6261470317840576, + "learning_rate": 0.00015745164960182027, + "loss": 5.4513, + "step": 7406 + }, + { + "epoch": 2.527986348122867, + "grad_norm": 2.684361696243286, + "learning_rate": 0.00015733788395904435, + "loss": 5.8607, + "step": 7407 + }, + { + "epoch": 2.5283276450511947, + "grad_norm": 2.6788904666900635, + "learning_rate": 0.00015722411831626848, + "loss": 6.32, + "step": 7408 + }, + { + "epoch": 2.528668941979522, + "grad_norm": 2.700685977935791, + "learning_rate": 0.00015711035267349261, + "loss": 5.7209, + "step": 7409 + }, + { + "epoch": 2.52901023890785, + "grad_norm": 2.7809455394744873, + "learning_rate": 0.00015699658703071672, + "loss": 6.0776, + "step": 7410 + }, + { + "epoch": 2.5293515358361773, + "grad_norm": 2.6642520427703857, + "learning_rate": 0.00015688282138794085, + "loss": 5.6266, + "step": 7411 + }, + { + "epoch": 2.529692832764505, + "grad_norm": 2.7473788261413574, + "learning_rate": 0.00015676905574516496, + "loss": 5.0825, + "step": 7412 + }, + { + "epoch": 2.530034129692833, + "grad_norm": 2.7476108074188232, + "learning_rate": 0.0001566552901023891, + "loss": 6.2024, + "step": 7413 + }, + { + "epoch": 2.5303754266211604, + "grad_norm": 2.6110386848449707, + "learning_rate": 0.0001565415244596132, + "loss": 6.1209, + "step": 7414 + }, + { + "epoch": 2.530716723549488, + "grad_norm": 2.9755313396453857, + "learning_rate": 0.00015642775881683733, + "loss": 5.5796, + "step": 7415 + }, + { + "epoch": 2.5310580204778157, + "grad_norm": 2.710559844970703, + "learning_rate": 0.00015631399317406146, + "loss": 5.7508, + "step": 7416 + }, + { + "epoch": 2.5313993174061435, + "grad_norm": 2.636697292327881, + "learning_rate": 0.00015620022753128554, + "loss": 6.1145, + "step": 7417 + }, + { + "epoch": 2.531740614334471, + "grad_norm": 2.8945558071136475, + "learning_rate": 0.00015608646188850967, + "loss": 5.4822, + "step": 7418 + }, + { + "epoch": 2.532081911262799, + "grad_norm": 2.7583422660827637, + "learning_rate": 0.0001559726962457338, + "loss": 6.3568, + "step": 7419 + }, + { + "epoch": 2.532423208191126, + "grad_norm": 2.6713101863861084, + "learning_rate": 0.0001558589306029579, + "loss": 5.554, + "step": 7420 + }, + { + "epoch": 2.532764505119454, + "grad_norm": 2.626268148422241, + "learning_rate": 0.00015574516496018204, + "loss": 5.8425, + "step": 7421 + }, + { + "epoch": 2.5331058020477815, + "grad_norm": 2.9670071601867676, + "learning_rate": 0.00015563139931740614, + "loss": 5.2788, + "step": 7422 + }, + { + "epoch": 2.5334470989761093, + "grad_norm": 2.694166660308838, + "learning_rate": 0.00015551763367463025, + "loss": 5.982, + "step": 7423 + }, + { + "epoch": 2.5337883959044367, + "grad_norm": 2.6969735622406006, + "learning_rate": 0.00015540386803185438, + "loss": 6.4294, + "step": 7424 + }, + { + "epoch": 2.5341296928327646, + "grad_norm": 2.7275049686431885, + "learning_rate": 0.0001552901023890785, + "loss": 6.0112, + "step": 7425 + }, + { + "epoch": 2.5344709897610924, + "grad_norm": 2.7712514400482178, + "learning_rate": 0.00015517633674630264, + "loss": 6.2063, + "step": 7426 + }, + { + "epoch": 2.53481228668942, + "grad_norm": 3.0031471252441406, + "learning_rate": 0.00015506257110352672, + "loss": 4.5911, + "step": 7427 + }, + { + "epoch": 2.5351535836177472, + "grad_norm": 2.857637882232666, + "learning_rate": 0.00015494880546075085, + "loss": 4.5048, + "step": 7428 + }, + { + "epoch": 2.535494880546075, + "grad_norm": 2.757772207260132, + "learning_rate": 0.00015483503981797498, + "loss": 5.9304, + "step": 7429 + }, + { + "epoch": 2.535836177474403, + "grad_norm": 2.77624773979187, + "learning_rate": 0.0001547212741751991, + "loss": 5.2982, + "step": 7430 + }, + { + "epoch": 2.5361774744027303, + "grad_norm": 2.9393277168273926, + "learning_rate": 0.00015460750853242322, + "loss": 5.1273, + "step": 7431 + }, + { + "epoch": 2.536518771331058, + "grad_norm": 2.715731143951416, + "learning_rate": 0.00015449374288964733, + "loss": 6.2645, + "step": 7432 + }, + { + "epoch": 2.5368600682593856, + "grad_norm": 2.7153773307800293, + "learning_rate": 0.00015437997724687143, + "loss": 5.8669, + "step": 7433 + }, + { + "epoch": 2.5372013651877134, + "grad_norm": 2.890367031097412, + "learning_rate": 0.00015426621160409556, + "loss": 5.8867, + "step": 7434 + }, + { + "epoch": 2.537542662116041, + "grad_norm": 2.7217259407043457, + "learning_rate": 0.0001541524459613197, + "loss": 6.2305, + "step": 7435 + }, + { + "epoch": 2.5378839590443687, + "grad_norm": 5.296313285827637, + "learning_rate": 0.00015403868031854383, + "loss": 3.7479, + "step": 7436 + }, + { + "epoch": 2.538225255972696, + "grad_norm": 2.703120231628418, + "learning_rate": 0.0001539249146757679, + "loss": 6.5971, + "step": 7437 + }, + { + "epoch": 2.538566552901024, + "grad_norm": 3.4888710975646973, + "learning_rate": 0.00015381114903299204, + "loss": 4.5474, + "step": 7438 + }, + { + "epoch": 2.538907849829352, + "grad_norm": 2.7433698177337646, + "learning_rate": 0.00015369738339021617, + "loss": 5.4895, + "step": 7439 + }, + { + "epoch": 2.539249146757679, + "grad_norm": 2.715934991836548, + "learning_rate": 0.00015358361774744027, + "loss": 4.5857, + "step": 7440 + }, + { + "epoch": 2.5395904436860066, + "grad_norm": 2.6325573921203613, + "learning_rate": 0.00015346985210466438, + "loss": 6.0054, + "step": 7441 + }, + { + "epoch": 2.5399317406143345, + "grad_norm": 2.666834831237793, + "learning_rate": 0.0001533560864618885, + "loss": 6.4198, + "step": 7442 + }, + { + "epoch": 2.5402730375426623, + "grad_norm": 2.6133065223693848, + "learning_rate": 0.00015324232081911262, + "loss": 6.1836, + "step": 7443 + }, + { + "epoch": 2.5406143344709897, + "grad_norm": 2.7008965015411377, + "learning_rate": 0.00015312855517633675, + "loss": 6.5665, + "step": 7444 + }, + { + "epoch": 2.5409556313993176, + "grad_norm": 2.5905418395996094, + "learning_rate": 0.00015301478953356088, + "loss": 5.7239, + "step": 7445 + }, + { + "epoch": 2.541296928327645, + "grad_norm": 2.6478431224823, + "learning_rate": 0.00015290102389078499, + "loss": 6.0945, + "step": 7446 + }, + { + "epoch": 2.541638225255973, + "grad_norm": 2.7686331272125244, + "learning_rate": 0.0001527872582480091, + "loss": 5.9458, + "step": 7447 + }, + { + "epoch": 2.5419795221843002, + "grad_norm": 2.7820048332214355, + "learning_rate": 0.00015267349260523322, + "loss": 5.3187, + "step": 7448 + }, + { + "epoch": 2.542320819112628, + "grad_norm": 2.7252564430236816, + "learning_rate": 0.00015255972696245735, + "loss": 6.1386, + "step": 7449 + }, + { + "epoch": 2.5426621160409555, + "grad_norm": 3.9234542846679688, + "learning_rate": 0.00015244596131968146, + "loss": 4.9931, + "step": 7450 + }, + { + "epoch": 2.5430034129692833, + "grad_norm": 2.7013142108917236, + "learning_rate": 0.00015233219567690556, + "loss": 6.5903, + "step": 7451 + }, + { + "epoch": 2.543344709897611, + "grad_norm": 2.7234249114990234, + "learning_rate": 0.0001522184300341297, + "loss": 6.5083, + "step": 7452 + }, + { + "epoch": 2.5436860068259386, + "grad_norm": 2.646691083908081, + "learning_rate": 0.0001521046643913538, + "loss": 5.8912, + "step": 7453 + }, + { + "epoch": 2.544027303754266, + "grad_norm": 2.6651551723480225, + "learning_rate": 0.00015199089874857793, + "loss": 6.4359, + "step": 7454 + }, + { + "epoch": 2.544368600682594, + "grad_norm": 2.7203242778778076, + "learning_rate": 0.00015187713310580207, + "loss": 6.1125, + "step": 7455 + }, + { + "epoch": 2.5447098976109217, + "grad_norm": 2.7022976875305176, + "learning_rate": 0.00015176336746302617, + "loss": 6.3177, + "step": 7456 + }, + { + "epoch": 2.545051194539249, + "grad_norm": 3.036482334136963, + "learning_rate": 0.00015164960182025028, + "loss": 5.8322, + "step": 7457 + }, + { + "epoch": 2.545392491467577, + "grad_norm": 2.6736273765563965, + "learning_rate": 0.0001515358361774744, + "loss": 5.683, + "step": 7458 + }, + { + "epoch": 2.5457337883959044, + "grad_norm": 2.6975038051605225, + "learning_rate": 0.00015142207053469854, + "loss": 5.9692, + "step": 7459 + }, + { + "epoch": 2.546075085324232, + "grad_norm": 2.6708292961120605, + "learning_rate": 0.00015130830489192264, + "loss": 6.2471, + "step": 7460 + }, + { + "epoch": 2.5464163822525596, + "grad_norm": 4.273525714874268, + "learning_rate": 0.00015119453924914675, + "loss": 5.0722, + "step": 7461 + }, + { + "epoch": 2.5467576791808875, + "grad_norm": 2.717935085296631, + "learning_rate": 0.00015108077360637088, + "loss": 6.2979, + "step": 7462 + }, + { + "epoch": 2.547098976109215, + "grad_norm": 2.620076894760132, + "learning_rate": 0.00015096700796359499, + "loss": 6.0223, + "step": 7463 + }, + { + "epoch": 2.5474402730375427, + "grad_norm": 2.6915009021759033, + "learning_rate": 0.00015085324232081912, + "loss": 6.0069, + "step": 7464 + }, + { + "epoch": 2.5477815699658706, + "grad_norm": 2.7233173847198486, + "learning_rate": 0.00015073947667804325, + "loss": 5.3867, + "step": 7465 + }, + { + "epoch": 2.548122866894198, + "grad_norm": 2.6472926139831543, + "learning_rate": 0.00015062571103526736, + "loss": 6.8144, + "step": 7466 + }, + { + "epoch": 2.5484641638225254, + "grad_norm": 2.7000956535339355, + "learning_rate": 0.00015051194539249146, + "loss": 5.9423, + "step": 7467 + }, + { + "epoch": 2.5488054607508532, + "grad_norm": 2.6225948333740234, + "learning_rate": 0.0001503981797497156, + "loss": 6.0564, + "step": 7468 + }, + { + "epoch": 2.549146757679181, + "grad_norm": 3.0721187591552734, + "learning_rate": 0.00015028441410693972, + "loss": 4.9398, + "step": 7469 + }, + { + "epoch": 2.5494880546075085, + "grad_norm": 2.621379852294922, + "learning_rate": 0.00015017064846416383, + "loss": 6.065, + "step": 7470 + }, + { + "epoch": 2.5498293515358363, + "grad_norm": 2.6615641117095947, + "learning_rate": 0.00015005688282138793, + "loss": 6.5425, + "step": 7471 + }, + { + "epoch": 2.5501706484641637, + "grad_norm": 2.7236409187316895, + "learning_rate": 0.00014994311717861207, + "loss": 6.1923, + "step": 7472 + }, + { + "epoch": 2.5505119453924916, + "grad_norm": 2.776329517364502, + "learning_rate": 0.00014982935153583617, + "loss": 6.3152, + "step": 7473 + }, + { + "epoch": 2.550853242320819, + "grad_norm": 2.7233684062957764, + "learning_rate": 0.0001497155858930603, + "loss": 5.9222, + "step": 7474 + }, + { + "epoch": 2.551194539249147, + "grad_norm": 2.578994035720825, + "learning_rate": 0.0001496018202502844, + "loss": 5.9317, + "step": 7475 + }, + { + "epoch": 2.5515358361774743, + "grad_norm": 2.6251556873321533, + "learning_rate": 0.00014948805460750854, + "loss": 6.1575, + "step": 7476 + }, + { + "epoch": 2.551877133105802, + "grad_norm": 2.599755048751831, + "learning_rate": 0.00014937428896473265, + "loss": 6.3228, + "step": 7477 + }, + { + "epoch": 2.55221843003413, + "grad_norm": 2.6746912002563477, + "learning_rate": 0.00014926052332195678, + "loss": 6.0529, + "step": 7478 + }, + { + "epoch": 2.5525597269624574, + "grad_norm": 2.6559457778930664, + "learning_rate": 0.0001491467576791809, + "loss": 5.4679, + "step": 7479 + }, + { + "epoch": 2.5529010238907848, + "grad_norm": 2.8027467727661133, + "learning_rate": 0.000149032992036405, + "loss": 5.8367, + "step": 7480 + }, + { + "epoch": 2.5532423208191126, + "grad_norm": 2.701709747314453, + "learning_rate": 0.00014891922639362912, + "loss": 6.8529, + "step": 7481 + }, + { + "epoch": 2.5535836177474405, + "grad_norm": 2.6528892517089844, + "learning_rate": 0.00014880546075085325, + "loss": 5.5392, + "step": 7482 + }, + { + "epoch": 2.553924914675768, + "grad_norm": 2.7285146713256836, + "learning_rate": 0.00014869169510807736, + "loss": 6.1253, + "step": 7483 + }, + { + "epoch": 2.5542662116040957, + "grad_norm": 2.646544933319092, + "learning_rate": 0.0001485779294653015, + "loss": 6.2362, + "step": 7484 + }, + { + "epoch": 2.554607508532423, + "grad_norm": 2.632233142852783, + "learning_rate": 0.0001484641638225256, + "loss": 5.9352, + "step": 7485 + }, + { + "epoch": 2.554948805460751, + "grad_norm": 2.641140937805176, + "learning_rate": 0.00014835039817974973, + "loss": 6.2673, + "step": 7486 + }, + { + "epoch": 2.5552901023890784, + "grad_norm": 2.7124555110931396, + "learning_rate": 0.00014823663253697383, + "loss": 5.4315, + "step": 7487 + }, + { + "epoch": 2.5556313993174062, + "grad_norm": 2.6860406398773193, + "learning_rate": 0.00014812286689419796, + "loss": 6.3044, + "step": 7488 + }, + { + "epoch": 2.5559726962457336, + "grad_norm": 2.6111605167388916, + "learning_rate": 0.0001480091012514221, + "loss": 6.2049, + "step": 7489 + }, + { + "epoch": 2.5563139931740615, + "grad_norm": 2.7284209728240967, + "learning_rate": 0.00014789533560864617, + "loss": 6.7773, + "step": 7490 + }, + { + "epoch": 2.5566552901023893, + "grad_norm": 2.732853651046753, + "learning_rate": 0.0001477815699658703, + "loss": 5.6464, + "step": 7491 + }, + { + "epoch": 2.5569965870307167, + "grad_norm": 2.888129949569702, + "learning_rate": 0.00014766780432309444, + "loss": 5.4964, + "step": 7492 + }, + { + "epoch": 2.557337883959044, + "grad_norm": 2.723909616470337, + "learning_rate": 0.00014755403868031854, + "loss": 6.3293, + "step": 7493 + }, + { + "epoch": 2.557679180887372, + "grad_norm": 2.6410248279571533, + "learning_rate": 0.00014744027303754267, + "loss": 5.7546, + "step": 7494 + }, + { + "epoch": 2.5580204778157, + "grad_norm": 2.708897829055786, + "learning_rate": 0.00014732650739476678, + "loss": 5.9368, + "step": 7495 + }, + { + "epoch": 2.5583617747440273, + "grad_norm": 2.6864473819732666, + "learning_rate": 0.0001472127417519909, + "loss": 6.3272, + "step": 7496 + }, + { + "epoch": 2.558703071672355, + "grad_norm": 2.6929290294647217, + "learning_rate": 0.00014709897610921502, + "loss": 6.0851, + "step": 7497 + }, + { + "epoch": 2.5590443686006825, + "grad_norm": 2.725696325302124, + "learning_rate": 0.00014698521046643915, + "loss": 6.092, + "step": 7498 + }, + { + "epoch": 2.5593856655290104, + "grad_norm": 2.703749418258667, + "learning_rate": 0.00014687144482366328, + "loss": 5.9719, + "step": 7499 + }, + { + "epoch": 2.5597269624573378, + "grad_norm": 3.734013319015503, + "learning_rate": 0.00014675767918088736, + "loss": 4.0252, + "step": 7500 + }, + { + "epoch": 2.5600682593856656, + "grad_norm": 2.6439578533172607, + "learning_rate": 0.0001466439135381115, + "loss": 5.7126, + "step": 7501 + }, + { + "epoch": 2.560409556313993, + "grad_norm": 2.6712331771850586, + "learning_rate": 0.00014653014789533562, + "loss": 6.0862, + "step": 7502 + }, + { + "epoch": 2.560750853242321, + "grad_norm": 2.6167094707489014, + "learning_rate": 0.00014641638225255973, + "loss": 6.5083, + "step": 7503 + }, + { + "epoch": 2.5610921501706487, + "grad_norm": 2.631967544555664, + "learning_rate": 0.00014630261660978386, + "loss": 5.9288, + "step": 7504 + }, + { + "epoch": 2.561433447098976, + "grad_norm": 2.8037562370300293, + "learning_rate": 0.00014618885096700796, + "loss": 5.0377, + "step": 7505 + }, + { + "epoch": 2.5617747440273035, + "grad_norm": 2.6779751777648926, + "learning_rate": 0.0001460750853242321, + "loss": 6.2956, + "step": 7506 + }, + { + "epoch": 2.5621160409556314, + "grad_norm": 2.6464788913726807, + "learning_rate": 0.0001459613196814562, + "loss": 5.9083, + "step": 7507 + }, + { + "epoch": 2.5624573378839592, + "grad_norm": 2.7481160163879395, + "learning_rate": 0.00014584755403868033, + "loss": 6.2735, + "step": 7508 + }, + { + "epoch": 2.5627986348122866, + "grad_norm": 2.6728515625, + "learning_rate": 0.00014573378839590444, + "loss": 6.0976, + "step": 7509 + }, + { + "epoch": 2.5631399317406145, + "grad_norm": 3.7015953063964844, + "learning_rate": 0.00014562002275312854, + "loss": 3.9438, + "step": 7510 + }, + { + "epoch": 2.563481228668942, + "grad_norm": 4.3790693283081055, + "learning_rate": 0.00014550625711035267, + "loss": 4.7137, + "step": 7511 + }, + { + "epoch": 2.5638225255972698, + "grad_norm": 2.6874852180480957, + "learning_rate": 0.0001453924914675768, + "loss": 6.2475, + "step": 7512 + }, + { + "epoch": 2.564163822525597, + "grad_norm": 2.693321943283081, + "learning_rate": 0.0001452787258248009, + "loss": 6.2983, + "step": 7513 + }, + { + "epoch": 2.564505119453925, + "grad_norm": 2.621973752975464, + "learning_rate": 0.00014516496018202502, + "loss": 6.1115, + "step": 7514 + }, + { + "epoch": 2.5648464163822524, + "grad_norm": 2.658691167831421, + "learning_rate": 0.00014505119453924915, + "loss": 5.8714, + "step": 7515 + }, + { + "epoch": 2.5651877133105803, + "grad_norm": 2.660804271697998, + "learning_rate": 0.00014493742889647328, + "loss": 6.6816, + "step": 7516 + }, + { + "epoch": 2.565529010238908, + "grad_norm": 2.6108481884002686, + "learning_rate": 0.00014482366325369739, + "loss": 5.8989, + "step": 7517 + }, + { + "epoch": 2.5658703071672355, + "grad_norm": 2.703063488006592, + "learning_rate": 0.00014470989761092152, + "loss": 5.9069, + "step": 7518 + }, + { + "epoch": 2.566211604095563, + "grad_norm": 2.910573959350586, + "learning_rate": 0.00014459613196814562, + "loss": 5.1331, + "step": 7519 + }, + { + "epoch": 2.5665529010238908, + "grad_norm": 2.6214263439178467, + "learning_rate": 0.00014448236632536973, + "loss": 6.2886, + "step": 7520 + }, + { + "epoch": 2.5668941979522186, + "grad_norm": 2.604752779006958, + "learning_rate": 0.00014436860068259386, + "loss": 6.0879, + "step": 7521 + }, + { + "epoch": 2.567235494880546, + "grad_norm": 2.679976224899292, + "learning_rate": 0.000144254835039818, + "loss": 6.2263, + "step": 7522 + }, + { + "epoch": 2.567576791808874, + "grad_norm": 2.592036485671997, + "learning_rate": 0.0001441410693970421, + "loss": 6.2702, + "step": 7523 + }, + { + "epoch": 2.5679180887372013, + "grad_norm": 2.691864490509033, + "learning_rate": 0.0001440273037542662, + "loss": 6.3147, + "step": 7524 + }, + { + "epoch": 2.568259385665529, + "grad_norm": 2.749069929122925, + "learning_rate": 0.00014391353811149033, + "loss": 5.9712, + "step": 7525 + }, + { + "epoch": 2.5686006825938565, + "grad_norm": 2.7472119331359863, + "learning_rate": 0.00014379977246871444, + "loss": 6.6555, + "step": 7526 + }, + { + "epoch": 2.5689419795221844, + "grad_norm": 2.7437517642974854, + "learning_rate": 0.00014368600682593857, + "loss": 5.8816, + "step": 7527 + }, + { + "epoch": 2.569283276450512, + "grad_norm": 2.7553818225860596, + "learning_rate": 0.0001435722411831627, + "loss": 5.6382, + "step": 7528 + }, + { + "epoch": 2.5696245733788396, + "grad_norm": 2.6735825538635254, + "learning_rate": 0.0001434584755403868, + "loss": 5.8185, + "step": 7529 + }, + { + "epoch": 2.5699658703071675, + "grad_norm": 2.8182194232940674, + "learning_rate": 0.0001433447098976109, + "loss": 5.0823, + "step": 7530 + }, + { + "epoch": 2.570307167235495, + "grad_norm": 2.7155697345733643, + "learning_rate": 0.00014323094425483504, + "loss": 5.3627, + "step": 7531 + }, + { + "epoch": 2.5706484641638223, + "grad_norm": 2.6774308681488037, + "learning_rate": 0.00014311717861205918, + "loss": 5.8861, + "step": 7532 + }, + { + "epoch": 2.57098976109215, + "grad_norm": 2.782548189163208, + "learning_rate": 0.00014300341296928328, + "loss": 5.6668, + "step": 7533 + }, + { + "epoch": 2.571331058020478, + "grad_norm": 2.7212352752685547, + "learning_rate": 0.00014288964732650739, + "loss": 6.0719, + "step": 7534 + }, + { + "epoch": 2.5716723549488054, + "grad_norm": 2.695491313934326, + "learning_rate": 0.00014277588168373152, + "loss": 6.0384, + "step": 7535 + }, + { + "epoch": 2.5720136518771333, + "grad_norm": 2.6470556259155273, + "learning_rate": 0.00014266211604095562, + "loss": 5.7107, + "step": 7536 + }, + { + "epoch": 2.5723549488054607, + "grad_norm": 2.6767444610595703, + "learning_rate": 0.00014254835039817976, + "loss": 5.8553, + "step": 7537 + }, + { + "epoch": 2.5726962457337885, + "grad_norm": 2.643503189086914, + "learning_rate": 0.0001424345847554039, + "loss": 6.1327, + "step": 7538 + }, + { + "epoch": 2.573037542662116, + "grad_norm": 2.6985726356506348, + "learning_rate": 0.000142320819112628, + "loss": 5.551, + "step": 7539 + }, + { + "epoch": 2.573378839590444, + "grad_norm": 2.657045364379883, + "learning_rate": 0.0001422070534698521, + "loss": 5.9295, + "step": 7540 + }, + { + "epoch": 2.573720136518771, + "grad_norm": 2.6590969562530518, + "learning_rate": 0.00014209328782707623, + "loss": 5.922, + "step": 7541 + }, + { + "epoch": 2.574061433447099, + "grad_norm": 2.6547012329101562, + "learning_rate": 0.00014197952218430036, + "loss": 6.1562, + "step": 7542 + }, + { + "epoch": 2.574402730375427, + "grad_norm": 2.6703603267669678, + "learning_rate": 0.00014186575654152444, + "loss": 5.9321, + "step": 7543 + }, + { + "epoch": 2.5747440273037543, + "grad_norm": 2.9982120990753174, + "learning_rate": 0.00014175199089874857, + "loss": 5.0249, + "step": 7544 + }, + { + "epoch": 2.5750853242320817, + "grad_norm": 2.565819263458252, + "learning_rate": 0.0001416382252559727, + "loss": 5.3644, + "step": 7545 + }, + { + "epoch": 2.5754266211604095, + "grad_norm": 2.6522724628448486, + "learning_rate": 0.0001415244596131968, + "loss": 6.1545, + "step": 7546 + }, + { + "epoch": 2.5757679180887374, + "grad_norm": 2.639831781387329, + "learning_rate": 0.00014141069397042094, + "loss": 5.8925, + "step": 7547 + }, + { + "epoch": 2.576109215017065, + "grad_norm": 2.7260892391204834, + "learning_rate": 0.00014129692832764505, + "loss": 6.6641, + "step": 7548 + }, + { + "epoch": 2.5764505119453927, + "grad_norm": 2.7644131183624268, + "learning_rate": 0.00014118316268486918, + "loss": 5.8881, + "step": 7549 + }, + { + "epoch": 2.57679180887372, + "grad_norm": 2.671483039855957, + "learning_rate": 0.00014106939704209328, + "loss": 6.047, + "step": 7550 + }, + { + "epoch": 2.577133105802048, + "grad_norm": 2.5989768505096436, + "learning_rate": 0.00014095563139931741, + "loss": 6.5126, + "step": 7551 + }, + { + "epoch": 2.5774744027303753, + "grad_norm": 2.6520800590515137, + "learning_rate": 0.00014084186575654155, + "loss": 5.6839, + "step": 7552 + }, + { + "epoch": 2.577815699658703, + "grad_norm": 2.7389848232269287, + "learning_rate": 0.00014072810011376562, + "loss": 6.3082, + "step": 7553 + }, + { + "epoch": 2.5781569965870306, + "grad_norm": 2.6571834087371826, + "learning_rate": 0.00014061433447098976, + "loss": 2.0284, + "step": 7554 + }, + { + "epoch": 2.5784982935153584, + "grad_norm": 2.616753339767456, + "learning_rate": 0.0001405005688282139, + "loss": 5.1408, + "step": 7555 + }, + { + "epoch": 2.5788395904436863, + "grad_norm": 2.6577699184417725, + "learning_rate": 0.000140386803185438, + "loss": 6.3211, + "step": 7556 + }, + { + "epoch": 2.5791808873720137, + "grad_norm": 2.816687822341919, + "learning_rate": 0.00014027303754266213, + "loss": 6.3943, + "step": 7557 + }, + { + "epoch": 2.579522184300341, + "grad_norm": 2.7218778133392334, + "learning_rate": 0.00014015927189988623, + "loss": 6.1001, + "step": 7558 + }, + { + "epoch": 2.579863481228669, + "grad_norm": 2.7211270332336426, + "learning_rate": 0.00014004550625711036, + "loss": 5.817, + "step": 7559 + }, + { + "epoch": 2.580204778156997, + "grad_norm": 2.6363632678985596, + "learning_rate": 0.00013993174061433447, + "loss": 6.3371, + "step": 7560 + }, + { + "epoch": 2.580546075085324, + "grad_norm": 2.642151355743408, + "learning_rate": 0.0001398179749715586, + "loss": 6.1385, + "step": 7561 + }, + { + "epoch": 2.580887372013652, + "grad_norm": 3.0516741275787354, + "learning_rate": 0.00013970420932878273, + "loss": 5.0531, + "step": 7562 + }, + { + "epoch": 2.5812286689419794, + "grad_norm": 2.72440505027771, + "learning_rate": 0.0001395904436860068, + "loss": 6.3252, + "step": 7563 + }, + { + "epoch": 2.5815699658703073, + "grad_norm": 2.5998916625976562, + "learning_rate": 0.00013947667804323094, + "loss": 5.7731, + "step": 7564 + }, + { + "epoch": 2.5819112627986347, + "grad_norm": 2.963616371154785, + "learning_rate": 0.00013936291240045507, + "loss": 4.7545, + "step": 7565 + }, + { + "epoch": 2.5822525597269625, + "grad_norm": 2.8518779277801514, + "learning_rate": 0.00013924914675767918, + "loss": 5.6993, + "step": 7566 + }, + { + "epoch": 2.58259385665529, + "grad_norm": 2.6663856506347656, + "learning_rate": 0.0001391353811149033, + "loss": 4.3027, + "step": 7567 + }, + { + "epoch": 2.582935153583618, + "grad_norm": 2.5740647315979004, + "learning_rate": 0.00013902161547212741, + "loss": 6.0794, + "step": 7568 + }, + { + "epoch": 2.5832764505119457, + "grad_norm": 2.7939860820770264, + "learning_rate": 0.00013890784982935155, + "loss": 5.3234, + "step": 7569 + }, + { + "epoch": 2.583617747440273, + "grad_norm": 2.7503066062927246, + "learning_rate": 0.00013879408418657565, + "loss": 6.3546, + "step": 7570 + }, + { + "epoch": 2.5839590443686005, + "grad_norm": 2.6611952781677246, + "learning_rate": 0.00013868031854379978, + "loss": 5.9687, + "step": 7571 + }, + { + "epoch": 2.5843003412969283, + "grad_norm": 2.741959810256958, + "learning_rate": 0.00013856655290102392, + "loss": 5.5272, + "step": 7572 + }, + { + "epoch": 2.584641638225256, + "grad_norm": 2.7685506343841553, + "learning_rate": 0.000138452787258248, + "loss": 5.9441, + "step": 7573 + }, + { + "epoch": 2.5849829351535836, + "grad_norm": 2.6498281955718994, + "learning_rate": 0.00013833902161547213, + "loss": 5.6183, + "step": 7574 + }, + { + "epoch": 2.5853242320819114, + "grad_norm": 2.6913111209869385, + "learning_rate": 0.00013822525597269626, + "loss": 6.2333, + "step": 7575 + }, + { + "epoch": 2.585665529010239, + "grad_norm": 2.6091015338897705, + "learning_rate": 0.00013811149032992036, + "loss": 6.4434, + "step": 7576 + }, + { + "epoch": 2.5860068259385667, + "grad_norm": 2.665632486343384, + "learning_rate": 0.00013799772468714447, + "loss": 6.0747, + "step": 7577 + }, + { + "epoch": 2.586348122866894, + "grad_norm": 2.7666921615600586, + "learning_rate": 0.0001378839590443686, + "loss": 6.2239, + "step": 7578 + }, + { + "epoch": 2.586689419795222, + "grad_norm": 2.6611504554748535, + "learning_rate": 0.00013777019340159273, + "loss": 6.6124, + "step": 7579 + }, + { + "epoch": 2.5870307167235493, + "grad_norm": 2.631537437438965, + "learning_rate": 0.00013765642775881684, + "loss": 6.2377, + "step": 7580 + }, + { + "epoch": 2.587372013651877, + "grad_norm": 2.7600057125091553, + "learning_rate": 0.00013754266211604097, + "loss": 5.9184, + "step": 7581 + }, + { + "epoch": 2.587713310580205, + "grad_norm": 2.6380791664123535, + "learning_rate": 0.00013742889647326507, + "loss": 6.3217, + "step": 7582 + }, + { + "epoch": 2.5880546075085324, + "grad_norm": 2.6468701362609863, + "learning_rate": 0.00013731513083048918, + "loss": 5.9776, + "step": 7583 + }, + { + "epoch": 2.58839590443686, + "grad_norm": 2.6581053733825684, + "learning_rate": 0.0001372013651877133, + "loss": 6.3691, + "step": 7584 + }, + { + "epoch": 2.5887372013651877, + "grad_norm": 2.5985143184661865, + "learning_rate": 0.00013708759954493744, + "loss": 6.0739, + "step": 7585 + }, + { + "epoch": 2.5890784982935156, + "grad_norm": 2.678396701812744, + "learning_rate": 0.00013697383390216155, + "loss": 6.2776, + "step": 7586 + }, + { + "epoch": 2.589419795221843, + "grad_norm": 2.7191009521484375, + "learning_rate": 0.00013686006825938565, + "loss": 6.1219, + "step": 7587 + }, + { + "epoch": 2.589761092150171, + "grad_norm": 2.738563299179077, + "learning_rate": 0.00013674630261660978, + "loss": 6.3903, + "step": 7588 + }, + { + "epoch": 2.590102389078498, + "grad_norm": 2.5974106788635254, + "learning_rate": 0.00013663253697383392, + "loss": 6.0863, + "step": 7589 + }, + { + "epoch": 2.590443686006826, + "grad_norm": 2.727854013442993, + "learning_rate": 0.00013651877133105802, + "loss": 5.466, + "step": 7590 + }, + { + "epoch": 2.5907849829351535, + "grad_norm": 2.6561155319213867, + "learning_rate": 0.00013640500568828215, + "loss": 5.8661, + "step": 7591 + }, + { + "epoch": 2.5911262798634813, + "grad_norm": 2.671494483947754, + "learning_rate": 0.00013629124004550626, + "loss": 5.6121, + "step": 7592 + }, + { + "epoch": 2.5914675767918087, + "grad_norm": 2.713571786880493, + "learning_rate": 0.00013617747440273036, + "loss": 6.0892, + "step": 7593 + }, + { + "epoch": 2.5918088737201366, + "grad_norm": 2.6231658458709717, + "learning_rate": 0.0001360637087599545, + "loss": 5.5849, + "step": 7594 + }, + { + "epoch": 2.5921501706484644, + "grad_norm": 4.061588287353516, + "learning_rate": 0.00013594994311717863, + "loss": 4.2174, + "step": 7595 + }, + { + "epoch": 2.592491467576792, + "grad_norm": 2.6725544929504395, + "learning_rate": 0.00013583617747440273, + "loss": 5.5262, + "step": 7596 + }, + { + "epoch": 2.5928327645051192, + "grad_norm": 3.150416612625122, + "learning_rate": 0.00013572241183162684, + "loss": 4.9608, + "step": 7597 + }, + { + "epoch": 2.593174061433447, + "grad_norm": 2.6914148330688477, + "learning_rate": 0.00013560864618885097, + "loss": 6.0067, + "step": 7598 + }, + { + "epoch": 2.593515358361775, + "grad_norm": 2.676713705062866, + "learning_rate": 0.0001354948805460751, + "loss": 6.4768, + "step": 7599 + }, + { + "epoch": 2.5938566552901023, + "grad_norm": 2.7440037727355957, + "learning_rate": 0.0001353811149032992, + "loss": 5.4661, + "step": 7600 + }, + { + "epoch": 2.59419795221843, + "grad_norm": 2.6399929523468018, + "learning_rate": 0.00013526734926052334, + "loss": 6.3171, + "step": 7601 + }, + { + "epoch": 2.5945392491467576, + "grad_norm": 2.746882438659668, + "learning_rate": 0.00013515358361774744, + "loss": 6.0754, + "step": 7602 + }, + { + "epoch": 2.5948805460750854, + "grad_norm": 2.638324022293091, + "learning_rate": 0.00013503981797497155, + "loss": 5.8914, + "step": 7603 + }, + { + "epoch": 2.595221843003413, + "grad_norm": 2.618677854537964, + "learning_rate": 0.00013492605233219568, + "loss": 6.0271, + "step": 7604 + }, + { + "epoch": 2.5955631399317407, + "grad_norm": 2.606271743774414, + "learning_rate": 0.0001348122866894198, + "loss": 4.9815, + "step": 7605 + }, + { + "epoch": 2.595904436860068, + "grad_norm": 3.786407470703125, + "learning_rate": 0.00013469852104664392, + "loss": 4.8562, + "step": 7606 + }, + { + "epoch": 2.596245733788396, + "grad_norm": 3.0423500537872314, + "learning_rate": 0.00013458475540386802, + "loss": 4.8359, + "step": 7607 + }, + { + "epoch": 2.596587030716724, + "grad_norm": 3.009005308151245, + "learning_rate": 0.00013447098976109215, + "loss": 4.6837, + "step": 7608 + }, + { + "epoch": 2.596928327645051, + "grad_norm": 2.638024091720581, + "learning_rate": 0.0001343572241183163, + "loss": 5.9415, + "step": 7609 + }, + { + "epoch": 2.5972696245733786, + "grad_norm": 2.710456371307373, + "learning_rate": 0.0001342434584755404, + "loss": 5.729, + "step": 7610 + }, + { + "epoch": 2.5976109215017065, + "grad_norm": 2.8217124938964844, + "learning_rate": 0.00013412969283276452, + "loss": 5.2043, + "step": 7611 + }, + { + "epoch": 2.5979522184300343, + "grad_norm": 2.6994223594665527, + "learning_rate": 0.00013401592718998863, + "loss": 5.7662, + "step": 7612 + }, + { + "epoch": 2.5982935153583617, + "grad_norm": 2.637618064880371, + "learning_rate": 0.00013390216154721273, + "loss": 5.9881, + "step": 7613 + }, + { + "epoch": 2.5986348122866896, + "grad_norm": 2.62662935256958, + "learning_rate": 0.00013378839590443687, + "loss": 5.9036, + "step": 7614 + }, + { + "epoch": 2.598976109215017, + "grad_norm": 2.6811110973358154, + "learning_rate": 0.000133674630261661, + "loss": 5.7672, + "step": 7615 + }, + { + "epoch": 2.599317406143345, + "grad_norm": 2.62439227104187, + "learning_rate": 0.00013356086461888508, + "loss": 6.594, + "step": 7616 + }, + { + "epoch": 2.5996587030716722, + "grad_norm": 2.640967607498169, + "learning_rate": 0.0001334470989761092, + "loss": 6.0876, + "step": 7617 + }, + { + "epoch": 2.6, + "grad_norm": 2.6701509952545166, + "learning_rate": 0.00013333333333333334, + "loss": 6.2532, + "step": 7618 + }, + { + "epoch": 2.6003412969283275, + "grad_norm": 2.628978967666626, + "learning_rate": 0.00013321956769055747, + "loss": 6.0009, + "step": 7619 + }, + { + "epoch": 2.6006825938566553, + "grad_norm": 3.894216775894165, + "learning_rate": 0.00013310580204778158, + "loss": 5.3864, + "step": 7620 + }, + { + "epoch": 2.601023890784983, + "grad_norm": 2.637104034423828, + "learning_rate": 0.00013299203640500568, + "loss": 6.1776, + "step": 7621 + }, + { + "epoch": 2.6013651877133106, + "grad_norm": 2.7615833282470703, + "learning_rate": 0.00013287827076222981, + "loss": 6.4661, + "step": 7622 + }, + { + "epoch": 2.601706484641638, + "grad_norm": 3.8650755882263184, + "learning_rate": 0.00013276450511945392, + "loss": 5.7443, + "step": 7623 + }, + { + "epoch": 2.602047781569966, + "grad_norm": 2.71655011177063, + "learning_rate": 0.00013265073947667805, + "loss": 6.3148, + "step": 7624 + }, + { + "epoch": 2.6023890784982937, + "grad_norm": 2.681018114089966, + "learning_rate": 0.00013253697383390218, + "loss": 6.3714, + "step": 7625 + }, + { + "epoch": 2.602730375426621, + "grad_norm": 2.730727195739746, + "learning_rate": 0.00013242320819112626, + "loss": 6.1858, + "step": 7626 + }, + { + "epoch": 2.603071672354949, + "grad_norm": 2.642637014389038, + "learning_rate": 0.0001323094425483504, + "loss": 5.5325, + "step": 7627 + }, + { + "epoch": 2.6034129692832764, + "grad_norm": 2.673652172088623, + "learning_rate": 0.00013219567690557452, + "loss": 6.3912, + "step": 7628 + }, + { + "epoch": 2.603754266211604, + "grad_norm": 2.6778078079223633, + "learning_rate": 0.00013208191126279863, + "loss": 6.2991, + "step": 7629 + }, + { + "epoch": 2.6040955631399316, + "grad_norm": 2.6532318592071533, + "learning_rate": 0.00013196814562002276, + "loss": 5.6819, + "step": 7630 + }, + { + "epoch": 2.6044368600682595, + "grad_norm": 3.36240291595459, + "learning_rate": 0.00013185437997724687, + "loss": 4.7713, + "step": 7631 + }, + { + "epoch": 2.604778156996587, + "grad_norm": 2.656712532043457, + "learning_rate": 0.000131740614334471, + "loss": 5.4265, + "step": 7632 + }, + { + "epoch": 2.6051194539249147, + "grad_norm": 2.6337647438049316, + "learning_rate": 0.0001316268486916951, + "loss": 6.0607, + "step": 7633 + }, + { + "epoch": 2.6054607508532426, + "grad_norm": 2.639120578765869, + "learning_rate": 0.00013151308304891924, + "loss": 5.8628, + "step": 7634 + }, + { + "epoch": 2.60580204778157, + "grad_norm": 2.6694211959838867, + "learning_rate": 0.00013139931740614337, + "loss": 6.3282, + "step": 7635 + }, + { + "epoch": 2.6061433447098974, + "grad_norm": 2.556114435195923, + "learning_rate": 0.00013128555176336745, + "loss": 5.6204, + "step": 7636 + }, + { + "epoch": 2.6064846416382252, + "grad_norm": 2.6191563606262207, + "learning_rate": 0.00013117178612059158, + "loss": 4.6587, + "step": 7637 + }, + { + "epoch": 2.606825938566553, + "grad_norm": 2.7384796142578125, + "learning_rate": 0.0001310580204778157, + "loss": 5.1479, + "step": 7638 + }, + { + "epoch": 2.6071672354948805, + "grad_norm": 2.6779263019561768, + "learning_rate": 0.00013094425483503981, + "loss": 5.5593, + "step": 7639 + }, + { + "epoch": 2.6075085324232083, + "grad_norm": 2.643181324005127, + "learning_rate": 0.00013083048919226395, + "loss": 6.0678, + "step": 7640 + }, + { + "epoch": 2.6078498293515358, + "grad_norm": 2.6609034538269043, + "learning_rate": 0.00013071672354948805, + "loss": 5.7646, + "step": 7641 + }, + { + "epoch": 2.6081911262798636, + "grad_norm": 2.6710946559906006, + "learning_rate": 0.00013060295790671218, + "loss": 5.7591, + "step": 7642 + }, + { + "epoch": 2.608532423208191, + "grad_norm": 2.7074642181396484, + "learning_rate": 0.0001304891922639363, + "loss": 5.8659, + "step": 7643 + }, + { + "epoch": 2.608873720136519, + "grad_norm": 2.806108236312866, + "learning_rate": 0.00013037542662116042, + "loss": 5.3068, + "step": 7644 + }, + { + "epoch": 2.6092150170648463, + "grad_norm": 2.8876841068267822, + "learning_rate": 0.00013026166097838455, + "loss": 4.9614, + "step": 7645 + }, + { + "epoch": 2.609556313993174, + "grad_norm": 2.6452441215515137, + "learning_rate": 0.00013014789533560863, + "loss": 4.6235, + "step": 7646 + }, + { + "epoch": 2.609897610921502, + "grad_norm": 2.64217209815979, + "learning_rate": 0.00013003412969283276, + "loss": 5.6776, + "step": 7647 + }, + { + "epoch": 2.6102389078498294, + "grad_norm": 2.6999876499176025, + "learning_rate": 0.0001299203640500569, + "loss": 5.8503, + "step": 7648 + }, + { + "epoch": 2.6105802047781568, + "grad_norm": 2.6543946266174316, + "learning_rate": 0.000129806598407281, + "loss": 6.0889, + "step": 7649 + }, + { + "epoch": 2.6109215017064846, + "grad_norm": 2.6520276069641113, + "learning_rate": 0.0001296928327645051, + "loss": 5.8006, + "step": 7650 + }, + { + "epoch": 2.6112627986348125, + "grad_norm": 2.520744562149048, + "learning_rate": 0.00012957906712172924, + "loss": 6.255, + "step": 7651 + }, + { + "epoch": 2.61160409556314, + "grad_norm": 2.6043193340301514, + "learning_rate": 0.00012946530147895337, + "loss": 5.8413, + "step": 7652 + }, + { + "epoch": 2.6119453924914677, + "grad_norm": 2.659938335418701, + "learning_rate": 0.00012935153583617747, + "loss": 5.6511, + "step": 7653 + }, + { + "epoch": 2.612286689419795, + "grad_norm": 2.6185102462768555, + "learning_rate": 0.0001292377701934016, + "loss": 6.55, + "step": 7654 + }, + { + "epoch": 2.612627986348123, + "grad_norm": 2.686222553253174, + "learning_rate": 0.0001291240045506257, + "loss": 5.7808, + "step": 7655 + }, + { + "epoch": 2.6129692832764504, + "grad_norm": 2.712735414505005, + "learning_rate": 0.00012901023890784982, + "loss": 5.6224, + "step": 7656 + }, + { + "epoch": 2.6133105802047782, + "grad_norm": 2.8246679306030273, + "learning_rate": 0.00012889647326507395, + "loss": 4.6806, + "step": 7657 + }, + { + "epoch": 2.6136518771331056, + "grad_norm": 2.6839938163757324, + "learning_rate": 0.00012878270762229808, + "loss": 5.9373, + "step": 7658 + }, + { + "epoch": 2.6139931740614335, + "grad_norm": 2.625178575515747, + "learning_rate": 0.00012866894197952218, + "loss": 6.0065, + "step": 7659 + }, + { + "epoch": 2.6143344709897613, + "grad_norm": 2.6909379959106445, + "learning_rate": 0.0001285551763367463, + "loss": 6.3687, + "step": 7660 + }, + { + "epoch": 2.6146757679180888, + "grad_norm": 2.5886874198913574, + "learning_rate": 0.00012844141069397042, + "loss": 5.5438, + "step": 7661 + }, + { + "epoch": 2.615017064846416, + "grad_norm": 2.6835293769836426, + "learning_rate": 0.00012832764505119455, + "loss": 6.2562, + "step": 7662 + }, + { + "epoch": 2.615358361774744, + "grad_norm": 2.7177085876464844, + "learning_rate": 0.00012821387940841866, + "loss": 6.1722, + "step": 7663 + }, + { + "epoch": 2.615699658703072, + "grad_norm": 2.651545763015747, + "learning_rate": 0.0001281001137656428, + "loss": 6.2457, + "step": 7664 + }, + { + "epoch": 2.6160409556313993, + "grad_norm": 2.6499643325805664, + "learning_rate": 0.0001279863481228669, + "loss": 5.9292, + "step": 7665 + }, + { + "epoch": 2.616382252559727, + "grad_norm": 2.6566162109375, + "learning_rate": 0.000127872582480091, + "loss": 5.9867, + "step": 7666 + }, + { + "epoch": 2.6167235494880545, + "grad_norm": 2.6786088943481445, + "learning_rate": 0.00012775881683731513, + "loss": 6.447, + "step": 7667 + }, + { + "epoch": 2.6170648464163824, + "grad_norm": 2.659654140472412, + "learning_rate": 0.00012764505119453926, + "loss": 5.8441, + "step": 7668 + }, + { + "epoch": 2.61740614334471, + "grad_norm": 2.6184911727905273, + "learning_rate": 0.00012753128555176337, + "loss": 5.8193, + "step": 7669 + }, + { + "epoch": 2.6177474402730376, + "grad_norm": 2.677710771560669, + "learning_rate": 0.00012741751990898747, + "loss": 6.0518, + "step": 7670 + }, + { + "epoch": 2.618088737201365, + "grad_norm": 2.705430030822754, + "learning_rate": 0.0001273037542662116, + "loss": 6.224, + "step": 7671 + }, + { + "epoch": 2.618430034129693, + "grad_norm": 2.7365448474884033, + "learning_rate": 0.00012718998862343574, + "loss": 6.3756, + "step": 7672 + }, + { + "epoch": 2.6187713310580207, + "grad_norm": 2.6876437664031982, + "learning_rate": 0.00012707622298065984, + "loss": 6.07, + "step": 7673 + }, + { + "epoch": 2.619112627986348, + "grad_norm": 2.6189560890197754, + "learning_rate": 0.00012696245733788398, + "loss": 6.1245, + "step": 7674 + }, + { + "epoch": 2.6194539249146755, + "grad_norm": 3.7583062648773193, + "learning_rate": 0.00012684869169510808, + "loss": 5.6039, + "step": 7675 + }, + { + "epoch": 2.6197952218430034, + "grad_norm": 2.7220373153686523, + "learning_rate": 0.00012673492605233219, + "loss": 6.0232, + "step": 7676 + }, + { + "epoch": 2.6201365187713312, + "grad_norm": 2.691070556640625, + "learning_rate": 0.00012662116040955632, + "loss": 6.2148, + "step": 7677 + }, + { + "epoch": 2.6204778156996587, + "grad_norm": 2.6679129600524902, + "learning_rate": 0.00012650739476678045, + "loss": 6.1872, + "step": 7678 + }, + { + "epoch": 2.6208191126279865, + "grad_norm": 2.7614474296569824, + "learning_rate": 0.00012639362912400455, + "loss": 5.669, + "step": 7679 + }, + { + "epoch": 2.621160409556314, + "grad_norm": 2.7429089546203613, + "learning_rate": 0.00012627986348122866, + "loss": 5.5727, + "step": 7680 + }, + { + "epoch": 2.6215017064846418, + "grad_norm": 2.684476613998413, + "learning_rate": 0.0001261660978384528, + "loss": 5.7475, + "step": 7681 + }, + { + "epoch": 2.621843003412969, + "grad_norm": 2.627525806427002, + "learning_rate": 0.00012605233219567692, + "loss": 6.2481, + "step": 7682 + }, + { + "epoch": 2.622184300341297, + "grad_norm": 2.691311836242676, + "learning_rate": 0.00012593856655290103, + "loss": 6.0962, + "step": 7683 + }, + { + "epoch": 2.6225255972696244, + "grad_norm": 2.6626622676849365, + "learning_rate": 0.00012582480091012513, + "loss": 6.2863, + "step": 7684 + }, + { + "epoch": 2.6228668941979523, + "grad_norm": 2.6201605796813965, + "learning_rate": 0.00012571103526734927, + "loss": 5.6584, + "step": 7685 + }, + { + "epoch": 2.62320819112628, + "grad_norm": 2.704043388366699, + "learning_rate": 0.00012559726962457337, + "loss": 6.1213, + "step": 7686 + }, + { + "epoch": 2.6235494880546075, + "grad_norm": 2.5595293045043945, + "learning_rate": 0.0001254835039817975, + "loss": 6.3741, + "step": 7687 + }, + { + "epoch": 2.623890784982935, + "grad_norm": 5.192468643188477, + "learning_rate": 0.00012536973833902163, + "loss": 4.9172, + "step": 7688 + }, + { + "epoch": 2.624232081911263, + "grad_norm": 2.794823169708252, + "learning_rate": 0.0001252559726962457, + "loss": 5.9293, + "step": 7689 + }, + { + "epoch": 2.6245733788395906, + "grad_norm": 2.6778604984283447, + "learning_rate": 0.00012514220705346984, + "loss": 6.2208, + "step": 7690 + }, + { + "epoch": 2.624914675767918, + "grad_norm": 2.6402392387390137, + "learning_rate": 0.00012502844141069398, + "loss": 5.5919, + "step": 7691 + }, + { + "epoch": 2.625255972696246, + "grad_norm": 2.6135995388031006, + "learning_rate": 0.0001249146757679181, + "loss": 6.367, + "step": 7692 + }, + { + "epoch": 2.6255972696245733, + "grad_norm": 2.745082378387451, + "learning_rate": 0.0001248009101251422, + "loss": 5.8043, + "step": 7693 + }, + { + "epoch": 2.625938566552901, + "grad_norm": 2.596097946166992, + "learning_rate": 0.00012468714448236632, + "loss": 5.4991, + "step": 7694 + }, + { + "epoch": 2.6262798634812285, + "grad_norm": 2.6917331218719482, + "learning_rate": 0.00012457337883959045, + "loss": 6.1673, + "step": 7695 + }, + { + "epoch": 2.6266211604095564, + "grad_norm": 2.624852180480957, + "learning_rate": 0.00012445961319681456, + "loss": 6.381, + "step": 7696 + }, + { + "epoch": 2.626962457337884, + "grad_norm": 2.734668493270874, + "learning_rate": 0.0001243458475540387, + "loss": 5.8872, + "step": 7697 + }, + { + "epoch": 2.6273037542662117, + "grad_norm": 2.7033309936523438, + "learning_rate": 0.0001242320819112628, + "loss": 5.7082, + "step": 7698 + }, + { + "epoch": 2.6276450511945395, + "grad_norm": 2.640166759490967, + "learning_rate": 0.00012411831626848692, + "loss": 5.962, + "step": 7699 + }, + { + "epoch": 2.627986348122867, + "grad_norm": 2.584916591644287, + "learning_rate": 0.00012400455062571103, + "loss": 6.1359, + "step": 7700 + }, + { + "epoch": 2.6283276450511943, + "grad_norm": 2.798306465148926, + "learning_rate": 0.00012389078498293516, + "loss": 6.2166, + "step": 7701 + }, + { + "epoch": 2.628668941979522, + "grad_norm": 2.539496660232544, + "learning_rate": 0.0001237770193401593, + "loss": 5.7527, + "step": 7702 + }, + { + "epoch": 2.62901023890785, + "grad_norm": 2.606154680252075, + "learning_rate": 0.0001236632536973834, + "loss": 6.1712, + "step": 7703 + }, + { + "epoch": 2.6293515358361774, + "grad_norm": 4.360844612121582, + "learning_rate": 0.0001235494880546075, + "loss": 5.0891, + "step": 7704 + }, + { + "epoch": 2.6296928327645053, + "grad_norm": 2.640214443206787, + "learning_rate": 0.00012343572241183164, + "loss": 5.8549, + "step": 7705 + }, + { + "epoch": 2.6300341296928327, + "grad_norm": 2.8788836002349854, + "learning_rate": 0.00012332195676905574, + "loss": 5.118, + "step": 7706 + }, + { + "epoch": 2.6303754266211605, + "grad_norm": 2.637471914291382, + "learning_rate": 0.00012320819112627987, + "loss": 6.1626, + "step": 7707 + }, + { + "epoch": 2.630716723549488, + "grad_norm": 2.6709108352661133, + "learning_rate": 0.00012309442548350398, + "loss": 5.7018, + "step": 7708 + }, + { + "epoch": 2.631058020477816, + "grad_norm": 2.575793504714966, + "learning_rate": 0.0001229806598407281, + "loss": 5.8412, + "step": 7709 + }, + { + "epoch": 2.631399317406143, + "grad_norm": 2.7106573581695557, + "learning_rate": 0.00012286689419795221, + "loss": 5.151, + "step": 7710 + }, + { + "epoch": 2.631740614334471, + "grad_norm": 2.6999032497406006, + "learning_rate": 0.00012275312855517635, + "loss": 6.1057, + "step": 7711 + }, + { + "epoch": 2.632081911262799, + "grad_norm": 2.700033664703369, + "learning_rate": 0.00012263936291240045, + "loss": 6.1486, + "step": 7712 + }, + { + "epoch": 2.6324232081911263, + "grad_norm": 2.705191135406494, + "learning_rate": 0.00012252559726962458, + "loss": 5.427, + "step": 7713 + }, + { + "epoch": 2.6327645051194537, + "grad_norm": 2.5965819358825684, + "learning_rate": 0.0001224118316268487, + "loss": 6.0963, + "step": 7714 + }, + { + "epoch": 2.6331058020477816, + "grad_norm": 2.7236809730529785, + "learning_rate": 0.00012229806598407282, + "loss": 5.8642, + "step": 7715 + }, + { + "epoch": 2.6334470989761094, + "grad_norm": 2.6513259410858154, + "learning_rate": 0.00012218430034129693, + "loss": 6.547, + "step": 7716 + }, + { + "epoch": 2.633788395904437, + "grad_norm": 2.6121363639831543, + "learning_rate": 0.00012207053469852103, + "loss": 5.9717, + "step": 7717 + }, + { + "epoch": 2.6341296928327647, + "grad_norm": 2.756356716156006, + "learning_rate": 0.00012195676905574516, + "loss": 6.0155, + "step": 7718 + }, + { + "epoch": 2.634470989761092, + "grad_norm": 2.6240437030792236, + "learning_rate": 0.0001218430034129693, + "loss": 5.6416, + "step": 7719 + }, + { + "epoch": 2.63481228668942, + "grad_norm": 2.6901957988739014, + "learning_rate": 0.0001217292377701934, + "loss": 6.0908, + "step": 7720 + }, + { + "epoch": 2.6351535836177473, + "grad_norm": 2.5926854610443115, + "learning_rate": 0.00012161547212741753, + "loss": 5.8315, + "step": 7721 + }, + { + "epoch": 2.635494880546075, + "grad_norm": 2.6455578804016113, + "learning_rate": 0.00012150170648464164, + "loss": 6.1936, + "step": 7722 + }, + { + "epoch": 2.6358361774744026, + "grad_norm": 2.6685874462127686, + "learning_rate": 0.00012138794084186575, + "loss": 6.2457, + "step": 7723 + }, + { + "epoch": 2.6361774744027304, + "grad_norm": 2.5576179027557373, + "learning_rate": 0.00012127417519908989, + "loss": 6.1212, + "step": 7724 + }, + { + "epoch": 2.6365187713310583, + "grad_norm": 2.6035420894622803, + "learning_rate": 0.00012116040955631399, + "loss": 6.173, + "step": 7725 + }, + { + "epoch": 2.6368600682593857, + "grad_norm": 2.7565255165100098, + "learning_rate": 0.00012104664391353812, + "loss": 5.909, + "step": 7726 + }, + { + "epoch": 2.637201365187713, + "grad_norm": 2.3989455699920654, + "learning_rate": 0.00012093287827076223, + "loss": 5.1077, + "step": 7727 + }, + { + "epoch": 2.637542662116041, + "grad_norm": 2.7297885417938232, + "learning_rate": 0.00012081911262798635, + "loss": 5.1734, + "step": 7728 + }, + { + "epoch": 2.637883959044369, + "grad_norm": 2.634373664855957, + "learning_rate": 0.00012070534698521047, + "loss": 5.418, + "step": 7729 + }, + { + "epoch": 2.638225255972696, + "grad_norm": 2.557570457458496, + "learning_rate": 0.00012059158134243458, + "loss": 6.1528, + "step": 7730 + }, + { + "epoch": 2.638566552901024, + "grad_norm": 2.647575855255127, + "learning_rate": 0.00012047781569965872, + "loss": 6.0692, + "step": 7731 + }, + { + "epoch": 2.6389078498293514, + "grad_norm": 2.5773942470550537, + "learning_rate": 0.00012036405005688282, + "loss": 5.6637, + "step": 7732 + }, + { + "epoch": 2.6392491467576793, + "grad_norm": 2.692434549331665, + "learning_rate": 0.00012025028441410694, + "loss": 6.029, + "step": 7733 + }, + { + "epoch": 2.6395904436860067, + "grad_norm": 2.6368980407714844, + "learning_rate": 0.00012013651877133106, + "loss": 6.3336, + "step": 7734 + }, + { + "epoch": 2.6399317406143346, + "grad_norm": 2.6980204582214355, + "learning_rate": 0.00012002275312855518, + "loss": 6.1817, + "step": 7735 + }, + { + "epoch": 2.640273037542662, + "grad_norm": 2.6697869300842285, + "learning_rate": 0.00011990898748577931, + "loss": 6.2257, + "step": 7736 + }, + { + "epoch": 2.64061433447099, + "grad_norm": 2.5750112533569336, + "learning_rate": 0.00011979522184300341, + "loss": 5.8854, + "step": 7737 + }, + { + "epoch": 2.6409556313993177, + "grad_norm": 2.6087377071380615, + "learning_rate": 0.00011968145620022753, + "loss": 5.9283, + "step": 7738 + }, + { + "epoch": 2.641296928327645, + "grad_norm": 2.607609272003174, + "learning_rate": 0.00011956769055745165, + "loss": 6.1513, + "step": 7739 + }, + { + "epoch": 2.6416382252559725, + "grad_norm": 2.6486034393310547, + "learning_rate": 0.00011945392491467577, + "loss": 5.6541, + "step": 7740 + }, + { + "epoch": 2.6419795221843003, + "grad_norm": 2.6967036724090576, + "learning_rate": 0.0001193401592718999, + "loss": 6.3013, + "step": 7741 + }, + { + "epoch": 2.642320819112628, + "grad_norm": 2.7555899620056152, + "learning_rate": 0.000119226393629124, + "loss": 5.9503, + "step": 7742 + }, + { + "epoch": 2.6426621160409556, + "grad_norm": 2.6998205184936523, + "learning_rate": 0.00011911262798634812, + "loss": 5.8985, + "step": 7743 + }, + { + "epoch": 2.6430034129692834, + "grad_norm": 2.662689208984375, + "learning_rate": 0.00011899886234357224, + "loss": 6.3067, + "step": 7744 + }, + { + "epoch": 2.643344709897611, + "grad_norm": 2.587956190109253, + "learning_rate": 0.00011888509670079636, + "loss": 6.137, + "step": 7745 + }, + { + "epoch": 2.6436860068259387, + "grad_norm": 2.9233665466308594, + "learning_rate": 0.00011877133105802048, + "loss": 5.8408, + "step": 7746 + }, + { + "epoch": 2.644027303754266, + "grad_norm": 2.765820264816284, + "learning_rate": 0.0001186575654152446, + "loss": 6.4021, + "step": 7747 + }, + { + "epoch": 2.644368600682594, + "grad_norm": 2.7021546363830566, + "learning_rate": 0.00011854379977246872, + "loss": 6.3432, + "step": 7748 + }, + { + "epoch": 2.6447098976109213, + "grad_norm": 2.628324031829834, + "learning_rate": 0.00011843003412969284, + "loss": 5.5596, + "step": 7749 + }, + { + "epoch": 2.645051194539249, + "grad_norm": 2.6931304931640625, + "learning_rate": 0.00011831626848691695, + "loss": 6.4549, + "step": 7750 + }, + { + "epoch": 2.645392491467577, + "grad_norm": 2.3119070529937744, + "learning_rate": 0.00011820250284414107, + "loss": 4.2596, + "step": 7751 + }, + { + "epoch": 2.6457337883959045, + "grad_norm": 2.604924440383911, + "learning_rate": 0.00011808873720136519, + "loss": 5.8979, + "step": 7752 + }, + { + "epoch": 2.646075085324232, + "grad_norm": 2.697232723236084, + "learning_rate": 0.00011797497155858931, + "loss": 5.1108, + "step": 7753 + }, + { + "epoch": 2.6464163822525597, + "grad_norm": 2.6660239696502686, + "learning_rate": 0.00011786120591581343, + "loss": 6.2313, + "step": 7754 + }, + { + "epoch": 2.6467576791808876, + "grad_norm": 2.5658226013183594, + "learning_rate": 0.00011774744027303755, + "loss": 5.9833, + "step": 7755 + }, + { + "epoch": 2.647098976109215, + "grad_norm": 2.658269166946411, + "learning_rate": 0.00011763367463026167, + "loss": 5.9185, + "step": 7756 + }, + { + "epoch": 2.647440273037543, + "grad_norm": 2.5440711975097656, + "learning_rate": 0.00011751990898748578, + "loss": 5.47, + "step": 7757 + }, + { + "epoch": 2.64778156996587, + "grad_norm": 2.6822307109832764, + "learning_rate": 0.0001174061433447099, + "loss": 6.2749, + "step": 7758 + }, + { + "epoch": 2.648122866894198, + "grad_norm": 2.758451461791992, + "learning_rate": 0.00011729237770193402, + "loss": 6.2035, + "step": 7759 + }, + { + "epoch": 2.6484641638225255, + "grad_norm": 2.591820240020752, + "learning_rate": 0.00011717861205915814, + "loss": 6.4704, + "step": 7760 + }, + { + "epoch": 2.6488054607508533, + "grad_norm": 2.8156046867370605, + "learning_rate": 0.00011706484641638224, + "loss": 6.4334, + "step": 7761 + }, + { + "epoch": 2.6491467576791807, + "grad_norm": 2.6117329597473145, + "learning_rate": 0.00011695108077360638, + "loss": 6.3525, + "step": 7762 + }, + { + "epoch": 2.6494880546075086, + "grad_norm": 2.6518020629882812, + "learning_rate": 0.00011683731513083048, + "loss": 5.9983, + "step": 7763 + }, + { + "epoch": 2.6498293515358364, + "grad_norm": 2.606523275375366, + "learning_rate": 0.00011672354948805461, + "loss": 5.6909, + "step": 7764 + }, + { + "epoch": 2.650170648464164, + "grad_norm": 2.7865447998046875, + "learning_rate": 0.00011660978384527873, + "loss": 3.9492, + "step": 7765 + }, + { + "epoch": 2.6505119453924912, + "grad_norm": 2.6306912899017334, + "learning_rate": 0.00011649601820250284, + "loss": 5.3691, + "step": 7766 + }, + { + "epoch": 2.650853242320819, + "grad_norm": 2.5862772464752197, + "learning_rate": 0.00011638225255972697, + "loss": 6.0451, + "step": 7767 + }, + { + "epoch": 2.651194539249147, + "grad_norm": 2.6710424423217773, + "learning_rate": 0.00011626848691695107, + "loss": 6.339, + "step": 7768 + }, + { + "epoch": 2.6515358361774743, + "grad_norm": 2.731431484222412, + "learning_rate": 0.0001161547212741752, + "loss": 6.5248, + "step": 7769 + }, + { + "epoch": 2.651877133105802, + "grad_norm": 2.6861982345581055, + "learning_rate": 0.00011604095563139932, + "loss": 5.8373, + "step": 7770 + }, + { + "epoch": 2.6522184300341296, + "grad_norm": 2.622677803039551, + "learning_rate": 0.00011592718998862343, + "loss": 6.0151, + "step": 7771 + }, + { + "epoch": 2.6525597269624575, + "grad_norm": 2.653522491455078, + "learning_rate": 0.00011581342434584756, + "loss": 6.2623, + "step": 7772 + }, + { + "epoch": 2.652901023890785, + "grad_norm": 2.724036693572998, + "learning_rate": 0.00011569965870307167, + "loss": 5.8716, + "step": 7773 + }, + { + "epoch": 2.6532423208191127, + "grad_norm": 2.6914572715759277, + "learning_rate": 0.0001155858930602958, + "loss": 6.0782, + "step": 7774 + }, + { + "epoch": 2.65358361774744, + "grad_norm": 2.528007984161377, + "learning_rate": 0.00011547212741751992, + "loss": 5.6457, + "step": 7775 + }, + { + "epoch": 2.653924914675768, + "grad_norm": 2.635348081588745, + "learning_rate": 0.00011535836177474402, + "loss": 6.2753, + "step": 7776 + }, + { + "epoch": 2.654266211604096, + "grad_norm": 2.6857311725616455, + "learning_rate": 0.00011524459613196815, + "loss": 5.6202, + "step": 7777 + }, + { + "epoch": 2.654607508532423, + "grad_norm": 2.612197160720825, + "learning_rate": 0.00011513083048919226, + "loss": 5.9211, + "step": 7778 + }, + { + "epoch": 2.6549488054607506, + "grad_norm": 2.6178486347198486, + "learning_rate": 0.00011501706484641639, + "loss": 4.9022, + "step": 7779 + }, + { + "epoch": 2.6552901023890785, + "grad_norm": 2.6042556762695312, + "learning_rate": 0.0001149032992036405, + "loss": 6.2243, + "step": 7780 + }, + { + "epoch": 2.6556313993174063, + "grad_norm": 2.5869898796081543, + "learning_rate": 0.00011478953356086461, + "loss": 6.0258, + "step": 7781 + }, + { + "epoch": 2.6559726962457337, + "grad_norm": 2.7476601600646973, + "learning_rate": 0.00011467576791808875, + "loss": 5.2693, + "step": 7782 + }, + { + "epoch": 2.6563139931740616, + "grad_norm": 2.492094039916992, + "learning_rate": 0.00011456200227531285, + "loss": 6.0099, + "step": 7783 + }, + { + "epoch": 2.656655290102389, + "grad_norm": 2.6337618827819824, + "learning_rate": 0.00011444823663253698, + "loss": 6.8608, + "step": 7784 + }, + { + "epoch": 2.656996587030717, + "grad_norm": 2.559250593185425, + "learning_rate": 0.00011433447098976109, + "loss": 5.9022, + "step": 7785 + }, + { + "epoch": 2.6573378839590442, + "grad_norm": 2.6352384090423584, + "learning_rate": 0.0001142207053469852, + "loss": 6.3508, + "step": 7786 + }, + { + "epoch": 2.657679180887372, + "grad_norm": 2.701533317565918, + "learning_rate": 0.00011410693970420934, + "loss": 6.1916, + "step": 7787 + }, + { + "epoch": 2.6580204778156995, + "grad_norm": 2.6848697662353516, + "learning_rate": 0.00011399317406143344, + "loss": 6.2393, + "step": 7788 + }, + { + "epoch": 2.6583617747440274, + "grad_norm": 3.9241063594818115, + "learning_rate": 0.00011387940841865758, + "loss": 5.0438, + "step": 7789 + }, + { + "epoch": 2.658703071672355, + "grad_norm": 2.6796340942382812, + "learning_rate": 0.00011376564277588168, + "loss": 5.3564, + "step": 7790 + }, + { + "epoch": 2.6590443686006826, + "grad_norm": 2.6478474140167236, + "learning_rate": 0.0001136518771331058, + "loss": 6.3117, + "step": 7791 + }, + { + "epoch": 2.65938566552901, + "grad_norm": 2.6212310791015625, + "learning_rate": 0.00011353811149032993, + "loss": 6.6302, + "step": 7792 + }, + { + "epoch": 2.659726962457338, + "grad_norm": 2.6432223320007324, + "learning_rate": 0.00011342434584755404, + "loss": 6.1271, + "step": 7793 + }, + { + "epoch": 2.6600682593856657, + "grad_norm": 2.6077170372009277, + "learning_rate": 0.00011331058020477817, + "loss": 6.1778, + "step": 7794 + }, + { + "epoch": 2.660409556313993, + "grad_norm": 2.6975767612457275, + "learning_rate": 0.00011319681456200227, + "loss": 6.2841, + "step": 7795 + }, + { + "epoch": 2.660750853242321, + "grad_norm": 2.6964032649993896, + "learning_rate": 0.00011308304891922639, + "loss": 5.8834, + "step": 7796 + }, + { + "epoch": 2.6610921501706484, + "grad_norm": 2.594709873199463, + "learning_rate": 0.00011296928327645051, + "loss": 5.8787, + "step": 7797 + }, + { + "epoch": 2.6614334470989762, + "grad_norm": 2.6809041500091553, + "learning_rate": 0.00011285551763367463, + "loss": 6.3782, + "step": 7798 + }, + { + "epoch": 2.6617747440273036, + "grad_norm": 2.5971288681030273, + "learning_rate": 0.00011274175199089876, + "loss": 6.1397, + "step": 7799 + }, + { + "epoch": 2.6621160409556315, + "grad_norm": 2.648439407348633, + "learning_rate": 0.00011262798634812287, + "loss": 5.7752, + "step": 7800 + }, + { + "epoch": 2.662457337883959, + "grad_norm": 2.61542010307312, + "learning_rate": 0.00011251422070534698, + "loss": 5.6754, + "step": 7801 + }, + { + "epoch": 2.6627986348122867, + "grad_norm": 2.5414392948150635, + "learning_rate": 0.0001124004550625711, + "loss": 5.9982, + "step": 7802 + }, + { + "epoch": 2.6631399317406146, + "grad_norm": 2.6229312419891357, + "learning_rate": 0.00011228668941979522, + "loss": 6.584, + "step": 7803 + }, + { + "epoch": 2.663481228668942, + "grad_norm": 2.5685508251190186, + "learning_rate": 0.00011217292377701935, + "loss": 6.1668, + "step": 7804 + }, + { + "epoch": 2.6638225255972694, + "grad_norm": 2.811805486679077, + "learning_rate": 0.00011205915813424346, + "loss": 5.735, + "step": 7805 + }, + { + "epoch": 2.6641638225255972, + "grad_norm": 2.5972208976745605, + "learning_rate": 0.00011194539249146758, + "loss": 6.849, + "step": 7806 + }, + { + "epoch": 2.664505119453925, + "grad_norm": 2.659641742706299, + "learning_rate": 0.0001118316268486917, + "loss": 6.3136, + "step": 7807 + }, + { + "epoch": 2.6648464163822525, + "grad_norm": 2.7042298316955566, + "learning_rate": 0.00011171786120591581, + "loss": 6.0122, + "step": 7808 + }, + { + "epoch": 2.6651877133105804, + "grad_norm": 2.617111921310425, + "learning_rate": 0.00011160409556313995, + "loss": 6.1822, + "step": 7809 + }, + { + "epoch": 2.6655290102389078, + "grad_norm": 2.545041084289551, + "learning_rate": 0.00011149032992036405, + "loss": 6.1464, + "step": 7810 + }, + { + "epoch": 2.6658703071672356, + "grad_norm": 3.4677798748016357, + "learning_rate": 0.00011137656427758817, + "loss": 4.4972, + "step": 7811 + }, + { + "epoch": 2.666211604095563, + "grad_norm": 2.6456897258758545, + "learning_rate": 0.00011126279863481229, + "loss": 5.4503, + "step": 7812 + }, + { + "epoch": 2.666552901023891, + "grad_norm": 2.6556036472320557, + "learning_rate": 0.0001111490329920364, + "loss": 6.0158, + "step": 7813 + }, + { + "epoch": 2.6668941979522183, + "grad_norm": 2.6038191318511963, + "learning_rate": 0.00011103526734926054, + "loss": 6.0356, + "step": 7814 + }, + { + "epoch": 2.667235494880546, + "grad_norm": 2.673539400100708, + "learning_rate": 0.00011092150170648464, + "loss": 5.9474, + "step": 7815 + }, + { + "epoch": 2.667576791808874, + "grad_norm": 2.661170721054077, + "learning_rate": 0.00011080773606370876, + "loss": 6.2451, + "step": 7816 + }, + { + "epoch": 2.6679180887372014, + "grad_norm": 2.639185905456543, + "learning_rate": 0.00011069397042093288, + "loss": 5.3801, + "step": 7817 + }, + { + "epoch": 2.668259385665529, + "grad_norm": 2.672795057296753, + "learning_rate": 0.000110580204778157, + "loss": 6.2753, + "step": 7818 + }, + { + "epoch": 2.6686006825938566, + "grad_norm": 1.828800916671753, + "learning_rate": 0.00011046643913538112, + "loss": 2.4337, + "step": 7819 + }, + { + "epoch": 2.6689419795221845, + "grad_norm": 2.646155834197998, + "learning_rate": 0.00011035267349260524, + "loss": 5.8717, + "step": 7820 + }, + { + "epoch": 2.669283276450512, + "grad_norm": 2.6443183422088623, + "learning_rate": 0.00011023890784982935, + "loss": 6.0079, + "step": 7821 + }, + { + "epoch": 2.6696245733788397, + "grad_norm": 2.595067262649536, + "learning_rate": 0.00011012514220705347, + "loss": 5.6408, + "step": 7822 + }, + { + "epoch": 2.669965870307167, + "grad_norm": 2.655027151107788, + "learning_rate": 0.00011001137656427759, + "loss": 5.8955, + "step": 7823 + }, + { + "epoch": 2.670307167235495, + "grad_norm": 2.6794850826263428, + "learning_rate": 0.00010989761092150171, + "loss": 5.8458, + "step": 7824 + }, + { + "epoch": 2.6706484641638224, + "grad_norm": 2.556561231613159, + "learning_rate": 0.00010978384527872583, + "loss": 5.9302, + "step": 7825 + }, + { + "epoch": 2.6709897610921502, + "grad_norm": 2.651296854019165, + "learning_rate": 0.00010967007963594995, + "loss": 6.2365, + "step": 7826 + }, + { + "epoch": 2.6713310580204777, + "grad_norm": 2.6557466983795166, + "learning_rate": 0.00010955631399317406, + "loss": 5.845, + "step": 7827 + }, + { + "epoch": 2.6716723549488055, + "grad_norm": 2.8386025428771973, + "learning_rate": 0.00010944254835039818, + "loss": 5.3838, + "step": 7828 + }, + { + "epoch": 2.6720136518771334, + "grad_norm": 2.6636369228363037, + "learning_rate": 0.0001093287827076223, + "loss": 6.2458, + "step": 7829 + }, + { + "epoch": 2.6723549488054608, + "grad_norm": 2.6208786964416504, + "learning_rate": 0.00010921501706484642, + "loss": 6.4429, + "step": 7830 + }, + { + "epoch": 2.672696245733788, + "grad_norm": 2.61503005027771, + "learning_rate": 0.00010910125142207054, + "loss": 6.0356, + "step": 7831 + }, + { + "epoch": 2.673037542662116, + "grad_norm": 2.5759787559509277, + "learning_rate": 0.00010898748577929466, + "loss": 5.025, + "step": 7832 + }, + { + "epoch": 2.673378839590444, + "grad_norm": 2.680743932723999, + "learning_rate": 0.00010887372013651878, + "loss": 5.651, + "step": 7833 + }, + { + "epoch": 2.6737201365187713, + "grad_norm": 2.4130074977874756, + "learning_rate": 0.0001087599544937429, + "loss": 4.161, + "step": 7834 + }, + { + "epoch": 2.674061433447099, + "grad_norm": 2.479400634765625, + "learning_rate": 0.00010864618885096701, + "loss": 4.6418, + "step": 7835 + }, + { + "epoch": 2.6744027303754265, + "grad_norm": 2.6607160568237305, + "learning_rate": 0.00010853242320819112, + "loss": 5.682, + "step": 7836 + }, + { + "epoch": 2.6747440273037544, + "grad_norm": 2.6387665271759033, + "learning_rate": 0.00010841865756541525, + "loss": 6.2866, + "step": 7837 + }, + { + "epoch": 2.675085324232082, + "grad_norm": 2.6623637676239014, + "learning_rate": 0.00010830489192263937, + "loss": 5.5277, + "step": 7838 + }, + { + "epoch": 2.6754266211604096, + "grad_norm": 2.7224626541137695, + "learning_rate": 0.00010819112627986349, + "loss": 6.5055, + "step": 7839 + }, + { + "epoch": 2.675767918088737, + "grad_norm": 1.7368601560592651, + "learning_rate": 0.0001080773606370876, + "loss": 2.5353, + "step": 7840 + }, + { + "epoch": 2.676109215017065, + "grad_norm": 2.5209169387817383, + "learning_rate": 0.00010796359499431171, + "loss": 5.4228, + "step": 7841 + }, + { + "epoch": 2.6764505119453927, + "grad_norm": 2.6286282539367676, + "learning_rate": 0.00010784982935153584, + "loss": 6.5636, + "step": 7842 + }, + { + "epoch": 2.67679180887372, + "grad_norm": 2.581543207168579, + "learning_rate": 0.00010773606370875996, + "loss": 4.6108, + "step": 7843 + }, + { + "epoch": 2.6771331058020476, + "grad_norm": 2.5444607734680176, + "learning_rate": 0.00010762229806598408, + "loss": 6.3867, + "step": 7844 + }, + { + "epoch": 2.6774744027303754, + "grad_norm": 2.6205897331237793, + "learning_rate": 0.0001075085324232082, + "loss": 6.3425, + "step": 7845 + }, + { + "epoch": 2.6778156996587033, + "grad_norm": 2.6083576679229736, + "learning_rate": 0.0001073947667804323, + "loss": 5.435, + "step": 7846 + }, + { + "epoch": 2.6781569965870307, + "grad_norm": 2.6391236782073975, + "learning_rate": 0.00010728100113765643, + "loss": 5.6658, + "step": 7847 + }, + { + "epoch": 2.6784982935153585, + "grad_norm": 2.71447491645813, + "learning_rate": 0.00010716723549488055, + "loss": 5.6652, + "step": 7848 + }, + { + "epoch": 2.678839590443686, + "grad_norm": 2.63681697845459, + "learning_rate": 0.00010705346985210467, + "loss": 5.891, + "step": 7849 + }, + { + "epoch": 2.6791808873720138, + "grad_norm": 2.910919427871704, + "learning_rate": 0.00010693970420932879, + "loss": 5.8579, + "step": 7850 + }, + { + "epoch": 2.679522184300341, + "grad_norm": 2.6132237911224365, + "learning_rate": 0.0001068259385665529, + "loss": 5.8708, + "step": 7851 + }, + { + "epoch": 2.679863481228669, + "grad_norm": 2.6288161277770996, + "learning_rate": 0.00010671217292377703, + "loss": 5.7728, + "step": 7852 + }, + { + "epoch": 2.6802047781569964, + "grad_norm": 2.5175042152404785, + "learning_rate": 0.00010659840728100113, + "loss": 5.9085, + "step": 7853 + }, + { + "epoch": 2.6805460750853243, + "grad_norm": 2.57769775390625, + "learning_rate": 0.00010648464163822526, + "loss": 6.2867, + "step": 7854 + }, + { + "epoch": 2.680887372013652, + "grad_norm": 2.5458014011383057, + "learning_rate": 0.00010637087599544938, + "loss": 6.0821, + "step": 7855 + }, + { + "epoch": 2.6812286689419795, + "grad_norm": 2.641937732696533, + "learning_rate": 0.00010625711035267349, + "loss": 6.295, + "step": 7856 + }, + { + "epoch": 2.681569965870307, + "grad_norm": 2.6056764125823975, + "learning_rate": 0.00010614334470989762, + "loss": 5.8905, + "step": 7857 + }, + { + "epoch": 2.681911262798635, + "grad_norm": 2.6439177989959717, + "learning_rate": 0.00010602957906712172, + "loss": 6.2158, + "step": 7858 + }, + { + "epoch": 2.6822525597269626, + "grad_norm": 2.680771589279175, + "learning_rate": 0.00010591581342434586, + "loss": 5.609, + "step": 7859 + }, + { + "epoch": 2.68259385665529, + "grad_norm": 2.6440012454986572, + "learning_rate": 0.00010580204778156998, + "loss": 5.7455, + "step": 7860 + }, + { + "epoch": 2.682935153583618, + "grad_norm": 2.6077284812927246, + "learning_rate": 0.00010568828213879408, + "loss": 6.4567, + "step": 7861 + }, + { + "epoch": 2.6832764505119453, + "grad_norm": 2.5259697437286377, + "learning_rate": 0.00010557451649601821, + "loss": 5.9364, + "step": 7862 + }, + { + "epoch": 2.683617747440273, + "grad_norm": 2.647272825241089, + "learning_rate": 0.00010546075085324232, + "loss": 6.4563, + "step": 7863 + }, + { + "epoch": 2.6839590443686006, + "grad_norm": 2.600893020629883, + "learning_rate": 0.00010534698521046644, + "loss": 5.9343, + "step": 7864 + }, + { + "epoch": 2.6843003412969284, + "grad_norm": 2.8401756286621094, + "learning_rate": 0.00010523321956769057, + "loss": 4.4045, + "step": 7865 + }, + { + "epoch": 2.684641638225256, + "grad_norm": 2.6061348915100098, + "learning_rate": 0.00010511945392491467, + "loss": 4.6237, + "step": 7866 + }, + { + "epoch": 2.6849829351535837, + "grad_norm": 2.599360466003418, + "learning_rate": 0.0001050056882821388, + "loss": 6.0888, + "step": 7867 + }, + { + "epoch": 2.6853242320819115, + "grad_norm": 2.5830085277557373, + "learning_rate": 0.00010489192263936291, + "loss": 5.5227, + "step": 7868 + }, + { + "epoch": 2.685665529010239, + "grad_norm": 2.7643754482269287, + "learning_rate": 0.00010477815699658703, + "loss": 5.0325, + "step": 7869 + }, + { + "epoch": 2.6860068259385663, + "grad_norm": 2.723921537399292, + "learning_rate": 0.00010466439135381115, + "loss": 5.5498, + "step": 7870 + }, + { + "epoch": 2.686348122866894, + "grad_norm": 2.556098461151123, + "learning_rate": 0.00010455062571103526, + "loss": 5.6835, + "step": 7871 + }, + { + "epoch": 2.686689419795222, + "grad_norm": 2.629362106323242, + "learning_rate": 0.0001044368600682594, + "loss": 6.4194, + "step": 7872 + }, + { + "epoch": 2.6870307167235494, + "grad_norm": 2.6125411987304688, + "learning_rate": 0.0001043230944254835, + "loss": 6.0512, + "step": 7873 + }, + { + "epoch": 2.6873720136518773, + "grad_norm": 2.6440958976745605, + "learning_rate": 0.00010420932878270762, + "loss": 6.0304, + "step": 7874 + }, + { + "epoch": 2.6877133105802047, + "grad_norm": 2.598612070083618, + "learning_rate": 0.00010409556313993174, + "loss": 5.6832, + "step": 7875 + }, + { + "epoch": 2.6880546075085325, + "grad_norm": 2.6545569896698, + "learning_rate": 0.00010398179749715586, + "loss": 6.4576, + "step": 7876 + }, + { + "epoch": 2.68839590443686, + "grad_norm": 2.641326427459717, + "learning_rate": 0.00010386803185437999, + "loss": 6.3398, + "step": 7877 + }, + { + "epoch": 2.688737201365188, + "grad_norm": 2.6488940715789795, + "learning_rate": 0.0001037542662116041, + "loss": 5.7814, + "step": 7878 + }, + { + "epoch": 2.689078498293515, + "grad_norm": 2.5873377323150635, + "learning_rate": 0.00010364050056882821, + "loss": 6.4943, + "step": 7879 + }, + { + "epoch": 2.689419795221843, + "grad_norm": 2.5985260009765625, + "learning_rate": 0.00010352673492605233, + "loss": 6.0007, + "step": 7880 + }, + { + "epoch": 2.689761092150171, + "grad_norm": 2.6462574005126953, + "learning_rate": 0.00010341296928327645, + "loss": 6.4716, + "step": 7881 + }, + { + "epoch": 2.6901023890784983, + "grad_norm": 2.649360418319702, + "learning_rate": 0.00010329920364050058, + "loss": 5.7194, + "step": 7882 + }, + { + "epoch": 2.6904436860068257, + "grad_norm": 2.5704243183135986, + "learning_rate": 0.00010318543799772469, + "loss": 5.9094, + "step": 7883 + }, + { + "epoch": 2.6907849829351536, + "grad_norm": 2.7193174362182617, + "learning_rate": 0.0001030716723549488, + "loss": 6.4267, + "step": 7884 + }, + { + "epoch": 2.6911262798634814, + "grad_norm": 2.567355155944824, + "learning_rate": 0.00010295790671217292, + "loss": 5.5288, + "step": 7885 + }, + { + "epoch": 2.691467576791809, + "grad_norm": 2.8648149967193604, + "learning_rate": 0.00010284414106939704, + "loss": 5.4581, + "step": 7886 + }, + { + "epoch": 2.6918088737201367, + "grad_norm": 2.6088626384735107, + "learning_rate": 0.00010273037542662116, + "loss": 6.3214, + "step": 7887 + }, + { + "epoch": 2.692150170648464, + "grad_norm": 2.7508292198181152, + "learning_rate": 0.00010261660978384528, + "loss": 6.3367, + "step": 7888 + }, + { + "epoch": 2.692491467576792, + "grad_norm": 2.5980231761932373, + "learning_rate": 0.0001025028441410694, + "loss": 5.7762, + "step": 7889 + }, + { + "epoch": 2.6928327645051193, + "grad_norm": 2.668694496154785, + "learning_rate": 0.00010238907849829352, + "loss": 5.5341, + "step": 7890 + }, + { + "epoch": 2.693174061433447, + "grad_norm": 2.5616753101348877, + "learning_rate": 0.00010227531285551763, + "loss": 6.0325, + "step": 7891 + }, + { + "epoch": 2.6935153583617746, + "grad_norm": 5.180400371551514, + "learning_rate": 0.00010216154721274175, + "loss": 4.5402, + "step": 7892 + }, + { + "epoch": 2.6938566552901024, + "grad_norm": 2.6964242458343506, + "learning_rate": 0.00010204778156996587, + "loss": 6.8569, + "step": 7893 + }, + { + "epoch": 2.6941979522184303, + "grad_norm": 2.5725672245025635, + "learning_rate": 0.00010193401592718999, + "loss": 5.7403, + "step": 7894 + }, + { + "epoch": 2.6945392491467577, + "grad_norm": 2.649010181427002, + "learning_rate": 0.00010182025028441411, + "loss": 6.1924, + "step": 7895 + }, + { + "epoch": 2.694880546075085, + "grad_norm": 2.6579694747924805, + "learning_rate": 0.00010170648464163823, + "loss": 5.7039, + "step": 7896 + }, + { + "epoch": 2.695221843003413, + "grad_norm": 2.6075141429901123, + "learning_rate": 0.00010159271899886235, + "loss": 6.356, + "step": 7897 + }, + { + "epoch": 2.695563139931741, + "grad_norm": 2.6528830528259277, + "learning_rate": 0.00010147895335608646, + "loss": 5.4291, + "step": 7898 + }, + { + "epoch": 2.695904436860068, + "grad_norm": 2.625629425048828, + "learning_rate": 0.00010136518771331058, + "loss": 6.2645, + "step": 7899 + }, + { + "epoch": 2.696245733788396, + "grad_norm": 2.710127830505371, + "learning_rate": 0.0001012514220705347, + "loss": 4.9046, + "step": 7900 + }, + { + "epoch": 2.6965870307167235, + "grad_norm": 2.4944448471069336, + "learning_rate": 0.00010113765642775882, + "loss": 5.518, + "step": 7901 + }, + { + "epoch": 2.6969283276450513, + "grad_norm": 2.6292831897735596, + "learning_rate": 0.00010102389078498294, + "loss": 6.1254, + "step": 7902 + }, + { + "epoch": 2.6972696245733787, + "grad_norm": 2.5295937061309814, + "learning_rate": 0.00010091012514220706, + "loss": 6.5135, + "step": 7903 + }, + { + "epoch": 2.6976109215017066, + "grad_norm": 2.602799415588379, + "learning_rate": 0.00010079635949943116, + "loss": 5.8567, + "step": 7904 + }, + { + "epoch": 2.697952218430034, + "grad_norm": 2.6777186393737793, + "learning_rate": 0.0001006825938566553, + "loss": 6.0356, + "step": 7905 + }, + { + "epoch": 2.698293515358362, + "grad_norm": 2.5789544582366943, + "learning_rate": 0.00010056882821387941, + "loss": 5.3017, + "step": 7906 + }, + { + "epoch": 2.6986348122866897, + "grad_norm": 2.565323829650879, + "learning_rate": 0.00010045506257110353, + "loss": 6.0801, + "step": 7907 + }, + { + "epoch": 2.698976109215017, + "grad_norm": 2.089174270629883, + "learning_rate": 0.00010034129692832765, + "loss": 3.2135, + "step": 7908 + }, + { + "epoch": 2.6993174061433445, + "grad_norm": 2.6627211570739746, + "learning_rate": 0.00010022753128555175, + "loss": 5.925, + "step": 7909 + }, + { + "epoch": 2.6996587030716723, + "grad_norm": 2.5624616146087646, + "learning_rate": 0.00010011376564277589, + "loss": 5.6055, + "step": 7910 + }, + { + "epoch": 2.7, + "grad_norm": 3.5882010459899902, + "learning_rate": 0.0001, + "loss": 4.6856, + "step": 7911 + }, + { + "epoch": 2.7003412969283276, + "grad_norm": 2.569596290588379, + "learning_rate": 9.988623435722412e-05, + "loss": 5.86, + "step": 7912 + }, + { + "epoch": 2.7006825938566554, + "grad_norm": 2.565866708755493, + "learning_rate": 9.977246871444824e-05, + "loss": 6.1467, + "step": 7913 + }, + { + "epoch": 2.701023890784983, + "grad_norm": 2.524012327194214, + "learning_rate": 9.965870307167235e-05, + "loss": 5.9074, + "step": 7914 + }, + { + "epoch": 2.7013651877133107, + "grad_norm": 1.8367999792099, + "learning_rate": 9.954493742889648e-05, + "loss": 3.4121, + "step": 7915 + }, + { + "epoch": 2.701706484641638, + "grad_norm": 2.5629944801330566, + "learning_rate": 9.94311717861206e-05, + "loss": 6.5814, + "step": 7916 + }, + { + "epoch": 2.702047781569966, + "grad_norm": 2.7166037559509277, + "learning_rate": 9.931740614334472e-05, + "loss": 6.285, + "step": 7917 + }, + { + "epoch": 2.7023890784982934, + "grad_norm": 2.672518730163574, + "learning_rate": 9.920364050056883e-05, + "loss": 3.7134, + "step": 7918 + }, + { + "epoch": 2.702730375426621, + "grad_norm": 2.4576282501220703, + "learning_rate": 9.908987485779294e-05, + "loss": 4.0857, + "step": 7919 + }, + { + "epoch": 2.703071672354949, + "grad_norm": 2.5885422229766846, + "learning_rate": 9.897610921501707e-05, + "loss": 5.9866, + "step": 7920 + }, + { + "epoch": 2.7034129692832765, + "grad_norm": 2.686434030532837, + "learning_rate": 9.886234357224118e-05, + "loss": 5.7955, + "step": 7921 + }, + { + "epoch": 2.703754266211604, + "grad_norm": 2.6147704124450684, + "learning_rate": 9.874857792946531e-05, + "loss": 6.1341, + "step": 7922 + }, + { + "epoch": 2.7040955631399317, + "grad_norm": 3.224640369415283, + "learning_rate": 9.863481228668943e-05, + "loss": 4.9066, + "step": 7923 + }, + { + "epoch": 2.7044368600682596, + "grad_norm": 2.6708364486694336, + "learning_rate": 9.852104664391353e-05, + "loss": 5.7106, + "step": 7924 + }, + { + "epoch": 2.704778156996587, + "grad_norm": 2.5874176025390625, + "learning_rate": 9.840728100113766e-05, + "loss": 4.865, + "step": 7925 + }, + { + "epoch": 2.705119453924915, + "grad_norm": 2.6783151626586914, + "learning_rate": 9.829351535836177e-05, + "loss": 4.9307, + "step": 7926 + }, + { + "epoch": 2.7054607508532422, + "grad_norm": 2.5469775199890137, + "learning_rate": 9.81797497155859e-05, + "loss": 5.7146, + "step": 7927 + }, + { + "epoch": 2.70580204778157, + "grad_norm": 2.5792226791381836, + "learning_rate": 9.806598407281002e-05, + "loss": 5.7424, + "step": 7928 + }, + { + "epoch": 2.7061433447098975, + "grad_norm": 2.562011241912842, + "learning_rate": 9.795221843003412e-05, + "loss": 5.7029, + "step": 7929 + }, + { + "epoch": 2.7064846416382253, + "grad_norm": 3.7608301639556885, + "learning_rate": 9.783845278725826e-05, + "loss": 4.2989, + "step": 7930 + }, + { + "epoch": 2.7068259385665527, + "grad_norm": 2.5663902759552, + "learning_rate": 9.772468714448236e-05, + "loss": 6.4621, + "step": 7931 + }, + { + "epoch": 2.7071672354948806, + "grad_norm": 2.6280720233917236, + "learning_rate": 9.761092150170649e-05, + "loss": 6.3645, + "step": 7932 + }, + { + "epoch": 2.7075085324232084, + "grad_norm": 2.5684332847595215, + "learning_rate": 9.749715585893061e-05, + "loss": 5.8003, + "step": 7933 + }, + { + "epoch": 2.707849829351536, + "grad_norm": 2.643038034439087, + "learning_rate": 9.738339021615472e-05, + "loss": 6.4916, + "step": 7934 + }, + { + "epoch": 2.7081911262798632, + "grad_norm": 2.559818983078003, + "learning_rate": 9.726962457337885e-05, + "loss": 6.41, + "step": 7935 + }, + { + "epoch": 2.708532423208191, + "grad_norm": 2.636819839477539, + "learning_rate": 9.715585893060295e-05, + "loss": 5.2788, + "step": 7936 + }, + { + "epoch": 2.708873720136519, + "grad_norm": 2.651435613632202, + "learning_rate": 9.704209328782709e-05, + "loss": 5.8778, + "step": 7937 + }, + { + "epoch": 2.7092150170648464, + "grad_norm": 2.6740775108337402, + "learning_rate": 9.692832764505119e-05, + "loss": 4.9705, + "step": 7938 + }, + { + "epoch": 2.709556313993174, + "grad_norm": 2.6301076412200928, + "learning_rate": 9.681456200227531e-05, + "loss": 6.1048, + "step": 7939 + }, + { + "epoch": 2.7098976109215016, + "grad_norm": 2.613173007965088, + "learning_rate": 9.670079635949944e-05, + "loss": 5.3901, + "step": 7940 + }, + { + "epoch": 2.7102389078498295, + "grad_norm": 2.5331554412841797, + "learning_rate": 9.658703071672355e-05, + "loss": 5.4303, + "step": 7941 + }, + { + "epoch": 2.710580204778157, + "grad_norm": 2.602365016937256, + "learning_rate": 9.647326507394768e-05, + "loss": 6.2944, + "step": 7942 + }, + { + "epoch": 2.7109215017064847, + "grad_norm": 2.6677892208099365, + "learning_rate": 9.635949943117178e-05, + "loss": 5.7442, + "step": 7943 + }, + { + "epoch": 2.711262798634812, + "grad_norm": 2.62030291557312, + "learning_rate": 9.62457337883959e-05, + "loss": 6.2834, + "step": 7944 + }, + { + "epoch": 2.71160409556314, + "grad_norm": 2.5873615741729736, + "learning_rate": 9.613196814562003e-05, + "loss": 5.7214, + "step": 7945 + }, + { + "epoch": 2.711945392491468, + "grad_norm": 2.6523163318634033, + "learning_rate": 9.601820250284414e-05, + "loss": 5.5457, + "step": 7946 + }, + { + "epoch": 2.7122866894197952, + "grad_norm": 2.6249499320983887, + "learning_rate": 9.590443686006827e-05, + "loss": 5.5907, + "step": 7947 + }, + { + "epoch": 2.7126279863481226, + "grad_norm": 2.6475677490234375, + "learning_rate": 9.579067121729238e-05, + "loss": 6.2695, + "step": 7948 + }, + { + "epoch": 2.7129692832764505, + "grad_norm": 2.638388156890869, + "learning_rate": 9.56769055745165e-05, + "loss": 6.0452, + "step": 7949 + }, + { + "epoch": 2.7133105802047783, + "grad_norm": 2.5917885303497314, + "learning_rate": 9.556313993174063e-05, + "loss": 5.4567, + "step": 7950 + }, + { + "epoch": 2.7136518771331057, + "grad_norm": 2.65228533744812, + "learning_rate": 9.544937428896473e-05, + "loss": 6.3895, + "step": 7951 + }, + { + "epoch": 2.7139931740614336, + "grad_norm": 2.614609956741333, + "learning_rate": 9.533560864618886e-05, + "loss": 6.4066, + "step": 7952 + }, + { + "epoch": 2.714334470989761, + "grad_norm": 2.717390298843384, + "learning_rate": 9.522184300341297e-05, + "loss": 4.8762, + "step": 7953 + }, + { + "epoch": 2.714675767918089, + "grad_norm": 2.660956859588623, + "learning_rate": 9.510807736063709e-05, + "loss": 5.9635, + "step": 7954 + }, + { + "epoch": 2.7150170648464163, + "grad_norm": 2.5839521884918213, + "learning_rate": 9.49943117178612e-05, + "loss": 5.8931, + "step": 7955 + }, + { + "epoch": 2.715358361774744, + "grad_norm": 2.445621967315674, + "learning_rate": 9.488054607508532e-05, + "loss": 3.551, + "step": 7956 + }, + { + "epoch": 2.7156996587030715, + "grad_norm": 2.659536838531494, + "learning_rate": 9.476678043230946e-05, + "loss": 6.4926, + "step": 7957 + }, + { + "epoch": 2.7160409556313994, + "grad_norm": 2.6747255325317383, + "learning_rate": 9.465301478953356e-05, + "loss": 6.5505, + "step": 7958 + }, + { + "epoch": 2.716382252559727, + "grad_norm": 2.663691520690918, + "learning_rate": 9.453924914675768e-05, + "loss": 6.1892, + "step": 7959 + }, + { + "epoch": 2.7167235494880546, + "grad_norm": 2.6698076725006104, + "learning_rate": 9.44254835039818e-05, + "loss": 5.9412, + "step": 7960 + }, + { + "epoch": 2.717064846416382, + "grad_norm": 2.622624635696411, + "learning_rate": 9.431171786120592e-05, + "loss": 5.906, + "step": 7961 + }, + { + "epoch": 2.71740614334471, + "grad_norm": 2.6831278800964355, + "learning_rate": 9.419795221843005e-05, + "loss": 5.8821, + "step": 7962 + }, + { + "epoch": 2.7177474402730377, + "grad_norm": 2.650803565979004, + "learning_rate": 9.408418657565415e-05, + "loss": 5.7527, + "step": 7963 + }, + { + "epoch": 2.718088737201365, + "grad_norm": 2.567662477493286, + "learning_rate": 9.397042093287827e-05, + "loss": 5.8555, + "step": 7964 + }, + { + "epoch": 2.718430034129693, + "grad_norm": 2.6113736629486084, + "learning_rate": 9.385665529010239e-05, + "loss": 5.5252, + "step": 7965 + }, + { + "epoch": 2.7187713310580204, + "grad_norm": 2.6406290531158447, + "learning_rate": 9.374288964732651e-05, + "loss": 6.5528, + "step": 7966 + }, + { + "epoch": 2.7191126279863482, + "grad_norm": 2.571793556213379, + "learning_rate": 9.362912400455063e-05, + "loss": 5.9012, + "step": 7967 + }, + { + "epoch": 2.7194539249146756, + "grad_norm": 2.543337345123291, + "learning_rate": 9.351535836177475e-05, + "loss": 5.6792, + "step": 7968 + }, + { + "epoch": 2.7197952218430035, + "grad_norm": 2.667567014694214, + "learning_rate": 9.340159271899886e-05, + "loss": 6.1126, + "step": 7969 + }, + { + "epoch": 2.720136518771331, + "grad_norm": 2.7619335651397705, + "learning_rate": 9.328782707622298e-05, + "loss": 4.8842, + "step": 7970 + }, + { + "epoch": 2.7204778156996587, + "grad_norm": 2.702486753463745, + "learning_rate": 9.31740614334471e-05, + "loss": 5.1622, + "step": 7971 + }, + { + "epoch": 2.7208191126279866, + "grad_norm": 2.543321132659912, + "learning_rate": 9.306029579067122e-05, + "loss": 6.5731, + "step": 7972 + }, + { + "epoch": 2.721160409556314, + "grad_norm": 2.6276233196258545, + "learning_rate": 9.294653014789534e-05, + "loss": 6.0075, + "step": 7973 + }, + { + "epoch": 2.7215017064846414, + "grad_norm": 2.569683074951172, + "learning_rate": 9.283276450511946e-05, + "loss": 5.8531, + "step": 7974 + }, + { + "epoch": 2.7218430034129693, + "grad_norm": 2.636974334716797, + "learning_rate": 9.271899886234357e-05, + "loss": 5.6186, + "step": 7975 + }, + { + "epoch": 2.722184300341297, + "grad_norm": 2.7969841957092285, + "learning_rate": 9.26052332195677e-05, + "loss": 5.0165, + "step": 7976 + }, + { + "epoch": 2.7225255972696245, + "grad_norm": 2.6524500846862793, + "learning_rate": 9.24914675767918e-05, + "loss": 5.6564, + "step": 7977 + }, + { + "epoch": 2.7228668941979524, + "grad_norm": 2.553088903427124, + "learning_rate": 9.237770193401593e-05, + "loss": 6.0369, + "step": 7978 + }, + { + "epoch": 2.7232081911262798, + "grad_norm": 2.4714152812957764, + "learning_rate": 9.226393629124005e-05, + "loss": 5.8547, + "step": 7979 + }, + { + "epoch": 2.7235494880546076, + "grad_norm": 2.634051561355591, + "learning_rate": 9.215017064846417e-05, + "loss": 6.1899, + "step": 7980 + }, + { + "epoch": 2.723890784982935, + "grad_norm": 2.6114561557769775, + "learning_rate": 9.203640500568829e-05, + "loss": 6.6533, + "step": 7981 + }, + { + "epoch": 2.724232081911263, + "grad_norm": 2.653291702270508, + "learning_rate": 9.192263936291239e-05, + "loss": 6.1926, + "step": 7982 + }, + { + "epoch": 2.7245733788395903, + "grad_norm": 2.614102363586426, + "learning_rate": 9.180887372013652e-05, + "loss": 6.1669, + "step": 7983 + }, + { + "epoch": 2.724914675767918, + "grad_norm": 2.6618130207061768, + "learning_rate": 9.169510807736064e-05, + "loss": 4.0645, + "step": 7984 + }, + { + "epoch": 2.725255972696246, + "grad_norm": 2.57953143119812, + "learning_rate": 9.158134243458476e-05, + "loss": 6.201, + "step": 7985 + }, + { + "epoch": 2.7255972696245734, + "grad_norm": 2.664710760116577, + "learning_rate": 9.146757679180888e-05, + "loss": 6.319, + "step": 7986 + }, + { + "epoch": 2.725938566552901, + "grad_norm": 2.681464672088623, + "learning_rate": 9.135381114903298e-05, + "loss": 5.8685, + "step": 7987 + }, + { + "epoch": 2.7262798634812286, + "grad_norm": 2.5687036514282227, + "learning_rate": 9.124004550625712e-05, + "loss": 6.3869, + "step": 7988 + }, + { + "epoch": 2.7266211604095565, + "grad_norm": 2.604609489440918, + "learning_rate": 9.112627986348123e-05, + "loss": 5.8885, + "step": 7989 + }, + { + "epoch": 2.726962457337884, + "grad_norm": 2.5861759185791016, + "learning_rate": 9.101251422070535e-05, + "loss": 5.1008, + "step": 7990 + }, + { + "epoch": 2.7273037542662117, + "grad_norm": 2.619473934173584, + "learning_rate": 9.089874857792947e-05, + "loss": 6.0467, + "step": 7991 + }, + { + "epoch": 2.727645051194539, + "grad_norm": 2.6422886848449707, + "learning_rate": 9.078498293515358e-05, + "loss": 6.595, + "step": 7992 + }, + { + "epoch": 2.727986348122867, + "grad_norm": 2.61552357673645, + "learning_rate": 9.067121729237771e-05, + "loss": 5.6713, + "step": 7993 + }, + { + "epoch": 2.7283276450511944, + "grad_norm": 2.6517817974090576, + "learning_rate": 9.055745164960181e-05, + "loss": 6.3159, + "step": 7994 + }, + { + "epoch": 2.7286689419795223, + "grad_norm": 2.8126118183135986, + "learning_rate": 9.044368600682594e-05, + "loss": 4.8574, + "step": 7995 + }, + { + "epoch": 2.7290102389078497, + "grad_norm": 2.5647075176239014, + "learning_rate": 9.032992036405006e-05, + "loss": 5.692, + "step": 7996 + }, + { + "epoch": 2.7293515358361775, + "grad_norm": 2.641845941543579, + "learning_rate": 9.021615472127417e-05, + "loss": 6.3246, + "step": 7997 + }, + { + "epoch": 2.7296928327645054, + "grad_norm": 2.609525203704834, + "learning_rate": 9.01023890784983e-05, + "loss": 6.4175, + "step": 7998 + }, + { + "epoch": 2.7300341296928328, + "grad_norm": 2.601841688156128, + "learning_rate": 8.99886234357224e-05, + "loss": 6.5798, + "step": 7999 + }, + { + "epoch": 2.73037542662116, + "grad_norm": 2.6355714797973633, + "learning_rate": 8.987485779294654e-05, + "loss": 6.3919, + "step": 8000 + }, + { + "epoch": 2.730716723549488, + "grad_norm": 2.583918333053589, + "learning_rate": 8.976109215017066e-05, + "loss": 6.3685, + "step": 8001 + }, + { + "epoch": 2.731058020477816, + "grad_norm": 2.569997549057007, + "learning_rate": 8.964732650739476e-05, + "loss": 6.1898, + "step": 8002 + }, + { + "epoch": 2.7313993174061433, + "grad_norm": 2.597790002822876, + "learning_rate": 8.953356086461889e-05, + "loss": 6.2867, + "step": 8003 + }, + { + "epoch": 2.731740614334471, + "grad_norm": 2.638692855834961, + "learning_rate": 8.9419795221843e-05, + "loss": 5.6171, + "step": 8004 + }, + { + "epoch": 2.7320819112627985, + "grad_norm": 2.7340500354766846, + "learning_rate": 8.930602957906713e-05, + "loss": 4.4383, + "step": 8005 + }, + { + "epoch": 2.7324232081911264, + "grad_norm": 2.652092933654785, + "learning_rate": 8.919226393629125e-05, + "loss": 6.168, + "step": 8006 + }, + { + "epoch": 2.732764505119454, + "grad_norm": 2.593048572540283, + "learning_rate": 8.907849829351535e-05, + "loss": 6.4116, + "step": 8007 + }, + { + "epoch": 2.7331058020477816, + "grad_norm": 2.6806387901306152, + "learning_rate": 8.896473265073949e-05, + "loss": 5.6523, + "step": 8008 + }, + { + "epoch": 2.733447098976109, + "grad_norm": 2.5880985260009766, + "learning_rate": 8.885096700796359e-05, + "loss": 6.137, + "step": 8009 + }, + { + "epoch": 2.733788395904437, + "grad_norm": 2.6142632961273193, + "learning_rate": 8.873720136518772e-05, + "loss": 6.0497, + "step": 8010 + }, + { + "epoch": 2.7341296928327647, + "grad_norm": 2.7263357639312744, + "learning_rate": 8.862343572241183e-05, + "loss": 6.2217, + "step": 8011 + }, + { + "epoch": 2.734470989761092, + "grad_norm": 2.6077632904052734, + "learning_rate": 8.850967007963595e-05, + "loss": 5.723, + "step": 8012 + }, + { + "epoch": 2.7348122866894196, + "grad_norm": 2.4901204109191895, + "learning_rate": 8.839590443686008e-05, + "loss": 5.1386, + "step": 8013 + }, + { + "epoch": 2.7351535836177474, + "grad_norm": 2.620048999786377, + "learning_rate": 8.828213879408418e-05, + "loss": 6.5681, + "step": 8014 + }, + { + "epoch": 2.7354948805460753, + "grad_norm": 2.5127553939819336, + "learning_rate": 8.816837315130831e-05, + "loss": 3.7144, + "step": 8015 + }, + { + "epoch": 2.7358361774744027, + "grad_norm": 2.677717447280884, + "learning_rate": 8.805460750853242e-05, + "loss": 5.7862, + "step": 8016 + }, + { + "epoch": 2.7361774744027305, + "grad_norm": 2.5813772678375244, + "learning_rate": 8.794084186575654e-05, + "loss": 6.38, + "step": 8017 + }, + { + "epoch": 2.736518771331058, + "grad_norm": 2.593780279159546, + "learning_rate": 8.782707622298067e-05, + "loss": 6.2665, + "step": 8018 + }, + { + "epoch": 2.7368600682593858, + "grad_norm": 2.6182265281677246, + "learning_rate": 8.771331058020478e-05, + "loss": 5.6717, + "step": 8019 + }, + { + "epoch": 2.737201365187713, + "grad_norm": 2.615816593170166, + "learning_rate": 8.759954493742891e-05, + "loss": 6.4123, + "step": 8020 + }, + { + "epoch": 2.737542662116041, + "grad_norm": 2.6214540004730225, + "learning_rate": 8.748577929465301e-05, + "loss": 6.0376, + "step": 8021 + }, + { + "epoch": 2.7378839590443684, + "grad_norm": 2.631145715713501, + "learning_rate": 8.737201365187713e-05, + "loss": 5.6407, + "step": 8022 + }, + { + "epoch": 2.7382252559726963, + "grad_norm": 2.52872371673584, + "learning_rate": 8.725824800910126e-05, + "loss": 6.0498, + "step": 8023 + }, + { + "epoch": 2.738566552901024, + "grad_norm": 2.6615374088287354, + "learning_rate": 8.714448236632537e-05, + "loss": 5.4418, + "step": 8024 + }, + { + "epoch": 2.7389078498293515, + "grad_norm": 4.1157917976379395, + "learning_rate": 8.70307167235495e-05, + "loss": 4.1903, + "step": 8025 + }, + { + "epoch": 2.739249146757679, + "grad_norm": 2.548863410949707, + "learning_rate": 8.69169510807736e-05, + "loss": 6.1133, + "step": 8026 + }, + { + "epoch": 2.739590443686007, + "grad_norm": 2.564734935760498, + "learning_rate": 8.680318543799772e-05, + "loss": 6.0158, + "step": 8027 + }, + { + "epoch": 2.7399317406143346, + "grad_norm": 2.6325366497039795, + "learning_rate": 8.668941979522184e-05, + "loss": 6.132, + "step": 8028 + }, + { + "epoch": 2.740273037542662, + "grad_norm": 2.528510808944702, + "learning_rate": 8.657565415244596e-05, + "loss": 6.0174, + "step": 8029 + }, + { + "epoch": 2.74061433447099, + "grad_norm": 2.521850347518921, + "learning_rate": 8.646188850967009e-05, + "loss": 5.7658, + "step": 8030 + }, + { + "epoch": 2.7409556313993173, + "grad_norm": 2.5946974754333496, + "learning_rate": 8.63481228668942e-05, + "loss": 6.4626, + "step": 8031 + }, + { + "epoch": 2.741296928327645, + "grad_norm": 2.647458076477051, + "learning_rate": 8.623435722411832e-05, + "loss": 6.2879, + "step": 8032 + }, + { + "epoch": 2.7416382252559726, + "grad_norm": 2.7117865085601807, + "learning_rate": 8.612059158134243e-05, + "loss": 5.5557, + "step": 8033 + }, + { + "epoch": 2.7419795221843004, + "grad_norm": 2.601431131362915, + "learning_rate": 8.600682593856655e-05, + "loss": 5.8308, + "step": 8034 + }, + { + "epoch": 2.742320819112628, + "grad_norm": 2.799042224884033, + "learning_rate": 8.589306029579068e-05, + "loss": 5.5987, + "step": 8035 + }, + { + "epoch": 2.7426621160409557, + "grad_norm": 2.671846389770508, + "learning_rate": 8.577929465301479e-05, + "loss": 6.4048, + "step": 8036 + }, + { + "epoch": 2.7430034129692835, + "grad_norm": 2.5593509674072266, + "learning_rate": 8.566552901023891e-05, + "loss": 5.5486, + "step": 8037 + }, + { + "epoch": 2.743344709897611, + "grad_norm": 2.5567145347595215, + "learning_rate": 8.555176336746303e-05, + "loss": 6.0707, + "step": 8038 + }, + { + "epoch": 2.7436860068259383, + "grad_norm": 2.59529185295105, + "learning_rate": 8.543799772468715e-05, + "loss": 5.8477, + "step": 8039 + }, + { + "epoch": 2.744027303754266, + "grad_norm": 2.6873974800109863, + "learning_rate": 8.532423208191128e-05, + "loss": 6.008, + "step": 8040 + }, + { + "epoch": 2.744368600682594, + "grad_norm": 2.6299967765808105, + "learning_rate": 8.521046643913538e-05, + "loss": 5.65, + "step": 8041 + }, + { + "epoch": 2.7447098976109214, + "grad_norm": 2.6703577041625977, + "learning_rate": 8.50967007963595e-05, + "loss": 6.3595, + "step": 8042 + }, + { + "epoch": 2.7450511945392493, + "grad_norm": 2.5916976928710938, + "learning_rate": 8.498293515358362e-05, + "loss": 6.1119, + "step": 8043 + }, + { + "epoch": 2.7453924914675767, + "grad_norm": 2.577881097793579, + "learning_rate": 8.486916951080774e-05, + "loss": 5.5663, + "step": 8044 + }, + { + "epoch": 2.7457337883959045, + "grad_norm": 2.5749382972717285, + "learning_rate": 8.475540386803186e-05, + "loss": 5.7458, + "step": 8045 + }, + { + "epoch": 2.746075085324232, + "grad_norm": 2.5341036319732666, + "learning_rate": 8.464163822525597e-05, + "loss": 5.6241, + "step": 8046 + }, + { + "epoch": 2.74641638225256, + "grad_norm": 2.6867024898529053, + "learning_rate": 8.452787258248009e-05, + "loss": 6.3056, + "step": 8047 + }, + { + "epoch": 2.746757679180887, + "grad_norm": 2.5458154678344727, + "learning_rate": 8.441410693970421e-05, + "loss": 6.0838, + "step": 8048 + }, + { + "epoch": 2.747098976109215, + "grad_norm": 2.6471140384674072, + "learning_rate": 8.430034129692833e-05, + "loss": 5.9484, + "step": 8049 + }, + { + "epoch": 2.747440273037543, + "grad_norm": 2.6072444915771484, + "learning_rate": 8.418657565415245e-05, + "loss": 6.4437, + "step": 8050 + }, + { + "epoch": 2.7477815699658703, + "grad_norm": 2.5881097316741943, + "learning_rate": 8.407281001137657e-05, + "loss": 5.7884, + "step": 8051 + }, + { + "epoch": 2.7481228668941977, + "grad_norm": 2.7298855781555176, + "learning_rate": 8.395904436860069e-05, + "loss": 4.845, + "step": 8052 + }, + { + "epoch": 2.7484641638225256, + "grad_norm": 2.646090507507324, + "learning_rate": 8.38452787258248e-05, + "loss": 6.2684, + "step": 8053 + }, + { + "epoch": 2.7488054607508534, + "grad_norm": 2.5868003368377686, + "learning_rate": 8.373151308304892e-05, + "loss": 5.8335, + "step": 8054 + }, + { + "epoch": 2.749146757679181, + "grad_norm": 2.60349440574646, + "learning_rate": 8.361774744027303e-05, + "loss": 5.6704, + "step": 8055 + }, + { + "epoch": 2.7494880546075087, + "grad_norm": 2.5340113639831543, + "learning_rate": 8.350398179749716e-05, + "loss": 5.5741, + "step": 8056 + }, + { + "epoch": 2.749829351535836, + "grad_norm": 2.6440858840942383, + "learning_rate": 8.339021615472128e-05, + "loss": 5.4331, + "step": 8057 + }, + { + "epoch": 2.750170648464164, + "grad_norm": 2.588313579559326, + "learning_rate": 8.32764505119454e-05, + "loss": 6.2252, + "step": 8058 + }, + { + "epoch": 2.7505119453924913, + "grad_norm": 2.574767827987671, + "learning_rate": 8.316268486916952e-05, + "loss": 6.3772, + "step": 8059 + }, + { + "epoch": 2.750853242320819, + "grad_norm": 2.5740933418273926, + "learning_rate": 8.304891922639362e-05, + "loss": 6.231, + "step": 8060 + }, + { + "epoch": 2.7511945392491466, + "grad_norm": 2.542821168899536, + "learning_rate": 8.293515358361775e-05, + "loss": 5.886, + "step": 8061 + }, + { + "epoch": 2.7515358361774744, + "grad_norm": 2.542856454849243, + "learning_rate": 8.282138794084186e-05, + "loss": 5.9924, + "step": 8062 + }, + { + "epoch": 2.7518771331058023, + "grad_norm": 2.618867874145508, + "learning_rate": 8.270762229806599e-05, + "loss": 5.3548, + "step": 8063 + }, + { + "epoch": 2.7522184300341297, + "grad_norm": 2.605292797088623, + "learning_rate": 8.259385665529011e-05, + "loss": 5.7645, + "step": 8064 + }, + { + "epoch": 2.752559726962457, + "grad_norm": 2.6475188732147217, + "learning_rate": 8.248009101251421e-05, + "loss": 6.4186, + "step": 8065 + }, + { + "epoch": 2.752901023890785, + "grad_norm": 2.5117578506469727, + "learning_rate": 8.236632536973834e-05, + "loss": 5.997, + "step": 8066 + }, + { + "epoch": 2.753242320819113, + "grad_norm": 2.5080535411834717, + "learning_rate": 8.225255972696245e-05, + "loss": 6.1888, + "step": 8067 + }, + { + "epoch": 2.75358361774744, + "grad_norm": 2.5350992679595947, + "learning_rate": 8.213879408418658e-05, + "loss": 5.9308, + "step": 8068 + }, + { + "epoch": 2.753924914675768, + "grad_norm": 2.5473434925079346, + "learning_rate": 8.20250284414107e-05, + "loss": 6.1776, + "step": 8069 + }, + { + "epoch": 2.7542662116040955, + "grad_norm": 2.4866843223571777, + "learning_rate": 8.19112627986348e-05, + "loss": 4.3249, + "step": 8070 + }, + { + "epoch": 2.7546075085324233, + "grad_norm": 2.6929423809051514, + "learning_rate": 8.179749715585894e-05, + "loss": 4.4646, + "step": 8071 + }, + { + "epoch": 2.7549488054607507, + "grad_norm": 2.553462505340576, + "learning_rate": 8.168373151308304e-05, + "loss": 6.4205, + "step": 8072 + }, + { + "epoch": 2.7552901023890786, + "grad_norm": 2.5952062606811523, + "learning_rate": 8.156996587030717e-05, + "loss": 6.2428, + "step": 8073 + }, + { + "epoch": 2.755631399317406, + "grad_norm": 2.697514057159424, + "learning_rate": 8.145620022753129e-05, + "loss": 6.3499, + "step": 8074 + }, + { + "epoch": 2.755972696245734, + "grad_norm": 2.558379888534546, + "learning_rate": 8.13424345847554e-05, + "loss": 5.889, + "step": 8075 + }, + { + "epoch": 2.7563139931740617, + "grad_norm": 2.633807420730591, + "learning_rate": 8.122866894197953e-05, + "loss": 5.544, + "step": 8076 + }, + { + "epoch": 2.756655290102389, + "grad_norm": 2.470966100692749, + "learning_rate": 8.111490329920363e-05, + "loss": 4.4105, + "step": 8077 + }, + { + "epoch": 2.7569965870307165, + "grad_norm": 2.6610970497131348, + "learning_rate": 8.100113765642777e-05, + "loss": 6.4596, + "step": 8078 + }, + { + "epoch": 2.7573378839590443, + "grad_norm": 2.5555026531219482, + "learning_rate": 8.088737201365187e-05, + "loss": 6.3518, + "step": 8079 + }, + { + "epoch": 2.757679180887372, + "grad_norm": 2.5696256160736084, + "learning_rate": 8.077360637087599e-05, + "loss": 6.0327, + "step": 8080 + }, + { + "epoch": 2.7580204778156996, + "grad_norm": 2.542940855026245, + "learning_rate": 8.065984072810012e-05, + "loss": 5.54, + "step": 8081 + }, + { + "epoch": 2.7583617747440274, + "grad_norm": 2.5934388637542725, + "learning_rate": 8.054607508532423e-05, + "loss": 6.237, + "step": 8082 + }, + { + "epoch": 2.758703071672355, + "grad_norm": 2.5500986576080322, + "learning_rate": 8.043230944254836e-05, + "loss": 6.0471, + "step": 8083 + }, + { + "epoch": 2.7590443686006827, + "grad_norm": 2.556598663330078, + "learning_rate": 8.031854379977246e-05, + "loss": 5.7077, + "step": 8084 + }, + { + "epoch": 2.75938566552901, + "grad_norm": 2.6231422424316406, + "learning_rate": 8.020477815699658e-05, + "loss": 5.1179, + "step": 8085 + }, + { + "epoch": 2.759726962457338, + "grad_norm": 2.5738203525543213, + "learning_rate": 8.009101251422071e-05, + "loss": 6.3482, + "step": 8086 + }, + { + "epoch": 2.7600682593856654, + "grad_norm": 2.713271379470825, + "learning_rate": 7.997724687144482e-05, + "loss": 6.5092, + "step": 8087 + }, + { + "epoch": 2.760409556313993, + "grad_norm": 2.689073085784912, + "learning_rate": 7.986348122866895e-05, + "loss": 5.0591, + "step": 8088 + }, + { + "epoch": 2.760750853242321, + "grad_norm": 2.620436429977417, + "learning_rate": 7.974971558589306e-05, + "loss": 6.0436, + "step": 8089 + }, + { + "epoch": 2.7610921501706485, + "grad_norm": 2.5438833236694336, + "learning_rate": 7.963594994311717e-05, + "loss": 5.7818, + "step": 8090 + }, + { + "epoch": 2.761433447098976, + "grad_norm": 2.6242706775665283, + "learning_rate": 7.952218430034131e-05, + "loss": 5.8769, + "step": 8091 + }, + { + "epoch": 2.7617747440273037, + "grad_norm": 2.4857864379882812, + "learning_rate": 7.940841865756541e-05, + "loss": 5.7621, + "step": 8092 + }, + { + "epoch": 2.7621160409556316, + "grad_norm": 2.627063035964966, + "learning_rate": 7.929465301478954e-05, + "loss": 6.45, + "step": 8093 + }, + { + "epoch": 2.762457337883959, + "grad_norm": 2.4991109371185303, + "learning_rate": 7.918088737201365e-05, + "loss": 5.7273, + "step": 8094 + }, + { + "epoch": 2.762798634812287, + "grad_norm": 2.532963991165161, + "learning_rate": 7.906712172923777e-05, + "loss": 6.2703, + "step": 8095 + }, + { + "epoch": 2.7631399317406142, + "grad_norm": 2.5222792625427246, + "learning_rate": 7.895335608646189e-05, + "loss": 5.554, + "step": 8096 + }, + { + "epoch": 2.763481228668942, + "grad_norm": 2.6223628520965576, + "learning_rate": 7.8839590443686e-05, + "loss": 6.0185, + "step": 8097 + }, + { + "epoch": 2.7638225255972695, + "grad_norm": 2.598527669906616, + "learning_rate": 7.872582480091014e-05, + "loss": 5.8662, + "step": 8098 + }, + { + "epoch": 2.7641638225255973, + "grad_norm": 2.4972290992736816, + "learning_rate": 7.861205915813424e-05, + "loss": 6.2221, + "step": 8099 + }, + { + "epoch": 2.7645051194539247, + "grad_norm": 2.5884251594543457, + "learning_rate": 7.849829351535836e-05, + "loss": 6.2226, + "step": 8100 + }, + { + "epoch": 2.7648464163822526, + "grad_norm": 2.6210410594940186, + "learning_rate": 7.838452787258248e-05, + "loss": 5.378, + "step": 8101 + }, + { + "epoch": 2.7651877133105804, + "grad_norm": 2.380128860473633, + "learning_rate": 7.82707622298066e-05, + "loss": 4.1676, + "step": 8102 + }, + { + "epoch": 2.765529010238908, + "grad_norm": 2.6382617950439453, + "learning_rate": 7.815699658703073e-05, + "loss": 6.0014, + "step": 8103 + }, + { + "epoch": 2.7658703071672353, + "grad_norm": 2.6179182529449463, + "learning_rate": 7.804323094425483e-05, + "loss": 5.9414, + "step": 8104 + }, + { + "epoch": 2.766211604095563, + "grad_norm": 2.5637147426605225, + "learning_rate": 7.792946530147895e-05, + "loss": 5.5923, + "step": 8105 + }, + { + "epoch": 2.766552901023891, + "grad_norm": 2.6299054622650146, + "learning_rate": 7.781569965870307e-05, + "loss": 5.6322, + "step": 8106 + }, + { + "epoch": 2.7668941979522184, + "grad_norm": 2.7012157440185547, + "learning_rate": 7.770193401592719e-05, + "loss": 5.6976, + "step": 8107 + }, + { + "epoch": 2.767235494880546, + "grad_norm": 2.60090970993042, + "learning_rate": 7.758816837315132e-05, + "loss": 6.3271, + "step": 8108 + }, + { + "epoch": 2.7675767918088736, + "grad_norm": 2.5655224323272705, + "learning_rate": 7.747440273037543e-05, + "loss": 5.833, + "step": 8109 + }, + { + "epoch": 2.7679180887372015, + "grad_norm": 2.630000114440918, + "learning_rate": 7.736063708759954e-05, + "loss": 5.7833, + "step": 8110 + }, + { + "epoch": 2.768259385665529, + "grad_norm": 2.582270860671997, + "learning_rate": 7.724687144482366e-05, + "loss": 5.4806, + "step": 8111 + }, + { + "epoch": 2.7686006825938567, + "grad_norm": 2.481131076812744, + "learning_rate": 7.713310580204778e-05, + "loss": 4.7699, + "step": 8112 + }, + { + "epoch": 2.768941979522184, + "grad_norm": 1.8021996021270752, + "learning_rate": 7.701934015927191e-05, + "loss": 3.0427, + "step": 8113 + }, + { + "epoch": 2.769283276450512, + "grad_norm": 2.502584457397461, + "learning_rate": 7.690557451649602e-05, + "loss": 6.344, + "step": 8114 + }, + { + "epoch": 2.76962457337884, + "grad_norm": 2.3312814235687256, + "learning_rate": 7.679180887372014e-05, + "loss": 4.1687, + "step": 8115 + }, + { + "epoch": 2.7699658703071672, + "grad_norm": 2.548377752304077, + "learning_rate": 7.667804323094426e-05, + "loss": 5.0895, + "step": 8116 + }, + { + "epoch": 2.7703071672354946, + "grad_norm": 2.6071834564208984, + "learning_rate": 7.656427758816837e-05, + "loss": 5.0985, + "step": 8117 + }, + { + "epoch": 2.7706484641638225, + "grad_norm": 2.3951501846313477, + "learning_rate": 7.645051194539249e-05, + "loss": 4.862, + "step": 8118 + }, + { + "epoch": 2.7709897610921503, + "grad_norm": 2.610914945602417, + "learning_rate": 7.633674630261661e-05, + "loss": 6.137, + "step": 8119 + }, + { + "epoch": 2.7713310580204777, + "grad_norm": 2.4345929622650146, + "learning_rate": 7.622298065984073e-05, + "loss": 4.7642, + "step": 8120 + }, + { + "epoch": 2.7716723549488056, + "grad_norm": 2.550601005554199, + "learning_rate": 7.610921501706485e-05, + "loss": 5.3778, + "step": 8121 + }, + { + "epoch": 2.772013651877133, + "grad_norm": 2.6055097579956055, + "learning_rate": 7.599544937428897e-05, + "loss": 5.7184, + "step": 8122 + }, + { + "epoch": 2.772354948805461, + "grad_norm": 2.512861490249634, + "learning_rate": 7.588168373151309e-05, + "loss": 5.2337, + "step": 8123 + }, + { + "epoch": 2.7726962457337883, + "grad_norm": 2.71311354637146, + "learning_rate": 7.57679180887372e-05, + "loss": 6.3602, + "step": 8124 + }, + { + "epoch": 2.773037542662116, + "grad_norm": 2.6916327476501465, + "learning_rate": 7.565415244596132e-05, + "loss": 6.0187, + "step": 8125 + }, + { + "epoch": 2.7733788395904435, + "grad_norm": 2.6451501846313477, + "learning_rate": 7.554038680318544e-05, + "loss": 5.8547, + "step": 8126 + }, + { + "epoch": 2.7737201365187714, + "grad_norm": 2.736882209777832, + "learning_rate": 7.542662116040956e-05, + "loss": 6.2056, + "step": 8127 + }, + { + "epoch": 2.774061433447099, + "grad_norm": 2.5844454765319824, + "learning_rate": 7.531285551763368e-05, + "loss": 5.795, + "step": 8128 + }, + { + "epoch": 2.7744027303754266, + "grad_norm": 2.4675440788269043, + "learning_rate": 7.51990898748578e-05, + "loss": 5.4714, + "step": 8129 + }, + { + "epoch": 2.774744027303754, + "grad_norm": 2.4359679222106934, + "learning_rate": 7.508532423208191e-05, + "loss": 5.8461, + "step": 8130 + }, + { + "epoch": 2.775085324232082, + "grad_norm": 2.678617477416992, + "learning_rate": 7.497155858930603e-05, + "loss": 5.3954, + "step": 8131 + }, + { + "epoch": 2.7754266211604097, + "grad_norm": 2.575972557067871, + "learning_rate": 7.485779294653015e-05, + "loss": 6.3859, + "step": 8132 + }, + { + "epoch": 2.775767918088737, + "grad_norm": 2.5873968601226807, + "learning_rate": 7.474402730375427e-05, + "loss": 6.2744, + "step": 8133 + }, + { + "epoch": 2.776109215017065, + "grad_norm": 2.5679585933685303, + "learning_rate": 7.463026166097839e-05, + "loss": 6.3518, + "step": 8134 + }, + { + "epoch": 2.7764505119453924, + "grad_norm": 2.5673532485961914, + "learning_rate": 7.45164960182025e-05, + "loss": 5.3267, + "step": 8135 + }, + { + "epoch": 2.7767918088737202, + "grad_norm": 2.5870144367218018, + "learning_rate": 7.440273037542663e-05, + "loss": 5.5847, + "step": 8136 + }, + { + "epoch": 2.7771331058020476, + "grad_norm": 2.542771100997925, + "learning_rate": 7.428896473265074e-05, + "loss": 4.9482, + "step": 8137 + }, + { + "epoch": 2.7774744027303755, + "grad_norm": 2.619039535522461, + "learning_rate": 7.417519908987486e-05, + "loss": 6.4865, + "step": 8138 + }, + { + "epoch": 2.777815699658703, + "grad_norm": 2.628445863723755, + "learning_rate": 7.406143344709898e-05, + "loss": 5.8237, + "step": 8139 + }, + { + "epoch": 2.7781569965870307, + "grad_norm": 2.5685930252075195, + "learning_rate": 7.394766780432309e-05, + "loss": 6.1367, + "step": 8140 + }, + { + "epoch": 2.7784982935153586, + "grad_norm": 2.6697919368743896, + "learning_rate": 7.383390216154722e-05, + "loss": 5.8577, + "step": 8141 + }, + { + "epoch": 2.778839590443686, + "grad_norm": 2.6898036003112793, + "learning_rate": 7.372013651877134e-05, + "loss": 6.3594, + "step": 8142 + }, + { + "epoch": 2.7791808873720134, + "grad_norm": 2.5270895957946777, + "learning_rate": 7.360637087599546e-05, + "loss": 6.1053, + "step": 8143 + }, + { + "epoch": 2.7795221843003413, + "grad_norm": 2.6355865001678467, + "learning_rate": 7.349260523321957e-05, + "loss": 4.7732, + "step": 8144 + }, + { + "epoch": 2.779863481228669, + "grad_norm": 2.624152898788452, + "learning_rate": 7.337883959044368e-05, + "loss": 6.3116, + "step": 8145 + }, + { + "epoch": 2.7802047781569965, + "grad_norm": 2.588355302810669, + "learning_rate": 7.326507394766781e-05, + "loss": 5.3328, + "step": 8146 + }, + { + "epoch": 2.7805460750853244, + "grad_norm": 2.633571147918701, + "learning_rate": 7.315130830489193e-05, + "loss": 6.1316, + "step": 8147 + }, + { + "epoch": 2.7808873720136518, + "grad_norm": 2.573197364807129, + "learning_rate": 7.303754266211605e-05, + "loss": 6.7343, + "step": 8148 + }, + { + "epoch": 2.7812286689419796, + "grad_norm": 2.558769464492798, + "learning_rate": 7.292377701934017e-05, + "loss": 5.357, + "step": 8149 + }, + { + "epoch": 2.781569965870307, + "grad_norm": 2.6176862716674805, + "learning_rate": 7.281001137656427e-05, + "loss": 5.6288, + "step": 8150 + }, + { + "epoch": 2.781911262798635, + "grad_norm": 2.510423183441162, + "learning_rate": 7.26962457337884e-05, + "loss": 6.4119, + "step": 8151 + }, + { + "epoch": 2.7822525597269623, + "grad_norm": 2.7808709144592285, + "learning_rate": 7.258248009101251e-05, + "loss": 4.3593, + "step": 8152 + }, + { + "epoch": 2.78259385665529, + "grad_norm": 2.575263261795044, + "learning_rate": 7.246871444823664e-05, + "loss": 6.4656, + "step": 8153 + }, + { + "epoch": 2.782935153583618, + "grad_norm": 2.588742256164551, + "learning_rate": 7.235494880546076e-05, + "loss": 5.9384, + "step": 8154 + }, + { + "epoch": 2.7832764505119454, + "grad_norm": 2.6123979091644287, + "learning_rate": 7.224118316268486e-05, + "loss": 6.0583, + "step": 8155 + }, + { + "epoch": 2.783617747440273, + "grad_norm": 2.4757981300354004, + "learning_rate": 7.2127417519909e-05, + "loss": 5.5663, + "step": 8156 + }, + { + "epoch": 2.7839590443686006, + "grad_norm": 2.5429530143737793, + "learning_rate": 7.20136518771331e-05, + "loss": 5.9294, + "step": 8157 + }, + { + "epoch": 2.7843003412969285, + "grad_norm": 2.5413453578948975, + "learning_rate": 7.189988623435722e-05, + "loss": 5.9371, + "step": 8158 + }, + { + "epoch": 2.784641638225256, + "grad_norm": 2.5826592445373535, + "learning_rate": 7.178612059158135e-05, + "loss": 5.9754, + "step": 8159 + }, + { + "epoch": 2.7849829351535837, + "grad_norm": 2.4613709449768066, + "learning_rate": 7.167235494880546e-05, + "loss": 5.7554, + "step": 8160 + }, + { + "epoch": 2.785324232081911, + "grad_norm": 2.59067964553833, + "learning_rate": 7.155858930602959e-05, + "loss": 6.3009, + "step": 8161 + }, + { + "epoch": 2.785665529010239, + "grad_norm": 2.543574810028076, + "learning_rate": 7.144482366325369e-05, + "loss": 5.7074, + "step": 8162 + }, + { + "epoch": 2.7860068259385664, + "grad_norm": 2.595146656036377, + "learning_rate": 7.133105802047781e-05, + "loss": 6.4757, + "step": 8163 + }, + { + "epoch": 2.7863481228668943, + "grad_norm": 5.569501876831055, + "learning_rate": 7.121729237770194e-05, + "loss": 4.9309, + "step": 8164 + }, + { + "epoch": 2.7866894197952217, + "grad_norm": 2.49450945854187, + "learning_rate": 7.110352673492605e-05, + "loss": 5.1546, + "step": 8165 + }, + { + "epoch": 2.7870307167235495, + "grad_norm": 2.642744541168213, + "learning_rate": 7.098976109215018e-05, + "loss": 6.0856, + "step": 8166 + }, + { + "epoch": 2.7873720136518774, + "grad_norm": 2.5893845558166504, + "learning_rate": 7.087599544937429e-05, + "loss": 6.0408, + "step": 8167 + }, + { + "epoch": 2.7877133105802048, + "grad_norm": 2.603260040283203, + "learning_rate": 7.07622298065984e-05, + "loss": 5.7834, + "step": 8168 + }, + { + "epoch": 2.788054607508532, + "grad_norm": 2.669589042663574, + "learning_rate": 7.064846416382252e-05, + "loss": 5.6021, + "step": 8169 + }, + { + "epoch": 2.78839590443686, + "grad_norm": 2.57565975189209, + "learning_rate": 7.053469852104664e-05, + "loss": 5.8934, + "step": 8170 + }, + { + "epoch": 2.788737201365188, + "grad_norm": 2.455730438232422, + "learning_rate": 7.042093287827077e-05, + "loss": 5.5977, + "step": 8171 + }, + { + "epoch": 2.7890784982935153, + "grad_norm": 2.560124635696411, + "learning_rate": 7.030716723549488e-05, + "loss": 5.4134, + "step": 8172 + }, + { + "epoch": 2.789419795221843, + "grad_norm": 2.573716163635254, + "learning_rate": 7.0193401592719e-05, + "loss": 5.9382, + "step": 8173 + }, + { + "epoch": 2.7897610921501705, + "grad_norm": 2.4947168827056885, + "learning_rate": 7.007963594994311e-05, + "loss": 5.3002, + "step": 8174 + }, + { + "epoch": 2.7901023890784984, + "grad_norm": 2.6110596656799316, + "learning_rate": 6.996587030716723e-05, + "loss": 6.0533, + "step": 8175 + }, + { + "epoch": 2.790443686006826, + "grad_norm": 2.5457863807678223, + "learning_rate": 6.985210466439137e-05, + "loss": 6.1391, + "step": 8176 + }, + { + "epoch": 2.7907849829351536, + "grad_norm": 2.6228179931640625, + "learning_rate": 6.973833902161547e-05, + "loss": 6.0603, + "step": 8177 + }, + { + "epoch": 2.791126279863481, + "grad_norm": 2.5657594203948975, + "learning_rate": 6.962457337883959e-05, + "loss": 5.8292, + "step": 8178 + }, + { + "epoch": 2.791467576791809, + "grad_norm": 2.512878894805908, + "learning_rate": 6.951080773606371e-05, + "loss": 5.5296, + "step": 8179 + }, + { + "epoch": 2.7918088737201368, + "grad_norm": 2.526179075241089, + "learning_rate": 6.939704209328783e-05, + "loss": 5.4596, + "step": 8180 + }, + { + "epoch": 2.792150170648464, + "grad_norm": 2.5494656562805176, + "learning_rate": 6.928327645051196e-05, + "loss": 5.7413, + "step": 8181 + }, + { + "epoch": 2.7924914675767916, + "grad_norm": 2.623551368713379, + "learning_rate": 6.916951080773606e-05, + "loss": 5.9198, + "step": 8182 + }, + { + "epoch": 2.7928327645051194, + "grad_norm": 2.609656572341919, + "learning_rate": 6.905574516496018e-05, + "loss": 6.1618, + "step": 8183 + }, + { + "epoch": 2.7931740614334473, + "grad_norm": 3.3349082469940186, + "learning_rate": 6.89419795221843e-05, + "loss": 4.9097, + "step": 8184 + }, + { + "epoch": 2.7935153583617747, + "grad_norm": 2.6144847869873047, + "learning_rate": 6.882821387940842e-05, + "loss": 6.3871, + "step": 8185 + }, + { + "epoch": 2.7938566552901025, + "grad_norm": 2.5866599082946777, + "learning_rate": 6.871444823663254e-05, + "loss": 5.3323, + "step": 8186 + }, + { + "epoch": 2.79419795221843, + "grad_norm": 2.489753246307373, + "learning_rate": 6.860068259385666e-05, + "loss": 5.2399, + "step": 8187 + }, + { + "epoch": 2.7945392491467578, + "grad_norm": 2.5292809009552, + "learning_rate": 6.848691695108077e-05, + "loss": 4.0588, + "step": 8188 + }, + { + "epoch": 2.794880546075085, + "grad_norm": 2.519035577774048, + "learning_rate": 6.837315130830489e-05, + "loss": 5.8272, + "step": 8189 + }, + { + "epoch": 2.795221843003413, + "grad_norm": 2.5505032539367676, + "learning_rate": 6.825938566552901e-05, + "loss": 6.0068, + "step": 8190 + }, + { + "epoch": 2.7955631399317404, + "grad_norm": 2.5838968753814697, + "learning_rate": 6.814562002275313e-05, + "loss": 6.1091, + "step": 8191 + }, + { + "epoch": 2.7959044368600683, + "grad_norm": 4.409801006317139, + "learning_rate": 6.803185437997725e-05, + "loss": 4.6617, + "step": 8192 + }, + { + "epoch": 2.796245733788396, + "grad_norm": 2.558980703353882, + "learning_rate": 6.791808873720137e-05, + "loss": 5.8792, + "step": 8193 + }, + { + "epoch": 2.7965870307167235, + "grad_norm": 2.547933340072632, + "learning_rate": 6.780432309442548e-05, + "loss": 6.102, + "step": 8194 + }, + { + "epoch": 2.796928327645051, + "grad_norm": 2.719900608062744, + "learning_rate": 6.76905574516496e-05, + "loss": 5.5898, + "step": 8195 + }, + { + "epoch": 2.797269624573379, + "grad_norm": 2.68369722366333, + "learning_rate": 6.757679180887372e-05, + "loss": 5.1553, + "step": 8196 + }, + { + "epoch": 2.7976109215017066, + "grad_norm": 2.6060352325439453, + "learning_rate": 6.746302616609784e-05, + "loss": 6.2941, + "step": 8197 + }, + { + "epoch": 2.797952218430034, + "grad_norm": 2.6711232662200928, + "learning_rate": 6.734926052332196e-05, + "loss": 6.4384, + "step": 8198 + }, + { + "epoch": 2.798293515358362, + "grad_norm": 2.690762996673584, + "learning_rate": 6.723549488054608e-05, + "loss": 6.0928, + "step": 8199 + }, + { + "epoch": 2.7986348122866893, + "grad_norm": 2.6682162284851074, + "learning_rate": 6.71217292377702e-05, + "loss": 6.2376, + "step": 8200 + }, + { + "epoch": 2.798976109215017, + "grad_norm": 2.683518171310425, + "learning_rate": 6.700796359499431e-05, + "loss": 6.319, + "step": 8201 + }, + { + "epoch": 2.7993174061433446, + "grad_norm": 2.5631539821624756, + "learning_rate": 6.689419795221843e-05, + "loss": 6.1925, + "step": 8202 + }, + { + "epoch": 2.7996587030716724, + "grad_norm": 2.5291433334350586, + "learning_rate": 6.678043230944254e-05, + "loss": 6.2034, + "step": 8203 + }, + { + "epoch": 2.8, + "grad_norm": 2.4929697513580322, + "learning_rate": 6.666666666666667e-05, + "loss": 5.6572, + "step": 8204 + }, + { + "epoch": 2.8003412969283277, + "grad_norm": 2.6479387283325195, + "learning_rate": 6.655290102389079e-05, + "loss": 5.844, + "step": 8205 + }, + { + "epoch": 2.8006825938566555, + "grad_norm": 3.730424404144287, + "learning_rate": 6.643913538111491e-05, + "loss": 3.1324, + "step": 8206 + }, + { + "epoch": 2.801023890784983, + "grad_norm": 2.496727228164673, + "learning_rate": 6.632536973833903e-05, + "loss": 5.9789, + "step": 8207 + }, + { + "epoch": 2.8013651877133103, + "grad_norm": 2.9955997467041016, + "learning_rate": 6.621160409556313e-05, + "loss": 4.9055, + "step": 8208 + }, + { + "epoch": 2.801706484641638, + "grad_norm": 2.6087541580200195, + "learning_rate": 6.609783845278726e-05, + "loss": 6.3396, + "step": 8209 + }, + { + "epoch": 2.802047781569966, + "grad_norm": 2.530484676361084, + "learning_rate": 6.598407281001138e-05, + "loss": 5.6127, + "step": 8210 + }, + { + "epoch": 2.8023890784982934, + "grad_norm": 2.2731897830963135, + "learning_rate": 6.58703071672355e-05, + "loss": 3.3329, + "step": 8211 + }, + { + "epoch": 2.8027303754266213, + "grad_norm": 2.5415871143341064, + "learning_rate": 6.575654152445962e-05, + "loss": 5.8304, + "step": 8212 + }, + { + "epoch": 2.8030716723549487, + "grad_norm": 2.5074963569641113, + "learning_rate": 6.564277588168372e-05, + "loss": 5.6721, + "step": 8213 + }, + { + "epoch": 2.8034129692832765, + "grad_norm": 2.5421886444091797, + "learning_rate": 6.552901023890785e-05, + "loss": 6.2318, + "step": 8214 + }, + { + "epoch": 2.803754266211604, + "grad_norm": 2.5332021713256836, + "learning_rate": 6.541524459613197e-05, + "loss": 5.7743, + "step": 8215 + }, + { + "epoch": 2.804095563139932, + "grad_norm": 2.539001226425171, + "learning_rate": 6.530147895335609e-05, + "loss": 5.5197, + "step": 8216 + }, + { + "epoch": 2.804436860068259, + "grad_norm": 2.585606336593628, + "learning_rate": 6.518771331058021e-05, + "loss": 5.9402, + "step": 8217 + }, + { + "epoch": 2.804778156996587, + "grad_norm": 2.5669174194335938, + "learning_rate": 6.507394766780432e-05, + "loss": 5.8436, + "step": 8218 + }, + { + "epoch": 2.805119453924915, + "grad_norm": 2.6501104831695557, + "learning_rate": 6.496018202502845e-05, + "loss": 6.0262, + "step": 8219 + }, + { + "epoch": 2.8054607508532423, + "grad_norm": 2.6469192504882812, + "learning_rate": 6.484641638225255e-05, + "loss": 6.1936, + "step": 8220 + }, + { + "epoch": 2.8058020477815697, + "grad_norm": 2.4249207973480225, + "learning_rate": 6.473265073947668e-05, + "loss": 5.074, + "step": 8221 + }, + { + "epoch": 2.8061433447098976, + "grad_norm": 2.5924134254455566, + "learning_rate": 6.46188850967008e-05, + "loss": 6.188, + "step": 8222 + }, + { + "epoch": 2.8064846416382254, + "grad_norm": 2.559102773666382, + "learning_rate": 6.450511945392491e-05, + "loss": 5.493, + "step": 8223 + }, + { + "epoch": 2.806825938566553, + "grad_norm": 2.6225290298461914, + "learning_rate": 6.439135381114904e-05, + "loss": 5.5104, + "step": 8224 + }, + { + "epoch": 2.8071672354948807, + "grad_norm": 2.5742197036743164, + "learning_rate": 6.427758816837314e-05, + "loss": 6.0183, + "step": 8225 + }, + { + "epoch": 2.807508532423208, + "grad_norm": 2.534911870956421, + "learning_rate": 6.416382252559728e-05, + "loss": 5.6932, + "step": 8226 + }, + { + "epoch": 2.807849829351536, + "grad_norm": 2.5558876991271973, + "learning_rate": 6.40500568828214e-05, + "loss": 5.7026, + "step": 8227 + }, + { + "epoch": 2.8081911262798633, + "grad_norm": 2.528634786605835, + "learning_rate": 6.39362912400455e-05, + "loss": 5.7845, + "step": 8228 + }, + { + "epoch": 2.808532423208191, + "grad_norm": 2.934544324874878, + "learning_rate": 6.382252559726963e-05, + "loss": 4.4868, + "step": 8229 + }, + { + "epoch": 2.8088737201365186, + "grad_norm": 2.5007529258728027, + "learning_rate": 6.370875995449374e-05, + "loss": 4.6475, + "step": 8230 + }, + { + "epoch": 2.8092150170648464, + "grad_norm": 2.566521644592285, + "learning_rate": 6.359499431171787e-05, + "loss": 5.7933, + "step": 8231 + }, + { + "epoch": 2.8095563139931743, + "grad_norm": 2.5641417503356934, + "learning_rate": 6.348122866894199e-05, + "loss": 5.4627, + "step": 8232 + }, + { + "epoch": 2.8098976109215017, + "grad_norm": 2.489718437194824, + "learning_rate": 6.336746302616609e-05, + "loss": 5.6577, + "step": 8233 + }, + { + "epoch": 2.810238907849829, + "grad_norm": 2.5271124839782715, + "learning_rate": 6.325369738339022e-05, + "loss": 5.8988, + "step": 8234 + }, + { + "epoch": 2.810580204778157, + "grad_norm": 2.544497013092041, + "learning_rate": 6.313993174061433e-05, + "loss": 6.0608, + "step": 8235 + }, + { + "epoch": 2.810921501706485, + "grad_norm": 3.1858813762664795, + "learning_rate": 6.302616609783846e-05, + "loss": 4.988, + "step": 8236 + }, + { + "epoch": 2.811262798634812, + "grad_norm": 2.5615243911743164, + "learning_rate": 6.291240045506257e-05, + "loss": 5.9591, + "step": 8237 + }, + { + "epoch": 2.81160409556314, + "grad_norm": 2.4544830322265625, + "learning_rate": 6.279863481228669e-05, + "loss": 6.0645, + "step": 8238 + }, + { + "epoch": 2.8119453924914675, + "grad_norm": 2.5661332607269287, + "learning_rate": 6.268486916951082e-05, + "loss": 4.455, + "step": 8239 + }, + { + "epoch": 2.8122866894197953, + "grad_norm": 2.561377763748169, + "learning_rate": 6.257110352673492e-05, + "loss": 6.0717, + "step": 8240 + }, + { + "epoch": 2.8126279863481227, + "grad_norm": 2.537170648574829, + "learning_rate": 6.245733788395905e-05, + "loss": 4.6822, + "step": 8241 + }, + { + "epoch": 2.8129692832764506, + "grad_norm": 2.4561476707458496, + "learning_rate": 6.234357224118316e-05, + "loss": 5.5428, + "step": 8242 + }, + { + "epoch": 2.813310580204778, + "grad_norm": 2.530233383178711, + "learning_rate": 6.222980659840728e-05, + "loss": 6.0164, + "step": 8243 + }, + { + "epoch": 2.813651877133106, + "grad_norm": 2.61234974861145, + "learning_rate": 6.21160409556314e-05, + "loss": 5.5028, + "step": 8244 + }, + { + "epoch": 2.8139931740614337, + "grad_norm": 2.5273263454437256, + "learning_rate": 6.200227531285551e-05, + "loss": 5.8513, + "step": 8245 + }, + { + "epoch": 2.814334470989761, + "grad_norm": 2.5896804332733154, + "learning_rate": 6.188850967007965e-05, + "loss": 5.3815, + "step": 8246 + }, + { + "epoch": 2.8146757679180885, + "grad_norm": 2.605863332748413, + "learning_rate": 6.177474402730375e-05, + "loss": 5.9044, + "step": 8247 + }, + { + "epoch": 2.8150170648464163, + "grad_norm": 2.4373059272766113, + "learning_rate": 6.166097838452787e-05, + "loss": 5.8908, + "step": 8248 + }, + { + "epoch": 2.815358361774744, + "grad_norm": 2.578674793243408, + "learning_rate": 6.154721274175199e-05, + "loss": 6.1842, + "step": 8249 + }, + { + "epoch": 2.8156996587030716, + "grad_norm": 2.639101028442383, + "learning_rate": 6.143344709897611e-05, + "loss": 5.4184, + "step": 8250 + }, + { + "epoch": 2.8160409556313994, + "grad_norm": 2.5843536853790283, + "learning_rate": 6.131968145620023e-05, + "loss": 6.1219, + "step": 8251 + }, + { + "epoch": 2.816382252559727, + "grad_norm": 2.562448263168335, + "learning_rate": 6.120591581342434e-05, + "loss": 5.6921, + "step": 8252 + }, + { + "epoch": 2.8167235494880547, + "grad_norm": 2.592428207397461, + "learning_rate": 6.109215017064846e-05, + "loss": 6.1349, + "step": 8253 + }, + { + "epoch": 2.817064846416382, + "grad_norm": 2.594329357147217, + "learning_rate": 6.097838452787258e-05, + "loss": 6.4242, + "step": 8254 + }, + { + "epoch": 2.81740614334471, + "grad_norm": 2.5750083923339844, + "learning_rate": 6.08646188850967e-05, + "loss": 5.7389, + "step": 8255 + }, + { + "epoch": 2.8177474402730374, + "grad_norm": 2.5281083583831787, + "learning_rate": 6.075085324232082e-05, + "loss": 5.9597, + "step": 8256 + }, + { + "epoch": 2.818088737201365, + "grad_norm": 2.5450093746185303, + "learning_rate": 6.063708759954494e-05, + "loss": 5.7704, + "step": 8257 + }, + { + "epoch": 2.818430034129693, + "grad_norm": 2.50821852684021, + "learning_rate": 6.052332195676906e-05, + "loss": 5.5946, + "step": 8258 + }, + { + "epoch": 2.8187713310580205, + "grad_norm": 2.6139910221099854, + "learning_rate": 6.0409556313993174e-05, + "loss": 6.1745, + "step": 8259 + }, + { + "epoch": 2.819112627986348, + "grad_norm": 2.6088783740997314, + "learning_rate": 6.029579067121729e-05, + "loss": 5.8251, + "step": 8260 + }, + { + "epoch": 2.8194539249146757, + "grad_norm": 2.6340761184692383, + "learning_rate": 6.018202502844141e-05, + "loss": 4.9621, + "step": 8261 + }, + { + "epoch": 2.8197952218430036, + "grad_norm": 2.5626347064971924, + "learning_rate": 6.006825938566553e-05, + "loss": 5.829, + "step": 8262 + }, + { + "epoch": 2.820136518771331, + "grad_norm": 2.573450803756714, + "learning_rate": 5.9954493742889654e-05, + "loss": 5.9689, + "step": 8263 + }, + { + "epoch": 2.820477815699659, + "grad_norm": 2.446423053741455, + "learning_rate": 5.9840728100113766e-05, + "loss": 5.088, + "step": 8264 + }, + { + "epoch": 2.8208191126279862, + "grad_norm": 2.590911865234375, + "learning_rate": 5.9726962457337885e-05, + "loss": 5.8315, + "step": 8265 + }, + { + "epoch": 2.821160409556314, + "grad_norm": 2.581800699234009, + "learning_rate": 5.9613196814562e-05, + "loss": 6.0672, + "step": 8266 + }, + { + "epoch": 2.8215017064846415, + "grad_norm": 2.534433126449585, + "learning_rate": 5.949943117178612e-05, + "loss": 5.7578, + "step": 8267 + }, + { + "epoch": 2.8218430034129693, + "grad_norm": 2.6216630935668945, + "learning_rate": 5.938566552901024e-05, + "loss": 5.6905, + "step": 8268 + }, + { + "epoch": 2.8221843003412967, + "grad_norm": 2.5895118713378906, + "learning_rate": 5.927189988623436e-05, + "loss": 6.1605, + "step": 8269 + }, + { + "epoch": 2.8225255972696246, + "grad_norm": 2.5925073623657227, + "learning_rate": 5.915813424345848e-05, + "loss": 6.0352, + "step": 8270 + }, + { + "epoch": 2.8228668941979524, + "grad_norm": 2.5645461082458496, + "learning_rate": 5.9044368600682596e-05, + "loss": 5.7567, + "step": 8271 + }, + { + "epoch": 2.82320819112628, + "grad_norm": 2.479318857192993, + "learning_rate": 5.8930602957906714e-05, + "loss": 5.6575, + "step": 8272 + }, + { + "epoch": 2.8235494880546073, + "grad_norm": 2.6021623611450195, + "learning_rate": 5.881683731513083e-05, + "loss": 4.8975, + "step": 8273 + }, + { + "epoch": 2.823890784982935, + "grad_norm": 2.5270769596099854, + "learning_rate": 5.870307167235495e-05, + "loss": 6.0327, + "step": 8274 + }, + { + "epoch": 2.824232081911263, + "grad_norm": 2.6603550910949707, + "learning_rate": 5.858930602957907e-05, + "loss": 6.38, + "step": 8275 + }, + { + "epoch": 2.8245733788395904, + "grad_norm": 2.523515462875366, + "learning_rate": 5.847554038680319e-05, + "loss": 5.7957, + "step": 8276 + }, + { + "epoch": 2.824914675767918, + "grad_norm": 2.602630615234375, + "learning_rate": 5.8361774744027307e-05, + "loss": 6.3797, + "step": 8277 + }, + { + "epoch": 2.8252559726962456, + "grad_norm": 3.1169798374176025, + "learning_rate": 5.824800910125142e-05, + "loss": 4.2113, + "step": 8278 + }, + { + "epoch": 2.8255972696245735, + "grad_norm": 2.580352306365967, + "learning_rate": 5.813424345847554e-05, + "loss": 6.75, + "step": 8279 + }, + { + "epoch": 2.825938566552901, + "grad_norm": 2.5643091201782227, + "learning_rate": 5.802047781569966e-05, + "loss": 5.7626, + "step": 8280 + }, + { + "epoch": 2.8262798634812287, + "grad_norm": 2.5562422275543213, + "learning_rate": 5.790671217292378e-05, + "loss": 6.2436, + "step": 8281 + }, + { + "epoch": 2.826621160409556, + "grad_norm": 2.547008514404297, + "learning_rate": 5.77929465301479e-05, + "loss": 6.1454, + "step": 8282 + }, + { + "epoch": 2.826962457337884, + "grad_norm": 2.544355869293213, + "learning_rate": 5.767918088737201e-05, + "loss": 6.3044, + "step": 8283 + }, + { + "epoch": 2.827303754266212, + "grad_norm": 2.6169373989105225, + "learning_rate": 5.756541524459613e-05, + "loss": 5.7332, + "step": 8284 + }, + { + "epoch": 2.8276450511945392, + "grad_norm": 2.581423759460449, + "learning_rate": 5.745164960182025e-05, + "loss": 5.234, + "step": 8285 + }, + { + "epoch": 2.8279863481228666, + "grad_norm": 2.5888426303863525, + "learning_rate": 5.733788395904437e-05, + "loss": 6.8817, + "step": 8286 + }, + { + "epoch": 2.8283276450511945, + "grad_norm": 2.491856098175049, + "learning_rate": 5.722411831626849e-05, + "loss": 5.7124, + "step": 8287 + }, + { + "epoch": 2.8286689419795223, + "grad_norm": 2.546119213104248, + "learning_rate": 5.71103526734926e-05, + "loss": 5.9523, + "step": 8288 + }, + { + "epoch": 2.8290102389078498, + "grad_norm": 2.61152720451355, + "learning_rate": 5.699658703071672e-05, + "loss": 5.9196, + "step": 8289 + }, + { + "epoch": 2.8293515358361776, + "grad_norm": 2.7438294887542725, + "learning_rate": 5.688282138794084e-05, + "loss": 5.4704, + "step": 8290 + }, + { + "epoch": 2.829692832764505, + "grad_norm": 2.6315395832061768, + "learning_rate": 5.6769055745164965e-05, + "loss": 5.323, + "step": 8291 + }, + { + "epoch": 2.830034129692833, + "grad_norm": 2.675381660461426, + "learning_rate": 5.6655290102389084e-05, + "loss": 5.9233, + "step": 8292 + }, + { + "epoch": 2.8303754266211603, + "grad_norm": 2.479240894317627, + "learning_rate": 5.6541524459613196e-05, + "loss": 5.9549, + "step": 8293 + }, + { + "epoch": 2.830716723549488, + "grad_norm": 2.5712265968322754, + "learning_rate": 5.6427758816837314e-05, + "loss": 6.1167, + "step": 8294 + }, + { + "epoch": 2.8310580204778155, + "grad_norm": 2.5504567623138428, + "learning_rate": 5.631399317406143e-05, + "loss": 6.1586, + "step": 8295 + }, + { + "epoch": 2.8313993174061434, + "grad_norm": 2.505173444747925, + "learning_rate": 5.620022753128555e-05, + "loss": 6.0159, + "step": 8296 + }, + { + "epoch": 2.831740614334471, + "grad_norm": 2.5844502449035645, + "learning_rate": 5.6086461888509676e-05, + "loss": 6.1771, + "step": 8297 + }, + { + "epoch": 2.8320819112627986, + "grad_norm": 2.5144166946411133, + "learning_rate": 5.597269624573379e-05, + "loss": 6.1626, + "step": 8298 + }, + { + "epoch": 2.832423208191126, + "grad_norm": 2.5902957916259766, + "learning_rate": 5.585893060295791e-05, + "loss": 5.6985, + "step": 8299 + }, + { + "epoch": 2.832764505119454, + "grad_norm": 2.809479236602783, + "learning_rate": 5.5745164960182025e-05, + "loss": 4.2213, + "step": 8300 + }, + { + "epoch": 2.8331058020477817, + "grad_norm": 2.563436269760132, + "learning_rate": 5.5631399317406144e-05, + "loss": 5.617, + "step": 8301 + }, + { + "epoch": 2.833447098976109, + "grad_norm": 2.593966484069824, + "learning_rate": 5.551763367463027e-05, + "loss": 5.7472, + "step": 8302 + }, + { + "epoch": 2.833788395904437, + "grad_norm": 2.590921640396118, + "learning_rate": 5.540386803185438e-05, + "loss": 5.1288, + "step": 8303 + }, + { + "epoch": 2.8341296928327644, + "grad_norm": 2.558959722518921, + "learning_rate": 5.52901023890785e-05, + "loss": 5.8822, + "step": 8304 + }, + { + "epoch": 2.8344709897610922, + "grad_norm": 2.5313167572021484, + "learning_rate": 5.517633674630262e-05, + "loss": 6.0652, + "step": 8305 + }, + { + "epoch": 2.8348122866894196, + "grad_norm": 2.59818172454834, + "learning_rate": 5.5062571103526736e-05, + "loss": 5.8414, + "step": 8306 + }, + { + "epoch": 2.8351535836177475, + "grad_norm": 2.5654821395874023, + "learning_rate": 5.4948805460750855e-05, + "loss": 5.9746, + "step": 8307 + }, + { + "epoch": 2.835494880546075, + "grad_norm": 2.573014974594116, + "learning_rate": 5.483503981797497e-05, + "loss": 6.3835, + "step": 8308 + }, + { + "epoch": 2.8358361774744028, + "grad_norm": 2.426826238632202, + "learning_rate": 5.472127417519909e-05, + "loss": 5.2187, + "step": 8309 + }, + { + "epoch": 2.8361774744027306, + "grad_norm": 2.524761199951172, + "learning_rate": 5.460750853242321e-05, + "loss": 6.1975, + "step": 8310 + }, + { + "epoch": 2.836518771331058, + "grad_norm": 2.5690839290618896, + "learning_rate": 5.449374288964733e-05, + "loss": 5.8855, + "step": 8311 + }, + { + "epoch": 2.8368600682593854, + "grad_norm": 2.8350605964660645, + "learning_rate": 5.437997724687145e-05, + "loss": 4.5688, + "step": 8312 + }, + { + "epoch": 2.8372013651877133, + "grad_norm": 2.7381675243377686, + "learning_rate": 5.426621160409556e-05, + "loss": 5.0834, + "step": 8313 + }, + { + "epoch": 2.837542662116041, + "grad_norm": 2.534423828125, + "learning_rate": 5.4152445961319684e-05, + "loss": 6.1567, + "step": 8314 + }, + { + "epoch": 2.8378839590443685, + "grad_norm": 2.4711785316467285, + "learning_rate": 5.40386803185438e-05, + "loss": 5.8467, + "step": 8315 + }, + { + "epoch": 2.8382252559726964, + "grad_norm": 2.48486328125, + "learning_rate": 5.392491467576792e-05, + "loss": 5.5474, + "step": 8316 + }, + { + "epoch": 2.8385665529010238, + "grad_norm": 2.6153039932250977, + "learning_rate": 5.381114903299204e-05, + "loss": 6.1734, + "step": 8317 + }, + { + "epoch": 2.8389078498293516, + "grad_norm": 2.5097267627716064, + "learning_rate": 5.369738339021615e-05, + "loss": 5.9648, + "step": 8318 + }, + { + "epoch": 2.839249146757679, + "grad_norm": 2.603243827819824, + "learning_rate": 5.3583617747440277e-05, + "loss": 5.7459, + "step": 8319 + }, + { + "epoch": 2.839590443686007, + "grad_norm": 2.543027877807617, + "learning_rate": 5.3469852104664395e-05, + "loss": 6.1973, + "step": 8320 + }, + { + "epoch": 2.8399317406143343, + "grad_norm": 2.5744729042053223, + "learning_rate": 5.3356086461888514e-05, + "loss": 6.2723, + "step": 8321 + }, + { + "epoch": 2.840273037542662, + "grad_norm": 2.5325405597686768, + "learning_rate": 5.324232081911263e-05, + "loss": 6.2439, + "step": 8322 + }, + { + "epoch": 2.84061433447099, + "grad_norm": 2.521803617477417, + "learning_rate": 5.3128555176336744e-05, + "loss": 4.9205, + "step": 8323 + }, + { + "epoch": 2.8409556313993174, + "grad_norm": 2.6305558681488037, + "learning_rate": 5.301478953356086e-05, + "loss": 6.7376, + "step": 8324 + }, + { + "epoch": 2.841296928327645, + "grad_norm": 2.589456558227539, + "learning_rate": 5.290102389078499e-05, + "loss": 5.3846, + "step": 8325 + }, + { + "epoch": 2.8416382252559726, + "grad_norm": 2.5877840518951416, + "learning_rate": 5.2787258248009106e-05, + "loss": 5.3142, + "step": 8326 + }, + { + "epoch": 2.8419795221843005, + "grad_norm": 2.562431573867798, + "learning_rate": 5.267349260523322e-05, + "loss": 5.6504, + "step": 8327 + }, + { + "epoch": 2.842320819112628, + "grad_norm": 2.539492607116699, + "learning_rate": 5.2559726962457336e-05, + "loss": 4.5076, + "step": 8328 + }, + { + "epoch": 2.8426621160409558, + "grad_norm": 2.5125441551208496, + "learning_rate": 5.2445961319681455e-05, + "loss": 6.1126, + "step": 8329 + }, + { + "epoch": 2.843003412969283, + "grad_norm": 2.546273708343506, + "learning_rate": 5.233219567690557e-05, + "loss": 5.7161, + "step": 8330 + }, + { + "epoch": 2.843344709897611, + "grad_norm": 2.596364736557007, + "learning_rate": 5.22184300341297e-05, + "loss": 6.1584, + "step": 8331 + }, + { + "epoch": 2.8436860068259384, + "grad_norm": 2.556121349334717, + "learning_rate": 5.210466439135381e-05, + "loss": 5.4851, + "step": 8332 + }, + { + "epoch": 2.8440273037542663, + "grad_norm": 2.4971086978912354, + "learning_rate": 5.199089874857793e-05, + "loss": 6.1228, + "step": 8333 + }, + { + "epoch": 2.8443686006825937, + "grad_norm": 2.5893213748931885, + "learning_rate": 5.187713310580205e-05, + "loss": 4.826, + "step": 8334 + }, + { + "epoch": 2.8447098976109215, + "grad_norm": 2.562465190887451, + "learning_rate": 5.1763367463026166e-05, + "loss": 4.6726, + "step": 8335 + }, + { + "epoch": 2.8450511945392494, + "grad_norm": 2.575310707092285, + "learning_rate": 5.164960182025029e-05, + "loss": 6.3681, + "step": 8336 + }, + { + "epoch": 2.845392491467577, + "grad_norm": 3.0285212993621826, + "learning_rate": 5.15358361774744e-05, + "loss": 5.0006, + "step": 8337 + }, + { + "epoch": 2.845733788395904, + "grad_norm": 2.7478995323181152, + "learning_rate": 5.142207053469852e-05, + "loss": 5.4962, + "step": 8338 + }, + { + "epoch": 2.846075085324232, + "grad_norm": 2.4442386627197266, + "learning_rate": 5.130830489192264e-05, + "loss": 5.7498, + "step": 8339 + }, + { + "epoch": 2.84641638225256, + "grad_norm": 2.5991361141204834, + "learning_rate": 5.119453924914676e-05, + "loss": 5.419, + "step": 8340 + }, + { + "epoch": 2.8467576791808873, + "grad_norm": 2.547792434692383, + "learning_rate": 5.108077360637088e-05, + "loss": 5.9812, + "step": 8341 + }, + { + "epoch": 2.847098976109215, + "grad_norm": 2.6672022342681885, + "learning_rate": 5.0967007963594995e-05, + "loss": 4.3484, + "step": 8342 + }, + { + "epoch": 2.8474402730375425, + "grad_norm": 2.547494649887085, + "learning_rate": 5.0853242320819114e-05, + "loss": 6.2943, + "step": 8343 + }, + { + "epoch": 2.8477815699658704, + "grad_norm": 2.4189071655273438, + "learning_rate": 5.073947667804323e-05, + "loss": 5.0494, + "step": 8344 + }, + { + "epoch": 2.848122866894198, + "grad_norm": 2.471524477005005, + "learning_rate": 5.062571103526735e-05, + "loss": 5.5994, + "step": 8345 + }, + { + "epoch": 2.8484641638225257, + "grad_norm": 2.490009069442749, + "learning_rate": 5.051194539249147e-05, + "loss": 5.8388, + "step": 8346 + }, + { + "epoch": 2.848805460750853, + "grad_norm": 2.542736053466797, + "learning_rate": 5.039817974971558e-05, + "loss": 5.7709, + "step": 8347 + }, + { + "epoch": 2.849146757679181, + "grad_norm": 2.5808589458465576, + "learning_rate": 5.0284414106939706e-05, + "loss": 6.7117, + "step": 8348 + }, + { + "epoch": 2.8494880546075088, + "grad_norm": 2.597972869873047, + "learning_rate": 5.0170648464163825e-05, + "loss": 6.4791, + "step": 8349 + }, + { + "epoch": 2.849829351535836, + "grad_norm": 2.577030658721924, + "learning_rate": 5.005688282138794e-05, + "loss": 6.3102, + "step": 8350 + }, + { + "epoch": 2.8501706484641636, + "grad_norm": 2.47444486618042, + "learning_rate": 4.994311717861206e-05, + "loss": 5.5967, + "step": 8351 + }, + { + "epoch": 2.8505119453924914, + "grad_norm": 2.5281221866607666, + "learning_rate": 4.982935153583617e-05, + "loss": 5.4821, + "step": 8352 + }, + { + "epoch": 2.8508532423208193, + "grad_norm": 2.5494225025177, + "learning_rate": 4.97155858930603e-05, + "loss": 6.4092, + "step": 8353 + }, + { + "epoch": 2.8511945392491467, + "grad_norm": 2.561234712600708, + "learning_rate": 4.960182025028442e-05, + "loss": 5.7412, + "step": 8354 + }, + { + "epoch": 2.8515358361774745, + "grad_norm": 3.5011022090911865, + "learning_rate": 4.9488054607508536e-05, + "loss": 5.4677, + "step": 8355 + }, + { + "epoch": 2.851877133105802, + "grad_norm": 2.60447359085083, + "learning_rate": 4.9374288964732654e-05, + "loss": 6.2699, + "step": 8356 + }, + { + "epoch": 2.85221843003413, + "grad_norm": 2.556480884552002, + "learning_rate": 4.9260523321956766e-05, + "loss": 5.9936, + "step": 8357 + }, + { + "epoch": 2.852559726962457, + "grad_norm": 2.508237600326538, + "learning_rate": 4.9146757679180884e-05, + "loss": 5.9199, + "step": 8358 + }, + { + "epoch": 2.852901023890785, + "grad_norm": 2.5938949584960938, + "learning_rate": 4.903299203640501e-05, + "loss": 6.2548, + "step": 8359 + }, + { + "epoch": 2.8532423208191124, + "grad_norm": 2.570672035217285, + "learning_rate": 4.891922639362913e-05, + "loss": 6.0058, + "step": 8360 + }, + { + "epoch": 2.8535836177474403, + "grad_norm": 2.583148717880249, + "learning_rate": 4.8805460750853247e-05, + "loss": 6.0509, + "step": 8361 + }, + { + "epoch": 2.853924914675768, + "grad_norm": 2.4389407634735107, + "learning_rate": 4.869169510807736e-05, + "loss": 6.2846, + "step": 8362 + }, + { + "epoch": 2.8542662116040955, + "grad_norm": 2.574002981185913, + "learning_rate": 4.857792946530148e-05, + "loss": 5.7958, + "step": 8363 + }, + { + "epoch": 2.854607508532423, + "grad_norm": 2.5079104900360107, + "learning_rate": 4.8464163822525595e-05, + "loss": 6.4898, + "step": 8364 + }, + { + "epoch": 2.854948805460751, + "grad_norm": 2.6473233699798584, + "learning_rate": 4.835039817974972e-05, + "loss": 5.9305, + "step": 8365 + }, + { + "epoch": 2.8552901023890787, + "grad_norm": 2.551525831222534, + "learning_rate": 4.823663253697384e-05, + "loss": 5.7632, + "step": 8366 + }, + { + "epoch": 2.855631399317406, + "grad_norm": 2.5253288745880127, + "learning_rate": 4.812286689419795e-05, + "loss": 6.1019, + "step": 8367 + }, + { + "epoch": 2.855972696245734, + "grad_norm": 2.598867654800415, + "learning_rate": 4.800910125142207e-05, + "loss": 6.0308, + "step": 8368 + }, + { + "epoch": 2.8563139931740613, + "grad_norm": 2.4677653312683105, + "learning_rate": 4.789533560864619e-05, + "loss": 5.5467, + "step": 8369 + }, + { + "epoch": 2.856655290102389, + "grad_norm": 2.6016745567321777, + "learning_rate": 4.778156996587031e-05, + "loss": 5.6461, + "step": 8370 + }, + { + "epoch": 2.8569965870307166, + "grad_norm": 2.613708734512329, + "learning_rate": 4.766780432309443e-05, + "loss": 6.6418, + "step": 8371 + }, + { + "epoch": 2.8573378839590444, + "grad_norm": 2.5926055908203125, + "learning_rate": 4.755403868031854e-05, + "loss": 5.8853, + "step": 8372 + }, + { + "epoch": 2.857679180887372, + "grad_norm": 2.541905403137207, + "learning_rate": 4.744027303754266e-05, + "loss": 6.4039, + "step": 8373 + }, + { + "epoch": 2.8580204778156997, + "grad_norm": 2.5121161937713623, + "learning_rate": 4.732650739476678e-05, + "loss": 5.7596, + "step": 8374 + }, + { + "epoch": 2.8583617747440275, + "grad_norm": 2.536179780960083, + "learning_rate": 4.72127417519909e-05, + "loss": 6.1308, + "step": 8375 + }, + { + "epoch": 2.858703071672355, + "grad_norm": 7.43336296081543, + "learning_rate": 4.7098976109215024e-05, + "loss": 5.0863, + "step": 8376 + }, + { + "epoch": 2.8590443686006823, + "grad_norm": 2.5915966033935547, + "learning_rate": 4.6985210466439136e-05, + "loss": 6.191, + "step": 8377 + }, + { + "epoch": 2.85938566552901, + "grad_norm": 2.5173799991607666, + "learning_rate": 4.6871444823663254e-05, + "loss": 6.0567, + "step": 8378 + }, + { + "epoch": 2.859726962457338, + "grad_norm": 2.5869557857513428, + "learning_rate": 4.675767918088737e-05, + "loss": 5.7846, + "step": 8379 + }, + { + "epoch": 2.8600682593856654, + "grad_norm": 2.5171236991882324, + "learning_rate": 4.664391353811149e-05, + "loss": 6.236, + "step": 8380 + }, + { + "epoch": 2.8604095563139933, + "grad_norm": 2.6231682300567627, + "learning_rate": 4.653014789533561e-05, + "loss": 5.6616, + "step": 8381 + }, + { + "epoch": 2.8607508532423207, + "grad_norm": 2.555541753768921, + "learning_rate": 4.641638225255973e-05, + "loss": 6.3776, + "step": 8382 + }, + { + "epoch": 2.8610921501706486, + "grad_norm": 2.5504350662231445, + "learning_rate": 4.630261660978385e-05, + "loss": 6.1276, + "step": 8383 + }, + { + "epoch": 2.861433447098976, + "grad_norm": 2.523817300796509, + "learning_rate": 4.6188850967007965e-05, + "loss": 5.6059, + "step": 8384 + }, + { + "epoch": 2.861774744027304, + "grad_norm": 2.5135388374328613, + "learning_rate": 4.6075085324232084e-05, + "loss": 5.924, + "step": 8385 + }, + { + "epoch": 2.862116040955631, + "grad_norm": 2.5756356716156006, + "learning_rate": 4.5961319681456195e-05, + "loss": 5.9326, + "step": 8386 + }, + { + "epoch": 2.862457337883959, + "grad_norm": 2.5897839069366455, + "learning_rate": 4.584755403868032e-05, + "loss": 6.2464, + "step": 8387 + }, + { + "epoch": 2.862798634812287, + "grad_norm": 2.522111177444458, + "learning_rate": 4.573378839590444e-05, + "loss": 5.7213, + "step": 8388 + }, + { + "epoch": 2.8631399317406143, + "grad_norm": 2.5438270568847656, + "learning_rate": 4.562002275312856e-05, + "loss": 5.5916, + "step": 8389 + }, + { + "epoch": 2.8634812286689417, + "grad_norm": 2.5333592891693115, + "learning_rate": 4.5506257110352676e-05, + "loss": 6.1719, + "step": 8390 + }, + { + "epoch": 2.8638225255972696, + "grad_norm": 2.5089774131774902, + "learning_rate": 4.539249146757679e-05, + "loss": 6.1262, + "step": 8391 + }, + { + "epoch": 2.8641638225255974, + "grad_norm": 2.4316928386688232, + "learning_rate": 4.5278725824800906e-05, + "loss": 4.518, + "step": 8392 + }, + { + "epoch": 2.864505119453925, + "grad_norm": 2.5709829330444336, + "learning_rate": 4.516496018202503e-05, + "loss": 6.1676, + "step": 8393 + }, + { + "epoch": 2.8648464163822527, + "grad_norm": 2.5119333267211914, + "learning_rate": 4.505119453924915e-05, + "loss": 4.8229, + "step": 8394 + }, + { + "epoch": 2.86518771331058, + "grad_norm": 2.5550503730773926, + "learning_rate": 4.493742889647327e-05, + "loss": 5.6363, + "step": 8395 + }, + { + "epoch": 2.865529010238908, + "grad_norm": 2.5302844047546387, + "learning_rate": 4.482366325369738e-05, + "loss": 5.3337, + "step": 8396 + }, + { + "epoch": 2.8658703071672353, + "grad_norm": 2.5047590732574463, + "learning_rate": 4.47098976109215e-05, + "loss": 6.5914, + "step": 8397 + }, + { + "epoch": 2.866211604095563, + "grad_norm": 2.5279815196990967, + "learning_rate": 4.4596131968145624e-05, + "loss": 6.4273, + "step": 8398 + }, + { + "epoch": 2.8665529010238906, + "grad_norm": 2.603135347366333, + "learning_rate": 4.448236632536974e-05, + "loss": 6.5059, + "step": 8399 + }, + { + "epoch": 2.8668941979522184, + "grad_norm": 2.582097053527832, + "learning_rate": 4.436860068259386e-05, + "loss": 5.8008, + "step": 8400 + }, + { + "epoch": 2.8672354948805463, + "grad_norm": 2.5195491313934326, + "learning_rate": 4.425483503981797e-05, + "loss": 6.622, + "step": 8401 + }, + { + "epoch": 2.8675767918088737, + "grad_norm": 2.5339550971984863, + "learning_rate": 4.414106939704209e-05, + "loss": 6.4584, + "step": 8402 + }, + { + "epoch": 2.867918088737201, + "grad_norm": 2.524705648422241, + "learning_rate": 4.402730375426621e-05, + "loss": 5.2988, + "step": 8403 + }, + { + "epoch": 2.868259385665529, + "grad_norm": 2.5974693298339844, + "learning_rate": 4.3913538111490335e-05, + "loss": 6.255, + "step": 8404 + }, + { + "epoch": 2.868600682593857, + "grad_norm": 2.66146183013916, + "learning_rate": 4.3799772468714454e-05, + "loss": 4.7422, + "step": 8405 + }, + { + "epoch": 2.868941979522184, + "grad_norm": 2.778348922729492, + "learning_rate": 4.3686006825938565e-05, + "loss": 5.1854, + "step": 8406 + }, + { + "epoch": 2.869283276450512, + "grad_norm": 2.8919944763183594, + "learning_rate": 4.3572241183162684e-05, + "loss": 3.8649, + "step": 8407 + }, + { + "epoch": 2.8696245733788395, + "grad_norm": 2.5984740257263184, + "learning_rate": 4.34584755403868e-05, + "loss": 5.6027, + "step": 8408 + }, + { + "epoch": 2.8699658703071673, + "grad_norm": 2.449209690093994, + "learning_rate": 4.334470989761092e-05, + "loss": 5.0734, + "step": 8409 + }, + { + "epoch": 2.8703071672354947, + "grad_norm": 2.4918715953826904, + "learning_rate": 4.3230944254835046e-05, + "loss": 5.4387, + "step": 8410 + }, + { + "epoch": 2.8706484641638226, + "grad_norm": 2.528923749923706, + "learning_rate": 4.311717861205916e-05, + "loss": 5.9894, + "step": 8411 + }, + { + "epoch": 2.87098976109215, + "grad_norm": 2.5853374004364014, + "learning_rate": 4.3003412969283276e-05, + "loss": 6.5989, + "step": 8412 + }, + { + "epoch": 2.871331058020478, + "grad_norm": 2.531834363937378, + "learning_rate": 4.2889647326507395e-05, + "loss": 6.2346, + "step": 8413 + }, + { + "epoch": 2.8716723549488057, + "grad_norm": 2.5188536643981934, + "learning_rate": 4.277588168373151e-05, + "loss": 5.9237, + "step": 8414 + }, + { + "epoch": 2.872013651877133, + "grad_norm": 2.5645134449005127, + "learning_rate": 4.266211604095564e-05, + "loss": 5.9186, + "step": 8415 + }, + { + "epoch": 2.8723549488054605, + "grad_norm": 2.4560606479644775, + "learning_rate": 4.254835039817975e-05, + "loss": 5.6281, + "step": 8416 + }, + { + "epoch": 2.8726962457337883, + "grad_norm": 2.49910306930542, + "learning_rate": 4.243458475540387e-05, + "loss": 6.1462, + "step": 8417 + }, + { + "epoch": 2.873037542662116, + "grad_norm": 2.547170877456665, + "learning_rate": 4.232081911262799e-05, + "loss": 5.9647, + "step": 8418 + }, + { + "epoch": 2.8733788395904436, + "grad_norm": 2.5908734798431396, + "learning_rate": 4.2207053469852106e-05, + "loss": 5.909, + "step": 8419 + }, + { + "epoch": 2.8737201365187715, + "grad_norm": 2.5554044246673584, + "learning_rate": 4.2093287827076224e-05, + "loss": 5.9551, + "step": 8420 + }, + { + "epoch": 2.874061433447099, + "grad_norm": 2.5563385486602783, + "learning_rate": 4.197952218430034e-05, + "loss": 6.0185, + "step": 8421 + }, + { + "epoch": 2.8744027303754267, + "grad_norm": 2.525693893432617, + "learning_rate": 4.186575654152446e-05, + "loss": 5.7964, + "step": 8422 + }, + { + "epoch": 2.874744027303754, + "grad_norm": 2.5117642879486084, + "learning_rate": 4.175199089874858e-05, + "loss": 5.5265, + "step": 8423 + }, + { + "epoch": 2.875085324232082, + "grad_norm": 2.5649948120117188, + "learning_rate": 4.16382252559727e-05, + "loss": 5.9956, + "step": 8424 + }, + { + "epoch": 2.8754266211604094, + "grad_norm": 2.43780255317688, + "learning_rate": 4.152445961319681e-05, + "loss": 6.1702, + "step": 8425 + }, + { + "epoch": 2.875767918088737, + "grad_norm": 2.5544321537017822, + "learning_rate": 4.141069397042093e-05, + "loss": 5.5664, + "step": 8426 + }, + { + "epoch": 2.876109215017065, + "grad_norm": 2.5855114459991455, + "learning_rate": 4.1296928327645054e-05, + "loss": 6.168, + "step": 8427 + }, + { + "epoch": 2.8764505119453925, + "grad_norm": 2.5726702213287354, + "learning_rate": 4.118316268486917e-05, + "loss": 5.963, + "step": 8428 + }, + { + "epoch": 2.87679180887372, + "grad_norm": 2.6072628498077393, + "learning_rate": 4.106939704209329e-05, + "loss": 6.4451, + "step": 8429 + }, + { + "epoch": 2.8771331058020477, + "grad_norm": 2.586865186691284, + "learning_rate": 4.09556313993174e-05, + "loss": 6.5633, + "step": 8430 + }, + { + "epoch": 2.8774744027303756, + "grad_norm": 2.524959087371826, + "learning_rate": 4.084186575654152e-05, + "loss": 6.5356, + "step": 8431 + }, + { + "epoch": 2.877815699658703, + "grad_norm": 2.5504794120788574, + "learning_rate": 4.0728100113765646e-05, + "loss": 5.9479, + "step": 8432 + }, + { + "epoch": 2.878156996587031, + "grad_norm": 2.483062505722046, + "learning_rate": 4.0614334470989765e-05, + "loss": 6.0917, + "step": 8433 + }, + { + "epoch": 2.8784982935153582, + "grad_norm": 2.506287097930908, + "learning_rate": 4.050056882821388e-05, + "loss": 6.0076, + "step": 8434 + }, + { + "epoch": 2.878839590443686, + "grad_norm": 2.6461734771728516, + "learning_rate": 4.0386803185437995e-05, + "loss": 5.6557, + "step": 8435 + }, + { + "epoch": 2.8791808873720135, + "grad_norm": 2.628688335418701, + "learning_rate": 4.0273037542662113e-05, + "loss": 6.4374, + "step": 8436 + }, + { + "epoch": 2.8795221843003413, + "grad_norm": 2.541585683822632, + "learning_rate": 4.015927189988623e-05, + "loss": 6.1847, + "step": 8437 + }, + { + "epoch": 2.8798634812286688, + "grad_norm": 2.503748655319214, + "learning_rate": 4.004550625711036e-05, + "loss": 5.7091, + "step": 8438 + }, + { + "epoch": 2.8802047781569966, + "grad_norm": 2.540815591812134, + "learning_rate": 3.9931740614334476e-05, + "loss": 5.6816, + "step": 8439 + }, + { + "epoch": 2.8805460750853245, + "grad_norm": 2.56005597114563, + "learning_rate": 3.981797497155859e-05, + "loss": 5.871, + "step": 8440 + }, + { + "epoch": 2.880887372013652, + "grad_norm": 2.4481024742126465, + "learning_rate": 3.9704209328782706e-05, + "loss": 5.8886, + "step": 8441 + }, + { + "epoch": 2.8812286689419793, + "grad_norm": 2.5519607067108154, + "learning_rate": 3.9590443686006824e-05, + "loss": 5.7186, + "step": 8442 + }, + { + "epoch": 2.881569965870307, + "grad_norm": 2.5279147624969482, + "learning_rate": 3.947667804323094e-05, + "loss": 5.7121, + "step": 8443 + }, + { + "epoch": 2.881911262798635, + "grad_norm": 2.5419199466705322, + "learning_rate": 3.936291240045507e-05, + "loss": 6.1733, + "step": 8444 + }, + { + "epoch": 2.8822525597269624, + "grad_norm": 2.4785492420196533, + "learning_rate": 3.924914675767918e-05, + "loss": 5.7262, + "step": 8445 + }, + { + "epoch": 2.88259385665529, + "grad_norm": 2.5293445587158203, + "learning_rate": 3.91353811149033e-05, + "loss": 6.0911, + "step": 8446 + }, + { + "epoch": 2.8829351535836176, + "grad_norm": 2.6208319664001465, + "learning_rate": 3.902161547212742e-05, + "loss": 6.4624, + "step": 8447 + }, + { + "epoch": 2.8832764505119455, + "grad_norm": 2.5119760036468506, + "learning_rate": 3.8907849829351535e-05, + "loss": 5.5541, + "step": 8448 + }, + { + "epoch": 2.883617747440273, + "grad_norm": 2.55000638961792, + "learning_rate": 3.879408418657566e-05, + "loss": 6.5788, + "step": 8449 + }, + { + "epoch": 2.8839590443686007, + "grad_norm": 2.545145273208618, + "learning_rate": 3.868031854379977e-05, + "loss": 6.2904, + "step": 8450 + }, + { + "epoch": 2.884300341296928, + "grad_norm": 2.5100696086883545, + "learning_rate": 3.856655290102389e-05, + "loss": 6.0244, + "step": 8451 + }, + { + "epoch": 2.884641638225256, + "grad_norm": 2.528092384338379, + "learning_rate": 3.845278725824801e-05, + "loss": 4.7265, + "step": 8452 + }, + { + "epoch": 2.884982935153584, + "grad_norm": 2.526163339614868, + "learning_rate": 3.833902161547213e-05, + "loss": 5.983, + "step": 8453 + }, + { + "epoch": 2.8853242320819112, + "grad_norm": 2.3863651752471924, + "learning_rate": 3.8225255972696246e-05, + "loss": 4.602, + "step": 8454 + }, + { + "epoch": 2.8856655290102387, + "grad_norm": 2.4391610622406006, + "learning_rate": 3.8111490329920365e-05, + "loss": 5.4737, + "step": 8455 + }, + { + "epoch": 2.8860068259385665, + "grad_norm": 2.584704875946045, + "learning_rate": 3.799772468714448e-05, + "loss": 5.8695, + "step": 8456 + }, + { + "epoch": 2.8863481228668944, + "grad_norm": 2.5321240425109863, + "learning_rate": 3.78839590443686e-05, + "loss": 5.7091, + "step": 8457 + }, + { + "epoch": 2.8866894197952218, + "grad_norm": 1.8349437713623047, + "learning_rate": 3.777019340159272e-05, + "loss": 3.3003, + "step": 8458 + }, + { + "epoch": 2.8870307167235496, + "grad_norm": 2.5653040409088135, + "learning_rate": 3.765642775881684e-05, + "loss": 6.2374, + "step": 8459 + }, + { + "epoch": 2.887372013651877, + "grad_norm": 2.696136236190796, + "learning_rate": 3.754266211604096e-05, + "loss": 5.7614, + "step": 8460 + }, + { + "epoch": 2.887713310580205, + "grad_norm": 2.485445499420166, + "learning_rate": 3.7428896473265076e-05, + "loss": 4.7444, + "step": 8461 + }, + { + "epoch": 2.8880546075085323, + "grad_norm": 2.547309398651123, + "learning_rate": 3.7315130830489194e-05, + "loss": 6.003, + "step": 8462 + }, + { + "epoch": 2.88839590443686, + "grad_norm": 2.4875457286834717, + "learning_rate": 3.720136518771331e-05, + "loss": 6.3583, + "step": 8463 + }, + { + "epoch": 2.8887372013651875, + "grad_norm": 2.5078489780426025, + "learning_rate": 3.708759954493743e-05, + "loss": 5.2605, + "step": 8464 + }, + { + "epoch": 2.8890784982935154, + "grad_norm": 2.540022373199463, + "learning_rate": 3.697383390216154e-05, + "loss": 6.3982, + "step": 8465 + }, + { + "epoch": 2.8894197952218432, + "grad_norm": 2.497128963470459, + "learning_rate": 3.686006825938567e-05, + "loss": 5.7944, + "step": 8466 + }, + { + "epoch": 2.8897610921501706, + "grad_norm": 2.550382614135742, + "learning_rate": 3.674630261660979e-05, + "loss": 6.0483, + "step": 8467 + }, + { + "epoch": 2.890102389078498, + "grad_norm": 2.4876744747161865, + "learning_rate": 3.6632536973833905e-05, + "loss": 5.8666, + "step": 8468 + }, + { + "epoch": 2.890443686006826, + "grad_norm": 2.4927735328674316, + "learning_rate": 3.6518771331058024e-05, + "loss": 5.7912, + "step": 8469 + }, + { + "epoch": 2.8907849829351537, + "grad_norm": 2.4995381832122803, + "learning_rate": 3.6405005688282136e-05, + "loss": 5.6446, + "step": 8470 + }, + { + "epoch": 2.891126279863481, + "grad_norm": 2.485089063644409, + "learning_rate": 3.6291240045506254e-05, + "loss": 6.2976, + "step": 8471 + }, + { + "epoch": 2.891467576791809, + "grad_norm": 2.451723337173462, + "learning_rate": 3.617747440273038e-05, + "loss": 5.6149, + "step": 8472 + }, + { + "epoch": 2.8918088737201364, + "grad_norm": 2.487278938293457, + "learning_rate": 3.60637087599545e-05, + "loss": 5.683, + "step": 8473 + }, + { + "epoch": 2.8921501706484642, + "grad_norm": 2.5790092945098877, + "learning_rate": 3.594994311717861e-05, + "loss": 5.1775, + "step": 8474 + }, + { + "epoch": 2.8924914675767917, + "grad_norm": 2.5560343265533447, + "learning_rate": 3.583617747440273e-05, + "loss": 5.4929, + "step": 8475 + }, + { + "epoch": 2.8928327645051195, + "grad_norm": 2.5981833934783936, + "learning_rate": 3.5722411831626847e-05, + "loss": 5.6103, + "step": 8476 + }, + { + "epoch": 2.893174061433447, + "grad_norm": 2.4823999404907227, + "learning_rate": 3.560864618885097e-05, + "loss": 5.6562, + "step": 8477 + }, + { + "epoch": 2.8935153583617748, + "grad_norm": 2.4643754959106445, + "learning_rate": 3.549488054607509e-05, + "loss": 6.0838, + "step": 8478 + }, + { + "epoch": 2.8938566552901026, + "grad_norm": 2.5785889625549316, + "learning_rate": 3.53811149032992e-05, + "loss": 5.6819, + "step": 8479 + }, + { + "epoch": 2.89419795221843, + "grad_norm": 2.451211452484131, + "learning_rate": 3.526734926052332e-05, + "loss": 5.2065, + "step": 8480 + }, + { + "epoch": 2.8945392491467574, + "grad_norm": 2.5396275520324707, + "learning_rate": 3.515358361774744e-05, + "loss": 6.2585, + "step": 8481 + }, + { + "epoch": 2.8948805460750853, + "grad_norm": 2.5079259872436523, + "learning_rate": 3.503981797497156e-05, + "loss": 5.9257, + "step": 8482 + }, + { + "epoch": 2.895221843003413, + "grad_norm": 2.489826202392578, + "learning_rate": 3.492605233219568e-05, + "loss": 5.6171, + "step": 8483 + }, + { + "epoch": 2.8955631399317405, + "grad_norm": 2.556549310684204, + "learning_rate": 3.4812286689419794e-05, + "loss": 5.8783, + "step": 8484 + }, + { + "epoch": 2.8959044368600684, + "grad_norm": 2.518484354019165, + "learning_rate": 3.469852104664391e-05, + "loss": 5.9915, + "step": 8485 + }, + { + "epoch": 2.896245733788396, + "grad_norm": 2.582871675491333, + "learning_rate": 3.458475540386803e-05, + "loss": 6.5008, + "step": 8486 + }, + { + "epoch": 2.8965870307167236, + "grad_norm": 2.5308432579040527, + "learning_rate": 3.447098976109215e-05, + "loss": 6.2562, + "step": 8487 + }, + { + "epoch": 2.896928327645051, + "grad_norm": 2.4115796089172363, + "learning_rate": 3.435722411831627e-05, + "loss": 4.786, + "step": 8488 + }, + { + "epoch": 2.897269624573379, + "grad_norm": 2.572611093521118, + "learning_rate": 3.424345847554039e-05, + "loss": 6.1011, + "step": 8489 + }, + { + "epoch": 2.8976109215017063, + "grad_norm": 2.5578956604003906, + "learning_rate": 3.4129692832764505e-05, + "loss": 6.097, + "step": 8490 + }, + { + "epoch": 2.897952218430034, + "grad_norm": 2.5361123085021973, + "learning_rate": 3.4015927189988624e-05, + "loss": 5.612, + "step": 8491 + }, + { + "epoch": 2.898293515358362, + "grad_norm": 2.522542715072632, + "learning_rate": 3.390216154721274e-05, + "loss": 6.287, + "step": 8492 + }, + { + "epoch": 2.8986348122866894, + "grad_norm": 2.521287202835083, + "learning_rate": 3.378839590443686e-05, + "loss": 6.448, + "step": 8493 + }, + { + "epoch": 2.898976109215017, + "grad_norm": 2.5632736682891846, + "learning_rate": 3.367463026166098e-05, + "loss": 6.7079, + "step": 8494 + }, + { + "epoch": 2.8993174061433447, + "grad_norm": 2.497307300567627, + "learning_rate": 3.35608646188851e-05, + "loss": 5.8383, + "step": 8495 + }, + { + "epoch": 2.8996587030716725, + "grad_norm": 2.5428879261016846, + "learning_rate": 3.3447098976109216e-05, + "loss": 6.4592, + "step": 8496 + }, + { + "epoch": 2.9, + "grad_norm": 2.5488977432250977, + "learning_rate": 3.3333333333333335e-05, + "loss": 5.6038, + "step": 8497 + }, + { + "epoch": 2.9003412969283278, + "grad_norm": 2.5850279331207275, + "learning_rate": 3.3219567690557453e-05, + "loss": 6.1336, + "step": 8498 + }, + { + "epoch": 2.900682593856655, + "grad_norm": 2.4947924613952637, + "learning_rate": 3.3105802047781565e-05, + "loss": 5.3009, + "step": 8499 + }, + { + "epoch": 2.901023890784983, + "grad_norm": 2.5952110290527344, + "learning_rate": 3.299203640500569e-05, + "loss": 5.9782, + "step": 8500 + }, + { + "epoch": 2.9013651877133104, + "grad_norm": 2.5056798458099365, + "learning_rate": 3.287827076222981e-05, + "loss": 5.4483, + "step": 8501 + }, + { + "epoch": 2.9017064846416383, + "grad_norm": 2.5299181938171387, + "learning_rate": 3.276450511945393e-05, + "loss": 6.3067, + "step": 8502 + }, + { + "epoch": 2.9020477815699657, + "grad_norm": 2.47225284576416, + "learning_rate": 3.2650739476678046e-05, + "loss": 5.9558, + "step": 8503 + }, + { + "epoch": 2.9023890784982935, + "grad_norm": 2.519516944885254, + "learning_rate": 3.253697383390216e-05, + "loss": 5.8317, + "step": 8504 + }, + { + "epoch": 2.9027303754266214, + "grad_norm": 2.538496255874634, + "learning_rate": 3.2423208191126276e-05, + "loss": 6.2876, + "step": 8505 + }, + { + "epoch": 2.903071672354949, + "grad_norm": 1.7910349369049072, + "learning_rate": 3.23094425483504e-05, + "loss": 3.0475, + "step": 8506 + }, + { + "epoch": 2.903412969283276, + "grad_norm": 2.440401077270508, + "learning_rate": 3.219567690557452e-05, + "loss": 6.2855, + "step": 8507 + }, + { + "epoch": 2.903754266211604, + "grad_norm": 2.4906156063079834, + "learning_rate": 3.208191126279864e-05, + "loss": 5.975, + "step": 8508 + }, + { + "epoch": 2.904095563139932, + "grad_norm": 2.4778618812561035, + "learning_rate": 3.196814562002275e-05, + "loss": 5.3293, + "step": 8509 + }, + { + "epoch": 2.9044368600682593, + "grad_norm": 2.5149154663085938, + "learning_rate": 3.185437997724687e-05, + "loss": 5.6077, + "step": 8510 + }, + { + "epoch": 2.904778156996587, + "grad_norm": 2.450698137283325, + "learning_rate": 3.1740614334470994e-05, + "loss": 5.5491, + "step": 8511 + }, + { + "epoch": 2.9051194539249146, + "grad_norm": 2.482588768005371, + "learning_rate": 3.162684869169511e-05, + "loss": 4.8993, + "step": 8512 + }, + { + "epoch": 2.9054607508532424, + "grad_norm": 2.5095643997192383, + "learning_rate": 3.151308304891923e-05, + "loss": 5.3198, + "step": 8513 + }, + { + "epoch": 2.90580204778157, + "grad_norm": 2.603271484375, + "learning_rate": 3.139931740614334e-05, + "loss": 6.0782, + "step": 8514 + }, + { + "epoch": 2.9061433447098977, + "grad_norm": 2.445197105407715, + "learning_rate": 3.128555176336746e-05, + "loss": 6.073, + "step": 8515 + }, + { + "epoch": 2.906484641638225, + "grad_norm": 2.500511407852173, + "learning_rate": 3.117178612059158e-05, + "loss": 5.7921, + "step": 8516 + }, + { + "epoch": 2.906825938566553, + "grad_norm": 2.6357011795043945, + "learning_rate": 3.10580204778157e-05, + "loss": 5.9283, + "step": 8517 + }, + { + "epoch": 2.9071672354948808, + "grad_norm": 2.5531325340270996, + "learning_rate": 3.094425483503982e-05, + "loss": 5.7393, + "step": 8518 + }, + { + "epoch": 2.907508532423208, + "grad_norm": 2.4458229541778564, + "learning_rate": 3.0830489192263935e-05, + "loss": 5.7482, + "step": 8519 + }, + { + "epoch": 2.9078498293515356, + "grad_norm": 2.4953360557556152, + "learning_rate": 3.0716723549488054e-05, + "loss": 5.7151, + "step": 8520 + }, + { + "epoch": 2.9081911262798634, + "grad_norm": 2.5121448040008545, + "learning_rate": 3.060295790671217e-05, + "loss": 5.8653, + "step": 8521 + }, + { + "epoch": 2.9085324232081913, + "grad_norm": 2.5156524181365967, + "learning_rate": 3.048919226393629e-05, + "loss": 6.4765, + "step": 8522 + }, + { + "epoch": 2.9088737201365187, + "grad_norm": 2.5591423511505127, + "learning_rate": 3.037542662116041e-05, + "loss": 6.2486, + "step": 8523 + }, + { + "epoch": 2.9092150170648465, + "grad_norm": 2.460395336151123, + "learning_rate": 3.026166097838453e-05, + "loss": 5.5661, + "step": 8524 + }, + { + "epoch": 2.909556313993174, + "grad_norm": 2.557854413986206, + "learning_rate": 3.0147895335608646e-05, + "loss": 6.0055, + "step": 8525 + }, + { + "epoch": 2.909897610921502, + "grad_norm": 2.463956594467163, + "learning_rate": 3.0034129692832765e-05, + "loss": 5.6952, + "step": 8526 + }, + { + "epoch": 2.910238907849829, + "grad_norm": 2.577049493789673, + "learning_rate": 2.9920364050056883e-05, + "loss": 6.4598, + "step": 8527 + }, + { + "epoch": 2.910580204778157, + "grad_norm": 2.5568654537200928, + "learning_rate": 2.9806598407281e-05, + "loss": 5.7478, + "step": 8528 + }, + { + "epoch": 2.9109215017064844, + "grad_norm": 2.576349973678589, + "learning_rate": 2.969283276450512e-05, + "loss": 6.455, + "step": 8529 + }, + { + "epoch": 2.9112627986348123, + "grad_norm": 2.497002601623535, + "learning_rate": 2.957906712172924e-05, + "loss": 5.4758, + "step": 8530 + }, + { + "epoch": 2.91160409556314, + "grad_norm": 2.4823288917541504, + "learning_rate": 2.9465301478953357e-05, + "loss": 6.516, + "step": 8531 + }, + { + "epoch": 2.9119453924914676, + "grad_norm": 2.561558246612549, + "learning_rate": 2.9351535836177476e-05, + "loss": 6.5232, + "step": 8532 + }, + { + "epoch": 2.912286689419795, + "grad_norm": 2.498506546020508, + "learning_rate": 2.9237770193401594e-05, + "loss": 5.9132, + "step": 8533 + }, + { + "epoch": 2.912627986348123, + "grad_norm": 2.6166279315948486, + "learning_rate": 2.912400455062571e-05, + "loss": 6.2046, + "step": 8534 + }, + { + "epoch": 2.9129692832764507, + "grad_norm": 2.5233633518218994, + "learning_rate": 2.901023890784983e-05, + "loss": 5.4725, + "step": 8535 + }, + { + "epoch": 2.913310580204778, + "grad_norm": 2.531095266342163, + "learning_rate": 2.889647326507395e-05, + "loss": 5.3685, + "step": 8536 + }, + { + "epoch": 2.913651877133106, + "grad_norm": 2.4622771739959717, + "learning_rate": 2.8782707622298065e-05, + "loss": 6.094, + "step": 8537 + }, + { + "epoch": 2.9139931740614333, + "grad_norm": 2.830827474594116, + "learning_rate": 2.8668941979522186e-05, + "loss": 5.0274, + "step": 8538 + }, + { + "epoch": 2.914334470989761, + "grad_norm": 2.535754442214966, + "learning_rate": 2.85551763367463e-05, + "loss": 5.6579, + "step": 8539 + }, + { + "epoch": 2.9146757679180886, + "grad_norm": 2.4490978717803955, + "learning_rate": 2.844141069397042e-05, + "loss": 5.3953, + "step": 8540 + }, + { + "epoch": 2.9150170648464164, + "grad_norm": 2.5437428951263428, + "learning_rate": 2.8327645051194542e-05, + "loss": 5.3498, + "step": 8541 + }, + { + "epoch": 2.915358361774744, + "grad_norm": 2.509765863418579, + "learning_rate": 2.8213879408418657e-05, + "loss": 5.8658, + "step": 8542 + }, + { + "epoch": 2.9156996587030717, + "grad_norm": 2.643817186355591, + "learning_rate": 2.8100113765642776e-05, + "loss": 4.4102, + "step": 8543 + }, + { + "epoch": 2.9160409556313995, + "grad_norm": 2.594874858856201, + "learning_rate": 2.7986348122866894e-05, + "loss": 5.9731, + "step": 8544 + }, + { + "epoch": 2.916382252559727, + "grad_norm": 2.5506644248962402, + "learning_rate": 2.7872582480091013e-05, + "loss": 5.838, + "step": 8545 + }, + { + "epoch": 2.9167235494880543, + "grad_norm": 2.4979164600372314, + "learning_rate": 2.7758816837315134e-05, + "loss": 6.2731, + "step": 8546 + }, + { + "epoch": 2.917064846416382, + "grad_norm": 2.5016138553619385, + "learning_rate": 2.764505119453925e-05, + "loss": 6.0537, + "step": 8547 + }, + { + "epoch": 2.91740614334471, + "grad_norm": 2.495339870452881, + "learning_rate": 2.7531285551763368e-05, + "loss": 6.1774, + "step": 8548 + }, + { + "epoch": 2.9177474402730375, + "grad_norm": 2.524669647216797, + "learning_rate": 2.7417519908987487e-05, + "loss": 5.435, + "step": 8549 + }, + { + "epoch": 2.9180887372013653, + "grad_norm": 2.4896841049194336, + "learning_rate": 2.7303754266211605e-05, + "loss": 5.6336, + "step": 8550 + }, + { + "epoch": 2.9184300341296927, + "grad_norm": 2.673529624938965, + "learning_rate": 2.7189988623435724e-05, + "loss": 5.4314, + "step": 8551 + }, + { + "epoch": 2.9187713310580206, + "grad_norm": 2.626699447631836, + "learning_rate": 2.7076222980659842e-05, + "loss": 5.5602, + "step": 8552 + }, + { + "epoch": 2.919112627986348, + "grad_norm": 2.520334243774414, + "learning_rate": 2.696245733788396e-05, + "loss": 5.9856, + "step": 8553 + }, + { + "epoch": 2.919453924914676, + "grad_norm": 2.5466573238372803, + "learning_rate": 2.6848691695108076e-05, + "loss": 5.8577, + "step": 8554 + }, + { + "epoch": 2.919795221843003, + "grad_norm": 2.380843162536621, + "learning_rate": 2.6734926052332198e-05, + "loss": 6.0584, + "step": 8555 + }, + { + "epoch": 2.920136518771331, + "grad_norm": 2.4532737731933594, + "learning_rate": 2.6621160409556316e-05, + "loss": 5.8358, + "step": 8556 + }, + { + "epoch": 2.920477815699659, + "grad_norm": 2.493149518966675, + "learning_rate": 2.650739476678043e-05, + "loss": 6.2587, + "step": 8557 + }, + { + "epoch": 2.9208191126279863, + "grad_norm": 2.247823476791382, + "learning_rate": 2.6393629124004553e-05, + "loss": 4.0832, + "step": 8558 + }, + { + "epoch": 2.9211604095563137, + "grad_norm": 2.4827280044555664, + "learning_rate": 2.6279863481228668e-05, + "loss": 5.8317, + "step": 8559 + }, + { + "epoch": 2.9215017064846416, + "grad_norm": 2.504302978515625, + "learning_rate": 2.6166097838452787e-05, + "loss": 5.7784, + "step": 8560 + }, + { + "epoch": 2.9218430034129694, + "grad_norm": 2.515334367752075, + "learning_rate": 2.6052332195676905e-05, + "loss": 5.533, + "step": 8561 + }, + { + "epoch": 2.922184300341297, + "grad_norm": 2.5521583557128906, + "learning_rate": 2.5938566552901024e-05, + "loss": 5.6752, + "step": 8562 + }, + { + "epoch": 2.9225255972696247, + "grad_norm": 2.538944959640503, + "learning_rate": 2.5824800910125145e-05, + "loss": 6.2666, + "step": 8563 + }, + { + "epoch": 2.922866894197952, + "grad_norm": 5.985457897186279, + "learning_rate": 2.571103526734926e-05, + "loss": 4.8984, + "step": 8564 + }, + { + "epoch": 2.92320819112628, + "grad_norm": 2.5538158416748047, + "learning_rate": 2.559726962457338e-05, + "loss": 5.5896, + "step": 8565 + }, + { + "epoch": 2.9235494880546073, + "grad_norm": 2.5504369735717773, + "learning_rate": 2.5483503981797498e-05, + "loss": 5.352, + "step": 8566 + }, + { + "epoch": 2.923890784982935, + "grad_norm": 2.536086082458496, + "learning_rate": 2.5369738339021616e-05, + "loss": 5.7882, + "step": 8567 + }, + { + "epoch": 2.9242320819112626, + "grad_norm": 2.487410068511963, + "learning_rate": 2.5255972696245735e-05, + "loss": 6.1777, + "step": 8568 + }, + { + "epoch": 2.9245733788395905, + "grad_norm": 2.55271577835083, + "learning_rate": 2.5142207053469853e-05, + "loss": 6.6132, + "step": 8569 + }, + { + "epoch": 2.9249146757679183, + "grad_norm": 2.5541188716888428, + "learning_rate": 2.502844141069397e-05, + "loss": 4.3692, + "step": 8570 + }, + { + "epoch": 2.9252559726962457, + "grad_norm": 2.487154006958008, + "learning_rate": 2.4914675767918087e-05, + "loss": 6.1641, + "step": 8571 + }, + { + "epoch": 2.925597269624573, + "grad_norm": 2.4862101078033447, + "learning_rate": 2.480091012514221e-05, + "loss": 6.3252, + "step": 8572 + }, + { + "epoch": 2.925938566552901, + "grad_norm": 2.580897569656372, + "learning_rate": 2.4687144482366327e-05, + "loss": 5.0885, + "step": 8573 + }, + { + "epoch": 2.926279863481229, + "grad_norm": 2.546022653579712, + "learning_rate": 2.4573378839590442e-05, + "loss": 5.7194, + "step": 8574 + }, + { + "epoch": 2.926621160409556, + "grad_norm": 2.55588436126709, + "learning_rate": 2.4459613196814564e-05, + "loss": 6.2847, + "step": 8575 + }, + { + "epoch": 2.926962457337884, + "grad_norm": 2.4706380367279053, + "learning_rate": 2.434584755403868e-05, + "loss": 5.9013, + "step": 8576 + }, + { + "epoch": 2.9273037542662115, + "grad_norm": 2.4694294929504395, + "learning_rate": 2.4232081911262798e-05, + "loss": 5.854, + "step": 8577 + }, + { + "epoch": 2.9276450511945393, + "grad_norm": 2.516496181488037, + "learning_rate": 2.411831626848692e-05, + "loss": 5.9697, + "step": 8578 + }, + { + "epoch": 2.9279863481228667, + "grad_norm": 2.545330762863159, + "learning_rate": 2.4004550625711035e-05, + "loss": 6.2448, + "step": 8579 + }, + { + "epoch": 2.9283276450511946, + "grad_norm": 2.4520492553710938, + "learning_rate": 2.3890784982935157e-05, + "loss": 5.3888, + "step": 8580 + }, + { + "epoch": 2.928668941979522, + "grad_norm": 2.5857083797454834, + "learning_rate": 2.377701934015927e-05, + "loss": 6.2331, + "step": 8581 + }, + { + "epoch": 2.92901023890785, + "grad_norm": 2.9561941623687744, + "learning_rate": 2.366325369738339e-05, + "loss": 4.9944, + "step": 8582 + }, + { + "epoch": 2.9293515358361777, + "grad_norm": 2.5174217224121094, + "learning_rate": 2.3549488054607512e-05, + "loss": 5.897, + "step": 8583 + }, + { + "epoch": 2.929692832764505, + "grad_norm": 2.585947275161743, + "learning_rate": 2.3435722411831627e-05, + "loss": 6.0279, + "step": 8584 + }, + { + "epoch": 2.9300341296928325, + "grad_norm": 2.439552068710327, + "learning_rate": 2.3321956769055746e-05, + "loss": 5.9078, + "step": 8585 + }, + { + "epoch": 2.9303754266211604, + "grad_norm": 2.4478914737701416, + "learning_rate": 2.3208191126279864e-05, + "loss": 5.6497, + "step": 8586 + }, + { + "epoch": 2.930716723549488, + "grad_norm": 2.537717342376709, + "learning_rate": 2.3094425483503983e-05, + "loss": 6.0121, + "step": 8587 + }, + { + "epoch": 2.9310580204778156, + "grad_norm": 2.4616639614105225, + "learning_rate": 2.2980659840728098e-05, + "loss": 5.9547, + "step": 8588 + }, + { + "epoch": 2.9313993174061435, + "grad_norm": 2.514845371246338, + "learning_rate": 2.286689419795222e-05, + "loss": 6.3048, + "step": 8589 + }, + { + "epoch": 2.931740614334471, + "grad_norm": 2.4787206649780273, + "learning_rate": 2.2753128555176338e-05, + "loss": 6.2414, + "step": 8590 + }, + { + "epoch": 2.9320819112627987, + "grad_norm": 2.4855761528015137, + "learning_rate": 2.2639362912400453e-05, + "loss": 6.3916, + "step": 8591 + }, + { + "epoch": 2.932423208191126, + "grad_norm": 2.6569809913635254, + "learning_rate": 2.2525597269624575e-05, + "loss": 6.2394, + "step": 8592 + }, + { + "epoch": 2.932764505119454, + "grad_norm": 2.4555177688598633, + "learning_rate": 2.241183162684869e-05, + "loss": 5.8789, + "step": 8593 + }, + { + "epoch": 2.9331058020477814, + "grad_norm": 2.477505683898926, + "learning_rate": 2.2298065984072812e-05, + "loss": 5.7146, + "step": 8594 + }, + { + "epoch": 2.9334470989761092, + "grad_norm": 2.5200958251953125, + "learning_rate": 2.218430034129693e-05, + "loss": 6.0025, + "step": 8595 + }, + { + "epoch": 2.933788395904437, + "grad_norm": 2.5263843536376953, + "learning_rate": 2.2070534698521046e-05, + "loss": 6.2893, + "step": 8596 + }, + { + "epoch": 2.9341296928327645, + "grad_norm": 2.552253007888794, + "learning_rate": 2.1956769055745168e-05, + "loss": 6.0527, + "step": 8597 + }, + { + "epoch": 2.934470989761092, + "grad_norm": 2.494657039642334, + "learning_rate": 2.1843003412969283e-05, + "loss": 5.55, + "step": 8598 + }, + { + "epoch": 2.9348122866894197, + "grad_norm": 2.446608543395996, + "learning_rate": 2.17292377701934e-05, + "loss": 5.6856, + "step": 8599 + }, + { + "epoch": 2.9351535836177476, + "grad_norm": 2.4764108657836914, + "learning_rate": 2.1615472127417523e-05, + "loss": 6.2355, + "step": 8600 + }, + { + "epoch": 2.935494880546075, + "grad_norm": 2.570817232131958, + "learning_rate": 2.1501706484641638e-05, + "loss": 6.3309, + "step": 8601 + }, + { + "epoch": 2.935836177474403, + "grad_norm": 2.5633604526519775, + "learning_rate": 2.1387940841865757e-05, + "loss": 5.7691, + "step": 8602 + }, + { + "epoch": 2.9361774744027302, + "grad_norm": 2.5183494091033936, + "learning_rate": 2.1274175199089875e-05, + "loss": 4.9273, + "step": 8603 + }, + { + "epoch": 2.936518771331058, + "grad_norm": 2.4383068084716797, + "learning_rate": 2.1160409556313994e-05, + "loss": 5.0008, + "step": 8604 + }, + { + "epoch": 2.9368600682593855, + "grad_norm": 2.5003600120544434, + "learning_rate": 2.1046643913538112e-05, + "loss": 5.857, + "step": 8605 + }, + { + "epoch": 2.9372013651877134, + "grad_norm": 2.480581283569336, + "learning_rate": 2.093287827076223e-05, + "loss": 5.1716, + "step": 8606 + }, + { + "epoch": 2.9375426621160408, + "grad_norm": 2.540728807449341, + "learning_rate": 2.081911262798635e-05, + "loss": 6.2848, + "step": 8607 + }, + { + "epoch": 2.9378839590443686, + "grad_norm": 2.5028791427612305, + "learning_rate": 2.0705346985210464e-05, + "loss": 5.7391, + "step": 8608 + }, + { + "epoch": 2.9382252559726965, + "grad_norm": 2.4789538383483887, + "learning_rate": 2.0591581342434586e-05, + "loss": 5.223, + "step": 8609 + }, + { + "epoch": 2.938566552901024, + "grad_norm": 2.4967267513275146, + "learning_rate": 2.04778156996587e-05, + "loss": 5.668, + "step": 8610 + }, + { + "epoch": 2.9389078498293513, + "grad_norm": 2.519948959350586, + "learning_rate": 2.0364050056882823e-05, + "loss": 5.9026, + "step": 8611 + }, + { + "epoch": 2.939249146757679, + "grad_norm": 2.4573259353637695, + "learning_rate": 2.025028441410694e-05, + "loss": 5.8025, + "step": 8612 + }, + { + "epoch": 2.939590443686007, + "grad_norm": 2.5251667499542236, + "learning_rate": 2.0136518771331057e-05, + "loss": 5.5212, + "step": 8613 + }, + { + "epoch": 2.9399317406143344, + "grad_norm": 2.6210274696350098, + "learning_rate": 2.002275312855518e-05, + "loss": 5.3951, + "step": 8614 + }, + { + "epoch": 2.9402730375426622, + "grad_norm": 2.532759189605713, + "learning_rate": 1.9908987485779294e-05, + "loss": 5.797, + "step": 8615 + }, + { + "epoch": 2.9406143344709896, + "grad_norm": 2.4565927982330322, + "learning_rate": 1.9795221843003412e-05, + "loss": 5.4728, + "step": 8616 + }, + { + "epoch": 2.9409556313993175, + "grad_norm": 2.4736921787261963, + "learning_rate": 1.9681456200227534e-05, + "loss": 5.9357, + "step": 8617 + }, + { + "epoch": 2.941296928327645, + "grad_norm": 2.5852224826812744, + "learning_rate": 1.956769055745165e-05, + "loss": 4.7119, + "step": 8618 + }, + { + "epoch": 2.9416382252559727, + "grad_norm": 2.5384979248046875, + "learning_rate": 1.9453924914675768e-05, + "loss": 5.4497, + "step": 8619 + }, + { + "epoch": 2.9419795221843, + "grad_norm": 2.4566867351531982, + "learning_rate": 1.9340159271899886e-05, + "loss": 5.7871, + "step": 8620 + }, + { + "epoch": 2.942320819112628, + "grad_norm": 2.4984500408172607, + "learning_rate": 1.9226393629124005e-05, + "loss": 5.9535, + "step": 8621 + }, + { + "epoch": 2.942662116040956, + "grad_norm": 2.4898228645324707, + "learning_rate": 1.9112627986348123e-05, + "loss": 5.7868, + "step": 8622 + }, + { + "epoch": 2.9430034129692833, + "grad_norm": 2.410717248916626, + "learning_rate": 1.899886234357224e-05, + "loss": 4.692, + "step": 8623 + }, + { + "epoch": 2.9433447098976107, + "grad_norm": 2.5031991004943848, + "learning_rate": 1.888509670079636e-05, + "loss": 6.0732, + "step": 8624 + }, + { + "epoch": 2.9436860068259385, + "grad_norm": 2.5316109657287598, + "learning_rate": 1.877133105802048e-05, + "loss": 5.5196, + "step": 8625 + }, + { + "epoch": 2.9440273037542664, + "grad_norm": 2.44270658493042, + "learning_rate": 1.8657565415244597e-05, + "loss": 5.973, + "step": 8626 + }, + { + "epoch": 2.9443686006825938, + "grad_norm": 2.500929355621338, + "learning_rate": 1.8543799772468716e-05, + "loss": 6.2362, + "step": 8627 + }, + { + "epoch": 2.9447098976109216, + "grad_norm": 2.4967660903930664, + "learning_rate": 1.8430034129692834e-05, + "loss": 6.0688, + "step": 8628 + }, + { + "epoch": 2.945051194539249, + "grad_norm": 2.551168441772461, + "learning_rate": 1.8316268486916953e-05, + "loss": 6.4435, + "step": 8629 + }, + { + "epoch": 2.945392491467577, + "grad_norm": 2.529707431793213, + "learning_rate": 1.8202502844141068e-05, + "loss": 6.0493, + "step": 8630 + }, + { + "epoch": 2.9457337883959043, + "grad_norm": 2.394730567932129, + "learning_rate": 1.808873720136519e-05, + "loss": 5.1711, + "step": 8631 + }, + { + "epoch": 2.946075085324232, + "grad_norm": 2.5961592197418213, + "learning_rate": 1.7974971558589305e-05, + "loss": 6.0267, + "step": 8632 + }, + { + "epoch": 2.9464163822525595, + "grad_norm": 2.4679746627807617, + "learning_rate": 1.7861205915813423e-05, + "loss": 5.6721, + "step": 8633 + }, + { + "epoch": 2.9467576791808874, + "grad_norm": 2.5363996028900146, + "learning_rate": 1.7747440273037545e-05, + "loss": 5.6486, + "step": 8634 + }, + { + "epoch": 2.9470989761092152, + "grad_norm": 2.4979100227355957, + "learning_rate": 1.763367463026166e-05, + "loss": 6.2355, + "step": 8635 + }, + { + "epoch": 2.9474402730375426, + "grad_norm": 2.5005037784576416, + "learning_rate": 1.751990898748578e-05, + "loss": 6.0746, + "step": 8636 + }, + { + "epoch": 2.94778156996587, + "grad_norm": 2.6224281787872314, + "learning_rate": 1.7406143344709897e-05, + "loss": 6.1915, + "step": 8637 + }, + { + "epoch": 2.948122866894198, + "grad_norm": 2.4762070178985596, + "learning_rate": 1.7292377701934016e-05, + "loss": 5.9801, + "step": 8638 + }, + { + "epoch": 2.9484641638225257, + "grad_norm": 2.5673601627349854, + "learning_rate": 1.7178612059158134e-05, + "loss": 5.5908, + "step": 8639 + }, + { + "epoch": 2.948805460750853, + "grad_norm": 2.461416482925415, + "learning_rate": 1.7064846416382253e-05, + "loss": 5.2207, + "step": 8640 + }, + { + "epoch": 2.949146757679181, + "grad_norm": 2.4866373538970947, + "learning_rate": 1.695108077360637e-05, + "loss": 6.422, + "step": 8641 + }, + { + "epoch": 2.9494880546075084, + "grad_norm": 2.6868081092834473, + "learning_rate": 1.683731513083049e-05, + "loss": 4.5656, + "step": 8642 + }, + { + "epoch": 2.9498293515358363, + "grad_norm": 2.542024612426758, + "learning_rate": 1.6723549488054608e-05, + "loss": 5.2073, + "step": 8643 + }, + { + "epoch": 2.9501706484641637, + "grad_norm": 2.4783406257629395, + "learning_rate": 1.6609783845278727e-05, + "loss": 5.8559, + "step": 8644 + }, + { + "epoch": 2.9505119453924915, + "grad_norm": 2.5044283866882324, + "learning_rate": 1.6496018202502845e-05, + "loss": 6.2763, + "step": 8645 + }, + { + "epoch": 2.950853242320819, + "grad_norm": 2.472458600997925, + "learning_rate": 1.6382252559726964e-05, + "loss": 6.0363, + "step": 8646 + }, + { + "epoch": 2.9511945392491468, + "grad_norm": 2.4711015224456787, + "learning_rate": 1.626848691695108e-05, + "loss": 5.4181, + "step": 8647 + }, + { + "epoch": 2.9515358361774746, + "grad_norm": 2.5318753719329834, + "learning_rate": 1.61547212741752e-05, + "loss": 6.4278, + "step": 8648 + }, + { + "epoch": 2.951877133105802, + "grad_norm": 2.4760727882385254, + "learning_rate": 1.604095563139932e-05, + "loss": 6.1243, + "step": 8649 + }, + { + "epoch": 2.9522184300341294, + "grad_norm": 2.4930100440979004, + "learning_rate": 1.5927189988623434e-05, + "loss": 5.0605, + "step": 8650 + }, + { + "epoch": 2.9525597269624573, + "grad_norm": 2.477997064590454, + "learning_rate": 1.5813424345847556e-05, + "loss": 6.0705, + "step": 8651 + }, + { + "epoch": 2.952901023890785, + "grad_norm": 2.1888606548309326, + "learning_rate": 1.569965870307167e-05, + "loss": 4.6192, + "step": 8652 + }, + { + "epoch": 2.9532423208191125, + "grad_norm": 2.3284385204315186, + "learning_rate": 1.558589306029579e-05, + "loss": 4.0104, + "step": 8653 + }, + { + "epoch": 2.9535836177474404, + "grad_norm": 2.4472768306732178, + "learning_rate": 1.547212741751991e-05, + "loss": 5.7057, + "step": 8654 + }, + { + "epoch": 2.953924914675768, + "grad_norm": 2.4223828315734863, + "learning_rate": 1.5358361774744027e-05, + "loss": 5.7879, + "step": 8655 + }, + { + "epoch": 2.9542662116040956, + "grad_norm": 2.5395631790161133, + "learning_rate": 1.5244596131968145e-05, + "loss": 5.7686, + "step": 8656 + }, + { + "epoch": 2.954607508532423, + "grad_norm": 2.530900239944458, + "learning_rate": 1.5130830489192265e-05, + "loss": 4.8828, + "step": 8657 + }, + { + "epoch": 2.954948805460751, + "grad_norm": 2.5554354190826416, + "learning_rate": 1.5017064846416382e-05, + "loss": 6.6185, + "step": 8658 + }, + { + "epoch": 2.9552901023890783, + "grad_norm": 2.492499828338623, + "learning_rate": 1.49032992036405e-05, + "loss": 5.1967, + "step": 8659 + }, + { + "epoch": 2.955631399317406, + "grad_norm": 2.4863228797912598, + "learning_rate": 1.478953356086462e-05, + "loss": 5.9918, + "step": 8660 + }, + { + "epoch": 2.955972696245734, + "grad_norm": 2.4008748531341553, + "learning_rate": 1.4675767918088738e-05, + "loss": 5.112, + "step": 8661 + }, + { + "epoch": 2.9563139931740614, + "grad_norm": 2.5058634281158447, + "learning_rate": 1.4562002275312855e-05, + "loss": 5.9349, + "step": 8662 + }, + { + "epoch": 2.956655290102389, + "grad_norm": 2.5977463722229004, + "learning_rate": 1.4448236632536975e-05, + "loss": 5.3876, + "step": 8663 + }, + { + "epoch": 2.9569965870307167, + "grad_norm": 1.7508560419082642, + "learning_rate": 1.4334470989761093e-05, + "loss": 2.9253, + "step": 8664 + }, + { + "epoch": 2.9573378839590445, + "grad_norm": 2.433396816253662, + "learning_rate": 1.422070534698521e-05, + "loss": 6.2839, + "step": 8665 + }, + { + "epoch": 2.957679180887372, + "grad_norm": 2.559898614883423, + "learning_rate": 1.4106939704209329e-05, + "loss": 5.909, + "step": 8666 + }, + { + "epoch": 2.9580204778156998, + "grad_norm": 2.5637035369873047, + "learning_rate": 1.3993174061433447e-05, + "loss": 5.9518, + "step": 8667 + }, + { + "epoch": 2.958361774744027, + "grad_norm": 2.566162586212158, + "learning_rate": 1.3879408418657567e-05, + "loss": 6.0647, + "step": 8668 + }, + { + "epoch": 2.958703071672355, + "grad_norm": 2.502931594848633, + "learning_rate": 1.3765642775881684e-05, + "loss": 4.3806, + "step": 8669 + }, + { + "epoch": 2.9590443686006824, + "grad_norm": 2.453782320022583, + "learning_rate": 1.3651877133105803e-05, + "loss": 5.6523, + "step": 8670 + }, + { + "epoch": 2.9593856655290103, + "grad_norm": 2.585204601287842, + "learning_rate": 1.3538111490329921e-05, + "loss": 5.9991, + "step": 8671 + }, + { + "epoch": 2.9597269624573377, + "grad_norm": 2.713543176651001, + "learning_rate": 1.3424345847554038e-05, + "loss": 4.8614, + "step": 8672 + }, + { + "epoch": 2.9600682593856655, + "grad_norm": 2.511810541152954, + "learning_rate": 1.3310580204778158e-05, + "loss": 6.274, + "step": 8673 + }, + { + "epoch": 2.9604095563139934, + "grad_norm": 2.4611916542053223, + "learning_rate": 1.3196814562002277e-05, + "loss": 5.9661, + "step": 8674 + }, + { + "epoch": 2.960750853242321, + "grad_norm": 2.5248091220855713, + "learning_rate": 1.3083048919226393e-05, + "loss": 6.2989, + "step": 8675 + }, + { + "epoch": 2.961092150170648, + "grad_norm": 2.4829485416412354, + "learning_rate": 1.2969283276450512e-05, + "loss": 5.8782, + "step": 8676 + }, + { + "epoch": 2.961433447098976, + "grad_norm": 2.432086229324341, + "learning_rate": 1.285551763367463e-05, + "loss": 5.5988, + "step": 8677 + }, + { + "epoch": 2.961774744027304, + "grad_norm": 2.426537036895752, + "learning_rate": 1.2741751990898749e-05, + "loss": 4.3375, + "step": 8678 + }, + { + "epoch": 2.9621160409556313, + "grad_norm": 2.4175045490264893, + "learning_rate": 1.2627986348122867e-05, + "loss": 5.8644, + "step": 8679 + }, + { + "epoch": 2.962457337883959, + "grad_norm": 2.5056276321411133, + "learning_rate": 1.2514220705346986e-05, + "loss": 5.4849, + "step": 8680 + }, + { + "epoch": 2.9627986348122866, + "grad_norm": 2.4787650108337402, + "learning_rate": 1.2400455062571104e-05, + "loss": 5.944, + "step": 8681 + }, + { + "epoch": 2.9631399317406144, + "grad_norm": 2.4695487022399902, + "learning_rate": 1.2286689419795221e-05, + "loss": 5.8351, + "step": 8682 + }, + { + "epoch": 2.963481228668942, + "grad_norm": 2.573592185974121, + "learning_rate": 1.217292377701934e-05, + "loss": 5.9925, + "step": 8683 + }, + { + "epoch": 2.9638225255972697, + "grad_norm": 2.236001968383789, + "learning_rate": 1.205915813424346e-05, + "loss": 4.7678, + "step": 8684 + }, + { + "epoch": 2.964163822525597, + "grad_norm": 2.475437641143799, + "learning_rate": 1.1945392491467578e-05, + "loss": 6.0264, + "step": 8685 + }, + { + "epoch": 2.964505119453925, + "grad_norm": 2.467595100402832, + "learning_rate": 1.1831626848691695e-05, + "loss": 5.2569, + "step": 8686 + }, + { + "epoch": 2.9648464163822528, + "grad_norm": 2.4960720539093018, + "learning_rate": 1.1717861205915814e-05, + "loss": 5.4893, + "step": 8687 + }, + { + "epoch": 2.96518771331058, + "grad_norm": 2.579531192779541, + "learning_rate": 1.1604095563139932e-05, + "loss": 6.2595, + "step": 8688 + }, + { + "epoch": 2.9655290102389076, + "grad_norm": 2.4547293186187744, + "learning_rate": 1.1490329920364049e-05, + "loss": 6.4029, + "step": 8689 + }, + { + "epoch": 2.9658703071672354, + "grad_norm": 2.6123619079589844, + "learning_rate": 1.1376564277588169e-05, + "loss": 5.7366, + "step": 8690 + }, + { + "epoch": 2.9662116040955633, + "grad_norm": 2.4635937213897705, + "learning_rate": 1.1262798634812288e-05, + "loss": 5.4398, + "step": 8691 + }, + { + "epoch": 2.9665529010238907, + "grad_norm": 2.44295597076416, + "learning_rate": 1.1149032992036406e-05, + "loss": 5.6055, + "step": 8692 + }, + { + "epoch": 2.9668941979522185, + "grad_norm": 2.5429694652557373, + "learning_rate": 1.1035267349260523e-05, + "loss": 5.9528, + "step": 8693 + }, + { + "epoch": 2.967235494880546, + "grad_norm": 2.481144428253174, + "learning_rate": 1.0921501706484641e-05, + "loss": 6.0664, + "step": 8694 + }, + { + "epoch": 2.967576791808874, + "grad_norm": 2.5129120349884033, + "learning_rate": 1.0807736063708762e-05, + "loss": 5.1419, + "step": 8695 + }, + { + "epoch": 2.967918088737201, + "grad_norm": 2.522629737854004, + "learning_rate": 1.0693970420932878e-05, + "loss": 6.0011, + "step": 8696 + }, + { + "epoch": 2.968259385665529, + "grad_norm": 2.5712890625, + "learning_rate": 1.0580204778156997e-05, + "loss": 5.9975, + "step": 8697 + }, + { + "epoch": 2.9686006825938565, + "grad_norm": 2.445847511291504, + "learning_rate": 1.0466439135381115e-05, + "loss": 5.2381, + "step": 8698 + }, + { + "epoch": 2.9689419795221843, + "grad_norm": 2.4890472888946533, + "learning_rate": 1.0352673492605232e-05, + "loss": 6.3443, + "step": 8699 + }, + { + "epoch": 2.969283276450512, + "grad_norm": 2.519037961959839, + "learning_rate": 1.023890784982935e-05, + "loss": 4.9883, + "step": 8700 + }, + { + "epoch": 2.9696245733788396, + "grad_norm": 2.496644973754883, + "learning_rate": 1.012514220705347e-05, + "loss": 5.4765, + "step": 8701 + }, + { + "epoch": 2.969965870307167, + "grad_norm": 2.4772489070892334, + "learning_rate": 1.001137656427759e-05, + "loss": 6.0936, + "step": 8702 + }, + { + "epoch": 2.970307167235495, + "grad_norm": 2.4817209243774414, + "learning_rate": 9.897610921501706e-06, + "loss": 5.833, + "step": 8703 + }, + { + "epoch": 2.9706484641638227, + "grad_norm": 2.488058567047119, + "learning_rate": 9.783845278725825e-06, + "loss": 6.3268, + "step": 8704 + }, + { + "epoch": 2.97098976109215, + "grad_norm": 2.6241326332092285, + "learning_rate": 9.670079635949943e-06, + "loss": 6.1281, + "step": 8705 + }, + { + "epoch": 2.971331058020478, + "grad_norm": 2.4947798252105713, + "learning_rate": 9.556313993174062e-06, + "loss": 6.1815, + "step": 8706 + }, + { + "epoch": 2.9716723549488053, + "grad_norm": 2.588179349899292, + "learning_rate": 9.44254835039818e-06, + "loss": 6.1517, + "step": 8707 + }, + { + "epoch": 2.972013651877133, + "grad_norm": 2.5421693325042725, + "learning_rate": 9.328782707622299e-06, + "loss": 5.874, + "step": 8708 + }, + { + "epoch": 2.972354948805461, + "grad_norm": 2.4261326789855957, + "learning_rate": 9.215017064846417e-06, + "loss": 5.1937, + "step": 8709 + }, + { + "epoch": 2.9726962457337884, + "grad_norm": 4.1984076499938965, + "learning_rate": 9.101251422070534e-06, + "loss": 4.4323, + "step": 8710 + }, + { + "epoch": 2.973037542662116, + "grad_norm": 2.5183022022247314, + "learning_rate": 8.987485779294652e-06, + "loss": 6.3584, + "step": 8711 + }, + { + "epoch": 2.9733788395904437, + "grad_norm": 2.593282699584961, + "learning_rate": 8.873720136518773e-06, + "loss": 6.1431, + "step": 8712 + }, + { + "epoch": 2.9737201365187715, + "grad_norm": 2.50671124458313, + "learning_rate": 8.75995449374289e-06, + "loss": 5.9708, + "step": 8713 + }, + { + "epoch": 2.974061433447099, + "grad_norm": 2.5852482318878174, + "learning_rate": 8.646188850967008e-06, + "loss": 6.1701, + "step": 8714 + }, + { + "epoch": 2.9744027303754264, + "grad_norm": 2.4211649894714355, + "learning_rate": 8.532423208191126e-06, + "loss": 5.8361, + "step": 8715 + }, + { + "epoch": 2.974744027303754, + "grad_norm": 2.5099081993103027, + "learning_rate": 8.418657565415245e-06, + "loss": 5.9594, + "step": 8716 + }, + { + "epoch": 2.975085324232082, + "grad_norm": 2.5803756713867188, + "learning_rate": 8.304891922639363e-06, + "loss": 6.0941, + "step": 8717 + }, + { + "epoch": 2.9754266211604095, + "grad_norm": 2.493959665298462, + "learning_rate": 8.191126279863482e-06, + "loss": 6.2058, + "step": 8718 + }, + { + "epoch": 2.9757679180887373, + "grad_norm": 2.3790855407714844, + "learning_rate": 8.0773606370876e-06, + "loss": 4.5639, + "step": 8719 + }, + { + "epoch": 2.9761092150170647, + "grad_norm": 2.463736057281494, + "learning_rate": 7.963594994311717e-06, + "loss": 5.2897, + "step": 8720 + }, + { + "epoch": 2.9764505119453926, + "grad_norm": 2.368633270263672, + "learning_rate": 7.849829351535836e-06, + "loss": 4.8404, + "step": 8721 + }, + { + "epoch": 2.9767918088737204, + "grad_norm": 2.4608263969421387, + "learning_rate": 7.736063708759956e-06, + "loss": 6.1187, + "step": 8722 + }, + { + "epoch": 2.977133105802048, + "grad_norm": 2.5266082286834717, + "learning_rate": 7.622298065984073e-06, + "loss": 4.6866, + "step": 8723 + }, + { + "epoch": 2.9774744027303752, + "grad_norm": 2.4819984436035156, + "learning_rate": 7.508532423208191e-06, + "loss": 5.8985, + "step": 8724 + }, + { + "epoch": 2.977815699658703, + "grad_norm": 2.534745931625366, + "learning_rate": 7.39476678043231e-06, + "loss": 6.0181, + "step": 8725 + }, + { + "epoch": 2.978156996587031, + "grad_norm": 2.522770643234253, + "learning_rate": 7.281001137656427e-06, + "loss": 4.879, + "step": 8726 + }, + { + "epoch": 2.9784982935153583, + "grad_norm": 2.298532009124756, + "learning_rate": 7.167235494880547e-06, + "loss": 4.4423, + "step": 8727 + }, + { + "epoch": 2.9788395904436857, + "grad_norm": 2.4776766300201416, + "learning_rate": 7.053469852104664e-06, + "loss": 5.5068, + "step": 8728 + }, + { + "epoch": 2.9791808873720136, + "grad_norm": 2.411564588546753, + "learning_rate": 6.939704209328784e-06, + "loss": 5.7158, + "step": 8729 + }, + { + "epoch": 2.9795221843003414, + "grad_norm": 2.4252982139587402, + "learning_rate": 6.825938566552901e-06, + "loss": 4.9397, + "step": 8730 + }, + { + "epoch": 2.979863481228669, + "grad_norm": 2.472968339920044, + "learning_rate": 6.712172923777019e-06, + "loss": 4.6723, + "step": 8731 + }, + { + "epoch": 2.9802047781569967, + "grad_norm": 2.463764190673828, + "learning_rate": 6.598407281001138e-06, + "loss": 5.8935, + "step": 8732 + }, + { + "epoch": 2.980546075085324, + "grad_norm": 2.4861762523651123, + "learning_rate": 6.484641638225256e-06, + "loss": 5.6101, + "step": 8733 + }, + { + "epoch": 2.980887372013652, + "grad_norm": 2.5116894245147705, + "learning_rate": 6.370875995449374e-06, + "loss": 6.019, + "step": 8734 + }, + { + "epoch": 2.98122866894198, + "grad_norm": 2.589373826980591, + "learning_rate": 6.257110352673493e-06, + "loss": 5.3002, + "step": 8735 + }, + { + "epoch": 2.981569965870307, + "grad_norm": 2.5636868476867676, + "learning_rate": 6.1433447098976105e-06, + "loss": 6.0956, + "step": 8736 + }, + { + "epoch": 2.9819112627986346, + "grad_norm": 2.4157183170318604, + "learning_rate": 6.02957906712173e-06, + "loss": 6.1327, + "step": 8737 + }, + { + "epoch": 2.9822525597269625, + "grad_norm": 2.5133705139160156, + "learning_rate": 5.9158134243458475e-06, + "loss": 6.5907, + "step": 8738 + }, + { + "epoch": 2.9825938566552903, + "grad_norm": 2.6293768882751465, + "learning_rate": 5.802047781569966e-06, + "loss": 6.3252, + "step": 8739 + }, + { + "epoch": 2.9829351535836177, + "grad_norm": 2.530759334564209, + "learning_rate": 5.6882821387940845e-06, + "loss": 5.8731, + "step": 8740 + }, + { + "epoch": 2.983276450511945, + "grad_norm": 2.5379390716552734, + "learning_rate": 5.574516496018203e-06, + "loss": 5.9245, + "step": 8741 + }, + { + "epoch": 2.983617747440273, + "grad_norm": 2.5322394371032715, + "learning_rate": 5.460750853242321e-06, + "loss": 4.8229, + "step": 8742 + }, + { + "epoch": 2.983959044368601, + "grad_norm": 2.562993049621582, + "learning_rate": 5.346985210466439e-06, + "loss": 6.2476, + "step": 8743 + }, + { + "epoch": 2.9843003412969282, + "grad_norm": 2.5595932006835938, + "learning_rate": 5.233219567690558e-06, + "loss": 6.0822, + "step": 8744 + }, + { + "epoch": 2.984641638225256, + "grad_norm": 2.542302131652832, + "learning_rate": 5.119453924914675e-06, + "loss": 5.6779, + "step": 8745 + }, + { + "epoch": 2.9849829351535835, + "grad_norm": 2.3574278354644775, + "learning_rate": 5.005688282138795e-06, + "loss": 5.5255, + "step": 8746 + }, + { + "epoch": 2.9853242320819113, + "grad_norm": 2.4841222763061523, + "learning_rate": 4.891922639362912e-06, + "loss": 5.7245, + "step": 8747 + }, + { + "epoch": 2.985665529010239, + "grad_norm": 2.4711790084838867, + "learning_rate": 4.778156996587031e-06, + "loss": 6.1008, + "step": 8748 + }, + { + "epoch": 2.9860068259385666, + "grad_norm": 2.472411632537842, + "learning_rate": 4.664391353811149e-06, + "loss": 5.9053, + "step": 8749 + }, + { + "epoch": 2.986348122866894, + "grad_norm": 2.540469169616699, + "learning_rate": 4.550625711035267e-06, + "loss": 5.4633, + "step": 8750 + }, + { + "epoch": 2.986689419795222, + "grad_norm": 2.4350697994232178, + "learning_rate": 4.436860068259386e-06, + "loss": 5.7402, + "step": 8751 + }, + { + "epoch": 2.9870307167235497, + "grad_norm": 2.4693779945373535, + "learning_rate": 4.323094425483504e-06, + "loss": 5.52, + "step": 8752 + }, + { + "epoch": 2.987372013651877, + "grad_norm": 2.453641414642334, + "learning_rate": 4.2093287827076224e-06, + "loss": 6.0574, + "step": 8753 + }, + { + "epoch": 2.9877133105802045, + "grad_norm": 2.4429779052734375, + "learning_rate": 4.095563139931741e-06, + "loss": 5.5031, + "step": 8754 + }, + { + "epoch": 2.9880546075085324, + "grad_norm": 2.479944944381714, + "learning_rate": 3.9817974971558586e-06, + "loss": 5.6135, + "step": 8755 + }, + { + "epoch": 2.98839590443686, + "grad_norm": 2.4810147285461426, + "learning_rate": 3.868031854379978e-06, + "loss": 6.4582, + "step": 8756 + }, + { + "epoch": 2.9887372013651876, + "grad_norm": 2.5106160640716553, + "learning_rate": 3.7542662116040956e-06, + "loss": 6.1153, + "step": 8757 + }, + { + "epoch": 2.9890784982935155, + "grad_norm": 2.4401440620422363, + "learning_rate": 3.6405005688282136e-06, + "loss": 5.8295, + "step": 8758 + }, + { + "epoch": 2.989419795221843, + "grad_norm": 2.520796298980713, + "learning_rate": 3.526734926052332e-06, + "loss": 6.0053, + "step": 8759 + }, + { + "epoch": 2.9897610921501707, + "grad_norm": 2.531611204147339, + "learning_rate": 3.4129692832764506e-06, + "loss": 6.1409, + "step": 8760 + }, + { + "epoch": 2.9901023890784986, + "grad_norm": 2.4728920459747314, + "learning_rate": 3.299203640500569e-06, + "loss": 5.6671, + "step": 8761 + }, + { + "epoch": 2.990443686006826, + "grad_norm": 2.493472099304199, + "learning_rate": 3.185437997724687e-06, + "loss": 6.0225, + "step": 8762 + }, + { + "epoch": 2.9907849829351534, + "grad_norm": 2.4957869052886963, + "learning_rate": 3.0716723549488053e-06, + "loss": 6.2367, + "step": 8763 + }, + { + "epoch": 2.9911262798634812, + "grad_norm": 2.530949115753174, + "learning_rate": 2.9579067121729238e-06, + "loss": 5.9412, + "step": 8764 + }, + { + "epoch": 2.991467576791809, + "grad_norm": 2.5535237789154053, + "learning_rate": 2.8441410693970423e-06, + "loss": 5.8223, + "step": 8765 + }, + { + "epoch": 2.9918088737201365, + "grad_norm": 2.5407912731170654, + "learning_rate": 2.7303754266211603e-06, + "loss": 5.8673, + "step": 8766 + }, + { + "epoch": 2.992150170648464, + "grad_norm": 2.4849934577941895, + "learning_rate": 2.616609783845279e-06, + "loss": 6.2063, + "step": 8767 + }, + { + "epoch": 2.9924914675767917, + "grad_norm": 2.459456443786621, + "learning_rate": 2.5028441410693973e-06, + "loss": 5.7936, + "step": 8768 + }, + { + "epoch": 2.9928327645051196, + "grad_norm": 2.41933012008667, + "learning_rate": 2.3890784982935154e-06, + "loss": 6.1125, + "step": 8769 + }, + { + "epoch": 2.993174061433447, + "grad_norm": 2.5088438987731934, + "learning_rate": 2.2753128555176335e-06, + "loss": 5.7709, + "step": 8770 + }, + { + "epoch": 2.993515358361775, + "grad_norm": 2.481630563735962, + "learning_rate": 2.161547212741752e-06, + "loss": 5.1395, + "step": 8771 + }, + { + "epoch": 2.9938566552901023, + "grad_norm": 2.4694602489471436, + "learning_rate": 2.0477815699658705e-06, + "loss": 6.408, + "step": 8772 + }, + { + "epoch": 2.99419795221843, + "grad_norm": 2.609255313873291, + "learning_rate": 1.934015927189989e-06, + "loss": 5.2451, + "step": 8773 + }, + { + "epoch": 2.994539249146758, + "grad_norm": 2.562502145767212, + "learning_rate": 1.8202502844141068e-06, + "loss": 6.0563, + "step": 8774 + }, + { + "epoch": 2.9948805460750854, + "grad_norm": 2.526449203491211, + "learning_rate": 1.7064846416382253e-06, + "loss": 5.7829, + "step": 8775 + }, + { + "epoch": 2.9952218430034128, + "grad_norm": 2.5452816486358643, + "learning_rate": 1.5927189988623436e-06, + "loss": 6.2753, + "step": 8776 + }, + { + "epoch": 2.9955631399317406, + "grad_norm": 2.4952499866485596, + "learning_rate": 1.4789533560864619e-06, + "loss": 6.2114, + "step": 8777 + }, + { + "epoch": 2.9959044368600685, + "grad_norm": 2.254758358001709, + "learning_rate": 1.3651877133105802e-06, + "loss": 4.1049, + "step": 8778 + }, + { + "epoch": 2.996245733788396, + "grad_norm": 2.518864870071411, + "learning_rate": 1.2514220705346987e-06, + "loss": 6.5108, + "step": 8779 + }, + { + "epoch": 2.9965870307167233, + "grad_norm": 2.4607439041137695, + "learning_rate": 1.1376564277588167e-06, + "loss": 5.4417, + "step": 8780 + }, + { + "epoch": 2.996928327645051, + "grad_norm": 2.581373453140259, + "learning_rate": 1.0238907849829352e-06, + "loss": 5.4841, + "step": 8781 + }, + { + "epoch": 2.997269624573379, + "grad_norm": 2.3811047077178955, + "learning_rate": 9.101251422070534e-07, + "loss": 5.2381, + "step": 8782 + }, + { + "epoch": 2.9976109215017064, + "grad_norm": 2.4670047760009766, + "learning_rate": 7.963594994311718e-07, + "loss": 6.3373, + "step": 8783 + }, + { + "epoch": 2.9979522184300342, + "grad_norm": 2.462789297103882, + "learning_rate": 6.825938566552901e-07, + "loss": 5.4766, + "step": 8784 + }, + { + "epoch": 2.9982935153583616, + "grad_norm": 2.467465877532959, + "learning_rate": 5.688282138794084e-07, + "loss": 6.3852, + "step": 8785 + }, + { + "epoch": 2.9986348122866895, + "grad_norm": 2.4952261447906494, + "learning_rate": 4.550625711035267e-07, + "loss": 5.9034, + "step": 8786 + }, + { + "epoch": 2.9989761092150173, + "grad_norm": 2.497133731842041, + "learning_rate": 3.4129692832764504e-07, + "loss": 5.4161, + "step": 8787 + }, + { + "epoch": 2.9993174061433447, + "grad_norm": 2.377657890319824, + "learning_rate": 2.2753128555176335e-07, + "loss": 5.5288, + "step": 8788 + }, + { + "epoch": 2.999658703071672, + "grad_norm": 2.2658233642578125, + "learning_rate": 1.1376564277588168e-07, + "loss": 3.8865, + "step": 8789 + }, + { + "epoch": 3.0, + "grad_norm": 2.476238965988159, + "learning_rate": 0.0, + "loss": 5.7151, + "step": 8790 + } + ], + "logging_steps": 1, + "max_steps": 8790, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 120000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.53227897518424e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}