File size: 23,259 Bytes
55dd3fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 50,
  "global_step": 352,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.028409090909090908,
      "grad_norm": 58.560175797662396,
      "learning_rate": 1.3888888888888888e-07,
      "logits/chosen": -2.854257822036743,
      "logits/rejected": -2.6711959838867188,
      "logps/chosen": -372.8061828613281,
      "logps/rejected": -748.3225708007812,
      "loss": 0.6864,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": 0.0024342918768525124,
      "rewards/margins": 0.015255051665008068,
      "rewards/rejected": -0.01282076071947813,
      "step": 10
    },
    {
      "epoch": 0.056818181818181816,
      "grad_norm": 19.273001268117515,
      "learning_rate": 2.7777777777777776e-07,
      "logits/chosen": -2.8417019844055176,
      "logits/rejected": -2.6597249507904053,
      "logps/chosen": -380.89154052734375,
      "logps/rejected": -695.0466918945312,
      "loss": 0.4818,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.08547976613044739,
      "rewards/margins": 0.59254390001297,
      "rewards/rejected": -0.5070642232894897,
      "step": 20
    },
    {
      "epoch": 0.08522727272727272,
      "grad_norm": 4.004572162611932,
      "learning_rate": 4.1666666666666667e-07,
      "logits/chosen": -2.8608241081237793,
      "logits/rejected": -2.65569806098938,
      "logps/chosen": -324.8751220703125,
      "logps/rejected": -1104.629150390625,
      "loss": 0.1406,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.39872533082962036,
      "rewards/margins": 4.522292137145996,
      "rewards/rejected": -4.123566627502441,
      "step": 30
    },
    {
      "epoch": 0.11363636363636363,
      "grad_norm": 0.38876724989910894,
      "learning_rate": 4.998023493068254e-07,
      "logits/chosen": -2.869859218597412,
      "logits/rejected": -2.628769874572754,
      "logps/chosen": -311.97674560546875,
      "logps/rejected": -2135.577392578125,
      "loss": 0.0209,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.43964704871177673,
      "rewards/margins": 15.060437202453613,
      "rewards/rejected": -14.620790481567383,
      "step": 40
    },
    {
      "epoch": 0.14204545454545456,
      "grad_norm": 0.782568924522184,
      "learning_rate": 4.975823666181255e-07,
      "logits/chosen": -2.880276918411255,
      "logits/rejected": -2.6541552543640137,
      "logps/chosen": -422.45806884765625,
      "logps/rejected": -3464.00537109375,
      "loss": 0.0053,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -0.41893959045410156,
      "rewards/margins": 27.855571746826172,
      "rewards/rejected": -28.274511337280273,
      "step": 50
    },
    {
      "epoch": 0.14204545454545456,
      "eval_logits/chosen": -2.8265042304992676,
      "eval_logits/rejected": -2.55804181098938,
      "eval_logps/chosen": -454.01495361328125,
      "eval_logps/rejected": -4311.6689453125,
      "eval_loss": 0.0024393389467149973,
      "eval_rewards/accuracies": 0.9979838728904724,
      "eval_rewards/chosen": -0.8703324198722839,
      "eval_rewards/margins": 35.83847427368164,
      "eval_rewards/rejected": -36.70880889892578,
      "eval_runtime": 193.6452,
      "eval_samples_per_second": 20.166,
      "eval_steps_per_second": 0.32,
      "step": 50
    },
    {
      "epoch": 0.17045454545454544,
      "grad_norm": 3.6303694847529218,
      "learning_rate": 4.929173350101024e-07,
      "logits/chosen": -2.867490291595459,
      "logits/rejected": -2.5985803604125977,
      "logps/chosen": -477.73272705078125,
      "logps/rejected": -5529.82861328125,
      "loss": 0.0218,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.165496826171875,
      "rewards/margins": 46.3895378112793,
      "rewards/rejected": -47.55502700805664,
      "step": 60
    },
    {
      "epoch": 0.19886363636363635,
      "grad_norm": 0.02570393857059677,
      "learning_rate": 4.858533249305336e-07,
      "logits/chosen": -2.8956971168518066,
      "logits/rejected": -2.5464015007019043,
      "logps/chosen": -492.2872619628906,
      "logps/rejected": -5038.3505859375,
      "loss": 0.0042,
      "rewards/accuracies": 0.9937499761581421,
      "rewards/chosen": -1.363173007965088,
      "rewards/margins": 41.90122604370117,
      "rewards/rejected": -43.26439666748047,
      "step": 70
    },
    {
      "epoch": 0.22727272727272727,
      "grad_norm": 0.019530473634071552,
      "learning_rate": 4.764600984163808e-07,
      "logits/chosen": -2.9800267219543457,
      "logits/rejected": -2.6762020587921143,
      "logps/chosen": -501.85772705078125,
      "logps/rejected": -5076.8662109375,
      "loss": 0.0028,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.2898057699203491,
      "rewards/margins": 42.464393615722656,
      "rewards/rejected": -43.75419235229492,
      "step": 80
    },
    {
      "epoch": 0.2556818181818182,
      "grad_norm": 0.021637173397401434,
      "learning_rate": 4.6483042014491527e-07,
      "logits/chosen": -3.0119636058807373,
      "logits/rejected": -2.6861469745635986,
      "logps/chosen": -490.847900390625,
      "logps/rejected": -5491.890625,
      "loss": 0.0015,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.2475790977478027,
      "rewards/margins": 47.27143478393555,
      "rewards/rejected": -48.519012451171875,
      "step": 90
    },
    {
      "epoch": 0.2840909090909091,
      "grad_norm": 0.24085676005599221,
      "learning_rate": 4.510791413176912e-07,
      "logits/chosen": -2.996633529663086,
      "logits/rejected": -2.7358081340789795,
      "logps/chosen": -525.2289428710938,
      "logps/rejected": -5033.7158203125,
      "loss": 0.0061,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.5802116394042969,
      "rewards/margins": 41.77141571044922,
      "rewards/rejected": -43.35162353515625,
      "step": 100
    },
    {
      "epoch": 0.2840909090909091,
      "eval_logits/chosen": -2.9615049362182617,
      "eval_logits/rejected": -2.6516737937927246,
      "eval_logps/chosen": -529.8892822265625,
      "eval_logps/rejected": -4822.52294921875,
      "eval_loss": 0.0012528152437880635,
      "eval_rewards/accuracies": 0.9979838728904724,
      "eval_rewards/chosen": -1.629075527191162,
      "eval_rewards/margins": 40.188270568847656,
      "eval_rewards/rejected": -41.81734848022461,
      "eval_runtime": 192.9327,
      "eval_samples_per_second": 20.24,
      "eval_steps_per_second": 0.321,
      "step": 100
    },
    {
      "epoch": 0.3125,
      "grad_norm": 0.5179684876316148,
      "learning_rate": 4.353420654246546e-07,
      "logits/chosen": -2.9635558128356934,
      "logits/rejected": -2.640312433242798,
      "logps/chosen": -524.4633178710938,
      "logps/rejected": -5045.9384765625,
      "loss": 0.0018,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.5064172744750977,
      "rewards/margins": 42.47917175292969,
      "rewards/rejected": -43.98558807373047,
      "step": 110
    },
    {
      "epoch": 0.3409090909090909,
      "grad_norm": 1.1132308145442282,
      "learning_rate": 4.177746070897592e-07,
      "logits/chosen": -2.987412214279175,
      "logits/rejected": -2.5702311992645264,
      "logps/chosen": -529.0863647460938,
      "logps/rejected": -4814.87939453125,
      "loss": 0.001,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.6565755605697632,
      "rewards/margins": 40.433387756347656,
      "rewards/rejected": -42.089962005615234,
      "step": 120
    },
    {
      "epoch": 0.3693181818181818,
      "grad_norm": 0.1524427654589431,
      "learning_rate": 3.9855025724292763e-07,
      "logits/chosen": -2.988529682159424,
      "logits/rejected": -2.547990083694458,
      "logps/chosen": -530.0753784179688,
      "logps/rejected": -4828.005859375,
      "loss": 0.0006,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.5551540851593018,
      "rewards/margins": 40.872337341308594,
      "rewards/rejected": -42.427490234375,
      "step": 130
    },
    {
      "epoch": 0.3977272727272727,
      "grad_norm": 0.058047718713258305,
      "learning_rate": 3.7785886977585555e-07,
      "logits/chosen": -2.9764513969421387,
      "logits/rejected": -2.531805992126465,
      "logps/chosen": -517.7172241210938,
      "logps/rejected": -5057.0849609375,
      "loss": 0.001,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.4110362529754639,
      "rewards/margins": 42.79331588745117,
      "rewards/rejected": -44.2043571472168,
      "step": 140
    },
    {
      "epoch": 0.42613636363636365,
      "grad_norm": 0.03612899599023959,
      "learning_rate": 3.5590478660213206e-07,
      "logits/chosen": -3.0368993282318115,
      "logits/rejected": -2.6156864166259766,
      "logps/chosen": -539.3234252929688,
      "logps/rejected": -5024.9345703125,
      "loss": 0.0019,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.587400197982788,
      "rewards/margins": 41.970149993896484,
      "rewards/rejected": -43.55754852294922,
      "step": 150
    },
    {
      "epoch": 0.42613636363636365,
      "eval_logits/chosen": -2.9830355644226074,
      "eval_logits/rejected": -2.518221378326416,
      "eval_logps/chosen": -536.5906372070312,
      "eval_logps/rejected": -5789.0673828125,
      "eval_loss": 0.00048427816363982856,
      "eval_rewards/accuracies": 1.0,
      "eval_rewards/chosen": -1.696089267730713,
      "eval_rewards/margins": 49.78670120239258,
      "eval_rewards/rejected": -51.4827880859375,
      "eval_runtime": 191.9777,
      "eval_samples_per_second": 20.341,
      "eval_steps_per_second": 0.323,
      "step": 150
    },
    {
      "epoch": 0.45454545454545453,
      "grad_norm": 0.017819958857612898,
      "learning_rate": 3.3290481963801696e-07,
      "logits/chosen": -2.969292402267456,
      "logits/rejected": -2.488386631011963,
      "logps/chosen": -585.0005493164062,
      "logps/rejected": -6239.01025390625,
      "loss": 0.0014,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.9822533130645752,
      "rewards/margins": 53.68230438232422,
      "rewards/rejected": -55.6645622253418,
      "step": 160
    },
    {
      "epoch": 0.48295454545454547,
      "grad_norm": 0.04768437011269856,
      "learning_rate": 3.0908610963322626e-07,
      "logits/chosen": -2.916259765625,
      "logits/rejected": -2.3954265117645264,
      "logps/chosen": -582.5315551757812,
      "logps/rejected": -6029.98486328125,
      "loss": 0.0008,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -2.1212856769561768,
      "rewards/margins": 51.80366897583008,
      "rewards/rejected": -53.924957275390625,
      "step": 170
    },
    {
      "epoch": 0.5113636363636364,
      "grad_norm": 0.00786856607244259,
      "learning_rate": 2.846838829972671e-07,
      "logits/chosen": -2.9504363536834717,
      "logits/rejected": -2.4123263359069824,
      "logps/chosen": -604.1171875,
      "logps/rejected": -6656.84521484375,
      "loss": 0.0012,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -2.2390635013580322,
      "rewards/margins": 57.837486267089844,
      "rewards/rejected": -60.07655715942383,
      "step": 180
    },
    {
      "epoch": 0.5397727272727273,
      "grad_norm": 0.02736966148588083,
      "learning_rate": 2.5993912877423147e-07,
      "logits/chosen": -2.9569239616394043,
      "logits/rejected": -2.3740897178649902,
      "logps/chosen": -581.5017700195312,
      "logps/rejected": -6748.75927734375,
      "loss": 0.003,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -2.1127352714538574,
      "rewards/margins": 58.927764892578125,
      "rewards/rejected": -61.040489196777344,
      "step": 190
    },
    {
      "epoch": 0.5681818181818182,
      "grad_norm": 0.06339173150454364,
      "learning_rate": 2.3509621870754504e-07,
      "logits/chosen": -3.002689838409424,
      "logits/rejected": -2.4718198776245117,
      "logps/chosen": -596.9158935546875,
      "logps/rejected": -6154.20263671875,
      "loss": 0.0022,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -2.0328099727630615,
      "rewards/margins": 52.925392150878906,
      "rewards/rejected": -54.95819854736328,
      "step": 200
    },
    {
      "epoch": 0.5681818181818182,
      "eval_logits/chosen": -2.9937610626220703,
      "eval_logits/rejected": -2.5169804096221924,
      "eval_logps/chosen": -570.3004150390625,
      "eval_logps/rejected": -5663.16015625,
      "eval_loss": 0.00048508381587453187,
      "eval_rewards/accuracies": 1.0,
      "eval_rewards/chosen": -2.033186435699463,
      "eval_rewards/margins": 48.190528869628906,
      "eval_rewards/rejected": -50.223716735839844,
      "eval_runtime": 193.4558,
      "eval_samples_per_second": 20.185,
      "eval_steps_per_second": 0.32,
      "step": 200
    },
    {
      "epoch": 0.5965909090909091,
      "grad_norm": 0.016437717754495415,
      "learning_rate": 2.1040049389819624e-07,
      "logits/chosen": -3.0172648429870605,
      "logits/rejected": -2.565308094024658,
      "logps/chosen": -547.1297607421875,
      "logps/rejected": -5656.95166015625,
      "loss": 0.0007,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -2.0488336086273193,
      "rewards/margins": 47.970909118652344,
      "rewards/rejected": -50.019744873046875,
      "step": 210
    },
    {
      "epoch": 0.625,
      "grad_norm": 1.693727446683944,
      "learning_rate": 1.8609584188988133e-07,
      "logits/chosen": -3.0191075801849365,
      "logits/rejected": -2.507822036743164,
      "logps/chosen": -582.245849609375,
      "logps/rejected": -5883.4912109375,
      "loss": 0.0014,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.9353878498077393,
      "rewards/margins": 50.56637191772461,
      "rewards/rejected": -52.50175857543945,
      "step": 220
    },
    {
      "epoch": 0.6534090909090909,
      "grad_norm": 0.0008297976344692307,
      "learning_rate": 1.624222881090439e-07,
      "logits/chosen": -3.0655417442321777,
      "logits/rejected": -2.5936379432678223,
      "logps/chosen": -571.0089111328125,
      "logps/rejected": -5979.38037109375,
      "loss": 0.0006,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.8964475393295288,
      "rewards/margins": 51.060691833496094,
      "rewards/rejected": -52.9571418762207,
      "step": 230
    },
    {
      "epoch": 0.6818181818181818,
      "grad_norm": 12.048855827746156,
      "learning_rate": 1.3961362544602212e-07,
      "logits/chosen": -3.0751774311065674,
      "logits/rejected": -2.6132192611694336,
      "logps/chosen": -555.5328979492188,
      "logps/rejected": -5732.3876953125,
      "loss": 0.0044,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -2.028006076812744,
      "rewards/margins": 48.74276351928711,
      "rewards/rejected": -50.770774841308594,
      "step": 240
    },
    {
      "epoch": 0.7102272727272727,
      "grad_norm": 0.030090428752627654,
      "learning_rate": 1.1789510538684522e-07,
      "logits/chosen": -3.1127967834472656,
      "logits/rejected": -2.603487730026245,
      "logps/chosen": -577.8153076171875,
      "logps/rejected": -5749.29541015625,
      "loss": 0.0066,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -2.0291168689727783,
      "rewards/margins": 49.05834197998047,
      "rewards/rejected": -51.087459564208984,
      "step": 250
    },
    {
      "epoch": 0.7102272727272727,
      "eval_logits/chosen": -3.0874154567718506,
      "eval_logits/rejected": -2.5713446140289307,
      "eval_logps/chosen": -571.5287475585938,
      "eval_logps/rejected": -5825.4208984375,
      "eval_loss": 0.0006546394433826208,
      "eval_rewards/accuracies": 1.0,
      "eval_rewards/chosen": -2.0454704761505127,
      "eval_rewards/margins": 49.80085754394531,
      "eval_rewards/rejected": -51.8463249206543,
      "eval_runtime": 191.814,
      "eval_samples_per_second": 20.358,
      "eval_steps_per_second": 0.323,
      "step": 250
    },
    {
      "epoch": 0.7386363636363636,
      "grad_norm": 0.008133900264785228,
      "learning_rate": 9.748121349736891e-08,
      "logits/chosen": -3.1245901584625244,
      "logits/rejected": -2.5510289669036865,
      "logps/chosen": -562.6072387695312,
      "logps/rejected": -6041.6435546875,
      "loss": 0.001,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.9334728717803955,
      "rewards/margins": 51.701171875,
      "rewards/rejected": -53.6346435546875,
      "step": 260
    },
    {
      "epoch": 0.7670454545454546,
      "grad_norm": 0.03464071283654092,
      "learning_rate": 7.857355122839673e-08,
      "logits/chosen": -3.0760347843170166,
      "logits/rejected": -2.473140239715576,
      "logps/chosen": -577.8392333984375,
      "logps/rejected": -6429.56005859375,
      "loss": 0.0013,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.981665015220642,
      "rewards/margins": 55.79956817626953,
      "rewards/rejected": -57.78123092651367,
      "step": 270
    },
    {
      "epoch": 0.7954545454545454,
      "grad_norm": 0.09200228880640939,
      "learning_rate": 6.135884496044244e-08,
      "logits/chosen": -3.094788074493408,
      "logits/rejected": -2.5839569568634033,
      "logps/chosen": -593.9009399414062,
      "logps/rejected": -6378.4140625,
      "loss": 0.0005,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -2.036696672439575,
      "rewards/margins": 55.2489128112793,
      "rewards/rejected": -57.285614013671875,
      "step": 280
    },
    {
      "epoch": 0.8238636363636364,
      "grad_norm": 0.04828363097903115,
      "learning_rate": 4.600710195020982e-08,
      "logits/chosen": -3.084293842315674,
      "logits/rejected": -2.5237388610839844,
      "logps/chosen": -586.5794067382812,
      "logps/rejected": -5471.75146484375,
      "loss": 0.0027,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -2.0138309001922607,
      "rewards/margins": 46.394798278808594,
      "rewards/rejected": -48.408634185791016,
      "step": 290
    },
    {
      "epoch": 0.8522727272727273,
      "grad_norm": 1.5124351519089532,
      "learning_rate": 3.2669931390104374e-08,
      "logits/chosen": -3.095336437225342,
      "logits/rejected": -2.56819748878479,
      "logps/chosen": -608.9658813476562,
      "logps/rejected": -6138.87451171875,
      "loss": 0.0055,
      "rewards/accuracies": 0.987500011920929,
      "rewards/chosen": -2.1408629417419434,
      "rewards/margins": 52.5675163269043,
      "rewards/rejected": -54.70838165283203,
      "step": 300
    },
    {
      "epoch": 0.8522727272727273,
      "eval_logits/chosen": -3.0607314109802246,
      "eval_logits/rejected": -2.524188756942749,
      "eval_logps/chosen": -572.0144653320312,
      "eval_logps/rejected": -5950.38330078125,
      "eval_loss": 0.0002967408508993685,
      "eval_rewards/accuracies": 1.0,
      "eval_rewards/chosen": -2.0503275394439697,
      "eval_rewards/margins": 51.04560852050781,
      "eval_rewards/rejected": -53.095943450927734,
      "eval_runtime": 191.8524,
      "eval_samples_per_second": 20.354,
      "eval_steps_per_second": 0.323,
      "step": 300
    },
    {
      "epoch": 0.8806818181818182,
      "grad_norm": 0.08621703547632295,
      "learning_rate": 2.147904716149135e-08,
      "logits/chosen": -3.0894367694854736,
      "logits/rejected": -2.5899417400360107,
      "logps/chosen": -578.2093505859375,
      "logps/rejected": -5487.2734375,
      "loss": 0.0003,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -2.0271658897399902,
      "rewards/margins": 46.07707214355469,
      "rewards/rejected": -48.10424041748047,
      "step": 310
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 0.04429289003054511,
      "learning_rate": 1.254496706805433e-08,
      "logits/chosen": -3.071737051010132,
      "logits/rejected": -2.5150017738342285,
      "logps/chosen": -575.0449829101562,
      "logps/rejected": -5943.66552734375,
      "loss": 0.0011,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.9631164073944092,
      "rewards/margins": 51.358001708984375,
      "rewards/rejected": -53.32111358642578,
      "step": 320
    },
    {
      "epoch": 0.9375,
      "grad_norm": 0.05330126976035339,
      "learning_rate": 5.955921395237318e-09,
      "logits/chosen": -3.074503183364868,
      "logits/rejected": -2.5923588275909424,
      "logps/chosen": -587.8565673828125,
      "logps/rejected": -5959.51318359375,
      "loss": 0.0012,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -2.0806593894958496,
      "rewards/margins": 50.763099670410156,
      "rewards/rejected": -52.84375762939453,
      "step": 330
    },
    {
      "epoch": 0.9659090909090909,
      "grad_norm": 0.0015915884352458761,
      "learning_rate": 1.7769815745066474e-09,
      "logits/chosen": -3.0953116416931152,
      "logits/rejected": -2.6012253761291504,
      "logps/chosen": -584.4715576171875,
      "logps/rejected": -5886.49951171875,
      "loss": 0.0006,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -2.0364444255828857,
      "rewards/margins": 50.340614318847656,
      "rewards/rejected": -52.37705612182617,
      "step": 340
    },
    {
      "epoch": 0.9943181818181818,
      "grad_norm": 0.36200936509368786,
      "learning_rate": 4.9417557483610875e-11,
      "logits/chosen": -3.0942559242248535,
      "logits/rejected": -2.5950565338134766,
      "logps/chosen": -553.3095703125,
      "logps/rejected": -5615.38134765625,
      "loss": 0.0011,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.989404320716858,
      "rewards/margins": 47.7803955078125,
      "rewards/rejected": -49.769798278808594,
      "step": 350
    },
    {
      "epoch": 0.9943181818181818,
      "eval_logits/chosen": -3.071096897125244,
      "eval_logits/rejected": -2.540203809738159,
      "eval_logps/chosen": -571.614990234375,
      "eval_logps/rejected": -5922.82763671875,
      "eval_loss": 0.00029848425765521824,
      "eval_rewards/accuracies": 1.0,
      "eval_rewards/chosen": -2.046332597732544,
      "eval_rewards/margins": 50.7740592956543,
      "eval_rewards/rejected": -52.82038879394531,
      "eval_runtime": 192.8009,
      "eval_samples_per_second": 20.254,
      "eval_steps_per_second": 0.322,
      "step": 350
    },
    {
      "epoch": 1.0,
      "step": 352,
      "total_flos": 0.0,
      "train_loss": 0.04021276280278614,
      "train_runtime": 10097.1377,
      "train_samples_per_second": 4.457,
      "train_steps_per_second": 0.035
    }
  ],
  "logging_steps": 10,
  "max_steps": 352,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}