tomvoelker commited on
Commit
82c66f0
·
verified ·
1 Parent(s): 03b7098

End of training

Browse files
Files changed (4) hide show
  1. all_results.json +12 -12
  2. test_results.json +8 -8
  3. train_results.json +4 -4
  4. trainer_state.json +211 -211
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
  "epoch": 2.999832822513235,
3
- "test_loss": 2.405472993850708,
4
- "test_rouge1": 0.30014736256715036,
5
- "test_rouge2": 0.08990177140739183,
6
- "test_rougeL": 0.17839451537149986,
7
- "test_rougeLsum": 0.2793680267506967,
8
- "test_runtime": 3332.6261,
9
- "test_samples_per_second": 3.448,
10
- "test_steps_per_second": 0.216,
11
  "total_flos": 5.251566637814907e+17,
12
- "train_loss": 2.7471534659841614,
13
- "train_runtime": 62610.6103,
14
- "train_samples_per_second": 13.757,
15
- "train_steps_per_second": 0.43
16
  }
 
1
  {
2
  "epoch": 2.999832822513235,
3
+ "test_loss": 2.3661417961120605,
4
+ "test_rouge1": 0.24069119580905096,
5
+ "test_rouge2": 0.06279204840567007,
6
+ "test_rougeL": 0.1497130207459511,
7
+ "test_rougeLsum": 0.22519873523651693,
8
+ "test_runtime": 3342.168,
9
+ "test_samples_per_second": 3.438,
10
+ "test_steps_per_second": 0.215,
11
  "total_flos": 5.251566637814907e+17,
12
+ "train_loss": 2.7409434880486105,
13
+ "train_runtime": 62948.108,
14
+ "train_samples_per_second": 13.683,
15
+ "train_steps_per_second": 0.428
16
  }
test_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "test_loss": 2.405472993850708,
3
- "test_rouge1": 0.30014736256715036,
4
- "test_rouge2": 0.08990177140739183,
5
- "test_rougeL": 0.17839451537149986,
6
- "test_rougeLsum": 0.2793680267506967,
7
- "test_runtime": 3332.6261,
8
- "test_samples_per_second": 3.448,
9
- "test_steps_per_second": 0.216
10
  }
 
1
  {
2
+ "test_loss": 2.3661417961120605,
3
+ "test_rouge1": 0.24069119580905096,
4
+ "test_rouge2": 0.06279204840567007,
5
+ "test_rougeL": 0.1497130207459511,
6
+ "test_rougeLsum": 0.22519873523651693,
7
+ "test_runtime": 3342.168,
8
+ "test_samples_per_second": 3.438,
9
+ "test_steps_per_second": 0.215
10
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.999832822513235,
3
  "total_flos": 5.251566637814907e+17,
4
- "train_loss": 2.7471534659841614,
5
- "train_runtime": 62610.6103,
6
- "train_samples_per_second": 13.757,
7
- "train_steps_per_second": 0.43
8
  }
 
1
  {
2
  "epoch": 2.999832822513235,
3
  "total_flos": 5.251566637814907e+17,
4
+ "train_loss": 2.7409434880486105,
5
+ "train_runtime": 62948.108,
6
+ "train_samples_per_second": 13.683,
7
+ "train_steps_per_second": 0.428
8
  }
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 0.09498188720823034,
3
- "best_model_checkpoint": "/bartabsa-reproduce/outputs/gpt22gpt2_42/checkpoint-26000",
4
  "epoch": 2.999832822513235,
5
  "eval_steps": 2000,
6
  "global_step": 26916,
@@ -10,539 +10,539 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.05572582892170521,
13
- "grad_norm": 4.041100025177002,
14
  "learning_rate": 2.5e-05,
15
- "loss": 4.6671,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 0.11145165784341042,
20
- "grad_norm": 2.432424545288086,
21
  "learning_rate": 5e-05,
22
- "loss": 3.4845,
23
  "step": 1000
24
  },
25
  {
26
  "epoch": 0.16717748676511562,
27
- "grad_norm": 1.953045129776001,
28
  "learning_rate": 4.9035344960642076e-05,
29
- "loss": 3.3593,
30
  "step": 1500
31
  },
32
  {
33
  "epoch": 0.22290331568682084,
34
- "grad_norm": 1.6982029676437378,
35
  "learning_rate": 4.807068992128415e-05,
36
- "loss": 3.2585,
37
  "step": 2000
38
  },
39
  {
40
  "epoch": 0.22290331568682084,
41
- "eval_loss": 3.0468475818634033,
42
- "eval_rouge1": 0.20414052990162646,
43
- "eval_rouge2": 0.0368331377057871,
44
- "eval_rougeL": 0.12707636549492063,
45
- "eval_rougeLsum": 0.19158184931332545,
46
- "eval_runtime": 3954.9669,
47
- "eval_samples_per_second": 3.38,
48
- "eval_steps_per_second": 0.211,
49
  "step": 2000
50
  },
51
  {
52
  "epoch": 0.27862914460852606,
53
- "grad_norm": 1.7004557847976685,
54
  "learning_rate": 4.7106034881926225e-05,
55
- "loss": 3.1954,
56
  "step": 2500
57
  },
58
  {
59
  "epoch": 0.33435497353023125,
60
- "grad_norm": 1.7124439477920532,
61
  "learning_rate": 4.61413798425683e-05,
62
- "loss": 3.1283,
63
  "step": 3000
64
  },
65
  {
66
  "epoch": 0.3900808024519365,
67
- "grad_norm": 1.6656184196472168,
68
  "learning_rate": 4.517672480321037e-05,
69
- "loss": 3.074,
70
  "step": 3500
71
  },
72
  {
73
  "epoch": 0.4458066313736417,
74
- "grad_norm": 1.6584104299545288,
75
  "learning_rate": 4.421206976385245e-05,
76
- "loss": 3.021,
77
  "step": 4000
78
  },
79
  {
80
  "epoch": 0.4458066313736417,
81
- "eval_loss": 2.824082136154175,
82
- "eval_rouge1": 0.24506751051320486,
83
- "eval_rouge2": 0.054679372312090684,
84
- "eval_rougeL": 0.14905741774213627,
85
- "eval_rougeLsum": 0.22935701387043644,
86
- "eval_runtime": 3934.6858,
87
- "eval_samples_per_second": 3.397,
88
  "eval_steps_per_second": 0.212,
89
  "step": 4000
90
  },
91
  {
92
  "epoch": 0.5015324602953469,
93
- "grad_norm": 1.7051591873168945,
94
  "learning_rate": 4.324741472449452e-05,
95
- "loss": 2.9832,
96
  "step": 4500
97
  },
98
  {
99
  "epoch": 0.5572582892170521,
100
- "grad_norm": 1.6298202276229858,
101
  "learning_rate": 4.2282759685136595e-05,
102
- "loss": 2.9583,
103
  "step": 5000
104
  },
105
  {
106
  "epoch": 0.6129841181387573,
107
- "grad_norm": 1.6464300155639648,
108
  "learning_rate": 4.131810464577867e-05,
109
- "loss": 2.9228,
110
  "step": 5500
111
  },
112
  {
113
  "epoch": 0.6687099470604625,
114
- "grad_norm": 1.739127278327942,
115
  "learning_rate": 4.035344960642074e-05,
116
- "loss": 2.9032,
117
  "step": 6000
118
  },
119
  {
120
  "epoch": 0.6687099470604625,
121
- "eval_loss": 2.7089996337890625,
122
- "eval_rouge1": 0.2663717664757944,
123
- "eval_rouge2": 0.06537748809482974,
124
- "eval_rougeL": 0.1592442331800362,
125
- "eval_rougeLsum": 0.24867499259615916,
126
- "eval_runtime": 3934.9758,
127
- "eval_samples_per_second": 3.397,
128
- "eval_steps_per_second": 0.212,
129
  "step": 6000
130
  },
131
  {
132
  "epoch": 0.7244357759821677,
133
- "grad_norm": 1.7684314250946045,
134
  "learning_rate": 3.938879456706282e-05,
135
- "loss": 2.882,
136
  "step": 6500
137
  },
138
  {
139
  "epoch": 0.780161604903873,
140
- "grad_norm": 1.6454176902770996,
141
  "learning_rate": 3.84241395277049e-05,
142
- "loss": 2.865,
143
  "step": 7000
144
  },
145
  {
146
  "epoch": 0.8358874338255782,
147
- "grad_norm": 1.666963815689087,
148
  "learning_rate": 3.745948448834697e-05,
149
- "loss": 2.8506,
150
  "step": 7500
151
  },
152
  {
153
  "epoch": 0.8916132627472834,
154
- "grad_norm": 1.8170151710510254,
155
  "learning_rate": 3.6494829448989046e-05,
156
- "loss": 2.8327,
157
  "step": 8000
158
  },
159
  {
160
  "epoch": 0.8916132627472834,
161
- "eval_loss": 2.642709732055664,
162
- "eval_rouge1": 0.2751055377920662,
163
- "eval_rouge2": 0.07034575867678786,
164
- "eval_rougeL": 0.16339764548616517,
165
- "eval_rougeLsum": 0.25646025355771046,
166
- "eval_runtime": 3890.3417,
167
- "eval_samples_per_second": 3.436,
168
- "eval_steps_per_second": 0.215,
169
  "step": 8000
170
  },
171
  {
172
  "epoch": 0.9473390916689886,
173
- "grad_norm": 1.5809513330459595,
174
  "learning_rate": 3.553017440963112e-05,
175
- "loss": 2.8165,
176
  "step": 8500
177
  },
178
  {
179
  "epoch": 1.0030649205906939,
180
- "grad_norm": 1.7124121189117432,
181
  "learning_rate": 3.4565519370273194e-05,
182
- "loss": 2.7999,
183
  "step": 9000
184
  },
185
  {
186
  "epoch": 1.058790749512399,
187
- "grad_norm": 1.7687199115753174,
188
  "learning_rate": 3.360086433091527e-05,
189
- "loss": 2.7008,
190
  "step": 9500
191
  },
192
  {
193
  "epoch": 1.1145165784341042,
194
- "grad_norm": 1.5963129997253418,
195
  "learning_rate": 3.263620929155734e-05,
196
- "loss": 2.6888,
197
  "step": 10000
198
  },
199
  {
200
  "epoch": 1.1145165784341042,
201
- "eval_loss": 2.59445858001709,
202
- "eval_rouge1": 0.2837999756517398,
203
- "eval_rouge2": 0.07560808665349861,
204
- "eval_rougeL": 0.1683748136203092,
205
- "eval_rougeLsum": 0.2645926430882342,
206
- "eval_runtime": 3881.3598,
207
- "eval_samples_per_second": 3.444,
208
- "eval_steps_per_second": 0.215,
209
  "step": 10000
210
  },
211
  {
212
  "epoch": 1.1702424073558095,
213
- "grad_norm": 1.5700680017471313,
214
  "learning_rate": 3.1671554252199416e-05,
215
- "loss": 2.6877,
216
  "step": 10500
217
  },
218
  {
219
  "epoch": 1.2259682362775146,
220
- "grad_norm": 1.6018140316009521,
221
  "learning_rate": 3.070689921284149e-05,
222
- "loss": 2.6832,
223
  "step": 11000
224
  },
225
  {
226
  "epoch": 1.28169406519922,
227
- "grad_norm": 1.6392451524734497,
228
  "learning_rate": 2.9742244173483564e-05,
229
- "loss": 2.6644,
230
  "step": 11500
231
  },
232
  {
233
  "epoch": 1.337419894120925,
234
- "grad_norm": 1.6229345798492432,
235
  "learning_rate": 2.8777589134125638e-05,
236
- "loss": 2.6639,
237
  "step": 12000
238
  },
239
  {
240
  "epoch": 1.337419894120925,
241
- "eval_loss": 2.552922487258911,
242
- "eval_rouge1": 0.29085662314708605,
243
- "eval_rouge2": 0.0793640709354625,
244
- "eval_rougeL": 0.17167813435302517,
245
- "eval_rougeLsum": 0.2712184275883871,
246
- "eval_runtime": 3882.9542,
247
- "eval_samples_per_second": 3.443,
248
- "eval_steps_per_second": 0.215,
249
  "step": 12000
250
  },
251
  {
252
  "epoch": 1.3931457230426303,
253
- "grad_norm": 1.6006370782852173,
254
  "learning_rate": 2.7812934094767712e-05,
255
- "loss": 2.6644,
256
  "step": 12500
257
  },
258
  {
259
  "epoch": 1.4488715519643356,
260
- "grad_norm": 1.647545337677002,
261
  "learning_rate": 2.6848279055409786e-05,
262
- "loss": 2.6594,
263
  "step": 13000
264
  },
265
  {
266
  "epoch": 1.5045973808860407,
267
- "grad_norm": 1.629269003868103,
268
  "learning_rate": 2.588362401605186e-05,
269
- "loss": 2.6461,
270
  "step": 13500
271
  },
272
  {
273
  "epoch": 1.5603232098077457,
274
- "grad_norm": 1.6553572416305542,
275
  "learning_rate": 2.4918968976693934e-05,
276
- "loss": 2.6351,
277
  "step": 14000
278
  },
279
  {
280
  "epoch": 1.5603232098077457,
281
- "eval_loss": 2.515895366668701,
282
- "eval_rouge1": 0.2917033862169448,
283
- "eval_rouge2": 0.08093829177522276,
284
- "eval_rougeL": 0.17244898354413596,
285
- "eval_rougeLsum": 0.2717652486926211,
286
- "eval_runtime": 3880.2314,
287
- "eval_samples_per_second": 3.445,
288
- "eval_steps_per_second": 0.215,
289
  "step": 14000
290
  },
291
  {
292
  "epoch": 1.616049038729451,
293
- "grad_norm": 1.8069674968719482,
294
  "learning_rate": 2.3954313937336008e-05,
295
- "loss": 2.635,
296
  "step": 14500
297
  },
298
  {
299
  "epoch": 1.6717748676511563,
300
- "grad_norm": 1.620038628578186,
301
  "learning_rate": 2.2989658897978082e-05,
302
- "loss": 2.6173,
303
  "step": 15000
304
  },
305
  {
306
  "epoch": 1.7275006965728616,
307
- "grad_norm": 1.6973850727081299,
308
  "learning_rate": 2.2025003858620156e-05,
309
- "loss": 2.6134,
310
  "step": 15500
311
  },
312
  {
313
  "epoch": 1.7832265254945667,
314
- "grad_norm": 1.762335181236267,
315
  "learning_rate": 2.1060348819262234e-05,
316
- "loss": 2.6154,
317
  "step": 16000
318
  },
319
  {
320
  "epoch": 1.7832265254945667,
321
- "eval_loss": 2.4802534580230713,
322
- "eval_rouge1": 0.2993308883027235,
323
- "eval_rouge2": 0.08591108128077216,
324
- "eval_rougeL": 0.17652195635463208,
325
- "eval_rougeLsum": 0.2793858239831221,
326
- "eval_runtime": 3852.8094,
327
- "eval_samples_per_second": 3.47,
328
- "eval_steps_per_second": 0.217,
329
  "step": 16000
330
  },
331
  {
332
  "epoch": 1.8389523544162718,
333
- "grad_norm": 1.8116912841796875,
334
  "learning_rate": 2.0095693779904308e-05,
335
- "loss": 2.6106,
336
  "step": 16500
337
  },
338
  {
339
  "epoch": 1.894678183337977,
340
- "grad_norm": 1.81112802028656,
341
  "learning_rate": 1.9131038740546382e-05,
342
- "loss": 2.5978,
343
  "step": 17000
344
  },
345
  {
346
  "epoch": 1.9504040122596824,
347
- "grad_norm": 1.9138634204864502,
348
  "learning_rate": 1.8166383701188456e-05,
349
- "loss": 2.5951,
350
  "step": 17500
351
  },
352
  {
353
  "epoch": 2.0061298411813877,
354
- "grad_norm": 1.8298559188842773,
355
  "learning_rate": 1.720172866183053e-05,
356
- "loss": 2.5888,
357
  "step": 18000
358
  },
359
  {
360
  "epoch": 2.0061298411813877,
361
- "eval_loss": 2.4529457092285156,
362
- "eval_rouge1": 0.30422361394671005,
363
- "eval_rouge2": 0.08905283916557813,
364
- "eval_rougeL": 0.1793231859010479,
365
- "eval_rougeLsum": 0.2836599615028822,
366
- "eval_runtime": 3878.3678,
367
- "eval_samples_per_second": 3.447,
368
  "eval_steps_per_second": 0.216,
369
  "step": 18000
370
  },
371
  {
372
  "epoch": 2.0618556701030926,
373
- "grad_norm": 1.7736088037490845,
374
  "learning_rate": 1.6237073622472604e-05,
375
- "loss": 2.5044,
376
  "step": 18500
377
  },
378
  {
379
  "epoch": 2.117581499024798,
380
- "grad_norm": 2.166782855987549,
381
  "learning_rate": 1.5272418583114678e-05,
382
- "loss": 2.5033,
383
  "step": 19000
384
  },
385
  {
386
  "epoch": 2.173307327946503,
387
- "grad_norm": 2.1640796661376953,
388
  "learning_rate": 1.4307763543756752e-05,
389
- "loss": 2.5094,
390
  "step": 19500
391
  },
392
  {
393
  "epoch": 2.2290331568682085,
394
- "grad_norm": 2.0196428298950195,
395
  "learning_rate": 1.3343108504398828e-05,
396
- "loss": 2.508,
397
  "step": 20000
398
  },
399
  {
400
  "epoch": 2.2290331568682085,
401
- "eval_loss": 2.433751106262207,
402
- "eval_rouge1": 0.30609130777182225,
403
- "eval_rouge2": 0.09102846739541004,
404
- "eval_rougeL": 0.18076964405517054,
405
- "eval_rougeLsum": 0.2854008545253584,
406
- "eval_runtime": 3855.7671,
407
- "eval_samples_per_second": 3.467,
408
- "eval_steps_per_second": 0.217,
409
  "step": 20000
410
  },
411
  {
412
  "epoch": 2.2847589857899138,
413
- "grad_norm": 2.035296678543091,
414
  "learning_rate": 1.2378453465040902e-05,
415
- "loss": 2.493,
416
  "step": 20500
417
  },
418
  {
419
  "epoch": 2.340484814711619,
420
- "grad_norm": 2.1855478286743164,
421
  "learning_rate": 1.1413798425682977e-05,
422
- "loss": 2.4977,
423
  "step": 21000
424
  },
425
  {
426
  "epoch": 2.396210643633324,
427
- "grad_norm": 2.0839128494262695,
428
  "learning_rate": 1.0449143386325052e-05,
429
- "loss": 2.4902,
430
  "step": 21500
431
  },
432
  {
433
  "epoch": 2.4519364725550292,
434
- "grad_norm": 2.094590425491333,
435
  "learning_rate": 9.484488346967126e-06,
436
- "loss": 2.4864,
437
  "step": 22000
438
  },
439
  {
440
  "epoch": 2.4519364725550292,
441
- "eval_loss": 2.4146804809570312,
442
- "eval_rouge1": 0.3078587783094971,
443
- "eval_rouge2": 0.0924346301955819,
444
- "eval_rougeL": 0.18175483469212192,
445
- "eval_rougeLsum": 0.2867290629199985,
446
- "eval_runtime": 3892.4925,
447
- "eval_samples_per_second": 3.434,
448
  "eval_steps_per_second": 0.215,
449
  "step": 22000
450
  },
451
  {
452
  "epoch": 2.5076623014767345,
453
- "grad_norm": 1.884666919708252,
454
  "learning_rate": 8.5198333076092e-06,
455
- "loss": 2.4762,
456
  "step": 22500
457
  },
458
  {
459
  "epoch": 2.56338813039844,
460
- "grad_norm": 2.1555168628692627,
461
  "learning_rate": 7.5551782682512745e-06,
462
- "loss": 2.475,
463
  "step": 23000
464
  },
465
  {
466
  "epoch": 2.6191139593201447,
467
- "grad_norm": 2.227051019668579,
468
  "learning_rate": 6.5905232288933485e-06,
469
- "loss": 2.4787,
470
  "step": 23500
471
  },
472
  {
473
  "epoch": 2.67483978824185,
474
- "grad_norm": 1.9909260272979736,
475
  "learning_rate": 5.6258681895354226e-06,
476
- "loss": 2.472,
477
  "step": 24000
478
  },
479
  {
480
  "epoch": 2.67483978824185,
481
- "eval_loss": 2.399353504180908,
482
- "eval_rouge1": 0.310019871588452,
483
- "eval_rouge2": 0.09398335137757877,
484
- "eval_rougeL": 0.18325719765288911,
485
- "eval_rougeLsum": 0.28927642168730905,
486
- "eval_runtime": 3895.1314,
487
- "eval_samples_per_second": 3.432,
488
  "eval_steps_per_second": 0.215,
489
  "step": 24000
490
  },
491
  {
492
  "epoch": 2.7305656171635553,
493
- "grad_norm": 1.9598902463912964,
494
  "learning_rate": 4.661213150177497e-06,
495
- "loss": 2.4851,
496
  "step": 24500
497
  },
498
  {
499
  "epoch": 2.7862914460852606,
500
- "grad_norm": 2.2174415588378906,
501
  "learning_rate": 3.6965581108195706e-06,
502
- "loss": 2.4702,
503
  "step": 25000
504
  },
505
  {
506
  "epoch": 2.842017275006966,
507
- "grad_norm": 2.0245871543884277,
508
  "learning_rate": 2.7319030714616455e-06,
509
- "loss": 2.4762,
510
  "step": 25500
511
  },
512
  {
513
  "epoch": 2.897743103928671,
514
- "grad_norm": 2.0211620330810547,
515
  "learning_rate": 1.7672480321037198e-06,
516
- "loss": 2.4727,
517
  "step": 26000
518
  },
519
  {
520
  "epoch": 2.897743103928671,
521
- "eval_loss": 2.3910329341888428,
522
- "eval_rouge1": 0.3114248373581785,
523
- "eval_rouge2": 0.09498188720823034,
524
- "eval_rougeL": 0.18378574041919338,
525
- "eval_rougeLsum": 0.290684624851027,
526
- "eval_runtime": 3893.8898,
527
- "eval_samples_per_second": 3.433,
528
  "eval_steps_per_second": 0.215,
529
  "step": 26000
530
  },
531
  {
532
  "epoch": 2.953468932850376,
533
- "grad_norm": 2.2476003170013428,
534
  "learning_rate": 8.025929927457941e-07,
535
- "loss": 2.4671,
536
  "step": 26500
537
  },
538
  {
539
  "epoch": 2.999832822513235,
540
  "step": 26916,
541
  "total_flos": 5.251566637814907e+17,
542
- "train_loss": 2.7471534659841614,
543
- "train_runtime": 62610.6103,
544
- "train_samples_per_second": 13.757,
545
- "train_steps_per_second": 0.43
546
  }
547
  ],
548
  "logging_steps": 500,
 
1
  {
2
+ "best_metric": 0.07459307722662609,
3
+ "best_model_checkpoint": "/bartabsa-reproduce/outputs/gpt22gpt2_42/checkpoint-20000",
4
  "epoch": 2.999832822513235,
5
  "eval_steps": 2000,
6
  "global_step": 26916,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.05572582892170521,
13
+ "grad_norm": 3.767733097076416,
14
  "learning_rate": 2.5e-05,
15
+ "loss": 4.6099,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 0.11145165784341042,
20
+ "grad_norm": 2.3378305435180664,
21
  "learning_rate": 5e-05,
22
+ "loss": 3.4791,
23
  "step": 1000
24
  },
25
  {
26
  "epoch": 0.16717748676511562,
27
+ "grad_norm": 1.7727080583572388,
28
  "learning_rate": 4.9035344960642076e-05,
29
+ "loss": 3.3408,
30
  "step": 1500
31
  },
32
  {
33
  "epoch": 0.22290331568682084,
34
+ "grad_norm": 1.6580827236175537,
35
  "learning_rate": 4.807068992128415e-05,
36
+ "loss": 3.2453,
37
  "step": 2000
38
  },
39
  {
40
  "epoch": 0.22290331568682084,
41
+ "eval_loss": 3.040846347808838,
42
+ "eval_rouge1": 0.2030762835922255,
43
+ "eval_rouge2": 0.03654806097360143,
44
+ "eval_rougeL": 0.1269024607851587,
45
+ "eval_rougeLsum": 0.19064872923784548,
46
+ "eval_runtime": 3935.594,
47
+ "eval_samples_per_second": 3.397,
48
+ "eval_steps_per_second": 0.212,
49
  "step": 2000
50
  },
51
  {
52
  "epoch": 0.27862914460852606,
53
+ "grad_norm": 1.6749674081802368,
54
  "learning_rate": 4.7106034881926225e-05,
55
+ "loss": 3.1966,
56
  "step": 2500
57
  },
58
  {
59
  "epoch": 0.33435497353023125,
60
+ "grad_norm": 1.6479393243789673,
61
  "learning_rate": 4.61413798425683e-05,
62
+ "loss": 3.1416,
63
  "step": 3000
64
  },
65
  {
66
  "epoch": 0.3900808024519365,
67
+ "grad_norm": 1.6228386163711548,
68
  "learning_rate": 4.517672480321037e-05,
69
+ "loss": 3.0931,
70
  "step": 3500
71
  },
72
  {
73
  "epoch": 0.4458066313736417,
74
+ "grad_norm": 1.6669822931289673,
75
  "learning_rate": 4.421206976385245e-05,
76
+ "loss": 3.0421,
77
  "step": 4000
78
  },
79
  {
80
  "epoch": 0.4458066313736417,
81
+ "eval_loss": 2.8455963134765625,
82
+ "eval_rouge1": 0.2380314893051126,
83
+ "eval_rouge2": 0.05128827161353091,
84
+ "eval_rougeL": 0.1453227983282736,
85
+ "eval_rougeLsum": 0.22321551567650025,
86
+ "eval_runtime": 3950.4694,
87
+ "eval_samples_per_second": 3.384,
88
  "eval_steps_per_second": 0.212,
89
  "step": 4000
90
  },
91
  {
92
  "epoch": 0.5015324602953469,
93
+ "grad_norm": 1.721129059791565,
94
  "learning_rate": 4.324741472449452e-05,
95
+ "loss": 3.0031,
96
  "step": 4500
97
  },
98
  {
99
  "epoch": 0.5572582892170521,
100
+ "grad_norm": 1.6570061445236206,
101
  "learning_rate": 4.2282759685136595e-05,
102
+ "loss": 2.9799,
103
  "step": 5000
104
  },
105
  {
106
  "epoch": 0.6129841181387573,
107
+ "grad_norm": 1.6984457969665527,
108
  "learning_rate": 4.131810464577867e-05,
109
+ "loss": 2.9441,
110
  "step": 5500
111
  },
112
  {
113
  "epoch": 0.6687099470604625,
114
+ "grad_norm": 1.780073642730713,
115
  "learning_rate": 4.035344960642074e-05,
116
+ "loss": 2.9227,
117
  "step": 6000
118
  },
119
  {
120
  "epoch": 0.6687099470604625,
121
+ "eval_loss": 2.728790044784546,
122
+ "eval_rouge1": 0.2595394320658464,
123
+ "eval_rouge2": 0.06168732304612659,
124
+ "eval_rougeL": 0.15581664226279762,
125
+ "eval_rougeLsum": 0.2423658467883625,
126
+ "eval_runtime": 3924.3137,
127
+ "eval_samples_per_second": 3.406,
128
+ "eval_steps_per_second": 0.213,
129
  "step": 6000
130
  },
131
  {
132
  "epoch": 0.7244357759821677,
133
+ "grad_norm": 1.7621432542800903,
134
  "learning_rate": 3.938879456706282e-05,
135
+ "loss": 2.9003,
136
  "step": 6500
137
  },
138
  {
139
  "epoch": 0.780161604903873,
140
+ "grad_norm": 1.6608766317367554,
141
  "learning_rate": 3.84241395277049e-05,
142
+ "loss": 2.8805,
143
  "step": 7000
144
  },
145
  {
146
  "epoch": 0.8358874338255782,
147
+ "grad_norm": 1.6685175895690918,
148
  "learning_rate": 3.745948448834697e-05,
149
+ "loss": 2.8633,
150
  "step": 7500
151
  },
152
  {
153
  "epoch": 0.8916132627472834,
154
+ "grad_norm": 1.766258955001831,
155
  "learning_rate": 3.6494829448989046e-05,
156
+ "loss": 2.8436,
157
  "step": 8000
158
  },
159
  {
160
  "epoch": 0.8916132627472834,
161
+ "eval_loss": 2.6555898189544678,
162
+ "eval_rouge1": 0.2583863310766251,
163
+ "eval_rouge2": 0.06324442312681633,
164
+ "eval_rougeL": 0.1554957409851852,
165
+ "eval_rougeLsum": 0.24164198724587968,
166
+ "eval_runtime": 3936.4212,
167
+ "eval_samples_per_second": 3.396,
168
+ "eval_steps_per_second": 0.212,
169
  "step": 8000
170
  },
171
  {
172
  "epoch": 0.9473390916689886,
173
+ "grad_norm": 1.59657621383667,
174
  "learning_rate": 3.553017440963112e-05,
175
+ "loss": 2.8263,
176
  "step": 8500
177
  },
178
  {
179
  "epoch": 1.0030649205906939,
180
+ "grad_norm": 1.5849162340164185,
181
  "learning_rate": 3.4565519370273194e-05,
182
+ "loss": 2.8088,
183
  "step": 9000
184
  },
185
  {
186
  "epoch": 1.058790749512399,
187
+ "grad_norm": 1.7484833002090454,
188
  "learning_rate": 3.360086433091527e-05,
189
+ "loss": 2.7095,
190
  "step": 9500
191
  },
192
  {
193
  "epoch": 1.1145165784341042,
194
+ "grad_norm": 1.5881661176681519,
195
  "learning_rate": 3.263620929155734e-05,
196
+ "loss": 2.6961,
197
  "step": 10000
198
  },
199
  {
200
  "epoch": 1.1145165784341042,
201
+ "eval_loss": 2.5992419719696045,
202
+ "eval_rouge1": 0.25777068407797354,
203
+ "eval_rouge2": 0.06420331632465279,
204
+ "eval_rougeL": 0.15702078007420395,
205
+ "eval_rougeLsum": 0.2409777539933322,
206
+ "eval_runtime": 3924.4889,
207
+ "eval_samples_per_second": 3.406,
208
+ "eval_steps_per_second": 0.213,
209
  "step": 10000
210
  },
211
  {
212
  "epoch": 1.1702424073558095,
213
+ "grad_norm": 1.5762600898742676,
214
  "learning_rate": 3.1671554252199416e-05,
215
+ "loss": 2.6942,
216
  "step": 10500
217
  },
218
  {
219
  "epoch": 1.2259682362775146,
220
+ "grad_norm": 1.630346655845642,
221
  "learning_rate": 3.070689921284149e-05,
222
+ "loss": 2.688,
223
  "step": 11000
224
  },
225
  {
226
  "epoch": 1.28169406519922,
227
+ "grad_norm": 1.6222407817840576,
228
  "learning_rate": 2.9742244173483564e-05,
229
+ "loss": 2.6682,
230
  "step": 11500
231
  },
232
  {
233
  "epoch": 1.337419894120925,
234
+ "grad_norm": 1.6392185688018799,
235
  "learning_rate": 2.8777589134125638e-05,
236
+ "loss": 2.6662,
237
  "step": 12000
238
  },
239
  {
240
  "epoch": 1.337419894120925,
241
+ "eval_loss": 2.551286458969116,
242
+ "eval_rouge1": 0.27486750620247946,
243
+ "eval_rouge2": 0.07168156814787813,
244
+ "eval_rougeL": 0.1642252769198796,
245
+ "eval_rougeLsum": 0.25714973714244077,
246
+ "eval_runtime": 3900.0246,
247
+ "eval_samples_per_second": 3.428,
248
+ "eval_steps_per_second": 0.214,
249
  "step": 12000
250
  },
251
  {
252
  "epoch": 1.3931457230426303,
253
+ "grad_norm": 1.5941892862319946,
254
  "learning_rate": 2.7812934094767712e-05,
255
+ "loss": 2.6653,
256
  "step": 12500
257
  },
258
  {
259
  "epoch": 1.4488715519643356,
260
+ "grad_norm": 1.6253877878189087,
261
  "learning_rate": 2.6848279055409786e-05,
262
+ "loss": 2.6586,
263
  "step": 13000
264
  },
265
  {
266
  "epoch": 1.5045973808860407,
267
+ "grad_norm": 1.6063872575759888,
268
  "learning_rate": 2.588362401605186e-05,
269
+ "loss": 2.6443,
270
  "step": 13500
271
  },
272
  {
273
  "epoch": 1.5603232098077457,
274
+ "grad_norm": 1.713887095451355,
275
  "learning_rate": 2.4918968976693934e-05,
276
+ "loss": 2.6312,
277
  "step": 14000
278
  },
279
  {
280
  "epoch": 1.5603232098077457,
281
+ "eval_loss": 2.508091688156128,
282
+ "eval_rouge1": 0.2530435850081141,
283
+ "eval_rouge2": 0.06384506041315902,
284
+ "eval_rougeL": 0.1543038576804575,
285
+ "eval_rougeLsum": 0.2366360294033542,
286
+ "eval_runtime": 3907.0776,
287
+ "eval_samples_per_second": 3.421,
288
+ "eval_steps_per_second": 0.214,
289
  "step": 14000
290
  },
291
  {
292
  "epoch": 1.616049038729451,
293
+ "grad_norm": 1.8332961797714233,
294
  "learning_rate": 2.3954313937336008e-05,
295
+ "loss": 2.63,
296
  "step": 14500
297
  },
298
  {
299
  "epoch": 1.6717748676511563,
300
+ "grad_norm": 1.6101057529449463,
301
  "learning_rate": 2.2989658897978082e-05,
302
+ "loss": 2.6114,
303
  "step": 15000
304
  },
305
  {
306
  "epoch": 1.7275006965728616,
307
+ "grad_norm": 1.7273740768432617,
308
  "learning_rate": 2.2025003858620156e-05,
309
+ "loss": 2.6051,
310
  "step": 15500
311
  },
312
  {
313
  "epoch": 1.7832265254945667,
314
+ "grad_norm": 1.7632737159729004,
315
  "learning_rate": 2.1060348819262234e-05,
316
+ "loss": 2.6058,
317
  "step": 16000
318
  },
319
  {
320
  "epoch": 1.7832265254945667,
321
+ "eval_loss": 2.463944911956787,
322
+ "eval_rouge1": 0.2636323697106167,
323
+ "eval_rouge2": 0.07174514437983107,
324
+ "eval_rougeL": 0.1601389399578005,
325
+ "eval_rougeLsum": 0.2469347915587097,
326
+ "eval_runtime": 3915.9785,
327
+ "eval_samples_per_second": 3.414,
328
+ "eval_steps_per_second": 0.213,
329
  "step": 16000
330
  },
331
  {
332
  "epoch": 1.8389523544162718,
333
+ "grad_norm": 1.8221988677978516,
334
  "learning_rate": 2.0095693779904308e-05,
335
+ "loss": 2.5988,
336
  "step": 16500
337
  },
338
  {
339
  "epoch": 1.894678183337977,
340
+ "grad_norm": 1.8893871307373047,
341
  "learning_rate": 1.9131038740546382e-05,
342
+ "loss": 2.5847,
343
  "step": 17000
344
  },
345
  {
346
  "epoch": 1.9504040122596824,
347
+ "grad_norm": 1.953140139579773,
348
  "learning_rate": 1.8166383701188456e-05,
349
+ "loss": 2.5804,
350
  "step": 17500
351
  },
352
  {
353
  "epoch": 2.0061298411813877,
354
+ "grad_norm": 1.8473776578903198,
355
  "learning_rate": 1.720172866183053e-05,
356
+ "loss": 2.5725,
357
  "step": 18000
358
  },
359
  {
360
  "epoch": 2.0061298411813877,
361
+ "eval_loss": 2.4292125701904297,
362
+ "eval_rouge1": 0.2567421048616416,
363
+ "eval_rouge2": 0.06891060288535017,
364
+ "eval_rougeL": 0.15595194613787078,
365
+ "eval_rougeLsum": 0.24070474254739155,
366
+ "eval_runtime": 3869.9616,
367
+ "eval_samples_per_second": 3.454,
368
  "eval_steps_per_second": 0.216,
369
  "step": 18000
370
  },
371
  {
372
  "epoch": 2.0618556701030926,
373
+ "grad_norm": 1.7851742506027222,
374
  "learning_rate": 1.6237073622472604e-05,
375
+ "loss": 2.489,
376
  "step": 18500
377
  },
378
  {
379
  "epoch": 2.117581499024798,
380
+ "grad_norm": 2.2768101692199707,
381
  "learning_rate": 1.5272418583114678e-05,
382
+ "loss": 2.4861,
383
  "step": 19000
384
  },
385
  {
386
  "epoch": 2.173307327946503,
387
+ "grad_norm": 2.209219455718994,
388
  "learning_rate": 1.4307763543756752e-05,
389
+ "loss": 2.4912,
390
  "step": 19500
391
  },
392
  {
393
  "epoch": 2.2290331568682085,
394
+ "grad_norm": 2.0397818088531494,
395
  "learning_rate": 1.3343108504398828e-05,
396
+ "loss": 2.4892,
397
  "step": 20000
398
  },
399
  {
400
  "epoch": 2.2290331568682085,
401
+ "eval_loss": 2.4027278423309326,
402
+ "eval_rouge1": 0.2706722599374948,
403
+ "eval_rouge2": 0.07459307722662609,
404
+ "eval_rougeL": 0.16398173707926839,
405
+ "eval_rougeLsum": 0.2530942151578608,
406
+ "eval_runtime": 3893.765,
407
+ "eval_samples_per_second": 3.433,
408
+ "eval_steps_per_second": 0.215,
409
  "step": 20000
410
  },
411
  {
412
  "epoch": 2.2847589857899138,
413
+ "grad_norm": 2.035895824432373,
414
  "learning_rate": 1.2378453465040902e-05,
415
+ "loss": 2.4728,
416
  "step": 20500
417
  },
418
  {
419
  "epoch": 2.340484814711619,
420
+ "grad_norm": 2.106766939163208,
421
  "learning_rate": 1.1413798425682977e-05,
422
+ "loss": 2.4768,
423
  "step": 21000
424
  },
425
  {
426
  "epoch": 2.396210643633324,
427
+ "grad_norm": 2.103576183319092,
428
  "learning_rate": 1.0449143386325052e-05,
429
+ "loss": 2.4689,
430
  "step": 21500
431
  },
432
  {
433
  "epoch": 2.4519364725550292,
434
+ "grad_norm": 2.0902152061462402,
435
  "learning_rate": 9.484488346967126e-06,
436
+ "loss": 2.4647,
437
  "step": 22000
438
  },
439
  {
440
  "epoch": 2.4519364725550292,
441
+ "eval_loss": 2.3800978660583496,
442
+ "eval_rouge1": 0.25082265645038193,
443
+ "eval_rouge2": 0.06640380147549775,
444
+ "eval_rougeL": 0.1539963772798671,
445
+ "eval_rougeLsum": 0.23498739580707717,
446
+ "eval_runtime": 3884.9364,
447
+ "eval_samples_per_second": 3.441,
448
  "eval_steps_per_second": 0.215,
449
  "step": 22000
450
  },
451
  {
452
  "epoch": 2.5076623014767345,
453
+ "grad_norm": 1.8595211505889893,
454
  "learning_rate": 8.5198333076092e-06,
455
+ "loss": 2.4541,
456
  "step": 22500
457
  },
458
  {
459
  "epoch": 2.56338813039844,
460
+ "grad_norm": 2.1612913608551025,
461
  "learning_rate": 7.5551782682512745e-06,
462
+ "loss": 2.4519,
463
  "step": 23000
464
  },
465
  {
466
  "epoch": 2.6191139593201447,
467
+ "grad_norm": 2.2538599967956543,
468
  "learning_rate": 6.5905232288933485e-06,
469
+ "loss": 2.4544,
470
  "step": 23500
471
  },
472
  {
473
  "epoch": 2.67483978824185,
474
+ "grad_norm": 2.060137987136841,
475
  "learning_rate": 5.6258681895354226e-06,
476
+ "loss": 2.4479,
477
  "step": 24000
478
  },
479
  {
480
  "epoch": 2.67483978824185,
481
+ "eval_loss": 2.361970901489258,
482
+ "eval_rouge1": 0.263764877338478,
483
+ "eval_rouge2": 0.07266606751022181,
484
+ "eval_rougeL": 0.1608073972426968,
485
+ "eval_rougeLsum": 0.24731330981409283,
486
+ "eval_runtime": 3893.8374,
487
+ "eval_samples_per_second": 3.433,
488
  "eval_steps_per_second": 0.215,
489
  "step": 24000
490
  },
491
  {
492
  "epoch": 2.7305656171635553,
493
+ "grad_norm": 1.9665076732635498,
494
  "learning_rate": 4.661213150177497e-06,
495
+ "loss": 2.4587,
496
  "step": 24500
497
  },
498
  {
499
  "epoch": 2.7862914460852606,
500
+ "grad_norm": 2.218065023422241,
501
  "learning_rate": 3.6965581108195706e-06,
502
+ "loss": 2.4458,
503
  "step": 25000
504
  },
505
  {
506
  "epoch": 2.842017275006966,
507
+ "grad_norm": 2.042405605316162,
508
  "learning_rate": 2.7319030714616455e-06,
509
+ "loss": 2.4513,
510
  "step": 25500
511
  },
512
  {
513
  "epoch": 2.897743103928671,
514
+ "grad_norm": 1.9766805171966553,
515
  "learning_rate": 1.7672480321037198e-06,
516
+ "loss": 2.4474,
517
  "step": 26000
518
  },
519
  {
520
  "epoch": 2.897743103928671,
521
+ "eval_loss": 2.3526828289031982,
522
+ "eval_rouge1": 0.2544085209463849,
523
+ "eval_rouge2": 0.06856415444008992,
524
+ "eval_rougeL": 0.15601198971765073,
525
+ "eval_rougeLsum": 0.23845835139467592,
526
+ "eval_runtime": 3886.9121,
527
+ "eval_samples_per_second": 3.439,
528
  "eval_steps_per_second": 0.215,
529
  "step": 26000
530
  },
531
  {
532
  "epoch": 2.953468932850376,
533
+ "grad_norm": 2.4569876194000244,
534
  "learning_rate": 8.025929927457941e-07,
535
+ "loss": 2.4411,
536
  "step": 26500
537
  },
538
  {
539
  "epoch": 2.999832822513235,
540
  "step": 26916,
541
  "total_flos": 5.251566637814907e+17,
542
+ "train_loss": 2.7409434880486105,
543
+ "train_runtime": 62948.108,
544
+ "train_samples_per_second": 13.683,
545
+ "train_steps_per_second": 0.428
546
  }
547
  ],
548
  "logging_steps": 500,