misterJB commited on
Commit
7eae7f8
·
verified ·
1 Parent(s): 3e822c3

Training in progress, step 500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed0d669212b40c60987a9febdec5b6e21bd03351aaa4ded9b393cddd4885a0c8
3
  size 6425529112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b8fac3343992a3bef6c12ed1a0758e36cad3b2839846b13971ee09879131c54
3
  size 6425529112
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fb1f5a17180bd5fce232cab2baace30fee1b53707b577513427207805ccc2976
3
  size 12851224679
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04bbebfcb471d4a88a032f2525708906a30884fc984ae26b975c1a13cf99d67d
3
  size 12851224679
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61c19bab1174704a4a4441475683bf1270277af15d2e2c95e964789128e482c4
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a95301d85992ed73403f16c486c03c4e66c9e324b5fb37edd6c6389bb791936a
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e10df580efad97bb633668d65429a37f5f53374d3373d28957a85a8b301eb78e
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,518 +2,118 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.953125,
6
  "eval_steps": 500,
7
- "global_step": 2500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 1.4100425505638123,
14
  "epoch": 0.0390625,
15
- "grad_norm": 1.703125,
16
- "learning_rate": 2.5520833333333334e-06,
17
- "loss": 1.4939990234375,
18
- "mean_token_accuracy": 0.7383647415041924,
19
  "num_tokens": 257685.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 0.7098006828129292,
24
  "epoch": 0.078125,
25
- "grad_norm": 1.3203125,
26
- "learning_rate": 5.156250000000001e-06,
27
- "loss": 0.6490231323242187,
28
- "mean_token_accuracy": 0.8852380600571632,
29
  "num_tokens": 513846.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 0.30407395515590907,
34
  "epoch": 0.1171875,
35
- "grad_norm": 1.2265625,
36
- "learning_rate": 7.760416666666666e-06,
37
- "loss": 0.258120174407959,
38
- "mean_token_accuracy": 0.9527129030227661,
39
  "num_tokens": 766003.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 0.20104818791151047,
44
  "epoch": 0.15625,
45
- "grad_norm": 1.734375,
46
- "learning_rate": 1.0364583333333334e-05,
47
- "loss": 0.1583445167541504,
48
- "mean_token_accuracy": 0.9671251937747002,
49
  "num_tokens": 1024512.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 0.13147448537871242,
54
  "epoch": 0.1953125,
55
- "grad_norm": 0.98828125,
56
- "learning_rate": 1.2968750000000002e-05,
57
- "loss": 0.10042434692382812,
58
- "mean_token_accuracy": 0.9772825425863266,
59
  "num_tokens": 1280614.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 0.10732088888064027,
64
  "epoch": 0.234375,
65
- "grad_norm": 0.54296875,
66
- "learning_rate": 1.5572916666666668e-05,
67
- "loss": 0.08049575805664062,
68
- "mean_token_accuracy": 0.9807940790057182,
69
  "num_tokens": 1533690.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 0.0891605723835528,
74
  "epoch": 0.2734375,
75
- "grad_norm": 0.6640625,
76
- "learning_rate": 1.8177083333333332e-05,
77
- "loss": 0.06720943927764893,
78
- "mean_token_accuracy": 0.9835837849974632,
79
  "num_tokens": 1783169.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 0.06704788052476943,
84
  "epoch": 0.3125,
85
- "grad_norm": 0.8125,
86
- "learning_rate": 1.9913194444444447e-05,
87
- "loss": 0.04878067970275879,
88
- "mean_token_accuracy": 0.9872324925661087,
89
  "num_tokens": 2039809.0,
90
  "step": 400
91
  },
92
  {
93
- "entropy": 0.05854248736985028,
94
  "epoch": 0.3515625,
95
- "grad_norm": 0.2421875,
96
- "learning_rate": 1.9623842592592593e-05,
97
- "loss": 0.04276306629180908,
98
- "mean_token_accuracy": 0.988223501443863,
99
  "num_tokens": 2296699.0,
100
  "step": 450
101
  },
102
  {
103
- "entropy": 0.047506115855649116,
104
  "epoch": 0.390625,
105
- "grad_norm": 0.6484375,
106
- "learning_rate": 1.9334490740740743e-05,
107
- "loss": 0.034850025177001955,
108
- "mean_token_accuracy": 0.9899384224414826,
109
  "num_tokens": 2551050.0,
110
  "step": 500
111
- },
112
- {
113
- "entropy": 0.05063623377121985,
114
- "epoch": 0.4296875,
115
- "grad_norm": 0.248046875,
116
- "learning_rate": 1.904513888888889e-05,
117
- "loss": 0.03843466758728027,
118
- "mean_token_accuracy": 0.9891716066002846,
119
- "num_tokens": 2802698.0,
120
- "step": 550
121
- },
122
- {
123
- "entropy": 0.04273756206966937,
124
- "epoch": 0.46875,
125
- "grad_norm": 0.361328125,
126
- "learning_rate": 1.875578703703704e-05,
127
- "loss": 0.032549431324005125,
128
- "mean_token_accuracy": 0.9905735540390015,
129
- "num_tokens": 3060791.0,
130
- "step": 600
131
- },
132
- {
133
- "entropy": 0.043252475708723065,
134
- "epoch": 0.5078125,
135
- "grad_norm": 0.4765625,
136
- "learning_rate": 1.8466435185185186e-05,
137
- "loss": 0.032298707962036134,
138
- "mean_token_accuracy": 0.9904984217882157,
139
- "num_tokens": 3313602.0,
140
- "step": 650
141
- },
142
- {
143
- "entropy": 0.03833162900991738,
144
- "epoch": 0.546875,
145
- "grad_norm": 0.1845703125,
146
- "learning_rate": 1.8177083333333332e-05,
147
- "loss": 0.027402305603027345,
148
- "mean_token_accuracy": 0.9914076882600784,
149
- "num_tokens": 3572334.0,
150
- "step": 700
151
- },
152
- {
153
- "entropy": 0.037472997857257724,
154
- "epoch": 0.5859375,
155
- "grad_norm": 0.1572265625,
156
- "learning_rate": 1.7887731481481482e-05,
157
- "loss": 0.027391784191131592,
158
- "mean_token_accuracy": 0.9915520316362381,
159
- "num_tokens": 3822475.0,
160
- "step": 750
161
- },
162
- {
163
- "entropy": 0.03570145134814084,
164
- "epoch": 0.625,
165
- "grad_norm": 0.33984375,
166
- "learning_rate": 1.7598379629629632e-05,
167
- "loss": 0.02662867307662964,
168
- "mean_token_accuracy": 0.991677038371563,
169
- "num_tokens": 4077791.0,
170
- "step": 800
171
- },
172
- {
173
- "entropy": 0.03883639902807772,
174
- "epoch": 0.6640625,
175
- "grad_norm": 0.23828125,
176
- "learning_rate": 1.730902777777778e-05,
177
- "loss": 0.0284798264503479,
178
- "mean_token_accuracy": 0.9913818097114563,
179
- "num_tokens": 4327537.0,
180
- "step": 850
181
- },
182
- {
183
- "entropy": 0.03742775755003094,
184
- "epoch": 0.703125,
185
- "grad_norm": 0.21875,
186
- "learning_rate": 1.701967592592593e-05,
187
- "loss": 0.027604823112487794,
188
- "mean_token_accuracy": 0.9913884291052818,
189
- "num_tokens": 4582391.0,
190
- "step": 900
191
- },
192
- {
193
- "entropy": 0.033991393875330685,
194
- "epoch": 0.7421875,
195
- "grad_norm": 0.30859375,
196
- "learning_rate": 1.6730324074074075e-05,
197
- "loss": 0.02489029407501221,
198
- "mean_token_accuracy": 0.9922845155000687,
199
- "num_tokens": 4842265.0,
200
- "step": 950
201
- },
202
- {
203
- "entropy": 0.03376254609320313,
204
- "epoch": 0.78125,
205
- "grad_norm": 0.23828125,
206
- "learning_rate": 1.6440972222222225e-05,
207
- "loss": 0.024386107921600342,
208
- "mean_token_accuracy": 0.9923295575380325,
209
- "num_tokens": 5099714.0,
210
- "step": 1000
211
- },
212
- {
213
- "entropy": 0.03393112221732736,
214
- "epoch": 0.8203125,
215
- "grad_norm": 0.1904296875,
216
- "learning_rate": 1.615162037037037e-05,
217
- "loss": 0.024903218746185302,
218
- "mean_token_accuracy": 0.9919200077652931,
219
- "num_tokens": 5353934.0,
220
- "step": 1050
221
- },
222
- {
223
- "entropy": 0.03500156338326633,
224
- "epoch": 0.859375,
225
- "grad_norm": 0.400390625,
226
- "learning_rate": 1.586226851851852e-05,
227
- "loss": 0.02582158088684082,
228
- "mean_token_accuracy": 0.9918030974268913,
229
- "num_tokens": 5610229.0,
230
- "step": 1100
231
- },
232
- {
233
- "entropy": 0.03437284361571073,
234
- "epoch": 0.8984375,
235
- "grad_norm": 0.2080078125,
236
- "learning_rate": 1.5572916666666668e-05,
237
- "loss": 0.025576255321502685,
238
- "mean_token_accuracy": 0.9920255246758461,
239
- "num_tokens": 5862791.0,
240
- "step": 1150
241
- },
242
- {
243
- "entropy": 0.03161145319696516,
244
- "epoch": 0.9375,
245
- "grad_norm": 0.1376953125,
246
- "learning_rate": 1.5283564814814814e-05,
247
- "loss": 0.022945339679718017,
248
- "mean_token_accuracy": 0.992571029663086,
249
- "num_tokens": 6121431.0,
250
- "step": 1200
251
- },
252
- {
253
- "entropy": 0.030242039281874897,
254
- "epoch": 0.9765625,
255
- "grad_norm": 0.173828125,
256
- "learning_rate": 1.4994212962962964e-05,
257
- "loss": 0.021836049556732177,
258
- "mean_token_accuracy": 0.9925165721774101,
259
- "num_tokens": 6379675.0,
260
- "step": 1250
261
- },
262
- {
263
- "entropy": 0.03080349043942988,
264
- "epoch": 1.015625,
265
- "grad_norm": 0.162109375,
266
- "learning_rate": 1.4704861111111113e-05,
267
- "loss": 0.021759965419769288,
268
- "mean_token_accuracy": 0.9926620882749557,
269
- "num_tokens": 6635287.0,
270
- "step": 1300
271
- },
272
- {
273
- "entropy": 0.030266724079847335,
274
- "epoch": 1.0546875,
275
- "grad_norm": 0.32421875,
276
- "learning_rate": 1.4415509259259259e-05,
277
- "loss": 0.02140357255935669,
278
- "mean_token_accuracy": 0.9928847193717957,
279
- "num_tokens": 6888440.0,
280
- "step": 1350
281
- },
282
- {
283
- "entropy": 0.03156906962394714,
284
- "epoch": 1.09375,
285
- "grad_norm": 0.193359375,
286
- "learning_rate": 1.4126157407407407e-05,
287
- "loss": 0.02241924524307251,
288
- "mean_token_accuracy": 0.9925972136855126,
289
- "num_tokens": 7143446.0,
290
- "step": 1400
291
- },
292
- {
293
- "entropy": 0.03107087403535843,
294
- "epoch": 1.1328125,
295
- "grad_norm": 0.197265625,
296
- "learning_rate": 1.3836805555555557e-05,
297
- "loss": 0.02277942180633545,
298
- "mean_token_accuracy": 0.9925328662991524,
299
- "num_tokens": 7400487.0,
300
- "step": 1450
301
- },
302
- {
303
- "entropy": 0.029809724665246905,
304
- "epoch": 1.171875,
305
- "grad_norm": 0.1435546875,
306
- "learning_rate": 1.3547453703703705e-05,
307
- "loss": 0.021194422245025636,
308
- "mean_token_accuracy": 0.9928572303056717,
309
- "num_tokens": 7655674.0,
310
- "step": 1500
311
- },
312
- {
313
- "entropy": 0.03159077540971339,
314
- "epoch": 1.2109375,
315
- "grad_norm": 0.2060546875,
316
- "learning_rate": 1.3258101851851852e-05,
317
- "loss": 0.02314976692199707,
318
- "mean_token_accuracy": 0.9924373865127564,
319
- "num_tokens": 7908433.0,
320
- "step": 1550
321
- },
322
- {
323
- "entropy": 0.027922376818023623,
324
- "epoch": 1.25,
325
- "grad_norm": 0.1640625,
326
- "learning_rate": 1.2968750000000002e-05,
327
- "loss": 0.019616029262542724,
328
- "mean_token_accuracy": 0.9931238636374473,
329
- "num_tokens": 8163713.0,
330
- "step": 1600
331
- },
332
- {
333
- "entropy": 0.030897088246420026,
334
- "epoch": 1.2890625,
335
- "grad_norm": 0.1552734375,
336
- "learning_rate": 1.267939814814815e-05,
337
- "loss": 0.022646543979644777,
338
- "mean_token_accuracy": 0.992621060013771,
339
- "num_tokens": 8413661.0,
340
- "step": 1650
341
- },
342
- {
343
- "entropy": 0.03009945319034159,
344
- "epoch": 1.328125,
345
- "grad_norm": 0.1826171875,
346
- "learning_rate": 1.2390046296296297e-05,
347
- "loss": 0.022125842571258544,
348
- "mean_token_accuracy": 0.9926324704289436,
349
- "num_tokens": 8666242.0,
350
- "step": 1700
351
- },
352
- {
353
- "entropy": 0.030331599721685053,
354
- "epoch": 1.3671875,
355
- "grad_norm": 0.15234375,
356
- "learning_rate": 1.2100694444444445e-05,
357
- "loss": 0.021965017318725587,
358
- "mean_token_accuracy": 0.9924738201498985,
359
- "num_tokens": 8923368.0,
360
- "step": 1750
361
- },
362
- {
363
- "entropy": 0.029221815695054828,
364
- "epoch": 1.40625,
365
- "grad_norm": 0.1904296875,
366
- "learning_rate": 1.1811342592592595e-05,
367
- "loss": 0.02077268123626709,
368
- "mean_token_accuracy": 0.9929761862754822,
369
- "num_tokens": 9177821.0,
370
- "step": 1800
371
- },
372
- {
373
- "entropy": 0.02919468770734966,
374
- "epoch": 1.4453125,
375
- "grad_norm": 0.1513671875,
376
- "learning_rate": 1.1521990740740743e-05,
377
- "loss": 0.020907692909240723,
378
- "mean_token_accuracy": 0.9928768157958985,
379
- "num_tokens": 9430080.0,
380
- "step": 1850
381
- },
382
- {
383
- "entropy": 0.027250755606219174,
384
- "epoch": 1.484375,
385
- "grad_norm": 0.16015625,
386
- "learning_rate": 1.123263888888889e-05,
387
- "loss": 0.01935715675354004,
388
- "mean_token_accuracy": 0.9932011726498604,
389
- "num_tokens": 9688541.0,
390
- "step": 1900
391
- },
392
- {
393
- "entropy": 0.02769404204096645,
394
- "epoch": 1.5234375,
395
- "grad_norm": 0.1376953125,
396
- "learning_rate": 1.0943287037037038e-05,
397
- "loss": 0.01980802059173584,
398
- "mean_token_accuracy": 0.9932876801490784,
399
- "num_tokens": 9946814.0,
400
- "step": 1950
401
- },
402
- {
403
- "entropy": 0.029987394027411938,
404
- "epoch": 1.5625,
405
- "grad_norm": 0.2890625,
406
- "learning_rate": 1.0653935185185187e-05,
407
- "loss": 0.022057452201843263,
408
- "mean_token_accuracy": 0.9924592301249504,
409
- "num_tokens": 10201426.0,
410
- "step": 2000
411
- },
412
- {
413
- "entropy": 0.02940327289979905,
414
- "epoch": 1.6015625,
415
- "grad_norm": 0.375,
416
- "learning_rate": 1.0364583333333334e-05,
417
- "loss": 0.02158078670501709,
418
- "mean_token_accuracy": 0.9927120169997216,
419
- "num_tokens": 10454276.0,
420
- "step": 2050
421
- },
422
- {
423
- "entropy": 0.02797958446200937,
424
- "epoch": 1.640625,
425
- "grad_norm": 0.13671875,
426
- "learning_rate": 1.0075231481481482e-05,
427
- "loss": 0.020346088409423826,
428
- "mean_token_accuracy": 0.9930062460899353,
429
- "num_tokens": 10707497.0,
430
- "step": 2100
431
- },
432
- {
433
- "entropy": 0.027742678658105435,
434
- "epoch": 1.6796875,
435
- "grad_norm": 0.12890625,
436
- "learning_rate": 9.78587962962963e-06,
437
- "loss": 0.019992319345474244,
438
- "mean_token_accuracy": 0.9929967644810677,
439
- "num_tokens": 10962955.0,
440
- "step": 2150
441
- },
442
- {
443
- "entropy": 0.028013068232685328,
444
- "epoch": 1.71875,
445
- "grad_norm": 0.1337890625,
446
- "learning_rate": 9.496527777777779e-06,
447
- "loss": 0.020120697021484377,
448
- "mean_token_accuracy": 0.9930884554982186,
449
- "num_tokens": 11217333.0,
450
- "step": 2200
451
- },
452
- {
453
- "entropy": 0.027716009449213742,
454
- "epoch": 1.7578125,
455
- "grad_norm": 0.1943359375,
456
- "learning_rate": 9.207175925925927e-06,
457
- "loss": 0.019945393800735473,
458
- "mean_token_accuracy": 0.9929410392045974,
459
- "num_tokens": 11472495.0,
460
- "step": 2250
461
- },
462
- {
463
- "entropy": 0.027896945285610854,
464
- "epoch": 1.796875,
465
- "grad_norm": 0.169921875,
466
- "learning_rate": 8.917824074074075e-06,
467
- "loss": 0.019913656711578367,
468
- "mean_token_accuracy": 0.9931384524703026,
469
- "num_tokens": 11728662.0,
470
- "step": 2300
471
- },
472
- {
473
- "entropy": 0.028674062341451645,
474
- "epoch": 1.8359375,
475
- "grad_norm": 0.31640625,
476
- "learning_rate": 8.628472222222223e-06,
477
- "loss": 0.02098811149597168,
478
- "mean_token_accuracy": 0.992857717871666,
479
- "num_tokens": 11983347.0,
480
- "step": 2350
481
- },
482
- {
483
- "entropy": 0.02859616348519921,
484
- "epoch": 1.875,
485
- "grad_norm": 0.216796875,
486
- "learning_rate": 8.339120370370371e-06,
487
- "loss": 0.02054593563079834,
488
- "mean_token_accuracy": 0.992903139591217,
489
- "num_tokens": 12236526.0,
490
- "step": 2400
491
- },
492
- {
493
- "entropy": 0.029245859370566906,
494
- "epoch": 1.9140625,
495
- "grad_norm": 0.142578125,
496
- "learning_rate": 8.04976851851852e-06,
497
- "loss": 0.021238679885864257,
498
- "mean_token_accuracy": 0.9926192459464073,
499
- "num_tokens": 12489562.0,
500
- "step": 2450
501
- },
502
- {
503
- "entropy": 0.02795163292437792,
504
- "epoch": 1.953125,
505
- "grad_norm": 0.126953125,
506
- "learning_rate": 7.760416666666666e-06,
507
- "loss": 0.0203912615776062,
508
- "mean_token_accuracy": 0.9929744681715965,
509
- "num_tokens": 12748444.0,
510
- "step": 2500
511
  }
512
  ],
513
  "logging_steps": 50,
514
- "max_steps": 3840,
515
  "num_input_tokens_seen": 0,
516
- "num_train_epochs": 3,
517
  "save_steps": 500,
518
  "stateful_callbacks": {
519
  "TrainerControl": {
@@ -527,7 +127,7 @@
527
  "attributes": {}
528
  }
529
  },
530
- "total_flos": 2.4192422651355955e+17,
531
  "train_batch_size": 2,
532
  "trial_name": null,
533
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.390625,
6
  "eval_steps": 500,
7
+ "global_step": 500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 1.351678059399128,
14
  "epoch": 0.0390625,
15
+ "grad_norm": 1.6015625,
16
+ "learning_rate": 3.828125000000001e-06,
17
+ "loss": 1.394322509765625,
18
+ "mean_token_accuracy": 0.7548403647542,
19
  "num_tokens": 257685.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 0.47313837975263595,
24
  "epoch": 0.078125,
25
+ "grad_norm": 0.91015625,
26
+ "learning_rate": 7.734375e-06,
27
+ "loss": 0.4260553359985352,
28
+ "mean_token_accuracy": 0.9251225611567497,
29
  "num_tokens": 513846.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 0.22530030721798538,
34
  "epoch": 0.1171875,
35
+ "grad_norm": 0.921875,
36
+ "learning_rate": 1.1640625000000002e-05,
37
+ "loss": 0.18161891937255858,
38
+ "mean_token_accuracy": 0.9637172383069992,
39
  "num_tokens": 766003.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 0.14051863566040992,
44
  "epoch": 0.15625,
45
+ "grad_norm": 1.6484375,
46
+ "learning_rate": 1.5546875e-05,
47
+ "loss": 0.1136919116973877,
48
+ "mean_token_accuracy": 0.9747392472624778,
49
  "num_tokens": 1024512.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 0.09577633743174374,
54
  "epoch": 0.1953125,
55
+ "grad_norm": 0.71484375,
56
+ "learning_rate": 1.9453125e-05,
57
+ "loss": 0.0734261655807495,
58
+ "mean_token_accuracy": 0.9820188581943512,
59
  "num_tokens": 1280614.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 0.08148466867394745,
64
  "epoch": 0.234375,
65
+ "grad_norm": 0.3828125,
66
+ "learning_rate": 1.9626736111111114e-05,
67
+ "loss": 0.062112469673156735,
68
+ "mean_token_accuracy": 0.9844269120693206,
69
  "num_tokens": 1533690.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 0.06672279690392316,
74
  "epoch": 0.2734375,
75
+ "grad_norm": 0.455078125,
76
+ "learning_rate": 1.9192708333333335e-05,
77
+ "loss": 0.05034114837646484,
78
+ "mean_token_accuracy": 0.986752623617649,
79
  "num_tokens": 1783169.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 0.051021190043538805,
84
  "epoch": 0.3125,
85
+ "grad_norm": 0.49609375,
86
+ "learning_rate": 1.8758680555555557e-05,
87
+ "loss": 0.03769558668136597,
88
+ "mean_token_accuracy": 0.9895561364293098,
89
  "num_tokens": 2039809.0,
90
  "step": 400
91
  },
92
  {
93
+ "entropy": 0.04628240401856601,
94
  "epoch": 0.3515625,
95
+ "grad_norm": 0.1962890625,
96
+ "learning_rate": 1.8324652777777778e-05,
97
+ "loss": 0.034056272506713864,
98
+ "mean_token_accuracy": 0.9902530950307846,
99
  "num_tokens": 2296699.0,
100
  "step": 450
101
  },
102
  {
103
+ "entropy": 0.041101934388279915,
104
  "epoch": 0.390625,
105
+ "grad_norm": 0.498046875,
106
+ "learning_rate": 1.7890625000000003e-05,
107
+ "loss": 0.030158956050872803,
108
+ "mean_token_accuracy": 0.9908681440353394,
109
  "num_tokens": 2551050.0,
110
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  }
112
  ],
113
  "logging_steps": 50,
114
+ "max_steps": 2560,
115
  "num_input_tokens_seen": 0,
116
+ "num_train_epochs": 2,
117
  "save_steps": 500,
118
  "stateful_callbacks": {
119
  "TrainerControl": {
 
127
  "attributes": {}
128
  }
129
  },
130
+ "total_flos": 4.833703032695194e+16,
131
  "train_batch_size": 2,
132
  "trial_name": null,
133
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f10de4aab36802dd9ce012f3583586ae2292b7eb4d0601cf4c5d6f23d568ac7f
3
  size 5713
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26cbf16b2edabc5b8c36be1cccac63caa078ca871b6b4fdcff3ff77d550c19b0
3
  size 5713