misterJB commited on
Commit
f873d20
·
verified ·
1 Parent(s): 3c8b98c

Training in progress, step 1500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:72812eb4a2663fa8500c7a5809ba3d8d1217fa71da08a4104ac84b2c4baffe49
3
  size 6425529112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c651d278253319c1b93c3b131fc31165efa751e16ffe94eb310a7b4bdfe01084
3
  size 6425529112
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dcb53afe4d66917c9a3fdf0d2fb1683d61488399a5c074f28626c3ba952ecd8f
3
  size 12851224679
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a43fa5ae43e347ff1e2a0cdec05688f22dd267a5acf33ab37f021f044220db69
3
  size 12851224679
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61c19bab1174704a4a4441475683bf1270277af15d2e2c95e964789128e482c4
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d816652875af5b096437609afdf7a105e45a5aed127110ed63352bdde6ad2657
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8a0161fb643893b4bd0a9724aa51736729cc07ff0a3f386f1ba978002596386
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.78125,
6
  "eval_steps": 500,
7
- "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -208,6 +208,106 @@
208
  "mean_token_accuracy": 0.9924442365765571,
209
  "num_tokens": 5099714.0,
210
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  }
212
  ],
213
  "logging_steps": 50,
@@ -227,7 +327,7 @@
227
  "attributes": {}
228
  }
229
  },
230
- "total_flos": 9.663810471217152e+16,
231
  "train_batch_size": 2,
232
  "trial_name": null,
233
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.171875,
6
  "eval_steps": 500,
7
+ "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
208
  "mean_token_accuracy": 0.9924442365765571,
209
  "num_tokens": 5099714.0,
210
  "step": 1000
211
+ },
212
+ {
213
+ "entropy": 0.032110756486654284,
214
+ "epoch": 0.8203125,
215
+ "grad_norm": 0.171875,
216
+ "learning_rate": 1.3116319444444446e-05,
217
+ "loss": 0.023927602767944336,
218
+ "mean_token_accuracy": 0.992151814699173,
219
+ "num_tokens": 5353934.0,
220
+ "step": 1050
221
+ },
222
+ {
223
+ "entropy": 0.03357607708312571,
224
+ "epoch": 0.859375,
225
+ "grad_norm": 0.220703125,
226
+ "learning_rate": 1.2682291666666669e-05,
227
+ "loss": 0.024996912479400633,
228
+ "mean_token_accuracy": 0.9920313712954522,
229
+ "num_tokens": 5610229.0,
230
+ "step": 1100
231
+ },
232
+ {
233
+ "entropy": 0.03356592872180045,
234
+ "epoch": 0.8984375,
235
+ "grad_norm": 0.203125,
236
+ "learning_rate": 1.2248263888888889e-05,
237
+ "loss": 0.025175034999847412,
238
+ "mean_token_accuracy": 0.9921249234676361,
239
+ "num_tokens": 5862791.0,
240
+ "step": 1150
241
+ },
242
+ {
243
+ "entropy": 0.031079287379980086,
244
+ "epoch": 0.9375,
245
+ "grad_norm": 0.1318359375,
246
+ "learning_rate": 1.1814236111111112e-05,
247
+ "loss": 0.022713756561279295,
248
+ "mean_token_accuracy": 0.9926198759675026,
249
+ "num_tokens": 6121431.0,
250
+ "step": 1200
251
+ },
252
+ {
253
+ "entropy": 0.02976180042140186,
254
+ "epoch": 0.9765625,
255
+ "grad_norm": 0.154296875,
256
+ "learning_rate": 1.1380208333333333e-05,
257
+ "loss": 0.02123898983001709,
258
+ "mean_token_accuracy": 0.992766418159008,
259
+ "num_tokens": 6379675.0,
260
+ "step": 1250
261
+ },
262
+ {
263
+ "entropy": 0.030388496736995875,
264
+ "epoch": 1.015625,
265
+ "grad_norm": 0.1650390625,
266
+ "learning_rate": 1.0946180555555556e-05,
267
+ "loss": 0.021283388137817383,
268
+ "mean_token_accuracy": 0.9927816662192345,
269
+ "num_tokens": 6635287.0,
270
+ "step": 1300
271
+ },
272
+ {
273
+ "entropy": 0.029865577281452716,
274
+ "epoch": 1.0546875,
275
+ "grad_norm": 0.265625,
276
+ "learning_rate": 1.0512152777777778e-05,
277
+ "loss": 0.021030676364898682,
278
+ "mean_token_accuracy": 0.9929129666090012,
279
+ "num_tokens": 6888440.0,
280
+ "step": 1350
281
+ },
282
+ {
283
+ "entropy": 0.031085506100207567,
284
+ "epoch": 1.09375,
285
+ "grad_norm": 0.1748046875,
286
+ "learning_rate": 1.0078125000000001e-05,
287
+ "loss": 0.02215445041656494,
288
+ "mean_token_accuracy": 0.9926813915371895,
289
+ "num_tokens": 7143446.0,
290
+ "step": 1400
291
+ },
292
+ {
293
+ "entropy": 0.03091464822180569,
294
+ "epoch": 1.1328125,
295
+ "grad_norm": 0.2255859375,
296
+ "learning_rate": 9.644097222222222e-06,
297
+ "loss": 0.022361652851104738,
298
+ "mean_token_accuracy": 0.9926716023683548,
299
+ "num_tokens": 7400487.0,
300
+ "step": 1450
301
+ },
302
+ {
303
+ "entropy": 0.029652795745059846,
304
+ "epoch": 1.171875,
305
+ "grad_norm": 0.130859375,
306
+ "learning_rate": 9.210069444444446e-06,
307
+ "loss": 0.02084646940231323,
308
+ "mean_token_accuracy": 0.9928994616866111,
309
+ "num_tokens": 7655674.0,
310
+ "step": 1500
311
  }
312
  ],
313
  "logging_steps": 50,
 
327
  "attributes": {}
328
  }
329
  },
330
+ "total_flos": 1.453331385078866e+17,
331
  "train_batch_size": 2,
332
  "trial_name": null,
333
  "trial_params": null