misterJB committed on
Commit
d902074
·
verified ·
1 Parent(s): c624da1

Training in progress, step 2500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fafc67897a73ac0e70a597348c3155bc381e60dd90c67d5d6ed7fc84ba105613
3
  size 6425529112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57c4831d4e47f3c33f5f8969a02983374946ab3e5f383e907aca585a86e5b94d
3
  size 6425529112
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b0f91ec07067e47aed7410c0ad51ca6a2cf72d126a46306b9c6488df8bde784a
3
  size 12851224679
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4159a3207a568baa044ce95dd31d1026fc5b9d5c7ec0e1a1965827fde31d1cab
3
  size 12851224679
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7949a82b4956d6dc1683588213cba80de08632b440866c7377566d6b5ddef0f6
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ab5f3bc9f12c23cdfc964a4b7a0357b0f5ab599c9285206de2dfd5600d45e1d
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.5625,
6
  "eval_steps": 500,
7
- "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -408,6 +408,106 @@
408
  "mean_token_accuracy": 0.9926387491822243,
409
  "num_tokens": 10201426.0,
410
  "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
  }
412
  ],
413
  "logging_steps": 50,
@@ -427,7 +527,7 @@
427
  "attributes": {}
428
  }
429
  },
430
- "total_flos": 1.936166915593175e+17,
431
  "train_batch_size": 2,
432
  "trial_name": null,
433
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.953125,
6
  "eval_steps": 500,
7
+ "global_step": 2500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
408
  "mean_token_accuracy": 0.9926387491822243,
409
  "num_tokens": 10201426.0,
410
  "step": 2000
411
+ },
412
+ {
413
+ "entropy": 0.030316293751820923,
414
+ "epoch": 1.6015625,
415
+ "grad_norm": 0.396484375,
416
+ "learning_rate": 4.435763888888889e-06,
417
+ "loss": 0.021486189365386963,
418
+ "mean_token_accuracy": 0.9927674040198327,
419
+ "num_tokens": 10454276.0,
420
+ "step": 2050
421
+ },
422
+ {
423
+ "entropy": 0.02887956439051777,
424
+ "epoch": 1.640625,
425
+ "grad_norm": 0.1513671875,
426
+ "learning_rate": 4.001736111111112e-06,
427
+ "loss": 0.020056800842285158,
428
+ "mean_token_accuracy": 0.9931298586726188,
429
+ "num_tokens": 10707497.0,
430
+ "step": 2100
431
+ },
432
+ {
433
+ "entropy": 0.028787780185230077,
434
+ "epoch": 1.6796875,
435
+ "grad_norm": 0.1455078125,
436
+ "learning_rate": 3.5677083333333335e-06,
437
+ "loss": 0.0200783896446228,
438
+ "mean_token_accuracy": 0.9929500755667686,
439
+ "num_tokens": 10962955.0,
440
+ "step": 2150
441
+ },
442
+ {
443
+ "entropy": 0.028631422137841582,
444
+ "epoch": 1.71875,
445
+ "grad_norm": 0.1484375,
446
+ "learning_rate": 3.1336805555555562e-06,
447
+ "loss": 0.02008913516998291,
448
+ "mean_token_accuracy": 0.9930686053633689,
449
+ "num_tokens": 11217333.0,
450
+ "step": 2200
451
+ },
452
+ {
453
+ "entropy": 0.02850784788839519,
454
+ "epoch": 1.7578125,
455
+ "grad_norm": 0.1884765625,
456
+ "learning_rate": 2.699652777777778e-06,
457
+ "loss": 0.01996502876281738,
458
+ "mean_token_accuracy": 0.9930599120259285,
459
+ "num_tokens": 11472495.0,
460
+ "step": 2250
461
+ },
462
+ {
463
+ "entropy": 0.028683945639058947,
464
+ "epoch": 1.796875,
465
+ "grad_norm": 0.1728515625,
466
+ "learning_rate": 2.265625e-06,
467
+ "loss": 0.019921081066131593,
468
+ "mean_token_accuracy": 0.9932510870695114,
469
+ "num_tokens": 11728662.0,
470
+ "step": 2300
471
+ },
472
+ {
473
+ "entropy": 0.02975102465134114,
474
+ "epoch": 1.8359375,
475
+ "grad_norm": 0.32421875,
476
+ "learning_rate": 1.8315972222222223e-06,
477
+ "loss": 0.020963990688323976,
478
+ "mean_token_accuracy": 0.9929134699702263,
479
+ "num_tokens": 11983347.0,
480
+ "step": 2350
481
+ },
482
+ {
483
+ "entropy": 0.02991143790073693,
484
+ "epoch": 1.875,
485
+ "grad_norm": 0.169921875,
486
+ "learning_rate": 1.3975694444444446e-06,
487
+ "loss": 0.020808370113372804,
488
+ "mean_token_accuracy": 0.9929909712076187,
489
+ "num_tokens": 12236526.0,
490
+ "step": 2400
491
+ },
492
+ {
493
+ "entropy": 0.030044674500823022,
494
+ "epoch": 1.9140625,
495
+ "grad_norm": 0.169921875,
496
+ "learning_rate": 9.635416666666667e-07,
497
+ "loss": 0.021207802295684815,
498
+ "mean_token_accuracy": 0.9927184066176414,
499
+ "num_tokens": 12489562.0,
500
+ "step": 2450
501
+ },
502
+ {
503
+ "entropy": 0.02905242417007685,
504
+ "epoch": 1.953125,
505
+ "grad_norm": 0.1552734375,
506
+ "learning_rate": 5.295138888888889e-07,
507
+ "loss": 0.020578203201293947,
508
+ "mean_token_accuracy": 0.9929935920238495,
509
+ "num_tokens": 12748444.0,
510
+ "step": 2500
511
  }
512
  ],
513
  "logging_steps": 50,
 
527
  "attributes": {}
528
  }
529
  },
530
+ "total_flos": 2.4192422651355955e+17,
531
  "train_batch_size": 2,
532
  "trial_name": null,
533
  "trial_params": null