{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.8261366232579392,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03655471784327165,
      "grad_norm": 41.77480697631836,
      "learning_rate": 0.00027,
      "loss": 61.162,
      "step": 10
    },
    {
      "epoch": 0.0731094356865433,
      "grad_norm": 21.519004821777344,
      "learning_rate": 0.0002999388289242284,
      "loss": 29.6052,
      "step": 20
    },
    {
      "epoch": 0.10966415352981494,
      "grad_norm": 21.299711227416992,
      "learning_rate": 0.00029972743740666765,
      "loss": 26.4378,
      "step": 30
    },
    {
      "epoch": 0.1462188713730866,
      "grad_norm": 18.043352127075195,
      "learning_rate": 0.0002993652830514899,
      "loss": 26.6099,
      "step": 40
    },
    {
      "epoch": 0.18277358921635822,
      "grad_norm": 14.862595558166504,
      "learning_rate": 0.00029885273051743214,
      "loss": 27.796,
      "step": 50
    },
    {
      "epoch": 0.21932830705962988,
      "grad_norm": 18.81863784790039,
      "learning_rate": 0.000298190295901449,
      "loss": 26.9974,
      "step": 60
    },
    {
      "epoch": 0.25588302490290155,
      "grad_norm": 22.05609130859375,
      "learning_rate": 0.0002973786462190466,
      "loss": 24.4708,
      "step": 70
    },
    {
      "epoch": 0.2924377427461732,
      "grad_norm": 21.369543075561523,
      "learning_rate": 0.0002964185987326545,
      "loss": 25.8129,
      "step": 80
    },
    {
      "epoch": 0.3289924605894448,
      "grad_norm": 21.98700523376465,
      "learning_rate": 0.00029531112012871175,
      "loss": 25.3774,
      "step": 90
    },
    {
      "epoch": 0.36554717843271645,
      "grad_norm": 21.832794189453125,
      "learning_rate": 0.00029405732554429564,
      "loss": 27.0519,
      "step": 100
    },
    {
      "epoch": 0.40210189627598814,
      "grad_norm": 15.660602569580078,
      "learning_rate": 0.00029265847744427303,
      "loss": 26.0219,
      "step": 110
    },
    {
      "epoch": 0.43865661411925977,
      "grad_norm": 21.285099029541016,
      "learning_rate": 0.0002911159843501053,
      "loss": 25.9179,
      "step": 120
    },
    {
      "epoch": 0.4752113319625314,
      "grad_norm": 18.867576599121094,
      "learning_rate": 0.00028943139942158683,
      "loss": 24.2593,
      "step": 130
    },
    {
      "epoch": 0.5117660498058031,
      "grad_norm": 22.40102195739746,
      "learning_rate": 0.00028760641889294446,
      "loss": 23.7113,
      "step": 140
    },
    {
      "epoch": 0.5483207676490747,
      "grad_norm": 24.922903060913086,
      "learning_rate": 0.00028564288036487357,
      "loss": 25.1095,
      "step": 150
    },
    {
      "epoch": 0.5848754854923464,
      "grad_norm": 21.92047882080078,
      "learning_rate": 0.0002835427609542298,
      "loss": 24.1838,
      "step": 160
    },
    {
      "epoch": 0.621430203335618,
      "grad_norm": 19.682661056518555,
      "learning_rate": 0.0002813081753032403,
      "loss": 25.6995,
      "step": 170
    },
    {
      "epoch": 0.6579849211788896,
      "grad_norm": 18.963228225708008,
      "learning_rate": 0.00027894137345023785,
      "loss": 24.6019,
      "step": 180
    },
    {
      "epoch": 0.6945396390221613,
      "grad_norm": 21.453981399536133,
      "learning_rate": 0.0002764447385640632,
      "loss": 25.8001,
      "step": 190
    },
    {
      "epoch": 0.7310943568654329,
      "grad_norm": 21.415788650512695,
      "learning_rate": 0.00027382078454441606,
      "loss": 25.5969,
      "step": 200
    },
    {
      "epoch": 0.7676490747087046,
      "grad_norm": 14.518242835998535,
      "learning_rate": 0.0002710721534905712,
      "loss": 23.8269,
      "step": 210
    },
    {
      "epoch": 0.8042037925519763,
      "grad_norm": 16.467975616455078,
      "learning_rate": 0.00026820161304100823,
      "loss": 24.8,
      "step": 220
    },
    {
      "epoch": 0.8407585103952478,
      "grad_norm": 20.016340255737305,
      "learning_rate": 0.00026521205358663477,
      "loss": 24.9825,
      "step": 230
    },
    {
      "epoch": 0.8773132282385195,
      "grad_norm": 20.681249618530273,
      "learning_rate": 0.0002621064853604071,
      "loss": 23.6612,
      "step": 240
    },
    {
      "epoch": 0.9138679460817912,
      "grad_norm": 19.81218910217285,
      "learning_rate": 0.0002588880354062814,
      "loss": 23.9873,
      "step": 250
    },
    {
      "epoch": 0.9504226639250628,
      "grad_norm": 23.763002395629883,
      "learning_rate": 0.00025555994443054504,
      "loss": 24.4914,
      "step": 260
    },
    {
      "epoch": 0.9869773817683345,
      "grad_norm": 19.300912857055664,
      "learning_rate": 0.0002521255635387005,
      "loss": 24.607,
      "step": 270
    },
    {
      "epoch": 1.021932830705963,
      "grad_norm": 19.00154685974121,
      "learning_rate": 0.0002485883508611858,
      "loss": 19.2292,
      "step": 280
    },
    {
      "epoch": 1.0584875485492347,
      "grad_norm": 16.176002502441406,
      "learning_rate": 0.00024495186807133056,
      "loss": 17.2361,
      "step": 290
    },
    {
      "epoch": 1.0950422663925063,
      "grad_norm": 19.164424896240234,
      "learning_rate": 0.00024121977679905266,
      "loss": 18.1384,
      "step": 300
    },
    {
      "epoch": 1.1315969842357778,
      "grad_norm": 17.439308166503906,
      "learning_rate": 0.00023739583494390752,
      "loss": 17.726,
      "step": 310
    },
    {
      "epoch": 1.1681517020790495,
      "grad_norm": 23.18360710144043,
      "learning_rate": 0.00023348389289120158,
      "loss": 18.0533,
      "step": 320
    },
    {
      "epoch": 1.2047064199223212,
      "grad_norm": 23.22918701171875,
      "learning_rate": 0.0002294878896349807,
      "loss": 18.3334,
      "step": 330
    },
    {
      "epoch": 1.2412611377655929,
      "grad_norm": 20.49732780456543,
      "learning_rate": 0.00022541184881179737,
      "loss": 17.3156,
      "step": 340
    },
    {
      "epoch": 1.2778158556088646,
      "grad_norm": 23.029027938842773,
      "learning_rate": 0.00022125987464924926,
      "loss": 18.2255,
      "step": 350
    },
    {
      "epoch": 1.3143705734521363,
      "grad_norm": 19.006935119628906,
      "learning_rate": 0.0002170361478333702,
      "loss": 18.849,
      "step": 360
    },
    {
      "epoch": 1.350925291295408,
      "grad_norm": 19.27879524230957,
      "learning_rate": 0.0002127449212990339,
      "loss": 18.9721,
      "step": 370
    },
    {
      "epoch": 1.3874800091386794,
      "grad_norm": 19.815698623657227,
      "learning_rate": 0.00020839051594760872,
      "loss": 17.9975,
      "step": 380
    },
    {
      "epoch": 1.424034726981951,
      "grad_norm": 22.413816452026367,
      "learning_rate": 0.00020397731629617636,
      "loss": 17.729,
      "step": 390
    },
    {
      "epoch": 1.4605894448252228,
      "grad_norm": 19.09907341003418,
      "learning_rate": 0.00019950976606269497,
      "loss": 18.0831,
      "step": 400
    },
    {
      "epoch": 1.4971441626684945,
      "grad_norm": 17.786975860595703,
      "learning_rate": 0.00019499236369155157,
      "loss": 17.9862,
      "step": 410
    },
    {
      "epoch": 1.533698880511766,
      "grad_norm": 22.56906509399414,
      "learning_rate": 0.00019042965782401018,
      "loss": 17.6971,
      "step": 420
    },
    {
      "epoch": 1.5702535983550376,
      "grad_norm": 24.24004364013672,
      "learning_rate": 0.00018582624271811532,
      "loss": 18.3465,
      "step": 430
    },
    {
      "epoch": 1.6068083161983093,
      "grad_norm": 25.849702835083008,
      "learning_rate": 0.00018118675362266385,
      "loss": 18.3028,
      "step": 440
    },
    {
      "epoch": 1.643363034041581,
      "grad_norm": 26.149951934814453,
      "learning_rate": 0.00017651586210990232,
      "loss": 16.919,
      "step": 450
    },
    {
      "epoch": 1.6799177518848527,
      "grad_norm": 21.565109252929688,
      "learning_rate": 0.00017181827137164953,
      "loss": 18.9473,
      "step": 460
    },
    {
      "epoch": 1.7164724697281244,
      "grad_norm": 19.176294326782227,
      "learning_rate": 0.00016709871148358108,
      "loss": 16.9236,
      "step": 470
    },
    {
      "epoch": 1.753027187571396,
      "grad_norm": 19.63998031616211,
      "learning_rate": 0.00016236193464244444,
      "loss": 17.8228,
      "step": 480
    },
    {
      "epoch": 1.7895819054146676,
      "grad_norm": 23.75813865661621,
      "learning_rate": 0.00015761271038099912,
      "loss": 17.3249,
      "step": 490
    },
    {
      "epoch": 1.8261366232579392,
      "grad_norm": 23.72920036315918,
      "learning_rate": 0.00015285582076550198,
      "loss": 19.046,
      "step": 500
    }
  ],
  "logging_steps": 10,
  "max_steps": 1000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.47221899456696e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}