{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9528535980148884,
  "eval_steps": 500,
  "global_step": 150,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03970223325062035,
      "grad_norm": 10.819287300109863,
      "learning_rate": 0.00019866666666666668,
      "loss": 75.0498,
      "step": 2
    },
    {
      "epoch": 0.0794044665012407,
      "grad_norm": null,
      "learning_rate": 0.000196,
      "loss": 70.9894,
      "step": 4
    },
    {
      "epoch": 0.11910669975186104,
      "grad_norm": null,
      "learning_rate": 0.0001946666666666667,
      "loss": 67.1935,
      "step": 6
    },
    {
      "epoch": 0.1588089330024814,
      "grad_norm": 14.99397087097168,
      "learning_rate": 0.00019333333333333333,
      "loss": 64.9122,
      "step": 8
    },
    {
      "epoch": 0.19851116625310175,
      "grad_norm": 23.344423294067383,
      "learning_rate": 0.00019066666666666668,
      "loss": 61.9918,
      "step": 10
    },
    {
      "epoch": 0.23821339950372208,
      "grad_norm": 22.76801109313965,
      "learning_rate": 0.000188,
      "loss": 60.0971,
      "step": 12
    },
    {
      "epoch": 0.27791563275434245,
      "grad_norm": 12.378619194030762,
      "learning_rate": 0.00018533333333333333,
      "loss": 57.308,
      "step": 14
    },
    {
      "epoch": 0.3176178660049628,
      "grad_norm": 10.69680118560791,
      "learning_rate": 0.00018266666666666667,
      "loss": 55.1176,
      "step": 16
    },
    {
      "epoch": 0.3573200992555831,
      "grad_norm": 11.427177429199219,
      "learning_rate": 0.00018,
      "loss": 54.4207,
      "step": 18
    },
    {
      "epoch": 0.3970223325062035,
      "grad_norm": 8.827279090881348,
      "learning_rate": 0.00017733333333333335,
      "loss": 55.7351,
      "step": 20
    },
    {
      "epoch": 0.43672456575682383,
      "grad_norm": 9.299980163574219,
      "learning_rate": 0.00017466666666666667,
      "loss": 53.5308,
      "step": 22
    },
    {
      "epoch": 0.47642679900744417,
      "grad_norm": 7.532495021820068,
      "learning_rate": 0.000172,
      "loss": 53.108,
      "step": 24
    },
    {
      "epoch": 0.5161290322580645,
      "grad_norm": 6.720007419586182,
      "learning_rate": 0.00016933333333333335,
      "loss": 51.7795,
      "step": 26
    },
    {
      "epoch": 0.5558312655086849,
      "grad_norm": 6.452204704284668,
      "learning_rate": 0.0001666666666666667,
      "loss": 52.2811,
      "step": 28
    },
    {
      "epoch": 0.5955334987593052,
      "grad_norm": 13.8781156539917,
      "learning_rate": 0.000164,
      "loss": 55.3741,
      "step": 30
    },
    {
      "epoch": 0.6352357320099256,
      "grad_norm": 7.135878562927246,
      "learning_rate": 0.00016133333333333334,
      "loss": 52.6366,
      "step": 32
    },
    {
      "epoch": 0.674937965260546,
      "grad_norm": 5.283834457397461,
      "learning_rate": 0.00015866666666666668,
      "loss": 52.9453,
      "step": 34
    },
    {
      "epoch": 0.7146401985111662,
      "grad_norm": 5.093813419342041,
      "learning_rate": 0.00015600000000000002,
      "loss": 51.6369,
      "step": 36
    },
    {
      "epoch": 0.7543424317617866,
      "grad_norm": 9.148886680603027,
      "learning_rate": 0.00015333333333333334,
      "loss": 52.6797,
      "step": 38
    },
    {
      "epoch": 0.794044665012407,
      "grad_norm": 5.6377081871032715,
      "learning_rate": 0.00015066666666666668,
      "loss": 52.1141,
      "step": 40
    },
    {
      "epoch": 0.8337468982630273,
      "grad_norm": 5.645318984985352,
      "learning_rate": 0.000148,
      "loss": 53.2159,
      "step": 42
    },
    {
      "epoch": 0.8734491315136477,
      "grad_norm": 4.151162147521973,
      "learning_rate": 0.00014533333333333333,
      "loss": 53.3123,
      "step": 44
    },
    {
      "epoch": 0.913151364764268,
      "grad_norm": 5.807156562805176,
      "learning_rate": 0.00014266666666666667,
      "loss": 51.7508,
      "step": 46
    },
    {
      "epoch": 0.9528535980148883,
      "grad_norm": 4.72116756439209,
      "learning_rate": 0.00014,
      "loss": 50.6579,
      "step": 48
    },
    {
      "epoch": 0.9925558312655087,
      "grad_norm": 4.37299108505249,
      "learning_rate": 0.00013733333333333333,
      "loss": 52.3235,
      "step": 50
    },
    {
      "epoch": 1.0198511166253101,
      "grad_norm": 7.881499290466309,
      "learning_rate": 0.00013466666666666667,
      "loss": 36.5848,
      "step": 52
    },
    {
      "epoch": 1.0595533498759304,
      "grad_norm": 5.981872081756592,
      "learning_rate": 0.000132,
      "loss": 52.5394,
      "step": 54
    },
    {
      "epoch": 1.099255583126551,
      "grad_norm": 8.142722129821777,
      "learning_rate": 0.00012933333333333332,
      "loss": 51.9813,
      "step": 56
    },
    {
      "epoch": 1.1389578163771712,
      "grad_norm": 8.74074935913086,
      "learning_rate": 0.00012666666666666666,
      "loss": 52.258,
      "step": 58
    },
    {
      "epoch": 1.1786600496277915,
      "grad_norm": 4.094906806945801,
      "learning_rate": 0.000124,
      "loss": 50.8239,
      "step": 60
    },
    {
      "epoch": 1.218362282878412,
      "grad_norm": 6.232289791107178,
      "learning_rate": 0.00012133333333333335,
      "loss": 51.2459,
      "step": 62
    },
    {
      "epoch": 1.2580645161290323,
      "grad_norm": 5.168397426605225,
      "learning_rate": 0.00011866666666666669,
      "loss": 50.1529,
      "step": 64
    },
    {
      "epoch": 1.2977667493796525,
      "grad_norm": 15.91019058227539,
      "learning_rate": 0.000116,
      "loss": 51.9121,
      "step": 66
    },
    {
      "epoch": 1.337468982630273,
      "grad_norm": 13.859478950500488,
      "learning_rate": 0.00011333333333333334,
      "loss": 52.3775,
      "step": 68
    },
    {
      "epoch": 1.3771712158808933,
      "grad_norm": 5.351056098937988,
      "learning_rate": 0.00011066666666666667,
      "loss": 52.3156,
      "step": 70
    },
    {
      "epoch": 1.4168734491315136,
      "grad_norm": 8.041162490844727,
      "learning_rate": 0.00010800000000000001,
      "loss": 51.5322,
      "step": 72
    },
    {
      "epoch": 1.4565756823821339,
      "grad_norm": 5.210399627685547,
      "learning_rate": 0.00010533333333333332,
      "loss": 50.7804,
      "step": 74
    },
    {
      "epoch": 1.4962779156327544,
      "grad_norm": 5.557844161987305,
      "learning_rate": 0.00010266666666666666,
      "loss": 51.3733,
      "step": 76
    },
    {
      "epoch": 1.5359801488833746,
      "grad_norm": 4.784997940063477,
      "learning_rate": 0.0001,
      "loss": 52.5475,
      "step": 78
    },
    {
      "epoch": 1.5756823821339951,
      "grad_norm": 6.19811487197876,
      "learning_rate": 9.733333333333335e-05,
      "loss": 51.6006,
      "step": 80
    },
    {
      "epoch": 1.6153846153846154,
      "grad_norm": 6.56746768951416,
      "learning_rate": 9.466666666666667e-05,
      "loss": 51.0926,
      "step": 82
    },
    {
      "epoch": 1.6550868486352357,
      "grad_norm": 5.6273369789123535,
      "learning_rate": 9.200000000000001e-05,
      "loss": 50.9075,
      "step": 84
    },
    {
      "epoch": 1.694789081885856,
      "grad_norm": 5.728956699371338,
      "learning_rate": 8.933333333333334e-05,
      "loss": 49.2599,
      "step": 86
    },
    {
      "epoch": 1.7344913151364765,
      "grad_norm": 5.968097686767578,
      "learning_rate": 8.666666666666667e-05,
      "loss": 51.7706,
      "step": 88
    },
    {
      "epoch": 1.7741935483870968,
      "grad_norm": 5.664118766784668,
      "learning_rate": 8.4e-05,
      "loss": 52.0589,
      "step": 90
    },
    {
      "epoch": 1.8138957816377173,
      "grad_norm": 5.830865383148193,
      "learning_rate": 8.133333333333334e-05,
      "loss": 51.3431,
      "step": 92
    },
    {
      "epoch": 1.8535980148883375,
      "grad_norm": 4.932041645050049,
      "learning_rate": 7.866666666666666e-05,
      "loss": 51.1417,
      "step": 94
    },
    {
      "epoch": 1.8933002481389578,
      "grad_norm": 6.466910362243652,
      "learning_rate": 7.6e-05,
      "loss": 52.1107,
      "step": 96
    },
    {
      "epoch": 1.933002481389578,
      "grad_norm": 6.244173049926758,
      "learning_rate": 7.333333333333333e-05,
      "loss": 53.797,
      "step": 98
    },
    {
      "epoch": 1.9727047146401984,
      "grad_norm": 5.435241222381592,
      "learning_rate": 7.066666666666667e-05,
      "loss": 50.5896,
      "step": 100
    },
    {
      "epoch": 2.0,
      "grad_norm": 3.3919055461883545,
      "learning_rate": 6.800000000000001e-05,
      "loss": 35.4669,
      "step": 102
    },
    {
      "epoch": 2.0397022332506203,
      "grad_norm": 5.537905216217041,
      "learning_rate": 6.533333333333334e-05,
      "loss": 52.3083,
      "step": 104
    },
    {
      "epoch": 2.0794044665012406,
      "grad_norm": 4.633804798126221,
      "learning_rate": 6.266666666666667e-05,
      "loss": 51.0959,
      "step": 106
    },
    {
      "epoch": 2.119106699751861,
      "grad_norm": 7.164112567901611,
      "learning_rate": 6e-05,
      "loss": 52.2189,
      "step": 108
    },
    {
      "epoch": 2.1588089330024816,
      "grad_norm": 8.255313873291016,
      "learning_rate": 5.7333333333333336e-05,
      "loss": 50.0571,
      "step": 110
    },
    {
      "epoch": 2.198511166253102,
      "grad_norm": 6.691186428070068,
      "learning_rate": 5.466666666666666e-05,
      "loss": 50.3818,
      "step": 112
    },
    {
      "epoch": 2.238213399503722,
      "grad_norm": 5.381030082702637,
      "learning_rate": 5.2000000000000004e-05,
      "loss": 49.0542,
      "step": 114
    },
    {
      "epoch": 2.2779156327543424,
      "grad_norm": 4.446993350982666,
      "learning_rate": 4.933333333333334e-05,
      "loss": 49.9555,
      "step": 116
    },
    {
      "epoch": 2.3176178660049627,
      "grad_norm": 7.161057949066162,
      "learning_rate": 4.666666666666667e-05,
      "loss": 51.0844,
      "step": 118
    },
    {
      "epoch": 2.357320099255583,
      "grad_norm": 11.226974487304688,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 48.4176,
      "step": 120
    },
    {
      "epoch": 2.3970223325062037,
      "grad_norm": 4.884274005889893,
      "learning_rate": 4.133333333333333e-05,
      "loss": 51.0978,
      "step": 122
    },
    {
      "epoch": 2.436724565756824,
      "grad_norm": 6.24745512008667,
      "learning_rate": 3.866666666666667e-05,
      "loss": 50.5736,
      "step": 124
    },
    {
      "epoch": 2.4764267990074442,
      "grad_norm": 5.6129150390625,
      "learning_rate": 3.6e-05,
      "loss": 49.4998,
      "step": 126
    },
    {
      "epoch": 2.5161290322580645,
      "grad_norm": 7.322082042694092,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 47.8495,
      "step": 128
    },
    {
      "epoch": 2.555831265508685,
      "grad_norm": 7.050683975219727,
      "learning_rate": 3.066666666666667e-05,
      "loss": 50.6109,
      "step": 130
    },
    {
      "epoch": 2.595533498759305,
      "grad_norm": 5.612835884094238,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 50.7746,
      "step": 132
    },
    {
      "epoch": 2.6352357320099253,
      "grad_norm": 5.297985553741455,
      "learning_rate": 2.5333333333333337e-05,
      "loss": 51.2929,
      "step": 134
    },
    {
      "epoch": 2.674937965260546,
      "grad_norm": 5.420377731323242,
      "learning_rate": 2.2666666666666668e-05,
      "loss": 49.2917,
      "step": 136
    },
    {
      "epoch": 2.7146401985111663,
      "grad_norm": 8.291167259216309,
      "learning_rate": 2e-05,
      "loss": 51.0978,
      "step": 138
    },
    {
      "epoch": 2.7543424317617866,
      "grad_norm": 4.987746238708496,
      "learning_rate": 1.7333333333333336e-05,
      "loss": 49.8268,
      "step": 140
    },
    {
      "epoch": 2.794044665012407,
      "grad_norm": 5.7439494132995605,
      "learning_rate": 1.4666666666666668e-05,
      "loss": 49.5457,
      "step": 142
    },
    {
      "epoch": 2.833746898263027,
      "grad_norm": 4.388277053833008,
      "learning_rate": 1.2e-05,
      "loss": 50.3072,
      "step": 144
    },
    {
      "epoch": 2.873449131513648,
      "grad_norm": 10.240147590637207,
      "learning_rate": 9.333333333333334e-06,
      "loss": 49.2494,
      "step": 146
    },
    {
      "epoch": 2.9131513647642677,
      "grad_norm": 5.075774669647217,
      "learning_rate": 6.666666666666667e-06,
      "loss": 49.8291,
      "step": 148
    },
    {
      "epoch": 2.9528535980148884,
      "grad_norm": 5.719578742980957,
      "learning_rate": 4.000000000000001e-06,
      "loss": 50.2739,
      "step": 150
    },
    {
      "epoch": 2.9528535980148884,
      "step": 150,
      "total_flos": 246404020387764.0,
      "train_loss": 52.365604654947916,
      "train_runtime": 3857.4888,
      "train_samples_per_second": 0.626,
      "train_steps_per_second": 0.039
    }
  ],
  "logging_steps": 2,
  "max_steps": 150,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 246404020387764.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}