{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 100, "global_step": 360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06944444444444445, "grad_norm": 6.3636603355407715, "learning_rate": 1e-06, "loss": 2.6727, "step": 5 }, { "epoch": 0.1388888888888889, "grad_norm": 7.486879825592041, "learning_rate": 1e-06, "loss": 2.3642, "step": 10 }, { "epoch": 0.20833333333333334, "grad_norm": 6.5991997718811035, "learning_rate": 1e-06, "loss": 2.515, "step": 15 }, { "epoch": 0.2777777777777778, "grad_norm": 7.580630779266357, "learning_rate": 1e-06, "loss": 2.3997, "step": 20 }, { "epoch": 0.3472222222222222, "grad_norm": 7.5727410316467285, "learning_rate": 1e-06, "loss": 2.4551, "step": 25 }, { "epoch": 0.4166666666666667, "grad_norm": 8.835946083068848, "learning_rate": 1e-06, "loss": 2.4476, "step": 30 }, { "epoch": 0.4861111111111111, "grad_norm": 7.495606899261475, "learning_rate": 1e-06, "loss": 2.4191, "step": 35 }, { "epoch": 0.5555555555555556, "grad_norm": 8.057035446166992, "learning_rate": 1e-06, "loss": 2.441, "step": 40 }, { "epoch": 0.625, "grad_norm": 6.828744411468506, "learning_rate": 1e-06, "loss": 2.3052, "step": 45 }, { "epoch": 0.6944444444444444, "grad_norm": 7.163251876831055, "learning_rate": 1e-06, "loss": 2.1357, "step": 50 }, { "epoch": 0.7638888888888888, "grad_norm": 5.414941787719727, "learning_rate": 1e-06, "loss": 2.2248, "step": 55 }, { "epoch": 0.8333333333333334, "grad_norm": 6.0801544189453125, "learning_rate": 1e-06, "loss": 2.2934, "step": 60 }, { "epoch": 0.9027777777777778, "grad_norm": 6.054081439971924, "learning_rate": 1e-06, "loss": 2.3014, "step": 65 }, { "epoch": 0.9722222222222222, "grad_norm": 5.827741622924805, "learning_rate": 1e-06, "loss": 2.2515, "step": 70 }, { "epoch": 1.0416666666666667, "grad_norm": 3.5676162242889404, "learning_rate": 1e-06, "loss": 2.0915, "step": 75 }, { "epoch": 1.1111111111111112, "grad_norm": 5.15900993347168, "learning_rate": 1e-06, "loss": 2.0749, "step": 80 }, { "epoch": 1.1805555555555556, "grad_norm": 5.206437110900879, "learning_rate": 1e-06, "loss": 2.0539, "step": 85 }, { "epoch": 1.25, "grad_norm": 5.990969657897949, "learning_rate": 1e-06, "loss": 2.1308, "step": 90 }, { "epoch": 1.3194444444444444, "grad_norm": 6.198008060455322, "learning_rate": 1e-06, "loss": 2.3256, "step": 95 }, { "epoch": 1.3888888888888888, "grad_norm": 5.184628486633301, "learning_rate": 1e-06, "loss": 2.1566, "step": 100 }, { "epoch": 1.3888888888888888, "eval_loss": 2.0880796909332275, "eval_runtime": 34.0667, "eval_samples_per_second": 2.935, "eval_steps_per_second": 0.734, "step": 100 }, { "epoch": 1.4583333333333333, "grad_norm": 5.412724494934082, "learning_rate": 1e-06, "loss": 1.9085, "step": 105 }, { "epoch": 1.5277777777777777, "grad_norm": 3.459959030151367, "learning_rate": 1e-06, "loss": 1.9494, "step": 110 }, { "epoch": 1.5972222222222223, "grad_norm": 5.159445762634277, "learning_rate": 1e-06, "loss": 1.9334, "step": 115 }, { "epoch": 1.6666666666666665, "grad_norm": 5.133082389831543, "learning_rate": 1e-06, "loss": 2.0826, "step": 120 }, { "epoch": 1.7361111111111112, "grad_norm": 4.473026752471924, "learning_rate": 1e-06, "loss": 2.0585, "step": 125 }, { "epoch": 1.8055555555555556, "grad_norm": 5.063863754272461, "learning_rate": 1e-06, "loss": 2.1289, "step": 130 }, { "epoch": 1.875, "grad_norm": 4.927737236022949, "learning_rate": 1e-06, "loss": 1.9872, "step": 135 }, { "epoch": 1.9444444444444444, "grad_norm": 5.563902854919434, "learning_rate": 1e-06, "loss": 1.9803, "step": 140 }, { "epoch": 2.013888888888889, "grad_norm": 3.901442050933838, "learning_rate": 1e-06, "loss": 1.8309, "step": 145 }, { "epoch": 2.0833333333333335, "grad_norm": 3.771136999130249, "learning_rate": 1e-06, "loss": 1.7758, "step": 150 }, { "epoch": 2.1527777777777777, "grad_norm": 4.6159257888793945, "learning_rate": 1e-06, "loss": 1.9193, "step": 155 }, { "epoch": 2.2222222222222223, "grad_norm": 3.758843183517456, "learning_rate": 1e-06, "loss": 1.9329, "step": 160 }, { "epoch": 2.2916666666666665, "grad_norm": 4.267579078674316, "learning_rate": 1e-06, "loss": 2.0399, "step": 165 }, { "epoch": 2.361111111111111, "grad_norm": 3.9819560050964355, "learning_rate": 1e-06, "loss": 1.9568, "step": 170 }, { "epoch": 2.4305555555555554, "grad_norm": 3.8918192386627197, "learning_rate": 1e-06, "loss": 1.7377, "step": 175 }, { "epoch": 2.5, "grad_norm": 3.9746928215026855, "learning_rate": 1e-06, "loss": 1.8949, "step": 180 }, { "epoch": 2.5694444444444446, "grad_norm": 3.328784704208374, "learning_rate": 1e-06, "loss": 1.6509, "step": 185 }, { "epoch": 2.638888888888889, "grad_norm": 3.835324287414551, "learning_rate": 1e-06, "loss": 1.8321, "step": 190 }, { "epoch": 2.7083333333333335, "grad_norm": 3.3603885173797607, "learning_rate": 1e-06, "loss": 1.8628, "step": 195 }, { "epoch": 2.7777777777777777, "grad_norm": 3.7577502727508545, "learning_rate": 1e-06, "loss": 1.8447, "step": 200 }, { "epoch": 2.7777777777777777, "eval_loss": 1.8452154397964478, "eval_runtime": 34.0911, "eval_samples_per_second": 2.933, "eval_steps_per_second": 0.733, "step": 200 }, { "epoch": 2.8472222222222223, "grad_norm": 4.379385948181152, "learning_rate": 1e-06, "loss": 1.8212, "step": 205 }, { "epoch": 2.9166666666666665, "grad_norm": 3.7095022201538086, "learning_rate": 1e-06, "loss": 1.7862, "step": 210 }, { "epoch": 2.986111111111111, "grad_norm": 4.164438724517822, "learning_rate": 1e-06, "loss": 1.8046, "step": 215 }, { "epoch": 3.0555555555555554, "grad_norm": 3.6749582290649414, "learning_rate": 1e-06, "loss": 1.6358, "step": 220 }, { "epoch": 3.125, "grad_norm": 3.7247958183288574, "learning_rate": 1e-06, "loss": 1.791, "step": 225 }, { "epoch": 3.1944444444444446, "grad_norm": 2.9533472061157227, "learning_rate": 1e-06, "loss": 1.6251, "step": 230 }, { "epoch": 3.263888888888889, "grad_norm": 4.062502384185791, "learning_rate": 1e-06, "loss": 1.6976, "step": 235 }, { "epoch": 3.3333333333333335, "grad_norm": 4.328882217407227, "learning_rate": 1e-06, "loss": 1.8438, "step": 240 }, { "epoch": 3.4027777777777777, "grad_norm": 4.158596038818359, "learning_rate": 1e-06, "loss": 1.8998, "step": 245 }, { "epoch": 3.4722222222222223, "grad_norm": 5.7752556800842285, "learning_rate": 1e-06, "loss": 1.7517, "step": 250 }, { "epoch": 3.5416666666666665, "grad_norm": 4.568635940551758, "learning_rate": 1e-06, "loss": 1.6835, "step": 255 }, { "epoch": 3.611111111111111, "grad_norm": 3.6611974239349365, "learning_rate": 1e-06, "loss": 1.7852, "step": 260 }, { "epoch": 3.6805555555555554, "grad_norm": 4.026912212371826, "learning_rate": 1e-06, "loss": 1.7916, "step": 265 }, { "epoch": 3.75, "grad_norm": 4.750195026397705, "learning_rate": 1e-06, "loss": 1.7584, "step": 270 }, { "epoch": 3.8194444444444446, "grad_norm": 3.936798572540283, "learning_rate": 1e-06, "loss": 1.5877, "step": 275 }, { "epoch": 3.888888888888889, "grad_norm": 4.1127800941467285, "learning_rate": 1e-06, "loss": 1.5392, "step": 280 }, { "epoch": 3.9583333333333335, "grad_norm": 3.6437580585479736, "learning_rate": 1e-06, "loss": 1.6125, "step": 285 }, { "epoch": 4.027777777777778, "grad_norm": 3.641177177429199, "learning_rate": 1e-06, "loss": 1.687, "step": 290 }, { "epoch": 4.097222222222222, "grad_norm": 3.797327995300293, "learning_rate": 1e-06, "loss": 1.7779, "step": 295 }, { "epoch": 4.166666666666667, "grad_norm": 5.071943283081055, "learning_rate": 1e-06, "loss": 1.7103, "step": 300 }, { "epoch": 4.166666666666667, "eval_loss": 1.6850143671035767, "eval_runtime": 34.4694, "eval_samples_per_second": 2.901, "eval_steps_per_second": 0.725, "step": 300 }, { "epoch": 4.236111111111111, "grad_norm": 6.09140682220459, "learning_rate": 1e-06, "loss": 1.6347, "step": 305 }, { "epoch": 4.305555555555555, "grad_norm": 5.452902317047119, "learning_rate": 1e-06, "loss": 1.7689, "step": 310 }, { "epoch": 4.375, "grad_norm": 3.5834009647369385, "learning_rate": 1e-06, "loss": 1.6514, "step": 315 }, { "epoch": 4.444444444444445, "grad_norm": 3.288220167160034, "learning_rate": 1e-06, "loss": 1.4941, "step": 320 }, { "epoch": 4.513888888888889, "grad_norm": 4.202756404876709, "learning_rate": 1e-06, "loss": 1.5374, "step": 325 }, { "epoch": 4.583333333333333, "grad_norm": 3.9757556915283203, "learning_rate": 1e-06, "loss": 1.6289, "step": 330 }, { "epoch": 4.652777777777778, "grad_norm": 3.3575947284698486, "learning_rate": 1e-06, "loss": 1.5446, "step": 335 }, { "epoch": 4.722222222222222, "grad_norm": 4.207667350769043, "learning_rate": 1e-06, "loss": 1.5668, "step": 340 }, { "epoch": 4.791666666666667, "grad_norm": 3.2263221740722656, "learning_rate": 1e-06, "loss": 1.4529, "step": 345 }, { "epoch": 4.861111111111111, "grad_norm": 3.272395610809326, "learning_rate": 1e-06, "loss": 1.5215, "step": 350 }, { "epoch": 4.930555555555555, "grad_norm": 3.4315106868743896, "learning_rate": 1e-06, "loss": 1.5781, "step": 355 }, { "epoch": 5.0, "grad_norm": 3.9581406116485596, "learning_rate": 1e-06, "loss": 1.5001, "step": 360 }, { "epoch": 5.0, "step": 360, "total_flos": 2.3797808143060173e+17, "train_loss": 1.9143991947174073, "train_runtime": 6464.4185, "train_samples_per_second": 0.891, "train_steps_per_second": 0.056 } ], "logging_steps": 5, "max_steps": 360, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "total_flos": 2.3797808143060173e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }