{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100000.0, "global_step": 96, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03125, "grad_norm": 189.0, "learning_rate": 0.0, "loss": 5.3761, "mean_token_accuracy": 0.4222588539123535, "step": 1 }, { "epoch": 0.0625, "grad_norm": 187.0, "learning_rate": 9.999999999999999e-06, "loss": 5.4398, "mean_token_accuracy": 0.42141932249069214, "step": 2 }, { "epoch": 0.09375, "grad_norm": 29.375, "learning_rate": 1.9999999999999998e-05, "loss": 3.0716, "mean_token_accuracy": 0.5135114192962646, "step": 3 }, { "epoch": 0.125, "grad_norm": 17.625, "learning_rate": 3e-05, "loss": 2.1269, "mean_token_accuracy": 0.646616518497467, "step": 4 }, { "epoch": 0.15625, "grad_norm": 40.0, "learning_rate": 2.967741935483871e-05, "loss": 1.5739, "mean_token_accuracy": 0.7027881145477295, "step": 5 }, { "epoch": 0.1875, "grad_norm": 7.5625, "learning_rate": 2.935483870967742e-05, "loss": 1.2517, "mean_token_accuracy": 0.7468280792236328, "step": 6 }, { "epoch": 0.21875, "grad_norm": 8.125, "learning_rate": 2.903225806451613e-05, "loss": 1.1719, "mean_token_accuracy": 0.7558277249336243, "step": 7 }, { "epoch": 0.25, "grad_norm": 6.84375, "learning_rate": 2.870967741935484e-05, "loss": 1.0179, "mean_token_accuracy": 0.7789369821548462, "step": 8 }, { "epoch": 0.28125, "grad_norm": 4.5625, "learning_rate": 2.8387096774193552e-05, "loss": 0.9696, "mean_token_accuracy": 0.7895718812942505, "step": 9 }, { "epoch": 0.3125, "grad_norm": 3.71875, "learning_rate": 2.806451612903226e-05, "loss": 0.9511, "mean_token_accuracy": 0.7805737853050232, "step": 10 }, { "epoch": 0.34375, "grad_norm": 4.40625, "learning_rate": 2.7741935483870968e-05, "loss": 0.9864, "mean_token_accuracy": 0.7709536552429199, "step": 11 }, { "epoch": 0.375, "grad_norm": 3.75, "learning_rate": 2.7419354838709678e-05, "loss": 0.8482, "mean_token_accuracy": 0.7989001274108887, "step": 12 }, { "epoch": 0.40625, "grad_norm": 10.1875, "learning_rate": 2.7096774193548387e-05, "loss": 0.8758, "mean_token_accuracy": 0.7906646728515625, "step": 13 }, { "epoch": 0.4375, "grad_norm": 4.375, "learning_rate": 2.6774193548387097e-05, "loss": 0.8406, "mean_token_accuracy": 0.7955039739608765, "step": 14 }, { "epoch": 0.46875, "grad_norm": 3.34375, "learning_rate": 2.6451612903225806e-05, "loss": 0.8676, "mean_token_accuracy": 0.7854324579238892, "step": 15 }, { "epoch": 0.5, "grad_norm": 2.9375, "learning_rate": 2.6129032258064516e-05, "loss": 0.8414, "mean_token_accuracy": 0.7912613749504089, "step": 16 }, { "epoch": 0.53125, "grad_norm": 2.671875, "learning_rate": 2.5806451612903226e-05, "loss": 0.8772, "mean_token_accuracy": 0.7842865586280823, "step": 17 }, { "epoch": 0.5625, "grad_norm": 3.078125, "learning_rate": 2.548387096774194e-05, "loss": 0.8225, "mean_token_accuracy": 0.7936803102493286, "step": 18 }, { "epoch": 0.59375, "grad_norm": 2.9375, "learning_rate": 2.5161290322580648e-05, "loss": 0.7779, "mean_token_accuracy": 0.8040180802345276, "step": 19 }, { "epoch": 0.625, "grad_norm": 2.6875, "learning_rate": 2.4838709677419358e-05, "loss": 0.7973, "mean_token_accuracy": 0.7983251810073853, "step": 20 }, { "epoch": 0.65625, "grad_norm": 2.734375, "learning_rate": 2.4516129032258067e-05, "loss": 0.7968, "mean_token_accuracy": 0.7994269132614136, "step": 21 }, { "epoch": 0.6875, "grad_norm": 2.703125, "learning_rate": 2.4193548387096773e-05, "loss": 0.7902, "mean_token_accuracy": 0.799203634262085, "step": 22 }, { "epoch": 0.71875, "grad_norm": 2.796875, "learning_rate": 2.3870967741935483e-05, "loss": 0.7776, "mean_token_accuracy": 0.8002063035964966, "step": 23 }, { "epoch": 0.75, "grad_norm": 2.578125, "learning_rate": 2.3548387096774193e-05, "loss": 0.7243, "mean_token_accuracy": 0.8160389065742493, "step": 24 }, { "epoch": 0.78125, "grad_norm": 2.453125, "learning_rate": 2.3225806451612902e-05, "loss": 0.7039, "mean_token_accuracy": 0.817916750907898, "step": 25 }, { "epoch": 0.8125, "grad_norm": 2.4375, "learning_rate": 2.2903225806451612e-05, "loss": 0.7417, "mean_token_accuracy": 0.8118934035301208, "step": 26 }, { "epoch": 0.84375, "grad_norm": 2.65625, "learning_rate": 2.258064516129032e-05, "loss": 0.7577, "mean_token_accuracy": 0.8054953217506409, "step": 27 }, { "epoch": 0.875, "grad_norm": 2.546875, "learning_rate": 2.2258064516129034e-05, "loss": 0.7387, "mean_token_accuracy": 0.8119359612464905, "step": 28 }, { "epoch": 0.90625, "grad_norm": 2.421875, "learning_rate": 2.1935483870967744e-05, "loss": 0.7541, "mean_token_accuracy": 0.8092722296714783, "step": 29 }, { "epoch": 0.9375, "grad_norm": 2.4375, "learning_rate": 2.1612903225806454e-05, "loss": 0.768, "mean_token_accuracy": 0.8078325390815735, "step": 30 }, { "epoch": 0.96875, "grad_norm": 2.3125, "learning_rate": 2.1290322580645163e-05, "loss": 0.6699, "mean_token_accuracy": 0.8237451910972595, "step": 31 }, { "epoch": 1.0, "grad_norm": 2.5, "learning_rate": 2.0967741935483873e-05, "loss": 0.721, "mean_token_accuracy": 0.8169918656349182, "step": 32 }, { "epoch": 1.03125, "grad_norm": 2.140625, "learning_rate": 2.0645161290322582e-05, "loss": 0.564, "mean_token_accuracy": 0.848415732383728, "step": 33 }, { "epoch": 1.0625, "grad_norm": 2.171875, "learning_rate": 2.032258064516129e-05, "loss": 0.5677, "mean_token_accuracy": 0.8471966981887817, "step": 34 }, { "epoch": 1.09375, "grad_norm": 2.09375, "learning_rate": 1.9999999999999998e-05, "loss": 0.5395, "mean_token_accuracy": 0.8564569354057312, "step": 35 }, { "epoch": 1.125, "grad_norm": 2.125, "learning_rate": 1.9677419354838708e-05, "loss": 0.5333, "mean_token_accuracy": 0.8557505011558533, "step": 36 }, { "epoch": 1.15625, "grad_norm": 2.1875, "learning_rate": 1.935483870967742e-05, "loss": 0.5176, "mean_token_accuracy": 0.859412670135498, "step": 37 }, { "epoch": 1.1875, "grad_norm": 2.4375, "learning_rate": 1.903225806451613e-05, "loss": 0.5255, "mean_token_accuracy": 0.8575270771980286, "step": 38 }, { "epoch": 1.21875, "grad_norm": 2.5, "learning_rate": 1.870967741935484e-05, "loss": 0.533, "mean_token_accuracy": 0.8552377820014954, "step": 39 }, { "epoch": 1.25, "grad_norm": 2.6875, "learning_rate": 1.838709677419355e-05, "loss": 0.5193, "mean_token_accuracy": 0.8584305047988892, "step": 40 }, { "epoch": 1.28125, "grad_norm": 2.46875, "learning_rate": 1.806451612903226e-05, "loss": 0.484, "mean_token_accuracy": 0.8667942881584167, "step": 41 }, { "epoch": 1.3125, "grad_norm": 2.671875, "learning_rate": 1.774193548387097e-05, "loss": 0.536, "mean_token_accuracy": 0.8559280633926392, "step": 42 }, { "epoch": 1.34375, "grad_norm": 2.765625, "learning_rate": 1.741935483870968e-05, "loss": 0.5303, "mean_token_accuracy": 0.8593308329582214, "step": 43 }, { "epoch": 1.375, "grad_norm": 2.53125, "learning_rate": 1.7096774193548388e-05, "loss": 0.506, "mean_token_accuracy": 0.8613560199737549, "step": 44 }, { "epoch": 1.40625, "grad_norm": 2.609375, "learning_rate": 1.6774193548387094e-05, "loss": 0.51, "mean_token_accuracy": 0.8609563708305359, "step": 45 }, { "epoch": 1.4375, "grad_norm": 2.5625, "learning_rate": 1.6451612903225807e-05, "loss": 0.5152, "mean_token_accuracy": 0.8623529672622681, "step": 46 }, { "epoch": 1.46875, "grad_norm": 2.375, "learning_rate": 1.6129032258064517e-05, "loss": 0.4736, "mean_token_accuracy": 0.8748390078544617, "step": 47 }, { "epoch": 1.5, "grad_norm": 2.484375, "learning_rate": 1.5806451612903226e-05, "loss": 0.525, "mean_token_accuracy": 0.8599056005477905, "step": 48 }, { "epoch": 1.53125, "grad_norm": 2.546875, "learning_rate": 1.5483870967741936e-05, "loss": 0.5388, "mean_token_accuracy": 0.8566693067550659, "step": 49 }, { "epoch": 1.5625, "grad_norm": 2.53125, "learning_rate": 1.5161290322580646e-05, "loss": 0.5067, "mean_token_accuracy": 0.8631064295768738, "step": 50 }, { "epoch": 1.59375, "grad_norm": 2.453125, "learning_rate": 1.4838709677419355e-05, "loss": 0.5125, "mean_token_accuracy": 0.8618575930595398, "step": 51 }, { "epoch": 1.625, "grad_norm": 2.46875, "learning_rate": 1.4516129032258065e-05, "loss": 0.4827, "mean_token_accuracy": 0.8688983917236328, "step": 52 }, { "epoch": 1.65625, "grad_norm": 2.375, "learning_rate": 1.4193548387096776e-05, "loss": 0.4968, "mean_token_accuracy": 0.8661786317825317, "step": 53 }, { "epoch": 1.6875, "grad_norm": 2.390625, "learning_rate": 1.3870967741935484e-05, "loss": 0.5088, "mean_token_accuracy": 0.8644407391548157, "step": 54 }, { "epoch": 1.71875, "grad_norm": 2.3125, "learning_rate": 1.3548387096774194e-05, "loss": 0.4772, "mean_token_accuracy": 0.8696879148483276, "step": 55 }, { "epoch": 1.75, "grad_norm": 2.4375, "learning_rate": 1.3225806451612903e-05, "loss": 0.5017, "mean_token_accuracy": 0.8670803308486938, "step": 56 }, { "epoch": 1.78125, "grad_norm": 2.328125, "learning_rate": 1.2903225806451613e-05, "loss": 0.5076, "mean_token_accuracy": 0.8628109693527222, "step": 57 }, { "epoch": 1.8125, "grad_norm": 2.28125, "learning_rate": 1.2580645161290324e-05, "loss": 0.482, "mean_token_accuracy": 0.8687414526939392, "step": 58 }, { "epoch": 1.84375, "grad_norm": 2.3125, "learning_rate": 1.2258064516129034e-05, "loss": 0.4701, "mean_token_accuracy": 0.8710319399833679, "step": 59 }, { "epoch": 1.875, "grad_norm": 2.28125, "learning_rate": 1.1935483870967742e-05, "loss": 0.4713, "mean_token_accuracy": 0.8721932768821716, "step": 60 }, { "epoch": 1.90625, "grad_norm": 2.25, "learning_rate": 1.1612903225806451e-05, "loss": 0.4893, "mean_token_accuracy": 0.8665592074394226, "step": 61 }, { "epoch": 1.9375, "grad_norm": 2.34375, "learning_rate": 1.129032258064516e-05, "loss": 0.4741, "mean_token_accuracy": 0.8702882528305054, "step": 62 }, { "epoch": 1.96875, "grad_norm": 2.234375, "learning_rate": 1.0967741935483872e-05, "loss": 0.4385, "mean_token_accuracy": 0.8789963722229004, "step": 63 }, { "epoch": 2.0, "grad_norm": 2.109375, "learning_rate": 1.0645161290322582e-05, "loss": 0.4023, "mean_token_accuracy": 0.8905370235443115, "step": 64 }, { "epoch": 2.03125, "grad_norm": 2.125, "learning_rate": 1.0322580645161291e-05, "loss": 0.3732, "mean_token_accuracy": 0.9010424613952637, "step": 65 }, { "epoch": 2.0625, "grad_norm": 2.015625, "learning_rate": 9.999999999999999e-06, "loss": 0.3639, "mean_token_accuracy": 0.903084397315979, "step": 66 }, { "epoch": 2.09375, "grad_norm": 2.15625, "learning_rate": 9.67741935483871e-06, "loss": 0.338, "mean_token_accuracy": 0.9060129523277283, "step": 67 }, { "epoch": 2.125, "grad_norm": 2.09375, "learning_rate": 9.35483870967742e-06, "loss": 0.368, "mean_token_accuracy": 0.902858555316925, "step": 68 }, { "epoch": 2.15625, "grad_norm": 2.171875, "learning_rate": 9.03225806451613e-06, "loss": 0.3557, "mean_token_accuracy": 0.9019851088523865, "step": 69 }, { "epoch": 2.1875, "grad_norm": 2.1875, "learning_rate": 8.70967741935484e-06, "loss": 0.3477, "mean_token_accuracy": 0.9044142961502075, "step": 70 }, { "epoch": 2.21875, "grad_norm": 2.34375, "learning_rate": 8.387096774193547e-06, "loss": 0.3691, "mean_token_accuracy": 0.8983471989631653, "step": 71 }, { "epoch": 2.25, "grad_norm": 2.28125, "learning_rate": 8.064516129032258e-06, "loss": 0.3374, "mean_token_accuracy": 0.9074879884719849, "step": 72 }, { "epoch": 2.28125, "grad_norm": 2.34375, "learning_rate": 7.741935483870968e-06, "loss": 0.3373, "mean_token_accuracy": 0.9076383709907532, "step": 73 }, { "epoch": 2.3125, "grad_norm": 2.4375, "learning_rate": 7.419354838709678e-06, "loss": 0.3518, "mean_token_accuracy": 0.9008965492248535, "step": 74 }, { "epoch": 2.34375, "grad_norm": 2.3125, "learning_rate": 7.096774193548388e-06, "loss": 0.311, "mean_token_accuracy": 0.9127882719039917, "step": 75 }, { "epoch": 2.375, "grad_norm": 2.34375, "learning_rate": 6.774193548387097e-06, "loss": 0.3072, "mean_token_accuracy": 0.9146369099617004, "step": 76 }, { "epoch": 2.40625, "grad_norm": 2.578125, "learning_rate": 6.451612903225806e-06, "loss": 0.3627, "mean_token_accuracy": 0.9014822840690613, "step": 77 }, { "epoch": 2.4375, "grad_norm": 2.484375, "learning_rate": 6.129032258064517e-06, "loss": 0.3219, "mean_token_accuracy": 0.90999835729599, "step": 78 }, { "epoch": 2.46875, "grad_norm": 2.390625, "learning_rate": 5.8064516129032256e-06, "loss": 0.3146, "mean_token_accuracy": 0.9131765365600586, "step": 79 }, { "epoch": 2.5, "grad_norm": 2.625, "learning_rate": 5.483870967741936e-06, "loss": 0.3407, "mean_token_accuracy": 0.906125545501709, "step": 80 }, { "epoch": 2.53125, "grad_norm": 2.625, "learning_rate": 5.161290322580646e-06, "loss": 0.3365, "mean_token_accuracy": 0.9048438668251038, "step": 81 }, { "epoch": 2.5625, "grad_norm": 2.5625, "learning_rate": 4.838709677419355e-06, "loss": 0.336, "mean_token_accuracy": 0.9078318476676941, "step": 82 }, { "epoch": 2.59375, "grad_norm": 2.59375, "learning_rate": 4.516129032258065e-06, "loss": 0.3293, "mean_token_accuracy": 0.9083393812179565, "step": 83 }, { "epoch": 2.625, "grad_norm": 2.71875, "learning_rate": 4.1935483870967736e-06, "loss": 0.3619, "mean_token_accuracy": 0.9003615379333496, "step": 84 }, { "epoch": 2.65625, "grad_norm": 2.515625, "learning_rate": 3.870967741935484e-06, "loss": 0.348, "mean_token_accuracy": 0.9053718447685242, "step": 85 }, { "epoch": 2.6875, "grad_norm": 2.578125, "learning_rate": 3.548387096774194e-06, "loss": 0.3351, "mean_token_accuracy": 0.9073967933654785, "step": 86 }, { "epoch": 2.71875, "grad_norm": 2.578125, "learning_rate": 3.225806451612903e-06, "loss": 0.3425, "mean_token_accuracy": 0.9027067422866821, "step": 87 }, { "epoch": 2.75, "grad_norm": 2.484375, "learning_rate": 2.9032258064516128e-06, "loss": 0.3282, "mean_token_accuracy": 0.9083059430122375, "step": 88 }, { "epoch": 2.78125, "grad_norm": 2.53125, "learning_rate": 2.580645161290323e-06, "loss": 0.3159, "mean_token_accuracy": 0.9134910106658936, "step": 89 }, { "epoch": 2.8125, "grad_norm": 2.515625, "learning_rate": 2.2580645161290324e-06, "loss": 0.3426, "mean_token_accuracy": 0.9068294167518616, "step": 90 }, { "epoch": 2.84375, "grad_norm": 2.609375, "learning_rate": 1.935483870967742e-06, "loss": 0.3427, "mean_token_accuracy": 0.9076163172721863, "step": 91 }, { "epoch": 2.875, "grad_norm": 2.609375, "learning_rate": 1.6129032258064516e-06, "loss": 0.3513, "mean_token_accuracy": 0.9038658142089844, "step": 92 }, { "epoch": 2.90625, "grad_norm": 2.640625, "learning_rate": 1.2903225806451614e-06, "loss": 0.3378, "mean_token_accuracy": 0.9042639136314392, "step": 93 }, { "epoch": 2.9375, "grad_norm": 2.515625, "learning_rate": 9.67741935483871e-07, "loss": 0.3308, "mean_token_accuracy": 0.9092385172843933, "step": 94 }, { "epoch": 2.96875, "grad_norm": 2.59375, "learning_rate": 6.451612903225807e-07, "loss": 0.3369, "mean_token_accuracy": 0.9074664115905762, "step": 95 }, { "epoch": 3.0, "grad_norm": 2.453125, "learning_rate": 3.2258064516129035e-07, "loss": 0.331, "mean_token_accuracy": 0.9075567722320557, "step": 96 } ], "logging_steps": 1.0, "max_steps": 96, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.63102607549399e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }