{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.5625, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.8528947708010673, "epoch": 0.0390625, "grad_norm": 1.140625, "learning_rate": 3.828125000000001e-06, "loss": 1.0035149383544921, "mean_token_accuracy": 0.7955961337685585, "num_tokens": 289985.0, "step": 50 }, { "entropy": 0.19647314386442305, "epoch": 0.078125, "grad_norm": 1.03125, "learning_rate": 7.734375e-06, "loss": 0.1893597412109375, "mean_token_accuracy": 0.9560051780939102, "num_tokens": 578511.0, "step": 100 }, { "entropy": 0.09508214401081204, "epoch": 0.1171875, "grad_norm": 1.140625, "learning_rate": 1.1640625000000002e-05, "loss": 0.08917176246643066, "mean_token_accuracy": 0.9763016304373742, "num_tokens": 863089.0, "step": 150 }, { "entropy": 0.06972143727354706, "epoch": 0.15625, "grad_norm": 1.03125, "learning_rate": 1.5546875e-05, "loss": 0.0670042610168457, "mean_token_accuracy": 0.9812742775678635, "num_tokens": 1152491.0, "step": 200 }, { "entropy": 0.048063146471977236, "epoch": 0.1953125, "grad_norm": 0.78125, "learning_rate": 1.9453125e-05, "loss": 0.04478513240814209, "mean_token_accuracy": 0.9859032621979713, "num_tokens": 1439889.0, "step": 250 }, { "entropy": 0.04555465183220804, "epoch": 0.234375, "grad_norm": 0.39453125, "learning_rate": 1.9626736111111114e-05, "loss": 0.04354703426361084, "mean_token_accuracy": 0.986427218914032, "num_tokens": 1724136.0, "step": 300 }, { "entropy": 0.03841961392201483, "epoch": 0.2734375, "grad_norm": 0.40234375, "learning_rate": 1.9192708333333335e-05, "loss": 0.03762138843536377, "mean_token_accuracy": 0.9881205716729164, "num_tokens": 2005086.0, "step": 350 }, { "entropy": 0.028935400638729333, "epoch": 0.3125, "grad_norm": 0.427734375, "learning_rate": 1.8758680555555557e-05, "loss": 0.027575111389160155, "mean_token_accuracy": 0.9903467008471489, "num_tokens": 2294186.0, "step": 400 }, { "entropy": 0.025832002903334796, "epoch": 0.3515625, "grad_norm": 0.1171875, "learning_rate": 1.8324652777777778e-05, "loss": 0.024780447483062743, "mean_token_accuracy": 0.9910794907808304, "num_tokens": 2583118.0, "step": 450 }, { "entropy": 0.02405791813507676, "epoch": 0.390625, "grad_norm": 0.34765625, "learning_rate": 1.7890625000000003e-05, "loss": 0.0229555344581604, "mean_token_accuracy": 0.9914542031288147, "num_tokens": 2869922.0, "step": 500 }, { "entropy": 0.027367654335685075, "epoch": 0.4296875, "grad_norm": 0.130859375, "learning_rate": 1.7456597222222224e-05, "loss": 0.02657809019088745, "mean_token_accuracy": 0.9907592672109604, "num_tokens": 3151975.0, "step": 550 }, { "entropy": 0.02466743137687445, "epoch": 0.46875, "grad_norm": 0.2275390625, "learning_rate": 1.7022569444444446e-05, "loss": 0.023617582321166994, "mean_token_accuracy": 0.9914914935827255, "num_tokens": 3441223.0, "step": 600 }, { "entropy": 0.025295174946077168, "epoch": 0.5078125, "grad_norm": 0.23828125, "learning_rate": 1.6588541666666667e-05, "loss": 0.023536417484283447, "mean_token_accuracy": 0.9913991931080818, "num_tokens": 3725476.0, "step": 650 }, { "entropy": 0.021516693667508663, "epoch": 0.546875, "grad_norm": 0.1376953125, "learning_rate": 1.615451388888889e-05, "loss": 0.02034649610519409, "mean_token_accuracy": 0.9919455853104592, "num_tokens": 4015608.0, "step": 700 }, { "entropy": 0.02162676403298974, "epoch": 0.5859375, "grad_norm": 0.1025390625, "learning_rate": 1.5720486111111114e-05, "loss": 0.02098503351211548, "mean_token_accuracy": 0.9920175784826278, "num_tokens": 4297395.0, "step": 750 }, { "entropy": 0.02071377928368747, "epoch": 0.625, "grad_norm": 0.13671875, "learning_rate": 1.5286458333333335e-05, "loss": 0.020009536743164063, "mean_token_accuracy": 0.9920961040258408, "num_tokens": 4584615.0, "step": 800 }, { "entropy": 0.022029129918664694, "epoch": 0.6640625, "grad_norm": 0.07861328125, "learning_rate": 1.4852430555555556e-05, "loss": 0.021189706325531008, "mean_token_accuracy": 0.991996766924858, "num_tokens": 4865596.0, "step": 850 }, { "entropy": 0.021224360242486, "epoch": 0.703125, "grad_norm": 0.12890625, "learning_rate": 1.4418402777777778e-05, "loss": 0.020408225059509278, "mean_token_accuracy": 0.9920864021778106, "num_tokens": 5151434.0, "step": 900 }, { "entropy": 0.02010633623227477, "epoch": 0.7421875, "grad_norm": 0.734375, "learning_rate": 1.3984375000000001e-05, "loss": 0.01937895655632019, "mean_token_accuracy": 0.9925438091158867, "num_tokens": 5444488.0, "step": 950 }, { "entropy": 0.01990706148557365, "epoch": 0.78125, "grad_norm": 0.1162109375, "learning_rate": 1.3550347222222224e-05, "loss": 0.019094927310943602, "mean_token_accuracy": 0.9926347577571869, "num_tokens": 5734302.0, "step": 1000 }, { "entropy": 0.02041028759907931, "epoch": 0.8203125, "grad_norm": 0.09521484375, "learning_rate": 1.3116319444444446e-05, "loss": 0.019333359003067017, "mean_token_accuracy": 0.9924440628290176, "num_tokens": 6020184.0, "step": 1050 }, { "entropy": 0.020887044854462147, "epoch": 0.859375, "grad_norm": 0.1904296875, "learning_rate": 1.2682291666666669e-05, "loss": 0.01991814136505127, "mean_token_accuracy": 0.9922475999593735, "num_tokens": 6307384.0, "step": 1100 }, { "entropy": 0.020934463790617884, "epoch": 0.8984375, "grad_norm": 0.10546875, "learning_rate": 1.2248263888888889e-05, "loss": 0.020386738777160643, "mean_token_accuracy": 0.992243467271328, "num_tokens": 6590414.0, "step": 1150 }, { "entropy": 0.018531152345240116, "epoch": 0.9375, "grad_norm": 0.072265625, "learning_rate": 1.1814236111111112e-05, "loss": 0.01763615131378174, "mean_token_accuracy": 0.9930036315321922, "num_tokens": 6879445.0, "step": 1200 }, { "entropy": 0.018522553648799657, "epoch": 0.9765625, "grad_norm": 0.09521484375, "learning_rate": 1.1380208333333333e-05, "loss": 0.017574281692504884, "mean_token_accuracy": 0.9928433075547218, "num_tokens": 7168537.0, "step": 1250 }, { "entropy": 0.01806912823114544, "epoch": 1.015625, "grad_norm": 0.08349609375, "learning_rate": 1.0946180555555556e-05, "loss": 0.017191458940505982, "mean_token_accuracy": 0.9930350634455681, "num_tokens": 7455527.0, "step": 1300 }, { "entropy": 0.01792264294810593, "epoch": 1.0546875, "grad_norm": 0.1220703125, "learning_rate": 1.0512152777777778e-05, "loss": 0.017068485021591185, "mean_token_accuracy": 0.9930369177460671, "num_tokens": 7739474.0, "step": 1350 }, { "entropy": 0.018599705449305476, "epoch": 1.09375, "grad_norm": 0.08740234375, "learning_rate": 1.0078125000000001e-05, "loss": 0.017695863246917725, "mean_token_accuracy": 0.9929590111970902, "num_tokens": 8026170.0, "step": 1400 }, { "entropy": 0.018208509651012717, "epoch": 1.1328125, "grad_norm": 0.11181640625, "learning_rate": 9.644097222222222e-06, "loss": 0.01744949698448181, "mean_token_accuracy": 0.9930833280086517, "num_tokens": 8314626.0, "step": 1450 }, { "entropy": 0.01731262981891632, "epoch": 1.171875, "grad_norm": 0.07080078125, "learning_rate": 9.210069444444446e-06, "loss": 0.016677556037902833, "mean_token_accuracy": 0.9931734573841094, "num_tokens": 8601406.0, "step": 1500 }, { "entropy": 0.01845017326530069, "epoch": 1.2109375, "grad_norm": 0.087890625, "learning_rate": 8.776041666666667e-06, "loss": 0.017589352130889892, "mean_token_accuracy": 0.9928986424207688, "num_tokens": 8887025.0, "step": 1550 }, { "entropy": 0.016624693870544435, "epoch": 1.25, "grad_norm": 0.138671875, "learning_rate": 8.342013888888889e-06, "loss": 0.015853718519210816, "mean_token_accuracy": 0.9933998480439186, "num_tokens": 9173407.0, "step": 1600 }, { "entropy": 0.018500901735387744, "epoch": 1.2890625, "grad_norm": 0.1142578125, "learning_rate": 7.907986111111112e-06, "loss": 0.017626932859420776, "mean_token_accuracy": 0.9930863061547279, "num_tokens": 9454970.0, "step": 1650 }, { "entropy": 0.01800002105999738, "epoch": 1.328125, "grad_norm": 0.0986328125, "learning_rate": 7.473958333333334e-06, "loss": 0.017116209268569948, "mean_token_accuracy": 0.9930115470290184, "num_tokens": 9739610.0, "step": 1700 }, { "entropy": 0.017908857897855342, "epoch": 1.3671875, "grad_norm": 0.1064453125, "learning_rate": 7.039930555555556e-06, "loss": 0.017099602222442625, "mean_token_accuracy": 0.9931422612071037, "num_tokens": 10028723.0, "step": 1750 }, { "entropy": 0.017319361912086606, "epoch": 1.40625, "grad_norm": 0.14453125, "learning_rate": 6.605902777777779e-06, "loss": 0.016603636741638183, "mean_token_accuracy": 0.9933136883378029, "num_tokens": 10313957.0, "step": 1800 }, { "entropy": 0.017652450683526694, "epoch": 1.4453125, "grad_norm": 0.11572265625, "learning_rate": 6.171875e-06, "loss": 0.01667865037918091, "mean_token_accuracy": 0.9932633358240127, "num_tokens": 10597623.0, "step": 1850 }, { "entropy": 0.01626451033167541, "epoch": 1.484375, "grad_norm": 0.119140625, "learning_rate": 5.737847222222222e-06, "loss": 0.015588784217834472, "mean_token_accuracy": 0.993659851551056, "num_tokens": 10887482.0, "step": 1900 }, { "entropy": 0.016533851716667415, "epoch": 1.5234375, "grad_norm": 0.1630859375, "learning_rate": 5.303819444444445e-06, "loss": 0.01556604504585266, "mean_token_accuracy": 0.9936296039819718, "num_tokens": 11176455.0, "step": 1950 }, { "entropy": 0.017771934717893602, "epoch": 1.5625, "grad_norm": 0.11572265625, "learning_rate": 4.869791666666667e-06, "loss": 0.017040348052978514, "mean_token_accuracy": 0.9932206523418426, "num_tokens": 11462052.0, "step": 2000 } ], "logging_steps": 50, "max_steps": 2560, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5546599214235546e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }