{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.953125, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.351678059399128, "epoch": 0.0390625, "grad_norm": 1.6015625, "learning_rate": 3.828125000000001e-06, "loss": 1.394322509765625, "mean_token_accuracy": 0.7548403647542, "num_tokens": 257685.0, "step": 50 }, { "entropy": 0.47313837975263595, "epoch": 0.078125, "grad_norm": 0.91015625, "learning_rate": 7.734375e-06, "loss": 0.4260553359985352, "mean_token_accuracy": 0.9251225611567497, "num_tokens": 513846.0, "step": 100 }, { "entropy": 0.22530030721798538, "epoch": 0.1171875, "grad_norm": 0.921875, "learning_rate": 1.1640625000000002e-05, "loss": 0.18161891937255858, "mean_token_accuracy": 0.9637172383069992, "num_tokens": 766003.0, "step": 150 }, { "entropy": 0.14051863566040992, "epoch": 0.15625, "grad_norm": 1.6484375, "learning_rate": 1.5546875e-05, "loss": 0.1136919116973877, "mean_token_accuracy": 0.9747392472624778, "num_tokens": 1024512.0, "step": 200 }, { "entropy": 0.09577633743174374, "epoch": 0.1953125, "grad_norm": 0.71484375, "learning_rate": 1.9453125e-05, "loss": 0.0734261655807495, "mean_token_accuracy": 0.9820188581943512, "num_tokens": 1280614.0, "step": 250 }, { "entropy": 0.08148466867394745, "epoch": 0.234375, "grad_norm": 0.3828125, "learning_rate": 1.9626736111111114e-05, "loss": 0.062112469673156735, "mean_token_accuracy": 0.9844269120693206, "num_tokens": 1533690.0, "step": 300 }, { "entropy": 0.06672279690392316, "epoch": 0.2734375, "grad_norm": 0.455078125, "learning_rate": 1.9192708333333335e-05, "loss": 0.05034114837646484, "mean_token_accuracy": 0.986752623617649, "num_tokens": 1783169.0, "step": 350 }, { "entropy": 0.051021190043538805, "epoch": 0.3125, "grad_norm": 0.49609375, "learning_rate": 1.8758680555555557e-05, "loss": 0.03769558668136597, "mean_token_accuracy": 0.9895561364293098, "num_tokens": 2039809.0, "step": 400 }, { "entropy": 0.04628240401856601, "epoch": 0.3515625, "grad_norm": 0.1962890625, "learning_rate": 1.8324652777777778e-05, "loss": 0.034056272506713864, "mean_token_accuracy": 0.9902530950307846, "num_tokens": 2296699.0, "step": 450 }, { "entropy": 0.041101934388279915, "epoch": 0.390625, "grad_norm": 0.498046875, "learning_rate": 1.7890625000000003e-05, "loss": 0.030158956050872803, "mean_token_accuracy": 0.9908681440353394, "num_tokens": 2551050.0, "step": 500 }, { "entropy": 0.046159422053024174, "epoch": 0.4296875, "grad_norm": 0.2080078125, "learning_rate": 1.7456597222222224e-05, "loss": 0.03463820457458496, "mean_token_accuracy": 0.9898904532194137, "num_tokens": 2802698.0, "step": 550 }, { "entropy": 0.03955025893636048, "epoch": 0.46875, "grad_norm": 0.31640625, "learning_rate": 1.7022569444444446e-05, "loss": 0.029909002780914306, "mean_token_accuracy": 0.9910529521107674, "num_tokens": 3060791.0, "step": 600 }, { "entropy": 0.040935935722664, "epoch": 0.5078125, "grad_norm": 0.28125, "learning_rate": 1.6588541666666667e-05, "loss": 0.030300111770629884, "mean_token_accuracy": 0.9909433552622795, "num_tokens": 3313602.0, "step": 650 }, { "entropy": 0.03562260726466775, "epoch": 0.546875, "grad_norm": 0.2119140625, "learning_rate": 1.615451388888889e-05, "loss": 0.025539636611938477, "mean_token_accuracy": 0.9918221846222878, "num_tokens": 3572334.0, "step": 700 }, { "entropy": 0.035833339411765336, "epoch": 0.5859375, "grad_norm": 0.1669921875, "learning_rate": 1.5720486111111114e-05, "loss": 0.026018803119659425, "mean_token_accuracy": 0.9919026476144791, "num_tokens": 3822475.0, "step": 750 }, { "entropy": 0.03438482533209026, "epoch": 0.625, "grad_norm": 0.318359375, "learning_rate": 1.5286458333333335e-05, "loss": 0.025241999626159667, "mean_token_accuracy": 0.9919598492980003, "num_tokens": 4077791.0, "step": 800 }, { "entropy": 0.03701488464139402, "epoch": 0.6640625, "grad_norm": 0.162109375, "learning_rate": 1.4852430555555556e-05, "loss": 0.0267392897605896, "mean_token_accuracy": 0.9916711059212685, "num_tokens": 4327537.0, "step": 850 }, { "entropy": 0.03509442439302802, "epoch": 0.703125, "grad_norm": 0.1943359375, "learning_rate": 1.4418402777777778e-05, "loss": 0.025892121791839598, "mean_token_accuracy": 0.9916690769791603, "num_tokens": 4582391.0, "step": 900 }, { "entropy": 0.032335253246128556, "epoch": 0.7421875, "grad_norm": 0.259765625, "learning_rate": 1.3984375000000001e-05, "loss": 0.023846192359924315, "mean_token_accuracy": 0.9924429550766944, "num_tokens": 4842265.0, "step": 950 }, { "entropy": 0.032289591124281286, "epoch": 0.78125, "grad_norm": 0.248046875, "learning_rate": 1.3550347222222224e-05, "loss": 0.023617899417877196, "mean_token_accuracy": 0.9924442365765571, "num_tokens": 5099714.0, "step": 1000 }, { "entropy": 0.032110756486654284, "epoch": 0.8203125, "grad_norm": 0.171875, "learning_rate": 1.3116319444444446e-05, "loss": 0.023927602767944336, "mean_token_accuracy": 0.992151814699173, "num_tokens": 5353934.0, "step": 1050 }, { "entropy": 0.03357607708312571, "epoch": 0.859375, "grad_norm": 0.220703125, "learning_rate": 1.2682291666666669e-05, "loss": 0.024996912479400633, "mean_token_accuracy": 0.9920313712954522, "num_tokens": 5610229.0, "step": 1100 }, { "entropy": 0.03356592872180045, "epoch": 0.8984375, "grad_norm": 0.203125, "learning_rate": 1.2248263888888889e-05, "loss": 0.025175034999847412, "mean_token_accuracy": 0.9921249234676361, "num_tokens": 5862791.0, "step": 1150 }, { "entropy": 0.031079287379980086, "epoch": 0.9375, "grad_norm": 0.1318359375, "learning_rate": 1.1814236111111112e-05, "loss": 0.022713756561279295, "mean_token_accuracy": 0.9926198759675026, "num_tokens": 6121431.0, "step": 1200 }, { "entropy": 0.02976180042140186, "epoch": 0.9765625, "grad_norm": 0.154296875, "learning_rate": 1.1380208333333333e-05, "loss": 0.02123898983001709, "mean_token_accuracy": 0.992766418159008, "num_tokens": 6379675.0, "step": 1250 }, { "entropy": 0.030388496736995875, "epoch": 1.015625, "grad_norm": 0.1650390625, "learning_rate": 1.0946180555555556e-05, "loss": 0.021283388137817383, "mean_token_accuracy": 0.9927816662192345, "num_tokens": 6635287.0, "step": 1300 }, { "entropy": 0.029865577281452716, "epoch": 1.0546875, "grad_norm": 0.265625, "learning_rate": 1.0512152777777778e-05, "loss": 0.021030676364898682, "mean_token_accuracy": 0.9929129666090012, "num_tokens": 6888440.0, "step": 1350 }, { "entropy": 0.031085506100207567, "epoch": 1.09375, "grad_norm": 0.1748046875, "learning_rate": 1.0078125000000001e-05, "loss": 0.02215445041656494, "mean_token_accuracy": 0.9926813915371895, "num_tokens": 7143446.0, "step": 1400 }, { "entropy": 0.03091464822180569, "epoch": 1.1328125, "grad_norm": 0.2255859375, "learning_rate": 9.644097222222222e-06, "loss": 0.022361652851104738, "mean_token_accuracy": 0.9926716023683548, "num_tokens": 7400487.0, "step": 1450 }, { "entropy": 0.029652795745059846, "epoch": 1.171875, "grad_norm": 0.130859375, "learning_rate": 9.210069444444446e-06, "loss": 0.02084646940231323, "mean_token_accuracy": 0.9928994616866111, "num_tokens": 7655674.0, "step": 1500 }, { "entropy": 0.031666285023093224, "epoch": 1.2109375, "grad_norm": 0.1787109375, "learning_rate": 8.776041666666667e-06, "loss": 0.022852597236633302, "mean_token_accuracy": 0.9925497883558273, "num_tokens": 7908433.0, "step": 1550 }, { "entropy": 0.027730579837225378, "epoch": 1.25, "grad_norm": 0.1689453125, "learning_rate": 8.342013888888889e-06, "loss": 0.019295313358306886, "mean_token_accuracy": 0.9932532203197479, "num_tokens": 8163713.0, "step": 1600 }, { "entropy": 0.030957318153232338, "epoch": 1.2890625, "grad_norm": 0.1533203125, "learning_rate": 7.907986111111112e-06, "loss": 0.022100534439086914, "mean_token_accuracy": 0.9926850625872612, "num_tokens": 8413661.0, "step": 1650 }, { "entropy": 0.03025919214822352, "epoch": 1.328125, "grad_norm": 0.19140625, "learning_rate": 7.473958333333334e-06, "loss": 0.021726396083831787, "mean_token_accuracy": 0.9928304460644722, "num_tokens": 8666242.0, "step": 1700 }, { "entropy": 0.030476445676758886, "epoch": 1.3671875, "grad_norm": 0.17578125, "learning_rate": 7.039930555555556e-06, "loss": 0.0215773606300354, "mean_token_accuracy": 0.9926398959755898, "num_tokens": 8923368.0, "step": 1750 }, { "entropy": 0.02955903219990432, "epoch": 1.40625, "grad_norm": 0.240234375, "learning_rate": 6.605902777777779e-06, "loss": 0.02073089599609375, "mean_token_accuracy": 0.9930617704987525, "num_tokens": 9177821.0, "step": 1800 }, { "entropy": 0.029813821725547314, "epoch": 1.4453125, "grad_norm": 0.1640625, "learning_rate": 6.171875e-06, "loss": 0.020715839862823486, "mean_token_accuracy": 0.9928940132260322, "num_tokens": 9430080.0, "step": 1850 }, { "entropy": 0.027619767771102487, "epoch": 1.484375, "grad_norm": 0.16015625, "learning_rate": 5.737847222222222e-06, "loss": 0.01917331576347351, "mean_token_accuracy": 0.9933555802702904, "num_tokens": 9688541.0, "step": 1900 }, { "entropy": 0.028464037650264798, "epoch": 1.5234375, "grad_norm": 0.173828125, "learning_rate": 5.303819444444445e-06, "loss": 0.01973416805267334, "mean_token_accuracy": 0.9932170230150222, "num_tokens": 9946814.0, "step": 1950 }, { "entropy": 0.03081795680336654, "epoch": 1.5625, "grad_norm": 0.296875, "learning_rate": 4.869791666666667e-06, "loss": 0.021953141689300536, "mean_token_accuracy": 0.9926387491822243, "num_tokens": 10201426.0, "step": 2000 }, { "entropy": 0.030316293751820923, "epoch": 1.6015625, "grad_norm": 0.396484375, "learning_rate": 4.435763888888889e-06, "loss": 0.021486189365386963, "mean_token_accuracy": 0.9927674040198327, "num_tokens": 10454276.0, "step": 2050 }, { "entropy": 0.02887956439051777, "epoch": 1.640625, "grad_norm": 0.1513671875, "learning_rate": 4.001736111111112e-06, "loss": 0.020056800842285158, "mean_token_accuracy": 0.9931298586726188, "num_tokens": 10707497.0, "step": 2100 }, { "entropy": 0.028787780185230077, "epoch": 1.6796875, "grad_norm": 0.1455078125, "learning_rate": 3.5677083333333335e-06, "loss": 0.0200783896446228, "mean_token_accuracy": 0.9929500755667686, "num_tokens": 10962955.0, "step": 2150 }, { "entropy": 0.028631422137841582, "epoch": 1.71875, "grad_norm": 0.1484375, "learning_rate": 3.1336805555555562e-06, "loss": 0.02008913516998291, "mean_token_accuracy": 0.9930686053633689, "num_tokens": 11217333.0, "step": 2200 }, { "entropy": 0.02850784788839519, "epoch": 1.7578125, "grad_norm": 0.1884765625, "learning_rate": 2.699652777777778e-06, "loss": 0.01996502876281738, "mean_token_accuracy": 0.9930599120259285, "num_tokens": 11472495.0, "step": 2250 }, { "entropy": 0.028683945639058947, "epoch": 1.796875, "grad_norm": 0.1728515625, "learning_rate": 2.265625e-06, "loss": 0.019921081066131593, "mean_token_accuracy": 0.9932510870695114, "num_tokens": 11728662.0, "step": 2300 }, { "entropy": 0.02975102465134114, "epoch": 1.8359375, "grad_norm": 0.32421875, "learning_rate": 1.8315972222222223e-06, "loss": 0.020963990688323976, "mean_token_accuracy": 0.9929134699702263, "num_tokens": 11983347.0, "step": 2350 }, { "entropy": 0.02991143790073693, "epoch": 1.875, "grad_norm": 0.169921875, "learning_rate": 1.3975694444444446e-06, "loss": 0.020808370113372804, "mean_token_accuracy": 0.9929909712076187, "num_tokens": 12236526.0, "step": 2400 }, { "entropy": 0.030044674500823022, "epoch": 1.9140625, "grad_norm": 0.169921875, "learning_rate": 9.635416666666667e-07, "loss": 0.021207802295684815, "mean_token_accuracy": 0.9927184066176414, "num_tokens": 12489562.0, "step": 2450 }, { "entropy": 0.02905242417007685, "epoch": 1.953125, "grad_norm": 0.1552734375, "learning_rate": 5.295138888888889e-07, "loss": 0.020578203201293947, "mean_token_accuracy": 0.9929935920238495, "num_tokens": 12748444.0, "step": 2500 } ], "logging_steps": 50, "max_steps": 2560, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.4192422651355955e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }