tata-field-432hz / last-checkpoint /trainer_state.json
misterJB's picture
Training in progress, step 2500, checkpoint
d902074 verified
Raw
History Blame
15.2 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.953125,
"eval_steps": 500,
"global_step": 2500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.351678059399128,
"epoch": 0.0390625,
"grad_norm": 1.6015625,
"learning_rate": 3.828125000000001e-06,
"loss": 1.394322509765625,
"mean_token_accuracy": 0.7548403647542,
"num_tokens": 257685.0,
"step": 50
},
{
"entropy": 0.47313837975263595,
"epoch": 0.078125,
"grad_norm": 0.91015625,
"learning_rate": 7.734375e-06,
"loss": 0.4260553359985352,
"mean_token_accuracy": 0.9251225611567497,
"num_tokens": 513846.0,
"step": 100
},
{
"entropy": 0.22530030721798538,
"epoch": 0.1171875,
"grad_norm": 0.921875,
"learning_rate": 1.1640625000000002e-05,
"loss": 0.18161891937255858,
"mean_token_accuracy": 0.9637172383069992,
"num_tokens": 766003.0,
"step": 150
},
{
"entropy": 0.14051863566040992,
"epoch": 0.15625,
"grad_norm": 1.6484375,
"learning_rate": 1.5546875e-05,
"loss": 0.1136919116973877,
"mean_token_accuracy": 0.9747392472624778,
"num_tokens": 1024512.0,
"step": 200
},
{
"entropy": 0.09577633743174374,
"epoch": 0.1953125,
"grad_norm": 0.71484375,
"learning_rate": 1.9453125e-05,
"loss": 0.0734261655807495,
"mean_token_accuracy": 0.9820188581943512,
"num_tokens": 1280614.0,
"step": 250
},
{
"entropy": 0.08148466867394745,
"epoch": 0.234375,
"grad_norm": 0.3828125,
"learning_rate": 1.9626736111111114e-05,
"loss": 0.062112469673156735,
"mean_token_accuracy": 0.9844269120693206,
"num_tokens": 1533690.0,
"step": 300
},
{
"entropy": 0.06672279690392316,
"epoch": 0.2734375,
"grad_norm": 0.455078125,
"learning_rate": 1.9192708333333335e-05,
"loss": 0.05034114837646484,
"mean_token_accuracy": 0.986752623617649,
"num_tokens": 1783169.0,
"step": 350
},
{
"entropy": 0.051021190043538805,
"epoch": 0.3125,
"grad_norm": 0.49609375,
"learning_rate": 1.8758680555555557e-05,
"loss": 0.03769558668136597,
"mean_token_accuracy": 0.9895561364293098,
"num_tokens": 2039809.0,
"step": 400
},
{
"entropy": 0.04628240401856601,
"epoch": 0.3515625,
"grad_norm": 0.1962890625,
"learning_rate": 1.8324652777777778e-05,
"loss": 0.034056272506713864,
"mean_token_accuracy": 0.9902530950307846,
"num_tokens": 2296699.0,
"step": 450
},
{
"entropy": 0.041101934388279915,
"epoch": 0.390625,
"grad_norm": 0.498046875,
"learning_rate": 1.7890625000000003e-05,
"loss": 0.030158956050872803,
"mean_token_accuracy": 0.9908681440353394,
"num_tokens": 2551050.0,
"step": 500
},
{
"entropy": 0.046159422053024174,
"epoch": 0.4296875,
"grad_norm": 0.2080078125,
"learning_rate": 1.7456597222222224e-05,
"loss": 0.03463820457458496,
"mean_token_accuracy": 0.9898904532194137,
"num_tokens": 2802698.0,
"step": 550
},
{
"entropy": 0.03955025893636048,
"epoch": 0.46875,
"grad_norm": 0.31640625,
"learning_rate": 1.7022569444444446e-05,
"loss": 0.029909002780914306,
"mean_token_accuracy": 0.9910529521107674,
"num_tokens": 3060791.0,
"step": 600
},
{
"entropy": 0.040935935722664,
"epoch": 0.5078125,
"grad_norm": 0.28125,
"learning_rate": 1.6588541666666667e-05,
"loss": 0.030300111770629884,
"mean_token_accuracy": 0.9909433552622795,
"num_tokens": 3313602.0,
"step": 650
},
{
"entropy": 0.03562260726466775,
"epoch": 0.546875,
"grad_norm": 0.2119140625,
"learning_rate": 1.615451388888889e-05,
"loss": 0.025539636611938477,
"mean_token_accuracy": 0.9918221846222878,
"num_tokens": 3572334.0,
"step": 700
},
{
"entropy": 0.035833339411765336,
"epoch": 0.5859375,
"grad_norm": 0.1669921875,
"learning_rate": 1.5720486111111114e-05,
"loss": 0.026018803119659425,
"mean_token_accuracy": 0.9919026476144791,
"num_tokens": 3822475.0,
"step": 750
},
{
"entropy": 0.03438482533209026,
"epoch": 0.625,
"grad_norm": 0.318359375,
"learning_rate": 1.5286458333333335e-05,
"loss": 0.025241999626159667,
"mean_token_accuracy": 0.9919598492980003,
"num_tokens": 4077791.0,
"step": 800
},
{
"entropy": 0.03701488464139402,
"epoch": 0.6640625,
"grad_norm": 0.162109375,
"learning_rate": 1.4852430555555556e-05,
"loss": 0.0267392897605896,
"mean_token_accuracy": 0.9916711059212685,
"num_tokens": 4327537.0,
"step": 850
},
{
"entropy": 0.03509442439302802,
"epoch": 0.703125,
"grad_norm": 0.1943359375,
"learning_rate": 1.4418402777777778e-05,
"loss": 0.025892121791839598,
"mean_token_accuracy": 0.9916690769791603,
"num_tokens": 4582391.0,
"step": 900
},
{
"entropy": 0.032335253246128556,
"epoch": 0.7421875,
"grad_norm": 0.259765625,
"learning_rate": 1.3984375000000001e-05,
"loss": 0.023846192359924315,
"mean_token_accuracy": 0.9924429550766944,
"num_tokens": 4842265.0,
"step": 950
},
{
"entropy": 0.032289591124281286,
"epoch": 0.78125,
"grad_norm": 0.248046875,
"learning_rate": 1.3550347222222224e-05,
"loss": 0.023617899417877196,
"mean_token_accuracy": 0.9924442365765571,
"num_tokens": 5099714.0,
"step": 1000
},
{
"entropy": 0.032110756486654284,
"epoch": 0.8203125,
"grad_norm": 0.171875,
"learning_rate": 1.3116319444444446e-05,
"loss": 0.023927602767944336,
"mean_token_accuracy": 0.992151814699173,
"num_tokens": 5353934.0,
"step": 1050
},
{
"entropy": 0.03357607708312571,
"epoch": 0.859375,
"grad_norm": 0.220703125,
"learning_rate": 1.2682291666666669e-05,
"loss": 0.024996912479400633,
"mean_token_accuracy": 0.9920313712954522,
"num_tokens": 5610229.0,
"step": 1100
},
{
"entropy": 0.03356592872180045,
"epoch": 0.8984375,
"grad_norm": 0.203125,
"learning_rate": 1.2248263888888889e-05,
"loss": 0.025175034999847412,
"mean_token_accuracy": 0.9921249234676361,
"num_tokens": 5862791.0,
"step": 1150
},
{
"entropy": 0.031079287379980086,
"epoch": 0.9375,
"grad_norm": 0.1318359375,
"learning_rate": 1.1814236111111112e-05,
"loss": 0.022713756561279295,
"mean_token_accuracy": 0.9926198759675026,
"num_tokens": 6121431.0,
"step": 1200
},
{
"entropy": 0.02976180042140186,
"epoch": 0.9765625,
"grad_norm": 0.154296875,
"learning_rate": 1.1380208333333333e-05,
"loss": 0.02123898983001709,
"mean_token_accuracy": 0.992766418159008,
"num_tokens": 6379675.0,
"step": 1250
},
{
"entropy": 0.030388496736995875,
"epoch": 1.015625,
"grad_norm": 0.1650390625,
"learning_rate": 1.0946180555555556e-05,
"loss": 0.021283388137817383,
"mean_token_accuracy": 0.9927816662192345,
"num_tokens": 6635287.0,
"step": 1300
},
{
"entropy": 0.029865577281452716,
"epoch": 1.0546875,
"grad_norm": 0.265625,
"learning_rate": 1.0512152777777778e-05,
"loss": 0.021030676364898682,
"mean_token_accuracy": 0.9929129666090012,
"num_tokens": 6888440.0,
"step": 1350
},
{
"entropy": 0.031085506100207567,
"epoch": 1.09375,
"grad_norm": 0.1748046875,
"learning_rate": 1.0078125000000001e-05,
"loss": 0.02215445041656494,
"mean_token_accuracy": 0.9926813915371895,
"num_tokens": 7143446.0,
"step": 1400
},
{
"entropy": 0.03091464822180569,
"epoch": 1.1328125,
"grad_norm": 0.2255859375,
"learning_rate": 9.644097222222222e-06,
"loss": 0.022361652851104738,
"mean_token_accuracy": 0.9926716023683548,
"num_tokens": 7400487.0,
"step": 1450
},
{
"entropy": 0.029652795745059846,
"epoch": 1.171875,
"grad_norm": 0.130859375,
"learning_rate": 9.210069444444446e-06,
"loss": 0.02084646940231323,
"mean_token_accuracy": 0.9928994616866111,
"num_tokens": 7655674.0,
"step": 1500
},
{
"entropy": 0.031666285023093224,
"epoch": 1.2109375,
"grad_norm": 0.1787109375,
"learning_rate": 8.776041666666667e-06,
"loss": 0.022852597236633302,
"mean_token_accuracy": 0.9925497883558273,
"num_tokens": 7908433.0,
"step": 1550
},
{
"entropy": 0.027730579837225378,
"epoch": 1.25,
"grad_norm": 0.1689453125,
"learning_rate": 8.342013888888889e-06,
"loss": 0.019295313358306886,
"mean_token_accuracy": 0.9932532203197479,
"num_tokens": 8163713.0,
"step": 1600
},
{
"entropy": 0.030957318153232338,
"epoch": 1.2890625,
"grad_norm": 0.1533203125,
"learning_rate": 7.907986111111112e-06,
"loss": 0.022100534439086914,
"mean_token_accuracy": 0.9926850625872612,
"num_tokens": 8413661.0,
"step": 1650
},
{
"entropy": 0.03025919214822352,
"epoch": 1.328125,
"grad_norm": 0.19140625,
"learning_rate": 7.473958333333334e-06,
"loss": 0.021726396083831787,
"mean_token_accuracy": 0.9928304460644722,
"num_tokens": 8666242.0,
"step": 1700
},
{
"entropy": 0.030476445676758886,
"epoch": 1.3671875,
"grad_norm": 0.17578125,
"learning_rate": 7.039930555555556e-06,
"loss": 0.0215773606300354,
"mean_token_accuracy": 0.9926398959755898,
"num_tokens": 8923368.0,
"step": 1750
},
{
"entropy": 0.02955903219990432,
"epoch": 1.40625,
"grad_norm": 0.240234375,
"learning_rate": 6.605902777777779e-06,
"loss": 0.02073089599609375,
"mean_token_accuracy": 0.9930617704987525,
"num_tokens": 9177821.0,
"step": 1800
},
{
"entropy": 0.029813821725547314,
"epoch": 1.4453125,
"grad_norm": 0.1640625,
"learning_rate": 6.171875e-06,
"loss": 0.020715839862823486,
"mean_token_accuracy": 0.9928940132260322,
"num_tokens": 9430080.0,
"step": 1850
},
{
"entropy": 0.027619767771102487,
"epoch": 1.484375,
"grad_norm": 0.16015625,
"learning_rate": 5.737847222222222e-06,
"loss": 0.01917331576347351,
"mean_token_accuracy": 0.9933555802702904,
"num_tokens": 9688541.0,
"step": 1900
},
{
"entropy": 0.028464037650264798,
"epoch": 1.5234375,
"grad_norm": 0.173828125,
"learning_rate": 5.303819444444445e-06,
"loss": 0.01973416805267334,
"mean_token_accuracy": 0.9932170230150222,
"num_tokens": 9946814.0,
"step": 1950
},
{
"entropy": 0.03081795680336654,
"epoch": 1.5625,
"grad_norm": 0.296875,
"learning_rate": 4.869791666666667e-06,
"loss": 0.021953141689300536,
"mean_token_accuracy": 0.9926387491822243,
"num_tokens": 10201426.0,
"step": 2000
},
{
"entropy": 0.030316293751820923,
"epoch": 1.6015625,
"grad_norm": 0.396484375,
"learning_rate": 4.435763888888889e-06,
"loss": 0.021486189365386963,
"mean_token_accuracy": 0.9927674040198327,
"num_tokens": 10454276.0,
"step": 2050
},
{
"entropy": 0.02887956439051777,
"epoch": 1.640625,
"grad_norm": 0.1513671875,
"learning_rate": 4.001736111111112e-06,
"loss": 0.020056800842285158,
"mean_token_accuracy": 0.9931298586726188,
"num_tokens": 10707497.0,
"step": 2100
},
{
"entropy": 0.028787780185230077,
"epoch": 1.6796875,
"grad_norm": 0.1455078125,
"learning_rate": 3.5677083333333335e-06,
"loss": 0.0200783896446228,
"mean_token_accuracy": 0.9929500755667686,
"num_tokens": 10962955.0,
"step": 2150
},
{
"entropy": 0.028631422137841582,
"epoch": 1.71875,
"grad_norm": 0.1484375,
"learning_rate": 3.1336805555555562e-06,
"loss": 0.02008913516998291,
"mean_token_accuracy": 0.9930686053633689,
"num_tokens": 11217333.0,
"step": 2200
},
{
"entropy": 0.02850784788839519,
"epoch": 1.7578125,
"grad_norm": 0.1884765625,
"learning_rate": 2.699652777777778e-06,
"loss": 0.01996502876281738,
"mean_token_accuracy": 0.9930599120259285,
"num_tokens": 11472495.0,
"step": 2250
},
{
"entropy": 0.028683945639058947,
"epoch": 1.796875,
"grad_norm": 0.1728515625,
"learning_rate": 2.265625e-06,
"loss": 0.019921081066131593,
"mean_token_accuracy": 0.9932510870695114,
"num_tokens": 11728662.0,
"step": 2300
},
{
"entropy": 0.02975102465134114,
"epoch": 1.8359375,
"grad_norm": 0.32421875,
"learning_rate": 1.8315972222222223e-06,
"loss": 0.020963990688323976,
"mean_token_accuracy": 0.9929134699702263,
"num_tokens": 11983347.0,
"step": 2350
},
{
"entropy": 0.02991143790073693,
"epoch": 1.875,
"grad_norm": 0.169921875,
"learning_rate": 1.3975694444444446e-06,
"loss": 0.020808370113372804,
"mean_token_accuracy": 0.9929909712076187,
"num_tokens": 12236526.0,
"step": 2400
},
{
"entropy": 0.030044674500823022,
"epoch": 1.9140625,
"grad_norm": 0.169921875,
"learning_rate": 9.635416666666667e-07,
"loss": 0.021207802295684815,
"mean_token_accuracy": 0.9927184066176414,
"num_tokens": 12489562.0,
"step": 2450
},
{
"entropy": 0.02905242417007685,
"epoch": 1.953125,
"grad_norm": 0.1552734375,
"learning_rate": 5.295138888888889e-07,
"loss": 0.020578203201293947,
"mean_token_accuracy": 0.9929935920238495,
"num_tokens": 12748444.0,
"step": 2500
}
],
"logging_steps": 50,
"max_steps": 2560,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.4192422651355955e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}