{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "grad_norm": 1.0946012735366821, "learning_rate": 3.6e-05, "loss": 1.1667, "step": 10 }, { "grad_norm": 0.2629280090332031, "learning_rate": 7.6e-05, "loss": 1.1633, "step": 20 }, { "grad_norm": 0.41352760791778564, "learning_rate": 9.998250366089848e-05, "loss": 1.1506, "step": 30 }, { "grad_norm": 0.43201273679733276, "learning_rate": 9.97858104436822e-05, "loss": 1.1244, "step": 40 }, { "grad_norm": 0.3578847050666809, "learning_rate": 9.937141654477528e-05, "loss": 1.1064, "step": 50 }, { "grad_norm": 0.37446966767311096, "learning_rate": 9.87411340032603e-05, "loss": 1.1005, "step": 60 }, { "grad_norm": 0.7768126726150513, "learning_rate": 9.789771888432375e-05, "loss": 1.0277, "step": 70 }, { "grad_norm": 0.6836130023002625, "learning_rate": 9.684485922768422e-05, "loss": 0.9788, "step": 80 }, { "grad_norm": 0.8855573534965515, "learning_rate": 9.558715892073323e-05, "loss": 0.8777, "step": 90 }, { "grad_norm": 1.29987633228302, "learning_rate": 9.413011756690685e-05, "loss": 0.778, "step": 100 }, { "grad_norm": 1.0969977378845215, "learning_rate": 9.248010643731935e-05, "loss": 0.6233, "step": 110 }, { "grad_norm": 1.4301836490631104, "learning_rate": 9.064434061081562e-05, "loss": 0.4718, "step": 120 }, { "grad_norm": 1.3080878257751465, "learning_rate": 8.863084742426719e-05, "loss": 0.3304, "step": 130 }, { "grad_norm": 0.9498032331466675, "learning_rate": 8.644843137107059e-05, "loss": 0.2441, "step": 140 }, { "grad_norm": 0.7782955169677734, "learning_rate": 8.410663560133784e-05, "loss": 0.2046, "step": 150 }, { "grad_norm": 0.7438144087791443, "learning_rate": 8.161570019212921e-05, "loss": 0.195, "step": 160 }, { "grad_norm": 0.7565449476242065, "learning_rate": 7.898651737020166e-05, "loss": 0.1667, "step": 170 }, { "grad_norm": 0.7198880314826965, "learning_rate": 7.623058388307269e-05, "loss": 0.1607, "step": 180 }, { "grad_norm": 0.5660191178321838, "learning_rate": 7.335995072666848e-05, "loss": 0.1407, "step": 190 }, { "grad_norm": 0.8990160822868347, "learning_rate": 7.038717044938519e-05, "loss": 0.1456, "step": 200 }, { "grad_norm": 0.8371200561523438, "learning_rate": 6.732524226298841e-05, "loss": 0.1305, "step": 210 }, { "grad_norm": 0.5769897103309631, "learning_rate": 6.418755520036775e-05, "loss": 0.1308, "step": 220 }, { "grad_norm": 0.6508622169494629, "learning_rate": 6.0987829568702656e-05, "loss": 0.1252, "step": 230 }, { "grad_norm": 0.6774729490280151, "learning_rate": 5.7740056954050084e-05, "loss": 0.1114, "step": 240 }, { "grad_norm": 0.5324140191078186, "learning_rate": 5.445843903969854e-05, "loss": 0.1041, "step": 250 }, { "grad_norm": 0.6443776488304138, "learning_rate": 5.1157325505820694e-05, "loss": 0.0992, "step": 260 }, { "grad_norm": 0.6866108179092407, "learning_rate": 4.785115128197298e-05, "loss": 0.1129, "step": 270 }, { "grad_norm": 0.6538270711898804, "learning_rate": 4.4554373426821374e-05, "loss": 0.0989, "step": 280 }, { "grad_norm": 0.4868132770061493, "learning_rate": 4.1281407911102425e-05, "loss": 0.1162, "step": 290 }, { "grad_norm": 0.7402431964874268, "learning_rate": 3.8046566580251e-05, "loss": 0.0921, "step": 300 }, { "grad_norm": 0.6450913548469543, "learning_rate": 3.4863994572341843e-05, "loss": 0.1054, "step": 310 }, { "grad_norm": 0.7487106919288635, "learning_rate": 3.1747608464999725e-05, "loss": 0.0853, "step": 320 }, { "grad_norm": 0.5797029733657837, "learning_rate": 2.8711035421746367e-05, "loss": 0.0849, "step": 330 }, { "grad_norm": 0.5133667588233948, "learning_rate": 2.5767553603881767e-05, "loss": 0.0791, "step": 340 }, { "grad_norm": 0.4757593870162964, "learning_rate": 2.29300341084631e-05, "loss": 0.0738, "step": 350 }, { "grad_norm": 0.5363937616348267, "learning_rate": 2.0210884686272368e-05, "loss": 0.0784, "step": 360 }, { "grad_norm": 0.4361656606197357, "learning_rate": 1.7621995485879062e-05, "loss": 0.0785, "step": 370 }, { "grad_norm": 0.683097243309021, "learning_rate": 1.517468706104589e-05, "loss": 0.0754, "step": 380 }, { "grad_norm": 0.46737223863601685, "learning_rate": 1.2879660868827508e-05, "loss": 0.0737, "step": 390 }, { "grad_norm": 0.37859612703323364, "learning_rate": 1.0746952474821614e-05, "loss": 0.0678, "step": 400 }, { "grad_norm": 0.45972710847854614, "learning_rate": 8.785887670194138e-06, "loss": 0.0761, "step": 410 }, { "grad_norm": 0.4084762632846832, "learning_rate": 7.005041692367154e-06, "loss": 0.0669, "step": 420 }, { "grad_norm": 0.5804421901702881, "learning_rate": 5.412201727687644e-06, "loss": 0.0776, "step": 430 }, { "grad_norm": 0.517287015914917, "learning_rate": 4.01433286004283e-06, "loss": 0.0671, "step": 440 }, { "grad_norm": 0.42058899998664856, "learning_rate": 2.817547614320615e-06, "loss": 0.0668, "step": 450 }, { "grad_norm": 0.4694521725177765, "learning_rate": 1.8270792278934302e-06, "loss": 0.0713, "step": 460 }, { "grad_norm": 0.3545616567134857, "learning_rate": 1.0472587670027678e-06, "loss": 0.0773, "step": 470 }, { "grad_norm": 0.3037451505661011, "learning_rate": 4.814961881085045e-07, "loss": 0.0681, "step": 480 }, { "grad_norm": 0.5359607934951782, "learning_rate": 1.3226542701689215e-07, "loss": 0.0712, "step": 490 }, { "grad_norm": 0.31134089827537537, "learning_rate": 1.0935809887702154e-09, "loss": 0.0664, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }