{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.8261366232579392, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03655471784327165, "grad_norm": 41.77480697631836, "learning_rate": 0.00027, "loss": 61.162, "step": 10 }, { "epoch": 0.0731094356865433, "grad_norm": 21.519004821777344, "learning_rate": 0.0002999388289242284, "loss": 29.6052, "step": 20 }, { "epoch": 0.10966415352981494, "grad_norm": 21.299711227416992, "learning_rate": 0.00029972743740666765, "loss": 26.4378, "step": 30 }, { "epoch": 0.1462188713730866, "grad_norm": 18.043352127075195, "learning_rate": 0.0002993652830514899, "loss": 26.6099, "step": 40 }, { "epoch": 0.18277358921635822, "grad_norm": 14.862595558166504, "learning_rate": 0.00029885273051743214, "loss": 27.796, "step": 50 }, { "epoch": 0.21932830705962988, "grad_norm": 18.81863784790039, "learning_rate": 0.000298190295901449, "loss": 26.9974, "step": 60 }, { "epoch": 0.25588302490290155, "grad_norm": 22.05609130859375, "learning_rate": 0.0002973786462190466, "loss": 24.4708, "step": 70 }, { "epoch": 0.2924377427461732, "grad_norm": 21.369543075561523, "learning_rate": 0.0002964185987326545, "loss": 25.8129, "step": 80 }, { "epoch": 0.3289924605894448, "grad_norm": 21.98700523376465, "learning_rate": 0.00029531112012871175, "loss": 25.3774, "step": 90 }, { "epoch": 0.36554717843271645, "grad_norm": 21.832794189453125, "learning_rate": 0.00029405732554429564, "loss": 27.0519, "step": 100 }, { "epoch": 0.40210189627598814, "grad_norm": 15.660602569580078, "learning_rate": 0.00029265847744427303, "loss": 26.0219, "step": 110 }, { "epoch": 0.43865661411925977, "grad_norm": 21.285099029541016, "learning_rate": 0.0002911159843501053, "loss": 25.9179, "step": 120 }, { "epoch": 0.4752113319625314, "grad_norm": 18.867576599121094, "learning_rate": 0.00028943139942158683, "loss": 24.2593, "step": 130 }, { "epoch": 0.5117660498058031, "grad_norm": 22.40102195739746, "learning_rate": 0.00028760641889294446, "loss": 23.7113, "step": 140 }, { "epoch": 0.5483207676490747, "grad_norm": 24.922903060913086, "learning_rate": 0.00028564288036487357, "loss": 25.1095, "step": 150 }, { "epoch": 0.5848754854923464, "grad_norm": 21.92047882080078, "learning_rate": 0.0002835427609542298, "loss": 24.1838, "step": 160 }, { "epoch": 0.621430203335618, "grad_norm": 19.682661056518555, "learning_rate": 0.0002813081753032403, "loss": 25.6995, "step": 170 }, { "epoch": 0.6579849211788896, "grad_norm": 18.963228225708008, "learning_rate": 0.00027894137345023785, "loss": 24.6019, "step": 180 }, { "epoch": 0.6945396390221613, "grad_norm": 21.453981399536133, "learning_rate": 0.0002764447385640632, "loss": 25.8001, "step": 190 }, { "epoch": 0.7310943568654329, "grad_norm": 21.415788650512695, "learning_rate": 0.00027382078454441606, "loss": 25.5969, "step": 200 }, { "epoch": 0.7676490747087046, "grad_norm": 14.518242835998535, "learning_rate": 0.0002710721534905712, "loss": 23.8269, "step": 210 }, { "epoch": 0.8042037925519763, "grad_norm": 16.467975616455078, "learning_rate": 0.00026820161304100823, "loss": 24.8, "step": 220 }, { "epoch": 0.8407585103952478, "grad_norm": 20.016340255737305, "learning_rate": 0.00026521205358663477, "loss": 24.9825, "step": 230 }, { "epoch": 0.8773132282385195, "grad_norm": 20.681249618530273, "learning_rate": 0.0002621064853604071, "loss": 23.6612, "step": 240 }, { "epoch": 0.9138679460817912, "grad_norm": 19.81218910217285, "learning_rate": 0.0002588880354062814, "loss": 23.9873, "step": 250 }, { "epoch": 0.9504226639250628, "grad_norm": 23.763002395629883, "learning_rate": 0.00025555994443054504, "loss": 24.4914, "step": 260 }, { "epoch": 0.9869773817683345, "grad_norm": 19.300912857055664, "learning_rate": 0.0002521255635387005, "loss": 24.607, "step": 270 }, { "epoch": 1.021932830705963, "grad_norm": 19.00154685974121, "learning_rate": 0.0002485883508611858, "loss": 19.2292, "step": 280 }, { "epoch": 1.0584875485492347, "grad_norm": 16.176002502441406, "learning_rate": 0.00024495186807133056, "loss": 17.2361, "step": 290 }, { "epoch": 1.0950422663925063, "grad_norm": 19.164424896240234, "learning_rate": 0.00024121977679905266, "loss": 18.1384, "step": 300 }, { "epoch": 1.1315969842357778, "grad_norm": 17.439308166503906, "learning_rate": 0.00023739583494390752, "loss": 17.726, "step": 310 }, { "epoch": 1.1681517020790495, "grad_norm": 23.18360710144043, "learning_rate": 0.00023348389289120158, "loss": 18.0533, "step": 320 }, { "epoch": 1.2047064199223212, "grad_norm": 23.22918701171875, "learning_rate": 0.0002294878896349807, "loss": 18.3334, "step": 330 }, { "epoch": 1.2412611377655929, "grad_norm": 20.49732780456543, "learning_rate": 0.00022541184881179737, "loss": 17.3156, "step": 340 }, { "epoch": 1.2778158556088646, "grad_norm": 23.029027938842773, "learning_rate": 0.00022125987464924926, "loss": 18.2255, "step": 350 }, { "epoch": 1.3143705734521363, "grad_norm": 19.006935119628906, "learning_rate": 0.0002170361478333702, "loss": 18.849, "step": 360 }, { "epoch": 1.350925291295408, "grad_norm": 19.27879524230957, "learning_rate": 0.0002127449212990339, "loss": 18.9721, "step": 370 }, { "epoch": 1.3874800091386794, "grad_norm": 19.815698623657227, "learning_rate": 0.00020839051594760872, "loss": 17.9975, "step": 380 }, { "epoch": 1.424034726981951, "grad_norm": 22.413816452026367, "learning_rate": 0.00020397731629617636, "loss": 17.729, "step": 390 }, { "epoch": 1.4605894448252228, "grad_norm": 19.09907341003418, "learning_rate": 0.00019950976606269497, "loss": 18.0831, "step": 400 }, { "epoch": 1.4971441626684945, "grad_norm": 17.786975860595703, "learning_rate": 0.00019499236369155157, "loss": 17.9862, "step": 410 }, { "epoch": 1.533698880511766, "grad_norm": 22.56906509399414, "learning_rate": 0.00019042965782401018, "loss": 17.6971, "step": 420 }, { "epoch": 1.5702535983550376, "grad_norm": 24.24004364013672, "learning_rate": 0.00018582624271811532, "loss": 18.3465, "step": 430 }, { "epoch": 1.6068083161983093, "grad_norm": 25.849702835083008, "learning_rate": 0.00018118675362266385, "loss": 18.3028, "step": 440 }, { "epoch": 1.643363034041581, "grad_norm": 26.149951934814453, "learning_rate": 0.00017651586210990232, "loss": 16.919, "step": 450 }, { "epoch": 1.6799177518848527, "grad_norm": 21.565109252929688, "learning_rate": 0.00017181827137164953, "loss": 18.9473, "step": 460 }, { "epoch": 1.7164724697281244, "grad_norm": 19.176294326782227, "learning_rate": 0.00016709871148358108, "loss": 16.9236, "step": 470 }, { "epoch": 1.753027187571396, "grad_norm": 19.63998031616211, "learning_rate": 0.00016236193464244444, "loss": 17.8228, "step": 480 }, { "epoch": 1.7895819054146676, "grad_norm": 23.75813865661621, "learning_rate": 0.00015761271038099912, "loss": 17.3249, "step": 490 }, { "epoch": 1.8261366232579392, "grad_norm": 23.72920036315918, "learning_rate": 0.00015285582076550198, "loss": 19.046, "step": 500 } ], "logging_steps": 10, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.47221899456696e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }