{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0309544282029235, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008598452278589854, "grad_norm": 0.5736560821533203, "learning_rate": 5.517241379310345e-08, "loss": 1.4983, "step": 5 }, { "epoch": 0.017196904557179708, "grad_norm": 0.47119423747062683, "learning_rate": 1.2413793103448275e-07, "loss": 1.2594, "step": 10 }, { "epoch": 0.025795356835769563, "grad_norm": 0.6366227865219116, "learning_rate": 1.9310344827586205e-07, "loss": 1.5001, "step": 15 }, { "epoch": 0.034393809114359415, "grad_norm": 0.5710499882698059, "learning_rate": 2.620689655172414e-07, "loss": 1.5602, "step": 20 }, { "epoch": 0.04299226139294927, "grad_norm": 0.4520650804042816, "learning_rate": 3.310344827586207e-07, "loss": 1.4316, "step": 25 }, { "epoch": 0.051590713671539126, "grad_norm": 0.5874906778335571, "learning_rate": 4e-07, "loss": 1.5158, "step": 30 }, { "epoch": 0.06018916595012898, "grad_norm": 0.4618767201900482, "learning_rate": 4.689655172413793e-07, "loss": 1.3945, "step": 35 }, { "epoch": 0.06878761822871883, "grad_norm": 0.47305673360824585, "learning_rate": 5.379310344827586e-07, "loss": 1.4154, "step": 40 }, { "epoch": 0.07738607050730868, "grad_norm": 0.5665937066078186, "learning_rate": 6.068965517241379e-07, "loss": 1.3776, "step": 45 }, { "epoch": 0.08598452278589853, "grad_norm": 0.5359976291656494, "learning_rate": 6.758620689655172e-07, "loss": 1.5018, "step": 50 }, { "epoch": 0.09458297506448839, "grad_norm": 0.5621670484542847, "learning_rate": 7.448275862068965e-07, "loss": 1.4493, "step": 55 }, { "epoch": 0.10318142734307825, "grad_norm": 0.5842466354370117, "learning_rate": 8.137931034482758e-07, "loss": 1.4241, "step": 60 }, { "epoch": 0.1117798796216681, "grad_norm": 0.6147713661193848, "learning_rate": 8.827586206896551e-07, "loss": 1.4748, "step": 65 }, { "epoch": 0.12037833190025796, "grad_norm": 0.6653420925140381, "learning_rate": 9.517241379310345e-07, "loss": 1.6576, "step": 70 }, { "epoch": 0.1289767841788478, "grad_norm": 0.5247882604598999, "learning_rate": 1.0206896551724139e-06, "loss": 1.439, "step": 75 }, { "epoch": 0.13757523645743766, "grad_norm": 0.6043932437896729, "learning_rate": 1.089655172413793e-06, "loss": 1.475, "step": 80 }, { "epoch": 0.1461736887360275, "grad_norm": 0.614253580570221, "learning_rate": 1.1586206896551724e-06, "loss": 1.4928, "step": 85 }, { "epoch": 0.15477214101461736, "grad_norm": 0.6523554921150208, "learning_rate": 1.2275862068965516e-06, "loss": 1.5111, "step": 90 }, { "epoch": 0.16337059329320722, "grad_norm": 0.4596118628978729, "learning_rate": 1.2965517241379309e-06, "loss": 1.4672, "step": 95 }, { "epoch": 0.17196904557179707, "grad_norm": 0.5627196431159973, "learning_rate": 1.3655172413793103e-06, "loss": 1.4621, "step": 100 }, { "epoch": 0.18056749785038692, "grad_norm": 0.5366098284721375, "learning_rate": 1.4344827586206896e-06, "loss": 1.3875, "step": 105 }, { "epoch": 0.18916595012897677, "grad_norm": 0.48968908190727234, "learning_rate": 1.5034482758620688e-06, "loss": 1.4264, "step": 110 }, { "epoch": 0.19776440240756663, "grad_norm": 0.5644566416740417, "learning_rate": 1.5724137931034483e-06, "loss": 1.3822, "step": 115 }, { "epoch": 0.2063628546861565, "grad_norm": 0.49979278445243835, "learning_rate": 1.6413793103448275e-06, "loss": 1.4866, "step": 120 }, { "epoch": 0.21496130696474636, "grad_norm": 0.5352433919906616, "learning_rate": 1.710344827586207e-06, "loss": 1.3047, "step": 125 }, { "epoch": 0.2235597592433362, "grad_norm": 0.5073248744010925, "learning_rate": 1.7793103448275862e-06, "loss": 1.4763, "step": 130 }, { "epoch": 0.23215821152192606, "grad_norm": 0.48227864503860474, "learning_rate": 1.8482758620689653e-06, "loss": 1.4552, "step": 135 }, { "epoch": 0.2407566638005159, "grad_norm": 0.4420427978038788, "learning_rate": 1.9172413793103447e-06, "loss": 1.3285, "step": 140 }, { "epoch": 0.24935511607910577, "grad_norm": 0.4224660396575928, "learning_rate": 1.986206896551724e-06, "loss": 1.4294, "step": 145 }, { "epoch": 0.2579535683576956, "grad_norm": 0.3935889005661011, "learning_rate": 1.999989634963924e-06, "loss": 1.32, "step": 150 }, { "epoch": 0.26655202063628547, "grad_norm": 0.352851539850235, "learning_rate": 1.9999475273731217e-06, "loss": 1.326, "step": 155 }, { "epoch": 0.2751504729148753, "grad_norm": 0.4401000142097473, "learning_rate": 1.9998730307756826e-06, "loss": 1.352, "step": 160 }, { "epoch": 0.2837489251934652, "grad_norm": 0.3836155831813812, "learning_rate": 1.9997661475846052e-06, "loss": 1.3492, "step": 165 }, { "epoch": 0.292347377472055, "grad_norm": 0.32301604747772217, "learning_rate": 1.9996268812619105e-06, "loss": 1.309, "step": 170 }, { "epoch": 0.3009458297506449, "grad_norm": 0.37157368659973145, "learning_rate": 1.999455236318534e-06, "loss": 1.4444, "step": 175 }, { "epoch": 0.30954428202923473, "grad_norm": 0.340994268655777, "learning_rate": 1.999251218314176e-06, "loss": 1.2391, "step": 180 }, { "epoch": 0.3181427343078246, "grad_norm": 0.3156416714191437, "learning_rate": 1.999014833857124e-06, "loss": 1.2496, "step": 185 }, { "epoch": 0.32674118658641443, "grad_norm": 0.35868847370147705, "learning_rate": 1.998746090604037e-06, "loss": 1.3442, "step": 190 }, { "epoch": 0.3353396388650043, "grad_norm": 0.31525614857673645, "learning_rate": 1.9984449972597e-06, "loss": 1.3011, "step": 195 }, { "epoch": 0.34393809114359414, "grad_norm": 0.47816550731658936, "learning_rate": 1.9981115635767376e-06, "loss": 1.2962, "step": 200 }, { "epoch": 0.352536543422184, "grad_norm": 0.31586989760398865, "learning_rate": 1.9977458003553037e-06, "loss": 1.2353, "step": 205 }, { "epoch": 0.36113499570077384, "grad_norm": 0.3226100206375122, "learning_rate": 1.9973477194427262e-06, "loss": 1.2135, "step": 210 }, { "epoch": 0.3697334479793637, "grad_norm": 0.3332583010196686, "learning_rate": 1.996917333733128e-06, "loss": 1.2553, "step": 215 }, { "epoch": 0.37833190025795355, "grad_norm": 0.33345943689346313, "learning_rate": 1.9964546571670053e-06, "loss": 1.2652, "step": 220 }, { "epoch": 0.3869303525365434, "grad_norm": 0.37145930528640747, "learning_rate": 1.995959704730779e-06, "loss": 1.1999, "step": 225 }, { "epoch": 0.39552880481513325, "grad_norm": 0.41526567935943604, "learning_rate": 1.9954324924563086e-06, "loss": 1.3373, "step": 230 }, { "epoch": 0.4041272570937231, "grad_norm": 0.2987145781517029, "learning_rate": 1.9948730374203715e-06, "loss": 1.2249, "step": 235 }, { "epoch": 0.412725709372313, "grad_norm": 0.49898993968963623, "learning_rate": 1.994281357744112e-06, "loss": 1.2323, "step": 240 }, { "epoch": 0.42132416165090286, "grad_norm": 0.2862403392791748, "learning_rate": 1.9936574725924525e-06, "loss": 1.1908, "step": 245 }, { "epoch": 0.4299226139294927, "grad_norm": 0.31127598881721497, "learning_rate": 1.9930014021734732e-06, "loss": 1.3326, "step": 250 }, { "epoch": 0.43852106620808257, "grad_norm": 0.44075530767440796, "learning_rate": 1.9923131677377585e-06, "loss": 1.1721, "step": 255 }, { "epoch": 0.4471195184866724, "grad_norm": 0.29328110814094543, "learning_rate": 1.991592791577708e-06, "loss": 1.2735, "step": 260 }, { "epoch": 0.45571797076526227, "grad_norm": 0.3364109396934509, "learning_rate": 1.9908402970268145e-06, "loss": 1.1244, "step": 265 }, { "epoch": 0.4643164230438521, "grad_norm": 0.3128204047679901, "learning_rate": 1.9900557084589077e-06, "loss": 1.2281, "step": 270 }, { "epoch": 0.472914875322442, "grad_norm": 0.30848929286003113, "learning_rate": 1.989239051287366e-06, "loss": 1.2333, "step": 275 }, { "epoch": 0.4815133276010318, "grad_norm": 0.36058536171913147, "learning_rate": 1.988390351964291e-06, "loss": 1.2995, "step": 280 }, { "epoch": 0.4901117798796217, "grad_norm": 0.3388177752494812, "learning_rate": 1.9875096379796535e-06, "loss": 1.3073, "step": 285 }, { "epoch": 0.49871023215821153, "grad_norm": 0.3674142062664032, "learning_rate": 1.986596937860402e-06, "loss": 1.2673, "step": 290 }, { "epoch": 0.5073086844368013, "grad_norm": 0.30132225155830383, "learning_rate": 1.9856522811695374e-06, "loss": 1.2107, "step": 295 }, { "epoch": 0.5159071367153912, "grad_norm": 0.3170187473297119, "learning_rate": 1.9846756985051573e-06, "loss": 1.1624, "step": 300 }, { "epoch": 0.524505588993981, "grad_norm": 0.3753427565097809, "learning_rate": 1.9836672214994637e-06, "loss": 1.2782, "step": 305 }, { "epoch": 0.5331040412725709, "grad_norm": 0.36705997586250305, "learning_rate": 1.9826268828177393e-06, "loss": 1.134, "step": 310 }, { "epoch": 0.5417024935511608, "grad_norm": 0.3330920338630676, "learning_rate": 1.9815547161572892e-06, "loss": 1.1611, "step": 315 }, { "epoch": 0.5503009458297506, "grad_norm": 0.3084378242492676, "learning_rate": 1.980450756246348e-06, "loss": 1.2008, "step": 320 }, { "epoch": 0.5588993981083406, "grad_norm": 0.3422742486000061, "learning_rate": 1.979315038842957e-06, "loss": 1.136, "step": 325 }, { "epoch": 0.5674978503869303, "grad_norm": 0.4076729118824005, "learning_rate": 1.9781476007338054e-06, "loss": 1.179, "step": 330 }, { "epoch": 0.5760963026655203, "grad_norm": 0.38809648156166077, "learning_rate": 1.976948479733038e-06, "loss": 1.1762, "step": 335 }, { "epoch": 0.58469475494411, "grad_norm": 0.38286837935447693, "learning_rate": 1.9757177146810307e-06, "loss": 1.0976, "step": 340 }, { "epoch": 0.5932932072227, "grad_norm": 0.33073902130126953, "learning_rate": 1.9744553454431325e-06, "loss": 1.1775, "step": 345 }, { "epoch": 0.6018916595012898, "grad_norm": 0.30669665336608887, "learning_rate": 1.9731614129083753e-06, "loss": 1.1242, "step": 350 }, { "epoch": 0.6104901117798797, "grad_norm": 0.36842820048332214, "learning_rate": 1.9718359589881475e-06, "loss": 1.233, "step": 355 }, { "epoch": 0.6190885640584695, "grad_norm": 0.28934866189956665, "learning_rate": 1.970479026614837e-06, "loss": 1.0545, "step": 360 }, { "epoch": 0.6276870163370594, "grad_norm": 0.36205312609672546, "learning_rate": 1.9690906597404428e-06, "loss": 1.0926, "step": 365 }, { "epoch": 0.6362854686156492, "grad_norm": 0.39489033818244934, "learning_rate": 1.967670903335148e-06, "loss": 1.2244, "step": 370 }, { "epoch": 0.6448839208942391, "grad_norm": 0.29848822951316833, "learning_rate": 1.966219803385865e-06, "loss": 1.1829, "step": 375 }, { "epoch": 0.6534823731728289, "grad_norm": 0.31506502628326416, "learning_rate": 1.9647374068947467e-06, "loss": 1.1418, "step": 380 }, { "epoch": 0.6620808254514188, "grad_norm": 0.39251357316970825, "learning_rate": 1.963223761877662e-06, "loss": 1.1543, "step": 385 }, { "epoch": 0.6706792777300086, "grad_norm": 0.3380272686481476, "learning_rate": 1.9616789173626418e-06, "loss": 1.0856, "step": 390 }, { "epoch": 0.6792777300085985, "grad_norm": 0.379609614610672, "learning_rate": 1.960102923388291e-06, "loss": 1.1562, "step": 395 }, { "epoch": 0.6878761822871883, "grad_norm": 0.26444530487060547, "learning_rate": 1.958495831002168e-06, "loss": 1.1356, "step": 400 }, { "epoch": 0.6964746345657782, "grad_norm": 0.4068126082420349, "learning_rate": 1.9568576922591304e-06, "loss": 1.2286, "step": 405 }, { "epoch": 0.705073086844368, "grad_norm": 0.3288695514202118, "learning_rate": 1.955188560219648e-06, "loss": 1.206, "step": 410 }, { "epoch": 0.7136715391229579, "grad_norm": 0.33392417430877686, "learning_rate": 1.9534884889480876e-06, "loss": 1.1944, "step": 415 }, { "epoch": 0.7222699914015477, "grad_norm": 0.37305575609207153, "learning_rate": 1.951757533510957e-06, "loss": 1.204, "step": 420 }, { "epoch": 0.7308684436801376, "grad_norm": 0.3695738911628723, "learning_rate": 1.949995749975127e-06, "loss": 1.1728, "step": 425 }, { "epoch": 0.7394668959587274, "grad_norm": 0.37770697474479675, "learning_rate": 1.948203195406009e-06, "loss": 1.2171, "step": 430 }, { "epoch": 0.7480653482373173, "grad_norm": 0.38213881850242615, "learning_rate": 1.9463799278657124e-06, "loss": 1.1948, "step": 435 }, { "epoch": 0.7566638005159071, "grad_norm": 0.4096551239490509, "learning_rate": 1.9445260064111607e-06, "loss": 1.1774, "step": 440 }, { "epoch": 0.765262252794497, "grad_norm": 0.41796615719795227, "learning_rate": 1.9426414910921785e-06, "loss": 1.1234, "step": 445 }, { "epoch": 0.7738607050730868, "grad_norm": 0.3439182639122009, "learning_rate": 1.9407264429495484e-06, "loss": 1.1748, "step": 450 }, { "epoch": 0.7824591573516767, "grad_norm": 0.34333643317222595, "learning_rate": 1.938780924013032e-06, "loss": 1.1441, "step": 455 }, { "epoch": 0.7910576096302665, "grad_norm": 0.32014700770378113, "learning_rate": 1.936804997299362e-06, "loss": 1.1205, "step": 460 }, { "epoch": 0.7996560619088564, "grad_norm": 0.3878689408302307, "learning_rate": 1.9347987268101996e-06, "loss": 1.1397, "step": 465 }, { "epoch": 0.8082545141874462, "grad_norm": 0.3905511796474457, "learning_rate": 1.9327621775300633e-06, "loss": 1.1016, "step": 470 }, { "epoch": 0.8168529664660361, "grad_norm": 0.3861243426799774, "learning_rate": 1.9306954154242233e-06, "loss": 1.0783, "step": 475 }, { "epoch": 0.825451418744626, "grad_norm": 0.3234269618988037, "learning_rate": 1.9285985074365627e-06, "loss": 1.0694, "step": 480 }, { "epoch": 0.8340498710232158, "grad_norm": 0.39914825558662415, "learning_rate": 1.926471521487413e-06, "loss": 1.2053, "step": 485 }, { "epoch": 0.8426483233018057, "grad_norm": 0.39884087443351746, "learning_rate": 1.924314526471351e-06, "loss": 1.1663, "step": 490 }, { "epoch": 0.8512467755803955, "grad_norm": 0.3793065547943115, "learning_rate": 1.922127592254968e-06, "loss": 1.083, "step": 495 }, { "epoch": 0.8598452278589854, "grad_norm": 0.3194003403186798, "learning_rate": 1.919910789674609e-06, "loss": 1.0664, "step": 500 }, { "epoch": 0.8684436801375752, "grad_norm": 0.30485859513282776, "learning_rate": 1.917664190534075e-06, "loss": 1.0426, "step": 505 }, { "epoch": 0.8770421324161651, "grad_norm": 0.3151833117008209, "learning_rate": 1.915387867602298e-06, "loss": 1.157, "step": 510 }, { "epoch": 0.8856405846947549, "grad_norm": 0.41206279397010803, "learning_rate": 1.913081894610986e-06, "loss": 1.1125, "step": 515 }, { "epoch": 0.8942390369733448, "grad_norm": 0.42712071537971497, "learning_rate": 1.9107463462522332e-06, "loss": 0.9867, "step": 520 }, { "epoch": 0.9028374892519346, "grad_norm": 0.45993658900260925, "learning_rate": 1.9083812981760998e-06, "loss": 1.2464, "step": 525 }, { "epoch": 0.9114359415305245, "grad_norm": 0.34718605875968933, "learning_rate": 1.9059868269881636e-06, "loss": 1.0646, "step": 530 }, { "epoch": 0.9200343938091143, "grad_norm": 0.4059743285179138, "learning_rate": 1.9035630102470375e-06, "loss": 1.08, "step": 535 }, { "epoch": 0.9286328460877042, "grad_norm": 0.34420260787010193, "learning_rate": 1.9011099264618573e-06, "loss": 1.0843, "step": 540 }, { "epoch": 0.937231298366294, "grad_norm": 0.36102572083473206, "learning_rate": 1.89862765508974e-06, "loss": 1.0874, "step": 545 }, { "epoch": 0.945829750644884, "grad_norm": 0.4396149516105652, "learning_rate": 1.896116276533208e-06, "loss": 1.1614, "step": 550 }, { "epoch": 0.9544282029234737, "grad_norm": 0.3992445170879364, "learning_rate": 1.8935758721375862e-06, "loss": 1.122, "step": 555 }, { "epoch": 0.9630266552020637, "grad_norm": 0.3218730092048645, "learning_rate": 1.8910065241883678e-06, "loss": 0.9754, "step": 560 }, { "epoch": 0.9716251074806534, "grad_norm": 0.3818008601665497, "learning_rate": 1.8884083159085468e-06, "loss": 1.1747, "step": 565 }, { "epoch": 0.9802235597592434, "grad_norm": 0.407429575920105, "learning_rate": 1.8857813314559254e-06, "loss": 1.1048, "step": 570 }, { "epoch": 0.9888220120378332, "grad_norm": 0.4260193407535553, "learning_rate": 1.8831256559203843e-06, "loss": 1.1217, "step": 575 }, { "epoch": 0.9974204643164231, "grad_norm": 0.3930356204509735, "learning_rate": 1.8804413753211304e-06, "loss": 1.1259, "step": 580 }, { "epoch": 1.0051590713671539, "grad_norm": 0.3557448983192444, "learning_rate": 1.8777285766039075e-06, "loss": 1.1636, "step": 585 }, { "epoch": 1.0137575236457437, "grad_norm": 0.34454646706581116, "learning_rate": 1.8749873476381826e-06, "loss": 1.093, "step": 590 }, { "epoch": 1.0223559759243337, "grad_norm": 0.4703282415866852, "learning_rate": 1.8722177772142973e-06, "loss": 1.0716, "step": 595 }, { "epoch": 1.0309544282029235, "grad_norm": 0.418491929769516, "learning_rate": 1.8694199550405942e-06, "loss": 1.0626, "step": 600 } ], "logging_steps": 5, "max_steps": 2905, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4341188063131075e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }