{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 200, "global_step": 742, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02695871946082561, "grad_norm": 0.8008378744125366, "learning_rate": 1.2000000000000002e-06, "loss": 1.2496, "step": 10 }, { "epoch": 0.05391743892165122, "grad_norm": 0.7582265138626099, "learning_rate": 2.5333333333333338e-06, "loss": 1.2396, "step": 20 }, { "epoch": 0.08087615838247683, "grad_norm": 0.7974638938903809, "learning_rate": 3.866666666666667e-06, "loss": 1.2612, "step": 30 }, { "epoch": 0.10783487784330244, "grad_norm": 0.7764624953269958, "learning_rate": 5.2e-06, "loss": 1.2333, "step": 40 }, { "epoch": 0.13479359730412804, "grad_norm": 0.7129917740821838, "learning_rate": 6.533333333333334e-06, "loss": 1.1534, "step": 50 }, { "epoch": 0.16175231676495366, "grad_norm": 0.6209232211112976, "learning_rate": 7.866666666666667e-06, "loss": 1.0875, "step": 60 }, { "epoch": 0.18871103622577928, "grad_norm": 0.44803526997566223, "learning_rate": 9.200000000000002e-06, "loss": 1.1057, "step": 70 }, { "epoch": 0.21566975568660487, "grad_norm": 0.36510151624679565, "learning_rate": 9.999112649450154e-06, "loss": 1.1344, "step": 80 }, { "epoch": 0.2426284751474305, "grad_norm": 0.2959080636501312, "learning_rate": 9.989133572488716e-06, "loss": 1.0469, "step": 90 }, { "epoch": 0.2695871946082561, "grad_norm": 0.24933554232120514, "learning_rate": 9.968088438109002e-06, "loss": 1.1375, "step": 100 }, { "epoch": 0.2965459140690817, "grad_norm": 0.2685732841491699, "learning_rate": 9.93602392509041e-06, "loss": 1.004, "step": 110 }, { "epoch": 0.3235046335299073, "grad_norm": 0.22426341474056244, "learning_rate": 9.893011153545679e-06, "loss": 1.0332, "step": 120 }, { "epoch": 0.35046335299073295, "grad_norm": 0.21612893044948578, "learning_rate": 9.839145527174216e-06, "loss": 1.044, "step": 130 }, { "epoch": 0.37742207245155857, "grad_norm": 0.24937374889850616, "learning_rate": 9.774546521653633e-06, "loss": 1.0035, "step": 140 }, { "epoch": 0.4043807919123842, "grad_norm": 0.23830914497375488, "learning_rate": 9.699357419638904e-06, "loss": 1.0611, "step": 150 }, { "epoch": 0.43133951137320975, "grad_norm": 0.240906223654747, "learning_rate": 9.613744992956844e-06, "loss": 0.9973, "step": 160 }, { "epoch": 0.45829823083403537, "grad_norm": 0.24612027406692505, "learning_rate": 9.517899132700889e-06, "loss": 1.0654, "step": 170 }, { "epoch": 0.485256950294861, "grad_norm": 0.25542137026786804, "learning_rate": 9.412032428046594e-06, "loss": 1.0056, "step": 180 }, { "epoch": 0.5122156697556866, "grad_norm": 0.2691582441329956, "learning_rate": 9.296379694722051e-06, "loss": 0.9783, "step": 190 }, { "epoch": 0.5391743892165122, "grad_norm": 0.23645047843456268, "learning_rate": 9.171197454179124e-06, "loss": 0.9903, "step": 200 }, { "epoch": 0.5391743892165122, "eval_loss": 0.9858898520469666, "eval_runtime": 361.9638, "eval_samples_per_second": 1.821, "eval_steps_per_second": 1.821, "step": 200 }, { "epoch": 0.5661331086773378, "grad_norm": 0.34743762016296387, "learning_rate": 9.03676336462068e-06, "loss": 0.9888, "step": 210 }, { "epoch": 0.5930918281381634, "grad_norm": 0.22745391726493835, "learning_rate": 8.893375605145837e-06, "loss": 0.9646, "step": 220 }, { "epoch": 0.620050547598989, "grad_norm": 0.2186407446861267, "learning_rate": 8.74135221437921e-06, "loss": 1.0027, "step": 230 }, { "epoch": 0.6470092670598147, "grad_norm": 0.2490515559911728, "learning_rate": 8.581030385051105e-06, "loss": 0.9469, "step": 240 }, { "epoch": 0.6739679865206403, "grad_norm": 0.24692057073116302, "learning_rate": 8.412765716093273e-06, "loss": 1.0131, "step": 250 }, { "epoch": 0.7009267059814659, "grad_norm": 0.2852599322795868, "learning_rate": 8.23693142390914e-06, "loss": 0.9614, "step": 260 }, { "epoch": 0.7278854254422915, "grad_norm": 0.3111582100391388, "learning_rate": 8.053917514567927e-06, "loss": 1.0357, "step": 270 }, { "epoch": 0.7548441449031171, "grad_norm": 0.25308677554130554, "learning_rate": 7.864129918758738e-06, "loss": 1.0057, "step": 280 }, { "epoch": 0.7818028643639428, "grad_norm": 0.2518717050552368, "learning_rate": 7.667989591423349e-06, "loss": 1.077, "step": 290 }, { "epoch": 0.8087615838247684, "grad_norm": 0.2539553940296173, "learning_rate": 7.465931578064703e-06, "loss": 1.0043, "step": 300 }, { "epoch": 0.8357203032855939, "grad_norm": 0.22149862349033356, "learning_rate": 7.258404049802135e-06, "loss": 0.9659, "step": 310 }, { "epoch": 0.8626790227464195, "grad_norm": 0.31281328201293945, "learning_rate": 7.045867309313499e-06, "loss": 0.9659, "step": 320 }, { "epoch": 0.8896377422072451, "grad_norm": 0.2311943918466568, "learning_rate": 6.8287927698691745e-06, "loss": 0.9671, "step": 330 }, { "epoch": 0.9165964616680707, "grad_norm": 0.29115474224090576, "learning_rate": 6.6076619097223735e-06, "loss": 0.9943, "step": 340 }, { "epoch": 0.9435551811288964, "grad_norm": 0.2641865015029907, "learning_rate": 6.382965204175027e-06, "loss": 0.968, "step": 350 }, { "epoch": 0.970513900589722, "grad_norm": 0.2566356062889099, "learning_rate": 6.155201037687917e-06, "loss": 0.9882, "step": 360 }, { "epoch": 0.9974726200505476, "grad_norm": 0.26403674483299255, "learning_rate": 5.924874598448038e-06, "loss": 0.9577, "step": 370 }, { "epoch": 1.024262847514743, "grad_norm": 0.2462741881608963, "learning_rate": 5.692496757845092e-06, "loss": 0.9367, "step": 380 }, { "epoch": 1.0512215669755687, "grad_norm": 0.2530335783958435, "learning_rate": 5.45858293734244e-06, "loss": 0.9893, "step": 390 }, { "epoch": 1.0781802864363943, "grad_norm": 0.23828567564487457, "learning_rate": 5.223651965255864e-06, "loss": 0.9931, "step": 400 }, { "epoch": 1.0781802864363943, "eval_loss": 0.974194347858429, "eval_runtime": 361.5075, "eval_samples_per_second": 1.823, "eval_steps_per_second": 1.823, "step": 400 }, { "epoch": 1.10513900589722, "grad_norm": 0.25133016705513, "learning_rate": 4.988224925975799e-06, "loss": 1.0022, "step": 410 }, { "epoch": 1.1320977253580455, "grad_norm": 0.24585603177547455, "learning_rate": 4.752824004185548e-06, "loss": 0.9993, "step": 420 }, { "epoch": 1.1590564448188712, "grad_norm": 0.27999183535575867, "learning_rate": 4.5179713266389866e-06, "loss": 0.9835, "step": 430 }, { "epoch": 1.1860151642796968, "grad_norm": 0.3136424720287323, "learning_rate": 4.284187804066764e-06, "loss": 0.9242, "step": 440 }, { "epoch": 1.2129738837405224, "grad_norm": 0.24964718520641327, "learning_rate": 4.051991975779691e-06, "loss": 0.9784, "step": 450 }, { "epoch": 1.239932603201348, "grad_norm": 0.29519638419151306, "learning_rate": 3.821898859532013e-06, "loss": 1.0019, "step": 460 }, { "epoch": 1.2668913226621736, "grad_norm": 0.28124356269836426, "learning_rate": 3.5944188091955843e-06, "loss": 1.0048, "step": 470 }, { "epoch": 1.2938500421229993, "grad_norm": 0.26205387711524963, "learning_rate": 3.3700563827787224e-06, "loss": 0.9982, "step": 480 }, { "epoch": 1.3208087615838249, "grad_norm": 0.4750404953956604, "learning_rate": 3.149309223300428e-06, "loss": 1.0254, "step": 490 }, { "epoch": 1.3477674810446505, "grad_norm": 0.2478070706129074, "learning_rate": 2.9326669550023124e-06, "loss": 0.9551, "step": 500 }, { "epoch": 1.3747262005054761, "grad_norm": 0.2824980616569519, "learning_rate": 2.7206100973463958e-06, "loss": 0.9784, "step": 510 }, { "epoch": 1.4016849199663017, "grad_norm": 0.41172295808792114, "learning_rate": 2.513608999207622e-06, "loss": 1.0383, "step": 520 }, { "epoch": 1.4286436394271271, "grad_norm": 0.25096771121025085, "learning_rate": 2.3121227956250435e-06, "loss": 0.9803, "step": 530 }, { "epoch": 1.4556023588879528, "grad_norm": 0.268365740776062, "learning_rate": 2.1165983894256647e-06, "loss": 0.981, "step": 540 }, { "epoch": 1.4825610783487784, "grad_norm": 0.2462984025478363, "learning_rate": 1.9274694599797067e-06, "loss": 0.9297, "step": 550 }, { "epoch": 1.509519797809604, "grad_norm": 0.25125372409820557, "learning_rate": 1.745155501285939e-06, "loss": 1.0078, "step": 560 }, { "epoch": 1.5364785172704296, "grad_norm": 0.3042011559009552, "learning_rate": 1.5700608915205978e-06, "loss": 1.0059, "step": 570 }, { "epoch": 1.5634372367312552, "grad_norm": 0.25765061378479004, "learning_rate": 1.4025739961137043e-06, "loss": 1.0604, "step": 580 }, { "epoch": 1.5903959561920809, "grad_norm": 0.28569814562797546, "learning_rate": 1.2430663063421388e-06, "loss": 0.9574, "step": 590 }, { "epoch": 1.6173546756529065, "grad_norm": 0.27754876017570496, "learning_rate": 1.091891615350147e-06, "loss": 0.9973, "step": 600 }, { "epoch": 1.6173546756529065, "eval_loss": 0.9700986742973328, "eval_runtime": 361.0549, "eval_samples_per_second": 1.825, "eval_steps_per_second": 1.825, "step": 600 }, { "epoch": 1.644313395113732, "grad_norm": 0.3200387954711914, "learning_rate": 9.49385233424856e-07, "loss": 0.9581, "step": 610 }, { "epoch": 1.6712721145745577, "grad_norm": 0.3195631504058838, "learning_rate": 8.158632442673603e-07, "loss": 0.9446, "step": 620 }, { "epoch": 1.6982308340353833, "grad_norm": 0.26112768054008484, "learning_rate": 6.916218039089961e-07, "loss": 0.9549, "step": 630 }, { "epoch": 1.725189553496209, "grad_norm": 0.2657977044582367, "learning_rate": 5.769364838278063e-07, "loss": 0.995, "step": 640 }, { "epoch": 1.7521482729570346, "grad_norm": 0.3035813868045807, "learning_rate": 4.720616597222205e-07, "loss": 0.9935, "step": 650 }, { "epoch": 1.7791069924178602, "grad_norm": 0.29046186804771423, "learning_rate": 3.7722994729763427e-07, "loss": 1.0067, "step": 660 }, { "epoch": 1.8060657118786858, "grad_norm": 0.27917370200157166, "learning_rate": 2.9265168631736005e-07, "loss": 1.0086, "step": 670 }, { "epoch": 1.8330244313395114, "grad_norm": 0.26591578125953674, "learning_rate": 2.1851447406231573e-07, "loss": 0.9478, "step": 680 }, { "epoch": 1.8599831508003368, "grad_norm": 0.30971062183380127, "learning_rate": 1.5498274923427925e-07, "loss": 0.9364, "step": 690 }, { "epoch": 1.8869418702611624, "grad_norm": 0.24184127151966095, "learning_rate": 1.0219742722559433e-07, "loss": 1.0085, "step": 700 }, { "epoch": 1.913900589721988, "grad_norm": 0.32377859950065613, "learning_rate": 6.027558756434015e-08, "loss": 1.0718, "step": 710 }, { "epoch": 1.9408593091828137, "grad_norm": 0.24339060485363007, "learning_rate": 2.9310214228202016e-08, "loss": 0.9437, "step": 720 }, { "epoch": 1.9678180286436393, "grad_norm": 0.2631000876426697, "learning_rate": 9.369989403041347e-09, "loss": 0.9958, "step": 730 }, { "epoch": 1.994776748104465, "grad_norm": 0.24943451583385468, "learning_rate": 4.991411436189308e-10, "loss": 0.9993, "step": 740 }, { "epoch": 2.0, "step": 742, "total_flos": 6.06897586520064e+17, "train_loss": 1.01524107314827, "train_runtime": 23674.123, "train_samples_per_second": 0.501, "train_steps_per_second": 0.031 } ], "logging_steps": 10, "max_steps": 742, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.06897586520064e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }