{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1068, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0046816479400749065, "grad_norm": 296.0, "learning_rate": 4.0000000000000003e-07, "loss": 2.6547273635864257, "step": 5 }, { "epoch": 0.009363295880149813, "grad_norm": 324.0, "learning_rate": 9.000000000000001e-07, "loss": 2.3461462020874024, "step": 10 }, { "epoch": 0.014044943820224719, "grad_norm": 302.0, "learning_rate": 1.4000000000000001e-06, "loss": 1.0991961479187011, "step": 15 }, { "epoch": 0.018726591760299626, "grad_norm": 52.75, "learning_rate": 1.9000000000000002e-06, "loss": 0.19423211812973024, "step": 20 }, { "epoch": 0.023408239700374533, "grad_norm": 150.0, "learning_rate": 2.4000000000000003e-06, "loss": 0.24906330108642577, "step": 25 }, { "epoch": 0.028089887640449437, "grad_norm": 57.25, "learning_rate": 2.9e-06, "loss": 0.21630589962005614, "step": 30 }, { "epoch": 0.03277153558052434, "grad_norm": 55.0, "learning_rate": 3.4000000000000005e-06, "loss": 0.2401110887527466, "step": 35 }, { "epoch": 0.03745318352059925, "grad_norm": 61.5, "learning_rate": 3.900000000000001e-06, "loss": 0.15306113958358764, "step": 40 }, { "epoch": 0.042134831460674156, "grad_norm": 42.25, "learning_rate": 4.4e-06, "loss": 0.24524540901184083, "step": 45 }, { "epoch": 0.04681647940074907, "grad_norm": 10.1875, "learning_rate": 4.9000000000000005e-06, "loss": 0.1620311975479126, "step": 50 }, { "epoch": 0.05149812734082397, "grad_norm": 18.25, "learning_rate": 5.400000000000001e-06, "loss": 0.09609854817390442, "step": 55 }, { "epoch": 0.056179775280898875, "grad_norm": 19.125, "learning_rate": 5.9e-06, "loss": 0.2541069984436035, "step": 60 }, { "epoch": 0.060861423220973786, "grad_norm": 35.5, "learning_rate": 6.4000000000000006e-06, "loss": 0.2134775161743164, "step": 65 }, { "epoch": 0.06554307116104868, "grad_norm": 48.0, "learning_rate": 6.9e-06, "loss": 0.13749781847000123, "step": 70 }, { "epoch": 0.0702247191011236, "grad_norm": 46.5, "learning_rate": 7.4e-06, "loss": 0.17141337394714357, "step": 75 }, { "epoch": 0.0749063670411985, "grad_norm": 4.6875, "learning_rate": 7.9e-06, "loss": 0.14356958866119385, "step": 80 }, { "epoch": 0.07958801498127341, "grad_norm": 38.75, "learning_rate": 8.400000000000001e-06, "loss": 0.2567991018295288, "step": 85 }, { "epoch": 0.08426966292134831, "grad_norm": 126.5, "learning_rate": 8.900000000000001e-06, "loss": 0.31857073307037354, "step": 90 }, { "epoch": 0.08895131086142322, "grad_norm": 21.0, "learning_rate": 9.4e-06, "loss": 0.15032247304916382, "step": 95 }, { "epoch": 0.09363295880149813, "grad_norm": 8.6875, "learning_rate": 9.9e-06, "loss": 0.16799943447113036, "step": 100 }, { "epoch": 0.09831460674157304, "grad_norm": 35.0, "learning_rate": 9.999578688879085e-06, "loss": 0.14436352252960205, "step": 105 }, { "epoch": 0.10299625468164794, "grad_norm": 3.609375, "learning_rate": 9.997867234136308e-06, "loss": 0.17057626247406005, "step": 110 }, { "epoch": 0.10767790262172285, "grad_norm": 27.875, "learning_rate": 9.99483975413568e-06, "loss": 0.21162867546081543, "step": 115 }, { "epoch": 0.11235955056179775, "grad_norm": 18.375, "learning_rate": 9.990497046065279e-06, "loss": 0.11510965824127198, "step": 120 }, { "epoch": 0.11704119850187265, "grad_norm": 18.625, "learning_rate": 9.984840253435569e-06, "loss": 0.1667150378227234, "step": 125 }, { "epoch": 0.12172284644194757, "grad_norm": 26.75, "learning_rate": 9.97787086577831e-06, "loss": 0.16688863039016724, "step": 130 }, { "epoch": 0.12640449438202248, "grad_norm": 13.5625, "learning_rate": 9.969590718254337e-06, "loss": 0.16854366064071655, "step": 135 }, { "epoch": 0.13108614232209737, "grad_norm": 15.5, "learning_rate": 9.96000199117032e-06, "loss": 0.16345432996749878, "step": 140 }, { "epoch": 0.13576779026217228, "grad_norm": 8.875, "learning_rate": 9.949107209404664e-06, "loss": 0.10633381605148315, "step": 145 }, { "epoch": 0.1404494382022472, "grad_norm": 19.125, "learning_rate": 9.936909241742652e-06, "loss": 0.11197474002838134, "step": 150 }, { "epoch": 0.1451310861423221, "grad_norm": 7.0, "learning_rate": 9.923411300121055e-06, "loss": 0.09350830316543579, "step": 155 }, { "epoch": 0.149812734082397, "grad_norm": 27.0, "learning_rate": 9.908616938782364e-06, "loss": 0.12772412300109864, "step": 160 }, { "epoch": 0.1544943820224719, "grad_norm": 60.0, "learning_rate": 9.892530053338909e-06, "loss": 0.22945654392242432, "step": 165 }, { "epoch": 0.15917602996254682, "grad_norm": 7.8125, "learning_rate": 9.875154879747058e-06, "loss": 0.10292181968688965, "step": 170 }, { "epoch": 0.16385767790262173, "grad_norm": 24.875, "learning_rate": 9.856495993191836e-06, "loss": 0.1009818434715271, "step": 175 }, { "epoch": 0.16853932584269662, "grad_norm": 29.625, "learning_rate": 9.836558306882182e-06, "loss": 0.16335415840148926, "step": 180 }, { "epoch": 0.17322097378277154, "grad_norm": 6.53125, "learning_rate": 9.815347070757234e-06, "loss": 0.15177664756774903, "step": 185 }, { "epoch": 0.17790262172284643, "grad_norm": 24.625, "learning_rate": 9.792867870103904e-06, "loss": 0.15361402034759522, "step": 190 }, { "epoch": 0.18258426966292135, "grad_norm": 13.625, "learning_rate": 9.769126624086202e-06, "loss": 0.19070371389389038, "step": 195 }, { "epoch": 0.18726591760299627, "grad_norm": 15.5, "learning_rate": 9.744129584186599e-06, "loss": 0.10595703125, "step": 200 }, { "epoch": 0.19194756554307116, "grad_norm": 1.5703125, "learning_rate": 9.717883332559911e-06, "loss": 0.10733038187026978, "step": 205 }, { "epoch": 0.19662921348314608, "grad_norm": 15.625, "learning_rate": 9.690394780300098e-06, "loss": 0.13379451036453247, "step": 210 }, { "epoch": 0.20131086142322097, "grad_norm": 14.25, "learning_rate": 9.66167116562046e-06, "loss": 0.14411044120788574, "step": 215 }, { "epoch": 0.20599250936329588, "grad_norm": 10.4375, "learning_rate": 9.631720051947683e-06, "loss": 0.10819638967514038, "step": 220 }, { "epoch": 0.21067415730337077, "grad_norm": 10.75, "learning_rate": 9.60054932593026e-06, "loss": 0.0847718894481659, "step": 225 }, { "epoch": 0.2153558052434457, "grad_norm": 10.3125, "learning_rate": 9.568167195361802e-06, "loss": 0.1128539800643921, "step": 230 }, { "epoch": 0.2200374531835206, "grad_norm": 9.4375, "learning_rate": 9.534582187019777e-06, "loss": 0.10417553186416625, "step": 235 }, { "epoch": 0.2247191011235955, "grad_norm": 14.6875, "learning_rate": 9.499803144420268e-06, "loss": 0.11842489242553711, "step": 240 }, { "epoch": 0.22940074906367042, "grad_norm": 16.75, "learning_rate": 9.46383922548932e-06, "loss": 0.22187628746032714, "step": 245 }, { "epoch": 0.2340823970037453, "grad_norm": 9.5625, "learning_rate": 9.426699900151494e-06, "loss": 0.10029861927032471, "step": 250 }, { "epoch": 0.23876404494382023, "grad_norm": 27.5, "learning_rate": 9.388394947836278e-06, "loss": 0.13310248851776124, "step": 255 }, { "epoch": 0.24344569288389514, "grad_norm": 8.0625, "learning_rate": 9.348934454902992e-06, "loss": 0.12244582176208496, "step": 260 }, { "epoch": 0.24812734082397003, "grad_norm": 12.125, "learning_rate": 9.30832881198487e-06, "loss": 0.13504416942596437, "step": 265 }, { "epoch": 0.25280898876404495, "grad_norm": 3.390625, "learning_rate": 9.26658871125303e-06, "loss": 0.08242733478546142, "step": 270 }, { "epoch": 0.25749063670411987, "grad_norm": 1.046875, "learning_rate": 9.223725143601037e-06, "loss": 0.07960397601127625, "step": 275 }, { "epoch": 0.26217228464419473, "grad_norm": 8.25, "learning_rate": 9.179749395750812e-06, "loss": 0.08370729088783264, "step": 280 }, { "epoch": 0.26685393258426965, "grad_norm": 10.25, "learning_rate": 9.134673047280644e-06, "loss": 0.15015305280685426, "step": 285 }, { "epoch": 0.27153558052434457, "grad_norm": 5.875, "learning_rate": 9.088507967576078e-06, "loss": 0.10222412347793579, "step": 290 }, { "epoch": 0.2762172284644195, "grad_norm": 16.125, "learning_rate": 9.041266312704511e-06, "loss": 0.12045938968658447, "step": 295 }, { "epoch": 0.2808988764044944, "grad_norm": 0.5546875, "learning_rate": 8.992960522214276e-06, "loss": 0.03160299360752106, "step": 300 }, { "epoch": 0.28558052434456926, "grad_norm": 9.4375, "learning_rate": 8.943603315859101e-06, "loss": 0.049703413248062135, "step": 305 }, { "epoch": 0.2902621722846442, "grad_norm": 2.90625, "learning_rate": 8.893207690248776e-06, "loss": 0.1010066032409668, "step": 310 }, { "epoch": 0.2949438202247191, "grad_norm": 0.050537109375, "learning_rate": 8.841786915426918e-06, "loss": 0.17653073072433473, "step": 315 }, { "epoch": 0.299625468164794, "grad_norm": 0.08349609375, "learning_rate": 8.78935453137674e-06, "loss": 0.08482981920242309, "step": 320 }, { "epoch": 0.30430711610486894, "grad_norm": 11.4375, "learning_rate": 8.735924344455732e-06, "loss": 0.20995216369628905, "step": 325 }, { "epoch": 0.3089887640449438, "grad_norm": 2.390625, "learning_rate": 8.68151042376022e-06, "loss": 0.13823301792144777, "step": 330 }, { "epoch": 0.3136704119850187, "grad_norm": 6.40625, "learning_rate": 8.626127097420711e-06, "loss": 0.09639847874641419, "step": 335 }, { "epoch": 0.31835205992509363, "grad_norm": 2.3125, "learning_rate": 8.569788948829066e-06, "loss": 0.08136180639266968, "step": 340 }, { "epoch": 0.32303370786516855, "grad_norm": 9.25, "learning_rate": 8.512510812798426e-06, "loss": 0.07721298336982726, "step": 345 }, { "epoch": 0.32771535580524347, "grad_norm": 1.4140625, "learning_rate": 8.454307771656956e-06, "loss": 0.031324502825737, "step": 350 }, { "epoch": 0.33239700374531833, "grad_norm": 8.375, "learning_rate": 8.395195151276397e-06, "loss": 0.21767141819000244, "step": 355 }, { "epoch": 0.33707865168539325, "grad_norm": 35.0, "learning_rate": 8.335188517036507e-06, "loss": 0.10922614336013795, "step": 360 }, { "epoch": 0.34176029962546817, "grad_norm": 2.984375, "learning_rate": 8.274303669726427e-06, "loss": 0.03281030058860779, "step": 365 }, { "epoch": 0.3464419475655431, "grad_norm": 0.255859375, "learning_rate": 8.212556641384044e-06, "loss": 0.07252607941627502, "step": 370 }, { "epoch": 0.351123595505618, "grad_norm": 0.6953125, "learning_rate": 8.149963691074494e-06, "loss": 0.01815851628780365, "step": 375 }, { "epoch": 0.35580524344569286, "grad_norm": 6.25, "learning_rate": 8.086541300608864e-06, "loss": 0.029612547159194945, "step": 380 }, { "epoch": 0.3604868913857678, "grad_norm": 0.1416015625, "learning_rate": 8.022306170204233e-06, "loss": 0.14124746322631837, "step": 385 }, { "epoch": 0.3651685393258427, "grad_norm": 14.0625, "learning_rate": 7.957275214086231e-06, "loss": 0.056589150428771974, "step": 390 }, { "epoch": 0.3698501872659176, "grad_norm": 8.4375, "learning_rate": 7.891465556035219e-06, "loss": 0.1280989170074463, "step": 395 }, { "epoch": 0.37453183520599254, "grad_norm": 3.75, "learning_rate": 7.824894524877302e-06, "loss": 0.0552295446395874, "step": 400 }, { "epoch": 0.3792134831460674, "grad_norm": 9.375, "learning_rate": 7.757579649921354e-06, "loss": 0.08489207029342652, "step": 405 }, { "epoch": 0.3838951310861423, "grad_norm": 30.375, "learning_rate": 7.68953865634324e-06, "loss": 0.09930517673492431, "step": 410 }, { "epoch": 0.38857677902621723, "grad_norm": 25.25, "learning_rate": 7.620789460518465e-06, "loss": 0.09664190411567689, "step": 415 }, { "epoch": 0.39325842696629215, "grad_norm": 10.0, "learning_rate": 7.5513501653045e-06, "loss": 0.14452815055847168, "step": 420 }, { "epoch": 0.397940074906367, "grad_norm": 5.21875, "learning_rate": 7.481239055273959e-06, "loss": 0.040193185210227966, "step": 425 }, { "epoch": 0.40262172284644193, "grad_norm": 13.4375, "learning_rate": 7.410474591899976e-06, "loss": 0.14218735694885254, "step": 430 }, { "epoch": 0.40730337078651685, "grad_norm": 1.25, "learning_rate": 7.339075408694968e-06, "loss": 0.14246041774749757, "step": 435 }, { "epoch": 0.41198501872659177, "grad_norm": 12.125, "learning_rate": 7.2670603063041035e-06, "loss": 0.09831977486610413, "step": 440 }, { "epoch": 0.4166666666666667, "grad_norm": 9.4375, "learning_rate": 7.19444824755478e-06, "loss": 0.023072442412376402, "step": 445 }, { "epoch": 0.42134831460674155, "grad_norm": 14.5, "learning_rate": 7.121258352463364e-06, "loss": 0.11848453283309937, "step": 450 }, { "epoch": 0.42602996254681647, "grad_norm": 0.345703125, "learning_rate": 7.047509893200577e-06, "loss": 0.08525880575180053, "step": 455 }, { "epoch": 0.4307116104868914, "grad_norm": 48.25, "learning_rate": 6.973222289016781e-06, "loss": 0.14056316614151002, "step": 460 }, { "epoch": 0.4353932584269663, "grad_norm": 13.25, "learning_rate": 6.898415101128571e-06, "loss": 0.16151052713394165, "step": 465 }, { "epoch": 0.4400749063670412, "grad_norm": 1.765625, "learning_rate": 6.823108027567946e-06, "loss": 0.03648199737071991, "step": 470 }, { "epoch": 0.4447565543071161, "grad_norm": 4.5625, "learning_rate": 6.747320897995493e-06, "loss": 0.05076872706413269, "step": 475 }, { "epoch": 0.449438202247191, "grad_norm": 9.875, "learning_rate": 6.671073668478882e-06, "loss": 0.09760580062866211, "step": 480 }, { "epoch": 0.4541198501872659, "grad_norm": 9.375, "learning_rate": 6.594386416238095e-06, "loss": 0.04424844086170197, "step": 485 }, { "epoch": 0.45880149812734083, "grad_norm": 2.390625, "learning_rate": 6.517279334358733e-06, "loss": 0.08305501937866211, "step": 490 }, { "epoch": 0.46348314606741575, "grad_norm": 23.0, "learning_rate": 6.43977272647484e-06, "loss": 0.03448081016540527, "step": 495 }, { "epoch": 0.4681647940074906, "grad_norm": 3.203125, "learning_rate": 6.361887001422597e-06, "loss": 0.11298078298568726, "step": 500 }, { "epoch": 0.47284644194756553, "grad_norm": 4.25, "learning_rate": 6.283642667866317e-06, "loss": 0.09018718004226685, "step": 505 }, { "epoch": 0.47752808988764045, "grad_norm": 8.0625, "learning_rate": 6.205060328898162e-06, "loss": 0.08498911261558532, "step": 510 }, { "epoch": 0.48220973782771537, "grad_norm": 4.84375, "learning_rate": 6.126160676612992e-06, "loss": 0.047043058276176455, "step": 515 }, { "epoch": 0.4868913857677903, "grad_norm": 0.7578125, "learning_rate": 6.046964486659777e-06, "loss": 0.0528850257396698, "step": 520 }, { "epoch": 0.49157303370786515, "grad_norm": 15.1875, "learning_rate": 5.967492612770999e-06, "loss": 0.14283888339996337, "step": 525 }, { "epoch": 0.49625468164794007, "grad_norm": 9.8125, "learning_rate": 5.887765981271518e-06, "loss": 0.11596851348876953, "step": 530 }, { "epoch": 0.5009363295880149, "grad_norm": 14.1875, "learning_rate": 5.8078055855682904e-06, "loss": 0.14807997941970824, "step": 535 }, { "epoch": 0.5056179775280899, "grad_norm": 1.0234375, "learning_rate": 5.727632480622452e-06, "loss": 0.10746654272079467, "step": 540 }, { "epoch": 0.5102996254681648, "grad_norm": 12.1875, "learning_rate": 5.647267777405178e-06, "loss": 0.08008084297180176, "step": 545 }, { "epoch": 0.5149812734082397, "grad_norm": 2.9375, "learning_rate": 5.566732637338794e-06, "loss": 0.0822986125946045, "step": 550 }, { "epoch": 0.5196629213483146, "grad_norm": 8.9375, "learning_rate": 5.486048266724609e-06, "loss": 0.025450414419174193, "step": 555 }, { "epoch": 0.5243445692883895, "grad_norm": 12.625, "learning_rate": 5.405235911158926e-06, "loss": 0.15522534847259523, "step": 560 }, { "epoch": 0.5290262172284644, "grad_norm": 3.625, "learning_rate": 5.324316849938715e-06, "loss": 0.00591311939060688, "step": 565 }, { "epoch": 0.5337078651685393, "grad_norm": 1.0546875, "learning_rate": 5.243312390458392e-06, "loss": 0.0055886365473270415, "step": 570 }, { "epoch": 0.5383895131086143, "grad_norm": 14.625, "learning_rate": 5.162243862599221e-06, "loss": 0.022534093260765074, "step": 575 }, { "epoch": 0.5430711610486891, "grad_norm": 1.6015625, "learning_rate": 5.0811326131127816e-06, "loss": 0.02249184250831604, "step": 580 }, { "epoch": 0.547752808988764, "grad_norm": 6.375, "learning_rate": 5e-06, "loss": 0.04538615942001343, "step": 585 }, { "epoch": 0.552434456928839, "grad_norm": 22.625, "learning_rate": 4.918867386887221e-06, "loss": 0.09940661191940307, "step": 590 }, { "epoch": 0.5571161048689138, "grad_norm": 0.06640625, "learning_rate": 4.8377561374007805e-06, "loss": 0.04504017531871796, "step": 595 }, { "epoch": 0.5617977528089888, "grad_norm": 0.154296875, "learning_rate": 4.756687609541609e-06, "loss": 0.09637892842292786, "step": 600 }, { "epoch": 0.5664794007490637, "grad_norm": 29.0, "learning_rate": 4.6756831500612846e-06, "loss": 0.08984389305114746, "step": 605 }, { "epoch": 0.5711610486891385, "grad_norm": 9.375, "learning_rate": 4.594764088841075e-06, "loss": 0.0405780702829361, "step": 610 }, { "epoch": 0.5758426966292135, "grad_norm": 13.5, "learning_rate": 4.513951733275395e-06, "loss": 0.10089976787567138, "step": 615 }, { "epoch": 0.5805243445692884, "grad_norm": 10.25, "learning_rate": 4.433267362661208e-06, "loss": 0.02460857033729553, "step": 620 }, { "epoch": 0.5852059925093633, "grad_norm": 0.255859375, "learning_rate": 4.352732222594823e-06, "loss": 0.009085573256015778, "step": 625 }, { "epoch": 0.5898876404494382, "grad_norm": 6.0625, "learning_rate": 4.272367519377548e-06, "loss": 0.084967440366745, "step": 630 }, { "epoch": 0.5945692883895131, "grad_norm": 23.875, "learning_rate": 4.192194414431712e-06, "loss": 0.042571133375167845, "step": 635 }, { "epoch": 0.599250936329588, "grad_norm": 26.0, "learning_rate": 4.1122340187284845e-06, "loss": 0.10396760702133179, "step": 640 }, { "epoch": 0.6039325842696629, "grad_norm": 1.4140625, "learning_rate": 4.032507387229002e-06, "loss": 0.023634986579418184, "step": 645 }, { "epoch": 0.6086142322097379, "grad_norm": 4.28125, "learning_rate": 3.953035513340226e-06, "loss": 0.09691346287727357, "step": 650 }, { "epoch": 0.6132958801498127, "grad_norm": 18.25, "learning_rate": 3.873839323387009e-06, "loss": 0.0745917558670044, "step": 655 }, { "epoch": 0.6179775280898876, "grad_norm": 16.5, "learning_rate": 3.7949396711018404e-06, "loss": 0.12901389598846436, "step": 660 }, { "epoch": 0.6226591760299626, "grad_norm": 13.125, "learning_rate": 3.7163573321336867e-06, "loss": 0.013447463512420654, "step": 665 }, { "epoch": 0.6273408239700374, "grad_norm": 13.875, "learning_rate": 3.638112998577404e-06, "loss": 0.11524299383163453, "step": 670 }, { "epoch": 0.6320224719101124, "grad_norm": 40.5, "learning_rate": 3.560227273525162e-06, "loss": 0.06458882093429566, "step": 675 }, { "epoch": 0.6367041198501873, "grad_norm": 21.875, "learning_rate": 3.4827206656412683e-06, "loss": 0.12476457357406616, "step": 680 }, { "epoch": 0.6413857677902621, "grad_norm": 10.875, "learning_rate": 3.4056135837619077e-06, "loss": 0.09551171660423279, "step": 685 }, { "epoch": 0.6460674157303371, "grad_norm": 5.8125, "learning_rate": 3.3289263315211183e-06, "loss": 0.06344662308692932, "step": 690 }, { "epoch": 0.650749063670412, "grad_norm": 0.28125, "learning_rate": 3.252679102004509e-06, "loss": 0.015153181552886964, "step": 695 }, { "epoch": 0.6554307116104869, "grad_norm": 14.625, "learning_rate": 3.1768919724320555e-06, "loss": 0.13314547538757324, "step": 700 }, { "epoch": 0.6601123595505618, "grad_norm": 0.328125, "learning_rate": 3.101584898871431e-06, "loss": 0.0362935870885849, "step": 705 }, { "epoch": 0.6647940074906367, "grad_norm": 2.03125, "learning_rate": 3.0267777109832195e-06, "loss": 0.0032910577952861785, "step": 710 }, { "epoch": 0.6694756554307116, "grad_norm": 9.9375, "learning_rate": 2.9524901067994238e-06, "loss": 0.052766060829162596, "step": 715 }, { "epoch": 0.6741573033707865, "grad_norm": 0.796875, "learning_rate": 2.8787416475366365e-06, "loss": 0.05754781365394592, "step": 720 }, { "epoch": 0.6788389513108615, "grad_norm": 12.6875, "learning_rate": 2.805551752445222e-06, "loss": 0.04519017338752747, "step": 725 }, { "epoch": 0.6835205992509363, "grad_norm": 0.05859375, "learning_rate": 2.7329396936958973e-06, "loss": 0.11424082517623901, "step": 730 }, { "epoch": 0.6882022471910112, "grad_norm": 14.5625, "learning_rate": 2.6609245913050345e-06, "loss": 0.06839200854301453, "step": 735 }, { "epoch": 0.6928838951310862, "grad_norm": 0.1884765625, "learning_rate": 2.589525408100024e-06, "loss": 0.008566472679376602, "step": 740 }, { "epoch": 0.697565543071161, "grad_norm": 9.5625, "learning_rate": 2.518760944726042e-06, "loss": 0.02528868317604065, "step": 745 }, { "epoch": 0.702247191011236, "grad_norm": 19.0, "learning_rate": 2.448649834695503e-06, "loss": 0.11389665603637696, "step": 750 }, { "epoch": 0.7069288389513109, "grad_norm": 2.34375, "learning_rate": 2.3792105394815347e-06, "loss": 0.029075217247009278, "step": 755 }, { "epoch": 0.7116104868913857, "grad_norm": 0.7890625, "learning_rate": 2.3104613436567625e-06, "loss": 0.14182507991790771, "step": 760 }, { "epoch": 0.7162921348314607, "grad_norm": 17.5, "learning_rate": 2.2424203500786473e-06, "loss": 0.014566624164581298, "step": 765 }, { "epoch": 0.7209737827715356, "grad_norm": 0.1767578125, "learning_rate": 2.1751054751227e-06, "loss": 0.10296481847763062, "step": 770 }, { "epoch": 0.7256554307116105, "grad_norm": 14.625, "learning_rate": 2.108534443964785e-06, "loss": 0.05994282960891724, "step": 775 }, { "epoch": 0.7303370786516854, "grad_norm": 11.625, "learning_rate": 2.0427247859137706e-06, "loss": 0.13027719259262086, "step": 780 }, { "epoch": 0.7350187265917603, "grad_norm": 4.03125, "learning_rate": 1.977693829795769e-06, "loss": 0.07348175048828125, "step": 785 }, { "epoch": 0.7397003745318352, "grad_norm": 38.25, "learning_rate": 1.9134586993911384e-06, "loss": 0.11643357276916504, "step": 790 }, { "epoch": 0.7443820224719101, "grad_norm": 36.5, "learning_rate": 1.8500363089255074e-06, "loss": 0.11031646728515625, "step": 795 }, { "epoch": 0.7490636704119851, "grad_norm": 2.484375, "learning_rate": 1.7874433586159596e-06, "loss": 0.0938467264175415, "step": 800 }, { "epoch": 0.7537453183520599, "grad_norm": 9.625, "learning_rate": 1.7256963302735752e-06, "loss": 0.06049980521202088, "step": 805 }, { "epoch": 0.7584269662921348, "grad_norm": 0.6328125, "learning_rate": 1.664811482963493e-06, "loss": 0.030288994312286377, "step": 810 }, { "epoch": 0.7631086142322098, "grad_norm": 4.03125, "learning_rate": 1.604804848723603e-06, "loss": 0.04979062080383301, "step": 815 }, { "epoch": 0.7677902621722846, "grad_norm": 19.625, "learning_rate": 1.5456922283430448e-06, "loss": 0.09577327370643615, "step": 820 }, { "epoch": 0.7724719101123596, "grad_norm": 5.65625, "learning_rate": 1.4874891872015735e-06, "loss": 0.13884434700012208, "step": 825 }, { "epoch": 0.7771535580524345, "grad_norm": 1.0546875, "learning_rate": 1.430211051170935e-06, "loss": 0.07736724615097046, "step": 830 }, { "epoch": 0.7818352059925093, "grad_norm": 14.9375, "learning_rate": 1.3738729025792908e-06, "loss": 0.06385923624038696, "step": 835 }, { "epoch": 0.7865168539325843, "grad_norm": 0.283203125, "learning_rate": 1.3184895762397815e-06, "loss": 0.005019430816173553, "step": 840 }, { "epoch": 0.7911985018726592, "grad_norm": 22.375, "learning_rate": 1.2640756555442684e-06, "loss": 0.09718080163002014, "step": 845 }, { "epoch": 0.795880149812734, "grad_norm": 10.3125, "learning_rate": 1.2106454686232615e-06, "loss": 0.03934278786182403, "step": 850 }, { "epoch": 0.800561797752809, "grad_norm": 4.03125, "learning_rate": 1.1582130845730826e-06, "loss": 0.05719079971313477, "step": 855 }, { "epoch": 0.8052434456928839, "grad_norm": 4.875, "learning_rate": 1.1067923097512256e-06, "loss": 0.03711806833744049, "step": 860 }, { "epoch": 0.8099250936329588, "grad_norm": 11.0, "learning_rate": 1.0563966841408995e-06, "loss": 0.07051395773887634, "step": 865 }, { "epoch": 0.8146067415730337, "grad_norm": 21.375, "learning_rate": 1.0070394777857256e-06, "loss": 0.11728324890136718, "step": 870 }, { "epoch": 0.8192883895131086, "grad_norm": 20.875, "learning_rate": 9.587336872954906e-07, "loss": 0.050193822383880614, "step": 875 }, { "epoch": 0.8239700374531835, "grad_norm": 9.25, "learning_rate": 9.114920324239212e-07, "loss": 0.032090258598327634, "step": 880 }, { "epoch": 0.8286516853932584, "grad_norm": 1.5390625, "learning_rate": 8.65326952719357e-07, "loss": 0.069438898563385, "step": 885 }, { "epoch": 0.8333333333333334, "grad_norm": 0.8671875, "learning_rate": 8.202506042491887e-07, "loss": 0.031066751480102538, "step": 890 }, { "epoch": 0.8380149812734082, "grad_norm": 1.734375, "learning_rate": 7.762748563989653e-07, "loss": 0.09884663820266723, "step": 895 }, { "epoch": 0.8426966292134831, "grad_norm": 0.267578125, "learning_rate": 7.334112887469713e-07, "loss": 0.036754748225212096, "step": 900 }, { "epoch": 0.8473782771535581, "grad_norm": 0.5390625, "learning_rate": 6.916711880151305e-07, "loss": 0.03700563013553619, "step": 905 }, { "epoch": 0.8520599250936329, "grad_norm": 1.1015625, "learning_rate": 6.510655450970083e-07, "loss": 0.10059614181518554, "step": 910 }, { "epoch": 0.8567415730337079, "grad_norm": 20.125, "learning_rate": 6.116050521637218e-07, "loss": 0.07337114810943604, "step": 915 }, { "epoch": 0.8614232209737828, "grad_norm": 23.25, "learning_rate": 5.733000998485072e-07, "loss": 0.02071182131767273, "step": 920 }, { "epoch": 0.8661048689138576, "grad_norm": 8.0, "learning_rate": 5.361607745106817e-07, "loss": 0.039340272545814514, "step": 925 }, { "epoch": 0.8707865168539326, "grad_norm": 1.40625, "learning_rate": 5.001968555797337e-07, "loss": 0.06494908928871154, "step": 930 }, { "epoch": 0.8754681647940075, "grad_norm": 0.302734375, "learning_rate": 4.65417812980225e-07, "loss": 0.043603447079658506, "step": 935 }, { "epoch": 0.8801498127340824, "grad_norm": 17.5, "learning_rate": 4.3183280463819976e-07, "loss": 0.06408308148384094, "step": 940 }, { "epoch": 0.8848314606741573, "grad_norm": 10.6875, "learning_rate": 3.9945067406974067e-07, "loss": 0.11675176620483399, "step": 945 }, { "epoch": 0.8895131086142322, "grad_norm": 1.0625, "learning_rate": 3.6827994805231803e-07, "loss": 0.02691230773925781, "step": 950 }, { "epoch": 0.8941947565543071, "grad_norm": 28.75, "learning_rate": 3.38328834379541e-07, "loss": 0.08515161871910096, "step": 955 }, { "epoch": 0.898876404494382, "grad_norm": 8.375, "learning_rate": 3.096052196999033e-07, "loss": 0.055815601348876955, "step": 960 }, { "epoch": 0.903558052434457, "grad_norm": 5.6875, "learning_rate": 2.821166674400905e-07, "loss": 0.006412952393293381, "step": 965 }, { "epoch": 0.9082397003745318, "grad_norm": 9.125, "learning_rate": 2.5587041581340235e-07, "loss": 0.025133582949638366, "step": 970 }, { "epoch": 0.9129213483146067, "grad_norm": 0.134765625, "learning_rate": 2.3087337591379877e-07, "loss": 0.08035503029823303, "step": 975 }, { "epoch": 0.9176029962546817, "grad_norm": 36.5, "learning_rate": 2.0713212989609764e-07, "loss": 0.0592190146446228, "step": 980 }, { "epoch": 0.9222846441947565, "grad_norm": 22.625, "learning_rate": 1.8465292924276844e-07, "loss": 0.06302451491355895, "step": 985 }, { "epoch": 0.9269662921348315, "grad_norm": 5.28125, "learning_rate": 1.6344169311781788e-07, "loss": 0.08194518685340882, "step": 990 }, { "epoch": 0.9316479400749064, "grad_norm": 8.5625, "learning_rate": 1.4350400680816555e-07, "loss": 0.0555216908454895, "step": 995 }, { "epoch": 0.9363295880149812, "grad_norm": 11.5, "learning_rate": 1.2484512025294271e-07, "loss": 0.057598447799682616, "step": 1000 }, { "epoch": 0.9410112359550562, "grad_norm": 0.48046875, "learning_rate": 1.0746994666109234e-07, "loss": 0.008601938933134079, "step": 1005 }, { "epoch": 0.9456928838951311, "grad_norm": 11.5, "learning_rate": 9.138306121763585e-08, "loss": 0.030312222242355347, "step": 1010 }, { "epoch": 0.950374531835206, "grad_norm": 6.0, "learning_rate": 7.658869987894612e-08, "loss": 0.044132059812545775, "step": 1015 }, { "epoch": 0.9550561797752809, "grad_norm": 0.384765625, "learning_rate": 6.309075825734867e-08, "loss": 0.041088974475860594, "step": 1020 }, { "epoch": 0.9597378277153558, "grad_norm": 12.5, "learning_rate": 5.089279059533658e-08, "loss": 0.047975027561187746, "step": 1025 }, { "epoch": 0.9644194756554307, "grad_norm": 0.41015625, "learning_rate": 3.9998008829679745e-08, "loss": 0.035625296831130984, "step": 1030 }, { "epoch": 0.9691011235955056, "grad_norm": 11.1875, "learning_rate": 3.040928174566415e-08, "loss": 0.1642237424850464, "step": 1035 }, { "epoch": 0.9737827715355806, "grad_norm": 14.1875, "learning_rate": 2.2129134221691163e-08, "loss": 0.0951894998550415, "step": 1040 }, { "epoch": 0.9784644194756554, "grad_norm": 0.1064453125, "learning_rate": 1.51597465644332e-08, "loss": 0.10830885171890259, "step": 1045 }, { "epoch": 0.9831460674157303, "grad_norm": 4.84375, "learning_rate": 9.502953934723003e-09, "loss": 0.025810426473617552, "step": 1050 }, { "epoch": 0.9878277153558053, "grad_norm": 36.5, "learning_rate": 5.160245864319069e-09, "loss": 0.03200013339519501, "step": 1055 }, { "epoch": 0.9925093632958801, "grad_norm": 8.5625, "learning_rate": 2.1327658636927096e-09, "loss": 0.06172307133674622, "step": 1060 }, { "epoch": 0.9971910112359551, "grad_norm": 1.2890625, "learning_rate": 4.213111209155907e-10, "loss": 0.0099898561835289, "step": 1065 }, { "epoch": 1.0, "step": 1068, "total_flos": 2.024520470028288e+16, "train_loss": 0.12143641752661781, "train_runtime": 3998.8947, "train_samples_per_second": 0.534, "train_steps_per_second": 0.267 } ], "logging_steps": 5, "max_steps": 1068, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.024520470028288e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }