{ "best_global_step": 4500, "best_metric": 0.05329994, "best_model_checkpoint": "/workspace/ms-sw/swift-training-workflow/runs/qwen3-vl-4b-latex-ocr-full/v0-20260327-112050/checkpoint-4500", "epoch": 2.830188679245283, "eval_steps": 500, "global_step": 4500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006289308176100629, "grad_norm": 45.34716768625641, "learning_rate": 8.368200836820084e-08, "loss": 0.7045165300369263, "step": 1, "token_acc": 0.9067005937234945 }, { "epoch": 0.006289308176100629, "grad_norm": 20.89321839694586, "learning_rate": 8.368200836820084e-07, "loss": 0.6605887942843967, "step": 10, "token_acc": 0.9088069388002669 }, { "epoch": 0.012578616352201259, "grad_norm": 8.70654604666805, "learning_rate": 1.6736401673640167e-06, "loss": 0.4156378746032715, "step": 20, "token_acc": 0.9231436001155735 }, { "epoch": 0.018867924528301886, "grad_norm": 4.519491564596013, "learning_rate": 2.510460251046025e-06, "loss": 0.19132455587387084, "step": 30, "token_acc": 0.9457678075855689 }, { "epoch": 0.025157232704402517, "grad_norm": 6.196919953968438, "learning_rate": 3.3472803347280334e-06, "loss": 0.1531017780303955, "step": 40, "token_acc": 0.9549960123048877 }, { "epoch": 0.031446540880503145, "grad_norm": 3.6004202812134203, "learning_rate": 4.184100418410042e-06, "loss": 0.15698001384735108, "step": 50, "token_acc": 0.9532592069655254 }, { "epoch": 0.03773584905660377, "grad_norm": 3.115974891699392, "learning_rate": 5.02092050209205e-06, "loss": 0.13419015407562257, "step": 60, "token_acc": 0.9603841536614646 }, { "epoch": 0.0440251572327044, "grad_norm": 3.53139646167736, "learning_rate": 5.8577405857740585e-06, "loss": 0.13604100942611694, "step": 70, "token_acc": 0.9596007184711876 }, { "epoch": 0.050314465408805034, "grad_norm": 3.0256135958409387, "learning_rate": 6.694560669456067e-06, "loss": 0.13651411533355712, "step": 80, "token_acc": 0.9602559188607483 }, { "epoch": 0.05660377358490566, "grad_norm": 3.811738769540206, "learning_rate": 7.531380753138075e-06, "loss": 0.12942522764205933, "step": 90, "token_acc": 0.96127742262525 }, { "epoch": 0.06289308176100629, "grad_norm": 3.074401932705618, "learning_rate": 8.368200836820084e-06, "loss": 0.13949081897735596, "step": 100, "token_acc": 0.959375 }, { "epoch": 0.06918238993710692, "grad_norm": 2.843989040291082, "learning_rate": 9.205020920502092e-06, "loss": 0.13196282386779784, "step": 110, "token_acc": 0.9607720869589864 }, { "epoch": 0.07547169811320754, "grad_norm": 3.508369382446498, "learning_rate": 1.00418410041841e-05, "loss": 0.13052265644073485, "step": 120, "token_acc": 0.9622684985166623 }, { "epoch": 0.08176100628930817, "grad_norm": 3.440200280061563, "learning_rate": 1.0878661087866109e-05, "loss": 0.1312979578971863, "step": 130, "token_acc": 0.9609211828075189 }, { "epoch": 0.0880503144654088, "grad_norm": 4.201954871869706, "learning_rate": 1.1715481171548117e-05, "loss": 0.13678085803985596, "step": 140, "token_acc": 0.959023862406743 }, { "epoch": 0.09433962264150944, "grad_norm": 3.2305103935546957, "learning_rate": 1.2552301255230125e-05, "loss": 0.13586316108703614, "step": 150, "token_acc": 0.9603519195612431 }, { "epoch": 0.10062893081761007, "grad_norm": 3.1732101058142845, "learning_rate": 1.3389121338912134e-05, "loss": 0.14661180973052979, "step": 160, "token_acc": 0.9567754424778762 }, { "epoch": 0.1069182389937107, "grad_norm": 3.0981254219351912, "learning_rate": 1.4225941422594142e-05, "loss": 0.1456087589263916, "step": 170, "token_acc": 0.9564570883483433 }, { "epoch": 0.11320754716981132, "grad_norm": 2.7844698698117694, "learning_rate": 1.506276150627615e-05, "loss": 0.13063851594924927, "step": 180, "token_acc": 0.9629532927183058 }, { "epoch": 0.11949685534591195, "grad_norm": 2.582043948631837, "learning_rate": 1.589958158995816e-05, "loss": 0.1434727668762207, "step": 190, "token_acc": 0.9585141315820336 }, { "epoch": 0.12578616352201258, "grad_norm": 2.5184581779739084, "learning_rate": 1.6736401673640167e-05, "loss": 0.13957881927490234, "step": 200, "token_acc": 0.9597336624286084 }, { "epoch": 0.1320754716981132, "grad_norm": 3.8266489180800014, "learning_rate": 1.7573221757322177e-05, "loss": 0.14468042850494384, "step": 210, "token_acc": 0.9577662246626555 }, { "epoch": 0.13836477987421383, "grad_norm": 2.9201805311014466, "learning_rate": 1.8410041841004184e-05, "loss": 0.13508262634277343, "step": 220, "token_acc": 0.9598491126796788 }, { "epoch": 0.14465408805031446, "grad_norm": 2.9676576361919316, "learning_rate": 1.9246861924686194e-05, "loss": 0.1461448550224304, "step": 230, "token_acc": 0.9563967275314205 }, { "epoch": 0.1509433962264151, "grad_norm": 2.9276617136601097, "learning_rate": 1.999999759629256e-05, "loss": 0.15765782594680786, "step": 240, "token_acc": 0.9533182566937488 }, { "epoch": 0.15723270440251572, "grad_norm": 2.6683614726041727, "learning_rate": 1.9999709152797814e-05, "loss": 0.15170300006866455, "step": 250, "token_acc": 0.9559203868210953 }, { "epoch": 0.16352201257861634, "grad_norm": 3.322241566248514, "learning_rate": 1.9998939983703623e-05, "loss": 0.15328752994537354, "step": 260, "token_acc": 0.9542057698992006 }, { "epoch": 0.16981132075471697, "grad_norm": 4.064876637718516, "learning_rate": 1.9997690125987e-05, "loss": 0.14633382558822633, "step": 270, "token_acc": 0.9561424123142 }, { "epoch": 0.1761006289308176, "grad_norm": 3.1878547590503694, "learning_rate": 1.9995959639733544e-05, "loss": 0.15381388664245604, "step": 280, "token_acc": 0.954816314723749 }, { "epoch": 0.18238993710691823, "grad_norm": 4.212001601278116, "learning_rate": 1.999374860813458e-05, "loss": 0.3573228597640991, "step": 290, "token_acc": 0.9125418060200668 }, { "epoch": 0.18867924528301888, "grad_norm": 2.6659838103160354, "learning_rate": 1.9991057137483156e-05, "loss": 0.1465816617012024, "step": 300, "token_acc": 0.9558244474463254 }, { "epoch": 0.1949685534591195, "grad_norm": 2.752984167443659, "learning_rate": 1.998788535716891e-05, "loss": 0.1465825319290161, "step": 310, "token_acc": 0.9560354159149574 }, { "epoch": 0.20125786163522014, "grad_norm": 2.519397967504351, "learning_rate": 1.998423341967188e-05, "loss": 0.15822311639785766, "step": 320, "token_acc": 0.953077512994575 }, { "epoch": 0.20754716981132076, "grad_norm": 2.5939942368942885, "learning_rate": 1.9980101500555155e-05, "loss": 0.15183796882629394, "step": 330, "token_acc": 0.9557981547954553 }, { "epoch": 0.2138364779874214, "grad_norm": 3.36249390155496, "learning_rate": 1.9975489798456443e-05, "loss": 0.14122436046600342, "step": 340, "token_acc": 0.9577983222051019 }, { "epoch": 0.22012578616352202, "grad_norm": 2.532046774055889, "learning_rate": 1.9970398535078518e-05, "loss": 0.13906241655349733, "step": 350, "token_acc": 0.959155496749698 }, { "epoch": 0.22641509433962265, "grad_norm": 2.8758559441861946, "learning_rate": 1.9964827955178566e-05, "loss": 0.14161758422851561, "step": 360, "token_acc": 0.9578302863056226 }, { "epoch": 0.23270440251572327, "grad_norm": 2.510119368927116, "learning_rate": 1.99587783265564e-05, "loss": 0.13756366968154907, "step": 370, "token_acc": 0.959952686718486 }, { "epoch": 0.2389937106918239, "grad_norm": 3.128855162887323, "learning_rate": 1.995224994004163e-05, "loss": 0.13279154300689697, "step": 380, "token_acc": 0.9616177452602851 }, { "epoch": 0.24528301886792453, "grad_norm": 3.7680233143063004, "learning_rate": 1.9945243109479626e-05, "loss": 0.14165163040161133, "step": 390, "token_acc": 0.9579345850999395 }, { "epoch": 0.25157232704402516, "grad_norm": 2.7668143237049803, "learning_rate": 1.9937758171716468e-05, "loss": 0.12863855361938475, "step": 400, "token_acc": 0.960668946727726 }, { "epoch": 0.2578616352201258, "grad_norm": 2.2895343375681247, "learning_rate": 1.9929795486582745e-05, "loss": 0.13654918670654298, "step": 410, "token_acc": 0.9608831657909922 }, { "epoch": 0.2641509433962264, "grad_norm": 2.4479950246869038, "learning_rate": 1.9921355436876242e-05, "loss": 0.1467982292175293, "step": 420, "token_acc": 0.9568501501769412 }, { "epoch": 0.27044025157232704, "grad_norm": 2.4852520566549514, "learning_rate": 1.9912438428343562e-05, "loss": 0.129807186126709, "step": 430, "token_acc": 0.9607044921267633 }, { "epoch": 0.27672955974842767, "grad_norm": 2.4052015257221973, "learning_rate": 1.9903044889660595e-05, "loss": 0.1309417724609375, "step": 440, "token_acc": 0.9612522150029533 }, { "epoch": 0.2830188679245283, "grad_norm": 2.13869320934904, "learning_rate": 1.989317527241193e-05, "loss": 0.13723220825195312, "step": 450, "token_acc": 0.959500215424386 }, { "epoch": 0.2893081761006289, "grad_norm": 2.594597791053857, "learning_rate": 1.9882830051069135e-05, "loss": 0.12807786464691162, "step": 460, "token_acc": 0.9608131343945865 }, { "epoch": 0.29559748427672955, "grad_norm": 2.5797553863526415, "learning_rate": 1.9872009722967943e-05, "loss": 0.12285339832305908, "step": 470, "token_acc": 0.9625046229125772 }, { "epoch": 0.3018867924528302, "grad_norm": 2.2886282297299254, "learning_rate": 1.9860714808284364e-05, "loss": 0.1279349684715271, "step": 480, "token_acc": 0.9615531023981728 }, { "epoch": 0.3081761006289308, "grad_norm": 2.137927259030621, "learning_rate": 1.984894585000964e-05, "loss": 0.1333387613296509, "step": 490, "token_acc": 0.9601997026898157 }, { "epoch": 0.31446540880503143, "grad_norm": 3.1442649494651236, "learning_rate": 1.983670341392419e-05, "loss": 0.13746428489685059, "step": 500, "token_acc": 0.959205850392082 }, { "epoch": 0.31446540880503143, "eval_loss": 0.14142681658267975, "eval_runtime": 733.8473, "eval_samples_per_second": 11.549, "eval_steps_per_second": 2.888, "eval_token_acc": 0.958135771134416, "step": 500 }, { "epoch": 0.32075471698113206, "grad_norm": 2.1089789407513644, "learning_rate": 1.9823988088570365e-05, "loss": 0.13168094158172608, "step": 510, "token_acc": 0.9609981625274139 }, { "epoch": 0.3270440251572327, "grad_norm": 2.0388968384076707, "learning_rate": 1.9810800485224196e-05, "loss": 0.12072486877441406, "step": 520, "token_acc": 0.9647903704134442 }, { "epoch": 0.3333333333333333, "grad_norm": 1.92859398458753, "learning_rate": 1.9797141237865963e-05, "loss": 0.13192942142486572, "step": 530, "token_acc": 0.9618026930475405 }, { "epoch": 0.33962264150943394, "grad_norm": 2.279806525825656, "learning_rate": 1.9783011003149757e-05, "loss": 0.12413182258605956, "step": 540, "token_acc": 0.9630478289874409 }, { "epoch": 0.34591194968553457, "grad_norm": 1.7101152951853034, "learning_rate": 1.9768410460371893e-05, "loss": 0.11435494422912598, "step": 550, "token_acc": 0.9657330665904544 }, { "epoch": 0.3522012578616352, "grad_norm": 2.894784399552811, "learning_rate": 1.9753340311438252e-05, "loss": 0.12679414749145507, "step": 560, "token_acc": 0.9626316419339397 }, { "epoch": 0.3584905660377358, "grad_norm": 2.6598023214353863, "learning_rate": 1.9737801280830537e-05, "loss": 0.11091006994247436, "step": 570, "token_acc": 0.9672726190830383 }, { "epoch": 0.36477987421383645, "grad_norm": 2.114552593288772, "learning_rate": 1.9721794115571464e-05, "loss": 0.11245570182800294, "step": 580, "token_acc": 0.9667317874232 }, { "epoch": 0.3710691823899371, "grad_norm": 1.6750696590671157, "learning_rate": 1.9705319585188823e-05, "loss": 0.12619128227233886, "step": 590, "token_acc": 0.9624073017683971 }, { "epoch": 0.37735849056603776, "grad_norm": 2.158345192241544, "learning_rate": 1.9688378481678495e-05, "loss": 0.11731394529342651, "step": 600, "token_acc": 0.9648311615524731 }, { "epoch": 0.3836477987421384, "grad_norm": 1.473095460953491, "learning_rate": 1.9670971619466388e-05, "loss": 0.11155827045440674, "step": 610, "token_acc": 0.9661841244560735 }, { "epoch": 0.389937106918239, "grad_norm": 2.150767674822309, "learning_rate": 1.9653099835369263e-05, "loss": 0.12145018577575684, "step": 620, "token_acc": 0.964341614730072 }, { "epoch": 0.39622641509433965, "grad_norm": 2.528324828102242, "learning_rate": 1.963476398855452e-05, "loss": 0.10442175865173339, "step": 630, "token_acc": 0.9676074124107956 }, { "epoch": 0.4025157232704403, "grad_norm": 1.76400511136344, "learning_rate": 1.96159649604989e-05, "loss": 0.11156104803085327, "step": 640, "token_acc": 0.9659989999705874 }, { "epoch": 0.4088050314465409, "grad_norm": 1.9005627857000333, "learning_rate": 1.9596703654946078e-05, "loss": 0.106688392162323, "step": 650, "token_acc": 0.9679768786127168 }, { "epoch": 0.41509433962264153, "grad_norm": 2.160201745367065, "learning_rate": 1.9576980997863258e-05, "loss": 0.11403628587722778, "step": 660, "token_acc": 0.9654064597556078 }, { "epoch": 0.42138364779874216, "grad_norm": 1.9608680807748942, "learning_rate": 1.9556797937396635e-05, "loss": 0.11484429836273194, "step": 670, "token_acc": 0.9659063910667617 }, { "epoch": 0.4276729559748428, "grad_norm": 1.98197499802808, "learning_rate": 1.9536155443825808e-05, "loss": 0.11815891265869141, "step": 680, "token_acc": 0.9646716786091859 }, { "epoch": 0.4339622641509434, "grad_norm": 1.742292937981462, "learning_rate": 1.951505450951715e-05, "loss": 0.10748194456100464, "step": 690, "token_acc": 0.9683494044242767 }, { "epoch": 0.44025157232704404, "grad_norm": 1.4750243938418341, "learning_rate": 1.94934961488761e-05, "loss": 0.1117070198059082, "step": 700, "token_acc": 0.9667868396400489 }, { "epoch": 0.44654088050314467, "grad_norm": 1.7692436842372372, "learning_rate": 1.9471481398298378e-05, "loss": 0.11048619747161866, "step": 710, "token_acc": 0.9665264476863868 }, { "epoch": 0.4528301886792453, "grad_norm": 1.6345113315469586, "learning_rate": 1.9449011316120187e-05, "loss": 0.1079734206199646, "step": 720, "token_acc": 0.9672420166633492 }, { "epoch": 0.4591194968553459, "grad_norm": 2.8816190351055835, "learning_rate": 1.942608698256731e-05, "loss": 0.11024671792984009, "step": 730, "token_acc": 0.9670435621000965 }, { "epoch": 0.46540880503144655, "grad_norm": 2.0528406854349623, "learning_rate": 1.9402709499703212e-05, "loss": 0.1140676498413086, "step": 740, "token_acc": 0.9654670094258784 }, { "epoch": 0.4716981132075472, "grad_norm": 1.8157760955063753, "learning_rate": 1.9378879991376012e-05, "loss": 0.10537893772125244, "step": 750, "token_acc": 0.9682530570067168 }, { "epoch": 0.4779874213836478, "grad_norm": 1.861892023047174, "learning_rate": 1.9354599603164508e-05, "loss": 0.1153944969177246, "step": 760, "token_acc": 0.9660055798156235 }, { "epoch": 0.48427672955974843, "grad_norm": 1.607561464821473, "learning_rate": 1.932986950232306e-05, "loss": 0.10485104322433472, "step": 770, "token_acc": 0.9696273095600701 }, { "epoch": 0.49056603773584906, "grad_norm": 2.0248199444328874, "learning_rate": 1.9304690877725506e-05, "loss": 0.10481438636779786, "step": 780, "token_acc": 0.9688417381194594 }, { "epoch": 0.4968553459119497, "grad_norm": 2.726276175870277, "learning_rate": 1.9279064939807986e-05, "loss": 0.11137734651565552, "step": 790, "token_acc": 0.9666876178504086 }, { "epoch": 0.5031446540880503, "grad_norm": 1.657566011087072, "learning_rate": 1.925299292051077e-05, "loss": 0.10481610298156738, "step": 800, "token_acc": 0.9691483936766956 }, { "epoch": 0.5094339622641509, "grad_norm": 1.8815105869017927, "learning_rate": 1.9226476073219026e-05, "loss": 0.10296165943145752, "step": 810, "token_acc": 0.9680598805816645 }, { "epoch": 0.5157232704402516, "grad_norm": 1.6804255612094785, "learning_rate": 1.9199515672702558e-05, "loss": 0.1030133843421936, "step": 820, "token_acc": 0.9712198685171658 }, { "epoch": 0.5220125786163522, "grad_norm": 1.5923137757409032, "learning_rate": 1.917211301505453e-05, "loss": 0.10893633365631103, "step": 830, "token_acc": 0.9680603782248113 }, { "epoch": 0.5283018867924528, "grad_norm": 1.8175548963275856, "learning_rate": 1.9144269417629165e-05, "loss": 0.11561510562896729, "step": 840, "token_acc": 0.9666006145716666 }, { "epoch": 0.5345911949685535, "grad_norm": 1.5087489763513091, "learning_rate": 1.9115986218978396e-05, "loss": 0.10352299213409424, "step": 850, "token_acc": 0.9696626669685307 }, { "epoch": 0.5408805031446541, "grad_norm": 1.5188944733060936, "learning_rate": 1.9087264778787534e-05, "loss": 0.10984259843826294, "step": 860, "token_acc": 0.9671633237822349 }, { "epoch": 0.5471698113207547, "grad_norm": 1.5446089445746696, "learning_rate": 1.9058106477809892e-05, "loss": 0.10156491994857789, "step": 870, "token_acc": 0.9691671877973844 }, { "epoch": 0.5534591194968553, "grad_norm": 1.4274372010097796, "learning_rate": 1.9028512717800414e-05, "loss": 0.10093884468078614, "step": 880, "token_acc": 0.9690456722345521 }, { "epoch": 0.559748427672956, "grad_norm": 2.015436306862073, "learning_rate": 1.8998484921448275e-05, "loss": 0.10100080966949462, "step": 890, "token_acc": 0.9691417278373801 }, { "epoch": 0.5660377358490566, "grad_norm": 1.7641447202728093, "learning_rate": 1.8968024532308505e-05, "loss": 0.10313496589660645, "step": 900, "token_acc": 0.9694502727654217 }, { "epoch": 0.5723270440251572, "grad_norm": 2.7256435866673385, "learning_rate": 1.893713301473258e-05, "loss": 0.09976035952568055, "step": 910, "token_acc": 0.9701510251312939 }, { "epoch": 0.5786163522012578, "grad_norm": 2.7322740850122265, "learning_rate": 1.8905811853798014e-05, "loss": 0.1044041395187378, "step": 920, "token_acc": 0.968413186560515 }, { "epoch": 0.5849056603773585, "grad_norm": 1.858448409286501, "learning_rate": 1.8874062555236994e-05, "loss": 0.11018127202987671, "step": 930, "token_acc": 0.9671611053905881 }, { "epoch": 0.5911949685534591, "grad_norm": 12.802923692002347, "learning_rate": 1.884188664536397e-05, "loss": 0.10684647560119628, "step": 940, "token_acc": 0.9699350180505415 }, { "epoch": 0.5974842767295597, "grad_norm": 1.7459169896484135, "learning_rate": 1.8809285671002288e-05, "loss": 0.1010131597518921, "step": 950, "token_acc": 0.9700858356859726 }, { "epoch": 0.6037735849056604, "grad_norm": 1.7594807067351037, "learning_rate": 1.877626119940982e-05, "loss": 0.10133538246154786, "step": 960, "token_acc": 0.969674964417463 }, { "epoch": 0.610062893081761, "grad_norm": 2.4724881080506638, "learning_rate": 1.874281481820364e-05, "loss": 0.10011661052703857, "step": 970, "token_acc": 0.9699815625720212 }, { "epoch": 0.6163522012578616, "grad_norm": 1.8152867273949589, "learning_rate": 1.8708948135283672e-05, "loss": 0.09960671663284301, "step": 980, "token_acc": 0.9696417901954648 }, { "epoch": 0.6226415094339622, "grad_norm": 1.7086563311997371, "learning_rate": 1.8674662778755416e-05, "loss": 0.09888969659805298, "step": 990, "token_acc": 0.9705755084379056 }, { "epoch": 0.6289308176100629, "grad_norm": 1.8413390867049504, "learning_rate": 1.8639960396851665e-05, "loss": 0.10456682443618774, "step": 1000, "token_acc": 0.9697974961056943 }, { "epoch": 0.6289308176100629, "eval_loss": 0.09391385316848755, "eval_runtime": 721.3125, "eval_samples_per_second": 11.749, "eval_steps_per_second": 2.938, "eval_token_acc": 0.9717736562791468, "step": 1000 }, { "epoch": 0.6352201257861635, "grad_norm": 1.4004622630825614, "learning_rate": 1.8604842657853282e-05, "loss": 0.09269505739212036, "step": 1010, "token_acc": 0.9738395415472779 }, { "epoch": 0.6415094339622641, "grad_norm": 1.6048419525402355, "learning_rate": 1.8569311250008976e-05, "loss": 0.09986767768859864, "step": 1020, "token_acc": 0.9705582573749963 }, { "epoch": 0.6477987421383647, "grad_norm": 1.551914657474502, "learning_rate": 1.853336788145416e-05, "loss": 0.08478968739509582, "step": 1030, "token_acc": 0.9746740725818056 }, { "epoch": 0.6540880503144654, "grad_norm": 1.4609089183474224, "learning_rate": 1.8497014280128836e-05, "loss": 0.0968370258808136, "step": 1040, "token_acc": 0.9714946296350493 }, { "epoch": 0.660377358490566, "grad_norm": 1.6209543116867677, "learning_rate": 1.846025219369451e-05, "loss": 0.10038182735443116, "step": 1050, "token_acc": 0.971301524149431 }, { "epoch": 0.6666666666666666, "grad_norm": 1.6745782525801582, "learning_rate": 1.8423083389450184e-05, "loss": 0.09864873886108398, "step": 1060, "token_acc": 0.9706830841395485 }, { "epoch": 0.6729559748427673, "grad_norm": 1.91824919274026, "learning_rate": 1.8385509654247405e-05, "loss": 0.08868163824081421, "step": 1070, "token_acc": 0.9730788746687715 }, { "epoch": 0.6792452830188679, "grad_norm": 1.5103142849955444, "learning_rate": 1.834753279440434e-05, "loss": 0.08533757328987121, "step": 1080, "token_acc": 0.9747726440859307 }, { "epoch": 0.6855345911949685, "grad_norm": 1.6371011754605465, "learning_rate": 1.8309154635618967e-05, "loss": 0.09762944579124451, "step": 1090, "token_acc": 0.9705254515599343 }, { "epoch": 0.6918238993710691, "grad_norm": 1.5207551294053872, "learning_rate": 1.827037702288128e-05, "loss": 0.08945590853691102, "step": 1100, "token_acc": 0.973365268078721 }, { "epoch": 0.6981132075471698, "grad_norm": 1.2701931582388237, "learning_rate": 1.8231201820384615e-05, "loss": 0.08889811635017394, "step": 1110, "token_acc": 0.9730332580285396 }, { "epoch": 0.7044025157232704, "grad_norm": 1.539922447920576, "learning_rate": 1.819163091143602e-05, "loss": 0.08988333940505981, "step": 1120, "token_acc": 0.9729425695677916 }, { "epoch": 0.710691823899371, "grad_norm": 1.5525273016055445, "learning_rate": 1.815166619836571e-05, "loss": 0.08635894060134888, "step": 1130, "token_acc": 0.9734208416238529 }, { "epoch": 0.7169811320754716, "grad_norm": 1.6311000903415054, "learning_rate": 1.8111309602435624e-05, "loss": 0.0973101019859314, "step": 1140, "token_acc": 0.9719618030360201 }, { "epoch": 0.7232704402515723, "grad_norm": 1.2925549737286608, "learning_rate": 1.807056306374708e-05, "loss": 0.09141036868095398, "step": 1150, "token_acc": 0.9733011774719819 }, { "epoch": 0.7295597484276729, "grad_norm": 1.3801237808827327, "learning_rate": 1.802942854114747e-05, "loss": 0.10020852088928223, "step": 1160, "token_acc": 0.9716870401648965 }, { "epoch": 0.7358490566037735, "grad_norm": 1.6226969361595323, "learning_rate": 1.798790801213611e-05, "loss": 0.095181143283844, "step": 1170, "token_acc": 0.971852404453116 }, { "epoch": 0.7421383647798742, "grad_norm": 1.63651004169666, "learning_rate": 1.7946003472769175e-05, "loss": 0.09436390399932862, "step": 1180, "token_acc": 0.97247915302957 }, { "epoch": 0.7484276729559748, "grad_norm": 1.545200648396311, "learning_rate": 1.7903716937563737e-05, "loss": 0.08995594978332519, "step": 1190, "token_acc": 0.9729257883943911 }, { "epoch": 0.7547169811320755, "grad_norm": 1.4510430382097352, "learning_rate": 1.786105043940092e-05, "loss": 0.09402973651885986, "step": 1200, "token_acc": 0.9736291690220463 }, { "epoch": 0.7610062893081762, "grad_norm": 1.542152182505383, "learning_rate": 1.7818006029428175e-05, "loss": 0.08624240159988403, "step": 1210, "token_acc": 0.9744192715801548 }, { "epoch": 0.7672955974842768, "grad_norm": 1.5153951462525146, "learning_rate": 1.7774585776960663e-05, "loss": 0.09034521579742431, "step": 1220, "token_acc": 0.9742687735822293 }, { "epoch": 0.7735849056603774, "grad_norm": 1.584286377289857, "learning_rate": 1.7730791769381786e-05, "loss": 0.1029138445854187, "step": 1230, "token_acc": 0.9693110282117043 }, { "epoch": 0.779874213836478, "grad_norm": 1.1675776428936107, "learning_rate": 1.7686626112042827e-05, "loss": 0.0900284469127655, "step": 1240, "token_acc": 0.9731250174060768 }, { "epoch": 0.7861635220125787, "grad_norm": 1.7145950918174153, "learning_rate": 1.7642090928161754e-05, "loss": 0.0883266270160675, "step": 1250, "token_acc": 0.974335142266063 }, { "epoch": 0.7924528301886793, "grad_norm": 1.5572882087279911, "learning_rate": 1.7597188358721135e-05, "loss": 0.08593888878822327, "step": 1260, "token_acc": 0.974071787635594 }, { "epoch": 0.7987421383647799, "grad_norm": 1.587797606510951, "learning_rate": 1.7551920562365213e-05, "loss": 0.08575895428657532, "step": 1270, "token_acc": 0.9759103973451065 }, { "epoch": 0.8050314465408805, "grad_norm": 1.3799851052052912, "learning_rate": 1.7506289715296134e-05, "loss": 0.0863305687904358, "step": 1280, "token_acc": 0.9738367941504099 }, { "epoch": 0.8113207547169812, "grad_norm": 1.859574713074995, "learning_rate": 1.7460298011169333e-05, "loss": 0.08664470314979553, "step": 1290, "token_acc": 0.9729901581333629 }, { "epoch": 0.8176100628930818, "grad_norm": 1.3443565152248043, "learning_rate": 1.7413947660988067e-05, "loss": 0.08178459405899048, "step": 1300, "token_acc": 0.9748381649310441 }, { "epoch": 0.8238993710691824, "grad_norm": 1.5494391575974118, "learning_rate": 1.7367240892997137e-05, "loss": 0.08511111736297608, "step": 1310, "token_acc": 0.9751603261658106 }, { "epoch": 0.8301886792452831, "grad_norm": 1.7615625697478026, "learning_rate": 1.732017995257575e-05, "loss": 0.08447937965393067, "step": 1320, "token_acc": 0.9743189557321226 }, { "epoch": 0.8364779874213837, "grad_norm": 1.4089633030126132, "learning_rate": 1.7272767102129586e-05, "loss": 0.0828474998474121, "step": 1330, "token_acc": 0.9742417015067762 }, { "epoch": 0.8427672955974843, "grad_norm": 1.6592961075137602, "learning_rate": 1.7225004620982035e-05, "loss": 0.08787728548049926, "step": 1340, "token_acc": 0.973914992575159 }, { "epoch": 0.8490566037735849, "grad_norm": 1.5117166617272377, "learning_rate": 1.7176894805264616e-05, "loss": 0.0850176215171814, "step": 1350, "token_acc": 0.9755728811549521 }, { "epoch": 0.8553459119496856, "grad_norm": 1.5748027114678638, "learning_rate": 1.7128439967806598e-05, "loss": 0.0911492645740509, "step": 1360, "token_acc": 0.9742503907915305 }, { "epoch": 0.8616352201257862, "grad_norm": 1.4473190275546743, "learning_rate": 1.707964243802381e-05, "loss": 0.08639097213745117, "step": 1370, "token_acc": 0.9745522819179665 }, { "epoch": 0.8679245283018868, "grad_norm": 1.379465232062877, "learning_rate": 1.7030504561806657e-05, "loss": 0.087519371509552, "step": 1380, "token_acc": 0.973626242781449 }, { "epoch": 0.8742138364779874, "grad_norm": 1.3921602039818965, "learning_rate": 1.6981028701407342e-05, "loss": 0.07906639575958252, "step": 1390, "token_acc": 0.9765992799778455 }, { "epoch": 0.8805031446540881, "grad_norm": 1.3231345110824035, "learning_rate": 1.6931217235326313e-05, "loss": 0.08956987261772156, "step": 1400, "token_acc": 0.9740674539221145 }, { "epoch": 0.8867924528301887, "grad_norm": 1.2608029267930918, "learning_rate": 1.6881072558197904e-05, "loss": 0.0810630202293396, "step": 1410, "token_acc": 0.9758955958099427 }, { "epoch": 0.8930817610062893, "grad_norm": 1.4322673455962067, "learning_rate": 1.6830597080675222e-05, "loss": 0.0859791398048401, "step": 1420, "token_acc": 0.9743442575905412 }, { "epoch": 0.89937106918239, "grad_norm": 1.8746897805081975, "learning_rate": 1.6779793229314268e-05, "loss": 0.08041177988052368, "step": 1430, "token_acc": 0.9760162243188432 }, { "epoch": 0.9056603773584906, "grad_norm": 1.3274725836940766, "learning_rate": 1.672866344645726e-05, "loss": 0.0832511305809021, "step": 1440, "token_acc": 0.9765505920975496 }, { "epoch": 0.9119496855345912, "grad_norm": 1.6426325467290352, "learning_rate": 1.667721019011524e-05, "loss": 0.07603156566619873, "step": 1450, "token_acc": 0.9772056082805287 }, { "epoch": 0.9182389937106918, "grad_norm": 1.4993824513571612, "learning_rate": 1.6625435933849912e-05, "loss": 0.07640135288238525, "step": 1460, "token_acc": 0.9774820023438808 }, { "epoch": 0.9245283018867925, "grad_norm": 1.7011352962645643, "learning_rate": 1.657334316665469e-05, "loss": 0.08306714296340942, "step": 1470, "token_acc": 0.9748449545006665 }, { "epoch": 0.9308176100628931, "grad_norm": 1.4785906863778826, "learning_rate": 1.6520934392835096e-05, "loss": 0.08007491827011108, "step": 1480, "token_acc": 0.9753671523023814 }, { "epoch": 0.9371069182389937, "grad_norm": 1.215720614479762, "learning_rate": 1.6468212131888324e-05, "loss": 0.08180446624755859, "step": 1490, "token_acc": 0.9762308998302207 }, { "epoch": 0.9433962264150944, "grad_norm": 1.394799734230705, "learning_rate": 1.6415178918382146e-05, "loss": 0.08114392161369324, "step": 1500, "token_acc": 0.9746889624472093 }, { "epoch": 0.9433962264150944, "eval_loss": 0.08027087152004242, "eval_runtime": 722.6841, "eval_samples_per_second": 11.727, "eval_steps_per_second": 2.932, "eval_token_acc": 0.9762356642152163, "step": 1500 }, { "epoch": 0.949685534591195, "grad_norm": 1.3907727949176176, "learning_rate": 1.6361837301833043e-05, "loss": 0.0873852789402008, "step": 1510, "token_acc": 0.9735111006695641 }, { "epoch": 0.9559748427672956, "grad_norm": 1.6463543824854343, "learning_rate": 1.6308189846583658e-05, "loss": 0.07256322503089904, "step": 1520, "token_acc": 0.9783221138163163 }, { "epoch": 0.9622641509433962, "grad_norm": 1.5507672072885261, "learning_rate": 1.6254239131679516e-05, "loss": 0.09013139009475708, "step": 1530, "token_acc": 0.9730759166109383 }, { "epoch": 0.9685534591194969, "grad_norm": 1.5083161719535216, "learning_rate": 1.6199987750745022e-05, "loss": 0.08049647212028503, "step": 1540, "token_acc": 0.9760573226144704 }, { "epoch": 0.9748427672955975, "grad_norm": 1.9355118295883647, "learning_rate": 1.61454383118588e-05, "loss": 0.08110915422439575, "step": 1550, "token_acc": 0.9759386260606765 }, { "epoch": 0.9811320754716981, "grad_norm": 1.4453680164285594, "learning_rate": 1.6090593437428284e-05, "loss": 0.08374351263046265, "step": 1560, "token_acc": 0.9741173766912757 }, { "epoch": 0.9874213836477987, "grad_norm": 1.8392039122485897, "learning_rate": 1.603545576406368e-05, "loss": 0.07776378393173218, "step": 1570, "token_acc": 0.9767461939160216 }, { "epoch": 0.9937106918238994, "grad_norm": 1.8713989393813286, "learning_rate": 1.5980027942451185e-05, "loss": 0.07840796709060668, "step": 1580, "token_acc": 0.9758594831564374 }, { "epoch": 1.0, "grad_norm": 1.7874037128786733, "learning_rate": 1.5924312637225582e-05, "loss": 0.08290352821350097, "step": 1590, "token_acc": 0.9754263655220097 }, { "epoch": 1.0062893081761006, "grad_norm": 1.0863843323954543, "learning_rate": 1.586831252684212e-05, "loss": 0.054670459032058714, "step": 1600, "token_acc": 0.9831636765276576 }, { "epoch": 1.0125786163522013, "grad_norm": 1.2355860009095903, "learning_rate": 1.581203030344776e-05, "loss": 0.06227320432662964, "step": 1610, "token_acc": 0.9807899784224406 }, { "epoch": 1.0188679245283019, "grad_norm": 1.0627219529639433, "learning_rate": 1.5755468672751762e-05, "loss": 0.06504143476486206, "step": 1620, "token_acc": 0.9811943349394072 }, { "epoch": 1.0251572327044025, "grad_norm": 1.2030481551752021, "learning_rate": 1.5698630353895588e-05, "loss": 0.06021591424942017, "step": 1630, "token_acc": 0.9817465792802116 }, { "epoch": 1.0314465408805031, "grad_norm": 0.9952274038110551, "learning_rate": 1.5641518079322198e-05, "loss": 0.059084004163742064, "step": 1640, "token_acc": 0.981861242685985 }, { "epoch": 1.0377358490566038, "grad_norm": 1.1029301430587275, "learning_rate": 1.55841345946447e-05, "loss": 0.062122118473052976, "step": 1650, "token_acc": 0.9809982194524817 }, { "epoch": 1.0440251572327044, "grad_norm": 1.0631570096219156, "learning_rate": 1.5526482658514325e-05, "loss": 0.06019622087478638, "step": 1660, "token_acc": 0.9813258905449732 }, { "epoch": 1.050314465408805, "grad_norm": 1.2452397083425584, "learning_rate": 1.5468565042487844e-05, "loss": 0.0565668523311615, "step": 1670, "token_acc": 0.98244754438552 }, { "epoch": 1.0566037735849056, "grad_norm": 1.2893720007251595, "learning_rate": 1.541038453089431e-05, "loss": 0.05943567156791687, "step": 1680, "token_acc": 0.9819022607901348 }, { "epoch": 1.0628930817610063, "grad_norm": 1.4405565971353904, "learning_rate": 1.5351943920701194e-05, "loss": 0.06467938423156738, "step": 1690, "token_acc": 0.9807256911344138 }, { "epoch": 1.069182389937107, "grad_norm": 1.6088355986810434, "learning_rate": 1.5293246021379952e-05, "loss": 0.05654835104942322, "step": 1700, "token_acc": 0.982938223602124 }, { "epoch": 1.0754716981132075, "grad_norm": 1.429067962964369, "learning_rate": 1.5234293654770942e-05, "loss": 0.05899702310562134, "step": 1710, "token_acc": 0.9819976934544739 }, { "epoch": 1.0817610062893082, "grad_norm": 1.4383654244646944, "learning_rate": 1.5175089654947765e-05, "loss": 0.056074118614196776, "step": 1720, "token_acc": 0.9819119189094722 }, { "epoch": 1.0880503144654088, "grad_norm": 1.224480893565136, "learning_rate": 1.5115636868081031e-05, "loss": 0.060695433616638185, "step": 1730, "token_acc": 0.9811226333787428 }, { "epoch": 1.0943396226415094, "grad_norm": 1.2853685041540284, "learning_rate": 1.5055938152301533e-05, "loss": 0.0551666259765625, "step": 1740, "token_acc": 0.9832683138274816 }, { "epoch": 1.10062893081761, "grad_norm": 1.4811194652333022, "learning_rate": 1.4995996377562831e-05, "loss": 0.06323553323745727, "step": 1750, "token_acc": 0.9809037193503268 }, { "epoch": 1.1069182389937107, "grad_norm": 1.4583514142720422, "learning_rate": 1.4935814425503301e-05, "loss": 0.0589030921459198, "step": 1760, "token_acc": 0.9828337176530874 }, { "epoch": 1.1132075471698113, "grad_norm": 1.1852990258851064, "learning_rate": 1.4875395189307584e-05, "loss": 0.05278688669204712, "step": 1770, "token_acc": 0.9831741052331384 }, { "epoch": 1.119496855345912, "grad_norm": 1.569277335890507, "learning_rate": 1.4814741573567514e-05, "loss": 0.053292316198348996, "step": 1780, "token_acc": 0.9836212116843046 }, { "epoch": 1.1257861635220126, "grad_norm": 1.1753581015135361, "learning_rate": 1.475385649414248e-05, "loss": 0.059940147399902347, "step": 1790, "token_acc": 0.9815808802757218 }, { "epoch": 1.1320754716981132, "grad_norm": 1.3654141810969276, "learning_rate": 1.4692742878019238e-05, "loss": 0.062000393867492676, "step": 1800, "token_acc": 0.9807436329754193 }, { "epoch": 1.1383647798742138, "grad_norm": 1.1668245354459672, "learning_rate": 1.4631403663171215e-05, "loss": 0.06278970837593079, "step": 1810, "token_acc": 0.9808653680399931 }, { "epoch": 1.1446540880503144, "grad_norm": 1.108737675654481, "learning_rate": 1.4569841798417257e-05, "loss": 0.052794039249420166, "step": 1820, "token_acc": 0.9834860773577724 }, { "epoch": 1.150943396226415, "grad_norm": 1.3645629847734626, "learning_rate": 1.4508060243279878e-05, "loss": 0.054091382026672366, "step": 1830, "token_acc": 0.9830678432271373 }, { "epoch": 1.1572327044025157, "grad_norm": 1.075485335510402, "learning_rate": 1.4446061967842973e-05, "loss": 0.05640895962715149, "step": 1840, "token_acc": 0.9829748578796226 }, { "epoch": 1.1635220125786163, "grad_norm": 1.1673617984452092, "learning_rate": 1.4383849952609041e-05, "loss": 0.05560168027877808, "step": 1850, "token_acc": 0.9829094373394152 }, { "epoch": 1.169811320754717, "grad_norm": 1.178620225968234, "learning_rate": 1.4321427188355901e-05, "loss": 0.06277099251747131, "step": 1860, "token_acc": 0.9805737056462246 }, { "epoch": 1.1761006289308176, "grad_norm": 4.438647349024938, "learning_rate": 1.425879667599291e-05, "loss": 0.05321987271308899, "step": 1870, "token_acc": 0.9836384771868643 }, { "epoch": 1.1823899371069182, "grad_norm": 0.8089960660126764, "learning_rate": 1.4195961426416695e-05, "loss": 0.05596756935119629, "step": 1880, "token_acc": 0.982458692110775 }, { "epoch": 1.1886792452830188, "grad_norm": 1.098178154828662, "learning_rate": 1.4132924460366422e-05, "loss": 0.056302571296691896, "step": 1890, "token_acc": 0.9826823556622047 }, { "epoch": 1.1949685534591195, "grad_norm": 1.6098174552103235, "learning_rate": 1.4069688808278555e-05, "loss": 0.05291517972946167, "step": 1900, "token_acc": 0.9840382360898188 }, { "epoch": 1.20125786163522, "grad_norm": 1.1838009862882108, "learning_rate": 1.4006257510141185e-05, "loss": 0.05891250967979431, "step": 1910, "token_acc": 0.9826159865846842 }, { "epoch": 1.2075471698113207, "grad_norm": 1.0812253602188817, "learning_rate": 1.3942633615347894e-05, "loss": 0.05368814468383789, "step": 1920, "token_acc": 0.9829718640093786 }, { "epoch": 1.2138364779874213, "grad_norm": 1.0195075509939109, "learning_rate": 1.3878820182551128e-05, "loss": 0.058639413118362425, "step": 1930, "token_acc": 0.9828350271803357 }, { "epoch": 1.220125786163522, "grad_norm": 1.3170894091360277, "learning_rate": 1.3814820279515187e-05, "loss": 0.05739471912384033, "step": 1940, "token_acc": 0.982506542268745 }, { "epoch": 1.2264150943396226, "grad_norm": 1.0061802280737047, "learning_rate": 1.3750636982968737e-05, "loss": 0.052751028537750246, "step": 1950, "token_acc": 0.9834080717488789 }, { "epoch": 1.2327044025157232, "grad_norm": 1.1823477189308416, "learning_rate": 1.3686273378456887e-05, "loss": 0.05880435705184937, "step": 1960, "token_acc": 0.9824817101051669 }, { "epoch": 1.2389937106918238, "grad_norm": 1.22554558640998, "learning_rate": 1.3621732560192871e-05, "loss": 0.05745119452476501, "step": 1970, "token_acc": 0.9821925431274346 }, { "epoch": 1.2452830188679245, "grad_norm": 1.3279829643490002, "learning_rate": 1.3557017630909281e-05, "loss": 0.05679129958152771, "step": 1980, "token_acc": 0.9827537461125248 }, { "epoch": 1.251572327044025, "grad_norm": 1.307619119343479, "learning_rate": 1.3492131701708923e-05, "loss": 0.05341850519180298, "step": 1990, "token_acc": 0.9835128725319313 }, { "epoch": 1.2578616352201257, "grad_norm": 0.9629697375654898, "learning_rate": 1.342707789191524e-05, "loss": 0.0589292049407959, "step": 2000, "token_acc": 0.982400046088259 }, { "epoch": 1.2578616352201257, "eval_loss": 0.07104787975549698, "eval_runtime": 730.4468, "eval_samples_per_second": 11.602, "eval_steps_per_second": 2.901, "eval_token_acc": 0.978991514710327, "step": 2000 }, { "epoch": 1.2641509433962264, "grad_norm": 1.1453083559028814, "learning_rate": 1.3361859328922368e-05, "loss": 0.05606248378753662, "step": 2010, "token_acc": 0.9830185358986392 }, { "epoch": 1.270440251572327, "grad_norm": 1.204837950589622, "learning_rate": 1.3296479148044772e-05, "loss": 0.05325353145599365, "step": 2020, "token_acc": 0.9833405389825624 }, { "epoch": 1.2767295597484276, "grad_norm": 2.6011333489424926, "learning_rate": 1.323094049236653e-05, "loss": 0.05622377395629883, "step": 2030, "token_acc": 0.9828592371232838 }, { "epoch": 1.2830188679245282, "grad_norm": 1.232823511568605, "learning_rate": 1.3165246512590234e-05, "loss": 0.05152193307876587, "step": 2040, "token_acc": 0.9835936818221486 }, { "epoch": 1.2893081761006289, "grad_norm": 1.1466821617032141, "learning_rate": 1.3099400366885525e-05, "loss": 0.05622357130050659, "step": 2050, "token_acc": 0.9826379815927765 }, { "epoch": 1.2955974842767295, "grad_norm": 0.8472290173088248, "learning_rate": 1.3033405220737247e-05, "loss": 0.05588294267654419, "step": 2060, "token_acc": 0.982960565881039 }, { "epoch": 1.3018867924528301, "grad_norm": 0.8561022792687387, "learning_rate": 1.29672642467933e-05, "loss": 0.05827444791793823, "step": 2070, "token_acc": 0.9819563431522121 }, { "epoch": 1.3081761006289307, "grad_norm": 0.8843280358103902, "learning_rate": 1.2900980624712102e-05, "loss": 0.053099215030670166, "step": 2080, "token_acc": 0.9832570120206068 }, { "epoch": 1.3144654088050314, "grad_norm": 0.9595233250713293, "learning_rate": 1.283455754100972e-05, "loss": 0.054636824131011966, "step": 2090, "token_acc": 0.9834532583680484 }, { "epoch": 1.320754716981132, "grad_norm": 1.0568180647703327, "learning_rate": 1.2767998188906707e-05, "loss": 0.0547333836555481, "step": 2100, "token_acc": 0.9825695735337697 }, { "epoch": 1.3270440251572326, "grad_norm": 1.0358582290569485, "learning_rate": 1.2701305768174568e-05, "loss": 0.04895033836364746, "step": 2110, "token_acc": 0.9852507374631269 }, { "epoch": 1.3333333333333333, "grad_norm": 1.165756196741812, "learning_rate": 1.2634483484981947e-05, "loss": 0.05529743432998657, "step": 2120, "token_acc": 0.9829623215931625 }, { "epoch": 1.3396226415094339, "grad_norm": 0.9398124420553513, "learning_rate": 1.2567534551740495e-05, "loss": 0.05000982284545898, "step": 2130, "token_acc": 0.9845889904457656 }, { "epoch": 1.3459119496855345, "grad_norm": 1.04779088754709, "learning_rate": 1.250046218695042e-05, "loss": 0.05204824805259704, "step": 2140, "token_acc": 0.9837851750171586 }, { "epoch": 1.3522012578616351, "grad_norm": 1.1911828992184066, "learning_rate": 1.2433269615045787e-05, "loss": 0.059820497035980226, "step": 2150, "token_acc": 0.9811458424539591 }, { "epoch": 1.3584905660377358, "grad_norm": 1.1318809840091484, "learning_rate": 1.2365960066239481e-05, "loss": 0.04887381792068481, "step": 2160, "token_acc": 0.9850779142117725 }, { "epoch": 1.3647798742138364, "grad_norm": 1.18575128110939, "learning_rate": 1.2298536776367939e-05, "loss": 0.05481389760971069, "step": 2170, "token_acc": 0.9830817516066358 }, { "epoch": 1.371069182389937, "grad_norm": 1.4705395863488706, "learning_rate": 1.2231002986735581e-05, "loss": 0.053452318906784056, "step": 2180, "token_acc": 0.9840079667271981 }, { "epoch": 1.3773584905660377, "grad_norm": 1.1381781742512012, "learning_rate": 1.2163361943958978e-05, "loss": 0.05643002986907959, "step": 2190, "token_acc": 0.9830008052250157 }, { "epoch": 1.3836477987421385, "grad_norm": 1.108236107230958, "learning_rate": 1.2095616899810791e-05, "loss": 0.05098185539245605, "step": 2200, "token_acc": 0.983818037795632 }, { "epoch": 1.389937106918239, "grad_norm": 1.3111447759496122, "learning_rate": 1.202777111106344e-05, "loss": 0.05649634599685669, "step": 2210, "token_acc": 0.9826166514181153 }, { "epoch": 1.3962264150943398, "grad_norm": 1.199633245486944, "learning_rate": 1.1959827839332534e-05, "loss": 0.05167834758758545, "step": 2220, "token_acc": 0.9843418784093158 }, { "epoch": 1.4025157232704402, "grad_norm": 0.8666899643086994, "learning_rate": 1.1891790350920078e-05, "loss": 0.050674861669540404, "step": 2230, "token_acc": 0.9848721508735393 }, { "epoch": 1.408805031446541, "grad_norm": 1.0104295447062845, "learning_rate": 1.1823661916657441e-05, "loss": 0.05399259328842163, "step": 2240, "token_acc": 0.9835477202251063 }, { "epoch": 1.4150943396226414, "grad_norm": 1.0519494154431606, "learning_rate": 1.175544581174812e-05, "loss": 0.05260412693023682, "step": 2250, "token_acc": 0.9835558595260043 }, { "epoch": 1.4213836477987423, "grad_norm": 1.068852503916311, "learning_rate": 1.168714531561029e-05, "loss": 0.05003952980041504, "step": 2260, "token_acc": 0.9841691670238606 }, { "epoch": 1.4276729559748427, "grad_norm": 1.3301770971060891, "learning_rate": 1.1618763711719146e-05, "loss": 0.05456411838531494, "step": 2270, "token_acc": 0.983095406360424 }, { "epoch": 1.4339622641509435, "grad_norm": 1.4290925190264645, "learning_rate": 1.155030428744905e-05, "loss": 0.05128795504570007, "step": 2280, "token_acc": 0.9836185750133928 }, { "epoch": 1.440251572327044, "grad_norm": 1.0237953855660737, "learning_rate": 1.1481770333915503e-05, "loss": 0.05018550753593445, "step": 2290, "token_acc": 0.9849615590413698 }, { "epoch": 1.4465408805031448, "grad_norm": 0.9772590029300261, "learning_rate": 1.1413165145816915e-05, "loss": 0.04862302839756012, "step": 2300, "token_acc": 0.9849491445705215 }, { "epoch": 1.4528301886792452, "grad_norm": 1.6101981573211892, "learning_rate": 1.1344492021276234e-05, "loss": 0.05587894320487976, "step": 2310, "token_acc": 0.982574528060135 }, { "epoch": 1.459119496855346, "grad_norm": 1.0729401899055404, "learning_rate": 1.1275754261682373e-05, "loss": 0.04833915829658508, "step": 2320, "token_acc": 0.9852789472216801 }, { "epoch": 1.4654088050314464, "grad_norm": 1.138669296120146, "learning_rate": 1.1206955171531509e-05, "loss": 0.05411947965621948, "step": 2330, "token_acc": 0.9835947825087629 }, { "epoch": 1.4716981132075473, "grad_norm": 0.9193723256228759, "learning_rate": 1.113809805826823e-05, "loss": 0.055597662925720215, "step": 2340, "token_acc": 0.9836160314387262 }, { "epoch": 1.4779874213836477, "grad_norm": 1.15943849177995, "learning_rate": 1.1069186232126512e-05, "loss": 0.05205290317535401, "step": 2350, "token_acc": 0.9842589513357721 }, { "epoch": 1.4842767295597485, "grad_norm": 1.0278093026309196, "learning_rate": 1.1000223005970603e-05, "loss": 0.047948446869850156, "step": 2360, "token_acc": 0.9846708246708247 }, { "epoch": 1.490566037735849, "grad_norm": 0.9090992104674461, "learning_rate": 1.0931211695135753e-05, "loss": 0.04863405227661133, "step": 2370, "token_acc": 0.9849871620601117 }, { "epoch": 1.4968553459119498, "grad_norm": 0.9630374367443844, "learning_rate": 1.086215561726883e-05, "loss": 0.051735520362854004, "step": 2380, "token_acc": 0.9836164352117526 }, { "epoch": 1.5031446540880502, "grad_norm": 0.9821758509138084, "learning_rate": 1.0793058092168833e-05, "loss": 0.05245034694671631, "step": 2390, "token_acc": 0.9844871432608081 }, { "epoch": 1.509433962264151, "grad_norm": 1.961427641140477, "learning_rate": 1.07239224416273e-05, "loss": 0.052685320377349854, "step": 2400, "token_acc": 0.9839027595269383 }, { "epoch": 1.5157232704402515, "grad_norm": 1.023089649058638, "learning_rate": 1.0654751989268588e-05, "loss": 0.05271808505058288, "step": 2410, "token_acc": 0.984044630404463 }, { "epoch": 1.5220125786163523, "grad_norm": 1.1362717857926448, "learning_rate": 1.0585550060390141e-05, "loss": 0.045315057039260864, "step": 2420, "token_acc": 0.986439398339044 }, { "epoch": 1.5283018867924527, "grad_norm": 1.1540779189780639, "learning_rate": 1.0516319981802589e-05, "loss": 0.05070680379867554, "step": 2430, "token_acc": 0.9837349397590361 }, { "epoch": 1.5345911949685536, "grad_norm": 1.370378196938021, "learning_rate": 1.0447065081669834e-05, "loss": 0.05064605474472046, "step": 2440, "token_acc": 0.984635722679201 }, { "epoch": 1.540880503144654, "grad_norm": 1.0034426130585499, "learning_rate": 1.037778868934905e-05, "loss": 0.05282008647918701, "step": 2450, "token_acc": 0.9836287259753712 }, { "epoch": 1.5471698113207548, "grad_norm": 1.0534531097485125, "learning_rate": 1.0308494135230616e-05, "loss": 0.05066592693328857, "step": 2460, "token_acc": 0.9850426723758688 }, { "epoch": 1.5534591194968552, "grad_norm": 1.0764284563415414, "learning_rate": 1.023918475057803e-05, "loss": 0.046628701686859134, "step": 2470, "token_acc": 0.9858265424912689 }, { "epoch": 1.559748427672956, "grad_norm": 1.005389844740969, "learning_rate": 1.0169863867367755e-05, "loss": 0.0455363005399704, "step": 2480, "token_acc": 0.9856986332968111 }, { "epoch": 1.5660377358490565, "grad_norm": 1.5808354451192173, "learning_rate": 1.010053481812903e-05, "loss": 0.04960752725601196, "step": 2490, "token_acc": 0.9848381212950297 }, { "epoch": 1.5723270440251573, "grad_norm": 0.8146209429997286, "learning_rate": 1.003120093578366e-05, "loss": 0.049983108043670656, "step": 2500, "token_acc": 0.9847141288742505 }, { "epoch": 1.5723270440251573, "eval_loss": 0.06294354796409607, "eval_runtime": 720.1002, "eval_samples_per_second": 11.769, "eval_steps_per_second": 2.943, "eval_token_acc": 0.9814012589817003, "step": 2500 }, { "epoch": 1.5786163522012577, "grad_norm": 1.0779724413765672, "learning_rate": 9.961865553485808e-06, "loss": 0.05379664897918701, "step": 2510, "token_acc": 0.9826207228336039 }, { "epoch": 1.5849056603773586, "grad_norm": 1.250981230954497, "learning_rate": 9.892532004461746e-06, "loss": 0.0480253279209137, "step": 2520, "token_acc": 0.9850908663177086 }, { "epoch": 1.591194968553459, "grad_norm": 0.9282644695610834, "learning_rate": 9.8232036218496e-06, "loss": 0.04891929030418396, "step": 2530, "token_acc": 0.9845905700048965 }, { "epoch": 1.5974842767295598, "grad_norm": 0.8282640147021464, "learning_rate": 9.753883738539147e-06, "loss": 0.047617962956428526, "step": 2540, "token_acc": 0.9850088183421517 }, { "epoch": 1.6037735849056602, "grad_norm": 0.9724621116688568, "learning_rate": 9.684575687011546e-06, "loss": 0.05135564804077149, "step": 2550, "token_acc": 0.9850393473811652 }, { "epoch": 1.610062893081761, "grad_norm": 1.271084592116781, "learning_rate": 9.615282799179174e-06, "loss": 0.044367313385009766, "step": 2560, "token_acc": 0.9855293221629855 }, { "epoch": 1.6163522012578615, "grad_norm": 1.090033419311402, "learning_rate": 9.546008406225414e-06, "loss": 0.047433316707611084, "step": 2570, "token_acc": 0.9850305411373348 }, { "epoch": 1.6226415094339623, "grad_norm": 0.9385748626886379, "learning_rate": 9.476755838444542e-06, "loss": 0.04910559058189392, "step": 2580, "token_acc": 0.9851407424021532 }, { "epoch": 1.6289308176100628, "grad_norm": 0.8144990044132233, "learning_rate": 9.407528425081598e-06, "loss": 0.046424055099487306, "step": 2590, "token_acc": 0.9859498736907837 }, { "epoch": 1.6352201257861636, "grad_norm": 1.0093041374441951, "learning_rate": 9.338329494172362e-06, "loss": 0.044951969385147096, "step": 2600, "token_acc": 0.9868212816159756 }, { "epoch": 1.641509433962264, "grad_norm": 1.614006415230121, "learning_rate": 9.269162372383338e-06, "loss": 0.048131465911865234, "step": 2610, "token_acc": 0.9848029439029187 }, { "epoch": 1.6477987421383649, "grad_norm": 0.9563758266821378, "learning_rate": 9.200030384851851e-06, "loss": 0.05282438993453979, "step": 2620, "token_acc": 0.9838705072939564 }, { "epoch": 1.6540880503144653, "grad_norm": 0.7360361071435326, "learning_rate": 9.130936855026166e-06, "loss": 0.04689803123474121, "step": 2630, "token_acc": 0.985498807741363 }, { "epoch": 1.6603773584905661, "grad_norm": 1.1217754732714569, "learning_rate": 9.061885104505757e-06, "loss": 0.04997251033782959, "step": 2640, "token_acc": 0.9845242306331801 }, { "epoch": 1.6666666666666665, "grad_norm": 1.3727043728275967, "learning_rate": 8.992878452881584e-06, "loss": 0.05193988084793091, "step": 2650, "token_acc": 0.9841976602060416 }, { "epoch": 1.6729559748427674, "grad_norm": 1.0738630903011401, "learning_rate": 8.923920217576534e-06, "loss": 0.05103607177734375, "step": 2660, "token_acc": 0.9841919016127161 }, { "epoch": 1.6792452830188678, "grad_norm": 0.8980456588406888, "learning_rate": 8.855013713685915e-06, "loss": 0.04786647856235504, "step": 2670, "token_acc": 0.9857719560737374 }, { "epoch": 1.6855345911949686, "grad_norm": 1.6346416405736106, "learning_rate": 8.786162253818127e-06, "loss": 0.044231253862380984, "step": 2680, "token_acc": 0.9870336289108248 }, { "epoch": 1.691823899371069, "grad_norm": 0.6531463448084285, "learning_rate": 8.71736914793536e-06, "loss": 0.043486350774765016, "step": 2690, "token_acc": 0.9871552030306617 }, { "epoch": 1.6981132075471699, "grad_norm": 0.9826959501582727, "learning_rate": 8.648637703194515e-06, "loss": 0.04538998007774353, "step": 2700, "token_acc": 0.9856196890897206 }, { "epoch": 1.7044025157232703, "grad_norm": 0.9001332673994735, "learning_rate": 8.579971223788187e-06, "loss": 0.04760159850120545, "step": 2710, "token_acc": 0.9858047797293407 }, { "epoch": 1.7106918238993711, "grad_norm": 1.4201032363470754, "learning_rate": 8.511373010785837e-06, "loss": 0.047868388891220096, "step": 2720, "token_acc": 0.9851355025831596 }, { "epoch": 1.7169811320754715, "grad_norm": 0.6636891523950763, "learning_rate": 8.44284636197508e-06, "loss": 0.044393020868301394, "step": 2730, "token_acc": 0.986869384934347 }, { "epoch": 1.7232704402515724, "grad_norm": 0.9926167607852617, "learning_rate": 8.374394571703173e-06, "loss": 0.04762988090515137, "step": 2740, "token_acc": 0.9852822347916426 }, { "epoch": 1.7295597484276728, "grad_norm": 0.8607059886840823, "learning_rate": 8.306020930718608e-06, "loss": 0.040895530581474306, "step": 2750, "token_acc": 0.9872684168934938 }, { "epoch": 1.7358490566037736, "grad_norm": 0.7767146375653069, "learning_rate": 8.237728726012953e-06, "loss": 0.041637516021728514, "step": 2760, "token_acc": 0.986967220585108 }, { "epoch": 1.742138364779874, "grad_norm": 0.743008860732743, "learning_rate": 8.169521240662788e-06, "loss": 0.04418378472328186, "step": 2770, "token_acc": 0.9866983791402396 }, { "epoch": 1.748427672955975, "grad_norm": 0.8964151609893208, "learning_rate": 8.10140175367192e-06, "loss": 0.0516668438911438, "step": 2780, "token_acc": 0.9840028126922739 }, { "epoch": 1.7547169811320755, "grad_norm": 1.1166309562768149, "learning_rate": 8.033373539813707e-06, "loss": 0.046999433636665346, "step": 2790, "token_acc": 0.9857126397050351 }, { "epoch": 1.7610062893081762, "grad_norm": 0.7683616948363582, "learning_rate": 7.965439869473664e-06, "loss": 0.044093912839889525, "step": 2800, "token_acc": 0.9867604713984623 }, { "epoch": 1.7672955974842768, "grad_norm": 1.073793810583486, "learning_rate": 7.897604008492213e-06, "loss": 0.046621406078338624, "step": 2810, "token_acc": 0.9859511745739291 }, { "epoch": 1.7735849056603774, "grad_norm": 1.0453627363845124, "learning_rate": 7.829869218007698e-06, "loss": 0.04217245578765869, "step": 2820, "token_acc": 0.987036982725753 }, { "epoch": 1.779874213836478, "grad_norm": 1.1770300073955866, "learning_rate": 7.762238754299596e-06, "loss": 0.04043961763381958, "step": 2830, "token_acc": 0.9876352395672334 }, { "epoch": 1.7861635220125787, "grad_norm": 0.9601702623421838, "learning_rate": 7.694715868632002e-06, "loss": 0.04152000546455383, "step": 2840, "token_acc": 0.9877592655559334 }, { "epoch": 1.7924528301886793, "grad_norm": 0.818686382169386, "learning_rate": 7.627303807097285e-06, "loss": 0.04074779152870178, "step": 2850, "token_acc": 0.9871859578286227 }, { "epoch": 1.79874213836478, "grad_norm": 0.7321550298673272, "learning_rate": 7.5600058104600805e-06, "loss": 0.041087231040000914, "step": 2860, "token_acc": 0.9878531152691671 }, { "epoch": 1.8050314465408805, "grad_norm": 0.8721067816123668, "learning_rate": 7.492825114001456e-06, "loss": 0.044392266869544984, "step": 2870, "token_acc": 0.986461784531018 }, { "epoch": 1.8113207547169812, "grad_norm": 0.9327119914838725, "learning_rate": 7.425764947363409e-06, "loss": 0.04353358447551727, "step": 2880, "token_acc": 0.986205931729155 }, { "epoch": 1.8176100628930818, "grad_norm": 0.7411353160886415, "learning_rate": 7.358828534393574e-06, "loss": 0.043460381031036374, "step": 2890, "token_acc": 0.9866405055255792 }, { "epoch": 1.8238993710691824, "grad_norm": 0.9077406542379596, "learning_rate": 7.29201909299028e-06, "loss": 0.05039287209510803, "step": 2900, "token_acc": 0.9835132073221633 }, { "epoch": 1.830188679245283, "grad_norm": 0.8855725031776173, "learning_rate": 7.225339834947811e-06, "loss": 0.04200481176376343, "step": 2910, "token_acc": 0.9875679212540184 }, { "epoch": 1.8364779874213837, "grad_norm": 0.9207682012858668, "learning_rate": 7.1587939658020275e-06, "loss": 0.04500528573989868, "step": 2920, "token_acc": 0.9862918819081701 }, { "epoch": 1.8427672955974843, "grad_norm": 0.7168665613965597, "learning_rate": 7.092384684676263e-06, "loss": 0.04347011148929596, "step": 2930, "token_acc": 0.9861606148362664 }, { "epoch": 1.849056603773585, "grad_norm": 1.068918453766715, "learning_rate": 7.026115184127518e-06, "loss": 0.044189158082008365, "step": 2940, "token_acc": 0.9863891583619955 }, { "epoch": 1.8553459119496856, "grad_norm": 1.046246115613562, "learning_rate": 6.9599886499929844e-06, "loss": 0.04125094413757324, "step": 2950, "token_acc": 0.9870236182275718 }, { "epoch": 1.8616352201257862, "grad_norm": 0.7754228161073464, "learning_rate": 6.894008261236907e-06, "loss": 0.04711427390575409, "step": 2960, "token_acc": 0.9856841252855945 }, { "epoch": 1.8679245283018868, "grad_norm": 0.7882340241312703, "learning_rate": 6.8281771897977274e-06, "loss": 0.043092700839042666, "step": 2970, "token_acc": 0.9864337892019553 }, { "epoch": 1.8742138364779874, "grad_norm": 0.7679388139347542, "learning_rate": 6.762498600435624e-06, "loss": 0.044446122646331784, "step": 2980, "token_acc": 0.9866929945054945 }, { "epoch": 1.880503144654088, "grad_norm": 1.0341275463341217, "learning_rate": 6.696975650580347e-06, "loss": 0.0423536092042923, "step": 2990, "token_acc": 0.9872913616398243 }, { "epoch": 1.8867924528301887, "grad_norm": 1.4903969853297627, "learning_rate": 6.631611490179452e-06, "loss": 0.03887941539287567, "step": 3000, "token_acc": 0.9879372399055224 }, { "epoch": 1.8867924528301887, "eval_loss": 0.05534382909536362, "eval_runtime": 717.8792, "eval_samples_per_second": 11.806, "eval_steps_per_second": 2.952, "eval_token_acc": 0.9835461426217628, "step": 3000 }, { "epoch": 1.8930817610062893, "grad_norm": 0.9513668679732343, "learning_rate": 6.566409261546853e-06, "loss": 0.04525440037250519, "step": 3010, "token_acc": 0.9861626332214567 }, { "epoch": 1.89937106918239, "grad_norm": 0.9978635657308409, "learning_rate": 6.501372099211758e-06, "loss": 0.044601985812187196, "step": 3020, "token_acc": 0.9857926366765856 }, { "epoch": 1.9056603773584906, "grad_norm": 1.074608364002311, "learning_rate": 6.43650312976799e-06, "loss": 0.03974335789680481, "step": 3030, "token_acc": 0.9873547505126452 }, { "epoch": 1.9119496855345912, "grad_norm": 1.185976614312186, "learning_rate": 6.371805471723667e-06, "loss": 0.043976855278015134, "step": 3040, "token_acc": 0.9867405217690784 }, { "epoch": 1.9182389937106918, "grad_norm": 0.7831735892695594, "learning_rate": 6.307282235351302e-06, "loss": 0.04128652811050415, "step": 3050, "token_acc": 0.9875675361645269 }, { "epoch": 1.9245283018867925, "grad_norm": 0.7682687872810261, "learning_rate": 6.2429365225382565e-06, "loss": 0.04307092428207397, "step": 3060, "token_acc": 0.9865218029657906 }, { "epoch": 1.930817610062893, "grad_norm": 1.0309616532569668, "learning_rate": 6.17877142663764e-06, "loss": 0.04245474934577942, "step": 3070, "token_acc": 0.9868967809850961 }, { "epoch": 1.9371069182389937, "grad_norm": 1.1348771283347114, "learning_rate": 6.114790032319579e-06, "loss": 0.038907968997955324, "step": 3080, "token_acc": 0.9880084782671815 }, { "epoch": 1.9433962264150944, "grad_norm": 0.9427207864096449, "learning_rate": 6.05099541542296e-06, "loss": 0.04092976450920105, "step": 3090, "token_acc": 0.987441221998306 }, { "epoch": 1.949685534591195, "grad_norm": 0.7102412660886335, "learning_rate": 5.987390642807516e-06, "loss": 0.04200551807880402, "step": 3100, "token_acc": 0.986920228087708 }, { "epoch": 1.9559748427672956, "grad_norm": 0.7929334085206099, "learning_rate": 5.923978772206432e-06, "loss": 0.042564332485198975, "step": 3110, "token_acc": 0.986267625573362 }, { "epoch": 1.9622641509433962, "grad_norm": 1.2645997361167853, "learning_rate": 5.860762852079318e-06, "loss": 0.04022677540779114, "step": 3120, "token_acc": 0.9878765190347291 }, { "epoch": 1.9685534591194969, "grad_norm": 1.2869788893371823, "learning_rate": 5.797745921465674e-06, "loss": 0.044468334317207335, "step": 3130, "token_acc": 0.9864629444523776 }, { "epoch": 1.9748427672955975, "grad_norm": 0.7375696397453761, "learning_rate": 5.734931009838785e-06, "loss": 0.04471827745437622, "step": 3140, "token_acc": 0.9861083206566976 }, { "epoch": 1.9811320754716981, "grad_norm": 0.7104888528538673, "learning_rate": 5.67232113696009e-06, "loss": 0.046862339973449706, "step": 3150, "token_acc": 0.9855168251961264 }, { "epoch": 1.9874213836477987, "grad_norm": 0.6304186527459887, "learning_rate": 5.609919312733987e-06, "loss": 0.040326648950576784, "step": 3160, "token_acc": 0.9875191903110252 }, { "epoch": 1.9937106918238994, "grad_norm": 0.8206946118047505, "learning_rate": 5.5477285370631776e-06, "loss": 0.03785489201545715, "step": 3170, "token_acc": 0.9877561289240488 }, { "epoch": 2.0, "grad_norm": 1.6185232611309048, "learning_rate": 5.485751799704394e-06, "loss": 0.03770981431007385, "step": 3180, "token_acc": 0.9875341530054644 }, { "epoch": 2.006289308176101, "grad_norm": 0.4508860979571277, "learning_rate": 5.423992080124732e-06, "loss": 0.027628958225250244, "step": 3190, "token_acc": 0.9909122482056032 }, { "epoch": 2.0125786163522013, "grad_norm": 0.7265688364646201, "learning_rate": 5.362452347358351e-06, "loss": 0.02676335573196411, "step": 3200, "token_acc": 0.9914142278509898 }, { "epoch": 2.018867924528302, "grad_norm": 0.7176493010340509, "learning_rate": 5.301135559863797e-06, "loss": 0.026176384091377257, "step": 3210, "token_acc": 0.9912505572893446 }, { "epoch": 2.0251572327044025, "grad_norm": 0.4203332470924716, "learning_rate": 5.240044665381746e-06, "loss": 0.026522904634475708, "step": 3220, "token_acc": 0.9912850258274897 }, { "epoch": 2.0314465408805034, "grad_norm": 0.49076741004829705, "learning_rate": 5.179182600793302e-06, "loss": 0.028085267543792723, "step": 3230, "token_acc": 0.9911830389512457 }, { "epoch": 2.0377358490566038, "grad_norm": 0.9034204793151985, "learning_rate": 5.118552291978813e-06, "loss": 0.02409302443265915, "step": 3240, "token_acc": 0.9916755174096507 }, { "epoch": 2.0440251572327046, "grad_norm": 0.9532940107352192, "learning_rate": 5.058156653677208e-06, "loss": 0.02635040283203125, "step": 3250, "token_acc": 0.9913842784693407 }, { "epoch": 2.050314465408805, "grad_norm": 1.3923378199552692, "learning_rate": 4.997998589345876e-06, "loss": 0.027022877335548402, "step": 3260, "token_acc": 0.9907599907599908 }, { "epoch": 2.056603773584906, "grad_norm": 0.7649477167428459, "learning_rate": 4.938080991021088e-06, "loss": 0.027101951837539672, "step": 3270, "token_acc": 0.990909090909091 }, { "epoch": 2.0628930817610063, "grad_norm": 0.8061149358869371, "learning_rate": 4.878406739178948e-06, "loss": 0.025409945845603944, "step": 3280, "token_acc": 0.9915269196822595 }, { "epoch": 2.069182389937107, "grad_norm": 0.9591310020250684, "learning_rate": 4.818978702596957e-06, "loss": 0.025764793157577515, "step": 3290, "token_acc": 0.9913785307328249 }, { "epoch": 2.0754716981132075, "grad_norm": 0.8738700593689223, "learning_rate": 4.759799738216048e-06, "loss": 0.02476211339235306, "step": 3300, "token_acc": 0.9920101738962152 }, { "epoch": 2.0817610062893084, "grad_norm": 0.8666216893328724, "learning_rate": 4.700872691003283e-06, "loss": 0.0245857834815979, "step": 3310, "token_acc": 0.9914979636731175 }, { "epoch": 2.088050314465409, "grad_norm": 0.7330006799490474, "learning_rate": 4.642200393815065e-06, "loss": 0.024617944657802582, "step": 3320, "token_acc": 0.9923877978380214 }, { "epoch": 2.0943396226415096, "grad_norm": 0.7358978404096974, "learning_rate": 4.583785667260953e-06, "loss": 0.025178509950637817, "step": 3330, "token_acc": 0.9915244431588082 }, { "epoch": 2.10062893081761, "grad_norm": 0.5429978769204982, "learning_rate": 4.525631319568067e-06, "loss": 0.022534635663032532, "step": 3340, "token_acc": 0.9924528301886792 }, { "epoch": 2.106918238993711, "grad_norm": 0.40319756075770474, "learning_rate": 4.467740146446086e-06, "loss": 0.02614726424217224, "step": 3350, "token_acc": 0.9916803803254708 }, { "epoch": 2.1132075471698113, "grad_norm": 0.826434146525126, "learning_rate": 4.410114930952844e-06, "loss": 0.024831095337867738, "step": 3360, "token_acc": 0.9915295464301045 }, { "epoch": 2.119496855345912, "grad_norm": 0.63976873943924, "learning_rate": 4.352758443360545e-06, "loss": 0.024512295424938203, "step": 3370, "token_acc": 0.9919333853760083 }, { "epoch": 2.1257861635220126, "grad_norm": 0.6064353901033486, "learning_rate": 4.29567344102256e-06, "loss": 0.023602843284606934, "step": 3380, "token_acc": 0.9924209170982939 }, { "epoch": 2.1320754716981134, "grad_norm": 0.7517058145196435, "learning_rate": 4.23886266824092e-06, "loss": 0.02254200279712677, "step": 3390, "token_acc": 0.9921487603305785 }, { "epoch": 2.138364779874214, "grad_norm": 0.9220898043923944, "learning_rate": 4.1823288561343236e-06, "loss": 0.02772650122642517, "step": 3400, "token_acc": 0.9909362025612537 }, { "epoch": 2.1446540880503147, "grad_norm": 2.5322794453411097, "learning_rate": 4.126074722506902e-06, "loss": 0.0227973073720932, "step": 3410, "token_acc": 0.992287006117202 }, { "epoch": 2.150943396226415, "grad_norm": 0.6637478582466627, "learning_rate": 4.070102971717512e-06, "loss": 0.023247498273849487, "step": 3420, "token_acc": 0.9925187032418953 }, { "epoch": 2.157232704402516, "grad_norm": 0.6855147878689783, "learning_rate": 4.0144162945497634e-06, "loss": 0.027481943368911743, "step": 3430, "token_acc": 0.9906596957058098 }, { "epoch": 2.1635220125786163, "grad_norm": 0.6951399810120154, "learning_rate": 3.9590173680826425e-06, "loss": 0.028112170100212098, "step": 3440, "token_acc": 0.9909493103547986 }, { "epoch": 2.169811320754717, "grad_norm": 0.9163370475340076, "learning_rate": 3.9039088555618256e-06, "loss": 0.02361641824245453, "step": 3450, "token_acc": 0.9920540556216106 }, { "epoch": 2.1761006289308176, "grad_norm": 0.7173776749029771, "learning_rate": 3.849093406271634e-06, "loss": 0.02243003249168396, "step": 3460, "token_acc": 0.992677332078579 }, { "epoch": 2.1823899371069184, "grad_norm": 0.934931570917733, "learning_rate": 3.7945736554076917e-06, "loss": 0.029029443860054016, "step": 3470, "token_acc": 0.9900564906617477 }, { "epoch": 2.188679245283019, "grad_norm": 0.6207792354919172, "learning_rate": 3.7403522239502065e-06, "loss": 0.02085772156715393, "step": 3480, "token_acc": 0.9933544655766878 }, { "epoch": 2.1949685534591197, "grad_norm": 0.5912514975563645, "learning_rate": 3.6864317185380206e-06, "loss": 0.020345935225486757, "step": 3490, "token_acc": 0.9932405566600397 }, { "epoch": 2.20125786163522, "grad_norm": 0.6823875562891651, "learning_rate": 3.6328147313432427e-06, "loss": 0.024584601819515228, "step": 3500, "token_acc": 0.9920973696354407 }, { "epoch": 2.20125786163522, "eval_loss": 0.056817859411239624, "eval_runtime": 715.3532, "eval_samples_per_second": 11.847, "eval_steps_per_second": 2.962, "eval_token_acc": 0.9844349694029099, "step": 3500 }, { "epoch": 2.207547169811321, "grad_norm": 0.5403476894342643, "learning_rate": 3.5795038399466832e-06, "loss": 0.021820729970932005, "step": 3510, "token_acc": 0.9930049964311206 }, { "epoch": 2.2138364779874213, "grad_norm": 0.5875323337715701, "learning_rate": 3.526501607213897e-06, "loss": 0.02411033511161804, "step": 3520, "token_acc": 0.9922319599362622 }, { "epoch": 2.220125786163522, "grad_norm": 1.242104157658184, "learning_rate": 3.4738105811720047e-06, "loss": 0.02497299760580063, "step": 3530, "token_acc": 0.9922028985507246 }, { "epoch": 2.2264150943396226, "grad_norm": 0.7858727250077854, "learning_rate": 3.4214332948871885e-06, "loss": 0.024555695056915284, "step": 3540, "token_acc": 0.9920254293999554 }, { "epoch": 2.2327044025157234, "grad_norm": 0.7457502910203632, "learning_rate": 3.3693722663429186e-06, "loss": 0.02635159492492676, "step": 3550, "token_acc": 0.9917847362781284 }, { "epoch": 2.238993710691824, "grad_norm": 0.8007577002256944, "learning_rate": 3.3176299983189007e-06, "loss": 0.021693721413612366, "step": 3560, "token_acc": 0.9924659684369739 }, { "epoch": 2.2452830188679247, "grad_norm": 0.8938253603482125, "learning_rate": 3.266208978270765e-06, "loss": 0.023349395394325255, "step": 3570, "token_acc": 0.9923470839260313 }, { "epoch": 2.251572327044025, "grad_norm": 0.8019690456329862, "learning_rate": 3.2151116782104785e-06, "loss": 0.025201982259750365, "step": 3580, "token_acc": 0.9916726696441435 }, { "epoch": 2.257861635220126, "grad_norm": 0.928512618313778, "learning_rate": 3.1643405545875074e-06, "loss": 0.022884893417358398, "step": 3590, "token_acc": 0.9926495824276399 }, { "epoch": 2.2641509433962264, "grad_norm": 1.8518694118902743, "learning_rate": 3.1138980481707157e-06, "loss": 0.022659416496753692, "step": 3600, "token_acc": 0.9921473526185115 }, { "epoch": 2.270440251572327, "grad_norm": 0.7157409797457132, "learning_rate": 3.063786583931061e-06, "loss": 0.021560478210449218, "step": 3610, "token_acc": 0.9924329154217743 }, { "epoch": 2.2767295597484276, "grad_norm": 0.683960093277271, "learning_rate": 3.0140085709249666e-06, "loss": 0.022942088544368744, "step": 3620, "token_acc": 0.9926606502120257 }, { "epoch": 2.2830188679245285, "grad_norm": 0.3807671208250297, "learning_rate": 2.9645664021785514e-06, "loss": 0.022927108407020568, "step": 3630, "token_acc": 0.9927953890489913 }, { "epoch": 2.289308176100629, "grad_norm": 0.5798902198831075, "learning_rate": 2.9154624545725687e-06, "loss": 0.021968087553977965, "step": 3640, "token_acc": 0.9928758279067179 }, { "epoch": 2.2955974842767297, "grad_norm": 0.8968069110950454, "learning_rate": 2.8666990887281376e-06, "loss": 0.024356882274150848, "step": 3650, "token_acc": 0.9921203845935028 }, { "epoch": 2.30188679245283, "grad_norm": 0.9101388836181897, "learning_rate": 2.818278648893271e-06, "loss": 0.022238627076148987, "step": 3660, "token_acc": 0.992795683243401 }, { "epoch": 2.308176100628931, "grad_norm": 0.6722990881265821, "learning_rate": 2.7702034628301654e-06, "loss": 0.02386687248945236, "step": 3670, "token_acc": 0.9920182111572153 }, { "epoch": 2.3144654088050314, "grad_norm": 0.58030818463452, "learning_rate": 2.722475841703306e-06, "loss": 0.021091663837432863, "step": 3680, "token_acc": 0.9926483257710643 }, { "epoch": 2.3207547169811322, "grad_norm": 1.011732851655928, "learning_rate": 2.6750980799683555e-06, "loss": 0.027016055583953858, "step": 3690, "token_acc": 0.991425876704107 }, { "epoch": 2.3270440251572326, "grad_norm": 0.5918250477640592, "learning_rate": 2.6280724552618408e-06, "loss": 0.023545855283737184, "step": 3700, "token_acc": 0.9922041105598866 }, { "epoch": 2.3333333333333335, "grad_norm": 0.48516060801912675, "learning_rate": 2.58140122829169e-06, "loss": 0.024050341546535493, "step": 3710, "token_acc": 0.9917407154249259 }, { "epoch": 2.339622641509434, "grad_norm": 0.526800520074625, "learning_rate": 2.5350866427285014e-06, "loss": 0.025029301643371582, "step": 3720, "token_acc": 0.9919972463643404 }, { "epoch": 2.3459119496855347, "grad_norm": 0.5899523312172913, "learning_rate": 2.489130925097737e-06, "loss": 0.022905370593070982, "step": 3730, "token_acc": 0.992318268770923 }, { "epoch": 2.352201257861635, "grad_norm": 0.7955845513016064, "learning_rate": 2.443536284672635e-06, "loss": 0.023500500619411467, "step": 3740, "token_acc": 0.992278659713619 }, { "epoch": 2.358490566037736, "grad_norm": 0.6747119310051193, "learning_rate": 2.3983049133680336e-06, "loss": 0.020411881804466247, "step": 3750, "token_acc": 0.993043378273493 }, { "epoch": 2.3647798742138364, "grad_norm": 0.551637714502972, "learning_rate": 2.353438985634984e-06, "loss": 0.021443535387516022, "step": 3760, "token_acc": 0.9929708672086721 }, { "epoch": 2.3710691823899372, "grad_norm": 0.7071071838410758, "learning_rate": 2.30894065835622e-06, "loss": 0.021306820213794708, "step": 3770, "token_acc": 0.9928304327044732 }, { "epoch": 2.3773584905660377, "grad_norm": 0.6383595298983202, "learning_rate": 2.2648120707424657e-06, "loss": 0.01998791694641113, "step": 3780, "token_acc": 0.9933602358261069 }, { "epoch": 2.3836477987421385, "grad_norm": 0.4429036125000402, "learning_rate": 2.221055344229598e-06, "loss": 0.020099201798439027, "step": 3790, "token_acc": 0.9931795746208382 }, { "epoch": 2.389937106918239, "grad_norm": 0.5189498149492784, "learning_rate": 2.1776725823766474e-06, "loss": 0.02125477194786072, "step": 3800, "token_acc": 0.9930950215958172 }, { "epoch": 2.3962264150943398, "grad_norm": 1.231799709812971, "learning_rate": 2.134665870764705e-06, "loss": 0.02221446931362152, "step": 3810, "token_acc": 0.9928049402300153 }, { "epoch": 2.40251572327044, "grad_norm": 0.8890189048591385, "learning_rate": 2.0920372768966133e-06, "loss": 0.021392084658145905, "step": 3820, "token_acc": 0.9932545385224506 }, { "epoch": 2.408805031446541, "grad_norm": 0.6308759320373736, "learning_rate": 2.0497888500976217e-06, "loss": 0.023821352422237395, "step": 3830, "token_acc": 0.9918196384898659 }, { "epoch": 2.4150943396226414, "grad_norm": 0.7886414731779052, "learning_rate": 2.0079226214168233e-06, "loss": 0.024218729138374327, "step": 3840, "token_acc": 0.9919518856412343 }, { "epoch": 2.4213836477987423, "grad_norm": 0.37738056026913436, "learning_rate": 1.9664406035295493e-06, "loss": 0.021755827963352202, "step": 3850, "token_acc": 0.992849932705249 }, { "epoch": 2.4276729559748427, "grad_norm": 0.48646463365440634, "learning_rate": 1.92534479064059e-06, "loss": 0.024741561710834505, "step": 3860, "token_acc": 0.992130426198065 }, { "epoch": 2.4339622641509435, "grad_norm": 0.8159345779241457, "learning_rate": 1.884637158388335e-06, "loss": 0.023516392707824706, "step": 3870, "token_acc": 0.9920172722365842 }, { "epoch": 2.440251572327044, "grad_norm": 0.951255033160176, "learning_rate": 1.8443196637497952e-06, "loss": 0.025745093822479248, "step": 3880, "token_acc": 0.9912382164131629 }, { "epoch": 2.4465408805031448, "grad_norm": 0.4221558934805368, "learning_rate": 1.804394244946519e-06, "loss": 0.02262389361858368, "step": 3890, "token_acc": 0.992309198198954 }, { "epoch": 2.452830188679245, "grad_norm": 0.5344840976706485, "learning_rate": 1.7648628213514219e-06, "loss": 0.021597489714622498, "step": 3900, "token_acc": 0.9930053625553742 }, { "epoch": 2.459119496855346, "grad_norm": 0.5725230961295943, "learning_rate": 1.7257272933965074e-06, "loss": 0.022534751892089845, "step": 3910, "token_acc": 0.9928057553956835 }, { "epoch": 2.4654088050314464, "grad_norm": 0.7446463840263802, "learning_rate": 1.6869895424815074e-06, "loss": 0.025363484025001527, "step": 3920, "token_acc": 0.9917138750317072 }, { "epoch": 2.4716981132075473, "grad_norm": 0.6731229772496609, "learning_rate": 1.64865143088344e-06, "loss": 0.021260468661785124, "step": 3930, "token_acc": 0.9929787607512726 }, { "epoch": 2.4779874213836477, "grad_norm": 0.554208376735791, "learning_rate": 1.6107148016670805e-06, "loss": 0.023889759182929994, "step": 3940, "token_acc": 0.9920448112504842 }, { "epoch": 2.4842767295597485, "grad_norm": 0.67523932744472, "learning_rate": 1.5731814785963474e-06, "loss": 0.02100624144077301, "step": 3950, "token_acc": 0.9932140130630248 }, { "epoch": 2.490566037735849, "grad_norm": 0.70634398052636, "learning_rate": 1.5360532660466521e-06, "loss": 0.02119089215993881, "step": 3960, "token_acc": 0.9931327394577898 }, { "epoch": 2.49685534591195, "grad_norm": 0.8873229135241124, "learning_rate": 1.4993319489181234e-06, "loss": 0.02107432782649994, "step": 3970, "token_acc": 0.9928768056304452 }, { "epoch": 2.50314465408805, "grad_norm": 0.793292346733035, "learning_rate": 1.4630192925498344e-06, "loss": 0.021246784925460817, "step": 3980, "token_acc": 0.9925140911225928 }, { "epoch": 2.509433962264151, "grad_norm": 0.7213692163823019, "learning_rate": 1.427117042634899e-06, "loss": 0.022090838849544527, "step": 3990, "token_acc": 0.9922656578394283 }, { "epoch": 2.5157232704402515, "grad_norm": 0.3399190872934864, "learning_rate": 1.391626925136581e-06, "loss": 0.023202145099639894, "step": 4000, "token_acc": 0.992874826399372 }, { "epoch": 2.5157232704402515, "eval_loss": 0.0549781434237957, "eval_runtime": 714.4039, "eval_samples_per_second": 11.863, "eval_steps_per_second": 2.966, "eval_token_acc": 0.9851726793821435, "step": 4000 }, { "epoch": 2.5220125786163523, "grad_norm": 0.6374524762426332, "learning_rate": 1.3565506462053036e-06, "loss": 0.02244551628828049, "step": 4010, "token_acc": 0.9929325538783701 }, { "epoch": 2.5283018867924527, "grad_norm": 0.38379286396884926, "learning_rate": 1.321889892096634e-06, "loss": 0.020600757002830504, "step": 4020, "token_acc": 0.9929416471395096 }, { "epoch": 2.5345911949685536, "grad_norm": 0.41557599954404484, "learning_rate": 1.287646329090214e-06, "loss": 0.020873698592185973, "step": 4030, "token_acc": 0.9927871772039181 }, { "epoch": 2.540880503144654, "grad_norm": 0.523620093961174, "learning_rate": 1.2538216034096606e-06, "loss": 0.02275868207216263, "step": 4040, "token_acc": 0.9927199368476449 }, { "epoch": 2.547169811320755, "grad_norm": 0.706260703041499, "learning_rate": 1.2204173411434161e-06, "loss": 0.02068159282207489, "step": 4050, "token_acc": 0.9934766365883096 }, { "epoch": 2.5534591194968552, "grad_norm": 0.5067252058108148, "learning_rate": 1.1874351481665935e-06, "loss": 0.023539449274539947, "step": 4060, "token_acc": 0.9921726553415955 }, { "epoch": 2.559748427672956, "grad_norm": 0.5384145679125522, "learning_rate": 1.1548766100637522e-06, "loss": 0.022212520241737366, "step": 4070, "token_acc": 0.9930183982374317 }, { "epoch": 2.5660377358490565, "grad_norm": 0.3706695255239234, "learning_rate": 1.122743292052697e-06, "loss": 0.021440133452415466, "step": 4080, "token_acc": 0.9922975290248222 }, { "epoch": 2.5723270440251573, "grad_norm": 0.6377989611149152, "learning_rate": 1.091036738909208e-06, "loss": 0.020651139318943024, "step": 4090, "token_acc": 0.9925337658342888 }, { "epoch": 2.5786163522012577, "grad_norm": 0.6231957294617673, "learning_rate": 1.0597584748927957e-06, "loss": 0.020253558456897736, "step": 4100, "token_acc": 0.9932907255848926 }, { "epoch": 2.5849056603773586, "grad_norm": 0.5196778496757918, "learning_rate": 1.0289100036734179e-06, "loss": 0.018897609412670137, "step": 4110, "token_acc": 0.9933135004958209 }, { "epoch": 2.591194968553459, "grad_norm": 0.5911443597758588, "learning_rate": 9.984928082591883e-07, "loss": 0.021263836324214934, "step": 4120, "token_acc": 0.9930807719799857 }, { "epoch": 2.59748427672956, "grad_norm": 0.7333439191126617, "learning_rate": 9.68508350925086e-07, "loss": 0.021468062698841096, "step": 4130, "token_acc": 0.9936830866870311 }, { "epoch": 2.6037735849056602, "grad_norm": 0.6243357787231448, "learning_rate": 9.389580731426595e-07, "loss": 0.023629823327064516, "step": 4140, "token_acc": 0.9919061683516983 }, { "epoch": 2.610062893081761, "grad_norm": 0.3768531248387712, "learning_rate": 9.098433955107245e-07, "loss": 0.022948920726776123, "step": 4150, "token_acc": 0.9923590672402083 }, { "epoch": 2.6163522012578615, "grad_norm": 0.41655994069801966, "learning_rate": 8.811657176870803e-07, "loss": 0.017796531319618225, "step": 4160, "token_acc": 0.994064597919728 }, { "epoch": 2.6226415094339623, "grad_norm": 0.7498471436247828, "learning_rate": 8.529264183212028e-07, "loss": 0.02242247611284256, "step": 4170, "token_acc": 0.9927712917013282 }, { "epoch": 2.6289308176100628, "grad_norm": 0.7164144826549131, "learning_rate": 8.251268549879965e-07, "loss": 0.019840958714485168, "step": 4180, "token_acc": 0.9937063932792254 }, { "epoch": 2.6352201257861636, "grad_norm": 0.5219754899465664, "learning_rate": 7.977683641224998e-07, "loss": 0.020760795474052428, "step": 4190, "token_acc": 0.9930836902419261 }, { "epoch": 2.641509433962264, "grad_norm": 0.7751634890953712, "learning_rate": 7.708522609556613e-07, "loss": 0.02397152632474899, "step": 4200, "token_acc": 0.9922862347717181 }, { "epoch": 2.647798742138365, "grad_norm": 0.8400838444811691, "learning_rate": 7.443798394510971e-07, "loss": 0.01996571719646454, "step": 4210, "token_acc": 0.9935876502894463 }, { "epoch": 2.6540880503144653, "grad_norm": 0.9743339975258191, "learning_rate": 7.183523722428909e-07, "loss": 0.021362194418907167, "step": 4220, "token_acc": 0.9933829611248967 }, { "epoch": 2.660377358490566, "grad_norm": 0.7468883773281543, "learning_rate": 6.927711105744139e-07, "loss": 0.023838293552398682, "step": 4230, "token_acc": 0.9920445967133151 }, { "epoch": 2.6666666666666665, "grad_norm": 0.7047724581879639, "learning_rate": 6.676372842381674e-07, "loss": 0.019323495030403138, "step": 4240, "token_acc": 0.993488812596188 }, { "epoch": 2.6729559748427674, "grad_norm": 0.5330074971260592, "learning_rate": 6.429521015166684e-07, "loss": 0.022193637490272523, "step": 4250, "token_acc": 0.9929509953194608 }, { "epoch": 2.6792452830188678, "grad_norm": 0.9456292680760856, "learning_rate": 6.187167491243596e-07, "loss": 0.018984417617321014, "step": 4260, "token_acc": 0.9938085216701742 }, { "epoch": 2.6855345911949686, "grad_norm": 0.6020208718543678, "learning_rate": 5.949323921505534e-07, "loss": 0.02061026692390442, "step": 4270, "token_acc": 0.9933976637887253 }, { "epoch": 2.691823899371069, "grad_norm": 0.4947514694208445, "learning_rate": 5.716001740034371e-07, "loss": 0.020732149481773376, "step": 4280, "token_acc": 0.9931953006862859 }, { "epoch": 2.69811320754717, "grad_norm": 0.4971731498284735, "learning_rate": 5.487212163550848e-07, "loss": 0.02166786640882492, "step": 4290, "token_acc": 0.9930354872269102 }, { "epoch": 2.7044025157232703, "grad_norm": 0.3896129637224116, "learning_rate": 5.262966190875551e-07, "loss": 0.023386237025260926, "step": 4300, "token_acc": 0.9923059705668401 }, { "epoch": 2.710691823899371, "grad_norm": 0.6445372165794033, "learning_rate": 5.043274602399939e-07, "loss": 0.019444110989570617, "step": 4310, "token_acc": 0.9937604110787995 }, { "epoch": 2.7169811320754715, "grad_norm": 0.8571882173501956, "learning_rate": 4.828147959568252e-07, "loss": 0.0215284526348114, "step": 4320, "token_acc": 0.9930044517125466 }, { "epoch": 2.7232704402515724, "grad_norm": 0.5971479826797512, "learning_rate": 4.617596604369734e-07, "loss": 0.02067880630493164, "step": 4330, "token_acc": 0.9930786753110216 }, { "epoch": 2.729559748427673, "grad_norm": 0.6192693288729766, "learning_rate": 4.411630658841415e-07, "loss": 0.02310236692428589, "step": 4340, "token_acc": 0.9923919146546883 }, { "epoch": 2.7358490566037736, "grad_norm": 0.5854199322009673, "learning_rate": 4.2102600245815473e-07, "loss": 0.020319417119026184, "step": 4350, "token_acc": 0.9934170636639238 }, { "epoch": 2.742138364779874, "grad_norm": 0.5060972177534601, "learning_rate": 4.0134943822735863e-07, "loss": 0.020632094144821166, "step": 4360, "token_acc": 0.9927046064400715 }, { "epoch": 2.748427672955975, "grad_norm": 0.6535832360286701, "learning_rate": 3.8213431912207613e-07, "loss": 0.02077610194683075, "step": 4370, "token_acc": 0.9931016862544711 }, { "epoch": 2.7547169811320753, "grad_norm": 0.5769076059018868, "learning_rate": 3.633815688891418e-07, "loss": 0.021984952688217162, "step": 4380, "token_acc": 0.9925909231766065 }, { "epoch": 2.761006289308176, "grad_norm": 0.6414019547548331, "learning_rate": 3.450920890474829e-07, "loss": 0.022785688936710357, "step": 4390, "token_acc": 0.9923757698977457 }, { "epoch": 2.767295597484277, "grad_norm": 0.5779301518428668, "learning_rate": 3.272667588447931e-07, "loss": 0.019667960703372955, "step": 4400, "token_acc": 0.993188854489164 }, { "epoch": 2.7735849056603774, "grad_norm": 0.5661347233531627, "learning_rate": 3.099064352152459e-07, "loss": 0.02017345428466797, "step": 4410, "token_acc": 0.9930435268480751 }, { "epoch": 2.779874213836478, "grad_norm": 0.5682142833519405, "learning_rate": 2.930119527383146e-07, "loss": 0.0193926140666008, "step": 4420, "token_acc": 0.9935703370527348 }, { "epoch": 2.7861635220125787, "grad_norm": 0.6129756155590131, "learning_rate": 2.7658412359864327e-07, "loss": 0.021811990439891814, "step": 4430, "token_acc": 0.9932381008137697 }, { "epoch": 2.7924528301886795, "grad_norm": 0.5869125647929773, "learning_rate": 2.606237375470033e-07, "loss": 0.02384023666381836, "step": 4440, "token_acc": 0.9924499811249529 }, { "epoch": 2.79874213836478, "grad_norm": 0.7772183292502256, "learning_rate": 2.4513156186232514e-07, "loss": 0.02438468337059021, "step": 4450, "token_acc": 0.9916802895843064 }, { "epoch": 2.8050314465408803, "grad_norm": 0.6624999458168028, "learning_rate": 2.301083413148142e-07, "loss": 0.020420333743095397, "step": 4460, "token_acc": 0.9931096958383661 }, { "epoch": 2.811320754716981, "grad_norm": 0.43571255257652386, "learning_rate": 2.1555479813014757e-07, "loss": 0.019135478138923644, "step": 4470, "token_acc": 0.9937029124030136 }, { "epoch": 2.817610062893082, "grad_norm": 0.5935762625557985, "learning_rate": 2.0147163195475161e-07, "loss": 0.02100401222705841, "step": 4480, "token_acc": 0.9933416910433208 }, { "epoch": 2.8238993710691824, "grad_norm": 0.5788237631502353, "learning_rate": 1.8785951982216445e-07, "loss": 0.021649350225925446, "step": 4490, "token_acc": 0.9924480074358081 }, { "epoch": 2.830188679245283, "grad_norm": 0.8397049647839784, "learning_rate": 1.7471911612050085e-07, "loss": 0.01883925497531891, "step": 4500, "token_acc": 0.9934104471280296 }, { "epoch": 2.830188679245283, "eval_loss": 0.05329994112253189, "eval_runtime": 710.6066, "eval_samples_per_second": 11.926, "eval_steps_per_second": 2.982, "eval_token_acc": 0.9857121501158562, "step": 4500 } ], "logging_steps": 10, "max_steps": 4770, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 320035629735936.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }