{ "best_metric": 0.0005143894231878221, "best_model_checkpoint": "PE-big3/checkpoint-19158", "epoch": 2.9998434156271205, "eval_steps": 500, "global_step": 19158, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003914609321989665, "grad_norm": 4.866889476776123, "learning_rate": 6.524008350730689e-07, "loss": 1.4, "step": 25 }, { "epoch": 0.00782921864397933, "grad_norm": 3.255279302597046, "learning_rate": 1.3048016701461379e-06, "loss": 1.3054, "step": 50 }, { "epoch": 0.011743827965968996, "grad_norm": 2.329948902130127, "learning_rate": 1.957202505219207e-06, "loss": 1.1451, "step": 75 }, { "epoch": 0.01565843728795866, "grad_norm": 1.9639887809753418, "learning_rate": 2.6096033402922757e-06, "loss": 0.9271, "step": 100 }, { "epoch": 0.019573046609948328, "grad_norm": 1.9199451208114624, "learning_rate": 3.262004175365345e-06, "loss": 0.6443, "step": 125 }, { "epoch": 0.02348765593193799, "grad_norm": 1.4710872173309326, "learning_rate": 3.914405010438414e-06, "loss": 0.3856, "step": 150 }, { "epoch": 0.02740226525392766, "grad_norm": 1.655380129814148, "learning_rate": 4.5668058455114825e-06, "loss": 0.2192, "step": 175 }, { "epoch": 0.03131687457591732, "grad_norm": 1.475960373878479, "learning_rate": 5.2192066805845514e-06, "loss": 0.12, "step": 200 }, { "epoch": 0.035231483897906986, "grad_norm": 0.9583467245101929, "learning_rate": 5.87160751565762e-06, "loss": 0.0688, "step": 225 }, { "epoch": 0.039146093219896656, "grad_norm": 1.5871645212173462, "learning_rate": 6.52400835073069e-06, "loss": 0.0487, "step": 250 }, { "epoch": 0.04306070254188632, "grad_norm": 1.3285017013549805, "learning_rate": 7.176409185803757e-06, "loss": 0.0331, "step": 275 }, { "epoch": 0.04697531186387598, "grad_norm": 1.2705070972442627, "learning_rate": 7.828810020876827e-06, "loss": 0.0195, "step": 300 }, { "epoch": 0.050889921185865654, "grad_norm": 0.44879260659217834, "learning_rate": 8.481210855949897e-06, "loss": 0.0145, "step": 325 }, { "epoch": 0.05480453050785532, "grad_norm": 0.6279118657112122, "learning_rate": 9.133611691022965e-06, "loss": 0.0142, "step": 350 }, { "epoch": 0.05871913982984498, "grad_norm": 0.15489937365055084, "learning_rate": 9.786012526096033e-06, "loss": 0.0159, "step": 375 }, { "epoch": 0.06263374915183464, "grad_norm": 0.1354319453239441, "learning_rate": 1.0438413361169103e-05, "loss": 0.0144, "step": 400 }, { "epoch": 0.06654835847382432, "grad_norm": 0.36055588722229004, "learning_rate": 1.1090814196242173e-05, "loss": 0.0078, "step": 425 }, { "epoch": 0.07046296779581397, "grad_norm": 0.11695325374603271, "learning_rate": 1.174321503131524e-05, "loss": 0.0064, "step": 450 }, { "epoch": 0.07437757711780364, "grad_norm": 1.524307370185852, "learning_rate": 1.2395615866388309e-05, "loss": 0.0069, "step": 475 }, { "epoch": 0.07829218643979331, "grad_norm": 0.43202999234199524, "learning_rate": 1.304801670146138e-05, "loss": 0.0098, "step": 500 }, { "epoch": 0.08220679576178297, "grad_norm": 0.20765632390975952, "learning_rate": 1.3700417536534447e-05, "loss": 0.0068, "step": 525 }, { "epoch": 0.08612140508377264, "grad_norm": 0.020130537450313568, "learning_rate": 1.4352818371607515e-05, "loss": 0.0045, "step": 550 }, { "epoch": 0.09003601440576231, "grad_norm": 1.2454266548156738, "learning_rate": 1.5005219206680585e-05, "loss": 0.0057, "step": 575 }, { "epoch": 0.09395062372775197, "grad_norm": 1.0225284099578857, "learning_rate": 1.5657620041753654e-05, "loss": 0.0049, "step": 600 }, { "epoch": 0.09786523304974164, "grad_norm": 0.755135178565979, "learning_rate": 1.6310020876826724e-05, "loss": 0.0031, "step": 625 }, { "epoch": 0.10177984237173131, "grad_norm": 0.2760821282863617, "learning_rate": 1.6962421711899794e-05, "loss": 0.0071, "step": 650 }, { "epoch": 0.10569445169372096, "grad_norm": 0.4344524145126343, "learning_rate": 1.761482254697286e-05, "loss": 0.0034, "step": 675 }, { "epoch": 0.10960906101571063, "grad_norm": 0.020152989774942398, "learning_rate": 1.826722338204593e-05, "loss": 0.0025, "step": 700 }, { "epoch": 0.11352367033770029, "grad_norm": 0.08340949565172195, "learning_rate": 1.8919624217118996e-05, "loss": 0.0078, "step": 725 }, { "epoch": 0.11743827965968996, "grad_norm": 0.03789607062935829, "learning_rate": 1.9572025052192066e-05, "loss": 0.0011, "step": 750 }, { "epoch": 0.12135288898167963, "grad_norm": 0.8983942866325378, "learning_rate": 2.0224425887265136e-05, "loss": 0.0083, "step": 775 }, { "epoch": 0.1252674983036693, "grad_norm": 2.239677667617798, "learning_rate": 2.0876826722338206e-05, "loss": 0.0121, "step": 800 }, { "epoch": 0.12918210762565896, "grad_norm": 0.017707696184515953, "learning_rate": 2.1529227557411276e-05, "loss": 0.0029, "step": 825 }, { "epoch": 0.13309671694764863, "grad_norm": 0.45937007665634155, "learning_rate": 2.2181628392484345e-05, "loss": 0.0049, "step": 850 }, { "epoch": 0.1370113262696383, "grad_norm": 0.9611666202545166, "learning_rate": 2.2834029227557412e-05, "loss": 0.0128, "step": 875 }, { "epoch": 0.14092593559162794, "grad_norm": 0.22098630666732788, "learning_rate": 2.348643006263048e-05, "loss": 0.0139, "step": 900 }, { "epoch": 0.1448405449136176, "grad_norm": 0.016266101971268654, "learning_rate": 2.413883089770355e-05, "loss": 0.0033, "step": 925 }, { "epoch": 0.14875515423560728, "grad_norm": 0.03947868198156357, "learning_rate": 2.4791231732776618e-05, "loss": 0.0042, "step": 950 }, { "epoch": 0.15266976355759695, "grad_norm": 0.20284026861190796, "learning_rate": 2.544363256784969e-05, "loss": 0.0037, "step": 975 }, { "epoch": 0.15658437287958663, "grad_norm": 0.9516937732696533, "learning_rate": 2.609603340292276e-05, "loss": 0.013, "step": 1000 }, { "epoch": 0.1604989822015763, "grad_norm": 0.35638949275016785, "learning_rate": 2.6748434237995827e-05, "loss": 0.0043, "step": 1025 }, { "epoch": 0.16441359152356594, "grad_norm": 0.2974227964878082, "learning_rate": 2.7400835073068893e-05, "loss": 0.0058, "step": 1050 }, { "epoch": 0.1683282008455556, "grad_norm": 0.10002760589122772, "learning_rate": 2.8053235908141963e-05, "loss": 0.0047, "step": 1075 }, { "epoch": 0.17224281016754528, "grad_norm": 0.03456703945994377, "learning_rate": 2.870563674321503e-05, "loss": 0.0034, "step": 1100 }, { "epoch": 0.17615741948953495, "grad_norm": 0.02147500589489937, "learning_rate": 2.93580375782881e-05, "loss": 0.0052, "step": 1125 }, { "epoch": 0.18007202881152462, "grad_norm": 0.048098206520080566, "learning_rate": 3.001043841336117e-05, "loss": 0.0042, "step": 1150 }, { "epoch": 0.18398663813351426, "grad_norm": 0.251152902841568, "learning_rate": 3.0662839248434235e-05, "loss": 0.0068, "step": 1175 }, { "epoch": 0.18790124745550393, "grad_norm": 0.040291983634233475, "learning_rate": 3.131524008350731e-05, "loss": 0.0097, "step": 1200 }, { "epoch": 0.1918158567774936, "grad_norm": 0.019989246502518654, "learning_rate": 3.1967640918580375e-05, "loss": 0.0053, "step": 1225 }, { "epoch": 0.19573046609948327, "grad_norm": 0.016093524172902107, "learning_rate": 3.262004175365345e-05, "loss": 0.0022, "step": 1250 }, { "epoch": 0.19964507542147295, "grad_norm": 0.008093849755823612, "learning_rate": 3.3272442588726515e-05, "loss": 0.0019, "step": 1275 }, { "epoch": 0.20355968474346262, "grad_norm": 0.018408527597784996, "learning_rate": 3.392484342379959e-05, "loss": 0.0022, "step": 1300 }, { "epoch": 0.20747429406545226, "grad_norm": 0.009230668656527996, "learning_rate": 3.4577244258872654e-05, "loss": 0.0007, "step": 1325 }, { "epoch": 0.21138890338744193, "grad_norm": 0.061230212450027466, "learning_rate": 3.522964509394572e-05, "loss": 0.0031, "step": 1350 }, { "epoch": 0.2153035127094316, "grad_norm": 0.20762716233730316, "learning_rate": 3.5882045929018794e-05, "loss": 0.0024, "step": 1375 }, { "epoch": 0.21921812203142127, "grad_norm": 0.048180121928453445, "learning_rate": 3.653444676409186e-05, "loss": 0.0165, "step": 1400 }, { "epoch": 0.22313273135341094, "grad_norm": 0.026987021788954735, "learning_rate": 3.718684759916493e-05, "loss": 0.0041, "step": 1425 }, { "epoch": 0.22704734067540058, "grad_norm": 0.6532347202301025, "learning_rate": 3.783924843423799e-05, "loss": 0.0061, "step": 1450 }, { "epoch": 0.23096194999739025, "grad_norm": 0.0236322320997715, "learning_rate": 3.8491649269311066e-05, "loss": 0.0018, "step": 1475 }, { "epoch": 0.23487655931937992, "grad_norm": 0.3827228844165802, "learning_rate": 3.914405010438413e-05, "loss": 0.0029, "step": 1500 }, { "epoch": 0.2387911686413696, "grad_norm": 0.00414466205984354, "learning_rate": 3.9796450939457206e-05, "loss": 0.0024, "step": 1525 }, { "epoch": 0.24270577796335926, "grad_norm": 0.03536088764667511, "learning_rate": 4.044885177453027e-05, "loss": 0.0041, "step": 1550 }, { "epoch": 0.24662038728534894, "grad_norm": 0.02724548988044262, "learning_rate": 4.110125260960334e-05, "loss": 0.0018, "step": 1575 }, { "epoch": 0.2505349966073386, "grad_norm": 1.551004409790039, "learning_rate": 4.175365344467641e-05, "loss": 0.0089, "step": 1600 }, { "epoch": 0.25444960592932825, "grad_norm": 0.28799840807914734, "learning_rate": 4.240605427974948e-05, "loss": 0.0075, "step": 1625 }, { "epoch": 0.2583642152513179, "grad_norm": 0.009647930040955544, "learning_rate": 4.305845511482255e-05, "loss": 0.0049, "step": 1650 }, { "epoch": 0.2622788245733076, "grad_norm": 0.006901186890900135, "learning_rate": 4.371085594989562e-05, "loss": 0.0027, "step": 1675 }, { "epoch": 0.26619343389529726, "grad_norm": 0.05002870783209801, "learning_rate": 4.436325678496869e-05, "loss": 0.0064, "step": 1700 }, { "epoch": 0.27010804321728693, "grad_norm": 0.1099412590265274, "learning_rate": 4.501565762004176e-05, "loss": 0.0035, "step": 1725 }, { "epoch": 0.2740226525392766, "grad_norm": 0.43022432923316956, "learning_rate": 4.5668058455114823e-05, "loss": 0.005, "step": 1750 }, { "epoch": 0.27793726186126627, "grad_norm": 0.0661238431930542, "learning_rate": 4.6320459290187897e-05, "loss": 0.0111, "step": 1775 }, { "epoch": 0.2818518711832559, "grad_norm": 0.04808713495731354, "learning_rate": 4.697286012526096e-05, "loss": 0.0082, "step": 1800 }, { "epoch": 0.28576648050524556, "grad_norm": 0.010018469765782356, "learning_rate": 4.7625260960334036e-05, "loss": 0.0063, "step": 1825 }, { "epoch": 0.2896810898272352, "grad_norm": 0.02794954925775528, "learning_rate": 4.82776617954071e-05, "loss": 0.0023, "step": 1850 }, { "epoch": 0.2935956991492249, "grad_norm": 0.08497870713472366, "learning_rate": 4.893006263048017e-05, "loss": 0.0053, "step": 1875 }, { "epoch": 0.29751030847121457, "grad_norm": 0.012794610112905502, "learning_rate": 4.9582463465553235e-05, "loss": 0.0023, "step": 1900 }, { "epoch": 0.30142491779320424, "grad_norm": 0.5463805794715881, "learning_rate": 4.997390093956617e-05, "loss": 0.0123, "step": 1925 }, { "epoch": 0.3053395271151939, "grad_norm": 0.643292248249054, "learning_rate": 4.990140354947222e-05, "loss": 0.006, "step": 1950 }, { "epoch": 0.3092541364371836, "grad_norm": 0.016200900077819824, "learning_rate": 4.9828906159378265e-05, "loss": 0.0038, "step": 1975 }, { "epoch": 0.31316874575917325, "grad_norm": 0.01805788092315197, "learning_rate": 4.975640876928431e-05, "loss": 0.0005, "step": 2000 }, { "epoch": 0.3170833550811629, "grad_norm": 0.09769612550735474, "learning_rate": 4.968391137919035e-05, "loss": 0.0017, "step": 2025 }, { "epoch": 0.3209979644031526, "grad_norm": 0.7254369258880615, "learning_rate": 4.96114139890964e-05, "loss": 0.0029, "step": 2050 }, { "epoch": 0.3249125737251422, "grad_norm": 0.21768023073673248, "learning_rate": 4.953891659900244e-05, "loss": 0.0051, "step": 2075 }, { "epoch": 0.3288271830471319, "grad_norm": 0.0030887445900589228, "learning_rate": 4.946641920890848e-05, "loss": 0.0007, "step": 2100 }, { "epoch": 0.33274179236912155, "grad_norm": 0.03302296623587608, "learning_rate": 4.9393921818814525e-05, "loss": 0.008, "step": 2125 }, { "epoch": 0.3366564016911112, "grad_norm": 0.07119308412075043, "learning_rate": 4.932142442872057e-05, "loss": 0.0005, "step": 2150 }, { "epoch": 0.3405710110131009, "grad_norm": 0.0021239419002085924, "learning_rate": 4.9248927038626616e-05, "loss": 0.0006, "step": 2175 }, { "epoch": 0.34448562033509056, "grad_norm": 0.0006605405360460281, "learning_rate": 4.9179329544136416e-05, "loss": 0.001, "step": 2200 }, { "epoch": 0.34840022965708023, "grad_norm": 0.35737213492393494, "learning_rate": 4.910683215404246e-05, "loss": 0.0031, "step": 2225 }, { "epoch": 0.3523148389790699, "grad_norm": 0.003352939384058118, "learning_rate": 4.90343347639485e-05, "loss": 0.0026, "step": 2250 }, { "epoch": 0.35622944830105957, "grad_norm": 0.011482371017336845, "learning_rate": 4.896183737385454e-05, "loss": 0.0056, "step": 2275 }, { "epoch": 0.36014405762304924, "grad_norm": 0.31182751059532166, "learning_rate": 4.8889339983760585e-05, "loss": 0.003, "step": 2300 }, { "epoch": 0.3640586669450389, "grad_norm": 0.019928403198719025, "learning_rate": 4.881684259366663e-05, "loss": 0.004, "step": 2325 }, { "epoch": 0.3679732762670285, "grad_norm": 0.17220672965049744, "learning_rate": 4.874434520357267e-05, "loss": 0.0022, "step": 2350 }, { "epoch": 0.3718878855890182, "grad_norm": 0.002172990469262004, "learning_rate": 4.867184781347872e-05, "loss": 0.0018, "step": 2375 }, { "epoch": 0.37580249491100787, "grad_norm": 0.6102157831192017, "learning_rate": 4.859935042338476e-05, "loss": 0.0024, "step": 2400 }, { "epoch": 0.37971710423299754, "grad_norm": 0.011678989976644516, "learning_rate": 4.85268530332908e-05, "loss": 0.0023, "step": 2425 }, { "epoch": 0.3836317135549872, "grad_norm": 0.7285154461860657, "learning_rate": 4.8454355643196845e-05, "loss": 0.0051, "step": 2450 }, { "epoch": 0.3875463228769769, "grad_norm": 0.004773287568241358, "learning_rate": 4.838185825310289e-05, "loss": 0.0018, "step": 2475 }, { "epoch": 0.39146093219896655, "grad_norm": 0.00791076384484768, "learning_rate": 4.8309360863008937e-05, "loss": 0.004, "step": 2500 }, { "epoch": 0.3953755415209562, "grad_norm": 0.8710932731628418, "learning_rate": 4.823686347291498e-05, "loss": 0.0042, "step": 2525 }, { "epoch": 0.3992901508429459, "grad_norm": 0.04120909795165062, "learning_rate": 4.816436608282102e-05, "loss": 0.0024, "step": 2550 }, { "epoch": 0.40320476016493556, "grad_norm": 1.0033127069473267, "learning_rate": 4.809186869272706e-05, "loss": 0.004, "step": 2575 }, { "epoch": 0.40711936948692523, "grad_norm": 0.1285122036933899, "learning_rate": 4.801937130263311e-05, "loss": 0.009, "step": 2600 }, { "epoch": 0.41103397880891485, "grad_norm": 0.8447295427322388, "learning_rate": 4.7946873912539154e-05, "loss": 0.0015, "step": 2625 }, { "epoch": 0.4149485881309045, "grad_norm": 0.10731597989797592, "learning_rate": 4.78743765224452e-05, "loss": 0.0066, "step": 2650 }, { "epoch": 0.4188631974528942, "grad_norm": 0.011971144936978817, "learning_rate": 4.780187913235123e-05, "loss": 0.0042, "step": 2675 }, { "epoch": 0.42277780677488386, "grad_norm": 0.0017153106164187193, "learning_rate": 4.772938174225728e-05, "loss": 0.0013, "step": 2700 }, { "epoch": 0.42669241609687353, "grad_norm": 0.0010528437560424209, "learning_rate": 4.7656884352163323e-05, "loss": 0.0006, "step": 2725 }, { "epoch": 0.4306070254188632, "grad_norm": 0.0007753331447020173, "learning_rate": 4.7584386962069366e-05, "loss": 0.0002, "step": 2750 }, { "epoch": 0.43452163474085287, "grad_norm": 0.0036313000600785017, "learning_rate": 4.751188957197541e-05, "loss": 0.004, "step": 2775 }, { "epoch": 0.43843624406284254, "grad_norm": 0.10537869483232498, "learning_rate": 4.743939218188146e-05, "loss": 0.004, "step": 2800 }, { "epoch": 0.4423508533848322, "grad_norm": 0.0017782174982130527, "learning_rate": 4.73668947917875e-05, "loss": 0.0011, "step": 2825 }, { "epoch": 0.4462654627068219, "grad_norm": 0.02180619165301323, "learning_rate": 4.729439740169354e-05, "loss": 0.0003, "step": 2850 }, { "epoch": 0.45018007202881155, "grad_norm": 0.0014395464677363634, "learning_rate": 4.7221900011599584e-05, "loss": 0.0011, "step": 2875 }, { "epoch": 0.45409468135080117, "grad_norm": 0.041430070996284485, "learning_rate": 4.7149402621505626e-05, "loss": 0.0007, "step": 2900 }, { "epoch": 0.45800929067279084, "grad_norm": 0.054793838411569595, "learning_rate": 4.7076905231411675e-05, "loss": 0.0025, "step": 2925 }, { "epoch": 0.4619238999947805, "grad_norm": 0.08612020313739777, "learning_rate": 4.700440784131772e-05, "loss": 0.0068, "step": 2950 }, { "epoch": 0.4658385093167702, "grad_norm": 1.2504163980484009, "learning_rate": 4.693191045122376e-05, "loss": 0.0075, "step": 2975 }, { "epoch": 0.46975311863875985, "grad_norm": 0.8100822567939758, "learning_rate": 4.68594130611298e-05, "loss": 0.0024, "step": 3000 }, { "epoch": 0.4736677279607495, "grad_norm": 0.7344357967376709, "learning_rate": 4.6786915671035844e-05, "loss": 0.0042, "step": 3025 }, { "epoch": 0.4775823372827392, "grad_norm": 0.006882940419018269, "learning_rate": 4.671441828094189e-05, "loss": 0.0108, "step": 3050 }, { "epoch": 0.48149694660472886, "grad_norm": 0.07418603450059891, "learning_rate": 4.6641920890847935e-05, "loss": 0.0015, "step": 3075 }, { "epoch": 0.48541155592671853, "grad_norm": 0.023311011493206024, "learning_rate": 4.656942350075397e-05, "loss": 0.0011, "step": 3100 }, { "epoch": 0.4893261652487082, "grad_norm": 0.22213295102119446, "learning_rate": 4.649692611066002e-05, "loss": 0.0002, "step": 3125 }, { "epoch": 0.49324077457069787, "grad_norm": 0.028663238510489464, "learning_rate": 4.642442872056606e-05, "loss": 0.0034, "step": 3150 }, { "epoch": 0.4971553838926875, "grad_norm": 0.010352909564971924, "learning_rate": 4.6351931330472104e-05, "loss": 0.0003, "step": 3175 }, { "epoch": 0.5010699932146772, "grad_norm": 0.01622854731976986, "learning_rate": 4.6279433940378146e-05, "loss": 0.0035, "step": 3200 }, { "epoch": 0.5049846025366669, "grad_norm": 0.0045238700695335865, "learning_rate": 4.620693655028419e-05, "loss": 0.0016, "step": 3225 }, { "epoch": 0.5088992118586565, "grad_norm": 0.000869418028742075, "learning_rate": 4.613443916019024e-05, "loss": 0.0003, "step": 3250 }, { "epoch": 0.5128138211806462, "grad_norm": 0.0070857820101082325, "learning_rate": 4.606194177009628e-05, "loss": 0.0013, "step": 3275 }, { "epoch": 0.5167284305026358, "grad_norm": 0.019664961844682693, "learning_rate": 4.598944438000232e-05, "loss": 0.0014, "step": 3300 }, { "epoch": 0.5206430398246255, "grad_norm": 0.002933235839009285, "learning_rate": 4.5916946989908364e-05, "loss": 0.0024, "step": 3325 }, { "epoch": 0.5245576491466152, "grad_norm": 0.009601329453289509, "learning_rate": 4.5844449599814406e-05, "loss": 0.001, "step": 3350 }, { "epoch": 0.5284722584686048, "grad_norm": 0.03231184929609299, "learning_rate": 4.5771952209720455e-05, "loss": 0.001, "step": 3375 }, { "epoch": 0.5323868677905945, "grad_norm": 0.038716524839401245, "learning_rate": 4.56994548196265e-05, "loss": 0.0104, "step": 3400 }, { "epoch": 0.5363014771125841, "grad_norm": 0.005376841872930527, "learning_rate": 4.562695742953254e-05, "loss": 0.0021, "step": 3425 }, { "epoch": 0.5402160864345739, "grad_norm": 0.8506935834884644, "learning_rate": 4.555446003943858e-05, "loss": 0.0037, "step": 3450 }, { "epoch": 0.5441306957565635, "grad_norm": 0.00393926864489913, "learning_rate": 4.548196264934463e-05, "loss": 0.0007, "step": 3475 }, { "epoch": 0.5480453050785532, "grad_norm": 0.49948814511299133, "learning_rate": 4.5409465259250666e-05, "loss": 0.0017, "step": 3500 }, { "epoch": 0.5519599144005428, "grad_norm": 0.008987600915133953, "learning_rate": 4.533696786915671e-05, "loss": 0.0017, "step": 3525 }, { "epoch": 0.5558745237225325, "grad_norm": 0.06366792321205139, "learning_rate": 4.526447047906275e-05, "loss": 0.0016, "step": 3550 }, { "epoch": 0.5597891330445222, "grad_norm": 0.9016256332397461, "learning_rate": 4.51919730889688e-05, "loss": 0.0023, "step": 3575 }, { "epoch": 0.5637037423665118, "grad_norm": 0.010248661041259766, "learning_rate": 4.511947569887484e-05, "loss": 0.0037, "step": 3600 }, { "epoch": 0.5676183516885015, "grad_norm": 0.007675408851355314, "learning_rate": 4.5046978308780884e-05, "loss": 0.0053, "step": 3625 }, { "epoch": 0.5715329610104911, "grad_norm": 0.0017978112446144223, "learning_rate": 4.497448091868693e-05, "loss": 0.0006, "step": 3650 }, { "epoch": 0.5754475703324808, "grad_norm": 1.0881074666976929, "learning_rate": 4.4901983528592976e-05, "loss": 0.0026, "step": 3675 }, { "epoch": 0.5793621796544705, "grad_norm": 0.0023445766419172287, "learning_rate": 4.4832386034102776e-05, "loss": 0.0014, "step": 3700 }, { "epoch": 0.5832767889764602, "grad_norm": 0.0032128174789249897, "learning_rate": 4.475988864400882e-05, "loss": 0.0013, "step": 3725 }, { "epoch": 0.5871913982984498, "grad_norm": 0.07783033698797226, "learning_rate": 4.468739125391486e-05, "loss": 0.0061, "step": 3750 }, { "epoch": 0.5911060076204395, "grad_norm": 0.018863795325160027, "learning_rate": 4.46148938638209e-05, "loss": 0.0022, "step": 3775 }, { "epoch": 0.5950206169424291, "grad_norm": 0.004098298028111458, "learning_rate": 4.454239647372695e-05, "loss": 0.0022, "step": 3800 }, { "epoch": 0.5989352262644189, "grad_norm": 0.0029339243192225695, "learning_rate": 4.4469899083632994e-05, "loss": 0.0023, "step": 3825 }, { "epoch": 0.6028498355864085, "grad_norm": 0.0022904234938323498, "learning_rate": 4.4397401693539036e-05, "loss": 0.0013, "step": 3850 }, { "epoch": 0.6067644449083981, "grad_norm": 0.001695298939011991, "learning_rate": 4.432490430344508e-05, "loss": 0.0003, "step": 3875 }, { "epoch": 0.6106790542303878, "grad_norm": 0.3725820779800415, "learning_rate": 4.425240691335112e-05, "loss": 0.0012, "step": 3900 }, { "epoch": 0.6145936635523774, "grad_norm": 0.000986219383776188, "learning_rate": 4.417990952325716e-05, "loss": 0.0007, "step": 3925 }, { "epoch": 0.6185082728743672, "grad_norm": 0.016280701383948326, "learning_rate": 4.4107412133163205e-05, "loss": 0.0005, "step": 3950 }, { "epoch": 0.6224228821963568, "grad_norm": 0.0007005013758316636, "learning_rate": 4.403491474306925e-05, "loss": 0.0008, "step": 3975 }, { "epoch": 0.6263374915183465, "grad_norm": 0.0015142613556236029, "learning_rate": 4.3962417352975296e-05, "loss": 0.0022, "step": 4000 }, { "epoch": 0.6302521008403361, "grad_norm": 0.02496866136789322, "learning_rate": 4.388991996288134e-05, "loss": 0.0081, "step": 4025 }, { "epoch": 0.6341667101623258, "grad_norm": 0.10312812030315399, "learning_rate": 4.381742257278738e-05, "loss": 0.0028, "step": 4050 }, { "epoch": 0.6380813194843155, "grad_norm": 0.005419147200882435, "learning_rate": 4.374492518269342e-05, "loss": 0.0005, "step": 4075 }, { "epoch": 0.6419959288063052, "grad_norm": 0.0012350809993222356, "learning_rate": 4.3672427792599465e-05, "loss": 0.0005, "step": 4100 }, { "epoch": 0.6459105381282948, "grad_norm": 0.0014117741957306862, "learning_rate": 4.3599930402505514e-05, "loss": 0.0004, "step": 4125 }, { "epoch": 0.6498251474502844, "grad_norm": 0.011549504473805428, "learning_rate": 4.3527433012411556e-05, "loss": 0.0108, "step": 4150 }, { "epoch": 0.6537397567722741, "grad_norm": 0.0015101751778274775, "learning_rate": 4.34549356223176e-05, "loss": 0.0023, "step": 4175 }, { "epoch": 0.6576543660942638, "grad_norm": 0.831576406955719, "learning_rate": 4.338243823222364e-05, "loss": 0.003, "step": 4200 }, { "epoch": 0.6615689754162535, "grad_norm": 0.003971900790929794, "learning_rate": 4.330994084212969e-05, "loss": 0.0009, "step": 4225 }, { "epoch": 0.6654835847382431, "grad_norm": 0.00122584099881351, "learning_rate": 4.323744345203573e-05, "loss": 0.0004, "step": 4250 }, { "epoch": 0.6693981940602328, "grad_norm": 1.1975153684616089, "learning_rate": 4.3164946061941774e-05, "loss": 0.008, "step": 4275 }, { "epoch": 0.6733128033822224, "grad_norm": 0.007587050087749958, "learning_rate": 4.309244867184782e-05, "loss": 0.0008, "step": 4300 }, { "epoch": 0.6772274127042122, "grad_norm": 1.974413514137268, "learning_rate": 4.301995128175386e-05, "loss": 0.0003, "step": 4325 }, { "epoch": 0.6811420220262018, "grad_norm": 0.0011919812532141805, "learning_rate": 4.29474538916599e-05, "loss": 0.0055, "step": 4350 }, { "epoch": 0.6850566313481915, "grad_norm": 0.0037530860863626003, "learning_rate": 4.287495650156594e-05, "loss": 0.0031, "step": 4375 }, { "epoch": 0.6889712406701811, "grad_norm": 0.0055799526162445545, "learning_rate": 4.2802459111471986e-05, "loss": 0.0009, "step": 4400 }, { "epoch": 0.6928858499921707, "grad_norm": 0.7918204069137573, "learning_rate": 4.2729961721378035e-05, "loss": 0.0011, "step": 4425 }, { "epoch": 0.6968004593141605, "grad_norm": 0.021195508539676666, "learning_rate": 4.265746433128408e-05, "loss": 0.0014, "step": 4450 }, { "epoch": 0.7007150686361501, "grad_norm": 0.0016733302036300302, "learning_rate": 4.258496694119012e-05, "loss": 0.0016, "step": 4475 }, { "epoch": 0.7046296779581398, "grad_norm": 0.0015721771633252501, "learning_rate": 4.251246955109616e-05, "loss": 0.002, "step": 4500 }, { "epoch": 0.7085442872801294, "grad_norm": 0.024684101343154907, "learning_rate": 4.2439972161002204e-05, "loss": 0.0021, "step": 4525 }, { "epoch": 0.7124588966021191, "grad_norm": 0.0010000619804486632, "learning_rate": 4.236747477090825e-05, "loss": 0.001, "step": 4550 }, { "epoch": 0.7163735059241088, "grad_norm": 0.0010993380565196276, "learning_rate": 4.2294977380814295e-05, "loss": 0.0024, "step": 4575 }, { "epoch": 0.7202881152460985, "grad_norm": 0.01743653602898121, "learning_rate": 4.222247999072034e-05, "loss": 0.001, "step": 4600 }, { "epoch": 0.7242027245680881, "grad_norm": 0.0034048547968268394, "learning_rate": 4.214998260062638e-05, "loss": 0.0012, "step": 4625 }, { "epoch": 0.7281173338900778, "grad_norm": 0.006288307718932629, "learning_rate": 4.207748521053242e-05, "loss": 0.0016, "step": 4650 }, { "epoch": 0.7320319432120674, "grad_norm": 0.09262362122535706, "learning_rate": 4.200498782043847e-05, "loss": 0.0062, "step": 4675 }, { "epoch": 0.735946552534057, "grad_norm": 0.0012087648501619697, "learning_rate": 4.193249043034451e-05, "loss": 0.0003, "step": 4700 }, { "epoch": 0.7398611618560468, "grad_norm": 2.551692247390747, "learning_rate": 4.185999304025055e-05, "loss": 0.0007, "step": 4725 }, { "epoch": 0.7437757711780364, "grad_norm": 0.003155685495585203, "learning_rate": 4.17874956501566e-05, "loss": 0.0035, "step": 4750 }, { "epoch": 0.7476903805000261, "grad_norm": 0.0007522006053477526, "learning_rate": 4.171499826006264e-05, "loss": 0.0003, "step": 4775 }, { "epoch": 0.7516049898220157, "grad_norm": 0.1172158420085907, "learning_rate": 4.164250086996868e-05, "loss": 0.0022, "step": 4800 }, { "epoch": 0.7555195991440055, "grad_norm": 0.0018555809510871768, "learning_rate": 4.1570003479874724e-05, "loss": 0.0008, "step": 4825 }, { "epoch": 0.7594342084659951, "grad_norm": 0.014069788157939911, "learning_rate": 4.1497506089780766e-05, "loss": 0.0027, "step": 4850 }, { "epoch": 0.7633488177879848, "grad_norm": 0.0070347595028579235, "learning_rate": 4.1425008699686815e-05, "loss": 0.0044, "step": 4875 }, { "epoch": 0.7672634271099744, "grad_norm": 0.005139984656125307, "learning_rate": 4.135251130959286e-05, "loss": 0.0013, "step": 4900 }, { "epoch": 0.7711780364319641, "grad_norm": 0.03146003186702728, "learning_rate": 4.12800139194989e-05, "loss": 0.0008, "step": 4925 }, { "epoch": 0.7750926457539538, "grad_norm": 0.0008966127061285079, "learning_rate": 4.120751652940494e-05, "loss": 0.0013, "step": 4950 }, { "epoch": 0.7790072550759434, "grad_norm": 0.010651414282619953, "learning_rate": 4.1135019139310984e-05, "loss": 0.0007, "step": 4975 }, { "epoch": 0.7829218643979331, "grad_norm": 0.05222166329622269, "learning_rate": 4.106252174921703e-05, "loss": 0.0025, "step": 5000 }, { "epoch": 0.7868364737199227, "grad_norm": 0.008781126700341702, "learning_rate": 4.0990024359123075e-05, "loss": 0.0006, "step": 5025 }, { "epoch": 0.7907510830419124, "grad_norm": 0.0023096187505871058, "learning_rate": 4.091752696902912e-05, "loss": 0.0003, "step": 5050 }, { "epoch": 0.794665692363902, "grad_norm": 0.000690230808686465, "learning_rate": 4.084502957893516e-05, "loss": 0.001, "step": 5075 }, { "epoch": 0.7985803016858918, "grad_norm": 0.0017941935220733285, "learning_rate": 4.077253218884121e-05, "loss": 0.0018, "step": 5100 }, { "epoch": 0.8024949110078814, "grad_norm": 0.001472643343731761, "learning_rate": 4.070003479874725e-05, "loss": 0.0011, "step": 5125 }, { "epoch": 0.8064095203298711, "grad_norm": 0.050277333706617355, "learning_rate": 4.0627537408653286e-05, "loss": 0.0104, "step": 5150 }, { "epoch": 0.8103241296518607, "grad_norm": 0.020627155900001526, "learning_rate": 4.055504001855933e-05, "loss": 0.0019, "step": 5175 }, { "epoch": 0.8142387389738505, "grad_norm": 1.6748356819152832, "learning_rate": 4.048254262846538e-05, "loss": 0.0013, "step": 5200 }, { "epoch": 0.8181533482958401, "grad_norm": 0.0005242697079665959, "learning_rate": 4.041004523837142e-05, "loss": 0.0003, "step": 5225 }, { "epoch": 0.8220679576178297, "grad_norm": 0.0004012222634628415, "learning_rate": 4.033754784827746e-05, "loss": 0.0011, "step": 5250 }, { "epoch": 0.8259825669398194, "grad_norm": 0.0007638943498022854, "learning_rate": 4.0265050458183504e-05, "loss": 0.0008, "step": 5275 }, { "epoch": 0.829897176261809, "grad_norm": 0.000370625639334321, "learning_rate": 4.019255306808955e-05, "loss": 0.0002, "step": 5300 }, { "epoch": 0.8338117855837988, "grad_norm": 0.17966459691524506, "learning_rate": 4.0120055677995596e-05, "loss": 0.0061, "step": 5325 }, { "epoch": 0.8377263949057884, "grad_norm": 0.5298845171928406, "learning_rate": 4.004755828790164e-05, "loss": 0.0021, "step": 5350 }, { "epoch": 0.8416410042277781, "grad_norm": 0.010731186717748642, "learning_rate": 3.997506089780768e-05, "loss": 0.0012, "step": 5375 }, { "epoch": 0.8455556135497677, "grad_norm": 0.0006224720855243504, "learning_rate": 3.990256350771372e-05, "loss": 0.0014, "step": 5400 }, { "epoch": 0.8494702228717574, "grad_norm": 0.00034521459019742906, "learning_rate": 3.983006611761977e-05, "loss": 0.0005, "step": 5425 }, { "epoch": 0.8533848321937471, "grad_norm": 0.07561736553907394, "learning_rate": 3.9757568727525814e-05, "loss": 0.002, "step": 5450 }, { "epoch": 0.8572994415157368, "grad_norm": 0.010748780332505703, "learning_rate": 3.9685071337431856e-05, "loss": 0.0025, "step": 5475 }, { "epoch": 0.8612140508377264, "grad_norm": 0.03456795960664749, "learning_rate": 3.96125739473379e-05, "loss": 0.0079, "step": 5500 }, { "epoch": 0.865128660159716, "grad_norm": 0.013776997104287148, "learning_rate": 3.954007655724394e-05, "loss": 0.0015, "step": 5525 }, { "epoch": 0.8690432694817057, "grad_norm": 0.013151598162949085, "learning_rate": 3.946757916714999e-05, "loss": 0.0027, "step": 5550 }, { "epoch": 0.8729578788036954, "grad_norm": 0.005265055689960718, "learning_rate": 3.9395081777056025e-05, "loss": 0.0045, "step": 5575 }, { "epoch": 0.8768724881256851, "grad_norm": 0.0019183550029993057, "learning_rate": 3.932258438696207e-05, "loss": 0.0005, "step": 5600 }, { "epoch": 0.8807870974476747, "grad_norm": 0.42332738637924194, "learning_rate": 3.9250086996868116e-05, "loss": 0.0037, "step": 5625 }, { "epoch": 0.8847017067696644, "grad_norm": 0.00447813980281353, "learning_rate": 3.917758960677416e-05, "loss": 0.0014, "step": 5650 }, { "epoch": 0.888616316091654, "grad_norm": 0.0005977645632810891, "learning_rate": 3.91050922166802e-05, "loss": 0.0005, "step": 5675 }, { "epoch": 0.8925309254136438, "grad_norm": 0.9014317989349365, "learning_rate": 3.903259482658624e-05, "loss": 0.0012, "step": 5700 }, { "epoch": 0.8964455347356334, "grad_norm": 1.6808857917785645, "learning_rate": 3.8960097436492285e-05, "loss": 0.0033, "step": 5725 }, { "epoch": 0.9003601440576231, "grad_norm": 0.002373639028519392, "learning_rate": 3.8887600046398334e-05, "loss": 0.0136, "step": 5750 }, { "epoch": 0.9042747533796127, "grad_norm": 0.0012994492426514626, "learning_rate": 3.8815102656304376e-05, "loss": 0.0001, "step": 5775 }, { "epoch": 0.9081893627016023, "grad_norm": 0.0006246384000405669, "learning_rate": 3.874260526621042e-05, "loss": 0.0001, "step": 5800 }, { "epoch": 0.9121039720235921, "grad_norm": 0.0005325423553586006, "learning_rate": 3.867010787611646e-05, "loss": 0.0005, "step": 5825 }, { "epoch": 0.9160185813455817, "grad_norm": 0.0009510382078588009, "learning_rate": 3.859761048602251e-05, "loss": 0.0032, "step": 5850 }, { "epoch": 0.9199331906675714, "grad_norm": 0.012179987505078316, "learning_rate": 3.852511309592855e-05, "loss": 0.0015, "step": 5875 }, { "epoch": 0.923847799989561, "grad_norm": 0.0014047386357560754, "learning_rate": 3.8452615705834594e-05, "loss": 0.0006, "step": 5900 }, { "epoch": 0.9277624093115507, "grad_norm": 0.13963516056537628, "learning_rate": 3.8380118315740636e-05, "loss": 0.0012, "step": 5925 }, { "epoch": 0.9316770186335404, "grad_norm": 0.7947016954421997, "learning_rate": 3.830762092564668e-05, "loss": 0.0014, "step": 5950 }, { "epoch": 0.9355916279555301, "grad_norm": 0.001768257119692862, "learning_rate": 3.823512353555272e-05, "loss": 0.0004, "step": 5975 }, { "epoch": 0.9395062372775197, "grad_norm": 0.0007245225715450943, "learning_rate": 3.816262614545876e-05, "loss": 0.0001, "step": 6000 }, { "epoch": 0.9434208465995094, "grad_norm": 0.016255930066108704, "learning_rate": 3.8090128755364805e-05, "loss": 0.0, "step": 6025 }, { "epoch": 0.947335455921499, "grad_norm": 0.00034742074785754085, "learning_rate": 3.801763136527085e-05, "loss": 0.0003, "step": 6050 }, { "epoch": 0.9512500652434887, "grad_norm": 0.0013885988155379891, "learning_rate": 3.7945133975176896e-05, "loss": 0.0032, "step": 6075 }, { "epoch": 0.9551646745654784, "grad_norm": 0.8642656207084656, "learning_rate": 3.787263658508294e-05, "loss": 0.0019, "step": 6100 }, { "epoch": 0.959079283887468, "grad_norm": 0.002853901358321309, "learning_rate": 3.780013919498898e-05, "loss": 0.0031, "step": 6125 }, { "epoch": 0.9629938932094577, "grad_norm": 0.6826348304748535, "learning_rate": 3.772764180489502e-05, "loss": 0.0029, "step": 6150 }, { "epoch": 0.9669085025314473, "grad_norm": 0.01645534299314022, "learning_rate": 3.765514441480107e-05, "loss": 0.0003, "step": 6175 }, { "epoch": 0.9708231118534371, "grad_norm": 0.001097380998544395, "learning_rate": 3.7582647024707114e-05, "loss": 0.0011, "step": 6200 }, { "epoch": 0.9747377211754267, "grad_norm": 0.001092984457500279, "learning_rate": 3.7513049530216915e-05, "loss": 0.0006, "step": 6225 }, { "epoch": 0.9786523304974164, "grad_norm": 0.001488927286118269, "learning_rate": 3.744055214012296e-05, "loss": 0.0018, "step": 6250 }, { "epoch": 0.982566939819406, "grad_norm": 0.0012959851883351803, "learning_rate": 3.7368054750029e-05, "loss": 0.0011, "step": 6275 }, { "epoch": 0.9864815491413957, "grad_norm": 0.002524161711335182, "learning_rate": 3.729555735993505e-05, "loss": 0.0039, "step": 6300 }, { "epoch": 0.9903961584633854, "grad_norm": 0.0023267928045243025, "learning_rate": 3.722305996984109e-05, "loss": 0.0011, "step": 6325 }, { "epoch": 0.994310767785375, "grad_norm": 0.0007459365879185498, "learning_rate": 3.715056257974713e-05, "loss": 0.0003, "step": 6350 }, { "epoch": 0.9982253771073647, "grad_norm": 0.004343962296843529, "learning_rate": 3.7078065189653175e-05, "loss": 0.0018, "step": 6375 }, { "epoch": 0.9999478052090401, "eval_accuracy": 0.9997799951169648, "eval_f1": 0.9997142385928128, "eval_loss": 0.0011581754079088569, "eval_precision": 0.9996766935217872, "eval_recall": 0.9997517864841209, "eval_runtime": 62.9623, "eval_samples_per_second": 608.586, "eval_steps_per_second": 38.039, "step": 6386 }, { "epoch": 1.0021399864293543, "grad_norm": 0.007820851169526577, "learning_rate": 3.700556779955922e-05, "loss": 0.0006, "step": 6400 }, { "epoch": 1.006054595751344, "grad_norm": 0.00049219821812585, "learning_rate": 3.693307040946526e-05, "loss": 0.0005, "step": 6425 }, { "epoch": 1.0099692050733338, "grad_norm": 0.0008093062788248062, "learning_rate": 3.68605730193713e-05, "loss": 0.0008, "step": 6450 }, { "epoch": 1.0138838143953233, "grad_norm": 0.3265334963798523, "learning_rate": 3.6788075629277344e-05, "loss": 0.0001, "step": 6475 }, { "epoch": 1.017798423717313, "grad_norm": 0.015381108038127422, "learning_rate": 3.671557823918339e-05, "loss": 0.0, "step": 6500 }, { "epoch": 1.0217130330393027, "grad_norm": 0.00040746491868048906, "learning_rate": 3.6643080849089435e-05, "loss": 0.0001, "step": 6525 }, { "epoch": 1.0256276423612924, "grad_norm": 2.2102978229522705, "learning_rate": 3.657058345899548e-05, "loss": 0.0022, "step": 6550 }, { "epoch": 1.029542251683282, "grad_norm": 0.0007900640484876931, "learning_rate": 3.649808606890152e-05, "loss": 0.0034, "step": 6575 }, { "epoch": 1.0334568610052717, "grad_norm": 0.07358774542808533, "learning_rate": 3.642558867880756e-05, "loss": 0.0004, "step": 6600 }, { "epoch": 1.0373714703272614, "grad_norm": 0.0004924671957269311, "learning_rate": 3.635309128871361e-05, "loss": 0.0008, "step": 6625 }, { "epoch": 1.041286079649251, "grad_norm": 0.0007265584426932037, "learning_rate": 3.628059389861965e-05, "loss": 0.0005, "step": 6650 }, { "epoch": 1.0452006889712406, "grad_norm": 0.006537444423884153, "learning_rate": 3.6208096508525695e-05, "loss": 0.0031, "step": 6675 }, { "epoch": 1.0491152982932304, "grad_norm": 0.02974896878004074, "learning_rate": 3.613559911843174e-05, "loss": 0.0003, "step": 6700 }, { "epoch": 1.05302990761522, "grad_norm": 0.0008949014008976519, "learning_rate": 3.6063101728337786e-05, "loss": 0.0001, "step": 6725 }, { "epoch": 1.0569445169372096, "grad_norm": 5.2669758796691895, "learning_rate": 3.599060433824383e-05, "loss": 0.0038, "step": 6750 }, { "epoch": 1.0608591262591993, "grad_norm": 0.0008383537060581148, "learning_rate": 3.591810694814987e-05, "loss": 0.0014, "step": 6775 }, { "epoch": 1.064773735581189, "grad_norm": 0.03583945333957672, "learning_rate": 3.5845609558055906e-05, "loss": 0.002, "step": 6800 }, { "epoch": 1.0686883449031788, "grad_norm": 0.0004048035480082035, "learning_rate": 3.5773112167961955e-05, "loss": 0.0002, "step": 6825 }, { "epoch": 1.0726029542251683, "grad_norm": 0.0006589085678569973, "learning_rate": 3.5700614777868e-05, "loss": 0.0, "step": 6850 }, { "epoch": 1.076517563547158, "grad_norm": 0.01224551722407341, "learning_rate": 3.562811738777404e-05, "loss": 0.0028, "step": 6875 }, { "epoch": 1.0804321728691477, "grad_norm": 1.1463470458984375, "learning_rate": 3.555561999768008e-05, "loss": 0.0029, "step": 6900 }, { "epoch": 1.0843467821911372, "grad_norm": 0.002099130768328905, "learning_rate": 3.548312260758613e-05, "loss": 0.0016, "step": 6925 }, { "epoch": 1.088261391513127, "grad_norm": 0.4577861428260803, "learning_rate": 3.541062521749217e-05, "loss": 0.0009, "step": 6950 }, { "epoch": 1.0921760008351167, "grad_norm": 0.08768904209136963, "learning_rate": 3.5338127827398216e-05, "loss": 0.0017, "step": 6975 }, { "epoch": 1.0960906101571064, "grad_norm": 0.002661600476130843, "learning_rate": 3.526563043730426e-05, "loss": 0.0002, "step": 7000 }, { "epoch": 1.100005219479096, "grad_norm": 0.0006299412925727665, "learning_rate": 3.51931330472103e-05, "loss": 0.0, "step": 7025 }, { "epoch": 1.1039198288010856, "grad_norm": 0.1650131493806839, "learning_rate": 3.512063565711635e-05, "loss": 0.0002, "step": 7050 }, { "epoch": 1.1078344381230754, "grad_norm": 0.37610143423080444, "learning_rate": 3.504813826702239e-05, "loss": 0.0009, "step": 7075 }, { "epoch": 1.111749047445065, "grad_norm": 0.029113056138157845, "learning_rate": 3.4975640876928433e-05, "loss": 0.0012, "step": 7100 }, { "epoch": 1.1156636567670546, "grad_norm": 0.004116399679332972, "learning_rate": 3.4903143486834476e-05, "loss": 0.0023, "step": 7125 }, { "epoch": 1.1195782660890443, "grad_norm": 0.015721509233117104, "learning_rate": 3.483064609674052e-05, "loss": 0.0097, "step": 7150 }, { "epoch": 1.123492875411034, "grad_norm": 0.01373900007456541, "learning_rate": 3.475814870664657e-05, "loss": 0.0064, "step": 7175 }, { "epoch": 1.1274074847330238, "grad_norm": 0.0016198121011257172, "learning_rate": 3.46856513165526e-05, "loss": 0.0008, "step": 7200 }, { "epoch": 1.1313220940550133, "grad_norm": 0.0009071112144738436, "learning_rate": 3.4613153926458645e-05, "loss": 0.0001, "step": 7225 }, { "epoch": 1.135236703377003, "grad_norm": 0.0006360800471156836, "learning_rate": 3.4540656536364694e-05, "loss": 0.0007, "step": 7250 }, { "epoch": 1.1391513126989927, "grad_norm": 0.0013603122206404805, "learning_rate": 3.4468159146270736e-05, "loss": 0.0003, "step": 7275 }, { "epoch": 1.1430659220209822, "grad_norm": 0.2531895339488983, "learning_rate": 3.439566175617678e-05, "loss": 0.0019, "step": 7300 }, { "epoch": 1.146980531342972, "grad_norm": 0.08225157856941223, "learning_rate": 3.432316436608282e-05, "loss": 0.001, "step": 7325 }, { "epoch": 1.1508951406649617, "grad_norm": 0.0010974809993058443, "learning_rate": 3.425066697598886e-05, "loss": 0.0009, "step": 7350 }, { "epoch": 1.1548097499869514, "grad_norm": 0.007975243031978607, "learning_rate": 3.417816958589491e-05, "loss": 0.0001, "step": 7375 }, { "epoch": 1.158724359308941, "grad_norm": 0.0005916508380323648, "learning_rate": 3.4105672195800954e-05, "loss": 0.0001, "step": 7400 }, { "epoch": 1.1626389686309306, "grad_norm": 0.003101816400885582, "learning_rate": 3.4033174805706996e-05, "loss": 0.0, "step": 7425 }, { "epoch": 1.1665535779529204, "grad_norm": 0.00036523715243674815, "learning_rate": 3.396067741561304e-05, "loss": 0.0001, "step": 7450 }, { "epoch": 1.1704681872749099, "grad_norm": 0.02329937182366848, "learning_rate": 3.388818002551909e-05, "loss": 0.0013, "step": 7475 }, { "epoch": 1.1743827965968996, "grad_norm": 0.5784549117088318, "learning_rate": 3.381568263542513e-05, "loss": 0.0038, "step": 7500 }, { "epoch": 1.1782974059188893, "grad_norm": 0.0012637526961043477, "learning_rate": 3.374318524533117e-05, "loss": 0.0015, "step": 7525 }, { "epoch": 1.182212015240879, "grad_norm": 0.019489184021949768, "learning_rate": 3.3670687855237214e-05, "loss": 0.0002, "step": 7550 }, { "epoch": 1.1861266245628685, "grad_norm": 0.0006683494430035353, "learning_rate": 3.3598190465143256e-05, "loss": 0.0008, "step": 7575 }, { "epoch": 1.1900412338848583, "grad_norm": 0.027937965467572212, "learning_rate": 3.3525693075049305e-05, "loss": 0.0008, "step": 7600 }, { "epoch": 1.193955843206848, "grad_norm": 0.00035219776327721775, "learning_rate": 3.345319568495534e-05, "loss": 0.0002, "step": 7625 }, { "epoch": 1.1978704525288375, "grad_norm": 0.0009345108992420137, "learning_rate": 3.338069829486138e-05, "loss": 0.0032, "step": 7650 }, { "epoch": 1.2017850618508272, "grad_norm": 0.05174746736884117, "learning_rate": 3.3308200904767425e-05, "loss": 0.0028, "step": 7675 }, { "epoch": 1.205699671172817, "grad_norm": 0.1187373697757721, "learning_rate": 3.3235703514673474e-05, "loss": 0.0006, "step": 7700 }, { "epoch": 1.2096142804948067, "grad_norm": 0.0881095826625824, "learning_rate": 3.3163206124579516e-05, "loss": 0.0018, "step": 7725 }, { "epoch": 1.2135288898167964, "grad_norm": 1.4924030303955078, "learning_rate": 3.309070873448556e-05, "loss": 0.0006, "step": 7750 }, { "epoch": 1.217443499138786, "grad_norm": 0.10360655933618546, "learning_rate": 3.30182113443916e-05, "loss": 0.0009, "step": 7775 }, { "epoch": 1.2213581084607756, "grad_norm": 0.0007201316766440868, "learning_rate": 3.294571395429765e-05, "loss": 0.0003, "step": 7800 }, { "epoch": 1.2252727177827654, "grad_norm": 0.001118672196753323, "learning_rate": 3.287321656420369e-05, "loss": 0.0007, "step": 7825 }, { "epoch": 1.2291873271047549, "grad_norm": 0.008757601492106915, "learning_rate": 3.2800719174109734e-05, "loss": 0.0038, "step": 7850 }, { "epoch": 1.2331019364267446, "grad_norm": 1.405776023864746, "learning_rate": 3.2728221784015777e-05, "loss": 0.0021, "step": 7875 }, { "epoch": 1.2370165457487343, "grad_norm": 0.06606610119342804, "learning_rate": 3.265572439392182e-05, "loss": 0.0001, "step": 7900 }, { "epoch": 1.240931155070724, "grad_norm": 0.00046704983105883, "learning_rate": 3.258322700382787e-05, "loss": 0.0001, "step": 7925 }, { "epoch": 1.2448457643927135, "grad_norm": 0.0005030676256865263, "learning_rate": 3.251072961373391e-05, "loss": 0.0009, "step": 7950 }, { "epoch": 1.2487603737147033, "grad_norm": 0.004642080515623093, "learning_rate": 3.243823222363995e-05, "loss": 0.0005, "step": 7975 }, { "epoch": 1.252674983036693, "grad_norm": 0.00609723711386323, "learning_rate": 3.2365734833545994e-05, "loss": 0.0018, "step": 8000 }, { "epoch": 1.2565895923586825, "grad_norm": 0.003095820778980851, "learning_rate": 3.229323744345204e-05, "loss": 0.0003, "step": 8025 }, { "epoch": 1.2605042016806722, "grad_norm": 0.08622787892818451, "learning_rate": 3.222074005335808e-05, "loss": 0.0001, "step": 8050 }, { "epoch": 1.264418811002662, "grad_norm": 0.022611690685153008, "learning_rate": 3.214824266326412e-05, "loss": 0.0009, "step": 8075 }, { "epoch": 1.2683334203246517, "grad_norm": 0.0005983790615573525, "learning_rate": 3.2075745273170163e-05, "loss": 0.0012, "step": 8100 }, { "epoch": 1.2722480296466412, "grad_norm": 0.0008185420883819461, "learning_rate": 3.200324788307621e-05, "loss": 0.0008, "step": 8125 }, { "epoch": 1.276162638968631, "grad_norm": 0.0019505377858877182, "learning_rate": 3.1930750492982255e-05, "loss": 0.0003, "step": 8150 }, { "epoch": 1.2800772482906206, "grad_norm": 0.005252277944236994, "learning_rate": 3.18582531028883e-05, "loss": 0.0047, "step": 8175 }, { "epoch": 1.2839918576126101, "grad_norm": 0.0010310772340744734, "learning_rate": 3.178575571279434e-05, "loss": 0.0003, "step": 8200 }, { "epoch": 1.2879064669345999, "grad_norm": 0.002415160648524761, "learning_rate": 3.171325832270038e-05, "loss": 0.004, "step": 8225 }, { "epoch": 1.2918210762565896, "grad_norm": 0.0005815212498418987, "learning_rate": 3.164076093260643e-05, "loss": 0.0018, "step": 8250 }, { "epoch": 1.2957356855785793, "grad_norm": 0.0003597593167796731, "learning_rate": 3.156826354251247e-05, "loss": 0.0003, "step": 8275 }, { "epoch": 1.299650294900569, "grad_norm": 0.004648554138839245, "learning_rate": 3.1495766152418515e-05, "loss": 0.005, "step": 8300 }, { "epoch": 1.3035649042225586, "grad_norm": 0.0015794150531291962, "learning_rate": 3.142326876232456e-05, "loss": 0.0006, "step": 8325 }, { "epoch": 1.3074795135445483, "grad_norm": 0.000883117550984025, "learning_rate": 3.1350771372230606e-05, "loss": 0.0022, "step": 8350 }, { "epoch": 1.3113941228665378, "grad_norm": 0.0004549395525828004, "learning_rate": 3.127827398213665e-05, "loss": 0.0, "step": 8375 }, { "epoch": 1.3153087321885275, "grad_norm": 0.00043308446765877306, "learning_rate": 3.120577659204269e-05, "loss": 0.0017, "step": 8400 }, { "epoch": 1.3192233415105172, "grad_norm": 0.000435361813288182, "learning_rate": 3.113327920194873e-05, "loss": 0.0004, "step": 8425 }, { "epoch": 1.323137950832507, "grad_norm": 0.000388374668546021, "learning_rate": 3.1060781811854775e-05, "loss": 0.0001, "step": 8450 }, { "epoch": 1.3270525601544967, "grad_norm": 0.0006863649468868971, "learning_rate": 3.098828442176082e-05, "loss": 0.0004, "step": 8475 }, { "epoch": 1.3309671694764862, "grad_norm": 0.00127976608928293, "learning_rate": 3.091578703166686e-05, "loss": 0.0018, "step": 8500 }, { "epoch": 1.334881778798476, "grad_norm": 0.036596138030290604, "learning_rate": 3.08432896415729e-05, "loss": 0.0013, "step": 8525 }, { "epoch": 1.3387963881204656, "grad_norm": 0.002909492002800107, "learning_rate": 3.077369214708271e-05, "loss": 0.0003, "step": 8550 }, { "epoch": 1.3427109974424551, "grad_norm": 0.0010424726642668247, "learning_rate": 3.070119475698875e-05, "loss": 0.0024, "step": 8575 }, { "epoch": 1.3466256067644449, "grad_norm": 0.0005914925131946802, "learning_rate": 3.062869736689479e-05, "loss": 0.0004, "step": 8600 }, { "epoch": 1.3505402160864346, "grad_norm": 0.00037522296770475805, "learning_rate": 3.0556199976800835e-05, "loss": 0.0001, "step": 8625 }, { "epoch": 1.3544548254084243, "grad_norm": 0.00039554465911351144, "learning_rate": 3.048370258670688e-05, "loss": 0.0003, "step": 8650 }, { "epoch": 1.3583694347304138, "grad_norm": 0.0040852464735507965, "learning_rate": 3.0411205196612923e-05, "loss": 0.0004, "step": 8675 }, { "epoch": 1.3622840440524036, "grad_norm": 0.006642700172960758, "learning_rate": 3.033870780651897e-05, "loss": 0.0014, "step": 8700 }, { "epoch": 1.3661986533743933, "grad_norm": 0.003900151466950774, "learning_rate": 3.026621041642501e-05, "loss": 0.005, "step": 8725 }, { "epoch": 1.3701132626963828, "grad_norm": 0.0015803135465830564, "learning_rate": 3.0193713026331057e-05, "loss": 0.0005, "step": 8750 }, { "epoch": 1.3740278720183725, "grad_norm": 0.010884587652981281, "learning_rate": 3.01212156362371e-05, "loss": 0.0009, "step": 8775 }, { "epoch": 1.3779424813403622, "grad_norm": 0.0010327239288017154, "learning_rate": 3.004871824614314e-05, "loss": 0.0002, "step": 8800 }, { "epoch": 1.381857090662352, "grad_norm": 0.0019380106823518872, "learning_rate": 2.9976220856049187e-05, "loss": 0.0075, "step": 8825 }, { "epoch": 1.3857716999843417, "grad_norm": 0.0012182651553303003, "learning_rate": 2.9903723465955226e-05, "loss": 0.0053, "step": 8850 }, { "epoch": 1.3896863093063312, "grad_norm": 0.0017849428113549948, "learning_rate": 2.9831226075861268e-05, "loss": 0.0006, "step": 8875 }, { "epoch": 1.393600918628321, "grad_norm": 0.01608388125896454, "learning_rate": 2.9758728685767314e-05, "loss": 0.0011, "step": 8900 }, { "epoch": 1.3975155279503104, "grad_norm": 0.0005828512366861105, "learning_rate": 2.9686231295673356e-05, "loss": 0.0022, "step": 8925 }, { "epoch": 1.4014301372723001, "grad_norm": 0.0004743439785670489, "learning_rate": 2.9613733905579398e-05, "loss": 0.0006, "step": 8950 }, { "epoch": 1.4053447465942899, "grad_norm": 0.0005540683632716537, "learning_rate": 2.9541236515485444e-05, "loss": 0.0003, "step": 8975 }, { "epoch": 1.4092593559162796, "grad_norm": 0.0015846255701035261, "learning_rate": 2.9468739125391486e-05, "loss": 0.003, "step": 9000 }, { "epoch": 1.4131739652382693, "grad_norm": 0.0018151472322642803, "learning_rate": 2.939624173529753e-05, "loss": 0.0013, "step": 9025 }, { "epoch": 1.4170885745602588, "grad_norm": 0.019647782668471336, "learning_rate": 2.9323744345203574e-05, "loss": 0.0005, "step": 9050 }, { "epoch": 1.4210031838822486, "grad_norm": 0.0019365083426237106, "learning_rate": 2.925124695510962e-05, "loss": 0.0078, "step": 9075 }, { "epoch": 1.4249177932042383, "grad_norm": 0.018348557874560356, "learning_rate": 2.917874956501566e-05, "loss": 0.0007, "step": 9100 }, { "epoch": 1.4288324025262278, "grad_norm": 0.0018460671417415142, "learning_rate": 2.9106252174921704e-05, "loss": 0.0004, "step": 9125 }, { "epoch": 1.4327470118482175, "grad_norm": 0.014430728740990162, "learning_rate": 2.903375478482775e-05, "loss": 0.0065, "step": 9150 }, { "epoch": 1.4366616211702072, "grad_norm": 0.004876923281699419, "learning_rate": 2.896125739473379e-05, "loss": 0.0005, "step": 9175 }, { "epoch": 1.440576230492197, "grad_norm": 0.012378478422760963, "learning_rate": 2.8888760004639837e-05, "loss": 0.0011, "step": 9200 }, { "epoch": 1.4444908398141865, "grad_norm": 0.0017155319219455123, "learning_rate": 2.881626261454588e-05, "loss": 0.0005, "step": 9225 }, { "epoch": 1.4484054491361762, "grad_norm": 0.0008338566403836012, "learning_rate": 2.874376522445192e-05, "loss": 0.0002, "step": 9250 }, { "epoch": 1.452320058458166, "grad_norm": 0.18289905786514282, "learning_rate": 2.867126783435796e-05, "loss": 0.0012, "step": 9275 }, { "epoch": 1.4562346677801554, "grad_norm": 0.0008503763237968087, "learning_rate": 2.8598770444264006e-05, "loss": 0.0005, "step": 9300 }, { "epoch": 1.4601492771021451, "grad_norm": 0.0007721242727711797, "learning_rate": 2.852627305417005e-05, "loss": 0.0001, "step": 9325 }, { "epoch": 1.4640638864241349, "grad_norm": 0.006053832825273275, "learning_rate": 2.8453775664076094e-05, "loss": 0.0004, "step": 9350 }, { "epoch": 1.4679784957461246, "grad_norm": 0.002682841382920742, "learning_rate": 2.8381278273982136e-05, "loss": 0.0011, "step": 9375 }, { "epoch": 1.4718931050681143, "grad_norm": 0.0006761788972653449, "learning_rate": 2.8308780883888182e-05, "loss": 0.0006, "step": 9400 }, { "epoch": 1.4758077143901038, "grad_norm": 0.0006122990744188428, "learning_rate": 2.8236283493794224e-05, "loss": 0.0003, "step": 9425 }, { "epoch": 1.4797223237120936, "grad_norm": 0.0022469067480415106, "learning_rate": 2.816378610370027e-05, "loss": 0.0001, "step": 9450 }, { "epoch": 1.483636933034083, "grad_norm": 0.005789736285805702, "learning_rate": 2.8091288713606312e-05, "loss": 0.001, "step": 9475 }, { "epoch": 1.4875515423560728, "grad_norm": 0.0005803314852528274, "learning_rate": 2.8018791323512354e-05, "loss": 0.0001, "step": 9500 }, { "epoch": 1.4914661516780625, "grad_norm": 0.00044589489698410034, "learning_rate": 2.79462939334184e-05, "loss": 0.0001, "step": 9525 }, { "epoch": 1.4953807610000522, "grad_norm": 0.00034716431400738657, "learning_rate": 2.7873796543324442e-05, "loss": 0.0004, "step": 9550 }, { "epoch": 1.499295370322042, "grad_norm": 0.034700002521276474, "learning_rate": 2.7801299153230488e-05, "loss": 0.0003, "step": 9575 }, { "epoch": 1.5032099796440317, "grad_norm": 0.00039778611971996725, "learning_rate": 2.772880176313653e-05, "loss": 0.0001, "step": 9600 }, { "epoch": 1.5071245889660212, "grad_norm": 0.0003559018950909376, "learning_rate": 2.7656304373042576e-05, "loss": 0.0, "step": 9625 }, { "epoch": 1.5110391982880107, "grad_norm": 0.7171289920806885, "learning_rate": 2.7583806982948618e-05, "loss": 0.001, "step": 9650 }, { "epoch": 1.5149538076100004, "grad_norm": 0.0009173134458251297, "learning_rate": 2.7511309592854657e-05, "loss": 0.0002, "step": 9675 }, { "epoch": 1.5188684169319902, "grad_norm": 0.6568087935447693, "learning_rate": 2.74388122027607e-05, "loss": 0.0032, "step": 9700 }, { "epoch": 1.5227830262539799, "grad_norm": 0.03286755084991455, "learning_rate": 2.7366314812666745e-05, "loss": 0.0005, "step": 9725 }, { "epoch": 1.5266976355759696, "grad_norm": 0.0004193273780401796, "learning_rate": 2.7293817422572787e-05, "loss": 0.0004, "step": 9750 }, { "epoch": 1.5306122448979593, "grad_norm": 2.4084434509277344, "learning_rate": 2.7221320032478832e-05, "loss": 0.0061, "step": 9775 }, { "epoch": 1.5345268542199488, "grad_norm": 0.020185716450214386, "learning_rate": 2.7148822642384875e-05, "loss": 0.0068, "step": 9800 }, { "epoch": 1.5384414635419386, "grad_norm": 0.6322495937347412, "learning_rate": 2.7076325252290917e-05, "loss": 0.0015, "step": 9825 }, { "epoch": 1.542356072863928, "grad_norm": 0.0004228654725011438, "learning_rate": 2.7003827862196962e-05, "loss": 0.0009, "step": 9850 }, { "epoch": 1.5462706821859178, "grad_norm": 0.0012805104488506913, "learning_rate": 2.6931330472103005e-05, "loss": 0.0002, "step": 9875 }, { "epoch": 1.5501852915079075, "grad_norm": 0.0005116848042234778, "learning_rate": 2.685883308200905e-05, "loss": 0.0006, "step": 9900 }, { "epoch": 1.5540999008298972, "grad_norm": 0.8417395353317261, "learning_rate": 2.6786335691915093e-05, "loss": 0.0017, "step": 9925 }, { "epoch": 1.558014510151887, "grad_norm": 0.0006132688722573221, "learning_rate": 2.6713838301821138e-05, "loss": 0.0002, "step": 9950 }, { "epoch": 1.5619291194738765, "grad_norm": 0.001284563448280096, "learning_rate": 2.664134091172718e-05, "loss": 0.0001, "step": 9975 }, { "epoch": 1.5658437287958662, "grad_norm": 0.002453350927680731, "learning_rate": 2.6568843521633226e-05, "loss": 0.0001, "step": 10000 }, { "epoch": 1.5697583381178557, "grad_norm": 0.002474565990269184, "learning_rate": 2.6496346131539268e-05, "loss": 0.0, "step": 10025 }, { "epoch": 1.5736729474398454, "grad_norm": 0.0012147346278652549, "learning_rate": 2.642384874144531e-05, "loss": 0.0031, "step": 10050 }, { "epoch": 1.5775875567618352, "grad_norm": 0.0009614901500754058, "learning_rate": 2.635135135135135e-05, "loss": 0.0009, "step": 10075 }, { "epoch": 1.5815021660838249, "grad_norm": 0.00043524886132217944, "learning_rate": 2.6278853961257395e-05, "loss": 0.0001, "step": 10100 }, { "epoch": 1.5854167754058146, "grad_norm": 0.0005262857885099947, "learning_rate": 2.6206356571163437e-05, "loss": 0.0, "step": 10125 }, { "epoch": 1.5893313847278043, "grad_norm": 0.00038553698686882854, "learning_rate": 2.613385918106948e-05, "loss": 0.0013, "step": 10150 }, { "epoch": 1.5932459940497938, "grad_norm": 0.0006603036308661103, "learning_rate": 2.6061361790975525e-05, "loss": 0.0001, "step": 10175 }, { "epoch": 1.5971606033717833, "grad_norm": 0.0011721713235601783, "learning_rate": 2.5988864400881567e-05, "loss": 0.0, "step": 10200 }, { "epoch": 1.601075212693773, "grad_norm": 0.00034801868605427444, "learning_rate": 2.5916367010787613e-05, "loss": 0.0003, "step": 10225 }, { "epoch": 1.6049898220157628, "grad_norm": 0.00029766836087219417, "learning_rate": 2.5843869620693655e-05, "loss": 0.0005, "step": 10250 }, { "epoch": 1.6089044313377525, "grad_norm": 0.8273627161979675, "learning_rate": 2.57713722305997e-05, "loss": 0.004, "step": 10275 }, { "epoch": 1.6128190406597422, "grad_norm": 0.0023189974017441273, "learning_rate": 2.5698874840505743e-05, "loss": 0.0005, "step": 10300 }, { "epoch": 1.616733649981732, "grad_norm": 0.001266616047360003, "learning_rate": 2.562637745041179e-05, "loss": 0.0006, "step": 10325 }, { "epoch": 1.6206482593037215, "grad_norm": 0.0006485527264885604, "learning_rate": 2.555388006031783e-05, "loss": 0.0001, "step": 10350 }, { "epoch": 1.6245628686257112, "grad_norm": 0.01249407883733511, "learning_rate": 2.5481382670223873e-05, "loss": 0.0047, "step": 10375 }, { "epoch": 1.6284774779477007, "grad_norm": 0.0016884652432054281, "learning_rate": 2.540888528012992e-05, "loss": 0.0008, "step": 10400 }, { "epoch": 1.6323920872696904, "grad_norm": 0.0009969666134566069, "learning_rate": 2.533638789003596e-05, "loss": 0.0001, "step": 10425 }, { "epoch": 1.6363066965916802, "grad_norm": 0.0008430654415860772, "learning_rate": 2.5263890499942007e-05, "loss": 0.0004, "step": 10450 }, { "epoch": 1.6402213059136699, "grad_norm": 0.0007658881950192153, "learning_rate": 2.519139310984805e-05, "loss": 0.0001, "step": 10475 }, { "epoch": 1.6441359152356596, "grad_norm": 0.0007439135224558413, "learning_rate": 2.5118895719754088e-05, "loss": 0.0, "step": 10500 }, { "epoch": 1.648050524557649, "grad_norm": 0.0005683203344233334, "learning_rate": 2.504639832966013e-05, "loss": 0.0001, "step": 10525 }, { "epoch": 1.6519651338796388, "grad_norm": 0.00042879345710389316, "learning_rate": 2.497390093956618e-05, "loss": 0.0, "step": 10550 }, { "epoch": 1.6558797432016283, "grad_norm": 0.0004082492378074676, "learning_rate": 2.490140354947222e-05, "loss": 0.0001, "step": 10575 }, { "epoch": 1.659794352523618, "grad_norm": 0.002024848246946931, "learning_rate": 2.4828906159378263e-05, "loss": 0.0006, "step": 10600 }, { "epoch": 1.6637089618456078, "grad_norm": 0.3372742533683777, "learning_rate": 2.4756408769284306e-05, "loss": 0.0001, "step": 10625 }, { "epoch": 1.6676235711675975, "grad_norm": 0.005234843585640192, "learning_rate": 2.468391137919035e-05, "loss": 0.0, "step": 10650 }, { "epoch": 1.6715381804895872, "grad_norm": 0.0004937741323374212, "learning_rate": 2.4611413989096393e-05, "loss": 0.0005, "step": 10675 }, { "epoch": 1.675452789811577, "grad_norm": 0.0007821121835149825, "learning_rate": 2.4538916599002436e-05, "loss": 0.0013, "step": 10700 }, { "epoch": 1.6793673991335665, "grad_norm": 0.0010803567711263895, "learning_rate": 2.446641920890848e-05, "loss": 0.0004, "step": 10725 }, { "epoch": 1.683282008455556, "grad_norm": 0.0005569527274928987, "learning_rate": 2.4393921818814523e-05, "loss": 0.0002, "step": 10750 }, { "epoch": 1.6871966177775457, "grad_norm": 0.005404625087976456, "learning_rate": 2.432142442872057e-05, "loss": 0.0085, "step": 10775 }, { "epoch": 1.6911112270995354, "grad_norm": 0.001234252005815506, "learning_rate": 2.4248927038626608e-05, "loss": 0.0014, "step": 10800 }, { "epoch": 1.6950258364215252, "grad_norm": 0.0025794110260903835, "learning_rate": 2.4176429648532654e-05, "loss": 0.0014, "step": 10825 }, { "epoch": 1.6989404457435149, "grad_norm": 0.07590831816196442, "learning_rate": 2.4103932258438696e-05, "loss": 0.0023, "step": 10850 }, { "epoch": 1.7028550550655046, "grad_norm": 0.005912380293011665, "learning_rate": 2.403143486834474e-05, "loss": 0.0015, "step": 10875 }, { "epoch": 1.7067696643874941, "grad_norm": 0.010333801619708538, "learning_rate": 2.3958937478250784e-05, "loss": 0.0001, "step": 10900 }, { "epoch": 1.7106842737094838, "grad_norm": 0.01580122299492359, "learning_rate": 2.388644008815683e-05, "loss": 0.0018, "step": 10925 }, { "epoch": 1.7145988830314733, "grad_norm": 0.10874010622501373, "learning_rate": 2.381394269806287e-05, "loss": 0.0012, "step": 10950 }, { "epoch": 1.718513492353463, "grad_norm": 0.016742747277021408, "learning_rate": 2.3741445307968914e-05, "loss": 0.0009, "step": 10975 }, { "epoch": 1.7224281016754528, "grad_norm": 0.012475020252168179, "learning_rate": 2.366894791787496e-05, "loss": 0.0007, "step": 11000 }, { "epoch": 1.7263427109974425, "grad_norm": 0.1469310075044632, "learning_rate": 2.3596450527781e-05, "loss": 0.002, "step": 11025 }, { "epoch": 1.7302573203194322, "grad_norm": 0.0017377269687131047, "learning_rate": 2.3523953137687044e-05, "loss": 0.0001, "step": 11050 }, { "epoch": 1.7341719296414217, "grad_norm": 0.003490234026685357, "learning_rate": 2.3451455747593086e-05, "loss": 0.0001, "step": 11075 }, { "epoch": 1.7380865389634115, "grad_norm": 0.008674775250256062, "learning_rate": 2.337895835749913e-05, "loss": 0.0039, "step": 11100 }, { "epoch": 1.742001148285401, "grad_norm": 0.004905765876173973, "learning_rate": 2.3306460967405174e-05, "loss": 0.0023, "step": 11125 }, { "epoch": 1.7459157576073907, "grad_norm": 0.0013971665175631642, "learning_rate": 2.323396357731122e-05, "loss": 0.0002, "step": 11150 }, { "epoch": 1.7498303669293804, "grad_norm": 0.004542670212686062, "learning_rate": 2.3161466187217262e-05, "loss": 0.0001, "step": 11175 }, { "epoch": 1.7537449762513702, "grad_norm": 0.0004924107925035059, "learning_rate": 2.3088968797123307e-05, "loss": 0.0001, "step": 11200 }, { "epoch": 1.7576595855733599, "grad_norm": 0.0016612813342362642, "learning_rate": 2.3016471407029346e-05, "loss": 0.0003, "step": 11225 }, { "epoch": 1.7615741948953496, "grad_norm": 0.0002968982153106481, "learning_rate": 2.2943974016935392e-05, "loss": 0.0001, "step": 11250 }, { "epoch": 1.7654888042173391, "grad_norm": 0.00263870763592422, "learning_rate": 2.2871476626841434e-05, "loss": 0.0011, "step": 11275 }, { "epoch": 1.7694034135393286, "grad_norm": 0.00835906621068716, "learning_rate": 2.279897923674748e-05, "loss": 0.0067, "step": 11300 }, { "epoch": 1.7733180228613183, "grad_norm": 0.0007750336080789566, "learning_rate": 2.2726481846653522e-05, "loss": 0.0003, "step": 11325 }, { "epoch": 1.777232632183308, "grad_norm": 0.0028884296771138906, "learning_rate": 2.2653984456559564e-05, "loss": 0.0023, "step": 11350 }, { "epoch": 1.7811472415052978, "grad_norm": 0.042546164244413376, "learning_rate": 2.258148706646561e-05, "loss": 0.0003, "step": 11375 }, { "epoch": 1.7850618508272875, "grad_norm": 0.0007674749358557165, "learning_rate": 2.2508989676371652e-05, "loss": 0.0003, "step": 11400 }, { "epoch": 1.7889764601492772, "grad_norm": 0.040151335299015045, "learning_rate": 2.2436492286277694e-05, "loss": 0.0002, "step": 11425 }, { "epoch": 1.7928910694712668, "grad_norm": 0.0003488350484985858, "learning_rate": 2.2363994896183736e-05, "loss": 0.0, "step": 11450 }, { "epoch": 1.7968056787932565, "grad_norm": 0.25811877846717834, "learning_rate": 2.2291497506089782e-05, "loss": 0.0004, "step": 11475 }, { "epoch": 1.800720288115246, "grad_norm": 0.00024293421301990747, "learning_rate": 2.2219000115995824e-05, "loss": 0.0003, "step": 11500 }, { "epoch": 1.8046348974372357, "grad_norm": 0.004234221298247576, "learning_rate": 2.214650272590187e-05, "loss": 0.0029, "step": 11525 }, { "epoch": 1.8085495067592254, "grad_norm": 0.0003131197008769959, "learning_rate": 2.2074005335807912e-05, "loss": 0.0002, "step": 11550 }, { "epoch": 1.8124641160812152, "grad_norm": 0.05105828866362572, "learning_rate": 2.2001507945713958e-05, "loss": 0.0008, "step": 11575 }, { "epoch": 1.8163787254032049, "grad_norm": 0.014320386573672295, "learning_rate": 2.192901055562e-05, "loss": 0.0009, "step": 11600 }, { "epoch": 1.8202933347251944, "grad_norm": 0.0003410752979107201, "learning_rate": 2.1856513165526042e-05, "loss": 0.0007, "step": 11625 }, { "epoch": 1.8242079440471841, "grad_norm": 0.0003042153548449278, "learning_rate": 2.1784015775432085e-05, "loss": 0.0006, "step": 11650 }, { "epoch": 1.8281225533691736, "grad_norm": 0.1060762032866478, "learning_rate": 2.1711518385338127e-05, "loss": 0.0001, "step": 11675 }, { "epoch": 1.8320371626911633, "grad_norm": 0.0008619217551313341, "learning_rate": 2.1639020995244172e-05, "loss": 0.0006, "step": 11700 }, { "epoch": 1.835951772013153, "grad_norm": 0.0005810207221657038, "learning_rate": 2.1566523605150215e-05, "loss": 0.0005, "step": 11725 }, { "epoch": 1.8398663813351428, "grad_norm": 0.005664344877004623, "learning_rate": 2.149402621505626e-05, "loss": 0.0026, "step": 11750 }, { "epoch": 1.8437809906571325, "grad_norm": 4.294293403625488, "learning_rate": 2.1421528824962302e-05, "loss": 0.0007, "step": 11775 }, { "epoch": 1.8476955999791222, "grad_norm": 0.040877822786569595, "learning_rate": 2.1349031434868348e-05, "loss": 0.0021, "step": 11800 }, { "epoch": 1.8516102093011118, "grad_norm": 0.003679527435451746, "learning_rate": 2.127653404477439e-05, "loss": 0.0006, "step": 11825 }, { "epoch": 1.8555248186231013, "grad_norm": 0.003342527663335204, "learning_rate": 2.1204036654680433e-05, "loss": 0.0002, "step": 11850 }, { "epoch": 1.859439427945091, "grad_norm": 0.000454226101282984, "learning_rate": 2.1131539264586475e-05, "loss": 0.0003, "step": 11875 }, { "epoch": 1.8633540372670807, "grad_norm": 0.00024604357895441353, "learning_rate": 2.105904187449252e-05, "loss": 0.0004, "step": 11900 }, { "epoch": 1.8672686465890704, "grad_norm": 0.00022296722454484552, "learning_rate": 2.0986544484398563e-05, "loss": 0.0003, "step": 11925 }, { "epoch": 1.8711832559110602, "grad_norm": 0.0013281836872920394, "learning_rate": 2.0914047094304605e-05, "loss": 0.0012, "step": 11950 }, { "epoch": 1.8750978652330499, "grad_norm": 0.00042916362872347236, "learning_rate": 2.084154970421065e-05, "loss": 0.0006, "step": 11975 }, { "epoch": 1.8790124745550394, "grad_norm": 0.0013623477425426245, "learning_rate": 2.0769052314116693e-05, "loss": 0.0014, "step": 12000 }, { "epoch": 1.8829270838770291, "grad_norm": 0.0005729927215725183, "learning_rate": 2.069655492402274e-05, "loss": 0.0047, "step": 12025 }, { "epoch": 1.8868416931990186, "grad_norm": 0.0161959920078516, "learning_rate": 2.0624057533928777e-05, "loss": 0.0002, "step": 12050 }, { "epoch": 1.8907563025210083, "grad_norm": 0.05182856693863869, "learning_rate": 2.0551560143834823e-05, "loss": 0.0011, "step": 12075 }, { "epoch": 1.894670911842998, "grad_norm": 0.0009345468715764582, "learning_rate": 2.0479062753740865e-05, "loss": 0.0004, "step": 12100 }, { "epoch": 1.8985855211649878, "grad_norm": 0.004085169639438391, "learning_rate": 2.040656536364691e-05, "loss": 0.0009, "step": 12125 }, { "epoch": 1.9025001304869775, "grad_norm": 0.003939950373023748, "learning_rate": 2.0334067973552953e-05, "loss": 0.0002, "step": 12150 }, { "epoch": 1.906414739808967, "grad_norm": 0.0006880080327391624, "learning_rate": 2.0261570583459e-05, "loss": 0.0001, "step": 12175 }, { "epoch": 1.9103293491309568, "grad_norm": 0.01777348481118679, "learning_rate": 2.018907319336504e-05, "loss": 0.0001, "step": 12200 }, { "epoch": 1.9142439584529463, "grad_norm": 0.0002502555726096034, "learning_rate": 2.0116575803271083e-05, "loss": 0.0005, "step": 12225 }, { "epoch": 1.918158567774936, "grad_norm": 0.0007615393842570484, "learning_rate": 2.0044078413177125e-05, "loss": 0.0022, "step": 12250 }, { "epoch": 1.9220731770969257, "grad_norm": 0.008713331073522568, "learning_rate": 1.9971581023083167e-05, "loss": 0.001, "step": 12275 }, { "epoch": 1.9259877864189154, "grad_norm": 0.003203247208148241, "learning_rate": 1.9899083632989213e-05, "loss": 0.0031, "step": 12300 }, { "epoch": 1.9299023957409052, "grad_norm": 0.02553451806306839, "learning_rate": 1.9826586242895255e-05, "loss": 0.001, "step": 12325 }, { "epoch": 1.933817005062895, "grad_norm": 0.045750390738248825, "learning_rate": 1.97540888528013e-05, "loss": 0.0007, "step": 12350 }, { "epoch": 1.9377316143848844, "grad_norm": 0.0004758847935590893, "learning_rate": 1.9681591462707343e-05, "loss": 0.001, "step": 12375 }, { "epoch": 1.941646223706874, "grad_norm": 0.0024788689333945513, "learning_rate": 1.960909407261339e-05, "loss": 0.0003, "step": 12400 }, { "epoch": 1.9455608330288636, "grad_norm": 0.0014538065297529101, "learning_rate": 1.953659668251943e-05, "loss": 0.0007, "step": 12425 }, { "epoch": 1.9494754423508533, "grad_norm": 0.00023535569198429585, "learning_rate": 1.9464099292425473e-05, "loss": 0.0003, "step": 12450 }, { "epoch": 1.953390051672843, "grad_norm": 0.0002048378373729065, "learning_rate": 1.9391601902331515e-05, "loss": 0.0001, "step": 12475 }, { "epoch": 1.9573046609948328, "grad_norm": 0.0004028423863928765, "learning_rate": 1.931910451223756e-05, "loss": 0.0008, "step": 12500 }, { "epoch": 1.9612192703168225, "grad_norm": 0.0021086076740175486, "learning_rate": 1.9246607122143603e-05, "loss": 0.0002, "step": 12525 }, { "epoch": 1.965133879638812, "grad_norm": 0.00085318653145805, "learning_rate": 1.9174109732049646e-05, "loss": 0.0001, "step": 12550 }, { "epoch": 1.9690484889608018, "grad_norm": 0.00021198119793552905, "learning_rate": 1.910161234195569e-05, "loss": 0.0007, "step": 12575 }, { "epoch": 1.9729630982827913, "grad_norm": 0.00025199473020620644, "learning_rate": 1.9029114951861733e-05, "loss": 0.0011, "step": 12600 }, { "epoch": 1.976877707604781, "grad_norm": 0.0007640988333150744, "learning_rate": 1.895661756176778e-05, "loss": 0.0003, "step": 12625 }, { "epoch": 1.9807923169267707, "grad_norm": 0.013913657516241074, "learning_rate": 1.888412017167382e-05, "loss": 0.0001, "step": 12650 }, { "epoch": 1.9847069262487604, "grad_norm": 0.00018586177611723542, "learning_rate": 1.8811622781579863e-05, "loss": 0.001, "step": 12675 }, { "epoch": 1.9886215355707502, "grad_norm": 0.00032623313018120825, "learning_rate": 1.8739125391485906e-05, "loss": 0.0, "step": 12700 }, { "epoch": 1.9925361448927397, "grad_norm": 0.00017907471919897944, "learning_rate": 1.866662800139195e-05, "loss": 0.0, "step": 12725 }, { "epoch": 1.9964507542147294, "grad_norm": 1.226132869720459, "learning_rate": 1.8594130611297994e-05, "loss": 0.0007, "step": 12750 }, { "epoch": 1.9998956104180803, "eval_accuracy": 0.9998184512550563, "eval_f1": 0.9998301719182735, "eval_loss": 0.0009782494744285941, "eval_precision": 0.999817110608891, "eval_recall": 0.9998432335689185, "eval_runtime": 63.1773, "eval_samples_per_second": 606.515, "eval_steps_per_second": 37.909, "step": 12772 }, { "epoch": 2.000365363536719, "grad_norm": 0.0005716659361496568, "learning_rate": 1.852163322120404e-05, "loss": 0.0, "step": 12775 }, { "epoch": 2.0042799728587086, "grad_norm": 0.0003549535758793354, "learning_rate": 1.844913583111008e-05, "loss": 0.0016, "step": 12800 }, { "epoch": 2.0081945821806984, "grad_norm": 0.012497562915086746, "learning_rate": 1.8376638441016124e-05, "loss": 0.0003, "step": 12825 }, { "epoch": 2.012109191502688, "grad_norm": 0.0003994225990027189, "learning_rate": 1.830414105092217e-05, "loss": 0.0022, "step": 12850 }, { "epoch": 2.016023800824678, "grad_norm": 0.0007454080041497946, "learning_rate": 1.823164366082821e-05, "loss": 0.0001, "step": 12875 }, { "epoch": 2.0199384101466675, "grad_norm": 0.0001763895561452955, "learning_rate": 1.8159146270734254e-05, "loss": 0.0001, "step": 12900 }, { "epoch": 2.023853019468657, "grad_norm": 1.3950115442276, "learning_rate": 1.8086648880640296e-05, "loss": 0.0002, "step": 12925 }, { "epoch": 2.0277676287906465, "grad_norm": 0.00019921216880902648, "learning_rate": 1.801415149054634e-05, "loss": 0.0003, "step": 12950 }, { "epoch": 2.0316822381126363, "grad_norm": 0.00017710919200908393, "learning_rate": 1.7941654100452384e-05, "loss": 0.0001, "step": 12975 }, { "epoch": 2.035596847434626, "grad_norm": 0.0029750317335128784, "learning_rate": 1.7872056605962187e-05, "loss": 0.0048, "step": 13000 }, { "epoch": 2.0395114567566157, "grad_norm": 0.006306216586381197, "learning_rate": 1.779955921586823e-05, "loss": 0.0001, "step": 13025 }, { "epoch": 2.0434260660786054, "grad_norm": 0.0116845378652215, "learning_rate": 1.7727061825774275e-05, "loss": 0.0001, "step": 13050 }, { "epoch": 2.047340675400595, "grad_norm": 0.6511118412017822, "learning_rate": 1.7654564435680314e-05, "loss": 0.0003, "step": 13075 }, { "epoch": 2.051255284722585, "grad_norm": 0.036821216344833374, "learning_rate": 1.758206704558636e-05, "loss": 0.0007, "step": 13100 }, { "epoch": 2.055169894044574, "grad_norm": 0.002877579303458333, "learning_rate": 1.7509569655492402e-05, "loss": 0.0001, "step": 13125 }, { "epoch": 2.059084503366564, "grad_norm": 0.3921562433242798, "learning_rate": 1.7437072265398448e-05, "loss": 0.0015, "step": 13150 }, { "epoch": 2.0629991126885536, "grad_norm": 0.0009924178011715412, "learning_rate": 1.736457487530449e-05, "loss": 0.0001, "step": 13175 }, { "epoch": 2.0669137220105434, "grad_norm": 0.00017288805975113064, "learning_rate": 1.7292077485210536e-05, "loss": 0.0, "step": 13200 }, { "epoch": 2.070828331332533, "grad_norm": 0.002701199846342206, "learning_rate": 1.7222479990720336e-05, "loss": 0.0002, "step": 13225 }, { "epoch": 2.074742940654523, "grad_norm": 0.0008174364338628948, "learning_rate": 1.7149982600626378e-05, "loss": 0.0002, "step": 13250 }, { "epoch": 2.0786575499765125, "grad_norm": 0.0007628415478393435, "learning_rate": 1.707748521053242e-05, "loss": 0.0026, "step": 13275 }, { "epoch": 2.082572159298502, "grad_norm": 0.002812017919495702, "learning_rate": 1.7004987820438466e-05, "loss": 0.0005, "step": 13300 }, { "epoch": 2.0864867686204915, "grad_norm": 0.0008141273865476251, "learning_rate": 1.6932490430344508e-05, "loss": 0.0011, "step": 13325 }, { "epoch": 2.0904013779424813, "grad_norm": 0.001382953836582601, "learning_rate": 1.685999304025055e-05, "loss": 0.0001, "step": 13350 }, { "epoch": 2.094315987264471, "grad_norm": 0.0003401144640520215, "learning_rate": 1.6787495650156596e-05, "loss": 0.0007, "step": 13375 }, { "epoch": 2.0982305965864607, "grad_norm": 0.0013442619238048792, "learning_rate": 1.6714998260062638e-05, "loss": 0.0003, "step": 13400 }, { "epoch": 2.1021452059084504, "grad_norm": 0.0014413492754101753, "learning_rate": 1.6642500869968684e-05, "loss": 0.0, "step": 13425 }, { "epoch": 2.10605981523044, "grad_norm": 0.0010517615592107177, "learning_rate": 1.6570003479874726e-05, "loss": 0.0001, "step": 13450 }, { "epoch": 2.10997442455243, "grad_norm": 0.0006386275636032224, "learning_rate": 1.6497506089780768e-05, "loss": 0.0001, "step": 13475 }, { "epoch": 2.113889033874419, "grad_norm": 0.06532581895589828, "learning_rate": 1.642500869968681e-05, "loss": 0.0, "step": 13500 }, { "epoch": 2.117803643196409, "grad_norm": 0.6693994402885437, "learning_rate": 1.6352511309592856e-05, "loss": 0.0003, "step": 13525 }, { "epoch": 2.1217182525183986, "grad_norm": 0.00038365976070053875, "learning_rate": 1.62800139194989e-05, "loss": 0.001, "step": 13550 }, { "epoch": 2.1256328618403884, "grad_norm": 0.00022832312970422208, "learning_rate": 1.6207516529404944e-05, "loss": 0.0005, "step": 13575 }, { "epoch": 2.129547471162378, "grad_norm": 0.0010044885566458106, "learning_rate": 1.6135019139310986e-05, "loss": 0.0002, "step": 13600 }, { "epoch": 2.133462080484368, "grad_norm": 0.0077380407601594925, "learning_rate": 1.606252174921703e-05, "loss": 0.0006, "step": 13625 }, { "epoch": 2.1373766898063575, "grad_norm": 0.004086634609848261, "learning_rate": 1.5990024359123074e-05, "loss": 0.0, "step": 13650 }, { "epoch": 2.141291299128347, "grad_norm": 0.0014021744718775153, "learning_rate": 1.5917526969029113e-05, "loss": 0.0004, "step": 13675 }, { "epoch": 2.1452059084503365, "grad_norm": 0.0007703950395807624, "learning_rate": 1.584502957893516e-05, "loss": 0.0001, "step": 13700 }, { "epoch": 2.1491205177723263, "grad_norm": 0.041545968502759933, "learning_rate": 1.57725321888412e-05, "loss": 0.0131, "step": 13725 }, { "epoch": 2.153035127094316, "grad_norm": 0.005660552531480789, "learning_rate": 1.5700034798747246e-05, "loss": 0.0033, "step": 13750 }, { "epoch": 2.1569497364163057, "grad_norm": 0.00475983927026391, "learning_rate": 1.562753740865329e-05, "loss": 0.0003, "step": 13775 }, { "epoch": 2.1608643457382954, "grad_norm": 0.8047095537185669, "learning_rate": 1.5555040018559334e-05, "loss": 0.0003, "step": 13800 }, { "epoch": 2.164778955060285, "grad_norm": 0.0028548124246299267, "learning_rate": 1.5482542628465376e-05, "loss": 0.0002, "step": 13825 }, { "epoch": 2.1686935643822745, "grad_norm": 0.01292176079005003, "learning_rate": 1.5410045238371422e-05, "loss": 0.0004, "step": 13850 }, { "epoch": 2.172608173704264, "grad_norm": 0.00023661291925236583, "learning_rate": 1.533754784827746e-05, "loss": 0.0003, "step": 13875 }, { "epoch": 2.176522783026254, "grad_norm": 0.0002082917490042746, "learning_rate": 1.5265050458183507e-05, "loss": 0.0, "step": 13900 }, { "epoch": 2.1804373923482436, "grad_norm": 0.0005662673502229154, "learning_rate": 1.5192553068089549e-05, "loss": 0.0001, "step": 13925 }, { "epoch": 2.1843520016702334, "grad_norm": 0.00044558930676430464, "learning_rate": 1.5120055677995593e-05, "loss": 0.0, "step": 13950 }, { "epoch": 2.188266610992223, "grad_norm": 0.00019830386736430228, "learning_rate": 1.5047558287901637e-05, "loss": 0.0015, "step": 13975 }, { "epoch": 2.192181220314213, "grad_norm": 0.00026091316249221563, "learning_rate": 1.497506089780768e-05, "loss": 0.0003, "step": 14000 }, { "epoch": 2.196095829636202, "grad_norm": 0.00025246432051062584, "learning_rate": 1.4902563507713724e-05, "loss": 0.0, "step": 14025 }, { "epoch": 2.200010438958192, "grad_norm": 0.13300319015979767, "learning_rate": 1.4830066117619767e-05, "loss": 0.001, "step": 14050 }, { "epoch": 2.2039250482801815, "grad_norm": 0.00025016642757691443, "learning_rate": 1.4757568727525809e-05, "loss": 0.0001, "step": 14075 }, { "epoch": 2.2078396576021713, "grad_norm": 0.0002115444076480344, "learning_rate": 1.4685071337431853e-05, "loss": 0.0001, "step": 14100 }, { "epoch": 2.211754266924161, "grad_norm": 0.028121890500187874, "learning_rate": 1.4612573947337895e-05, "loss": 0.0009, "step": 14125 }, { "epoch": 2.2156688762461507, "grad_norm": 0.00024847922031767666, "learning_rate": 1.4540076557243939e-05, "loss": 0.0002, "step": 14150 }, { "epoch": 2.2195834855681404, "grad_norm": 0.0006722984835505486, "learning_rate": 1.4467579167149983e-05, "loss": 0.0006, "step": 14175 }, { "epoch": 2.22349809489013, "grad_norm": 0.004081141669303179, "learning_rate": 1.4395081777056027e-05, "loss": 0.0001, "step": 14200 }, { "epoch": 2.2274127042121195, "grad_norm": 0.0002169485087506473, "learning_rate": 1.432258438696207e-05, "loss": 0.0, "step": 14225 }, { "epoch": 2.231327313534109, "grad_norm": 0.0004130221204832196, "learning_rate": 1.4250086996868115e-05, "loss": 0.0, "step": 14250 }, { "epoch": 2.235241922856099, "grad_norm": 0.00018515564443077892, "learning_rate": 1.4177589606774159e-05, "loss": 0.0, "step": 14275 }, { "epoch": 2.2391565321780886, "grad_norm": 0.00019148353021591902, "learning_rate": 1.41050922166802e-05, "loss": 0.0001, "step": 14300 }, { "epoch": 2.2430711415000784, "grad_norm": 0.0020026888232678175, "learning_rate": 1.4032594826586243e-05, "loss": 0.0, "step": 14325 }, { "epoch": 2.246985750822068, "grad_norm": 0.00482906075194478, "learning_rate": 1.3960097436492287e-05, "loss": 0.0, "step": 14350 }, { "epoch": 2.250900360144058, "grad_norm": 0.00026496723876334727, "learning_rate": 1.3887600046398331e-05, "loss": 0.0, "step": 14375 }, { "epoch": 2.2548149694660475, "grad_norm": 0.0002499364491086453, "learning_rate": 1.3815102656304373e-05, "loss": 0.0004, "step": 14400 }, { "epoch": 2.258729578788037, "grad_norm": 0.00017081611440517008, "learning_rate": 1.3742605266210417e-05, "loss": 0.0, "step": 14425 }, { "epoch": 2.2626441881100265, "grad_norm": 0.00017463510448578745, "learning_rate": 1.3670107876116461e-05, "loss": 0.0, "step": 14450 }, { "epoch": 2.2665587974320163, "grad_norm": 0.0022245387081056833, "learning_rate": 1.3597610486022505e-05, "loss": 0.0091, "step": 14475 }, { "epoch": 2.270473406754006, "grad_norm": 0.007297486532479525, "learning_rate": 1.3525113095928546e-05, "loss": 0.0004, "step": 14500 }, { "epoch": 2.2743880160759957, "grad_norm": 0.005688577424734831, "learning_rate": 1.345261570583459e-05, "loss": 0.0002, "step": 14525 }, { "epoch": 2.2783026253979854, "grad_norm": 0.004135147202759981, "learning_rate": 1.3380118315740633e-05, "loss": 0.0024, "step": 14550 }, { "epoch": 2.282217234719975, "grad_norm": 0.002322606975212693, "learning_rate": 1.3307620925646677e-05, "loss": 0.0005, "step": 14575 }, { "epoch": 2.2861318440419645, "grad_norm": 0.002005909802392125, "learning_rate": 1.3235123535552721e-05, "loss": 0.001, "step": 14600 }, { "epoch": 2.290046453363954, "grad_norm": 0.0009009299101307988, "learning_rate": 1.3162626145458765e-05, "loss": 0.0009, "step": 14625 }, { "epoch": 2.293961062685944, "grad_norm": 0.009488469921052456, "learning_rate": 1.3090128755364809e-05, "loss": 0.0001, "step": 14650 }, { "epoch": 2.2978756720079336, "grad_norm": 0.017507528886198997, "learning_rate": 1.3017631365270851e-05, "loss": 0.0038, "step": 14675 }, { "epoch": 2.3017902813299234, "grad_norm": 0.0015606528613716364, "learning_rate": 1.2945133975176894e-05, "loss": 0.0003, "step": 14700 }, { "epoch": 2.305704890651913, "grad_norm": 0.0004906764370389283, "learning_rate": 1.2872636585082937e-05, "loss": 0.0006, "step": 14725 }, { "epoch": 2.309619499973903, "grad_norm": 0.001650349353440106, "learning_rate": 1.280013919498898e-05, "loss": 0.0001, "step": 14750 }, { "epoch": 2.313534109295892, "grad_norm": 0.00038060618680901825, "learning_rate": 1.2727641804895024e-05, "loss": 0.0005, "step": 14775 }, { "epoch": 2.317448718617882, "grad_norm": 0.001028302125632763, "learning_rate": 1.2655144414801068e-05, "loss": 0.0001, "step": 14800 }, { "epoch": 2.3213633279398715, "grad_norm": 0.007792349439114332, "learning_rate": 1.2582647024707112e-05, "loss": 0.0002, "step": 14825 }, { "epoch": 2.3252779372618613, "grad_norm": 0.01641431264579296, "learning_rate": 1.2510149634613155e-05, "loss": 0.0001, "step": 14850 }, { "epoch": 2.329192546583851, "grad_norm": 0.0006683383253403008, "learning_rate": 1.2437652244519198e-05, "loss": 0.0022, "step": 14875 }, { "epoch": 2.3331071559058407, "grad_norm": 0.0009384675067849457, "learning_rate": 1.2365154854425242e-05, "loss": 0.0017, "step": 14900 }, { "epoch": 2.3370217652278304, "grad_norm": 0.00037563694058917463, "learning_rate": 1.2292657464331286e-05, "loss": 0.0011, "step": 14925 }, { "epoch": 2.3409363745498197, "grad_norm": 0.00020698497246485204, "learning_rate": 1.222016007423733e-05, "loss": 0.0008, "step": 14950 }, { "epoch": 2.3448509838718095, "grad_norm": 0.0001723883324302733, "learning_rate": 1.2147662684143372e-05, "loss": 0.0001, "step": 14975 }, { "epoch": 2.348765593193799, "grad_norm": 0.06703776121139526, "learning_rate": 1.2075165294049416e-05, "loss": 0.0001, "step": 15000 }, { "epoch": 2.352680202515789, "grad_norm": 0.00018680775247048587, "learning_rate": 1.2002667903955458e-05, "loss": 0.0, "step": 15025 }, { "epoch": 2.3565948118377786, "grad_norm": 0.0001830816181609407, "learning_rate": 1.1930170513861502e-05, "loss": 0.0, "step": 15050 }, { "epoch": 2.3605094211597684, "grad_norm": 0.00020273331028874964, "learning_rate": 1.1857673123767544e-05, "loss": 0.0001, "step": 15075 }, { "epoch": 2.364424030481758, "grad_norm": 0.00020694882550742477, "learning_rate": 1.1785175733673588e-05, "loss": 0.0003, "step": 15100 }, { "epoch": 2.3683386398037474, "grad_norm": 0.0007374598644673824, "learning_rate": 1.1712678343579632e-05, "loss": 0.0061, "step": 15125 }, { "epoch": 2.372253249125737, "grad_norm": 0.01134900189936161, "learning_rate": 1.1640180953485676e-05, "loss": 0.0002, "step": 15150 }, { "epoch": 2.376167858447727, "grad_norm": 0.000648992951028049, "learning_rate": 1.1567683563391718e-05, "loss": 0.0001, "step": 15175 }, { "epoch": 2.3800824677697165, "grad_norm": 0.0004858991305809468, "learning_rate": 1.1495186173297762e-05, "loss": 0.0, "step": 15200 }, { "epoch": 2.3839970770917063, "grad_norm": 0.0870414674282074, "learning_rate": 1.1422688783203806e-05, "loss": 0.0002, "step": 15225 }, { "epoch": 2.387911686413696, "grad_norm": 0.0006449563661590219, "learning_rate": 1.135019139310985e-05, "loss": 0.0, "step": 15250 }, { "epoch": 2.3918262957356857, "grad_norm": 0.00027997951838187873, "learning_rate": 1.1277694003015892e-05, "loss": 0.0001, "step": 15275 }, { "epoch": 2.395740905057675, "grad_norm": 0.9710797667503357, "learning_rate": 1.1205196612921936e-05, "loss": 0.0, "step": 15300 }, { "epoch": 2.3996555143796647, "grad_norm": 0.00019451680418569595, "learning_rate": 1.1132699222827978e-05, "loss": 0.0, "step": 15325 }, { "epoch": 2.4035701237016545, "grad_norm": 0.0015565232606604695, "learning_rate": 1.1060201832734022e-05, "loss": 0.0038, "step": 15350 }, { "epoch": 2.407484733023644, "grad_norm": 0.00032800339977256954, "learning_rate": 1.0987704442640064e-05, "loss": 0.0002, "step": 15375 }, { "epoch": 2.411399342345634, "grad_norm": 0.017102686688303947, "learning_rate": 1.0915207052546108e-05, "loss": 0.0005, "step": 15400 }, { "epoch": 2.4153139516676236, "grad_norm": 0.001418459229171276, "learning_rate": 1.0842709662452152e-05, "loss": 0.0, "step": 15425 }, { "epoch": 2.4192285609896134, "grad_norm": 0.015620172023773193, "learning_rate": 1.0770212272358196e-05, "loss": 0.0001, "step": 15450 }, { "epoch": 2.423143170311603, "grad_norm": 0.0006858358392491937, "learning_rate": 1.0697714882264238e-05, "loss": 0.0018, "step": 15475 }, { "epoch": 2.427057779633593, "grad_norm": 0.0004693476075772196, "learning_rate": 1.0625217492170282e-05, "loss": 0.0001, "step": 15500 }, { "epoch": 2.430972388955582, "grad_norm": 0.34811919927597046, "learning_rate": 1.0552720102076326e-05, "loss": 0.0002, "step": 15525 }, { "epoch": 2.434886998277572, "grad_norm": 0.0014850205043330789, "learning_rate": 1.048022271198237e-05, "loss": 0.0001, "step": 15550 }, { "epoch": 2.4388016075995615, "grad_norm": 0.000363474857294932, "learning_rate": 1.0407725321888412e-05, "loss": 0.0004, "step": 15575 }, { "epoch": 2.4427162169215513, "grad_norm": 0.0010670394403859973, "learning_rate": 1.0335227931794456e-05, "loss": 0.0004, "step": 15600 }, { "epoch": 2.446630826243541, "grad_norm": 0.14377856254577637, "learning_rate": 1.0262730541700499e-05, "loss": 0.0019, "step": 15625 }, { "epoch": 2.4505454355655307, "grad_norm": 0.0004735889961011708, "learning_rate": 1.0190233151606542e-05, "loss": 0.0001, "step": 15650 }, { "epoch": 2.4544600448875205, "grad_norm": 0.0004282770969439298, "learning_rate": 1.0117735761512585e-05, "loss": 0.0026, "step": 15675 }, { "epoch": 2.4583746542095097, "grad_norm": 0.007325559854507446, "learning_rate": 1.0045238371418629e-05, "loss": 0.0007, "step": 15700 }, { "epoch": 2.4622892635314995, "grad_norm": 0.000777337234467268, "learning_rate": 9.972740981324673e-06, "loss": 0.0009, "step": 15725 }, { "epoch": 2.466203872853489, "grad_norm": 0.003926098812371492, "learning_rate": 9.900243591230716e-06, "loss": 0.0004, "step": 15750 }, { "epoch": 2.470118482175479, "grad_norm": 0.00045515818055719137, "learning_rate": 9.82774620113676e-06, "loss": 0.0001, "step": 15775 }, { "epoch": 2.4740330914974686, "grad_norm": 0.01267548743635416, "learning_rate": 9.755248811042803e-06, "loss": 0.0002, "step": 15800 }, { "epoch": 2.4779477008194584, "grad_norm": 0.0004507755220402032, "learning_rate": 9.682751420948847e-06, "loss": 0.0008, "step": 15825 }, { "epoch": 2.481862310141448, "grad_norm": 0.07765714824199677, "learning_rate": 9.61025403085489e-06, "loss": 0.0001, "step": 15850 }, { "epoch": 2.4857769194634374, "grad_norm": 0.0010181193938478827, "learning_rate": 9.537756640760934e-06, "loss": 0.0002, "step": 15875 }, { "epoch": 2.489691528785427, "grad_norm": 0.00023663626052439213, "learning_rate": 9.465259250666977e-06, "loss": 0.0011, "step": 15900 }, { "epoch": 2.493606138107417, "grad_norm": 0.010522628203034401, "learning_rate": 9.392761860573019e-06, "loss": 0.0002, "step": 15925 }, { "epoch": 2.4975207474294066, "grad_norm": 0.0006732672336511314, "learning_rate": 9.320264470479063e-06, "loss": 0.0001, "step": 15950 }, { "epoch": 2.5014353567513963, "grad_norm": 0.0013339362340047956, "learning_rate": 9.247767080385107e-06, "loss": 0.0, "step": 15975 }, { "epoch": 2.505349966073386, "grad_norm": 0.0003018657735083252, "learning_rate": 9.175269690291149e-06, "loss": 0.0, "step": 16000 }, { "epoch": 2.5092645753953757, "grad_norm": 0.0003551984846126288, "learning_rate": 9.102772300197193e-06, "loss": 0.0, "step": 16025 }, { "epoch": 2.513179184717365, "grad_norm": 0.00026321958284825087, "learning_rate": 9.030274910103237e-06, "loss": 0.0, "step": 16050 }, { "epoch": 2.5170937940393547, "grad_norm": 0.00023617663828190416, "learning_rate": 8.95777752000928e-06, "loss": 0.0001, "step": 16075 }, { "epoch": 2.5210084033613445, "grad_norm": 0.0029212015215307474, "learning_rate": 8.885280129915323e-06, "loss": 0.0, "step": 16100 }, { "epoch": 2.524923012683334, "grad_norm": 0.00017464791017118841, "learning_rate": 8.812782739821367e-06, "loss": 0.0, "step": 16125 }, { "epoch": 2.528837622005324, "grad_norm": 0.00018289768195245415, "learning_rate": 8.74028534972741e-06, "loss": 0.0002, "step": 16150 }, { "epoch": 2.5327522313273136, "grad_norm": 0.00490641500800848, "learning_rate": 8.667787959633455e-06, "loss": 0.0, "step": 16175 }, { "epoch": 2.5366668406493034, "grad_norm": 0.0014596167020499706, "learning_rate": 8.595290569539497e-06, "loss": 0.0, "step": 16200 }, { "epoch": 2.5405814499712926, "grad_norm": 0.00016923531075008214, "learning_rate": 8.522793179445541e-06, "loss": 0.0001, "step": 16225 }, { "epoch": 2.5444960592932824, "grad_norm": 0.001988182310014963, "learning_rate": 8.450295789351583e-06, "loss": 0.0001, "step": 16250 }, { "epoch": 2.548410668615272, "grad_norm": 0.0008615644765086472, "learning_rate": 8.377798399257627e-06, "loss": 0.0001, "step": 16275 }, { "epoch": 2.552325277937262, "grad_norm": 0.0002167491620639339, "learning_rate": 8.30530100916367e-06, "loss": 0.0, "step": 16300 }, { "epoch": 2.5562398872592516, "grad_norm": 0.000143597528222017, "learning_rate": 8.232803619069713e-06, "loss": 0.0001, "step": 16325 }, { "epoch": 2.5601544965812413, "grad_norm": 0.0002638675505295396, "learning_rate": 8.160306228975757e-06, "loss": 0.0, "step": 16350 }, { "epoch": 2.564069105903231, "grad_norm": 0.0011918079107999802, "learning_rate": 8.087808838881801e-06, "loss": 0.001, "step": 16375 }, { "epoch": 2.5679837152252203, "grad_norm": 0.00031527673127129674, "learning_rate": 8.015311448787843e-06, "loss": 0.0, "step": 16400 }, { "epoch": 2.5718983245472105, "grad_norm": 0.0002970160567201674, "learning_rate": 7.942814058693887e-06, "loss": 0.0, "step": 16425 }, { "epoch": 2.5758129338691997, "grad_norm": 0.00032033000024966896, "learning_rate": 7.870316668599931e-06, "loss": 0.0, "step": 16450 }, { "epoch": 2.5797275431911895, "grad_norm": 0.00021383754210546613, "learning_rate": 7.797819278505975e-06, "loss": 0.0, "step": 16475 }, { "epoch": 2.583642152513179, "grad_norm": 0.0025132743176072836, "learning_rate": 7.725321888412019e-06, "loss": 0.0016, "step": 16500 }, { "epoch": 2.587556761835169, "grad_norm": 0.0004337320278864354, "learning_rate": 7.652824498318061e-06, "loss": 0.0, "step": 16525 }, { "epoch": 2.5914713711571586, "grad_norm": 0.000609175069257617, "learning_rate": 7.580327108224104e-06, "loss": 0.0016, "step": 16550 }, { "epoch": 2.595385980479148, "grad_norm": 0.00021605034999083728, "learning_rate": 7.507829718130148e-06, "loss": 0.0, "step": 16575 }, { "epoch": 2.599300589801138, "grad_norm": 0.0006186183891259134, "learning_rate": 7.435332328036191e-06, "loss": 0.0, "step": 16600 }, { "epoch": 2.6032151991231274, "grad_norm": 0.0002476814261171967, "learning_rate": 7.362834937942234e-06, "loss": 0.0006, "step": 16625 }, { "epoch": 2.607129808445117, "grad_norm": 0.0007107394631020725, "learning_rate": 7.2903375478482775e-06, "loss": 0.0, "step": 16650 }, { "epoch": 2.611044417767107, "grad_norm": 0.00029517774237319827, "learning_rate": 7.217840157754321e-06, "loss": 0.0, "step": 16675 }, { "epoch": 2.6149590270890966, "grad_norm": 0.0001818942982936278, "learning_rate": 7.145342767660365e-06, "loss": 0.0, "step": 16700 }, { "epoch": 2.6188736364110863, "grad_norm": 0.0003430229553487152, "learning_rate": 7.072845377566408e-06, "loss": 0.0029, "step": 16725 }, { "epoch": 2.6227882457330756, "grad_norm": 0.01995168998837471, "learning_rate": 7.0003479874724515e-06, "loss": 0.004, "step": 16750 }, { "epoch": 2.6267028550550657, "grad_norm": 0.003800376318395138, "learning_rate": 6.927850597378495e-06, "loss": 0.0002, "step": 16775 }, { "epoch": 2.630617464377055, "grad_norm": 0.0005610916996374726, "learning_rate": 6.8553532072845385e-06, "loss": 0.0009, "step": 16800 }, { "epoch": 2.6345320736990447, "grad_norm": 0.0002944047737400979, "learning_rate": 6.782855817190581e-06, "loss": 0.0002, "step": 16825 }, { "epoch": 2.6384466830210345, "grad_norm": 0.0002466263249516487, "learning_rate": 6.710358427096625e-06, "loss": 0.0, "step": 16850 }, { "epoch": 2.642361292343024, "grad_norm": 0.0001500146317994222, "learning_rate": 6.637861037002669e-06, "loss": 0.0, "step": 16875 }, { "epoch": 2.646275901665014, "grad_norm": 0.0007112550083547831, "learning_rate": 6.565363646908712e-06, "loss": 0.0001, "step": 16900 }, { "epoch": 2.6501905109870036, "grad_norm": 0.00017001846572384238, "learning_rate": 6.492866256814755e-06, "loss": 0.0, "step": 16925 }, { "epoch": 2.6541051203089934, "grad_norm": 0.0007822296465747058, "learning_rate": 6.420368866720798e-06, "loss": 0.0, "step": 16950 }, { "epoch": 2.6580197296309827, "grad_norm": 0.0006506266072392464, "learning_rate": 6.347871476626842e-06, "loss": 0.0011, "step": 16975 }, { "epoch": 2.6619343389529724, "grad_norm": 0.001398293417878449, "learning_rate": 6.275374086532886e-06, "loss": 0.0009, "step": 17000 }, { "epoch": 2.665848948274962, "grad_norm": 0.01400019507855177, "learning_rate": 6.202876696438929e-06, "loss": 0.0007, "step": 17025 }, { "epoch": 2.669763557596952, "grad_norm": 0.001175655866973102, "learning_rate": 6.130379306344972e-06, "loss": 0.0008, "step": 17050 }, { "epoch": 2.6736781669189416, "grad_norm": 0.004671004135161638, "learning_rate": 6.057881916251015e-06, "loss": 0.0001, "step": 17075 }, { "epoch": 2.6775927762409313, "grad_norm": 0.03513360768556595, "learning_rate": 5.985384526157058e-06, "loss": 0.0001, "step": 17100 }, { "epoch": 2.681507385562921, "grad_norm": 0.0003586947568692267, "learning_rate": 5.912887136063102e-06, "loss": 0.0, "step": 17125 }, { "epoch": 2.6854219948849103, "grad_norm": 0.00038099908852018416, "learning_rate": 5.840389745969145e-06, "loss": 0.0, "step": 17150 }, { "epoch": 2.6893366042069, "grad_norm": 0.00031486572697758675, "learning_rate": 5.767892355875189e-06, "loss": 0.0001, "step": 17175 }, { "epoch": 2.6932512135288897, "grad_norm": 0.0003432184748817235, "learning_rate": 5.695394965781233e-06, "loss": 0.0001, "step": 17200 }, { "epoch": 2.6971658228508795, "grad_norm": 0.0004650696355383843, "learning_rate": 5.622897575687275e-06, "loss": 0.0001, "step": 17225 }, { "epoch": 2.701080432172869, "grad_norm": 0.0003299444215372205, "learning_rate": 5.550400185593319e-06, "loss": 0.0006, "step": 17250 }, { "epoch": 2.704995041494859, "grad_norm": 0.0003300994576420635, "learning_rate": 5.477902795499362e-06, "loss": 0.0, "step": 17275 }, { "epoch": 2.7089096508168486, "grad_norm": 0.0005288653774186969, "learning_rate": 5.405405405405406e-06, "loss": 0.001, "step": 17300 }, { "epoch": 2.712824260138838, "grad_norm": 0.00020216924895066768, "learning_rate": 5.332908015311449e-06, "loss": 0.0, "step": 17325 }, { "epoch": 2.7167388694608277, "grad_norm": 0.006004292517900467, "learning_rate": 5.260410625217493e-06, "loss": 0.0, "step": 17350 }, { "epoch": 2.7206534787828174, "grad_norm": 0.0001754688419168815, "learning_rate": 5.187913235123536e-06, "loss": 0.0, "step": 17375 }, { "epoch": 2.724568088104807, "grad_norm": 0.012972986325621605, "learning_rate": 5.115415845029579e-06, "loss": 0.0, "step": 17400 }, { "epoch": 2.728482697426797, "grad_norm": 0.00019060824706684798, "learning_rate": 5.042918454935622e-06, "loss": 0.0, "step": 17425 }, { "epoch": 2.7323973067487866, "grad_norm": 0.0002203083859058097, "learning_rate": 4.970421064841666e-06, "loss": 0.0001, "step": 17450 }, { "epoch": 2.7363119160707763, "grad_norm": 0.00014681309403385967, "learning_rate": 4.897923674747709e-06, "loss": 0.0, "step": 17475 }, { "epoch": 2.7402265253927656, "grad_norm": 0.0002081810962408781, "learning_rate": 4.825426284653753e-06, "loss": 0.0009, "step": 17500 }, { "epoch": 2.7441411347147557, "grad_norm": 0.0002039974497165531, "learning_rate": 4.752928894559796e-06, "loss": 0.0001, "step": 17525 }, { "epoch": 2.748055744036745, "grad_norm": 0.0001363355404464528, "learning_rate": 4.680431504465839e-06, "loss": 0.0, "step": 17550 }, { "epoch": 2.7519703533587347, "grad_norm": 0.0013490230776369572, "learning_rate": 4.6079341143718824e-06, "loss": 0.0004, "step": 17575 }, { "epoch": 2.7558849626807245, "grad_norm": 0.00017140705313067883, "learning_rate": 4.535436724277926e-06, "loss": 0.0005, "step": 17600 }, { "epoch": 2.759799572002714, "grad_norm": 0.00012424413580447435, "learning_rate": 4.4629393341839695e-06, "loss": 0.0, "step": 17625 }, { "epoch": 2.763714181324704, "grad_norm": 0.000487282668473199, "learning_rate": 4.390441944090013e-06, "loss": 0.0001, "step": 17650 }, { "epoch": 2.767628790646693, "grad_norm": 0.0003270190500188619, "learning_rate": 4.3179445539960565e-06, "loss": 0.0004, "step": 17675 }, { "epoch": 2.7715433999686834, "grad_norm": 0.00023671095550525934, "learning_rate": 4.2454471639020995e-06, "loss": 0.0, "step": 17700 }, { "epoch": 2.7754580092906727, "grad_norm": 0.0011756513267755508, "learning_rate": 4.172949773808143e-06, "loss": 0.0067, "step": 17725 }, { "epoch": 2.7793726186126624, "grad_norm": 0.0004914366290904582, "learning_rate": 4.1004523837141865e-06, "loss": 0.0005, "step": 17750 }, { "epoch": 2.783287227934652, "grad_norm": 0.0011064461432397366, "learning_rate": 4.02795499362023e-06, "loss": 0.0, "step": 17775 }, { "epoch": 2.787201837256642, "grad_norm": 0.0006191087886691093, "learning_rate": 3.9554576035262736e-06, "loss": 0.001, "step": 17800 }, { "epoch": 2.7911164465786316, "grad_norm": 0.005341960582882166, "learning_rate": 3.882960213432317e-06, "loss": 0.0001, "step": 17825 }, { "epoch": 2.795031055900621, "grad_norm": 0.004248717799782753, "learning_rate": 3.81046282333836e-06, "loss": 0.0003, "step": 17850 }, { "epoch": 2.798945665222611, "grad_norm": 0.02371644414961338, "learning_rate": 3.7379654332444032e-06, "loss": 0.0001, "step": 17875 }, { "epoch": 2.8028602745446003, "grad_norm": 0.0007812991389073431, "learning_rate": 3.6654680431504467e-06, "loss": 0.0004, "step": 17900 }, { "epoch": 2.80677488386659, "grad_norm": 0.00031172268791124225, "learning_rate": 3.59297065305649e-06, "loss": 0.0002, "step": 17925 }, { "epoch": 2.8106894931885797, "grad_norm": 0.012311534956097603, "learning_rate": 3.5204732629625337e-06, "loss": 0.0001, "step": 17950 }, { "epoch": 2.8146041025105695, "grad_norm": 0.0008552991203032434, "learning_rate": 3.4479758728685772e-06, "loss": 0.0003, "step": 17975 }, { "epoch": 2.818518711832559, "grad_norm": 0.011254767887294292, "learning_rate": 3.3754784827746203e-06, "loss": 0.0001, "step": 18000 }, { "epoch": 2.822433321154549, "grad_norm": 0.003312336513772607, "learning_rate": 3.302981092680664e-06, "loss": 0.005, "step": 18025 }, { "epoch": 2.8263479304765387, "grad_norm": 0.0033339662477374077, "learning_rate": 3.230483702586707e-06, "loss": 0.0001, "step": 18050 }, { "epoch": 2.830262539798528, "grad_norm": 0.0307988952845335, "learning_rate": 3.157986312492751e-06, "loss": 0.0001, "step": 18075 }, { "epoch": 2.8341771491205177, "grad_norm": 0.001144499285146594, "learning_rate": 3.085488922398794e-06, "loss": 0.0, "step": 18100 }, { "epoch": 2.8380917584425074, "grad_norm": 0.0007567739812657237, "learning_rate": 3.012991532304837e-06, "loss": 0.0001, "step": 18125 }, { "epoch": 2.842006367764497, "grad_norm": 0.0014737301971763372, "learning_rate": 2.9404941422108805e-06, "loss": 0.0001, "step": 18150 }, { "epoch": 2.845920977086487, "grad_norm": 0.0020669877994805574, "learning_rate": 2.867996752116924e-06, "loss": 0.0005, "step": 18175 }, { "epoch": 2.8498355864084766, "grad_norm": 0.0025932856369763613, "learning_rate": 2.795499362022967e-06, "loss": 0.0001, "step": 18200 }, { "epoch": 2.8537501957304663, "grad_norm": 0.0018630975391715765, "learning_rate": 2.7230019719290106e-06, "loss": 0.0, "step": 18225 }, { "epoch": 2.8576648050524556, "grad_norm": 0.011522402986884117, "learning_rate": 2.650504581835054e-06, "loss": 0.0021, "step": 18250 }, { "epoch": 2.8615794143744453, "grad_norm": 0.020472779870033264, "learning_rate": 2.578007191741097e-06, "loss": 0.0001, "step": 18275 }, { "epoch": 2.865494023696435, "grad_norm": 0.06026843190193176, "learning_rate": 2.5055098016471406e-06, "loss": 0.0001, "step": 18300 }, { "epoch": 2.8694086330184247, "grad_norm": 0.0006733342306688428, "learning_rate": 2.433012411553184e-06, "loss": 0.0, "step": 18325 }, { "epoch": 2.8733232423404145, "grad_norm": 0.0009708734578453004, "learning_rate": 2.3605150214592277e-06, "loss": 0.0001, "step": 18350 }, { "epoch": 2.877237851662404, "grad_norm": 0.00023784795484971255, "learning_rate": 2.288017631365271e-06, "loss": 0.0001, "step": 18375 }, { "epoch": 2.881152460984394, "grad_norm": 0.004968983121216297, "learning_rate": 2.2155202412713147e-06, "loss": 0.0001, "step": 18400 }, { "epoch": 2.885067070306383, "grad_norm": 0.0008029749151319265, "learning_rate": 2.1430228511773577e-06, "loss": 0.0, "step": 18425 }, { "epoch": 2.888981679628373, "grad_norm": 0.0008586676558479667, "learning_rate": 2.0705254610834012e-06, "loss": 0.0, "step": 18450 }, { "epoch": 2.8928962889503627, "grad_norm": 0.0015144862700253725, "learning_rate": 1.9980280709894447e-06, "loss": 0.0005, "step": 18475 }, { "epoch": 2.8968108982723524, "grad_norm": 0.003726179013028741, "learning_rate": 1.925530680895488e-06, "loss": 0.006, "step": 18500 }, { "epoch": 2.900725507594342, "grad_norm": 0.00036417951923795044, "learning_rate": 1.8530332908015313e-06, "loss": 0.0, "step": 18525 }, { "epoch": 2.904640116916332, "grad_norm": 0.0008425221894867718, "learning_rate": 1.7805359007075746e-06, "loss": 0.0006, "step": 18550 }, { "epoch": 2.9085547262383216, "grad_norm": 0.12718600034713745, "learning_rate": 1.7080385106136181e-06, "loss": 0.001, "step": 18575 }, { "epoch": 2.912469335560311, "grad_norm": 0.0010318702552467585, "learning_rate": 1.6355411205196614e-06, "loss": 0.0013, "step": 18600 }, { "epoch": 2.916383944882301, "grad_norm": 0.0004336585116107017, "learning_rate": 1.5630437304257047e-06, "loss": 0.0003, "step": 18625 }, { "epoch": 2.9202985542042903, "grad_norm": 0.000422166776843369, "learning_rate": 1.4905463403317482e-06, "loss": 0.0001, "step": 18650 }, { "epoch": 2.92421316352628, "grad_norm": 0.0006406558677554131, "learning_rate": 1.4180489502377915e-06, "loss": 0.0, "step": 18675 }, { "epoch": 2.9281277728482697, "grad_norm": 0.0005850115558132529, "learning_rate": 1.3455515601438348e-06, "loss": 0.0001, "step": 18700 }, { "epoch": 2.9320423821702595, "grad_norm": 0.0158847626298666, "learning_rate": 1.2730541700498783e-06, "loss": 0.0001, "step": 18725 }, { "epoch": 2.935956991492249, "grad_norm": 0.0014247479848563671, "learning_rate": 1.2005567799559216e-06, "loss": 0.0015, "step": 18750 }, { "epoch": 2.9398716008142385, "grad_norm": 0.0002501108101569116, "learning_rate": 1.128059389861965e-06, "loss": 0.0001, "step": 18775 }, { "epoch": 2.9437862101362287, "grad_norm": 0.0004493577580433339, "learning_rate": 1.0555619997680084e-06, "loss": 0.0002, "step": 18800 }, { "epoch": 2.947700819458218, "grad_norm": 0.00032207099138759077, "learning_rate": 9.830646096740517e-07, "loss": 0.0001, "step": 18825 }, { "epoch": 2.9516154287802077, "grad_norm": 0.002576634753495455, "learning_rate": 9.105672195800951e-07, "loss": 0.0001, "step": 18850 }, { "epoch": 2.9555300381021974, "grad_norm": 0.0007969861035235226, "learning_rate": 8.380698294861385e-07, "loss": 0.0, "step": 18875 }, { "epoch": 2.959444647424187, "grad_norm": 0.0008164517930708826, "learning_rate": 7.655724393921819e-07, "loss": 0.0, "step": 18900 }, { "epoch": 2.963359256746177, "grad_norm": 0.000589414150454104, "learning_rate": 6.930750492982253e-07, "loss": 0.0002, "step": 18925 }, { "epoch": 2.967273866068166, "grad_norm": 0.009339476004242897, "learning_rate": 6.205776592042687e-07, "loss": 0.0, "step": 18950 }, { "epoch": 2.9711884753901563, "grad_norm": 0.0037412915844470263, "learning_rate": 5.480802691103121e-07, "loss": 0.0, "step": 18975 }, { "epoch": 2.9751030847121456, "grad_norm": 0.00445817643776536, "learning_rate": 4.7558287901635545e-07, "loss": 0.0001, "step": 19000 }, { "epoch": 2.9790176940341353, "grad_norm": 0.001451736083254218, "learning_rate": 4.0308548892239885e-07, "loss": 0.001, "step": 19025 }, { "epoch": 2.982932303356125, "grad_norm": 0.000640546262729913, "learning_rate": 3.305880988284422e-07, "loss": 0.0001, "step": 19050 }, { "epoch": 2.9868469126781148, "grad_norm": 0.2126484215259552, "learning_rate": 2.580907087344856e-07, "loss": 0.0008, "step": 19075 }, { "epoch": 2.9907615220001045, "grad_norm": 0.0003385374147910625, "learning_rate": 1.8559331864052894e-07, "loss": 0.0, "step": 19100 }, { "epoch": 2.994676131322094, "grad_norm": 0.00034946645610034466, "learning_rate": 1.1309592854657233e-07, "loss": 0.0, "step": 19125 }, { "epoch": 2.998590740644084, "grad_norm": 0.0007842128979973495, "learning_rate": 4.0598538452615705e-08, "loss": 0.0016, "step": 19150 }, { "epoch": 2.9998434156271205, "eval_accuracy": 0.9999096727919246, "eval_f1": 0.9998709961870392, "eval_loss": 0.0005143894231878221, "eval_precision": 0.9998563016620019, "eval_recall": 0.999885691144003, "eval_runtime": 66.002, "eval_samples_per_second": 580.558, "eval_steps_per_second": 36.287, "step": 19158 } ], "logging_steps": 25, "max_steps": 19158, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 2 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.003595204150272e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }