PE-big3 / checkpoint-19158 /trainer_state.json
xenogy's picture
Upload folder using huggingface_hub
b2ee6a4 verified
Raw
History Blame
137 kB
{
"best_metric": 0.0005143894231878221,
"best_model_checkpoint": "PE-big3/checkpoint-19158",
"epoch": 2.9998434156271205,
"eval_steps": 500,
"global_step": 19158,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003914609321989665,
"grad_norm": 4.866889476776123,
"learning_rate": 6.524008350730689e-07,
"loss": 1.4,
"step": 25
},
{
"epoch": 0.00782921864397933,
"grad_norm": 3.255279302597046,
"learning_rate": 1.3048016701461379e-06,
"loss": 1.3054,
"step": 50
},
{
"epoch": 0.011743827965968996,
"grad_norm": 2.329948902130127,
"learning_rate": 1.957202505219207e-06,
"loss": 1.1451,
"step": 75
},
{
"epoch": 0.01565843728795866,
"grad_norm": 1.9639887809753418,
"learning_rate": 2.6096033402922757e-06,
"loss": 0.9271,
"step": 100
},
{
"epoch": 0.019573046609948328,
"grad_norm": 1.9199451208114624,
"learning_rate": 3.262004175365345e-06,
"loss": 0.6443,
"step": 125
},
{
"epoch": 0.02348765593193799,
"grad_norm": 1.4710872173309326,
"learning_rate": 3.914405010438414e-06,
"loss": 0.3856,
"step": 150
},
{
"epoch": 0.02740226525392766,
"grad_norm": 1.655380129814148,
"learning_rate": 4.5668058455114825e-06,
"loss": 0.2192,
"step": 175
},
{
"epoch": 0.03131687457591732,
"grad_norm": 1.475960373878479,
"learning_rate": 5.2192066805845514e-06,
"loss": 0.12,
"step": 200
},
{
"epoch": 0.035231483897906986,
"grad_norm": 0.9583467245101929,
"learning_rate": 5.87160751565762e-06,
"loss": 0.0688,
"step": 225
},
{
"epoch": 0.039146093219896656,
"grad_norm": 1.5871645212173462,
"learning_rate": 6.52400835073069e-06,
"loss": 0.0487,
"step": 250
},
{
"epoch": 0.04306070254188632,
"grad_norm": 1.3285017013549805,
"learning_rate": 7.176409185803757e-06,
"loss": 0.0331,
"step": 275
},
{
"epoch": 0.04697531186387598,
"grad_norm": 1.2705070972442627,
"learning_rate": 7.828810020876827e-06,
"loss": 0.0195,
"step": 300
},
{
"epoch": 0.050889921185865654,
"grad_norm": 0.44879260659217834,
"learning_rate": 8.481210855949897e-06,
"loss": 0.0145,
"step": 325
},
{
"epoch": 0.05480453050785532,
"grad_norm": 0.6279118657112122,
"learning_rate": 9.133611691022965e-06,
"loss": 0.0142,
"step": 350
},
{
"epoch": 0.05871913982984498,
"grad_norm": 0.15489937365055084,
"learning_rate": 9.786012526096033e-06,
"loss": 0.0159,
"step": 375
},
{
"epoch": 0.06263374915183464,
"grad_norm": 0.1354319453239441,
"learning_rate": 1.0438413361169103e-05,
"loss": 0.0144,
"step": 400
},
{
"epoch": 0.06654835847382432,
"grad_norm": 0.36055588722229004,
"learning_rate": 1.1090814196242173e-05,
"loss": 0.0078,
"step": 425
},
{
"epoch": 0.07046296779581397,
"grad_norm": 0.11695325374603271,
"learning_rate": 1.174321503131524e-05,
"loss": 0.0064,
"step": 450
},
{
"epoch": 0.07437757711780364,
"grad_norm": 1.524307370185852,
"learning_rate": 1.2395615866388309e-05,
"loss": 0.0069,
"step": 475
},
{
"epoch": 0.07829218643979331,
"grad_norm": 0.43202999234199524,
"learning_rate": 1.304801670146138e-05,
"loss": 0.0098,
"step": 500
},
{
"epoch": 0.08220679576178297,
"grad_norm": 0.20765632390975952,
"learning_rate": 1.3700417536534447e-05,
"loss": 0.0068,
"step": 525
},
{
"epoch": 0.08612140508377264,
"grad_norm": 0.020130537450313568,
"learning_rate": 1.4352818371607515e-05,
"loss": 0.0045,
"step": 550
},
{
"epoch": 0.09003601440576231,
"grad_norm": 1.2454266548156738,
"learning_rate": 1.5005219206680585e-05,
"loss": 0.0057,
"step": 575
},
{
"epoch": 0.09395062372775197,
"grad_norm": 1.0225284099578857,
"learning_rate": 1.5657620041753654e-05,
"loss": 0.0049,
"step": 600
},
{
"epoch": 0.09786523304974164,
"grad_norm": 0.755135178565979,
"learning_rate": 1.6310020876826724e-05,
"loss": 0.0031,
"step": 625
},
{
"epoch": 0.10177984237173131,
"grad_norm": 0.2760821282863617,
"learning_rate": 1.6962421711899794e-05,
"loss": 0.0071,
"step": 650
},
{
"epoch": 0.10569445169372096,
"grad_norm": 0.4344524145126343,
"learning_rate": 1.761482254697286e-05,
"loss": 0.0034,
"step": 675
},
{
"epoch": 0.10960906101571063,
"grad_norm": 0.020152989774942398,
"learning_rate": 1.826722338204593e-05,
"loss": 0.0025,
"step": 700
},
{
"epoch": 0.11352367033770029,
"grad_norm": 0.08340949565172195,
"learning_rate": 1.8919624217118996e-05,
"loss": 0.0078,
"step": 725
},
{
"epoch": 0.11743827965968996,
"grad_norm": 0.03789607062935829,
"learning_rate": 1.9572025052192066e-05,
"loss": 0.0011,
"step": 750
},
{
"epoch": 0.12135288898167963,
"grad_norm": 0.8983942866325378,
"learning_rate": 2.0224425887265136e-05,
"loss": 0.0083,
"step": 775
},
{
"epoch": 0.1252674983036693,
"grad_norm": 2.239677667617798,
"learning_rate": 2.0876826722338206e-05,
"loss": 0.0121,
"step": 800
},
{
"epoch": 0.12918210762565896,
"grad_norm": 0.017707696184515953,
"learning_rate": 2.1529227557411276e-05,
"loss": 0.0029,
"step": 825
},
{
"epoch": 0.13309671694764863,
"grad_norm": 0.45937007665634155,
"learning_rate": 2.2181628392484345e-05,
"loss": 0.0049,
"step": 850
},
{
"epoch": 0.1370113262696383,
"grad_norm": 0.9611666202545166,
"learning_rate": 2.2834029227557412e-05,
"loss": 0.0128,
"step": 875
},
{
"epoch": 0.14092593559162794,
"grad_norm": 0.22098630666732788,
"learning_rate": 2.348643006263048e-05,
"loss": 0.0139,
"step": 900
},
{
"epoch": 0.1448405449136176,
"grad_norm": 0.016266101971268654,
"learning_rate": 2.413883089770355e-05,
"loss": 0.0033,
"step": 925
},
{
"epoch": 0.14875515423560728,
"grad_norm": 0.03947868198156357,
"learning_rate": 2.4791231732776618e-05,
"loss": 0.0042,
"step": 950
},
{
"epoch": 0.15266976355759695,
"grad_norm": 0.20284026861190796,
"learning_rate": 2.544363256784969e-05,
"loss": 0.0037,
"step": 975
},
{
"epoch": 0.15658437287958663,
"grad_norm": 0.9516937732696533,
"learning_rate": 2.609603340292276e-05,
"loss": 0.013,
"step": 1000
},
{
"epoch": 0.1604989822015763,
"grad_norm": 0.35638949275016785,
"learning_rate": 2.6748434237995827e-05,
"loss": 0.0043,
"step": 1025
},
{
"epoch": 0.16441359152356594,
"grad_norm": 0.2974227964878082,
"learning_rate": 2.7400835073068893e-05,
"loss": 0.0058,
"step": 1050
},
{
"epoch": 0.1683282008455556,
"grad_norm": 0.10002760589122772,
"learning_rate": 2.8053235908141963e-05,
"loss": 0.0047,
"step": 1075
},
{
"epoch": 0.17224281016754528,
"grad_norm": 0.03456703945994377,
"learning_rate": 2.870563674321503e-05,
"loss": 0.0034,
"step": 1100
},
{
"epoch": 0.17615741948953495,
"grad_norm": 0.02147500589489937,
"learning_rate": 2.93580375782881e-05,
"loss": 0.0052,
"step": 1125
},
{
"epoch": 0.18007202881152462,
"grad_norm": 0.048098206520080566,
"learning_rate": 3.001043841336117e-05,
"loss": 0.0042,
"step": 1150
},
{
"epoch": 0.18398663813351426,
"grad_norm": 0.251152902841568,
"learning_rate": 3.0662839248434235e-05,
"loss": 0.0068,
"step": 1175
},
{
"epoch": 0.18790124745550393,
"grad_norm": 0.040291983634233475,
"learning_rate": 3.131524008350731e-05,
"loss": 0.0097,
"step": 1200
},
{
"epoch": 0.1918158567774936,
"grad_norm": 0.019989246502518654,
"learning_rate": 3.1967640918580375e-05,
"loss": 0.0053,
"step": 1225
},
{
"epoch": 0.19573046609948327,
"grad_norm": 0.016093524172902107,
"learning_rate": 3.262004175365345e-05,
"loss": 0.0022,
"step": 1250
},
{
"epoch": 0.19964507542147295,
"grad_norm": 0.008093849755823612,
"learning_rate": 3.3272442588726515e-05,
"loss": 0.0019,
"step": 1275
},
{
"epoch": 0.20355968474346262,
"grad_norm": 0.018408527597784996,
"learning_rate": 3.392484342379959e-05,
"loss": 0.0022,
"step": 1300
},
{
"epoch": 0.20747429406545226,
"grad_norm": 0.009230668656527996,
"learning_rate": 3.4577244258872654e-05,
"loss": 0.0007,
"step": 1325
},
{
"epoch": 0.21138890338744193,
"grad_norm": 0.061230212450027466,
"learning_rate": 3.522964509394572e-05,
"loss": 0.0031,
"step": 1350
},
{
"epoch": 0.2153035127094316,
"grad_norm": 0.20762716233730316,
"learning_rate": 3.5882045929018794e-05,
"loss": 0.0024,
"step": 1375
},
{
"epoch": 0.21921812203142127,
"grad_norm": 0.048180121928453445,
"learning_rate": 3.653444676409186e-05,
"loss": 0.0165,
"step": 1400
},
{
"epoch": 0.22313273135341094,
"grad_norm": 0.026987021788954735,
"learning_rate": 3.718684759916493e-05,
"loss": 0.0041,
"step": 1425
},
{
"epoch": 0.22704734067540058,
"grad_norm": 0.6532347202301025,
"learning_rate": 3.783924843423799e-05,
"loss": 0.0061,
"step": 1450
},
{
"epoch": 0.23096194999739025,
"grad_norm": 0.0236322320997715,
"learning_rate": 3.8491649269311066e-05,
"loss": 0.0018,
"step": 1475
},
{
"epoch": 0.23487655931937992,
"grad_norm": 0.3827228844165802,
"learning_rate": 3.914405010438413e-05,
"loss": 0.0029,
"step": 1500
},
{
"epoch": 0.2387911686413696,
"grad_norm": 0.00414466205984354,
"learning_rate": 3.9796450939457206e-05,
"loss": 0.0024,
"step": 1525
},
{
"epoch": 0.24270577796335926,
"grad_norm": 0.03536088764667511,
"learning_rate": 4.044885177453027e-05,
"loss": 0.0041,
"step": 1550
},
{
"epoch": 0.24662038728534894,
"grad_norm": 0.02724548988044262,
"learning_rate": 4.110125260960334e-05,
"loss": 0.0018,
"step": 1575
},
{
"epoch": 0.2505349966073386,
"grad_norm": 1.551004409790039,
"learning_rate": 4.175365344467641e-05,
"loss": 0.0089,
"step": 1600
},
{
"epoch": 0.25444960592932825,
"grad_norm": 0.28799840807914734,
"learning_rate": 4.240605427974948e-05,
"loss": 0.0075,
"step": 1625
},
{
"epoch": 0.2583642152513179,
"grad_norm": 0.009647930040955544,
"learning_rate": 4.305845511482255e-05,
"loss": 0.0049,
"step": 1650
},
{
"epoch": 0.2622788245733076,
"grad_norm": 0.006901186890900135,
"learning_rate": 4.371085594989562e-05,
"loss": 0.0027,
"step": 1675
},
{
"epoch": 0.26619343389529726,
"grad_norm": 0.05002870783209801,
"learning_rate": 4.436325678496869e-05,
"loss": 0.0064,
"step": 1700
},
{
"epoch": 0.27010804321728693,
"grad_norm": 0.1099412590265274,
"learning_rate": 4.501565762004176e-05,
"loss": 0.0035,
"step": 1725
},
{
"epoch": 0.2740226525392766,
"grad_norm": 0.43022432923316956,
"learning_rate": 4.5668058455114823e-05,
"loss": 0.005,
"step": 1750
},
{
"epoch": 0.27793726186126627,
"grad_norm": 0.0661238431930542,
"learning_rate": 4.6320459290187897e-05,
"loss": 0.0111,
"step": 1775
},
{
"epoch": 0.2818518711832559,
"grad_norm": 0.04808713495731354,
"learning_rate": 4.697286012526096e-05,
"loss": 0.0082,
"step": 1800
},
{
"epoch": 0.28576648050524556,
"grad_norm": 0.010018469765782356,
"learning_rate": 4.7625260960334036e-05,
"loss": 0.0063,
"step": 1825
},
{
"epoch": 0.2896810898272352,
"grad_norm": 0.02794954925775528,
"learning_rate": 4.82776617954071e-05,
"loss": 0.0023,
"step": 1850
},
{
"epoch": 0.2935956991492249,
"grad_norm": 0.08497870713472366,
"learning_rate": 4.893006263048017e-05,
"loss": 0.0053,
"step": 1875
},
{
"epoch": 0.29751030847121457,
"grad_norm": 0.012794610112905502,
"learning_rate": 4.9582463465553235e-05,
"loss": 0.0023,
"step": 1900
},
{
"epoch": 0.30142491779320424,
"grad_norm": 0.5463805794715881,
"learning_rate": 4.997390093956617e-05,
"loss": 0.0123,
"step": 1925
},
{
"epoch": 0.3053395271151939,
"grad_norm": 0.643292248249054,
"learning_rate": 4.990140354947222e-05,
"loss": 0.006,
"step": 1950
},
{
"epoch": 0.3092541364371836,
"grad_norm": 0.016200900077819824,
"learning_rate": 4.9828906159378265e-05,
"loss": 0.0038,
"step": 1975
},
{
"epoch": 0.31316874575917325,
"grad_norm": 0.01805788092315197,
"learning_rate": 4.975640876928431e-05,
"loss": 0.0005,
"step": 2000
},
{
"epoch": 0.3170833550811629,
"grad_norm": 0.09769612550735474,
"learning_rate": 4.968391137919035e-05,
"loss": 0.0017,
"step": 2025
},
{
"epoch": 0.3209979644031526,
"grad_norm": 0.7254369258880615,
"learning_rate": 4.96114139890964e-05,
"loss": 0.0029,
"step": 2050
},
{
"epoch": 0.3249125737251422,
"grad_norm": 0.21768023073673248,
"learning_rate": 4.953891659900244e-05,
"loss": 0.0051,
"step": 2075
},
{
"epoch": 0.3288271830471319,
"grad_norm": 0.0030887445900589228,
"learning_rate": 4.946641920890848e-05,
"loss": 0.0007,
"step": 2100
},
{
"epoch": 0.33274179236912155,
"grad_norm": 0.03302296623587608,
"learning_rate": 4.9393921818814525e-05,
"loss": 0.008,
"step": 2125
},
{
"epoch": 0.3366564016911112,
"grad_norm": 0.07119308412075043,
"learning_rate": 4.932142442872057e-05,
"loss": 0.0005,
"step": 2150
},
{
"epoch": 0.3405710110131009,
"grad_norm": 0.0021239419002085924,
"learning_rate": 4.9248927038626616e-05,
"loss": 0.0006,
"step": 2175
},
{
"epoch": 0.34448562033509056,
"grad_norm": 0.0006605405360460281,
"learning_rate": 4.9179329544136416e-05,
"loss": 0.001,
"step": 2200
},
{
"epoch": 0.34840022965708023,
"grad_norm": 0.35737213492393494,
"learning_rate": 4.910683215404246e-05,
"loss": 0.0031,
"step": 2225
},
{
"epoch": 0.3523148389790699,
"grad_norm": 0.003352939384058118,
"learning_rate": 4.90343347639485e-05,
"loss": 0.0026,
"step": 2250
},
{
"epoch": 0.35622944830105957,
"grad_norm": 0.011482371017336845,
"learning_rate": 4.896183737385454e-05,
"loss": 0.0056,
"step": 2275
},
{
"epoch": 0.36014405762304924,
"grad_norm": 0.31182751059532166,
"learning_rate": 4.8889339983760585e-05,
"loss": 0.003,
"step": 2300
},
{
"epoch": 0.3640586669450389,
"grad_norm": 0.019928403198719025,
"learning_rate": 4.881684259366663e-05,
"loss": 0.004,
"step": 2325
},
{
"epoch": 0.3679732762670285,
"grad_norm": 0.17220672965049744,
"learning_rate": 4.874434520357267e-05,
"loss": 0.0022,
"step": 2350
},
{
"epoch": 0.3718878855890182,
"grad_norm": 0.002172990469262004,
"learning_rate": 4.867184781347872e-05,
"loss": 0.0018,
"step": 2375
},
{
"epoch": 0.37580249491100787,
"grad_norm": 0.6102157831192017,
"learning_rate": 4.859935042338476e-05,
"loss": 0.0024,
"step": 2400
},
{
"epoch": 0.37971710423299754,
"grad_norm": 0.011678989976644516,
"learning_rate": 4.85268530332908e-05,
"loss": 0.0023,
"step": 2425
},
{
"epoch": 0.3836317135549872,
"grad_norm": 0.7285154461860657,
"learning_rate": 4.8454355643196845e-05,
"loss": 0.0051,
"step": 2450
},
{
"epoch": 0.3875463228769769,
"grad_norm": 0.004773287568241358,
"learning_rate": 4.838185825310289e-05,
"loss": 0.0018,
"step": 2475
},
{
"epoch": 0.39146093219896655,
"grad_norm": 0.00791076384484768,
"learning_rate": 4.8309360863008937e-05,
"loss": 0.004,
"step": 2500
},
{
"epoch": 0.3953755415209562,
"grad_norm": 0.8710932731628418,
"learning_rate": 4.823686347291498e-05,
"loss": 0.0042,
"step": 2525
},
{
"epoch": 0.3992901508429459,
"grad_norm": 0.04120909795165062,
"learning_rate": 4.816436608282102e-05,
"loss": 0.0024,
"step": 2550
},
{
"epoch": 0.40320476016493556,
"grad_norm": 1.0033127069473267,
"learning_rate": 4.809186869272706e-05,
"loss": 0.004,
"step": 2575
},
{
"epoch": 0.40711936948692523,
"grad_norm": 0.1285122036933899,
"learning_rate": 4.801937130263311e-05,
"loss": 0.009,
"step": 2600
},
{
"epoch": 0.41103397880891485,
"grad_norm": 0.8447295427322388,
"learning_rate": 4.7946873912539154e-05,
"loss": 0.0015,
"step": 2625
},
{
"epoch": 0.4149485881309045,
"grad_norm": 0.10731597989797592,
"learning_rate": 4.78743765224452e-05,
"loss": 0.0066,
"step": 2650
},
{
"epoch": 0.4188631974528942,
"grad_norm": 0.011971144936978817,
"learning_rate": 4.780187913235123e-05,
"loss": 0.0042,
"step": 2675
},
{
"epoch": 0.42277780677488386,
"grad_norm": 0.0017153106164187193,
"learning_rate": 4.772938174225728e-05,
"loss": 0.0013,
"step": 2700
},
{
"epoch": 0.42669241609687353,
"grad_norm": 0.0010528437560424209,
"learning_rate": 4.7656884352163323e-05,
"loss": 0.0006,
"step": 2725
},
{
"epoch": 0.4306070254188632,
"grad_norm": 0.0007753331447020173,
"learning_rate": 4.7584386962069366e-05,
"loss": 0.0002,
"step": 2750
},
{
"epoch": 0.43452163474085287,
"grad_norm": 0.0036313000600785017,
"learning_rate": 4.751188957197541e-05,
"loss": 0.004,
"step": 2775
},
{
"epoch": 0.43843624406284254,
"grad_norm": 0.10537869483232498,
"learning_rate": 4.743939218188146e-05,
"loss": 0.004,
"step": 2800
},
{
"epoch": 0.4423508533848322,
"grad_norm": 0.0017782174982130527,
"learning_rate": 4.73668947917875e-05,
"loss": 0.0011,
"step": 2825
},
{
"epoch": 0.4462654627068219,
"grad_norm": 0.02180619165301323,
"learning_rate": 4.729439740169354e-05,
"loss": 0.0003,
"step": 2850
},
{
"epoch": 0.45018007202881155,
"grad_norm": 0.0014395464677363634,
"learning_rate": 4.7221900011599584e-05,
"loss": 0.0011,
"step": 2875
},
{
"epoch": 0.45409468135080117,
"grad_norm": 0.041430070996284485,
"learning_rate": 4.7149402621505626e-05,
"loss": 0.0007,
"step": 2900
},
{
"epoch": 0.45800929067279084,
"grad_norm": 0.054793838411569595,
"learning_rate": 4.7076905231411675e-05,
"loss": 0.0025,
"step": 2925
},
{
"epoch": 0.4619238999947805,
"grad_norm": 0.08612020313739777,
"learning_rate": 4.700440784131772e-05,
"loss": 0.0068,
"step": 2950
},
{
"epoch": 0.4658385093167702,
"grad_norm": 1.2504163980484009,
"learning_rate": 4.693191045122376e-05,
"loss": 0.0075,
"step": 2975
},
{
"epoch": 0.46975311863875985,
"grad_norm": 0.8100822567939758,
"learning_rate": 4.68594130611298e-05,
"loss": 0.0024,
"step": 3000
},
{
"epoch": 0.4736677279607495,
"grad_norm": 0.7344357967376709,
"learning_rate": 4.6786915671035844e-05,
"loss": 0.0042,
"step": 3025
},
{
"epoch": 0.4775823372827392,
"grad_norm": 0.006882940419018269,
"learning_rate": 4.671441828094189e-05,
"loss": 0.0108,
"step": 3050
},
{
"epoch": 0.48149694660472886,
"grad_norm": 0.07418603450059891,
"learning_rate": 4.6641920890847935e-05,
"loss": 0.0015,
"step": 3075
},
{
"epoch": 0.48541155592671853,
"grad_norm": 0.023311011493206024,
"learning_rate": 4.656942350075397e-05,
"loss": 0.0011,
"step": 3100
},
{
"epoch": 0.4893261652487082,
"grad_norm": 0.22213295102119446,
"learning_rate": 4.649692611066002e-05,
"loss": 0.0002,
"step": 3125
},
{
"epoch": 0.49324077457069787,
"grad_norm": 0.028663238510489464,
"learning_rate": 4.642442872056606e-05,
"loss": 0.0034,
"step": 3150
},
{
"epoch": 0.4971553838926875,
"grad_norm": 0.010352909564971924,
"learning_rate": 4.6351931330472104e-05,
"loss": 0.0003,
"step": 3175
},
{
"epoch": 0.5010699932146772,
"grad_norm": 0.01622854731976986,
"learning_rate": 4.6279433940378146e-05,
"loss": 0.0035,
"step": 3200
},
{
"epoch": 0.5049846025366669,
"grad_norm": 0.0045238700695335865,
"learning_rate": 4.620693655028419e-05,
"loss": 0.0016,
"step": 3225
},
{
"epoch": 0.5088992118586565,
"grad_norm": 0.000869418028742075,
"learning_rate": 4.613443916019024e-05,
"loss": 0.0003,
"step": 3250
},
{
"epoch": 0.5128138211806462,
"grad_norm": 0.0070857820101082325,
"learning_rate": 4.606194177009628e-05,
"loss": 0.0013,
"step": 3275
},
{
"epoch": 0.5167284305026358,
"grad_norm": 0.019664961844682693,
"learning_rate": 4.598944438000232e-05,
"loss": 0.0014,
"step": 3300
},
{
"epoch": 0.5206430398246255,
"grad_norm": 0.002933235839009285,
"learning_rate": 4.5916946989908364e-05,
"loss": 0.0024,
"step": 3325
},
{
"epoch": 0.5245576491466152,
"grad_norm": 0.009601329453289509,
"learning_rate": 4.5844449599814406e-05,
"loss": 0.001,
"step": 3350
},
{
"epoch": 0.5284722584686048,
"grad_norm": 0.03231184929609299,
"learning_rate": 4.5771952209720455e-05,
"loss": 0.001,
"step": 3375
},
{
"epoch": 0.5323868677905945,
"grad_norm": 0.038716524839401245,
"learning_rate": 4.56994548196265e-05,
"loss": 0.0104,
"step": 3400
},
{
"epoch": 0.5363014771125841,
"grad_norm": 0.005376841872930527,
"learning_rate": 4.562695742953254e-05,
"loss": 0.0021,
"step": 3425
},
{
"epoch": 0.5402160864345739,
"grad_norm": 0.8506935834884644,
"learning_rate": 4.555446003943858e-05,
"loss": 0.0037,
"step": 3450
},
{
"epoch": 0.5441306957565635,
"grad_norm": 0.00393926864489913,
"learning_rate": 4.548196264934463e-05,
"loss": 0.0007,
"step": 3475
},
{
"epoch": 0.5480453050785532,
"grad_norm": 0.49948814511299133,
"learning_rate": 4.5409465259250666e-05,
"loss": 0.0017,
"step": 3500
},
{
"epoch": 0.5519599144005428,
"grad_norm": 0.008987600915133953,
"learning_rate": 4.533696786915671e-05,
"loss": 0.0017,
"step": 3525
},
{
"epoch": 0.5558745237225325,
"grad_norm": 0.06366792321205139,
"learning_rate": 4.526447047906275e-05,
"loss": 0.0016,
"step": 3550
},
{
"epoch": 0.5597891330445222,
"grad_norm": 0.9016256332397461,
"learning_rate": 4.51919730889688e-05,
"loss": 0.0023,
"step": 3575
},
{
"epoch": 0.5637037423665118,
"grad_norm": 0.010248661041259766,
"learning_rate": 4.511947569887484e-05,
"loss": 0.0037,
"step": 3600
},
{
"epoch": 0.5676183516885015,
"grad_norm": 0.007675408851355314,
"learning_rate": 4.5046978308780884e-05,
"loss": 0.0053,
"step": 3625
},
{
"epoch": 0.5715329610104911,
"grad_norm": 0.0017978112446144223,
"learning_rate": 4.497448091868693e-05,
"loss": 0.0006,
"step": 3650
},
{
"epoch": 0.5754475703324808,
"grad_norm": 1.0881074666976929,
"learning_rate": 4.4901983528592976e-05,
"loss": 0.0026,
"step": 3675
},
{
"epoch": 0.5793621796544705,
"grad_norm": 0.0023445766419172287,
"learning_rate": 4.4832386034102776e-05,
"loss": 0.0014,
"step": 3700
},
{
"epoch": 0.5832767889764602,
"grad_norm": 0.0032128174789249897,
"learning_rate": 4.475988864400882e-05,
"loss": 0.0013,
"step": 3725
},
{
"epoch": 0.5871913982984498,
"grad_norm": 0.07783033698797226,
"learning_rate": 4.468739125391486e-05,
"loss": 0.0061,
"step": 3750
},
{
"epoch": 0.5911060076204395,
"grad_norm": 0.018863795325160027,
"learning_rate": 4.46148938638209e-05,
"loss": 0.0022,
"step": 3775
},
{
"epoch": 0.5950206169424291,
"grad_norm": 0.004098298028111458,
"learning_rate": 4.454239647372695e-05,
"loss": 0.0022,
"step": 3800
},
{
"epoch": 0.5989352262644189,
"grad_norm": 0.0029339243192225695,
"learning_rate": 4.4469899083632994e-05,
"loss": 0.0023,
"step": 3825
},
{
"epoch": 0.6028498355864085,
"grad_norm": 0.0022904234938323498,
"learning_rate": 4.4397401693539036e-05,
"loss": 0.0013,
"step": 3850
},
{
"epoch": 0.6067644449083981,
"grad_norm": 0.001695298939011991,
"learning_rate": 4.432490430344508e-05,
"loss": 0.0003,
"step": 3875
},
{
"epoch": 0.6106790542303878,
"grad_norm": 0.3725820779800415,
"learning_rate": 4.425240691335112e-05,
"loss": 0.0012,
"step": 3900
},
{
"epoch": 0.6145936635523774,
"grad_norm": 0.000986219383776188,
"learning_rate": 4.417990952325716e-05,
"loss": 0.0007,
"step": 3925
},
{
"epoch": 0.6185082728743672,
"grad_norm": 0.016280701383948326,
"learning_rate": 4.4107412133163205e-05,
"loss": 0.0005,
"step": 3950
},
{
"epoch": 0.6224228821963568,
"grad_norm": 0.0007005013758316636,
"learning_rate": 4.403491474306925e-05,
"loss": 0.0008,
"step": 3975
},
{
"epoch": 0.6263374915183465,
"grad_norm": 0.0015142613556236029,
"learning_rate": 4.3962417352975296e-05,
"loss": 0.0022,
"step": 4000
},
{
"epoch": 0.6302521008403361,
"grad_norm": 0.02496866136789322,
"learning_rate": 4.388991996288134e-05,
"loss": 0.0081,
"step": 4025
},
{
"epoch": 0.6341667101623258,
"grad_norm": 0.10312812030315399,
"learning_rate": 4.381742257278738e-05,
"loss": 0.0028,
"step": 4050
},
{
"epoch": 0.6380813194843155,
"grad_norm": 0.005419147200882435,
"learning_rate": 4.374492518269342e-05,
"loss": 0.0005,
"step": 4075
},
{
"epoch": 0.6419959288063052,
"grad_norm": 0.0012350809993222356,
"learning_rate": 4.3672427792599465e-05,
"loss": 0.0005,
"step": 4100
},
{
"epoch": 0.6459105381282948,
"grad_norm": 0.0014117741957306862,
"learning_rate": 4.3599930402505514e-05,
"loss": 0.0004,
"step": 4125
},
{
"epoch": 0.6498251474502844,
"grad_norm": 0.011549504473805428,
"learning_rate": 4.3527433012411556e-05,
"loss": 0.0108,
"step": 4150
},
{
"epoch": 0.6537397567722741,
"grad_norm": 0.0015101751778274775,
"learning_rate": 4.34549356223176e-05,
"loss": 0.0023,
"step": 4175
},
{
"epoch": 0.6576543660942638,
"grad_norm": 0.831576406955719,
"learning_rate": 4.338243823222364e-05,
"loss": 0.003,
"step": 4200
},
{
"epoch": 0.6615689754162535,
"grad_norm": 0.003971900790929794,
"learning_rate": 4.330994084212969e-05,
"loss": 0.0009,
"step": 4225
},
{
"epoch": 0.6654835847382431,
"grad_norm": 0.00122584099881351,
"learning_rate": 4.323744345203573e-05,
"loss": 0.0004,
"step": 4250
},
{
"epoch": 0.6693981940602328,
"grad_norm": 1.1975153684616089,
"learning_rate": 4.3164946061941774e-05,
"loss": 0.008,
"step": 4275
},
{
"epoch": 0.6733128033822224,
"grad_norm": 0.007587050087749958,
"learning_rate": 4.309244867184782e-05,
"loss": 0.0008,
"step": 4300
},
{
"epoch": 0.6772274127042122,
"grad_norm": 1.974413514137268,
"learning_rate": 4.301995128175386e-05,
"loss": 0.0003,
"step": 4325
},
{
"epoch": 0.6811420220262018,
"grad_norm": 0.0011919812532141805,
"learning_rate": 4.29474538916599e-05,
"loss": 0.0055,
"step": 4350
},
{
"epoch": 0.6850566313481915,
"grad_norm": 0.0037530860863626003,
"learning_rate": 4.287495650156594e-05,
"loss": 0.0031,
"step": 4375
},
{
"epoch": 0.6889712406701811,
"grad_norm": 0.0055799526162445545,
"learning_rate": 4.2802459111471986e-05,
"loss": 0.0009,
"step": 4400
},
{
"epoch": 0.6928858499921707,
"grad_norm": 0.7918204069137573,
"learning_rate": 4.2729961721378035e-05,
"loss": 0.0011,
"step": 4425
},
{
"epoch": 0.6968004593141605,
"grad_norm": 0.021195508539676666,
"learning_rate": 4.265746433128408e-05,
"loss": 0.0014,
"step": 4450
},
{
"epoch": 0.7007150686361501,
"grad_norm": 0.0016733302036300302,
"learning_rate": 4.258496694119012e-05,
"loss": 0.0016,
"step": 4475
},
{
"epoch": 0.7046296779581398,
"grad_norm": 0.0015721771633252501,
"learning_rate": 4.251246955109616e-05,
"loss": 0.002,
"step": 4500
},
{
"epoch": 0.7085442872801294,
"grad_norm": 0.024684101343154907,
"learning_rate": 4.2439972161002204e-05,
"loss": 0.0021,
"step": 4525
},
{
"epoch": 0.7124588966021191,
"grad_norm": 0.0010000619804486632,
"learning_rate": 4.236747477090825e-05,
"loss": 0.001,
"step": 4550
},
{
"epoch": 0.7163735059241088,
"grad_norm": 0.0010993380565196276,
"learning_rate": 4.2294977380814295e-05,
"loss": 0.0024,
"step": 4575
},
{
"epoch": 0.7202881152460985,
"grad_norm": 0.01743653602898121,
"learning_rate": 4.222247999072034e-05,
"loss": 0.001,
"step": 4600
},
{
"epoch": 0.7242027245680881,
"grad_norm": 0.0034048547968268394,
"learning_rate": 4.214998260062638e-05,
"loss": 0.0012,
"step": 4625
},
{
"epoch": 0.7281173338900778,
"grad_norm": 0.006288307718932629,
"learning_rate": 4.207748521053242e-05,
"loss": 0.0016,
"step": 4650
},
{
"epoch": 0.7320319432120674,
"grad_norm": 0.09262362122535706,
"learning_rate": 4.200498782043847e-05,
"loss": 0.0062,
"step": 4675
},
{
"epoch": 0.735946552534057,
"grad_norm": 0.0012087648501619697,
"learning_rate": 4.193249043034451e-05,
"loss": 0.0003,
"step": 4700
},
{
"epoch": 0.7398611618560468,
"grad_norm": 2.551692247390747,
"learning_rate": 4.185999304025055e-05,
"loss": 0.0007,
"step": 4725
},
{
"epoch": 0.7437757711780364,
"grad_norm": 0.003155685495585203,
"learning_rate": 4.17874956501566e-05,
"loss": 0.0035,
"step": 4750
},
{
"epoch": 0.7476903805000261,
"grad_norm": 0.0007522006053477526,
"learning_rate": 4.171499826006264e-05,
"loss": 0.0003,
"step": 4775
},
{
"epoch": 0.7516049898220157,
"grad_norm": 0.1172158420085907,
"learning_rate": 4.164250086996868e-05,
"loss": 0.0022,
"step": 4800
},
{
"epoch": 0.7555195991440055,
"grad_norm": 0.0018555809510871768,
"learning_rate": 4.1570003479874724e-05,
"loss": 0.0008,
"step": 4825
},
{
"epoch": 0.7594342084659951,
"grad_norm": 0.014069788157939911,
"learning_rate": 4.1497506089780766e-05,
"loss": 0.0027,
"step": 4850
},
{
"epoch": 0.7633488177879848,
"grad_norm": 0.0070347595028579235,
"learning_rate": 4.1425008699686815e-05,
"loss": 0.0044,
"step": 4875
},
{
"epoch": 0.7672634271099744,
"grad_norm": 0.005139984656125307,
"learning_rate": 4.135251130959286e-05,
"loss": 0.0013,
"step": 4900
},
{
"epoch": 0.7711780364319641,
"grad_norm": 0.03146003186702728,
"learning_rate": 4.12800139194989e-05,
"loss": 0.0008,
"step": 4925
},
{
"epoch": 0.7750926457539538,
"grad_norm": 0.0008966127061285079,
"learning_rate": 4.120751652940494e-05,
"loss": 0.0013,
"step": 4950
},
{
"epoch": 0.7790072550759434,
"grad_norm": 0.010651414282619953,
"learning_rate": 4.1135019139310984e-05,
"loss": 0.0007,
"step": 4975
},
{
"epoch": 0.7829218643979331,
"grad_norm": 0.05222166329622269,
"learning_rate": 4.106252174921703e-05,
"loss": 0.0025,
"step": 5000
},
{
"epoch": 0.7868364737199227,
"grad_norm": 0.008781126700341702,
"learning_rate": 4.0990024359123075e-05,
"loss": 0.0006,
"step": 5025
},
{
"epoch": 0.7907510830419124,
"grad_norm": 0.0023096187505871058,
"learning_rate": 4.091752696902912e-05,
"loss": 0.0003,
"step": 5050
},
{
"epoch": 0.794665692363902,
"grad_norm": 0.000690230808686465,
"learning_rate": 4.084502957893516e-05,
"loss": 0.001,
"step": 5075
},
{
"epoch": 0.7985803016858918,
"grad_norm": 0.0017941935220733285,
"learning_rate": 4.077253218884121e-05,
"loss": 0.0018,
"step": 5100
},
{
"epoch": 0.8024949110078814,
"grad_norm": 0.001472643343731761,
"learning_rate": 4.070003479874725e-05,
"loss": 0.0011,
"step": 5125
},
{
"epoch": 0.8064095203298711,
"grad_norm": 0.050277333706617355,
"learning_rate": 4.0627537408653286e-05,
"loss": 0.0104,
"step": 5150
},
{
"epoch": 0.8103241296518607,
"grad_norm": 0.020627155900001526,
"learning_rate": 4.055504001855933e-05,
"loss": 0.0019,
"step": 5175
},
{
"epoch": 0.8142387389738505,
"grad_norm": 1.6748356819152832,
"learning_rate": 4.048254262846538e-05,
"loss": 0.0013,
"step": 5200
},
{
"epoch": 0.8181533482958401,
"grad_norm": 0.0005242697079665959,
"learning_rate": 4.041004523837142e-05,
"loss": 0.0003,
"step": 5225
},
{
"epoch": 0.8220679576178297,
"grad_norm": 0.0004012222634628415,
"learning_rate": 4.033754784827746e-05,
"loss": 0.0011,
"step": 5250
},
{
"epoch": 0.8259825669398194,
"grad_norm": 0.0007638943498022854,
"learning_rate": 4.0265050458183504e-05,
"loss": 0.0008,
"step": 5275
},
{
"epoch": 0.829897176261809,
"grad_norm": 0.000370625639334321,
"learning_rate": 4.019255306808955e-05,
"loss": 0.0002,
"step": 5300
},
{
"epoch": 0.8338117855837988,
"grad_norm": 0.17966459691524506,
"learning_rate": 4.0120055677995596e-05,
"loss": 0.0061,
"step": 5325
},
{
"epoch": 0.8377263949057884,
"grad_norm": 0.5298845171928406,
"learning_rate": 4.004755828790164e-05,
"loss": 0.0021,
"step": 5350
},
{
"epoch": 0.8416410042277781,
"grad_norm": 0.010731186717748642,
"learning_rate": 3.997506089780768e-05,
"loss": 0.0012,
"step": 5375
},
{
"epoch": 0.8455556135497677,
"grad_norm": 0.0006224720855243504,
"learning_rate": 3.990256350771372e-05,
"loss": 0.0014,
"step": 5400
},
{
"epoch": 0.8494702228717574,
"grad_norm": 0.00034521459019742906,
"learning_rate": 3.983006611761977e-05,
"loss": 0.0005,
"step": 5425
},
{
"epoch": 0.8533848321937471,
"grad_norm": 0.07561736553907394,
"learning_rate": 3.9757568727525814e-05,
"loss": 0.002,
"step": 5450
},
{
"epoch": 0.8572994415157368,
"grad_norm": 0.010748780332505703,
"learning_rate": 3.9685071337431856e-05,
"loss": 0.0025,
"step": 5475
},
{
"epoch": 0.8612140508377264,
"grad_norm": 0.03456795960664749,
"learning_rate": 3.96125739473379e-05,
"loss": 0.0079,
"step": 5500
},
{
"epoch": 0.865128660159716,
"grad_norm": 0.013776997104287148,
"learning_rate": 3.954007655724394e-05,
"loss": 0.0015,
"step": 5525
},
{
"epoch": 0.8690432694817057,
"grad_norm": 0.013151598162949085,
"learning_rate": 3.946757916714999e-05,
"loss": 0.0027,
"step": 5550
},
{
"epoch": 0.8729578788036954,
"grad_norm": 0.005265055689960718,
"learning_rate": 3.9395081777056025e-05,
"loss": 0.0045,
"step": 5575
},
{
"epoch": 0.8768724881256851,
"grad_norm": 0.0019183550029993057,
"learning_rate": 3.932258438696207e-05,
"loss": 0.0005,
"step": 5600
},
{
"epoch": 0.8807870974476747,
"grad_norm": 0.42332738637924194,
"learning_rate": 3.9250086996868116e-05,
"loss": 0.0037,
"step": 5625
},
{
"epoch": 0.8847017067696644,
"grad_norm": 0.00447813980281353,
"learning_rate": 3.917758960677416e-05,
"loss": 0.0014,
"step": 5650
},
{
"epoch": 0.888616316091654,
"grad_norm": 0.0005977645632810891,
"learning_rate": 3.91050922166802e-05,
"loss": 0.0005,
"step": 5675
},
{
"epoch": 0.8925309254136438,
"grad_norm": 0.9014317989349365,
"learning_rate": 3.903259482658624e-05,
"loss": 0.0012,
"step": 5700
},
{
"epoch": 0.8964455347356334,
"grad_norm": 1.6808857917785645,
"learning_rate": 3.8960097436492285e-05,
"loss": 0.0033,
"step": 5725
},
{
"epoch": 0.9003601440576231,
"grad_norm": 0.002373639028519392,
"learning_rate": 3.8887600046398334e-05,
"loss": 0.0136,
"step": 5750
},
{
"epoch": 0.9042747533796127,
"grad_norm": 0.0012994492426514626,
"learning_rate": 3.8815102656304376e-05,
"loss": 0.0001,
"step": 5775
},
{
"epoch": 0.9081893627016023,
"grad_norm": 0.0006246384000405669,
"learning_rate": 3.874260526621042e-05,
"loss": 0.0001,
"step": 5800
},
{
"epoch": 0.9121039720235921,
"grad_norm": 0.0005325423553586006,
"learning_rate": 3.867010787611646e-05,
"loss": 0.0005,
"step": 5825
},
{
"epoch": 0.9160185813455817,
"grad_norm": 0.0009510382078588009,
"learning_rate": 3.859761048602251e-05,
"loss": 0.0032,
"step": 5850
},
{
"epoch": 0.9199331906675714,
"grad_norm": 0.012179987505078316,
"learning_rate": 3.852511309592855e-05,
"loss": 0.0015,
"step": 5875
},
{
"epoch": 0.923847799989561,
"grad_norm": 0.0014047386357560754,
"learning_rate": 3.8452615705834594e-05,
"loss": 0.0006,
"step": 5900
},
{
"epoch": 0.9277624093115507,
"grad_norm": 0.13963516056537628,
"learning_rate": 3.8380118315740636e-05,
"loss": 0.0012,
"step": 5925
},
{
"epoch": 0.9316770186335404,
"grad_norm": 0.7947016954421997,
"learning_rate": 3.830762092564668e-05,
"loss": 0.0014,
"step": 5950
},
{
"epoch": 0.9355916279555301,
"grad_norm": 0.001768257119692862,
"learning_rate": 3.823512353555272e-05,
"loss": 0.0004,
"step": 5975
},
{
"epoch": 0.9395062372775197,
"grad_norm": 0.0007245225715450943,
"learning_rate": 3.816262614545876e-05,
"loss": 0.0001,
"step": 6000
},
{
"epoch": 0.9434208465995094,
"grad_norm": 0.016255930066108704,
"learning_rate": 3.8090128755364805e-05,
"loss": 0.0,
"step": 6025
},
{
"epoch": 0.947335455921499,
"grad_norm": 0.00034742074785754085,
"learning_rate": 3.801763136527085e-05,
"loss": 0.0003,
"step": 6050
},
{
"epoch": 0.9512500652434887,
"grad_norm": 0.0013885988155379891,
"learning_rate": 3.7945133975176896e-05,
"loss": 0.0032,
"step": 6075
},
{
"epoch": 0.9551646745654784,
"grad_norm": 0.8642656207084656,
"learning_rate": 3.787263658508294e-05,
"loss": 0.0019,
"step": 6100
},
{
"epoch": 0.959079283887468,
"grad_norm": 0.002853901358321309,
"learning_rate": 3.780013919498898e-05,
"loss": 0.0031,
"step": 6125
},
{
"epoch": 0.9629938932094577,
"grad_norm": 0.6826348304748535,
"learning_rate": 3.772764180489502e-05,
"loss": 0.0029,
"step": 6150
},
{
"epoch": 0.9669085025314473,
"grad_norm": 0.01645534299314022,
"learning_rate": 3.765514441480107e-05,
"loss": 0.0003,
"step": 6175
},
{
"epoch": 0.9708231118534371,
"grad_norm": 0.001097380998544395,
"learning_rate": 3.7582647024707114e-05,
"loss": 0.0011,
"step": 6200
},
{
"epoch": 0.9747377211754267,
"grad_norm": 0.001092984457500279,
"learning_rate": 3.7513049530216915e-05,
"loss": 0.0006,
"step": 6225
},
{
"epoch": 0.9786523304974164,
"grad_norm": 0.001488927286118269,
"learning_rate": 3.744055214012296e-05,
"loss": 0.0018,
"step": 6250
},
{
"epoch": 0.982566939819406,
"grad_norm": 0.0012959851883351803,
"learning_rate": 3.7368054750029e-05,
"loss": 0.0011,
"step": 6275
},
{
"epoch": 0.9864815491413957,
"grad_norm": 0.002524161711335182,
"learning_rate": 3.729555735993505e-05,
"loss": 0.0039,
"step": 6300
},
{
"epoch": 0.9903961584633854,
"grad_norm": 0.0023267928045243025,
"learning_rate": 3.722305996984109e-05,
"loss": 0.0011,
"step": 6325
},
{
"epoch": 0.994310767785375,
"grad_norm": 0.0007459365879185498,
"learning_rate": 3.715056257974713e-05,
"loss": 0.0003,
"step": 6350
},
{
"epoch": 0.9982253771073647,
"grad_norm": 0.004343962296843529,
"learning_rate": 3.7078065189653175e-05,
"loss": 0.0018,
"step": 6375
},
{
"epoch": 0.9999478052090401,
"eval_accuracy": 0.9997799951169648,
"eval_f1": 0.9997142385928128,
"eval_loss": 0.0011581754079088569,
"eval_precision": 0.9996766935217872,
"eval_recall": 0.9997517864841209,
"eval_runtime": 62.9623,
"eval_samples_per_second": 608.586,
"eval_steps_per_second": 38.039,
"step": 6386
},
{
"epoch": 1.0021399864293543,
"grad_norm": 0.007820851169526577,
"learning_rate": 3.700556779955922e-05,
"loss": 0.0006,
"step": 6400
},
{
"epoch": 1.006054595751344,
"grad_norm": 0.00049219821812585,
"learning_rate": 3.693307040946526e-05,
"loss": 0.0005,
"step": 6425
},
{
"epoch": 1.0099692050733338,
"grad_norm": 0.0008093062788248062,
"learning_rate": 3.68605730193713e-05,
"loss": 0.0008,
"step": 6450
},
{
"epoch": 1.0138838143953233,
"grad_norm": 0.3265334963798523,
"learning_rate": 3.6788075629277344e-05,
"loss": 0.0001,
"step": 6475
},
{
"epoch": 1.017798423717313,
"grad_norm": 0.015381108038127422,
"learning_rate": 3.671557823918339e-05,
"loss": 0.0,
"step": 6500
},
{
"epoch": 1.0217130330393027,
"grad_norm": 0.00040746491868048906,
"learning_rate": 3.6643080849089435e-05,
"loss": 0.0001,
"step": 6525
},
{
"epoch": 1.0256276423612924,
"grad_norm": 2.2102978229522705,
"learning_rate": 3.657058345899548e-05,
"loss": 0.0022,
"step": 6550
},
{
"epoch": 1.029542251683282,
"grad_norm": 0.0007900640484876931,
"learning_rate": 3.649808606890152e-05,
"loss": 0.0034,
"step": 6575
},
{
"epoch": 1.0334568610052717,
"grad_norm": 0.07358774542808533,
"learning_rate": 3.642558867880756e-05,
"loss": 0.0004,
"step": 6600
},
{
"epoch": 1.0373714703272614,
"grad_norm": 0.0004924671957269311,
"learning_rate": 3.635309128871361e-05,
"loss": 0.0008,
"step": 6625
},
{
"epoch": 1.041286079649251,
"grad_norm": 0.0007265584426932037,
"learning_rate": 3.628059389861965e-05,
"loss": 0.0005,
"step": 6650
},
{
"epoch": 1.0452006889712406,
"grad_norm": 0.006537444423884153,
"learning_rate": 3.6208096508525695e-05,
"loss": 0.0031,
"step": 6675
},
{
"epoch": 1.0491152982932304,
"grad_norm": 0.02974896878004074,
"learning_rate": 3.613559911843174e-05,
"loss": 0.0003,
"step": 6700
},
{
"epoch": 1.05302990761522,
"grad_norm": 0.0008949014008976519,
"learning_rate": 3.6063101728337786e-05,
"loss": 0.0001,
"step": 6725
},
{
"epoch": 1.0569445169372096,
"grad_norm": 5.2669758796691895,
"learning_rate": 3.599060433824383e-05,
"loss": 0.0038,
"step": 6750
},
{
"epoch": 1.0608591262591993,
"grad_norm": 0.0008383537060581148,
"learning_rate": 3.591810694814987e-05,
"loss": 0.0014,
"step": 6775
},
{
"epoch": 1.064773735581189,
"grad_norm": 0.03583945333957672,
"learning_rate": 3.5845609558055906e-05,
"loss": 0.002,
"step": 6800
},
{
"epoch": 1.0686883449031788,
"grad_norm": 0.0004048035480082035,
"learning_rate": 3.5773112167961955e-05,
"loss": 0.0002,
"step": 6825
},
{
"epoch": 1.0726029542251683,
"grad_norm": 0.0006589085678569973,
"learning_rate": 3.5700614777868e-05,
"loss": 0.0,
"step": 6850
},
{
"epoch": 1.076517563547158,
"grad_norm": 0.01224551722407341,
"learning_rate": 3.562811738777404e-05,
"loss": 0.0028,
"step": 6875
},
{
"epoch": 1.0804321728691477,
"grad_norm": 1.1463470458984375,
"learning_rate": 3.555561999768008e-05,
"loss": 0.0029,
"step": 6900
},
{
"epoch": 1.0843467821911372,
"grad_norm": 0.002099130768328905,
"learning_rate": 3.548312260758613e-05,
"loss": 0.0016,
"step": 6925
},
{
"epoch": 1.088261391513127,
"grad_norm": 0.4577861428260803,
"learning_rate": 3.541062521749217e-05,
"loss": 0.0009,
"step": 6950
},
{
"epoch": 1.0921760008351167,
"grad_norm": 0.08768904209136963,
"learning_rate": 3.5338127827398216e-05,
"loss": 0.0017,
"step": 6975
},
{
"epoch": 1.0960906101571064,
"grad_norm": 0.002661600476130843,
"learning_rate": 3.526563043730426e-05,
"loss": 0.0002,
"step": 7000
},
{
"epoch": 1.100005219479096,
"grad_norm": 0.0006299412925727665,
"learning_rate": 3.51931330472103e-05,
"loss": 0.0,
"step": 7025
},
{
"epoch": 1.1039198288010856,
"grad_norm": 0.1650131493806839,
"learning_rate": 3.512063565711635e-05,
"loss": 0.0002,
"step": 7050
},
{
"epoch": 1.1078344381230754,
"grad_norm": 0.37610143423080444,
"learning_rate": 3.504813826702239e-05,
"loss": 0.0009,
"step": 7075
},
{
"epoch": 1.111749047445065,
"grad_norm": 0.029113056138157845,
"learning_rate": 3.4975640876928433e-05,
"loss": 0.0012,
"step": 7100
},
{
"epoch": 1.1156636567670546,
"grad_norm": 0.004116399679332972,
"learning_rate": 3.4903143486834476e-05,
"loss": 0.0023,
"step": 7125
},
{
"epoch": 1.1195782660890443,
"grad_norm": 0.015721509233117104,
"learning_rate": 3.483064609674052e-05,
"loss": 0.0097,
"step": 7150
},
{
"epoch": 1.123492875411034,
"grad_norm": 0.01373900007456541,
"learning_rate": 3.475814870664657e-05,
"loss": 0.0064,
"step": 7175
},
{
"epoch": 1.1274074847330238,
"grad_norm": 0.0016198121011257172,
"learning_rate": 3.46856513165526e-05,
"loss": 0.0008,
"step": 7200
},
{
"epoch": 1.1313220940550133,
"grad_norm": 0.0009071112144738436,
"learning_rate": 3.4613153926458645e-05,
"loss": 0.0001,
"step": 7225
},
{
"epoch": 1.135236703377003,
"grad_norm": 0.0006360800471156836,
"learning_rate": 3.4540656536364694e-05,
"loss": 0.0007,
"step": 7250
},
{
"epoch": 1.1391513126989927,
"grad_norm": 0.0013603122206404805,
"learning_rate": 3.4468159146270736e-05,
"loss": 0.0003,
"step": 7275
},
{
"epoch": 1.1430659220209822,
"grad_norm": 0.2531895339488983,
"learning_rate": 3.439566175617678e-05,
"loss": 0.0019,
"step": 7300
},
{
"epoch": 1.146980531342972,
"grad_norm": 0.08225157856941223,
"learning_rate": 3.432316436608282e-05,
"loss": 0.001,
"step": 7325
},
{
"epoch": 1.1508951406649617,
"grad_norm": 0.0010974809993058443,
"learning_rate": 3.425066697598886e-05,
"loss": 0.0009,
"step": 7350
},
{
"epoch": 1.1548097499869514,
"grad_norm": 0.007975243031978607,
"learning_rate": 3.417816958589491e-05,
"loss": 0.0001,
"step": 7375
},
{
"epoch": 1.158724359308941,
"grad_norm": 0.0005916508380323648,
"learning_rate": 3.4105672195800954e-05,
"loss": 0.0001,
"step": 7400
},
{
"epoch": 1.1626389686309306,
"grad_norm": 0.003101816400885582,
"learning_rate": 3.4033174805706996e-05,
"loss": 0.0,
"step": 7425
},
{
"epoch": 1.1665535779529204,
"grad_norm": 0.00036523715243674815,
"learning_rate": 3.396067741561304e-05,
"loss": 0.0001,
"step": 7450
},
{
"epoch": 1.1704681872749099,
"grad_norm": 0.02329937182366848,
"learning_rate": 3.388818002551909e-05,
"loss": 0.0013,
"step": 7475
},
{
"epoch": 1.1743827965968996,
"grad_norm": 0.5784549117088318,
"learning_rate": 3.381568263542513e-05,
"loss": 0.0038,
"step": 7500
},
{
"epoch": 1.1782974059188893,
"grad_norm": 0.0012637526961043477,
"learning_rate": 3.374318524533117e-05,
"loss": 0.0015,
"step": 7525
},
{
"epoch": 1.182212015240879,
"grad_norm": 0.019489184021949768,
"learning_rate": 3.3670687855237214e-05,
"loss": 0.0002,
"step": 7550
},
{
"epoch": 1.1861266245628685,
"grad_norm": 0.0006683494430035353,
"learning_rate": 3.3598190465143256e-05,
"loss": 0.0008,
"step": 7575
},
{
"epoch": 1.1900412338848583,
"grad_norm": 0.027937965467572212,
"learning_rate": 3.3525693075049305e-05,
"loss": 0.0008,
"step": 7600
},
{
"epoch": 1.193955843206848,
"grad_norm": 0.00035219776327721775,
"learning_rate": 3.345319568495534e-05,
"loss": 0.0002,
"step": 7625
},
{
"epoch": 1.1978704525288375,
"grad_norm": 0.0009345108992420137,
"learning_rate": 3.338069829486138e-05,
"loss": 0.0032,
"step": 7650
},
{
"epoch": 1.2017850618508272,
"grad_norm": 0.05174746736884117,
"learning_rate": 3.3308200904767425e-05,
"loss": 0.0028,
"step": 7675
},
{
"epoch": 1.205699671172817,
"grad_norm": 0.1187373697757721,
"learning_rate": 3.3235703514673474e-05,
"loss": 0.0006,
"step": 7700
},
{
"epoch": 1.2096142804948067,
"grad_norm": 0.0881095826625824,
"learning_rate": 3.3163206124579516e-05,
"loss": 0.0018,
"step": 7725
},
{
"epoch": 1.2135288898167964,
"grad_norm": 1.4924030303955078,
"learning_rate": 3.309070873448556e-05,
"loss": 0.0006,
"step": 7750
},
{
"epoch": 1.217443499138786,
"grad_norm": 0.10360655933618546,
"learning_rate": 3.30182113443916e-05,
"loss": 0.0009,
"step": 7775
},
{
"epoch": 1.2213581084607756,
"grad_norm": 0.0007201316766440868,
"learning_rate": 3.294571395429765e-05,
"loss": 0.0003,
"step": 7800
},
{
"epoch": 1.2252727177827654,
"grad_norm": 0.001118672196753323,
"learning_rate": 3.287321656420369e-05,
"loss": 0.0007,
"step": 7825
},
{
"epoch": 1.2291873271047549,
"grad_norm": 0.008757601492106915,
"learning_rate": 3.2800719174109734e-05,
"loss": 0.0038,
"step": 7850
},
{
"epoch": 1.2331019364267446,
"grad_norm": 1.405776023864746,
"learning_rate": 3.2728221784015777e-05,
"loss": 0.0021,
"step": 7875
},
{
"epoch": 1.2370165457487343,
"grad_norm": 0.06606610119342804,
"learning_rate": 3.265572439392182e-05,
"loss": 0.0001,
"step": 7900
},
{
"epoch": 1.240931155070724,
"grad_norm": 0.00046704983105883,
"learning_rate": 3.258322700382787e-05,
"loss": 0.0001,
"step": 7925
},
{
"epoch": 1.2448457643927135,
"grad_norm": 0.0005030676256865263,
"learning_rate": 3.251072961373391e-05,
"loss": 0.0009,
"step": 7950
},
{
"epoch": 1.2487603737147033,
"grad_norm": 0.004642080515623093,
"learning_rate": 3.243823222363995e-05,
"loss": 0.0005,
"step": 7975
},
{
"epoch": 1.252674983036693,
"grad_norm": 0.00609723711386323,
"learning_rate": 3.2365734833545994e-05,
"loss": 0.0018,
"step": 8000
},
{
"epoch": 1.2565895923586825,
"grad_norm": 0.003095820778980851,
"learning_rate": 3.229323744345204e-05,
"loss": 0.0003,
"step": 8025
},
{
"epoch": 1.2605042016806722,
"grad_norm": 0.08622787892818451,
"learning_rate": 3.222074005335808e-05,
"loss": 0.0001,
"step": 8050
},
{
"epoch": 1.264418811002662,
"grad_norm": 0.022611690685153008,
"learning_rate": 3.214824266326412e-05,
"loss": 0.0009,
"step": 8075
},
{
"epoch": 1.2683334203246517,
"grad_norm": 0.0005983790615573525,
"learning_rate": 3.2075745273170163e-05,
"loss": 0.0012,
"step": 8100
},
{
"epoch": 1.2722480296466412,
"grad_norm": 0.0008185420883819461,
"learning_rate": 3.200324788307621e-05,
"loss": 0.0008,
"step": 8125
},
{
"epoch": 1.276162638968631,
"grad_norm": 0.0019505377858877182,
"learning_rate": 3.1930750492982255e-05,
"loss": 0.0003,
"step": 8150
},
{
"epoch": 1.2800772482906206,
"grad_norm": 0.005252277944236994,
"learning_rate": 3.18582531028883e-05,
"loss": 0.0047,
"step": 8175
},
{
"epoch": 1.2839918576126101,
"grad_norm": 0.0010310772340744734,
"learning_rate": 3.178575571279434e-05,
"loss": 0.0003,
"step": 8200
},
{
"epoch": 1.2879064669345999,
"grad_norm": 0.002415160648524761,
"learning_rate": 3.171325832270038e-05,
"loss": 0.004,
"step": 8225
},
{
"epoch": 1.2918210762565896,
"grad_norm": 0.0005815212498418987,
"learning_rate": 3.164076093260643e-05,
"loss": 0.0018,
"step": 8250
},
{
"epoch": 1.2957356855785793,
"grad_norm": 0.0003597593167796731,
"learning_rate": 3.156826354251247e-05,
"loss": 0.0003,
"step": 8275
},
{
"epoch": 1.299650294900569,
"grad_norm": 0.004648554138839245,
"learning_rate": 3.1495766152418515e-05,
"loss": 0.005,
"step": 8300
},
{
"epoch": 1.3035649042225586,
"grad_norm": 0.0015794150531291962,
"learning_rate": 3.142326876232456e-05,
"loss": 0.0006,
"step": 8325
},
{
"epoch": 1.3074795135445483,
"grad_norm": 0.000883117550984025,
"learning_rate": 3.1350771372230606e-05,
"loss": 0.0022,
"step": 8350
},
{
"epoch": 1.3113941228665378,
"grad_norm": 0.0004549395525828004,
"learning_rate": 3.127827398213665e-05,
"loss": 0.0,
"step": 8375
},
{
"epoch": 1.3153087321885275,
"grad_norm": 0.00043308446765877306,
"learning_rate": 3.120577659204269e-05,
"loss": 0.0017,
"step": 8400
},
{
"epoch": 1.3192233415105172,
"grad_norm": 0.000435361813288182,
"learning_rate": 3.113327920194873e-05,
"loss": 0.0004,
"step": 8425
},
{
"epoch": 1.323137950832507,
"grad_norm": 0.000388374668546021,
"learning_rate": 3.1060781811854775e-05,
"loss": 0.0001,
"step": 8450
},
{
"epoch": 1.3270525601544967,
"grad_norm": 0.0006863649468868971,
"learning_rate": 3.098828442176082e-05,
"loss": 0.0004,
"step": 8475
},
{
"epoch": 1.3309671694764862,
"grad_norm": 0.00127976608928293,
"learning_rate": 3.091578703166686e-05,
"loss": 0.0018,
"step": 8500
},
{
"epoch": 1.334881778798476,
"grad_norm": 0.036596138030290604,
"learning_rate": 3.08432896415729e-05,
"loss": 0.0013,
"step": 8525
},
{
"epoch": 1.3387963881204656,
"grad_norm": 0.002909492002800107,
"learning_rate": 3.077369214708271e-05,
"loss": 0.0003,
"step": 8550
},
{
"epoch": 1.3427109974424551,
"grad_norm": 0.0010424726642668247,
"learning_rate": 3.070119475698875e-05,
"loss": 0.0024,
"step": 8575
},
{
"epoch": 1.3466256067644449,
"grad_norm": 0.0005914925131946802,
"learning_rate": 3.062869736689479e-05,
"loss": 0.0004,
"step": 8600
},
{
"epoch": 1.3505402160864346,
"grad_norm": 0.00037522296770475805,
"learning_rate": 3.0556199976800835e-05,
"loss": 0.0001,
"step": 8625
},
{
"epoch": 1.3544548254084243,
"grad_norm": 0.00039554465911351144,
"learning_rate": 3.048370258670688e-05,
"loss": 0.0003,
"step": 8650
},
{
"epoch": 1.3583694347304138,
"grad_norm": 0.0040852464735507965,
"learning_rate": 3.0411205196612923e-05,
"loss": 0.0004,
"step": 8675
},
{
"epoch": 1.3622840440524036,
"grad_norm": 0.006642700172960758,
"learning_rate": 3.033870780651897e-05,
"loss": 0.0014,
"step": 8700
},
{
"epoch": 1.3661986533743933,
"grad_norm": 0.003900151466950774,
"learning_rate": 3.026621041642501e-05,
"loss": 0.005,
"step": 8725
},
{
"epoch": 1.3701132626963828,
"grad_norm": 0.0015803135465830564,
"learning_rate": 3.0193713026331057e-05,
"loss": 0.0005,
"step": 8750
},
{
"epoch": 1.3740278720183725,
"grad_norm": 0.010884587652981281,
"learning_rate": 3.01212156362371e-05,
"loss": 0.0009,
"step": 8775
},
{
"epoch": 1.3779424813403622,
"grad_norm": 0.0010327239288017154,
"learning_rate": 3.004871824614314e-05,
"loss": 0.0002,
"step": 8800
},
{
"epoch": 1.381857090662352,
"grad_norm": 0.0019380106823518872,
"learning_rate": 2.9976220856049187e-05,
"loss": 0.0075,
"step": 8825
},
{
"epoch": 1.3857716999843417,
"grad_norm": 0.0012182651553303003,
"learning_rate": 2.9903723465955226e-05,
"loss": 0.0053,
"step": 8850
},
{
"epoch": 1.3896863093063312,
"grad_norm": 0.0017849428113549948,
"learning_rate": 2.9831226075861268e-05,
"loss": 0.0006,
"step": 8875
},
{
"epoch": 1.393600918628321,
"grad_norm": 0.01608388125896454,
"learning_rate": 2.9758728685767314e-05,
"loss": 0.0011,
"step": 8900
},
{
"epoch": 1.3975155279503104,
"grad_norm": 0.0005828512366861105,
"learning_rate": 2.9686231295673356e-05,
"loss": 0.0022,
"step": 8925
},
{
"epoch": 1.4014301372723001,
"grad_norm": 0.0004743439785670489,
"learning_rate": 2.9613733905579398e-05,
"loss": 0.0006,
"step": 8950
},
{
"epoch": 1.4053447465942899,
"grad_norm": 0.0005540683632716537,
"learning_rate": 2.9541236515485444e-05,
"loss": 0.0003,
"step": 8975
},
{
"epoch": 1.4092593559162796,
"grad_norm": 0.0015846255701035261,
"learning_rate": 2.9468739125391486e-05,
"loss": 0.003,
"step": 9000
},
{
"epoch": 1.4131739652382693,
"grad_norm": 0.0018151472322642803,
"learning_rate": 2.939624173529753e-05,
"loss": 0.0013,
"step": 9025
},
{
"epoch": 1.4170885745602588,
"grad_norm": 0.019647782668471336,
"learning_rate": 2.9323744345203574e-05,
"loss": 0.0005,
"step": 9050
},
{
"epoch": 1.4210031838822486,
"grad_norm": 0.0019365083426237106,
"learning_rate": 2.925124695510962e-05,
"loss": 0.0078,
"step": 9075
},
{
"epoch": 1.4249177932042383,
"grad_norm": 0.018348557874560356,
"learning_rate": 2.917874956501566e-05,
"loss": 0.0007,
"step": 9100
},
{
"epoch": 1.4288324025262278,
"grad_norm": 0.0018460671417415142,
"learning_rate": 2.9106252174921704e-05,
"loss": 0.0004,
"step": 9125
},
{
"epoch": 1.4327470118482175,
"grad_norm": 0.014430728740990162,
"learning_rate": 2.903375478482775e-05,
"loss": 0.0065,
"step": 9150
},
{
"epoch": 1.4366616211702072,
"grad_norm": 0.004876923281699419,
"learning_rate": 2.896125739473379e-05,
"loss": 0.0005,
"step": 9175
},
{
"epoch": 1.440576230492197,
"grad_norm": 0.012378478422760963,
"learning_rate": 2.8888760004639837e-05,
"loss": 0.0011,
"step": 9200
},
{
"epoch": 1.4444908398141865,
"grad_norm": 0.0017155319219455123,
"learning_rate": 2.881626261454588e-05,
"loss": 0.0005,
"step": 9225
},
{
"epoch": 1.4484054491361762,
"grad_norm": 0.0008338566403836012,
"learning_rate": 2.874376522445192e-05,
"loss": 0.0002,
"step": 9250
},
{
"epoch": 1.452320058458166,
"grad_norm": 0.18289905786514282,
"learning_rate": 2.867126783435796e-05,
"loss": 0.0012,
"step": 9275
},
{
"epoch": 1.4562346677801554,
"grad_norm": 0.0008503763237968087,
"learning_rate": 2.8598770444264006e-05,
"loss": 0.0005,
"step": 9300
},
{
"epoch": 1.4601492771021451,
"grad_norm": 0.0007721242727711797,
"learning_rate": 2.852627305417005e-05,
"loss": 0.0001,
"step": 9325
},
{
"epoch": 1.4640638864241349,
"grad_norm": 0.006053832825273275,
"learning_rate": 2.8453775664076094e-05,
"loss": 0.0004,
"step": 9350
},
{
"epoch": 1.4679784957461246,
"grad_norm": 0.002682841382920742,
"learning_rate": 2.8381278273982136e-05,
"loss": 0.0011,
"step": 9375
},
{
"epoch": 1.4718931050681143,
"grad_norm": 0.0006761788972653449,
"learning_rate": 2.8308780883888182e-05,
"loss": 0.0006,
"step": 9400
},
{
"epoch": 1.4758077143901038,
"grad_norm": 0.0006122990744188428,
"learning_rate": 2.8236283493794224e-05,
"loss": 0.0003,
"step": 9425
},
{
"epoch": 1.4797223237120936,
"grad_norm": 0.0022469067480415106,
"learning_rate": 2.816378610370027e-05,
"loss": 0.0001,
"step": 9450
},
{
"epoch": 1.483636933034083,
"grad_norm": 0.005789736285805702,
"learning_rate": 2.8091288713606312e-05,
"loss": 0.001,
"step": 9475
},
{
"epoch": 1.4875515423560728,
"grad_norm": 0.0005803314852528274,
"learning_rate": 2.8018791323512354e-05,
"loss": 0.0001,
"step": 9500
},
{
"epoch": 1.4914661516780625,
"grad_norm": 0.00044589489698410034,
"learning_rate": 2.79462939334184e-05,
"loss": 0.0001,
"step": 9525
},
{
"epoch": 1.4953807610000522,
"grad_norm": 0.00034716431400738657,
"learning_rate": 2.7873796543324442e-05,
"loss": 0.0004,
"step": 9550
},
{
"epoch": 1.499295370322042,
"grad_norm": 0.034700002521276474,
"learning_rate": 2.7801299153230488e-05,
"loss": 0.0003,
"step": 9575
},
{
"epoch": 1.5032099796440317,
"grad_norm": 0.00039778611971996725,
"learning_rate": 2.772880176313653e-05,
"loss": 0.0001,
"step": 9600
},
{
"epoch": 1.5071245889660212,
"grad_norm": 0.0003559018950909376,
"learning_rate": 2.7656304373042576e-05,
"loss": 0.0,
"step": 9625
},
{
"epoch": 1.5110391982880107,
"grad_norm": 0.7171289920806885,
"learning_rate": 2.7583806982948618e-05,
"loss": 0.001,
"step": 9650
},
{
"epoch": 1.5149538076100004,
"grad_norm": 0.0009173134458251297,
"learning_rate": 2.7511309592854657e-05,
"loss": 0.0002,
"step": 9675
},
{
"epoch": 1.5188684169319902,
"grad_norm": 0.6568087935447693,
"learning_rate": 2.74388122027607e-05,
"loss": 0.0032,
"step": 9700
},
{
"epoch": 1.5227830262539799,
"grad_norm": 0.03286755084991455,
"learning_rate": 2.7366314812666745e-05,
"loss": 0.0005,
"step": 9725
},
{
"epoch": 1.5266976355759696,
"grad_norm": 0.0004193273780401796,
"learning_rate": 2.7293817422572787e-05,
"loss": 0.0004,
"step": 9750
},
{
"epoch": 1.5306122448979593,
"grad_norm": 2.4084434509277344,
"learning_rate": 2.7221320032478832e-05,
"loss": 0.0061,
"step": 9775
},
{
"epoch": 1.5345268542199488,
"grad_norm": 0.020185716450214386,
"learning_rate": 2.7148822642384875e-05,
"loss": 0.0068,
"step": 9800
},
{
"epoch": 1.5384414635419386,
"grad_norm": 0.6322495937347412,
"learning_rate": 2.7076325252290917e-05,
"loss": 0.0015,
"step": 9825
},
{
"epoch": 1.542356072863928,
"grad_norm": 0.0004228654725011438,
"learning_rate": 2.7003827862196962e-05,
"loss": 0.0009,
"step": 9850
},
{
"epoch": 1.5462706821859178,
"grad_norm": 0.0012805104488506913,
"learning_rate": 2.6931330472103005e-05,
"loss": 0.0002,
"step": 9875
},
{
"epoch": 1.5501852915079075,
"grad_norm": 0.0005116848042234778,
"learning_rate": 2.685883308200905e-05,
"loss": 0.0006,
"step": 9900
},
{
"epoch": 1.5540999008298972,
"grad_norm": 0.8417395353317261,
"learning_rate": 2.6786335691915093e-05,
"loss": 0.0017,
"step": 9925
},
{
"epoch": 1.558014510151887,
"grad_norm": 0.0006132688722573221,
"learning_rate": 2.6713838301821138e-05,
"loss": 0.0002,
"step": 9950
},
{
"epoch": 1.5619291194738765,
"grad_norm": 0.001284563448280096,
"learning_rate": 2.664134091172718e-05,
"loss": 0.0001,
"step": 9975
},
{
"epoch": 1.5658437287958662,
"grad_norm": 0.002453350927680731,
"learning_rate": 2.6568843521633226e-05,
"loss": 0.0001,
"step": 10000
},
{
"epoch": 1.5697583381178557,
"grad_norm": 0.002474565990269184,
"learning_rate": 2.6496346131539268e-05,
"loss": 0.0,
"step": 10025
},
{
"epoch": 1.5736729474398454,
"grad_norm": 0.0012147346278652549,
"learning_rate": 2.642384874144531e-05,
"loss": 0.0031,
"step": 10050
},
{
"epoch": 1.5775875567618352,
"grad_norm": 0.0009614901500754058,
"learning_rate": 2.635135135135135e-05,
"loss": 0.0009,
"step": 10075
},
{
"epoch": 1.5815021660838249,
"grad_norm": 0.00043524886132217944,
"learning_rate": 2.6278853961257395e-05,
"loss": 0.0001,
"step": 10100
},
{
"epoch": 1.5854167754058146,
"grad_norm": 0.0005262857885099947,
"learning_rate": 2.6206356571163437e-05,
"loss": 0.0,
"step": 10125
},
{
"epoch": 1.5893313847278043,
"grad_norm": 0.00038553698686882854,
"learning_rate": 2.613385918106948e-05,
"loss": 0.0013,
"step": 10150
},
{
"epoch": 1.5932459940497938,
"grad_norm": 0.0006603036308661103,
"learning_rate": 2.6061361790975525e-05,
"loss": 0.0001,
"step": 10175
},
{
"epoch": 1.5971606033717833,
"grad_norm": 0.0011721713235601783,
"learning_rate": 2.5988864400881567e-05,
"loss": 0.0,
"step": 10200
},
{
"epoch": 1.601075212693773,
"grad_norm": 0.00034801868605427444,
"learning_rate": 2.5916367010787613e-05,
"loss": 0.0003,
"step": 10225
},
{
"epoch": 1.6049898220157628,
"grad_norm": 0.00029766836087219417,
"learning_rate": 2.5843869620693655e-05,
"loss": 0.0005,
"step": 10250
},
{
"epoch": 1.6089044313377525,
"grad_norm": 0.8273627161979675,
"learning_rate": 2.57713722305997e-05,
"loss": 0.004,
"step": 10275
},
{
"epoch": 1.6128190406597422,
"grad_norm": 0.0023189974017441273,
"learning_rate": 2.5698874840505743e-05,
"loss": 0.0005,
"step": 10300
},
{
"epoch": 1.616733649981732,
"grad_norm": 0.001266616047360003,
"learning_rate": 2.562637745041179e-05,
"loss": 0.0006,
"step": 10325
},
{
"epoch": 1.6206482593037215,
"grad_norm": 0.0006485527264885604,
"learning_rate": 2.555388006031783e-05,
"loss": 0.0001,
"step": 10350
},
{
"epoch": 1.6245628686257112,
"grad_norm": 0.01249407883733511,
"learning_rate": 2.5481382670223873e-05,
"loss": 0.0047,
"step": 10375
},
{
"epoch": 1.6284774779477007,
"grad_norm": 0.0016884652432054281,
"learning_rate": 2.540888528012992e-05,
"loss": 0.0008,
"step": 10400
},
{
"epoch": 1.6323920872696904,
"grad_norm": 0.0009969666134566069,
"learning_rate": 2.533638789003596e-05,
"loss": 0.0001,
"step": 10425
},
{
"epoch": 1.6363066965916802,
"grad_norm": 0.0008430654415860772,
"learning_rate": 2.5263890499942007e-05,
"loss": 0.0004,
"step": 10450
},
{
"epoch": 1.6402213059136699,
"grad_norm": 0.0007658881950192153,
"learning_rate": 2.519139310984805e-05,
"loss": 0.0001,
"step": 10475
},
{
"epoch": 1.6441359152356596,
"grad_norm": 0.0007439135224558413,
"learning_rate": 2.5118895719754088e-05,
"loss": 0.0,
"step": 10500
},
{
"epoch": 1.648050524557649,
"grad_norm": 0.0005683203344233334,
"learning_rate": 2.504639832966013e-05,
"loss": 0.0001,
"step": 10525
},
{
"epoch": 1.6519651338796388,
"grad_norm": 0.00042879345710389316,
"learning_rate": 2.497390093956618e-05,
"loss": 0.0,
"step": 10550
},
{
"epoch": 1.6558797432016283,
"grad_norm": 0.0004082492378074676,
"learning_rate": 2.490140354947222e-05,
"loss": 0.0001,
"step": 10575
},
{
"epoch": 1.659794352523618,
"grad_norm": 0.002024848246946931,
"learning_rate": 2.4828906159378263e-05,
"loss": 0.0006,
"step": 10600
},
{
"epoch": 1.6637089618456078,
"grad_norm": 0.3372742533683777,
"learning_rate": 2.4756408769284306e-05,
"loss": 0.0001,
"step": 10625
},
{
"epoch": 1.6676235711675975,
"grad_norm": 0.005234843585640192,
"learning_rate": 2.468391137919035e-05,
"loss": 0.0,
"step": 10650
},
{
"epoch": 1.6715381804895872,
"grad_norm": 0.0004937741323374212,
"learning_rate": 2.4611413989096393e-05,
"loss": 0.0005,
"step": 10675
},
{
"epoch": 1.675452789811577,
"grad_norm": 0.0007821121835149825,
"learning_rate": 2.4538916599002436e-05,
"loss": 0.0013,
"step": 10700
},
{
"epoch": 1.6793673991335665,
"grad_norm": 0.0010803567711263895,
"learning_rate": 2.446641920890848e-05,
"loss": 0.0004,
"step": 10725
},
{
"epoch": 1.683282008455556,
"grad_norm": 0.0005569527274928987,
"learning_rate": 2.4393921818814523e-05,
"loss": 0.0002,
"step": 10750
},
{
"epoch": 1.6871966177775457,
"grad_norm": 0.005404625087976456,
"learning_rate": 2.432142442872057e-05,
"loss": 0.0085,
"step": 10775
},
{
"epoch": 1.6911112270995354,
"grad_norm": 0.001234252005815506,
"learning_rate": 2.4248927038626608e-05,
"loss": 0.0014,
"step": 10800
},
{
"epoch": 1.6950258364215252,
"grad_norm": 0.0025794110260903835,
"learning_rate": 2.4176429648532654e-05,
"loss": 0.0014,
"step": 10825
},
{
"epoch": 1.6989404457435149,
"grad_norm": 0.07590831816196442,
"learning_rate": 2.4103932258438696e-05,
"loss": 0.0023,
"step": 10850
},
{
"epoch": 1.7028550550655046,
"grad_norm": 0.005912380293011665,
"learning_rate": 2.403143486834474e-05,
"loss": 0.0015,
"step": 10875
},
{
"epoch": 1.7067696643874941,
"grad_norm": 0.010333801619708538,
"learning_rate": 2.3958937478250784e-05,
"loss": 0.0001,
"step": 10900
},
{
"epoch": 1.7106842737094838,
"grad_norm": 0.01580122299492359,
"learning_rate": 2.388644008815683e-05,
"loss": 0.0018,
"step": 10925
},
{
"epoch": 1.7145988830314733,
"grad_norm": 0.10874010622501373,
"learning_rate": 2.381394269806287e-05,
"loss": 0.0012,
"step": 10950
},
{
"epoch": 1.718513492353463,
"grad_norm": 0.016742747277021408,
"learning_rate": 2.3741445307968914e-05,
"loss": 0.0009,
"step": 10975
},
{
"epoch": 1.7224281016754528,
"grad_norm": 0.012475020252168179,
"learning_rate": 2.366894791787496e-05,
"loss": 0.0007,
"step": 11000
},
{
"epoch": 1.7263427109974425,
"grad_norm": 0.1469310075044632,
"learning_rate": 2.3596450527781e-05,
"loss": 0.002,
"step": 11025
},
{
"epoch": 1.7302573203194322,
"grad_norm": 0.0017377269687131047,
"learning_rate": 2.3523953137687044e-05,
"loss": 0.0001,
"step": 11050
},
{
"epoch": 1.7341719296414217,
"grad_norm": 0.003490234026685357,
"learning_rate": 2.3451455747593086e-05,
"loss": 0.0001,
"step": 11075
},
{
"epoch": 1.7380865389634115,
"grad_norm": 0.008674775250256062,
"learning_rate": 2.337895835749913e-05,
"loss": 0.0039,
"step": 11100
},
{
"epoch": 1.742001148285401,
"grad_norm": 0.004905765876173973,
"learning_rate": 2.3306460967405174e-05,
"loss": 0.0023,
"step": 11125
},
{
"epoch": 1.7459157576073907,
"grad_norm": 0.0013971665175631642,
"learning_rate": 2.323396357731122e-05,
"loss": 0.0002,
"step": 11150
},
{
"epoch": 1.7498303669293804,
"grad_norm": 0.004542670212686062,
"learning_rate": 2.3161466187217262e-05,
"loss": 0.0001,
"step": 11175
},
{
"epoch": 1.7537449762513702,
"grad_norm": 0.0004924107925035059,
"learning_rate": 2.3088968797123307e-05,
"loss": 0.0001,
"step": 11200
},
{
"epoch": 1.7576595855733599,
"grad_norm": 0.0016612813342362642,
"learning_rate": 2.3016471407029346e-05,
"loss": 0.0003,
"step": 11225
},
{
"epoch": 1.7615741948953496,
"grad_norm": 0.0002968982153106481,
"learning_rate": 2.2943974016935392e-05,
"loss": 0.0001,
"step": 11250
},
{
"epoch": 1.7654888042173391,
"grad_norm": 0.00263870763592422,
"learning_rate": 2.2871476626841434e-05,
"loss": 0.0011,
"step": 11275
},
{
"epoch": 1.7694034135393286,
"grad_norm": 0.00835906621068716,
"learning_rate": 2.279897923674748e-05,
"loss": 0.0067,
"step": 11300
},
{
"epoch": 1.7733180228613183,
"grad_norm": 0.0007750336080789566,
"learning_rate": 2.2726481846653522e-05,
"loss": 0.0003,
"step": 11325
},
{
"epoch": 1.777232632183308,
"grad_norm": 0.0028884296771138906,
"learning_rate": 2.2653984456559564e-05,
"loss": 0.0023,
"step": 11350
},
{
"epoch": 1.7811472415052978,
"grad_norm": 0.042546164244413376,
"learning_rate": 2.258148706646561e-05,
"loss": 0.0003,
"step": 11375
},
{
"epoch": 1.7850618508272875,
"grad_norm": 0.0007674749358557165,
"learning_rate": 2.2508989676371652e-05,
"loss": 0.0003,
"step": 11400
},
{
"epoch": 1.7889764601492772,
"grad_norm": 0.040151335299015045,
"learning_rate": 2.2436492286277694e-05,
"loss": 0.0002,
"step": 11425
},
{
"epoch": 1.7928910694712668,
"grad_norm": 0.0003488350484985858,
"learning_rate": 2.2363994896183736e-05,
"loss": 0.0,
"step": 11450
},
{
"epoch": 1.7968056787932565,
"grad_norm": 0.25811877846717834,
"learning_rate": 2.2291497506089782e-05,
"loss": 0.0004,
"step": 11475
},
{
"epoch": 1.800720288115246,
"grad_norm": 0.00024293421301990747,
"learning_rate": 2.2219000115995824e-05,
"loss": 0.0003,
"step": 11500
},
{
"epoch": 1.8046348974372357,
"grad_norm": 0.004234221298247576,
"learning_rate": 2.214650272590187e-05,
"loss": 0.0029,
"step": 11525
},
{
"epoch": 1.8085495067592254,
"grad_norm": 0.0003131197008769959,
"learning_rate": 2.2074005335807912e-05,
"loss": 0.0002,
"step": 11550
},
{
"epoch": 1.8124641160812152,
"grad_norm": 0.05105828866362572,
"learning_rate": 2.2001507945713958e-05,
"loss": 0.0008,
"step": 11575
},
{
"epoch": 1.8163787254032049,
"grad_norm": 0.014320386573672295,
"learning_rate": 2.192901055562e-05,
"loss": 0.0009,
"step": 11600
},
{
"epoch": 1.8202933347251944,
"grad_norm": 0.0003410752979107201,
"learning_rate": 2.1856513165526042e-05,
"loss": 0.0007,
"step": 11625
},
{
"epoch": 1.8242079440471841,
"grad_norm": 0.0003042153548449278,
"learning_rate": 2.1784015775432085e-05,
"loss": 0.0006,
"step": 11650
},
{
"epoch": 1.8281225533691736,
"grad_norm": 0.1060762032866478,
"learning_rate": 2.1711518385338127e-05,
"loss": 0.0001,
"step": 11675
},
{
"epoch": 1.8320371626911633,
"grad_norm": 0.0008619217551313341,
"learning_rate": 2.1639020995244172e-05,
"loss": 0.0006,
"step": 11700
},
{
"epoch": 1.835951772013153,
"grad_norm": 0.0005810207221657038,
"learning_rate": 2.1566523605150215e-05,
"loss": 0.0005,
"step": 11725
},
{
"epoch": 1.8398663813351428,
"grad_norm": 0.005664344877004623,
"learning_rate": 2.149402621505626e-05,
"loss": 0.0026,
"step": 11750
},
{
"epoch": 1.8437809906571325,
"grad_norm": 4.294293403625488,
"learning_rate": 2.1421528824962302e-05,
"loss": 0.0007,
"step": 11775
},
{
"epoch": 1.8476955999791222,
"grad_norm": 0.040877822786569595,
"learning_rate": 2.1349031434868348e-05,
"loss": 0.0021,
"step": 11800
},
{
"epoch": 1.8516102093011118,
"grad_norm": 0.003679527435451746,
"learning_rate": 2.127653404477439e-05,
"loss": 0.0006,
"step": 11825
},
{
"epoch": 1.8555248186231013,
"grad_norm": 0.003342527663335204,
"learning_rate": 2.1204036654680433e-05,
"loss": 0.0002,
"step": 11850
},
{
"epoch": 1.859439427945091,
"grad_norm": 0.000454226101282984,
"learning_rate": 2.1131539264586475e-05,
"loss": 0.0003,
"step": 11875
},
{
"epoch": 1.8633540372670807,
"grad_norm": 0.00024604357895441353,
"learning_rate": 2.105904187449252e-05,
"loss": 0.0004,
"step": 11900
},
{
"epoch": 1.8672686465890704,
"grad_norm": 0.00022296722454484552,
"learning_rate": 2.0986544484398563e-05,
"loss": 0.0003,
"step": 11925
},
{
"epoch": 1.8711832559110602,
"grad_norm": 0.0013281836872920394,
"learning_rate": 2.0914047094304605e-05,
"loss": 0.0012,
"step": 11950
},
{
"epoch": 1.8750978652330499,
"grad_norm": 0.00042916362872347236,
"learning_rate": 2.084154970421065e-05,
"loss": 0.0006,
"step": 11975
},
{
"epoch": 1.8790124745550394,
"grad_norm": 0.0013623477425426245,
"learning_rate": 2.0769052314116693e-05,
"loss": 0.0014,
"step": 12000
},
{
"epoch": 1.8829270838770291,
"grad_norm": 0.0005729927215725183,
"learning_rate": 2.069655492402274e-05,
"loss": 0.0047,
"step": 12025
},
{
"epoch": 1.8868416931990186,
"grad_norm": 0.0161959920078516,
"learning_rate": 2.0624057533928777e-05,
"loss": 0.0002,
"step": 12050
},
{
"epoch": 1.8907563025210083,
"grad_norm": 0.05182856693863869,
"learning_rate": 2.0551560143834823e-05,
"loss": 0.0011,
"step": 12075
},
{
"epoch": 1.894670911842998,
"grad_norm": 0.0009345468715764582,
"learning_rate": 2.0479062753740865e-05,
"loss": 0.0004,
"step": 12100
},
{
"epoch": 1.8985855211649878,
"grad_norm": 0.004085169639438391,
"learning_rate": 2.040656536364691e-05,
"loss": 0.0009,
"step": 12125
},
{
"epoch": 1.9025001304869775,
"grad_norm": 0.003939950373023748,
"learning_rate": 2.0334067973552953e-05,
"loss": 0.0002,
"step": 12150
},
{
"epoch": 1.906414739808967,
"grad_norm": 0.0006880080327391624,
"learning_rate": 2.0261570583459e-05,
"loss": 0.0001,
"step": 12175
},
{
"epoch": 1.9103293491309568,
"grad_norm": 0.01777348481118679,
"learning_rate": 2.018907319336504e-05,
"loss": 0.0001,
"step": 12200
},
{
"epoch": 1.9142439584529463,
"grad_norm": 0.0002502555726096034,
"learning_rate": 2.0116575803271083e-05,
"loss": 0.0005,
"step": 12225
},
{
"epoch": 1.918158567774936,
"grad_norm": 0.0007615393842570484,
"learning_rate": 2.0044078413177125e-05,
"loss": 0.0022,
"step": 12250
},
{
"epoch": 1.9220731770969257,
"grad_norm": 0.008713331073522568,
"learning_rate": 1.9971581023083167e-05,
"loss": 0.001,
"step": 12275
},
{
"epoch": 1.9259877864189154,
"grad_norm": 0.003203247208148241,
"learning_rate": 1.9899083632989213e-05,
"loss": 0.0031,
"step": 12300
},
{
"epoch": 1.9299023957409052,
"grad_norm": 0.02553451806306839,
"learning_rate": 1.9826586242895255e-05,
"loss": 0.001,
"step": 12325
},
{
"epoch": 1.933817005062895,
"grad_norm": 0.045750390738248825,
"learning_rate": 1.97540888528013e-05,
"loss": 0.0007,
"step": 12350
},
{
"epoch": 1.9377316143848844,
"grad_norm": 0.0004758847935590893,
"learning_rate": 1.9681591462707343e-05,
"loss": 0.001,
"step": 12375
},
{
"epoch": 1.941646223706874,
"grad_norm": 0.0024788689333945513,
"learning_rate": 1.960909407261339e-05,
"loss": 0.0003,
"step": 12400
},
{
"epoch": 1.9455608330288636,
"grad_norm": 0.0014538065297529101,
"learning_rate": 1.953659668251943e-05,
"loss": 0.0007,
"step": 12425
},
{
"epoch": 1.9494754423508533,
"grad_norm": 0.00023535569198429585,
"learning_rate": 1.9464099292425473e-05,
"loss": 0.0003,
"step": 12450
},
{
"epoch": 1.953390051672843,
"grad_norm": 0.0002048378373729065,
"learning_rate": 1.9391601902331515e-05,
"loss": 0.0001,
"step": 12475
},
{
"epoch": 1.9573046609948328,
"grad_norm": 0.0004028423863928765,
"learning_rate": 1.931910451223756e-05,
"loss": 0.0008,
"step": 12500
},
{
"epoch": 1.9612192703168225,
"grad_norm": 0.0021086076740175486,
"learning_rate": 1.9246607122143603e-05,
"loss": 0.0002,
"step": 12525
},
{
"epoch": 1.965133879638812,
"grad_norm": 0.00085318653145805,
"learning_rate": 1.9174109732049646e-05,
"loss": 0.0001,
"step": 12550
},
{
"epoch": 1.9690484889608018,
"grad_norm": 0.00021198119793552905,
"learning_rate": 1.910161234195569e-05,
"loss": 0.0007,
"step": 12575
},
{
"epoch": 1.9729630982827913,
"grad_norm": 0.00025199473020620644,
"learning_rate": 1.9029114951861733e-05,
"loss": 0.0011,
"step": 12600
},
{
"epoch": 1.976877707604781,
"grad_norm": 0.0007640988333150744,
"learning_rate": 1.895661756176778e-05,
"loss": 0.0003,
"step": 12625
},
{
"epoch": 1.9807923169267707,
"grad_norm": 0.013913657516241074,
"learning_rate": 1.888412017167382e-05,
"loss": 0.0001,
"step": 12650
},
{
"epoch": 1.9847069262487604,
"grad_norm": 0.00018586177611723542,
"learning_rate": 1.8811622781579863e-05,
"loss": 0.001,
"step": 12675
},
{
"epoch": 1.9886215355707502,
"grad_norm": 0.00032623313018120825,
"learning_rate": 1.8739125391485906e-05,
"loss": 0.0,
"step": 12700
},
{
"epoch": 1.9925361448927397,
"grad_norm": 0.00017907471919897944,
"learning_rate": 1.866662800139195e-05,
"loss": 0.0,
"step": 12725
},
{
"epoch": 1.9964507542147294,
"grad_norm": 1.226132869720459,
"learning_rate": 1.8594130611297994e-05,
"loss": 0.0007,
"step": 12750
},
{
"epoch": 1.9998956104180803,
"eval_accuracy": 0.9998184512550563,
"eval_f1": 0.9998301719182735,
"eval_loss": 0.0009782494744285941,
"eval_precision": 0.999817110608891,
"eval_recall": 0.9998432335689185,
"eval_runtime": 63.1773,
"eval_samples_per_second": 606.515,
"eval_steps_per_second": 37.909,
"step": 12772
},
{
"epoch": 2.000365363536719,
"grad_norm": 0.0005716659361496568,
"learning_rate": 1.852163322120404e-05,
"loss": 0.0,
"step": 12775
},
{
"epoch": 2.0042799728587086,
"grad_norm": 0.0003549535758793354,
"learning_rate": 1.844913583111008e-05,
"loss": 0.0016,
"step": 12800
},
{
"epoch": 2.0081945821806984,
"grad_norm": 0.012497562915086746,
"learning_rate": 1.8376638441016124e-05,
"loss": 0.0003,
"step": 12825
},
{
"epoch": 2.012109191502688,
"grad_norm": 0.0003994225990027189,
"learning_rate": 1.830414105092217e-05,
"loss": 0.0022,
"step": 12850
},
{
"epoch": 2.016023800824678,
"grad_norm": 0.0007454080041497946,
"learning_rate": 1.823164366082821e-05,
"loss": 0.0001,
"step": 12875
},
{
"epoch": 2.0199384101466675,
"grad_norm": 0.0001763895561452955,
"learning_rate": 1.8159146270734254e-05,
"loss": 0.0001,
"step": 12900
},
{
"epoch": 2.023853019468657,
"grad_norm": 1.3950115442276,
"learning_rate": 1.8086648880640296e-05,
"loss": 0.0002,
"step": 12925
},
{
"epoch": 2.0277676287906465,
"grad_norm": 0.00019921216880902648,
"learning_rate": 1.801415149054634e-05,
"loss": 0.0003,
"step": 12950
},
{
"epoch": 2.0316822381126363,
"grad_norm": 0.00017710919200908393,
"learning_rate": 1.7941654100452384e-05,
"loss": 0.0001,
"step": 12975
},
{
"epoch": 2.035596847434626,
"grad_norm": 0.0029750317335128784,
"learning_rate": 1.7872056605962187e-05,
"loss": 0.0048,
"step": 13000
},
{
"epoch": 2.0395114567566157,
"grad_norm": 0.006306216586381197,
"learning_rate": 1.779955921586823e-05,
"loss": 0.0001,
"step": 13025
},
{
"epoch": 2.0434260660786054,
"grad_norm": 0.0116845378652215,
"learning_rate": 1.7727061825774275e-05,
"loss": 0.0001,
"step": 13050
},
{
"epoch": 2.047340675400595,
"grad_norm": 0.6511118412017822,
"learning_rate": 1.7654564435680314e-05,
"loss": 0.0003,
"step": 13075
},
{
"epoch": 2.051255284722585,
"grad_norm": 0.036821216344833374,
"learning_rate": 1.758206704558636e-05,
"loss": 0.0007,
"step": 13100
},
{
"epoch": 2.055169894044574,
"grad_norm": 0.002877579303458333,
"learning_rate": 1.7509569655492402e-05,
"loss": 0.0001,
"step": 13125
},
{
"epoch": 2.059084503366564,
"grad_norm": 0.3921562433242798,
"learning_rate": 1.7437072265398448e-05,
"loss": 0.0015,
"step": 13150
},
{
"epoch": 2.0629991126885536,
"grad_norm": 0.0009924178011715412,
"learning_rate": 1.736457487530449e-05,
"loss": 0.0001,
"step": 13175
},
{
"epoch": 2.0669137220105434,
"grad_norm": 0.00017288805975113064,
"learning_rate": 1.7292077485210536e-05,
"loss": 0.0,
"step": 13200
},
{
"epoch": 2.070828331332533,
"grad_norm": 0.002701199846342206,
"learning_rate": 1.7222479990720336e-05,
"loss": 0.0002,
"step": 13225
},
{
"epoch": 2.074742940654523,
"grad_norm": 0.0008174364338628948,
"learning_rate": 1.7149982600626378e-05,
"loss": 0.0002,
"step": 13250
},
{
"epoch": 2.0786575499765125,
"grad_norm": 0.0007628415478393435,
"learning_rate": 1.707748521053242e-05,
"loss": 0.0026,
"step": 13275
},
{
"epoch": 2.082572159298502,
"grad_norm": 0.002812017919495702,
"learning_rate": 1.7004987820438466e-05,
"loss": 0.0005,
"step": 13300
},
{
"epoch": 2.0864867686204915,
"grad_norm": 0.0008141273865476251,
"learning_rate": 1.6932490430344508e-05,
"loss": 0.0011,
"step": 13325
},
{
"epoch": 2.0904013779424813,
"grad_norm": 0.001382953836582601,
"learning_rate": 1.685999304025055e-05,
"loss": 0.0001,
"step": 13350
},
{
"epoch": 2.094315987264471,
"grad_norm": 0.0003401144640520215,
"learning_rate": 1.6787495650156596e-05,
"loss": 0.0007,
"step": 13375
},
{
"epoch": 2.0982305965864607,
"grad_norm": 0.0013442619238048792,
"learning_rate": 1.6714998260062638e-05,
"loss": 0.0003,
"step": 13400
},
{
"epoch": 2.1021452059084504,
"grad_norm": 0.0014413492754101753,
"learning_rate": 1.6642500869968684e-05,
"loss": 0.0,
"step": 13425
},
{
"epoch": 2.10605981523044,
"grad_norm": 0.0010517615592107177,
"learning_rate": 1.6570003479874726e-05,
"loss": 0.0001,
"step": 13450
},
{
"epoch": 2.10997442455243,
"grad_norm": 0.0006386275636032224,
"learning_rate": 1.6497506089780768e-05,
"loss": 0.0001,
"step": 13475
},
{
"epoch": 2.113889033874419,
"grad_norm": 0.06532581895589828,
"learning_rate": 1.642500869968681e-05,
"loss": 0.0,
"step": 13500
},
{
"epoch": 2.117803643196409,
"grad_norm": 0.6693994402885437,
"learning_rate": 1.6352511309592856e-05,
"loss": 0.0003,
"step": 13525
},
{
"epoch": 2.1217182525183986,
"grad_norm": 0.00038365976070053875,
"learning_rate": 1.62800139194989e-05,
"loss": 0.001,
"step": 13550
},
{
"epoch": 2.1256328618403884,
"grad_norm": 0.00022832312970422208,
"learning_rate": 1.6207516529404944e-05,
"loss": 0.0005,
"step": 13575
},
{
"epoch": 2.129547471162378,
"grad_norm": 0.0010044885566458106,
"learning_rate": 1.6135019139310986e-05,
"loss": 0.0002,
"step": 13600
},
{
"epoch": 2.133462080484368,
"grad_norm": 0.0077380407601594925,
"learning_rate": 1.606252174921703e-05,
"loss": 0.0006,
"step": 13625
},
{
"epoch": 2.1373766898063575,
"grad_norm": 0.004086634609848261,
"learning_rate": 1.5990024359123074e-05,
"loss": 0.0,
"step": 13650
},
{
"epoch": 2.141291299128347,
"grad_norm": 0.0014021744718775153,
"learning_rate": 1.5917526969029113e-05,
"loss": 0.0004,
"step": 13675
},
{
"epoch": 2.1452059084503365,
"grad_norm": 0.0007703950395807624,
"learning_rate": 1.584502957893516e-05,
"loss": 0.0001,
"step": 13700
},
{
"epoch": 2.1491205177723263,
"grad_norm": 0.041545968502759933,
"learning_rate": 1.57725321888412e-05,
"loss": 0.0131,
"step": 13725
},
{
"epoch": 2.153035127094316,
"grad_norm": 0.005660552531480789,
"learning_rate": 1.5700034798747246e-05,
"loss": 0.0033,
"step": 13750
},
{
"epoch": 2.1569497364163057,
"grad_norm": 0.00475983927026391,
"learning_rate": 1.562753740865329e-05,
"loss": 0.0003,
"step": 13775
},
{
"epoch": 2.1608643457382954,
"grad_norm": 0.8047095537185669,
"learning_rate": 1.5555040018559334e-05,
"loss": 0.0003,
"step": 13800
},
{
"epoch": 2.164778955060285,
"grad_norm": 0.0028548124246299267,
"learning_rate": 1.5482542628465376e-05,
"loss": 0.0002,
"step": 13825
},
{
"epoch": 2.1686935643822745,
"grad_norm": 0.01292176079005003,
"learning_rate": 1.5410045238371422e-05,
"loss": 0.0004,
"step": 13850
},
{
"epoch": 2.172608173704264,
"grad_norm": 0.00023661291925236583,
"learning_rate": 1.533754784827746e-05,
"loss": 0.0003,
"step": 13875
},
{
"epoch": 2.176522783026254,
"grad_norm": 0.0002082917490042746,
"learning_rate": 1.5265050458183507e-05,
"loss": 0.0,
"step": 13900
},
{
"epoch": 2.1804373923482436,
"grad_norm": 0.0005662673502229154,
"learning_rate": 1.5192553068089549e-05,
"loss": 0.0001,
"step": 13925
},
{
"epoch": 2.1843520016702334,
"grad_norm": 0.00044558930676430464,
"learning_rate": 1.5120055677995593e-05,
"loss": 0.0,
"step": 13950
},
{
"epoch": 2.188266610992223,
"grad_norm": 0.00019830386736430228,
"learning_rate": 1.5047558287901637e-05,
"loss": 0.0015,
"step": 13975
},
{
"epoch": 2.192181220314213,
"grad_norm": 0.00026091316249221563,
"learning_rate": 1.497506089780768e-05,
"loss": 0.0003,
"step": 14000
},
{
"epoch": 2.196095829636202,
"grad_norm": 0.00025246432051062584,
"learning_rate": 1.4902563507713724e-05,
"loss": 0.0,
"step": 14025
},
{
"epoch": 2.200010438958192,
"grad_norm": 0.13300319015979767,
"learning_rate": 1.4830066117619767e-05,
"loss": 0.001,
"step": 14050
},
{
"epoch": 2.2039250482801815,
"grad_norm": 0.00025016642757691443,
"learning_rate": 1.4757568727525809e-05,
"loss": 0.0001,
"step": 14075
},
{
"epoch": 2.2078396576021713,
"grad_norm": 0.0002115444076480344,
"learning_rate": 1.4685071337431853e-05,
"loss": 0.0001,
"step": 14100
},
{
"epoch": 2.211754266924161,
"grad_norm": 0.028121890500187874,
"learning_rate": 1.4612573947337895e-05,
"loss": 0.0009,
"step": 14125
},
{
"epoch": 2.2156688762461507,
"grad_norm": 0.00024847922031767666,
"learning_rate": 1.4540076557243939e-05,
"loss": 0.0002,
"step": 14150
},
{
"epoch": 2.2195834855681404,
"grad_norm": 0.0006722984835505486,
"learning_rate": 1.4467579167149983e-05,
"loss": 0.0006,
"step": 14175
},
{
"epoch": 2.22349809489013,
"grad_norm": 0.004081141669303179,
"learning_rate": 1.4395081777056027e-05,
"loss": 0.0001,
"step": 14200
},
{
"epoch": 2.2274127042121195,
"grad_norm": 0.0002169485087506473,
"learning_rate": 1.432258438696207e-05,
"loss": 0.0,
"step": 14225
},
{
"epoch": 2.231327313534109,
"grad_norm": 0.0004130221204832196,
"learning_rate": 1.4250086996868115e-05,
"loss": 0.0,
"step": 14250
},
{
"epoch": 2.235241922856099,
"grad_norm": 0.00018515564443077892,
"learning_rate": 1.4177589606774159e-05,
"loss": 0.0,
"step": 14275
},
{
"epoch": 2.2391565321780886,
"grad_norm": 0.00019148353021591902,
"learning_rate": 1.41050922166802e-05,
"loss": 0.0001,
"step": 14300
},
{
"epoch": 2.2430711415000784,
"grad_norm": 0.0020026888232678175,
"learning_rate": 1.4032594826586243e-05,
"loss": 0.0,
"step": 14325
},
{
"epoch": 2.246985750822068,
"grad_norm": 0.00482906075194478,
"learning_rate": 1.3960097436492287e-05,
"loss": 0.0,
"step": 14350
},
{
"epoch": 2.250900360144058,
"grad_norm": 0.00026496723876334727,
"learning_rate": 1.3887600046398331e-05,
"loss": 0.0,
"step": 14375
},
{
"epoch": 2.2548149694660475,
"grad_norm": 0.0002499364491086453,
"learning_rate": 1.3815102656304373e-05,
"loss": 0.0004,
"step": 14400
},
{
"epoch": 2.258729578788037,
"grad_norm": 0.00017081611440517008,
"learning_rate": 1.3742605266210417e-05,
"loss": 0.0,
"step": 14425
},
{
"epoch": 2.2626441881100265,
"grad_norm": 0.00017463510448578745,
"learning_rate": 1.3670107876116461e-05,
"loss": 0.0,
"step": 14450
},
{
"epoch": 2.2665587974320163,
"grad_norm": 0.0022245387081056833,
"learning_rate": 1.3597610486022505e-05,
"loss": 0.0091,
"step": 14475
},
{
"epoch": 2.270473406754006,
"grad_norm": 0.007297486532479525,
"learning_rate": 1.3525113095928546e-05,
"loss": 0.0004,
"step": 14500
},
{
"epoch": 2.2743880160759957,
"grad_norm": 0.005688577424734831,
"learning_rate": 1.345261570583459e-05,
"loss": 0.0002,
"step": 14525
},
{
"epoch": 2.2783026253979854,
"grad_norm": 0.004135147202759981,
"learning_rate": 1.3380118315740633e-05,
"loss": 0.0024,
"step": 14550
},
{
"epoch": 2.282217234719975,
"grad_norm": 0.002322606975212693,
"learning_rate": 1.3307620925646677e-05,
"loss": 0.0005,
"step": 14575
},
{
"epoch": 2.2861318440419645,
"grad_norm": 0.002005909802392125,
"learning_rate": 1.3235123535552721e-05,
"loss": 0.001,
"step": 14600
},
{
"epoch": 2.290046453363954,
"grad_norm": 0.0009009299101307988,
"learning_rate": 1.3162626145458765e-05,
"loss": 0.0009,
"step": 14625
},
{
"epoch": 2.293961062685944,
"grad_norm": 0.009488469921052456,
"learning_rate": 1.3090128755364809e-05,
"loss": 0.0001,
"step": 14650
},
{
"epoch": 2.2978756720079336,
"grad_norm": 0.017507528886198997,
"learning_rate": 1.3017631365270851e-05,
"loss": 0.0038,
"step": 14675
},
{
"epoch": 2.3017902813299234,
"grad_norm": 0.0015606528613716364,
"learning_rate": 1.2945133975176894e-05,
"loss": 0.0003,
"step": 14700
},
{
"epoch": 2.305704890651913,
"grad_norm": 0.0004906764370389283,
"learning_rate": 1.2872636585082937e-05,
"loss": 0.0006,
"step": 14725
},
{
"epoch": 2.309619499973903,
"grad_norm": 0.001650349353440106,
"learning_rate": 1.280013919498898e-05,
"loss": 0.0001,
"step": 14750
},
{
"epoch": 2.313534109295892,
"grad_norm": 0.00038060618680901825,
"learning_rate": 1.2727641804895024e-05,
"loss": 0.0005,
"step": 14775
},
{
"epoch": 2.317448718617882,
"grad_norm": 0.001028302125632763,
"learning_rate": 1.2655144414801068e-05,
"loss": 0.0001,
"step": 14800
},
{
"epoch": 2.3213633279398715,
"grad_norm": 0.007792349439114332,
"learning_rate": 1.2582647024707112e-05,
"loss": 0.0002,
"step": 14825
},
{
"epoch": 2.3252779372618613,
"grad_norm": 0.01641431264579296,
"learning_rate": 1.2510149634613155e-05,
"loss": 0.0001,
"step": 14850
},
{
"epoch": 2.329192546583851,
"grad_norm": 0.0006683383253403008,
"learning_rate": 1.2437652244519198e-05,
"loss": 0.0022,
"step": 14875
},
{
"epoch": 2.3331071559058407,
"grad_norm": 0.0009384675067849457,
"learning_rate": 1.2365154854425242e-05,
"loss": 0.0017,
"step": 14900
},
{
"epoch": 2.3370217652278304,
"grad_norm": 0.00037563694058917463,
"learning_rate": 1.2292657464331286e-05,
"loss": 0.0011,
"step": 14925
},
{
"epoch": 2.3409363745498197,
"grad_norm": 0.00020698497246485204,
"learning_rate": 1.222016007423733e-05,
"loss": 0.0008,
"step": 14950
},
{
"epoch": 2.3448509838718095,
"grad_norm": 0.0001723883324302733,
"learning_rate": 1.2147662684143372e-05,
"loss": 0.0001,
"step": 14975
},
{
"epoch": 2.348765593193799,
"grad_norm": 0.06703776121139526,
"learning_rate": 1.2075165294049416e-05,
"loss": 0.0001,
"step": 15000
},
{
"epoch": 2.352680202515789,
"grad_norm": 0.00018680775247048587,
"learning_rate": 1.2002667903955458e-05,
"loss": 0.0,
"step": 15025
},
{
"epoch": 2.3565948118377786,
"grad_norm": 0.0001830816181609407,
"learning_rate": 1.1930170513861502e-05,
"loss": 0.0,
"step": 15050
},
{
"epoch": 2.3605094211597684,
"grad_norm": 0.00020273331028874964,
"learning_rate": 1.1857673123767544e-05,
"loss": 0.0001,
"step": 15075
},
{
"epoch": 2.364424030481758,
"grad_norm": 0.00020694882550742477,
"learning_rate": 1.1785175733673588e-05,
"loss": 0.0003,
"step": 15100
},
{
"epoch": 2.3683386398037474,
"grad_norm": 0.0007374598644673824,
"learning_rate": 1.1712678343579632e-05,
"loss": 0.0061,
"step": 15125
},
{
"epoch": 2.372253249125737,
"grad_norm": 0.01134900189936161,
"learning_rate": 1.1640180953485676e-05,
"loss": 0.0002,
"step": 15150
},
{
"epoch": 2.376167858447727,
"grad_norm": 0.000648992951028049,
"learning_rate": 1.1567683563391718e-05,
"loss": 0.0001,
"step": 15175
},
{
"epoch": 2.3800824677697165,
"grad_norm": 0.0004858991305809468,
"learning_rate": 1.1495186173297762e-05,
"loss": 0.0,
"step": 15200
},
{
"epoch": 2.3839970770917063,
"grad_norm": 0.0870414674282074,
"learning_rate": 1.1422688783203806e-05,
"loss": 0.0002,
"step": 15225
},
{
"epoch": 2.387911686413696,
"grad_norm": 0.0006449563661590219,
"learning_rate": 1.135019139310985e-05,
"loss": 0.0,
"step": 15250
},
{
"epoch": 2.3918262957356857,
"grad_norm": 0.00027997951838187873,
"learning_rate": 1.1277694003015892e-05,
"loss": 0.0001,
"step": 15275
},
{
"epoch": 2.395740905057675,
"grad_norm": 0.9710797667503357,
"learning_rate": 1.1205196612921936e-05,
"loss": 0.0,
"step": 15300
},
{
"epoch": 2.3996555143796647,
"grad_norm": 0.00019451680418569595,
"learning_rate": 1.1132699222827978e-05,
"loss": 0.0,
"step": 15325
},
{
"epoch": 2.4035701237016545,
"grad_norm": 0.0015565232606604695,
"learning_rate": 1.1060201832734022e-05,
"loss": 0.0038,
"step": 15350
},
{
"epoch": 2.407484733023644,
"grad_norm": 0.00032800339977256954,
"learning_rate": 1.0987704442640064e-05,
"loss": 0.0002,
"step": 15375
},
{
"epoch": 2.411399342345634,
"grad_norm": 0.017102686688303947,
"learning_rate": 1.0915207052546108e-05,
"loss": 0.0005,
"step": 15400
},
{
"epoch": 2.4153139516676236,
"grad_norm": 0.001418459229171276,
"learning_rate": 1.0842709662452152e-05,
"loss": 0.0,
"step": 15425
},
{
"epoch": 2.4192285609896134,
"grad_norm": 0.015620172023773193,
"learning_rate": 1.0770212272358196e-05,
"loss": 0.0001,
"step": 15450
},
{
"epoch": 2.423143170311603,
"grad_norm": 0.0006858358392491937,
"learning_rate": 1.0697714882264238e-05,
"loss": 0.0018,
"step": 15475
},
{
"epoch": 2.427057779633593,
"grad_norm": 0.0004693476075772196,
"learning_rate": 1.0625217492170282e-05,
"loss": 0.0001,
"step": 15500
},
{
"epoch": 2.430972388955582,
"grad_norm": 0.34811919927597046,
"learning_rate": 1.0552720102076326e-05,
"loss": 0.0002,
"step": 15525
},
{
"epoch": 2.434886998277572,
"grad_norm": 0.0014850205043330789,
"learning_rate": 1.048022271198237e-05,
"loss": 0.0001,
"step": 15550
},
{
"epoch": 2.4388016075995615,
"grad_norm": 0.000363474857294932,
"learning_rate": 1.0407725321888412e-05,
"loss": 0.0004,
"step": 15575
},
{
"epoch": 2.4427162169215513,
"grad_norm": 0.0010670394403859973,
"learning_rate": 1.0335227931794456e-05,
"loss": 0.0004,
"step": 15600
},
{
"epoch": 2.446630826243541,
"grad_norm": 0.14377856254577637,
"learning_rate": 1.0262730541700499e-05,
"loss": 0.0019,
"step": 15625
},
{
"epoch": 2.4505454355655307,
"grad_norm": 0.0004735889961011708,
"learning_rate": 1.0190233151606542e-05,
"loss": 0.0001,
"step": 15650
},
{
"epoch": 2.4544600448875205,
"grad_norm": 0.0004282770969439298,
"learning_rate": 1.0117735761512585e-05,
"loss": 0.0026,
"step": 15675
},
{
"epoch": 2.4583746542095097,
"grad_norm": 0.007325559854507446,
"learning_rate": 1.0045238371418629e-05,
"loss": 0.0007,
"step": 15700
},
{
"epoch": 2.4622892635314995,
"grad_norm": 0.000777337234467268,
"learning_rate": 9.972740981324673e-06,
"loss": 0.0009,
"step": 15725
},
{
"epoch": 2.466203872853489,
"grad_norm": 0.003926098812371492,
"learning_rate": 9.900243591230716e-06,
"loss": 0.0004,
"step": 15750
},
{
"epoch": 2.470118482175479,
"grad_norm": 0.00045515818055719137,
"learning_rate": 9.82774620113676e-06,
"loss": 0.0001,
"step": 15775
},
{
"epoch": 2.4740330914974686,
"grad_norm": 0.01267548743635416,
"learning_rate": 9.755248811042803e-06,
"loss": 0.0002,
"step": 15800
},
{
"epoch": 2.4779477008194584,
"grad_norm": 0.0004507755220402032,
"learning_rate": 9.682751420948847e-06,
"loss": 0.0008,
"step": 15825
},
{
"epoch": 2.481862310141448,
"grad_norm": 0.07765714824199677,
"learning_rate": 9.61025403085489e-06,
"loss": 0.0001,
"step": 15850
},
{
"epoch": 2.4857769194634374,
"grad_norm": 0.0010181193938478827,
"learning_rate": 9.537756640760934e-06,
"loss": 0.0002,
"step": 15875
},
{
"epoch": 2.489691528785427,
"grad_norm": 0.00023663626052439213,
"learning_rate": 9.465259250666977e-06,
"loss": 0.0011,
"step": 15900
},
{
"epoch": 2.493606138107417,
"grad_norm": 0.010522628203034401,
"learning_rate": 9.392761860573019e-06,
"loss": 0.0002,
"step": 15925
},
{
"epoch": 2.4975207474294066,
"grad_norm": 0.0006732672336511314,
"learning_rate": 9.320264470479063e-06,
"loss": 0.0001,
"step": 15950
},
{
"epoch": 2.5014353567513963,
"grad_norm": 0.0013339362340047956,
"learning_rate": 9.247767080385107e-06,
"loss": 0.0,
"step": 15975
},
{
"epoch": 2.505349966073386,
"grad_norm": 0.0003018657735083252,
"learning_rate": 9.175269690291149e-06,
"loss": 0.0,
"step": 16000
},
{
"epoch": 2.5092645753953757,
"grad_norm": 0.0003551984846126288,
"learning_rate": 9.102772300197193e-06,
"loss": 0.0,
"step": 16025
},
{
"epoch": 2.513179184717365,
"grad_norm": 0.00026321958284825087,
"learning_rate": 9.030274910103237e-06,
"loss": 0.0,
"step": 16050
},
{
"epoch": 2.5170937940393547,
"grad_norm": 0.00023617663828190416,
"learning_rate": 8.95777752000928e-06,
"loss": 0.0001,
"step": 16075
},
{
"epoch": 2.5210084033613445,
"grad_norm": 0.0029212015215307474,
"learning_rate": 8.885280129915323e-06,
"loss": 0.0,
"step": 16100
},
{
"epoch": 2.524923012683334,
"grad_norm": 0.00017464791017118841,
"learning_rate": 8.812782739821367e-06,
"loss": 0.0,
"step": 16125
},
{
"epoch": 2.528837622005324,
"grad_norm": 0.00018289768195245415,
"learning_rate": 8.74028534972741e-06,
"loss": 0.0002,
"step": 16150
},
{
"epoch": 2.5327522313273136,
"grad_norm": 0.00490641500800848,
"learning_rate": 8.667787959633455e-06,
"loss": 0.0,
"step": 16175
},
{
"epoch": 2.5366668406493034,
"grad_norm": 0.0014596167020499706,
"learning_rate": 8.595290569539497e-06,
"loss": 0.0,
"step": 16200
},
{
"epoch": 2.5405814499712926,
"grad_norm": 0.00016923531075008214,
"learning_rate": 8.522793179445541e-06,
"loss": 0.0001,
"step": 16225
},
{
"epoch": 2.5444960592932824,
"grad_norm": 0.001988182310014963,
"learning_rate": 8.450295789351583e-06,
"loss": 0.0001,
"step": 16250
},
{
"epoch": 2.548410668615272,
"grad_norm": 0.0008615644765086472,
"learning_rate": 8.377798399257627e-06,
"loss": 0.0001,
"step": 16275
},
{
"epoch": 2.552325277937262,
"grad_norm": 0.0002167491620639339,
"learning_rate": 8.30530100916367e-06,
"loss": 0.0,
"step": 16300
},
{
"epoch": 2.5562398872592516,
"grad_norm": 0.000143597528222017,
"learning_rate": 8.232803619069713e-06,
"loss": 0.0001,
"step": 16325
},
{
"epoch": 2.5601544965812413,
"grad_norm": 0.0002638675505295396,
"learning_rate": 8.160306228975757e-06,
"loss": 0.0,
"step": 16350
},
{
"epoch": 2.564069105903231,
"grad_norm": 0.0011918079107999802,
"learning_rate": 8.087808838881801e-06,
"loss": 0.001,
"step": 16375
},
{
"epoch": 2.5679837152252203,
"grad_norm": 0.00031527673127129674,
"learning_rate": 8.015311448787843e-06,
"loss": 0.0,
"step": 16400
},
{
"epoch": 2.5718983245472105,
"grad_norm": 0.0002970160567201674,
"learning_rate": 7.942814058693887e-06,
"loss": 0.0,
"step": 16425
},
{
"epoch": 2.5758129338691997,
"grad_norm": 0.00032033000024966896,
"learning_rate": 7.870316668599931e-06,
"loss": 0.0,
"step": 16450
},
{
"epoch": 2.5797275431911895,
"grad_norm": 0.00021383754210546613,
"learning_rate": 7.797819278505975e-06,
"loss": 0.0,
"step": 16475
},
{
"epoch": 2.583642152513179,
"grad_norm": 0.0025132743176072836,
"learning_rate": 7.725321888412019e-06,
"loss": 0.0016,
"step": 16500
},
{
"epoch": 2.587556761835169,
"grad_norm": 0.0004337320278864354,
"learning_rate": 7.652824498318061e-06,
"loss": 0.0,
"step": 16525
},
{
"epoch": 2.5914713711571586,
"grad_norm": 0.000609175069257617,
"learning_rate": 7.580327108224104e-06,
"loss": 0.0016,
"step": 16550
},
{
"epoch": 2.595385980479148,
"grad_norm": 0.00021605034999083728,
"learning_rate": 7.507829718130148e-06,
"loss": 0.0,
"step": 16575
},
{
"epoch": 2.599300589801138,
"grad_norm": 0.0006186183891259134,
"learning_rate": 7.435332328036191e-06,
"loss": 0.0,
"step": 16600
},
{
"epoch": 2.6032151991231274,
"grad_norm": 0.0002476814261171967,
"learning_rate": 7.362834937942234e-06,
"loss": 0.0006,
"step": 16625
},
{
"epoch": 2.607129808445117,
"grad_norm": 0.0007107394631020725,
"learning_rate": 7.2903375478482775e-06,
"loss": 0.0,
"step": 16650
},
{
"epoch": 2.611044417767107,
"grad_norm": 0.00029517774237319827,
"learning_rate": 7.217840157754321e-06,
"loss": 0.0,
"step": 16675
},
{
"epoch": 2.6149590270890966,
"grad_norm": 0.0001818942982936278,
"learning_rate": 7.145342767660365e-06,
"loss": 0.0,
"step": 16700
},
{
"epoch": 2.6188736364110863,
"grad_norm": 0.0003430229553487152,
"learning_rate": 7.072845377566408e-06,
"loss": 0.0029,
"step": 16725
},
{
"epoch": 2.6227882457330756,
"grad_norm": 0.01995168998837471,
"learning_rate": 7.0003479874724515e-06,
"loss": 0.004,
"step": 16750
},
{
"epoch": 2.6267028550550657,
"grad_norm": 0.003800376318395138,
"learning_rate": 6.927850597378495e-06,
"loss": 0.0002,
"step": 16775
},
{
"epoch": 2.630617464377055,
"grad_norm": 0.0005610916996374726,
"learning_rate": 6.8553532072845385e-06,
"loss": 0.0009,
"step": 16800
},
{
"epoch": 2.6345320736990447,
"grad_norm": 0.0002944047737400979,
"learning_rate": 6.782855817190581e-06,
"loss": 0.0002,
"step": 16825
},
{
"epoch": 2.6384466830210345,
"grad_norm": 0.0002466263249516487,
"learning_rate": 6.710358427096625e-06,
"loss": 0.0,
"step": 16850
},
{
"epoch": 2.642361292343024,
"grad_norm": 0.0001500146317994222,
"learning_rate": 6.637861037002669e-06,
"loss": 0.0,
"step": 16875
},
{
"epoch": 2.646275901665014,
"grad_norm": 0.0007112550083547831,
"learning_rate": 6.565363646908712e-06,
"loss": 0.0001,
"step": 16900
},
{
"epoch": 2.6501905109870036,
"grad_norm": 0.00017001846572384238,
"learning_rate": 6.492866256814755e-06,
"loss": 0.0,
"step": 16925
},
{
"epoch": 2.6541051203089934,
"grad_norm": 0.0007822296465747058,
"learning_rate": 6.420368866720798e-06,
"loss": 0.0,
"step": 16950
},
{
"epoch": 2.6580197296309827,
"grad_norm": 0.0006506266072392464,
"learning_rate": 6.347871476626842e-06,
"loss": 0.0011,
"step": 16975
},
{
"epoch": 2.6619343389529724,
"grad_norm": 0.001398293417878449,
"learning_rate": 6.275374086532886e-06,
"loss": 0.0009,
"step": 17000
},
{
"epoch": 2.665848948274962,
"grad_norm": 0.01400019507855177,
"learning_rate": 6.202876696438929e-06,
"loss": 0.0007,
"step": 17025
},
{
"epoch": 2.669763557596952,
"grad_norm": 0.001175655866973102,
"learning_rate": 6.130379306344972e-06,
"loss": 0.0008,
"step": 17050
},
{
"epoch": 2.6736781669189416,
"grad_norm": 0.004671004135161638,
"learning_rate": 6.057881916251015e-06,
"loss": 0.0001,
"step": 17075
},
{
"epoch": 2.6775927762409313,
"grad_norm": 0.03513360768556595,
"learning_rate": 5.985384526157058e-06,
"loss": 0.0001,
"step": 17100
},
{
"epoch": 2.681507385562921,
"grad_norm": 0.0003586947568692267,
"learning_rate": 5.912887136063102e-06,
"loss": 0.0,
"step": 17125
},
{
"epoch": 2.6854219948849103,
"grad_norm": 0.00038099908852018416,
"learning_rate": 5.840389745969145e-06,
"loss": 0.0,
"step": 17150
},
{
"epoch": 2.6893366042069,
"grad_norm": 0.00031486572697758675,
"learning_rate": 5.767892355875189e-06,
"loss": 0.0001,
"step": 17175
},
{
"epoch": 2.6932512135288897,
"grad_norm": 0.0003432184748817235,
"learning_rate": 5.695394965781233e-06,
"loss": 0.0001,
"step": 17200
},
{
"epoch": 2.6971658228508795,
"grad_norm": 0.0004650696355383843,
"learning_rate": 5.622897575687275e-06,
"loss": 0.0001,
"step": 17225
},
{
"epoch": 2.701080432172869,
"grad_norm": 0.0003299444215372205,
"learning_rate": 5.550400185593319e-06,
"loss": 0.0006,
"step": 17250
},
{
"epoch": 2.704995041494859,
"grad_norm": 0.0003300994576420635,
"learning_rate": 5.477902795499362e-06,
"loss": 0.0,
"step": 17275
},
{
"epoch": 2.7089096508168486,
"grad_norm": 0.0005288653774186969,
"learning_rate": 5.405405405405406e-06,
"loss": 0.001,
"step": 17300
},
{
"epoch": 2.712824260138838,
"grad_norm": 0.00020216924895066768,
"learning_rate": 5.332908015311449e-06,
"loss": 0.0,
"step": 17325
},
{
"epoch": 2.7167388694608277,
"grad_norm": 0.006004292517900467,
"learning_rate": 5.260410625217493e-06,
"loss": 0.0,
"step": 17350
},
{
"epoch": 2.7206534787828174,
"grad_norm": 0.0001754688419168815,
"learning_rate": 5.187913235123536e-06,
"loss": 0.0,
"step": 17375
},
{
"epoch": 2.724568088104807,
"grad_norm": 0.012972986325621605,
"learning_rate": 5.115415845029579e-06,
"loss": 0.0,
"step": 17400
},
{
"epoch": 2.728482697426797,
"grad_norm": 0.00019060824706684798,
"learning_rate": 5.042918454935622e-06,
"loss": 0.0,
"step": 17425
},
{
"epoch": 2.7323973067487866,
"grad_norm": 0.0002203083859058097,
"learning_rate": 4.970421064841666e-06,
"loss": 0.0001,
"step": 17450
},
{
"epoch": 2.7363119160707763,
"grad_norm": 0.00014681309403385967,
"learning_rate": 4.897923674747709e-06,
"loss": 0.0,
"step": 17475
},
{
"epoch": 2.7402265253927656,
"grad_norm": 0.0002081810962408781,
"learning_rate": 4.825426284653753e-06,
"loss": 0.0009,
"step": 17500
},
{
"epoch": 2.7441411347147557,
"grad_norm": 0.0002039974497165531,
"learning_rate": 4.752928894559796e-06,
"loss": 0.0001,
"step": 17525
},
{
"epoch": 2.748055744036745,
"grad_norm": 0.0001363355404464528,
"learning_rate": 4.680431504465839e-06,
"loss": 0.0,
"step": 17550
},
{
"epoch": 2.7519703533587347,
"grad_norm": 0.0013490230776369572,
"learning_rate": 4.6079341143718824e-06,
"loss": 0.0004,
"step": 17575
},
{
"epoch": 2.7558849626807245,
"grad_norm": 0.00017140705313067883,
"learning_rate": 4.535436724277926e-06,
"loss": 0.0005,
"step": 17600
},
{
"epoch": 2.759799572002714,
"grad_norm": 0.00012424413580447435,
"learning_rate": 4.4629393341839695e-06,
"loss": 0.0,
"step": 17625
},
{
"epoch": 2.763714181324704,
"grad_norm": 0.000487282668473199,
"learning_rate": 4.390441944090013e-06,
"loss": 0.0001,
"step": 17650
},
{
"epoch": 2.767628790646693,
"grad_norm": 0.0003270190500188619,
"learning_rate": 4.3179445539960565e-06,
"loss": 0.0004,
"step": 17675
},
{
"epoch": 2.7715433999686834,
"grad_norm": 0.00023671095550525934,
"learning_rate": 4.2454471639020995e-06,
"loss": 0.0,
"step": 17700
},
{
"epoch": 2.7754580092906727,
"grad_norm": 0.0011756513267755508,
"learning_rate": 4.172949773808143e-06,
"loss": 0.0067,
"step": 17725
},
{
"epoch": 2.7793726186126624,
"grad_norm": 0.0004914366290904582,
"learning_rate": 4.1004523837141865e-06,
"loss": 0.0005,
"step": 17750
},
{
"epoch": 2.783287227934652,
"grad_norm": 0.0011064461432397366,
"learning_rate": 4.02795499362023e-06,
"loss": 0.0,
"step": 17775
},
{
"epoch": 2.787201837256642,
"grad_norm": 0.0006191087886691093,
"learning_rate": 3.9554576035262736e-06,
"loss": 0.001,
"step": 17800
},
{
"epoch": 2.7911164465786316,
"grad_norm": 0.005341960582882166,
"learning_rate": 3.882960213432317e-06,
"loss": 0.0001,
"step": 17825
},
{
"epoch": 2.795031055900621,
"grad_norm": 0.004248717799782753,
"learning_rate": 3.81046282333836e-06,
"loss": 0.0003,
"step": 17850
},
{
"epoch": 2.798945665222611,
"grad_norm": 0.02371644414961338,
"learning_rate": 3.7379654332444032e-06,
"loss": 0.0001,
"step": 17875
},
{
"epoch": 2.8028602745446003,
"grad_norm": 0.0007812991389073431,
"learning_rate": 3.6654680431504467e-06,
"loss": 0.0004,
"step": 17900
},
{
"epoch": 2.80677488386659,
"grad_norm": 0.00031172268791124225,
"learning_rate": 3.59297065305649e-06,
"loss": 0.0002,
"step": 17925
},
{
"epoch": 2.8106894931885797,
"grad_norm": 0.012311534956097603,
"learning_rate": 3.5204732629625337e-06,
"loss": 0.0001,
"step": 17950
},
{
"epoch": 2.8146041025105695,
"grad_norm": 0.0008552991203032434,
"learning_rate": 3.4479758728685772e-06,
"loss": 0.0003,
"step": 17975
},
{
"epoch": 2.818518711832559,
"grad_norm": 0.011254767887294292,
"learning_rate": 3.3754784827746203e-06,
"loss": 0.0001,
"step": 18000
},
{
"epoch": 2.822433321154549,
"grad_norm": 0.003312336513772607,
"learning_rate": 3.302981092680664e-06,
"loss": 0.005,
"step": 18025
},
{
"epoch": 2.8263479304765387,
"grad_norm": 0.0033339662477374077,
"learning_rate": 3.230483702586707e-06,
"loss": 0.0001,
"step": 18050
},
{
"epoch": 2.830262539798528,
"grad_norm": 0.0307988952845335,
"learning_rate": 3.157986312492751e-06,
"loss": 0.0001,
"step": 18075
},
{
"epoch": 2.8341771491205177,
"grad_norm": 0.001144499285146594,
"learning_rate": 3.085488922398794e-06,
"loss": 0.0,
"step": 18100
},
{
"epoch": 2.8380917584425074,
"grad_norm": 0.0007567739812657237,
"learning_rate": 3.012991532304837e-06,
"loss": 0.0001,
"step": 18125
},
{
"epoch": 2.842006367764497,
"grad_norm": 0.0014737301971763372,
"learning_rate": 2.9404941422108805e-06,
"loss": 0.0001,
"step": 18150
},
{
"epoch": 2.845920977086487,
"grad_norm": 0.0020669877994805574,
"learning_rate": 2.867996752116924e-06,
"loss": 0.0005,
"step": 18175
},
{
"epoch": 2.8498355864084766,
"grad_norm": 0.0025932856369763613,
"learning_rate": 2.795499362022967e-06,
"loss": 0.0001,
"step": 18200
},
{
"epoch": 2.8537501957304663,
"grad_norm": 0.0018630975391715765,
"learning_rate": 2.7230019719290106e-06,
"loss": 0.0,
"step": 18225
},
{
"epoch": 2.8576648050524556,
"grad_norm": 0.011522402986884117,
"learning_rate": 2.650504581835054e-06,
"loss": 0.0021,
"step": 18250
},
{
"epoch": 2.8615794143744453,
"grad_norm": 0.020472779870033264,
"learning_rate": 2.578007191741097e-06,
"loss": 0.0001,
"step": 18275
},
{
"epoch": 2.865494023696435,
"grad_norm": 0.06026843190193176,
"learning_rate": 2.5055098016471406e-06,
"loss": 0.0001,
"step": 18300
},
{
"epoch": 2.8694086330184247,
"grad_norm": 0.0006733342306688428,
"learning_rate": 2.433012411553184e-06,
"loss": 0.0,
"step": 18325
},
{
"epoch": 2.8733232423404145,
"grad_norm": 0.0009708734578453004,
"learning_rate": 2.3605150214592277e-06,
"loss": 0.0001,
"step": 18350
},
{
"epoch": 2.877237851662404,
"grad_norm": 0.00023784795484971255,
"learning_rate": 2.288017631365271e-06,
"loss": 0.0001,
"step": 18375
},
{
"epoch": 2.881152460984394,
"grad_norm": 0.004968983121216297,
"learning_rate": 2.2155202412713147e-06,
"loss": 0.0001,
"step": 18400
},
{
"epoch": 2.885067070306383,
"grad_norm": 0.0008029749151319265,
"learning_rate": 2.1430228511773577e-06,
"loss": 0.0,
"step": 18425
},
{
"epoch": 2.888981679628373,
"grad_norm": 0.0008586676558479667,
"learning_rate": 2.0705254610834012e-06,
"loss": 0.0,
"step": 18450
},
{
"epoch": 2.8928962889503627,
"grad_norm": 0.0015144862700253725,
"learning_rate": 1.9980280709894447e-06,
"loss": 0.0005,
"step": 18475
},
{
"epoch": 2.8968108982723524,
"grad_norm": 0.003726179013028741,
"learning_rate": 1.925530680895488e-06,
"loss": 0.006,
"step": 18500
},
{
"epoch": 2.900725507594342,
"grad_norm": 0.00036417951923795044,
"learning_rate": 1.8530332908015313e-06,
"loss": 0.0,
"step": 18525
},
{
"epoch": 2.904640116916332,
"grad_norm": 0.0008425221894867718,
"learning_rate": 1.7805359007075746e-06,
"loss": 0.0006,
"step": 18550
},
{
"epoch": 2.9085547262383216,
"grad_norm": 0.12718600034713745,
"learning_rate": 1.7080385106136181e-06,
"loss": 0.001,
"step": 18575
},
{
"epoch": 2.912469335560311,
"grad_norm": 0.0010318702552467585,
"learning_rate": 1.6355411205196614e-06,
"loss": 0.0013,
"step": 18600
},
{
"epoch": 2.916383944882301,
"grad_norm": 0.0004336585116107017,
"learning_rate": 1.5630437304257047e-06,
"loss": 0.0003,
"step": 18625
},
{
"epoch": 2.9202985542042903,
"grad_norm": 0.000422166776843369,
"learning_rate": 1.4905463403317482e-06,
"loss": 0.0001,
"step": 18650
},
{
"epoch": 2.92421316352628,
"grad_norm": 0.0006406558677554131,
"learning_rate": 1.4180489502377915e-06,
"loss": 0.0,
"step": 18675
},
{
"epoch": 2.9281277728482697,
"grad_norm": 0.0005850115558132529,
"learning_rate": 1.3455515601438348e-06,
"loss": 0.0001,
"step": 18700
},
{
"epoch": 2.9320423821702595,
"grad_norm": 0.0158847626298666,
"learning_rate": 1.2730541700498783e-06,
"loss": 0.0001,
"step": 18725
},
{
"epoch": 2.935956991492249,
"grad_norm": 0.0014247479848563671,
"learning_rate": 1.2005567799559216e-06,
"loss": 0.0015,
"step": 18750
},
{
"epoch": 2.9398716008142385,
"grad_norm": 0.0002501108101569116,
"learning_rate": 1.128059389861965e-06,
"loss": 0.0001,
"step": 18775
},
{
"epoch": 2.9437862101362287,
"grad_norm": 0.0004493577580433339,
"learning_rate": 1.0555619997680084e-06,
"loss": 0.0002,
"step": 18800
},
{
"epoch": 2.947700819458218,
"grad_norm": 0.00032207099138759077,
"learning_rate": 9.830646096740517e-07,
"loss": 0.0001,
"step": 18825
},
{
"epoch": 2.9516154287802077,
"grad_norm": 0.002576634753495455,
"learning_rate": 9.105672195800951e-07,
"loss": 0.0001,
"step": 18850
},
{
"epoch": 2.9555300381021974,
"grad_norm": 0.0007969861035235226,
"learning_rate": 8.380698294861385e-07,
"loss": 0.0,
"step": 18875
},
{
"epoch": 2.959444647424187,
"grad_norm": 0.0008164517930708826,
"learning_rate": 7.655724393921819e-07,
"loss": 0.0,
"step": 18900
},
{
"epoch": 2.963359256746177,
"grad_norm": 0.000589414150454104,
"learning_rate": 6.930750492982253e-07,
"loss": 0.0002,
"step": 18925
},
{
"epoch": 2.967273866068166,
"grad_norm": 0.009339476004242897,
"learning_rate": 6.205776592042687e-07,
"loss": 0.0,
"step": 18950
},
{
"epoch": 2.9711884753901563,
"grad_norm": 0.0037412915844470263,
"learning_rate": 5.480802691103121e-07,
"loss": 0.0,
"step": 18975
},
{
"epoch": 2.9751030847121456,
"grad_norm": 0.00445817643776536,
"learning_rate": 4.7558287901635545e-07,
"loss": 0.0001,
"step": 19000
},
{
"epoch": 2.9790176940341353,
"grad_norm": 0.001451736083254218,
"learning_rate": 4.0308548892239885e-07,
"loss": 0.001,
"step": 19025
},
{
"epoch": 2.982932303356125,
"grad_norm": 0.000640546262729913,
"learning_rate": 3.305880988284422e-07,
"loss": 0.0001,
"step": 19050
},
{
"epoch": 2.9868469126781148,
"grad_norm": 0.2126484215259552,
"learning_rate": 2.580907087344856e-07,
"loss": 0.0008,
"step": 19075
},
{
"epoch": 2.9907615220001045,
"grad_norm": 0.0003385374147910625,
"learning_rate": 1.8559331864052894e-07,
"loss": 0.0,
"step": 19100
},
{
"epoch": 2.994676131322094,
"grad_norm": 0.00034946645610034466,
"learning_rate": 1.1309592854657233e-07,
"loss": 0.0,
"step": 19125
},
{
"epoch": 2.998590740644084,
"grad_norm": 0.0007842128979973495,
"learning_rate": 4.0598538452615705e-08,
"loss": 0.0016,
"step": 19150
},
{
"epoch": 2.9998434156271205,
"eval_accuracy": 0.9999096727919246,
"eval_f1": 0.9998709961870392,
"eval_loss": 0.0005143894231878221,
"eval_precision": 0.9998563016620019,
"eval_recall": 0.999885691144003,
"eval_runtime": 66.002,
"eval_samples_per_second": 580.558,
"eval_steps_per_second": 36.287,
"step": 19158
}
],
"logging_steps": 25,
"max_steps": 19158,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.01
},
"attributes": {
"early_stopping_patience_counter": 2
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.003595204150272e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}